Mercurial > hg > eSciDocImport
view src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java @ 4:cb5668b07bfc
neuer check ob datei schon existiert.
author | dwinter |
---|---|
date | Wed, 12 Jan 2011 16:34:42 +0100 |
parents | 58b52df9763c |
children | a42dabfcffdf |
line wrap: on
line source
package de.mpiwg.itgroup.eSciDoc.harvesting; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.xml.DOMConfigurator; import org.jdom.JDOMException; import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler; import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools; import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject; import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource; import de.mpiwg.itgroup.eSciDoc.exceptions.ConnectorException; import de.mpiwg.itgroup.eSciDoc.exceptions.ESciDocXmlObjectException; import de.mpiwg.itgroup.eSciDoc.exceptions.ObjectNotUniqueError; import de.mpiwg.itgroup.eSciDoc.exceptions.TransformerException; import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter; import de.mpiwg.itgroup.eSciDoc.importer.Importer; import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer; import de.mpiwg.itgroup.eSciDoc.transformer.Transformer; import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject; /** * @author dwinter * Main class for data harveting from index.meta files into eScidoc */ public class ESciDocDataHarvester { static int MAX_REC=1000; // maximale Anzahl von Records die in einem Stueck eingelesen bzw. bearbeitet werden. //static int MAX_REC=5; protected Logger logger = Logger.getRootLogger(); protected Importer importer; protected EScidocBasicHandler connector; protected Transformer transformer; private String echoContext; private Logger addedFile = Logger.getLogger("addedFilesLogger"); private Logger notAddedFile = Logger.getLogger("notAddedFilesLogger"); /** * @param importer Importer for dataObjects, describes how to access the objects * @param transformer Transformer, generates the eScidocMetaDatasets * @param connector connects to the eScidocRepository * @param context Escidoc context path z.b. /ir/context/escidoc:12001 */ public ESciDocDataHarvester(Importer importer, Transformer transformer, EScidocBasicHandler connector, String context) { this.importer = importer; this.transformer = transformer; this.connector = connector; //this.tools = new EScidocTools(connector); this.echoContext = context; } /** Read objects into eScidoc or updates the objects if indexMeta has changed. * @param type restrict the imported objects to a specific type, possible types should be defined in * the given importer @see {@link #importer} * @return * @throws ConnectorException * @throws TransformerException * @throws ESciDocXmlObjectException */ @Deprecated public Boolean readObjectsFromInstanceOLD(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException { ArrayList<String> addedObjects = new ArrayList<String>(); ArrayList<String> notAddedObjects = new ArrayList<String>(); for (ECHOObject obj : importer.getObjectList(type)) { if (ECHORessource.class.isInstance(obj)) { try { ECHOObject old; try { old = connector.alreadyExists( "/md-records/md-record/admin/archivePath", ((ECHORessource) obj).archivePath, echoContext); } catch (ObjectNotUniqueError e) { // TODO Auto-generated catch block e.printStackTrace(); continue; } if (old!=null) { logger.debug("already exist:" + ((ECHORessource) obj).archivePath); handleExistingObject(obj,old); continue; } } catch (ConnectorException e) { logger.debug("already exist error:"); e.printStackTrace(); continue; } } obj.context = echoContext; String contid = connector.getIDfromPID(obj.pid, echoContext); if (contid != null) { System.out.println("------- belongsTo:" + contid); } else { eSciDocXmlObject escidocItem = transformer.transform(obj); try { logger.info(escidocItem.printXML()); // TODO write PID to back to echo-obj Boolean result = connector.createItem(escidocItem); if (result) { addedObjects.add(escidocItem.getESciDocId()); addedFile.debug(escidocItem.getESciDocId() + "\n"); } else { notAddedObjects.add(obj.echoUrl); notAddedFile.debug(obj.echoUrl); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } } } if (logger.getLevel() == Level.DEBUG) { for (String addedObject : addedObjects) { logger.debug(addedObject); } } return true; } /** Read objects into eScidoc or updates the objects if indexMeta has changed. * @param type restrict the imported objects to a specific type, possible types should be defined in * the given importer @see {@link #importer} * @return * @throws ConnectorException * @throws TransformerException * @throws ESciDocXmlObjectException */ public Boolean readObjectsFromInstance(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException { ArrayList<String> addedObjects = new ArrayList<String>(); ArrayList<String> notAddedObjects = new ArrayList<String>(); for (ECHOObject obj : importer.getObjectList(type)) { if (ECHORessource.class.isInstance(obj)) { try { // checke zuerst, ob die MD5 schon im publiziert Teil der Metadaten ist, dann tue nichts String md5 = ((ECHORessource) obj).getIndexMetaMD5onServer(); List<eSciDocXmlObject> results = connector.getObjectsFromSearch("escidoc.component.checksum",md5); if (results.size()>0){ //index.meta schon abgespeichert continue; } ECHOObject old; try { old = connector.alreadyExists( "/md-records/md-record/admin/archivePath", ((ECHORessource) obj).archivePath, echoContext); } catch (ObjectNotUniqueError e) { // TODO Auto-generated catch block e.printStackTrace(); continue; } if (old!=null) { logger.debug("already exist:" + ((ECHORessource) obj).archivePath); handleExistingObject(obj,old); continue; } } catch (ConnectorException e) { logger.debug("already exist error:"); e.printStackTrace(); continue; } } obj.context = echoContext; String contid = connector.getIDfromPID(obj.pid, echoContext); if (contid != null) { System.out.println("------- belongsTo:" + contid); } else { eSciDocXmlObject escidocItem = transformer.transform(obj); try { logger.info(escidocItem.printXML()); // TODO write PID to back to echo-obj Boolean result = connector.createItem(escidocItem); if (result) { addedObjects.add(escidocItem.getESciDocId()); addedFile.debug(escidocItem.getESciDocId() + "\n"); } else { notAddedObjects.add(obj.echoUrl); notAddedFile.debug(obj.echoUrl); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } } } if (logger.getLevel() == Level.DEBUG) { for (String addedObject : addedObjects) { logger.debug(addedObject); } } return true; } /** * Deal with existing objects, do nothing if md5 of stored metadata and metadata on the server is the same otherwise call {@link #updateObject(ECHOObject)}. * @param objNew * @param old * @throws TransformerException * @throws ESciDocXmlObjectException */ private void handleExistingObject(ECHOObject objNew, ECHOObject old) throws TransformerException, ESciDocXmlObjectException { ECHORessource objNewRes = (ECHORessource)objNew; ECHORessource objOldRes = (ECHORessource)old; String md5onServer = objNewRes.getIndexMetaMD5onServer(); String md5=objOldRes.getIndexMetaMD5stored(); if (md5onServer.equals(md5)) return; else { updateObject(objNew, old); } } private void updateObject(ECHOObject objNew, ECHOObject objOld) throws TransformerException, ESciDocXmlObjectException { objNew.context = echoContext; eSciDocXmlObject escidocItem = transformer.transform(objNew); String lastModificationDateOld = objOld.lastModificationDate; escidocItem.setLastModificationDate(lastModificationDateOld); try { HttpResponse ret = connector.eScidocPut(objOld.eScidocId, EScidocBasicHandler.convertStringToStream(escidocItem.printXML())); HttpEntity ent = ret.getEntity(); if (ret.getStatusLine().getStatusCode() != 200) { logger.debug("Can not update:" + objOld.eScidocId); // res.getEntity().consumeContent(); // necessary to release // the conneciton ent.consumeContent(); } InputStream restream = ret.getEntity().getContent(); logger.debug(EScidocBasicHandler.convertStreamToString(restream)); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * @param command * @param objectXPath * @param mode 0 : only submit, 1:only release, 2:release and submit * @throws Exception */ public void releaseAndSubmitObjects(String command, String objectXPath,int mode) throws Exception { Integer numberOfHits = connector.getNumberOfHitsFromFilterResult( command, objectXPath,mode); int tausend = ((numberOfHits-1) / MAX_REC); String queryRestrict=""; if(mode==0 | mode==2){ queryRestrict="query=%22/properties/version/status%22=pending"; } else { queryRestrict="query=%22/properties/version/status%22=submitted"; } for (int t = 0; t <= tausend; t++) { int start = t * MAX_REC+1; // int max=Math.min((t+1)*1000, numberOfHits); String query = "?maximumRecords="+String.valueOf(MAX_REC)+"&startRecord=" + String.valueOf(start)+"&"+queryRestrict; for (eSciDocXmlObject obj : connector .getObjectsFromFilterResult(command+query, objectXPath)) { //TODO is the following really necessary, currently the obj in the list is sometimes not the current one. try{ HttpResponse resObj = connector.eScidocGet(obj.getESciDocId()); HttpEntity ent = resObj.getEntity(); if (ent!=null){ obj= new eSciDocXmlObject(ent.getContent()); } else { logger.debug("Can not retrieve:" + obj.getESciDocId()); continue; } } catch (Exception e){ logger.debug("Can not retrieve:" + obj.getESciDocId()); continue; } if (mode==0 | mode==2){ HttpResponse res = connector.submitAnObject(obj, "first release"); logger.debug(res.getStatusLine()); if (res.getStatusLine().getStatusCode() != 200) { logger.debug("Can not submit:" + obj.getESciDocId()); // res.getEntity().consumeContent(); // necessary to release // the conneciton } InputStream restream = res.getEntity().getContent(); logger.debug(EScidocBasicHandler.convertStreamToString(restream)); //res.getEntity().consumeContent(); // necessary to release the // conneciton if (!connector.upDateObject(obj)) { logger.debug("Can not update:" + obj.getESciDocId()); // continue; } } if (mode==1 | mode==2){ HttpResponse res = connector.releaseAnObject(obj, "first release"); logger.debug(res.getStatusLine()); if (res.getStatusLine().getStatusCode() != 200) { logger.debug("Can not release:" + obj.getESciDocId()); res.getEntity().consumeContent(); // necessary to release // the conneciton continue; } addedFile.debug("RELEASED:" + obj.getESciDocId()); res.getEntity().consumeContent(); // necessary to release the // connecito } } } } public static void main(String[] args) throws Exception { Logger rl = Logger.getRootLogger(); DOMConfigurator.configure("log4uconf.xml"); rl.setLevel(Level.DEBUG); EScidocBasicHandler connector = new EScidocBasicHandler("escidoc-test.mpiwg-berlin.mpg.de",8080,"dwinter","weikiki7"); ECHOImporter newimporter = new ECHOImporter(new URL( "file:///Users/dwinter/libcoll.rdf")); ESciDocDataHarvester hv = new ESciDocDataHarvester(newimporter, new ECHOTransformer(), connector, "/ir/context/escidoc:1001"); // hv.readObjectsFromInstance("ECHO_collection"); // hv.readObjectsFromInstance("ECHO_resource"); hv.releaseAndSubmitObjects( "/ir/context/escidoc:1001/resources/members", "//escidocItem:item",1); // newimporter.organizeRessourcesInCollections(connector, // "/ir/context/escidoc:1001"); // hv.releaseAndSubmitObjects("/ir/containers","//container:container"); } }