Mercurial > hg > eSciDocImport
view src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java @ 8:a844f6948dd8
Änderungen im Walker
tools für pubman
author | dwinter |
---|---|
date | Mon, 14 May 2012 09:58:45 +0200 |
parents | 4b1ae52418c1 |
children | b6cf6462d709 |
line wrap: on
line source
package de.mpiwg.itgroup.eSciDoc.harvesting; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.xml.DOMConfigurator; import org.jdom.JDOMException; import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler; import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools; import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject; import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource; import de.mpiwg.itgroup.eSciDoc.exceptions.ConnectorException; import de.mpiwg.itgroup.eSciDoc.exceptions.ESciDocXmlObjectException; import de.mpiwg.itgroup.eSciDoc.exceptions.ObjectNotUniqueError; import de.mpiwg.itgroup.eSciDoc.exceptions.TransformerException; import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter; import de.mpiwg.itgroup.eSciDoc.importer.Importer; import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer; import de.mpiwg.itgroup.eSciDoc.transformer.Transformer; import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject; /** * @author dwinter * Main class for data harveting from index.meta files into eScidoc */ public class ESciDocDataHarvester { static int MAX_REC=1000; // maximale Anzahl von Records die in einem Stueck eingelesen bzw. bearbeitet werden. 
//static int MAX_REC=5; protected Logger logger = Logger.getRootLogger(); protected Importer importer; protected EScidocBasicHandler connector; protected Transformer transformer; private String context; private Logger addedFile = Logger.getLogger("addedFilesLogger"); private Logger notAddedFile = Logger.getLogger("notAddedFilesLogger"); /** * @param importer Importer for dataObjects, describes how to access the objects * @param transformer Transformer, generates the eScidocMetaDatasets * @param connector connects to the eScidocRepository * @param context Escidoc context path z.b. /ir/context/escidoc:12001 */ public ESciDocDataHarvester(Importer importer, Transformer transformer, EScidocBasicHandler connector, String context) { this.importer = importer; this.transformer = transformer; this.connector = connector; //this.tools = new EScidocTools(connector); this.context = context; } /** Read objects into eScidoc or updates the objects if indexMeta has changed. * @param type restrict the imported objects to a specific type, possible types should be defined in * the given importer @see {@link #importer} * @return * @throws ConnectorException * @throws TransformerException * @throws ESciDocXmlObjectException */ @Deprecated public Boolean readObjectsFromInstanceOLD(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException { ArrayList<String> addedObjects = new ArrayList<String>(); ArrayList<String> notAddedObjects = new ArrayList<String>(); for (ECHOObject obj : importer.getObjectList(type)) { if (ECHORessource.class.isInstance(obj)) { try { ECHOObject old; try { old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath, context,"="); if (old==null){ //FIXME Problem bei der Erfassungder Metadaten sollte eigentlich nicht vorkommen! 
old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath.replace("/mpiwg/online/permanent", "/Volumes/online/permanent"), context,"="); } if (old==null){ old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath.replace("/mpiwg/online/experimental", "/Volumes/online/experimental"), context,"="); } } catch (ObjectNotUniqueError e) { // TODO Auto-generated catch block e.printStackTrace(); continue; } if (old!=null) { logger.debug("already exist:" + ((ECHORessource) obj).archivePath); handleExistingObject(obj,old); continue; } } catch (ConnectorException e) { logger.debug("already exist error:"); e.printStackTrace(); continue; } } obj.context = context; String contid = connector.getIDfromPID(obj.pid, context); if (contid != null) { System.out.println("------- belongsTo:" + contid); } else { eSciDocXmlObject escidocItem = transformer.transform(obj); try { logger.info(escidocItem.printXML()); // TODO write PID to back to echo-obj Boolean result = connector.createItem(escidocItem); if (result) { addedObjects.add(escidocItem.getESciDocId()); addedFile.debug(escidocItem.getESciDocId() + "\n"); } else { notAddedObjects.add(obj.echoUrl); notAddedFile.debug(obj.echoUrl); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } } } if (logger.getLevel() == Level.DEBUG) { for (String addedObject : addedObjects) { logger.debug(addedObject); } } return true; } /** Read objects into eScidoc or updates the objects if indexMeta has changed. 
* @param type restrict the imported objects to a specific type, possible types should be defined in * the given importer @see {@link #importer} * @return * @throws ConnectorException * @throws TransformerException * @throws ESciDocXmlObjectException */ public Boolean readObjectsFromInstance(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException { ArrayList<String> addedObjects = new ArrayList<String>(); ArrayList<String> notAddedObjects = new ArrayList<String>(); for (ECHOObject obj : importer.getObjectList(type)) { if (obj==null) continue; if (ECHORessource.class.isInstance(obj)) { try { // checke zuerst, ob die MD5 schon im publiziert Teil der Metadaten ist, dann tue nichts String md5 = ((ECHORessource) obj).getIndexMetaMD5onServer(); //List<eSciDocXmlObject> results = connector.getObjectsFromSearch("escidoc.component.checksum",md5); List<eSciDocXmlObject> results = connector.getObjectsFromSearch("escidoc.component.checksum",md5,context); if (results.size()>0){ //index.meta schon abgespeichert notAddedFile.debug("alredy exist:"+obj.echoUrl); ((ECHORessource) obj).writeEsciDocIDToIndexMeta(results.get(0)); continue; } ECHOObject old; try { old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath, context,"="); if (old==null){ //FIXME Problem bei der Erfassungder Metadaten sollte eigentlich nicht vorkommen! 
old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath.replace("/mpiwg/online/permanent", "/Volumes/online_permanent"), context,"="); } if (old==null){ old = connector.alreadyExists( "admin.archivePath", ((ECHORessource) obj).archivePath.replace("/mpiwg/online/experimental", "/Volumes/online_experimental"), context,"="); } } catch (ObjectNotUniqueError e) { // TODO Auto-generated catch block e.printStackTrace(); continue; } if (old!=null) { logger.debug("already exist:" + ((ECHORessource) obj).archivePath); handleExistingObject(obj,old); continue; } } catch (ConnectorException e) { logger.debug("already exist error:"); e.printStackTrace(); continue; } } obj.context = context; String contid=null; if (obj.pid!=null) contid = connector.getIDfromPID(obj.pid, context); if (contid != null) { System.out.println("------- belongsTo:" + contid); } else { eSciDocXmlObject escidocItem = transformer.transform(obj); try { logger.info(escidocItem.printXML()); // TODO write PID to back to echo-obj Boolean result = connector.createItem(escidocItem); if (result) { addedObjects.add(escidocItem.getESciDocId()); addedFile.debug(escidocItem.getESciDocId() + "\n"); } else { notAddedObjects.add(obj.echoUrl); notAddedFile.debug(obj.echoUrl); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } catch (JDOMException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new ESciDocXmlObjectException(); } } } if (logger.getLevel() == Level.DEBUG) { for (String addedObject : addedObjects) { logger.debug(addedObject); } } return true; } /** * Deal with existing objects, do nothing if md5 of stored metadata and metadata on the server is the same otherwise call {@link #updateObject(ECHOObject)}. 
* @param objNew * @param old * @throws TransformerException * @throws ESciDocXmlObjectException */ private void handleExistingObject(ECHOObject objNew, ECHOObject old) throws TransformerException, ESciDocXmlObjectException { ECHORessource objNewRes = (ECHORessource)objNew; ECHORessource objOldRes = (ECHORessource)old; String md5onServer = objNewRes.getIndexMetaMD5onServer(); String md5=objOldRes.getIndexMetaMD5stored(); objNewRes.writeEsciDocIDToIndexMeta(objOldRes.eScidocId); if (md5onServer.equals(md5)) return; else { updateObject(objNew, old); } } private void updateObject(ECHOObject objNew, ECHOObject objOld) throws TransformerException, ESciDocXmlObjectException { objNew.context = context; //erzeuge erst einmal ein neues XML Object aus den neuen Daten. eSciDocXmlObject escidocItem = transformer.transform(objNew); String lastModificationDateOld = objOld.lastModificationDate; //jetzt das alte ModeificationDate dort rein (wegen, optimitistic locking) escidocItem.setLastModificationDate(lastModificationDateOld); try { HttpResponse ret = connector.eScidocPut("/ir/item/"+objOld.eScidocId, EScidocBasicHandler.convertStringToStream(escidocItem.printXML())); HttpEntity ent = ret.getEntity(); if (ret.getStatusLine().getStatusCode() != 200) { logger.debug("Can not update:" + objOld.eScidocId); // res.getEntity().consumeContent(); // necessary to release // the conneciton ent.consumeContent(); } InputStream restream = ret.getEntity().getContent(); logger.debug(EScidocBasicHandler.convertStreamToString(restream)); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } addedFile.debug("updated:"+objOld.eScidocId); } /** * @param command * @param objectXPath * @param comment der in escidoc bei der Operation abgespeichert wird. 
* @param mode 0 : only submit, 1:only release, 2:release and submit * @throws Exception */ public void releaseAndSubmitObjects(String command, String objectXPath,String comment,int mode) throws Exception { Integer numberOfHits = connector.getNumberOfHitsFromFilterResult( command, objectXPath,mode); int tausend = ((numberOfHits-1) / MAX_REC); String queryRestrict=""; if(mode==0 | mode==2){ queryRestrict="query=%22/properties/version/status%22=pending"; } else { queryRestrict="query=%22/properties/version/status%22=submitted"; } for (int t = 0; t <= tausend; t++) { int start = t * MAX_REC+1; // int max=Math.min((t+1)*1000, numberOfHits); String query = "?maximumRecords="+String.valueOf(MAX_REC)+"&startRecord=" + String.valueOf(start)+"&"+queryRestrict; for (eSciDocXmlObject obj : connector .getObjectsFromFilterResult(command+query, objectXPath)) { //TODO is the following really necessary, currently the obj in the list is sometimes not the current one. try{ HttpResponse resObj = connector.eScidocGet(obj.getESciDocId()); HttpEntity ent = resObj.getEntity(); if (ent!=null){ obj= new eSciDocXmlObject(ent.getContent()); } else { logger.debug("Can not retrieve:" + obj.getESciDocId()); continue; } } catch (Exception e){ logger.debug("Can not retrieve:" + obj.getESciDocId()); continue; } if (mode==0 | mode==2){ HttpResponse res = connector.submitAnObject(obj, comment); logger.debug(res.getStatusLine()); if (res.getStatusLine().getStatusCode() != 200) { logger.debug("Can not submit:" + obj.getESciDocId()); // res.getEntity().consumeContent(); // necessary to release // the conneciton } InputStream restream = res.getEntity().getContent(); logger.debug(EScidocBasicHandler.convertStreamToString(restream)); //res.getEntity().consumeContent(); // necessary to release the // conneciton if (!connector.upDateObject(obj)) { logger.debug("Can not update:" + obj.getESciDocId()); // continue; } } if (mode==1 | mode==2){ HttpResponse res = connector.releaseAnObject(obj, comment); 
logger.debug(res.getStatusLine()); if (res.getStatusLine().getStatusCode() != 200) { logger.debug("Can not release:" + obj.getESciDocId()); res.getEntity().consumeContent(); // necessary to release // the conneciton continue; } addedFile.debug("RELEASED:" + obj.getESciDocId()); res.getEntity().consumeContent(); // necessary to release the // connecito } } } } public static void main(String[] args) throws Exception { Logger rl = Logger.getRootLogger(); DOMConfigurator.configure("/etc/escidocImportConfig.xml"); rl.setLevel(Level.DEBUG); EScidocBasicHandler connector = new EScidocBasicHandler("escidoc-test.mpiwg-berlin.mpg.de",8080,"dwinter","weikiki7"); //ECHOImporter newimporter = new ECHOImporter(new URL( // "file:///Users/dwinter/libcoll.rdf")); ECHOImporter newimporter = new ECHOImporter(new URL( "http://xserve09.mpiwg-berlin.mpg.de:19280/echo_nav/echo_pages/content/showRDF")); ESciDocDataHarvester hv = new ESciDocDataHarvester(newimporter, new ECHOTransformer(), connector, "/ir/context/escidoc:1001"); // hv.readObjectsFromInstance("ECHO_collection"); hv.readObjectsFromInstance("ECHO_resource"); hv.releaseAndSubmitObjects( "/ir/context/escidoc:1001/resources/members", "//escidocItem:item","first release",0); hv.releaseAndSubmitObjects( "/ir/context/escidoc:1001/resources/members", "//escidocItem:item","first release",1); // newimporter.organizeRessourcesInCollections(connector, // "/ir/context/escidoc:1001"); // hv.releaseAndSubmitObjects("/ir/containers","//container:container"); } }