package de.mpiwg.itgroup.eSciDoc.harvesting;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.jdom.JDOMException;

import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler;
import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
import de.mpiwg.itgroup.eSciDoc.exceptions.ConnectorException;
import de.mpiwg.itgroup.eSciDoc.exceptions.ESciDocXmlObjectException;
import de.mpiwg.itgroup.eSciDoc.exceptions.ObjectNotUniqueError;
import de.mpiwg.itgroup.eSciDoc.exceptions.TransformerException;
import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter;
import de.mpiwg.itgroup.eSciDoc.importer.Importer;
import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer;
import de.mpiwg.itgroup.eSciDoc.transformer.Transformer;
import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;

/**
 * Main class for harvesting data from index.meta files into eScidoc.
 *
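 * <p>A minimal usage sketch; host, credentials, RDF URL and context id below are illustrative
 * placeholders, not a fixed configuration (compare {@link #main(String[])}):
 * <pre>{@code
 * EScidocBasicHandler connector = new EScidocBasicHandler("escidoc.example.org", 8080, "user", "password");
 * ECHOImporter importer = new ECHOImporter(new URL("http://echo.example.org/content/showRDF"));
 * ESciDocDataHarvester harvester = new ESciDocDataHarvester(
 *         importer, new ECHOTransformer(), connector, "/ir/context/escidoc:1001");
 * harvester.readObjectsFromInstance("ECHO_resource");
 * }</pre>
 *
 * @author dwinter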
 */
public class ESciDocDataHarvester {
	static int MAX_REC=1000; // maximum number of records that are read or processed in one chunk.
	//static int MAX_REC=5;
	protected Logger logger = Logger.getRootLogger();
	protected Importer importer;
	protected EScidocBasicHandler connector;
	protected Transformer transformer;

	private String context;
	private Logger addedFile = Logger.getLogger("addedFilesLogger");
	private Logger notAddedFile = Logger.getLogger("notAddedFilesLogger");

	/**
	 * @param importer Importer for the data objects; describes how to access the objects
	 * @param transformer Transformer that generates the eScidoc metadata sets
	 * @param connector connects to the eScidoc repository
	 * @param context eScidoc context path, e.g. /ir/context/escidoc:12001
	 */
	public ESciDocDataHarvester(Importer importer, Transformer transformer,
			EScidocBasicHandler connector, String context) {
		this.importer = importer;
		this.transformer = transformer;
		this.connector = connector;
		//this.tools = new EScidocTools(connector);
		this.context = context;

	}

	/** Reads objects into eScidoc or updates them if the index.meta has changed.
	 * @param type restricts the imported objects to a specific type; possible types are defined by
	 * the given importer, see {@link #importer}
	 * @return true when the run has finished
	 * @throws ConnectorException
	 * @throws TransformerException
	 * @throws ESciDocXmlObjectException
	 */
	@Deprecated
	public Boolean readObjectsFromInstanceOLD(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException {
		ArrayList<String> addedObjects = new ArrayList<String>();
		ArrayList<String> notAddedObjects = new ArrayList<String>();
		for (ECHOObject obj : importer.getObjectList(type)) {

			if (ECHORessource.class.isInstance(obj)) {
				try {
					ECHOObject old;
					try {
						old = connector.alreadyExists( 
								"admin.archivePath",
								((ECHORessource) obj).archivePath, context,"=");
						if (old==null){ //FIXME problem while capturing the metadata; should not actually happen!
							old = connector.alreadyExists( 
									"admin.archivePath",
									((ECHORessource) obj).archivePath.replace("/mpiwg/online/permanent", "/Volumes/online/permanent"), context,"=");
						}
						if (old==null){
							old = connector.alreadyExists( 
									"admin.archivePath",
									((ECHORessource) obj).archivePath.replace("/mpiwg/online/experimental", "/Volumes/online/experimental"), context,"=");
						}
					} catch (ObjectNotUniqueError e) {
						// more than one object matches this archive path; log and skip it
						e.printStackTrace();
						continue;
					} 
					if (old!=null) {
						logger.debug("already exist:"
								+ ((ECHORessource) obj).archivePath);
						handleExistingObject(obj,old);
						continue;
					}
				} catch (ConnectorException e) {
					logger.debug("already exist error:");
					e.printStackTrace();
					continue;
				}
			}

			obj.context = context;

			String contid = connector.getIDfromPID(obj.pid, context);
			if (contid != null) {
				System.out.println("------- belongsTo:" + contid);
			} else {

				eSciDocXmlObject escidocItem = transformer.transform(obj);


				try {
					logger.info(escidocItem.printXML());
					// TODO write the PID back to the echo object
					Boolean result = connector.createItem(escidocItem);
					if (result) {
						addedObjects.add(escidocItem.getESciDocId());
						addedFile.debug(escidocItem.getESciDocId() + "\n");

					} else {
						notAddedObjects.add(obj.echoUrl);
						notAddedFile.debug(obj.echoUrl);

					}

				} catch (IOException e) {
					// serializing or sending the item failed; wrap it in a domain exception
					e.printStackTrace();
					throw new ESciDocXmlObjectException();
				} catch (JDOMException e) {
					// the generated item XML could not be processed; wrap it in a domain exception
					e.printStackTrace();
					throw new ESciDocXmlObjectException();
				}

			}
		}
		if (logger.getLevel() == Level.DEBUG) {
			for (String addedObject : addedObjects) {
				logger.debug(addedObject);
			}
		}

		return true;
	}


	/** Reads objects into eScidoc or updates them if the index.meta has changed.
	 * @param type restricts the imported objects to a specific type; possible types are defined by
	 * the given importer, see {@link #importer}
	 * @return true when the run has finished
	 * @throws ConnectorException
	 * @throws TransformerException
	 * @throws ESciDocXmlObjectException
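	 *
	 * <p>Rough sketch of the per-object flow (see the method body, this is not a contract): if an
	 * item with the same index.meta checksum already exists, only the eScidoc id is written back;
	 * if an item is found via the archive path, it is updated when the stored index.meta differs;
	 * otherwise the object is transformed and created as a new item. Typical call, with the type
	 * name used in {@link #main(String[])}:
	 * <pre>{@code
	 * harvester.readObjectsFromInstance("ECHO_resource");
	 * }</pre>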
	 */
	public Boolean readObjectsFromInstance(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException {
		ArrayList<String> addedObjects = new ArrayList<String>();
		ArrayList<String> notAddedObjects = new ArrayList<String>();
		for (ECHOObject obj : importer.getObjectList(type)) {
			if (obj==null)
				continue;
			if (ECHORessource.class.isInstance(obj)) {
				try {

					// first check whether this MD5 already exists in the published part of the metadata; if so, do nothing
					String md5 = ((ECHORessource) obj).getIndexMetaMD5onServer();
					//List<eSciDocXmlObject> results = connector.getObjectsFromSearch("escidoc.component.checksum",md5);
					List<eSciDocXmlObject> results = connector.getObjectsFromSearch("escidoc.component.checksum",md5,context);
					if (results.size()>0){ // index.meta already stored in the repository
						notAddedFile.debug("already exists:"+obj.echoUrl);

						((ECHORessource) obj).writeEsciDocIDToIndexMeta(results.get(0));

						continue;
					}
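
					// no item with this checksum was found: fall back to a lookup by archive path,
					// also trying the alternative mount-point spellings of the path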
					ECHOObject old;
					try {

						old = connector.alreadyExists( 
								"admin.archivePath",
								((ECHORessource) obj).archivePath, context,"=");
						if (old==null){ //FIXME problem while capturing the metadata; should not actually happen!
							old = connector.alreadyExists( 
									"admin.archivePath",
									((ECHORessource) obj).archivePath.replace("/mpiwg/online/permanent", "/Volumes/online_permanent"), context,"=");
						}
						if (old==null){
							old = connector.alreadyExists( 
									"admin.archivePath",
									((ECHORessource) obj).archivePath.replace("/mpiwg/online/experimental", "/Volumes/online_experimental"), context,"=");
						}
					} catch (ObjectNotUniqueError e) {
						// more than one object matches this archive path; log and skip it
						e.printStackTrace();
						continue;
					} 
					if (old!=null) {
						logger.debug("already exist:"
								+ ((ECHORessource) obj).archivePath);
						handleExistingObject(obj,old);
						continue;
					}
				} catch (ConnectorException e) {
					logger.debug("already exist error:");
					e.printStackTrace();
					continue;
				}
			}

			obj.context = context;
			
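			// if the object already carries a PID that resolves to an existing item, only report it;
			// otherwise transform the object and create a new item in the repository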
			String contid=null;
			
			if (obj.pid!=null)
				contid = connector.getIDfromPID(obj.pid, context);
			if (contid != null) {
				System.out.println("------- belongsTo:" + contid);
			} else {

				eSciDocXmlObject escidocItem = transformer.transform(obj);


				try {
					logger.info(escidocItem.printXML());
					// TODO write the PID back to the echo object
					Boolean result = connector.createItem(escidocItem);
					if (result) {
						addedObjects.add(escidocItem.getESciDocId());
						addedFile.debug(escidocItem.getESciDocId() + "\n");

					} else {
						notAddedObjects.add(obj.echoUrl);
						notAddedFile.debug(obj.echoUrl);

					}

				} catch (IOException e) {
					// serializing or sending the item failed; wrap it in a domain exception
					e.printStackTrace();
					throw new ESciDocXmlObjectException();
				} catch (JDOMException e) {
					// the generated item XML could not be processed; wrap it in a domain exception
					e.printStackTrace();
					throw new ESciDocXmlObjectException();
				}

			}
		}
		if (logger.getLevel() == Level.DEBUG) {
			for (String addedObject : addedObjects) {
				logger.debug(addedObject);
			}
		}

		return true;
	}




	/**
	 * Deals with an existing object: does nothing if the MD5 of the stored metadata and of the
	 * index.meta on the server are the same, otherwise calls {@link #updateObject(ECHOObject, ECHOObject)}.
	 * @param objNew the newly harvested object
	 * @param old the object already stored in eScidoc
	 * @throws TransformerException 
	 * @throws ESciDocXmlObjectException 
	 */
	private void handleExistingObject(ECHOObject objNew, ECHOObject old) throws TransformerException, ESciDocXmlObjectException {
		ECHORessource objNewRes = (ECHORessource)objNew;
		ECHORessource objOldRes = (ECHORessource)old;
		String md5onServer = objNewRes.getIndexMetaMD5onServer();
		String md5=objOldRes.getIndexMetaMD5stored();
		
		objNewRes.writeEsciDocIDToIndexMeta(objOldRes.eScidocId);
		
		if (md5onServer.equals(md5))
			return;

		updateObject(objNew, old);
	}

	private void updateObject(ECHOObject objNew, ECHOObject objOld) throws TransformerException, ESciDocXmlObjectException {
		objNew.context = context;
		
		// first create a new eSciDoc XML object from the new data
		eSciDocXmlObject escidocItem = transformer.transform(objNew);
		String lastModificationDateOld = objOld.lastModificationDate;

		// put the old modification date into the new object (required for eSciDoc's optimistic locking)
		escidocItem.setLastModificationDate(lastModificationDateOld);
		try {
			HttpResponse ret = connector.eScidocPut("/ir/item/"+objOld.eScidocId, EScidocBasicHandler.convertStringToStream(escidocItem.printXML()));
			HttpEntity ent = ret.getEntity();
			if (ret.getStatusLine().getStatusCode() != 200) {
				logger.debug("Can not update:" + objOld.eScidocId);
				ent.consumeContent(); // necessary to release the connection
				return;
			}
			InputStream restream = ent.getContent();
			logger.debug(EScidocBasicHandler.convertStreamToString(restream));
		} catch (UnsupportedEncodingException e) {
			// the item XML could not be encoded for the request
			e.printStackTrace();
		} catch (IOException e) {
			// communication with the repository failed; the failure is only logged
			e.printStackTrace();
		}

		addedFile.debug("updated:"+objOld.eScidocId);
	}

	/**
	 * @param command filter URL of the objects to be processed, e.g. /ir/context/escidoc:1001/resources/members
	 * @param objectXPath XPath selecting the objects in the filter result, e.g. //escidocItem:item
	 * @param comment comment that is stored in eScidoc with the operation
	 * @param mode 0: only submit, 1: only release, 2: submit and release
	 * @throws Exception
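	 *
	 * <p>Example call (sketch; context id and XPath as used in {@link #main(String[])}):
	 * <pre>{@code
	 * harvester.releaseAndSubmitObjects(
	 *         "/ir/context/escidoc:1001/resources/members",
	 *         "//escidocItem:item", "first release", 2); // mode 2: submit and then release
	 * }</pre>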
	 */
	public void releaseAndSubmitObjects(String command, String objectXPath,String comment,int mode)
			throws Exception {

		Integer numberOfHits = connector.getNumberOfHitsFromFilterResult(
				command, objectXPath,mode);


		int tausend = ((numberOfHits-1) / MAX_REC); // zero-based index of the last batch of MAX_REC records

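		// restrict the filter query to the version status that the requested transition starts from:
		// submit operates on "pending" items, release on "submitted" items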
		String queryRestrict="";
		if(mode==0 || mode==2){
			queryRestrict="query=%22/properties/version/status%22=pending";
		} else {
			queryRestrict="query=%22/properties/version/status%22=submitted";
		}

		for (int t = 0; t <= tausend; t++) {
			int start = t * MAX_REC+1;
			// int max=Math.min((t+1)*1000, numberOfHits);
			String query = "?maximumRecords="+String.valueOf(MAX_REC)+"&startRecord="
					+ String.valueOf(start)+"&"+queryRestrict;
			for (eSciDocXmlObject obj : connector
					.getObjectsFromFilterResult(command+query, objectXPath)) {

				//TODO is the re-fetch below really necessary? Currently the obj in the list is sometimes not the current one.
				try{
					HttpResponse resObj = connector.eScidocGet(obj.getESciDocId());
					HttpEntity ent = resObj.getEntity();
					if (ent!=null){
						obj= new eSciDocXmlObject(ent.getContent());
					} else {
						logger.debug("Can not retrieve:" + obj.getESciDocId());
						continue;
					}
				} catch (Exception e){
					logger.debug("Can not retrieve:" + obj.getESciDocId());
					continue;
				}
				if (mode==0 || mode==2){
					HttpResponse res = connector.submitAnObject(obj,
							comment);
					logger.debug(res.getStatusLine());

					if (res.getStatusLine().getStatusCode() != 200) {
						logger.debug("Can not submit:" + obj.getESciDocId());
						// res.getEntity().consumeContent(); // necessary to release the connection

					}
					InputStream restream = res.getEntity().getContent();
					logger.debug(EScidocBasicHandler.convertStreamToString(restream));
					//res.getEntity().consumeContent(); // necessary to release the connection

					if (!connector.upDateObject(obj)) {
						logger.debug("Can not update:" + obj.getESciDocId());
						// continue;

					}
				}

				if (mode==1 || mode==2){
					HttpResponse res = connector.releaseAnObject(obj, comment);
					logger.debug(res.getStatusLine());
					if (res.getStatusLine().getStatusCode() != 200) {
						logger.debug("Can not release:" + obj.getESciDocId());
						res.getEntity().consumeContent(); // necessary to release the connection
						continue;
					}
					addedFile.debug("RELEASED:" + obj.getESciDocId());
					res.getEntity().consumeContent(); // necessary to release the connection
				}
			}
		}

	}

	public static void main(String[] args) throws Exception {

		Logger rl = Logger.getRootLogger();
		DOMConfigurator.configure("/etc/escidocImportConfig.xml");
		rl.setLevel(Level.DEBUG);


		EScidocBasicHandler connector = new EScidocBasicHandler("escidoc-test.mpiwg-berlin.mpg.de",8080,"dwinter","weikiki7");


		//ECHOImporter newimporter = new ECHOImporter(new URL(
		//		"file:///Users/dwinter/libcoll.rdf"));

		ECHOImporter newimporter = new ECHOImporter(new URL(
				"http://xserve09.mpiwg-berlin.mpg.de:19280/echo_nav/echo_pages/content/showRDF"));
		ESciDocDataHarvester hv = new ESciDocDataHarvester(newimporter,
				new ECHOTransformer(), connector, "/ir/context/escidoc:1001");

		// hv.readObjectsFromInstance("ECHO_collection");
		hv.readObjectsFromInstance("ECHO_resource");

		hv.releaseAndSubmitObjects(
				"/ir/context/escidoc:1001/resources/members",
				"//escidocItem:item","first release",0);
		hv.releaseAndSubmitObjects(
				"/ir/context/escidoc:1001/resources/members",
				"//escidocItem:item","first release",1);


		// newimporter.organizeRessourcesInCollections(connector,
		// "/ir/context/escidoc:1001");
		// hv.releaseAndSubmitObjects("/ir/containers","//container:container");
	}
}