view src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java @ 3:58b52df9763c

added update functionality if index.meta has changed
author dwinter
date Wed, 12 Jan 2011 11:00:14 +0100
parents fab8e78184fa
children cb5668b07bfc
line wrap: on
line source

package de.mpiwg.itgroup.eSciDoc.harvesting;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Map;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.jdom.JDOMException;

import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler;
import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
import de.mpiwg.itgroup.eSciDoc.exceptions.ConnectorException;
import de.mpiwg.itgroup.eSciDoc.exceptions.ESciDocXmlObjectException;
import de.mpiwg.itgroup.eSciDoc.exceptions.ObjectNotUniqueError;
import de.mpiwg.itgroup.eSciDoc.exceptions.TransformerException;
import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter;
import de.mpiwg.itgroup.eSciDoc.importer.Importer;
import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer;
import de.mpiwg.itgroup.eSciDoc.transformer.Transformer;
import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;

/**
 * Main class for data harvesting from index.meta files into eSciDoc.
 *
 * <p>Objects are listed by an {@link Importer}, converted into eSciDoc items by a
 * {@link Transformer} and written to the repository through an
 * {@link EScidocBasicHandler}.</p>
 *
 * @author dwinter
 */
public class ESciDocDataHarvester {
	/** Maximum number of records that are read or processed in one chunk. */
	static int MAX_REC=1000;
	//static int MAX_REC=5;
	protected Logger logger = Logger.getRootLogger();
	protected Importer importer;
	protected EScidocBasicHandler connector;
	protected Transformer transformer;

	/** eSciDoc context path all harvested objects are assigned to. */
	private String echoContext;
	/** Receives the ids of items that were created or released. */
	private Logger addedFile = Logger.getLogger("addedFilesLogger");
	/** Receives the URLs of objects that could not be created. */
	private Logger notAddedFile = Logger.getLogger("notAddedFilesLogger");

	/**
	 * @param importer Importer for dataObjects, describes how to access the objects
	 * @param transformer Transformer, generates the eScidocMetaDatasets
	 * @param connector connects to the eScidocRepository
	 * @param context eSciDoc context path, e.g. /ir/context/escidoc:12001
	 */
	public ESciDocDataHarvester(Importer importer, Transformer transformer,
			EScidocBasicHandler connector, String context) {
		this.importer = importer;
		this.transformer = transformer;
		this.connector = connector;
		this.echoContext = context;
	}

	/**
	 * Reads objects into eSciDoc, or updates them if their index.meta has changed.
	 *
	 * <p>For every object delivered by the importer: an {@link ECHORessource} whose
	 * archive path is already stored in the context is handed to
	 * {@link #handleExistingObject(ECHOObject, ECHOObject)}; an object whose PID already
	 * resolves to an id in the context is skipped; everything else is transformed and
	 * created as a new item. Lookup failures for a single object are logged and the
	 * object is skipped, the loop goes on with the next one.</p>
	 *
	 * @param type restrict the imported objects to a specific type, possible types should
	 *            be defined in the given {@link #importer}
	 * @return always {@code true}
	 * @throws ConnectorException
	 * @throws TransformerException
	 * @throws ESciDocXmlObjectException if creating the item fails with an I/O or XML error
	 */
	public Boolean readObjectsFromInstance(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException {
		ArrayList<String> addedObjects = new ArrayList<String>();
		ArrayList<String> notAddedObjects = new ArrayList<String>();
		for (ECHOObject obj : importer.getObjectList(type)) {

			if (obj instanceof ECHORessource) {
				ECHORessource resource = (ECHORessource) obj;
				try {
					ECHOObject old;
					try {
						old = connector.alreadyExists(
								"/md-records/md-record/admin/archivePath",
								resource.archivePath, echoContext);
					} catch (ObjectNotUniqueError e) {
						// more than one stored object matches: we cannot decide which one to update
						logger.error("archivePath not unique, skipped:" + resource.archivePath, e);
						continue;
					}
					if (old != null) {
						logger.debug("already exist:" + resource.archivePath);
						handleExistingObject(obj, old);
						continue;
					}
				} catch (ConnectorException e) {
					logger.error("already exist error:" + resource.archivePath, e);
					continue;
				}
			}

			obj.context = echoContext;

			String contid = connector.getIDfromPID(obj.pid, echoContext);
			if (contid != null) {
				// an object with this PID is already known in the context, nothing to create
				logger.info("------- belongsTo:" + contid);
				continue;
			}

			eSciDocXmlObject escidocItem = transformer.transform(obj);
			try {
				logger.info(escidocItem.printXML());
				// TODO write PID to back to echo-obj
				Boolean result = connector.createItem(escidocItem);
				// Boolean.TRUE.equals avoids a NullPointerException on a null result
				if (Boolean.TRUE.equals(result)) {
					addedObjects.add(escidocItem.getESciDocId());
					addedFile.debug(escidocItem.getESciDocId() + "\n");
				} else {
					notAddedObjects.add(obj.echoUrl);
					notAddedFile.debug(obj.echoUrl);
				}
			} catch (IOException e) {
				logger.error("Can not create item for:" + obj.echoUrl, e);
				throw new ESciDocXmlObjectException();
			} catch (JDOMException e) {
				logger.error("Can not create item for:" + obj.echoUrl, e);
				throw new ESciDocXmlObjectException();
			}
		}

		// isDebugEnabled() honours inherited levels and levels below DEBUG
		if (logger.isDebugEnabled()) {
			for (String addedObject : addedObjects) {
				logger.debug(addedObject);
			}
		}

		return true;
	}

	/**
	 * Deals with an object that already exists in the repository: does nothing if the MD5
	 * of the stored metadata and of the metadata on the server are the same, otherwise
	 * calls {@link #updateObject(ECHOObject, ECHOObject)}.
	 *
	 * <p>If the MD5 on the server is not available ({@code null}) no comparison is
	 * possible; the object is left untouched and a warning is logged.</p>
	 *
	 * @param objNew the object as delivered by the importer, must be an {@link ECHORessource}
	 * @param old the object as currently stored, must be an {@link ECHORessource}
	 * @throws TransformerException
	 * @throws ESciDocXmlObjectException
	 */
	private void handleExistingObject(ECHOObject objNew, ECHOObject old) throws TransformerException, ESciDocXmlObjectException {
		ECHORessource objNewRes = (ECHORessource)objNew;
		ECHORessource objOldRes = (ECHORessource)old;
		String md5onServer = objNewRes.getIndexMetaMD5onServer();
		String md5 = objOldRes.getIndexMetaMD5stored();
		if (md5onServer == null) {
			logger.warn("No MD5 of index.meta on server, not updated:" + objNewRes.archivePath);
			return;
		}
		if (md5onServer.equals(md5)) {
			return; // unchanged
		}
		updateObject(objNew, old);
	}

	/**
	 * Transforms {@code objNew} and PUTs the result over the stored item {@code objOld}.
	 * The last modification date of the stored item is copied into the new item before
	 * sending. I/O errors are logged and not propagated.
	 *
	 * @param objNew the current state of the object
	 * @param objOld the stored object, supplies the eSciDoc id and last modification date
	 * @throws TransformerException
	 * @throws ESciDocXmlObjectException
	 */
	private void updateObject(ECHOObject objNew, ECHOObject objOld) throws TransformerException, ESciDocXmlObjectException {
		objNew.context = echoContext;
		eSciDocXmlObject escidocItem = transformer.transform(objNew);
		escidocItem.setLastModificationDate(objOld.lastModificationDate);
		try {
			HttpResponse ret = connector.eScidocPut(objOld.eScidocId,
					EScidocBasicHandler.convertStringToStream(escidocItem.printXML()));
			HttpEntity ent = ret.getEntity();
			if (ret.getStatusLine().getStatusCode() != 200) {
				logger.debug("Can not update:" + objOld.eScidocId);
				// necessary to release the connection; the body must not be read afterwards
				if (ent != null) {
					ent.consumeContent();
				}
				return;
			}
			if (ent != null) {
				logger.debug(EScidocBasicHandler.convertStreamToString(ent.getContent()));
			}
		} catch (IOException e) {
			// also covers UnsupportedEncodingException from convertStringToStream
			logger.error("Can not update:" + objOld.eScidocId, e);
		}
	}

	/**
	 * Submits and/or releases all objects returned by a filter request.
	 *
	 * <p>The result list is fetched in chunks of {@link #MAX_REC} records. For submitting
	 * (mode 0 and 2) the request is restricted to objects in version status "pending",
	 * otherwise to status "submitted". Each listed object is fetched again by its id
	 * before it is processed; objects that cannot be fetched are skipped.</p>
	 *
	 * <p>NOTE(review): the chunks are addressed by {@code startRecord} on a result that is
	 * filtered by the very status this method changes. If the repository reflects status
	 * changes immediately, later chunks may skip objects - confirm with more than
	 * {@code MAX_REC} hits.</p>
	 *
	 * @param command filter request path, e.g. /ir/context/escidoc:1001/resources/members
	 * @param objectXPath XPath selecting the objects in the filter result
	 * @param mode 0 : only submit, 1:only release, 2:release and submit
	 * @throws Exception
	 */
	public void releaseAndSubmitObjects(String command, String objectXPath,int mode)
			throws Exception {

		Integer numberOfHits = connector.getNumberOfHitsFromFilterResult(
				command, objectXPath, mode);
		if (numberOfHits == null) {
			logger.warn("Can not determine number of hits for:" + command);
			return;
		}

		boolean doSubmit = mode == 0 || mode == 2;
		boolean doRelease = mode == 1 || mode == 2;

		// index of the last chunk; integer division, so up to MAX_REC hits give 0
		int lastChunk = (numberOfHits.intValue() - 1) / MAX_REC;

		String queryRestrict = doSubmit
				? "query=%22/properties/version/status%22=pending"
				: "query=%22/properties/version/status%22=submitted";

		for (int chunk = 0; chunk <= lastChunk; chunk++) {
			int start = chunk * MAX_REC + 1; // startRecord is 1-based
			String query = "?maximumRecords=" + String.valueOf(MAX_REC) + "&startRecord="
					+ String.valueOf(start) + "&" + queryRestrict;
			for (eSciDocXmlObject listed : connector
					.getObjectListFromFilterResult(command + query, objectXPath)) {

				//TODO is the following really necessary, currently the obj in the list is sometimes not the current one.
				eSciDocXmlObject obj = fetchCurrentVersion(listed);
				if (obj == null) {
					continue;
				}
				if (doSubmit) {
					submitObject(obj);
				}
				if (doRelease) {
					releaseObject(obj);
				}
			}
		}

	}

	/**
	 * Fetches the object with the id of {@code listed} again from the repository.
	 *
	 * @param listed object taken from a filter result
	 * @return the freshly fetched object, or {@code null} if it could not be retrieved
	 * @throws Exception if the id of {@code listed} cannot be read while logging a failure
	 */
	private eSciDocXmlObject fetchCurrentVersion(eSciDocXmlObject listed) throws Exception {
		try {
			HttpResponse resObj = connector.eScidocGet(listed.getESciDocId());
			HttpEntity ent = resObj.getEntity();
			if (ent != null) {
				return new eSciDocXmlObject(ent.getContent());
			}
			logger.debug("Can not retrieve:" + listed.getESciDocId());
			return null;
		} catch (Exception e) {
			logger.debug("Can not retrieve:" + listed.getESciDocId(), e);
			return null;
		}
	}

	/**
	 * Submits {@code obj}, logs the response and then calls
	 * {@code connector.upDateObject(obj)}. A failed submit or update is only logged.
	 *
	 * @param obj the object to submit
	 * @throws Exception
	 */
	private void submitObject(eSciDocXmlObject obj) throws Exception {
		HttpResponse res = connector.submitAnObject(obj, "first release");
		logger.debug(res.getStatusLine());

		if (res.getStatusLine().getStatusCode() != 200) {
			logger.debug("Can not submit:" + obj.getESciDocId());
		}
		// The body is read and logged instead of being discarded.
		// NOTE(review): presumably reading it to the end also frees the connection - confirm.
		HttpEntity body = res.getEntity();
		if (body != null) {
			logger.debug(EScidocBasicHandler.convertStreamToString(body.getContent()));
		}

		if (!connector.upDateObject(obj)) {
			logger.debug("Can not update:" + obj.getESciDocId());
		}
	}

	/**
	 * Releases {@code obj}; a successful release is written to the added-files log.
	 * The response body is always discarded.
	 *
	 * @param obj the object to release
	 * @throws Exception
	 */
	private void releaseObject(eSciDocXmlObject obj) throws Exception {
		HttpResponse res = connector.releaseAnObject(obj, "first release");
		logger.debug(res.getStatusLine());
		try {
			if (res.getStatusLine().getStatusCode() != 200) {
				logger.debug("Can not release:" + obj.getESciDocId());
			} else {
				addedFile.debug("RELEASED:" + obj.getESciDocId());
			}
		} finally {
			discardBody(res); // necessary to release the connection
		}
	}

	/**
	 * Consumes the body of {@code res}, if there is one.
	 *
	 * @param res the response whose body is no longer needed
	 * @throws IOException if consuming the body fails
	 */
	private void discardBody(HttpResponse res) throws IOException {
		HttpEntity body = res.getEntity();
		if (body != null) {
			body.consumeContent();
		}
	}

	/**
	 * Developer entry point: releases all submitted members of the test context.
	 *
	 * <p>User and password can be overridden with the system properties
	 * {@code escidoc.user} and {@code escidoc.password}.</p>
	 *
	 * @param args not used
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {

		Logger rl = Logger.getRootLogger();
		// NOTE(review): "log4uconf.xml" may be a typo for a log4j config file name - confirm
		DOMConfigurator.configure("log4uconf.xml");
		rl.setLevel(Level.DEBUG);

		// SECURITY: the default credentials below are hard-coded in source. They should be
		// rotated and removed; until then they are kept as defaults so that existing
		// invocations keep working.
		String user = System.getProperty("escidoc.user", "dwinter");
		String password = System.getProperty("escidoc.password", "weikiki7");
		EScidocBasicHandler connector = new EScidocBasicHandler("escidoc-test.mpiwg-berlin.mpg.de",8080,user,password);

		ECHOImporter newimporter = new ECHOImporter(new URL(
				"file:///Users/dwinter/libcoll.rdf"));
		ESciDocDataHarvester hv = new ESciDocDataHarvester(newimporter,
				new ECHOTransformer(), connector, "/ir/context/escidoc:1001");

		// hv.readObjectsFromInstance("ECHO_collection");
		// hv.readObjectsFromInstance("ECHO_resource");

		hv.releaseAndSubmitObjects(
				"/ir/context/escidoc:1001/resources/members",
				"//escidocItem:item",1);

		// newimporter.organizeRessourcesInCollections(connector,
		// "/ir/context/escidoc:1001");
		// hv.releaseAndSubmitObjects("/ir/containers","//container:container");
	}
}