diff src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java @ 3:58b52df9763c

added update functionality if index.meta has changed
author dwinter
date Wed, 12 Jan 2011 11:00:14 +0100
parents fab8e78184fa
children cb5668b07bfc
line wrap: on
line diff
--- a/src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java	Mon Jan 10 12:42:27 2011 +0100
+++ b/src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java	Wed Jan 12 11:00:14 2011 +0100
@@ -1,16 +1,14 @@
 package de.mpiwg.itgroup.eSciDoc.harvesting;
 
-import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.MalformedURLException;
+import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Map;
 
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpResponse;
-import org.apache.log4j.BasicConfigurator;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.xml.DOMConfigurator;
@@ -20,50 +18,81 @@
 import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
 import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
 import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
+import de.mpiwg.itgroup.eSciDoc.exceptions.ConnectorException;
+import de.mpiwg.itgroup.eSciDoc.exceptions.ESciDocXmlObjectException;
+import de.mpiwg.itgroup.eSciDoc.exceptions.ObjectNotUniqueError;
+import de.mpiwg.itgroup.eSciDoc.exceptions.TransformerException;
 import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter;
 import de.mpiwg.itgroup.eSciDoc.importer.Importer;
 import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer;
 import de.mpiwg.itgroup.eSciDoc.transformer.Transformer;
 import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;
 
+/**
+ * @author dwinter
+ * Main class for data harvesting from index.meta files into eSciDoc
+ */
 public class ESciDocDataHarvester {
-	static int MAX_REC=1000;
+	static int MAX_REC=1000; // maximum number of records that are read in or processed in one chunk.
 	//static int MAX_REC=5;
 	protected Logger logger = Logger.getRootLogger();
 	protected Importer importer;
 	protected EScidocBasicHandler connector;
 	protected Transformer transformer;
-	private EScidocTools tools;
+	
 	private String echoContext;
 	private Logger addedFile = Logger.getLogger("addedFilesLogger");
 	private Logger notAddedFile = Logger.getLogger("notAddedFilesLogger");
 
+	/**
+	 * @param importer Importer for dataObjects, describes how to access the objects
+	 * @param transformer Transformer, generates the eScidocMetaDatasets
+	 * @param connector connects to the eScidocRepository	
+	 * @param context eSciDoc context path, e.g. /ir/context/escidoc:12001
+	 */
 	public ESciDocDataHarvester(Importer importer, Transformer transformer,
-			EScidocBasicHandler connector, String context) throws IOException {
+			EScidocBasicHandler connector, String context) {
 		this.importer = importer;
 		this.transformer = transformer;
 		this.connector = connector;
-		this.tools = new EScidocTools(connector);
+		//this.tools = new EScidocTools(connector);
 		this.echoContext = context;
 
 	}
 
-	public Boolean readObjectsFromInstance(String type) throws Exception {
+	/** Reads objects into eSciDoc, or updates existing objects if their index.meta has changed.
+	 * @param type restricts the imported objects to a specific type; possible types should be defined by
+	 * the given importer, see {@link #importer}
+	 * @return
+	 * @throws ConnectorException
+	 * @throws TransformerException
+	 * @throws ESciDocXmlObjectException
+	 */
+	public Boolean readObjectsFromInstance(String type) throws ConnectorException, TransformerException, ESciDocXmlObjectException {
 		ArrayList<String> addedObjects = new ArrayList<String>();
 		ArrayList<String> notAddedObjects = new ArrayList<String>();
 		for (ECHOObject obj : importer.getObjectList(type)) {
 
 			if (ECHORessource.class.isInstance(obj)) {
 				try {
-					if (connector.alreadyExists(
-							"/md-records/md-record/admin/archivePath",
-							((ECHORessource) obj).archivePath, echoContext)) {
+					ECHOObject old;
+					try {
+						old = connector.alreadyExists(
+								"/md-records/md-record/admin/archivePath",
+								((ECHORessource) obj).archivePath, echoContext);
+					} catch (ObjectNotUniqueError e) {
+						// TODO Auto-generated catch block
+						e.printStackTrace();
+						continue;
+					} 
+					if (old!=null) {
 						logger.debug("already exist:"
 								+ ((ECHORessource) obj).archivePath);
+						handleExistingObject(obj,old);
 						continue;
 					}
-				} catch (Exception e) {
-					logger.debug("already exist error");
+				} catch (ConnectorException e) {
+					logger.debug("already exist error:");
 					e.printStackTrace();
 					continue;
 				}
@@ -77,28 +106,32 @@
 			} else {
 
 				eSciDocXmlObject escidocItem = transformer.transform(obj);
-				logger.info(escidocItem.printXML());
-				// TODO write PID to back to echo-obj
-				Boolean result = connector.createItem(escidocItem);
-				if (result) {
-					addedObjects.add(escidocItem.getESciDocId());
-					addedFile.debug(escidocItem.getESciDocId() + "\n");
-					// addedFile.write(escidocItem.getESciDocId()+"\n");
-					// addedFile.flush();
+				
+		
+					try {
+						logger.info(escidocItem.printXML());
+						// TODO write PID to back to echo-obj
+						Boolean result = connector.createItem(escidocItem);
+						if (result) {
+							addedObjects.add(escidocItem.getESciDocId());
+							addedFile.debug(escidocItem.getESciDocId() + "\n");
 
-				} else {
-					notAddedObjects.add(obj.echoUrl);
-					notAddedFile.debug(obj.echoUrl);
-					// notAddedFile.write(obj.echoUrl+"\n");
-					// notAddedFile.flush();
-				}
-				// if (result == ESciDocConnector.WRITE_RESULT_PID_EXISTS){
-				// logger.info("PID already exists:"+obj);
-				// } else if (result ==
-				// ESciDocConnector.WRITE_RESULT_OBJ_WITH_SAME_REFERENCE){
-				// logger.info("Object with reference to the same digital object already exists:"+obj);
-				// }
-
+						} else {
+							notAddedObjects.add(obj.echoUrl);
+							notAddedFile.debug(obj.echoUrl);
+				
+						}
+					
+					} catch (IOException e) {
+						// TODO Auto-generated catch block
+						e.printStackTrace();
+						throw new ESciDocXmlObjectException();
+					} catch (JDOMException e) {
+						// TODO Auto-generated catch block
+						e.printStackTrace();
+						throw new ESciDocXmlObjectException();
+					}
+				
 			}
 		}
 		if (logger.getLevel() == Level.DEBUG) {
@@ -107,18 +140,61 @@
 			}
 		}
 
-		// File outFile = new File("/tmp/import.out");
-		// FileWriter fw = new FileWriter(outFile);
-		// for (String addedObject:addedObjects){
-		// fw.write(addedObject+"\n");
-		// }
-		// for (String addedObject:notAddedObjects){
-		// fw.write(addedObject+"\n");
-		// }
-		// fw.close();
 		return true;
 	}
 
+	
+		
+	
+
+	/**
+	 * Deals with existing objects: does nothing if the MD5 of the stored metadata equals the MD5 of the metadata on the server, otherwise calls {@link #updateObject(ECHOObject, ECHOObject)}.
+	 * @param objNew freshly imported object; cast to ECHORessource and asked for the index.meta MD5 on the server
+	 * @param old object returned by the existence lookup; cast to ECHORessource and asked for the stored index.meta MD5
+	 * @throws TransformerException if thrown by {@link #updateObject(ECHOObject, ECHOObject)}
+	 * @throws ESciDocXmlObjectException if thrown by {@link #updateObject(ECHOObject, ECHOObject)}
+	 */
+	private void handleExistingObject(ECHOObject objNew, ECHOObject old) throws TransformerException, ESciDocXmlObjectException {
+		ECHORessource objNewRes = (ECHORessource)objNew; // unchecked cast: both arguments must be ECHORessource instances
+		ECHORessource objOldRes = (ECHORessource)old;
+		String md5onServer = objNewRes.getIndexMetaMD5onServer();
+		String md5=objOldRes.getIndexMetaMD5stored();
+		if (md5onServer.equals(md5)) // NOTE(review): throws NullPointerException if md5onServer is null - confirm getIndexMetaMD5onServer() never returns null
+			return;
+		else {
+			updateObject(objNew, old);
+		}
+		
+	}
+
+	private void updateObject(ECHOObject objNew, ECHOObject objOld) throws TransformerException, ESciDocXmlObjectException { // transforms objNew and PUTs the resulting XML to the item identified by objOld.eScidocId
+		objNew.context = echoContext;
+		eSciDocXmlObject escidocItem = transformer.transform(objNew);
+		String lastModificationDateOld = objOld.lastModificationDate;
+		escidocItem.setLastModificationDate(lastModificationDateOld); // carries over the old item's modification date - presumably required by eSciDoc to accept the update; TODO confirm
+		try {
+			HttpResponse ret = connector.eScidocPut(objOld.eScidocId, EScidocBasicHandler.convertStringToStream(escidocItem.printXML()));
+			HttpEntity ent = ret.getEntity();
+			if (ret.getStatusLine().getStatusCode() != 200) {
+				logger.debug("Can not update:" + objOld.eScidocId);
+				// consuming the content is necessary to release
+				// the connection
+				ent.consumeContent();
+
+			}
+			InputStream restream = ret.getEntity().getContent(); // NOTE(review): also reached after the non-200 branch, where the entity content has already been consumed - confirm this is intended
+			logger.debug(EScidocBasicHandler.convertStreamToString(restream));
+		} catch (UnsupportedEncodingException e) {
+			// NOTE(review): the exception is only printed and not propagated to the caller
+			e.printStackTrace();
+		} catch (IOException e) {
+			// NOTE(review): the exception is only printed and not propagated to the caller
+			e.printStackTrace();
+		}
+		
+		
+	}
+
 	/**
 	 * @param command
 	 * @param objectXPath