view src/de/mpiwg/itgroup/eSciDoc/harvesting/ESciDocDataHarvester.java @ 2:fab8e78184fa

minor
author dwinter
date Mon, 10 Jan 2011 12:42:27 +0100
parents c6929e63b0b8
children 58b52df9763c
line wrap: on
line source

package de.mpiwg.itgroup.eSciDoc.harvesting;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.jdom.JDOMException;

import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler;
import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
import de.mpiwg.itgroup.eSciDoc.importer.ECHOImporter;
import de.mpiwg.itgroup.eSciDoc.importer.Importer;
import de.mpiwg.itgroup.eSciDoc.transformer.ECHOTransformer;
import de.mpiwg.itgroup.eSciDoc.transformer.Transformer;
import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;

/**
 * Harvests ECHO objects into an eSciDoc repository and moves existing
 * repository objects through the submit / release steps.
 *
 * <p>
 * The harvester pulls objects from an {@link Importer}, converts them with a
 * {@link Transformer} and writes them through an {@link EScidocBasicHandler}.
 * Identifiers of created and released items are written to the
 * {@code addedFilesLogger} logger, URLs of objects that could not be created
 * to the {@code notAddedFilesLogger} logger.
 * </p>
 */
public class ESciDocDataHarvester {
	/**
	 * Page size used when listing objects in
	 * {@link #releaseAndSubmitObjects(String, String, int)}. Left non-final so
	 * that code in this package can lower it (e.g. to 5) for test runs.
	 */
	static int MAX_REC = 1000;

	/** Mode for {@link #releaseAndSubmitObjects}: only submit. */
	public static final int MODE_SUBMIT = 0;
	/** Mode for {@link #releaseAndSubmitObjects}: only release. */
	public static final int MODE_RELEASE = 1;
	/** Mode for {@link #releaseAndSubmitObjects}: submit, then release. */
	public static final int MODE_SUBMIT_AND_RELEASE = 2;

	/** XPath of the archive path inside an item, used for the duplicate check. */
	private static final String ARCHIVE_PATH_XPATH = "/md-records/md-record/admin/archivePath";

	/** Comment sent along with every submit and release request. */
	private static final String RELEASE_COMMENT = "first release";

	protected Logger logger = Logger.getRootLogger();
	protected Importer importer;
	protected EScidocBasicHandler connector;
	protected Transformer transformer;
	private final EScidocTools tools;
	private final String echoContext;
	private final Logger addedFile = Logger.getLogger("addedFilesLogger");
	private final Logger notAddedFile = Logger.getLogger("notAddedFilesLogger");

	/**
	 * Creates a harvester.
	 *
	 * @param importer    source of the ECHO objects
	 * @param transformer converts an ECHO object into an eSciDoc XML object
	 * @param connector   connection to the eSciDoc server
	 * @param context     eSciDoc context the objects are created in, e.g.
	 *                    {@code /ir/context/escidoc:1001}
	 * @throws IOException kept for compatibility with existing callers
	 */
	public ESciDocDataHarvester(Importer importer, Transformer transformer,
			EScidocBasicHandler connector, String context) throws IOException {
		this.importer = importer;
		this.transformer = transformer;
		this.connector = connector;
		this.tools = new EScidocTools(connector);
		this.echoContext = context;
	}

	/**
	 * Reads all objects of the given type from the importer and creates an
	 * eSciDoc item for each one that is not yet known to the repository.
	 *
	 * <p>
	 * An object is skipped when it is an {@link ECHORessource} whose archive
	 * path is already reported as existing (or when that check fails), and
	 * when the connector already returns an id for the object's PID.
	 * </p>
	 *
	 * @param type object type handed to {@link Importer#getObjectList}
	 * @return always {@code true}
	 * @throws Exception if importing, transforming or creating an item fails
	 */
	public Boolean readObjectsFromInstance(String type) throws Exception {
		ArrayList<String> addedObjects = new ArrayList<String>();

		for (ECHOObject obj : importer.getObjectList(type)) {
			if (obj instanceof ECHORessource && shouldSkip((ECHORessource) obj)) {
				continue;
			}

			obj.context = echoContext;

			String contid = connector.getIDfromPID(obj.pid, echoContext);
			if (contid != null) {
				logger.info("------- belongsTo:" + contid);
				continue;
			}

			eSciDocXmlObject escidocItem = transformer.transform(obj);
			logger.info(escidocItem.printXML());
			// TODO write PID back to the echo-obj
			// Boolean.TRUE.equals(..) treats a null result as "not created"
			// instead of failing with a NullPointerException on unboxing.
			if (Boolean.TRUE.equals(connector.createItem(escidocItem))) {
				addedObjects.add(escidocItem.getESciDocId());
				addedFile.debug(escidocItem.getESciDocId() + "\n");
			} else {
				notAddedFile.debug(obj.echoUrl);
			}
		}

		// isDebugEnabled() also covers finer levels (TRACE, ALL) and loggers
		// that inherit their level, which a comparison with Level.DEBUG misses.
		if (logger.isDebugEnabled()) {
			for (String addedObject : addedObjects) {
				logger.debug(addedObject);
			}
		}
		return true;
	}

	/**
	 * Decides whether a resource has to be skipped during import.
	 *
	 * @param resource the resource to check
	 * @return {@code true} if the connector reports the archive path as
	 *         already existing in the context, or if the check itself failed
	 */
	private boolean shouldSkip(ECHORessource resource) {
		try {
			if (connector.alreadyExists(ARCHIVE_PATH_XPATH, resource.archivePath,
					echoContext)) {
				logger.debug("already exist:" + resource.archivePath);
				return true;
			}
			return false;
		} catch (Exception e) {
			// Best effort: a failed check must not abort the whole import,
			// but the cause has to end up in the log.
			logger.warn("already exist error", e);
			return true;
		}
	}

	/**
	 * Submits and/or releases all objects returned by a filter request.
	 *
	 * <p>
	 * The objects are fetched page by page ({@link #MAX_REC} per request).
	 * When submitting is requested, objects with version status
	 * {@code pending} are selected, otherwise objects with status
	 * {@code submitted}. Every listed object is fetched again by id before it
	 * is processed, because the copy in the list is sometimes not the current
	 * one.
	 * </p>
	 *
	 * <p>
	 * NOTE(review): the list is filtered by status while processing
	 * presumably changes that status, so paging by {@code startRecord} may
	 * skip records when there is more than one page - confirm against the
	 * server behaviour.
	 * </p>
	 *
	 * @param command     path of the filter request, e.g.
	 *                    {@code /ir/context/escidoc:1001/resources/members}
	 * @param objectXPath XPath selecting the objects in the filter result
	 * @param mode        {@link #MODE_SUBMIT} (0): only submit,
	 *                    {@link #MODE_RELEASE} (1): only release,
	 *                    {@link #MODE_SUBMIT_AND_RELEASE} (2): submit and
	 *                    release
	 * @throws Exception if a request to the server fails
	 */
	public void releaseAndSubmitObjects(String command, String objectXPath, int mode)
			throws Exception {

		Integer numberOfHits = connector.getNumberOfHitsFromFilterResult(
				command, objectXPath, mode);
		// A missing count is treated like zero hits; as before, a single
		// request is still sent in that case.
		int hits = (numberOfHits == null) ? 0 : numberOfHits.intValue();
		int lastPage = (hits - 1) / MAX_REC;

		boolean doSubmit = (mode == MODE_SUBMIT || mode == MODE_SUBMIT_AND_RELEASE);
		boolean doRelease = (mode == MODE_RELEASE || mode == MODE_SUBMIT_AND_RELEASE);

		String queryRestrict = doSubmit
				? "query=%22/properties/version/status%22=pending"
				: "query=%22/properties/version/status%22=submitted";

		for (int page = 0; page <= lastPage; page++) {
			int start = page * MAX_REC + 1; // startRecord is 1-based
			String query = "?maximumRecords=" + MAX_REC + "&startRecord=" + start
					+ "&" + queryRestrict;

			for (eSciDocXmlObject listed : connector
					.getObjectListFromFilterResult(command + query, objectXPath)) {

				eSciDocXmlObject obj = fetchCurrentVersion(listed);
				if (obj == null) {
					continue;
				}
				if (doSubmit) {
					submit(obj);
				}
				if (doRelease) {
					release(obj);
				}
			}
		}
	}

	/**
	 * Fetches an object again from the server by its id.
	 *
	 * @param listed the object as contained in the filter result
	 * @return the freshly fetched object, or {@code null} if it could not be
	 *         retrieved
	 * @throws Exception if the id of {@code listed} cannot be determined
	 */
	private eSciDocXmlObject fetchCurrentVersion(eSciDocXmlObject listed)
			throws Exception {
		// TODO is this really necessary? Currently the obj in the list is
		// sometimes not the current one.
		String id = listed.getESciDocId();
		try {
			HttpResponse response = connector.eScidocGet(id);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				return new eSciDocXmlObject(entity.getContent());
			}
			logger.warn("Can not retrieve:" + id);
		} catch (Exception e) {
			logger.warn("Can not retrieve:" + id, e);
		}
		return null;
	}

	/**
	 * Submits an object and then updates it through the connector. A failed
	 * submit or update is logged but does not stop further processing of the
	 * object.
	 *
	 * @param obj the object to submit
	 * @throws Exception if the request or reading the response fails
	 */
	private void submit(eSciDocXmlObject obj) throws Exception {
		HttpResponse res = connector.submitAnObject(obj, RELEASE_COMMENT);
		logger.debug(res.getStatusLine());
		if (res.getStatusLine().getStatusCode() != 200) {
			logger.warn("Can not submit:" + obj.getESciDocId());
		}

		// The response body is read even on failure; a response without a
		// body must not end in a NullPointerException.
		HttpEntity entity = res.getEntity();
		if (entity != null) {
			InputStream restream = entity.getContent();
			if (restream != null) {
				try {
					logger.debug(EScidocBasicHandler.convertStreamToString(restream));
				} finally {
					restream.close(); // releases the connection
				}
			}
		}

		if (!connector.upDateObject(obj)) {
			logger.warn("Can not update:" + obj.getESciDocId());
		}
	}

	/**
	 * Releases an object and records its id in the added-files log on
	 * success.
	 *
	 * @param obj the object to release
	 * @throws Exception if the request fails
	 */
	private void release(eSciDocXmlObject obj) throws Exception {
		HttpResponse res = connector.releaseAnObject(obj, RELEASE_COMMENT);
		try {
			logger.debug(res.getStatusLine());
			if (res.getStatusLine().getStatusCode() != 200) {
				logger.warn("Can not release:" + obj.getESciDocId());
				return;
			}
			addedFile.debug("RELEASED:" + obj.getESciDocId());
		} finally {
			// Consuming the body is necessary to release the connection.
			HttpEntity entity = res.getEntity();
			if (entity != null) {
				entity.consumeContent();
			}
		}
	}

	/**
	 * Command line entry point: releases all submitted members of the context
	 * {@code /ir/context/escidoc:1001}.
	 *
	 * <p>
	 * Connection settings are read from the system properties
	 * {@code escidoc.host}, {@code escidoc.port}, {@code escidoc.user} and
	 * {@code escidoc.password}; the password may alternatively be given in
	 * the environment variable {@code ESCIDOC_PASSWORD}. The password is
	 * deliberately not kept in the source code.
	 * </p>
	 *
	 * @param args not used
	 * @throws Exception if the password is missing or the run fails
	 */
	public static void main(String[] args) throws Exception {

		Logger rl = Logger.getRootLogger();
		// NOTE(review): "log4uconf.xml" looks like it could be a typo for
		// "log4jconf.xml" - confirm against the deployed file name.
		DOMConfigurator.configure("log4uconf.xml");
		rl.setLevel(Level.DEBUG);

		String host = System.getProperty("escidoc.host",
				"escidoc-test.mpiwg-berlin.mpg.de");
		int port = Integer.getInteger("escidoc.port", 8080).intValue();
		String user = System.getProperty("escidoc.user", "dwinter");
		String password = System.getProperty("escidoc.password");
		if (password == null) {
			password = System.getenv("ESCIDOC_PASSWORD");
		}
		if (password == null) {
			throw new IllegalStateException(
					"No eSciDoc password given: set -Descidoc.password=... or ESCIDOC_PASSWORD");
		}

		EScidocBasicHandler connector = new EScidocBasicHandler(host, port, user,
				password);

		ECHOImporter newimporter = new ECHOImporter(new URL(
				"file:///Users/dwinter/libcoll.rdf"));
		ESciDocDataHarvester hv = new ESciDocDataHarvester(newimporter,
				new ECHOTransformer(), connector, "/ir/context/escidoc:1001");

		// To import instead of release, call
		// hv.readObjectsFromInstance("ECHO_collection") and
		// hv.readObjectsFromInstance("ECHO_resource") here.

		hv.releaseAndSubmitObjects(
				"/ir/context/escidoc:1001/resources/members",
				"//escidocItem:item", MODE_RELEASE);
	}
}