/*
 * Source: src/de/mpiwg/itgroup/eSciDoc/Tools/IngestECHO.java @ 0:c6929e63b0b8
 *
 * Commit message: first import
 * Author: dwinter
 * Date: Wed, 24 Nov 2010 16:52:07 +0100
 */

package de.mpiwg.itgroup.eSciDoc.Tools;

// TODO: create a context and a content model for ECHO
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
import org.jdom.JDOMException;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;

import sun.misc.Regexp;

//import fedora.client.FedoraClient;
//import fedora.server.access.FedoraAPIA;
//import fedora.server.management.FedoraAPIM;
//import fedora.server.types.gen.ComparisonOperator;
//import fedora.server.types.gen.Condition;
//import fedora.server.types.gen.FieldSearchQuery;
//import fedora.server.types.gen.FieldSearchResult;
//import fedora.server.types.gen.ListSession;
//import fedora.server.types.gen.MIMETypedStream;
//import fedora.server.types.gen.ObjectFields;

public class IngestECHO extends Ingestor {

	// Template name passed to eSciDocXmlObject when a resource item is built; set in the constructor.
	protected String ECHORESOURCE_TEMPLATE_XML;
	// Template name passed to eSciDocXmlObject when a collection container is built.
	// NOTE(review): never assigned in this file -- presumably provided by a subclass; confirm.
	protected String ECHOCONTAINER_TEMPLATE_XML;
	// URL prefix that correctML() puts in front of "experimental/..." and "permanent/..." paths.
	private String SERVLETURL;
	// Base URL of the ECHO site; "/getResourcesXML" and "/getCollectionsXML" are appended to it.
	protected String ECHOURL;
	// eSciDoc id of the container that holds all ECHO objects.
	protected String ECHO_CONTAINER_ID;
	// eSciDoc id of the container for objects that belong to no ECHO collection.
	protected String ECHO_ROOT_ID;
	// Context id handed to getXMLfromPID() when a PID is resolved.
	protected String MAIN_CONTEXT;
	// Cache filled on first use from getPIDsAndEscidocIdsOfCollections(); looked up with "mpiwg:<pid>" keys.
	private HashMap<String, String> pids;

	// First argument of the Ingestor super constructor (server address).
	protected static String ESCIDOC_SERVER_URL = "euler.mpiwg-berlin.mpg.de";
	// Base URL of the metadata provider; "/metadataMain" is appended for the getDCFormatted XML-RPC call.
	protected static String ZOPEPROVIDER = "http://127.0.0.1:18080";

	// Second argument of the Ingestor super constructor (server port).
	private static int PORT = 8080;
	
	/**
	 * Creates an ingestor preconfigured with the server, template, URL and
	 * container settings used for the ECHO site.
	 *
	 * @param user user name handed to the Ingestor super constructor
	 * @param password password handed to the Ingestor super constructor
	 */
	IngestECHO(String user, String password){

		super(ESCIDOC_SERVER_URL, PORT, ZOPEPROVIDER, user, password);
		ECHORESOURCE_TEMPLATE_XML = "ECHOResourceTemplate.xml";
		// NOTE(review): ECHOCONTAINER_TEMPLATE_XML is read by ingestECHOCollection() but is
		// not assigned here -- presumably a subclass provides it; confirm.

		SERVLETURL = "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn=";

		ECHOURL = "http://echo.mpiwg-berlin.mpg.de";

		ECHO_CONTAINER_ID = "escidoc:3006"; // contains all ECHO objects
		ECHO_ROOT_ID = "escidoc:3005"; // contains all objects that belong to no ECHO collection

		MAIN_CONTEXT = "escidoc:3002";

		// Assign the field itself: a local declaration named "pids" would only shadow it.
		// The cache is filled lazily by the methods that need it.
		pids = null;
	}

	/**
	 * Ingests every collection URL delivered by getAllCollections() and prints
	 * a report of successes and failures to standard output. A failure of a
	 * single collection is recorded with its stack trace and does not stop the
	 * run.
	 *
	 * @throws XmlRpcException declared for callers of the collection listing
	 * @throws IOException if the list of collections cannot be read
	 */
	void ingestECHOCollections() throws XmlRpcException, IOException {
		ArrayList<String> collectionUrls = getAllCollections();
		HashMap<String, String> ingested = new HashMap<String, String>(); // returned id -> URL
		HashMap<String, String> failed = new HashMap<String, String>(); // URL -> stack trace

		for (String collectionUrl : collectionUrls) {
			try {
				ingested.put(ingestECHOCollection(collectionUrl), collectionUrl);
			} catch (Exception problem) {
				// capture the trace as text for the final report, then echo it right away
				ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
				problem.printStackTrace(new PrintStream(traceBytes));
				failed.put(collectionUrl, traceBytes.toString());

				problem.printStackTrace();
			}
		}

		System.out.println("SUCCESSFULL INGEST");
		for (java.util.Map.Entry<String, String> done : ingested.entrySet()) {
			System.out.println("ID:" + done.getKey() + "     URL:" + done.getValue());
		}

		System.out.println("ERRORS:");
		for (java.util.Map.Entry<String, String> broken : failed.entrySet()) {
			System.out.println("URL:" + broken.getKey());
			System.out.println("Message:" + broken.getValue());
		}
	}

	/**
	 * Walks over all collection URLs, asks each collection for its PID via
	 * the XML-RPC call "getPID", looks the PID up in the cached PID table and
	 * hands the resulting id to addECHOObjectToCollection(). Successes and
	 * failures are reported on standard output at the end.
	 *
	 * @throws XmlRpcException declared for callers of the collection listing
	 * @throws IOException if the collection list or the PID table cannot be read,
	 *         or a collection URL is malformed
	 * @throws JDOMException declared for the PID table lookup
	 */
	void organizeECHOCollections() throws XmlRpcException, IOException,
			JDOMException {
		ArrayList<String> collectionUrls = getAllCollections();
		HashMap<String, String> organized = new HashMap<String, String>(); // PID -> URL
		HashMap<String, String> failed = new HashMap<String, String>(); // URL -> stack trace

		for (String collectionUrl : collectionUrls) {

			XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
			XmlRpcClient rpc = new XmlRpcClient();
			rpcConfig.setServerURL(new URL(collectionUrl));
			rpc.setConfig(rpcConfig);

			// the PID table is fetched once and then reused
			if (pids == null) {
				pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
			}

			try {
				String echoPid = (String) rpc.execute("getPID", new Object[] {});
				// NOTE(review): the lookup yields null for a PID missing from the table and
				// that null is passed on unchanged -- confirm this is intended.
				String containerId = pids.get("mpiwg:" + echoPid);
				addECHOObjectToCollection(rpc, containerId);
				organized.put(echoPid, collectionUrl);
			} catch (Exception problem) {
				ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
				problem.printStackTrace(new PrintStream(traceBytes));
				failed.put(collectionUrl, traceBytes.toString());

				problem.printStackTrace();
			}
		}

		System.out.println("SUCCESSFULL ORGANIZED");
		for (java.util.Map.Entry<String, String> done : organized.entrySet()) {
			System.out.println("ID:" + done.getKey() + "     URL:" + done.getValue());
		}

		System.out.println("ERRORS:");
		for (java.util.Map.Entry<String, String> broken : failed.entrySet()) {
			System.out.println("URL:" + broken.getKey());
			System.out.println("Message:" + broken.getValue());
		}
	}

	/**
	 * Walks over all resource URLs, asks each resource for its PID via the
	 * XML-RPC call "getPID", resolves the PID with getIDfromPID() and hands
	 * the resulting id to addECHOObjectToCollection(). Successes and failures
	 * are reported on standard output at the end.
	 *
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if the resource list or the PID table cannot be read,
	 *         or a resource URL is malformed
	 * @throws JDOMException declared for the PID table lookup
	 */
	void organizeECHORessources() throws XmlRpcException, IOException,
			JDOMException {
		ArrayList<String> resourceUrls = getAllResources();
		HashMap<String, String> organized = new HashMap<String, String>(); // PID -> URL
		HashMap<String, String> failed = new HashMap<String, String>(); // URL -> stack trace

		for (String resourceUrl : resourceUrls) {

			XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
			XmlRpcClient rpc = new XmlRpcClient();
			rpcConfig.setServerURL(new URL(resourceUrl));
			rpc.setConfig(rpcConfig);

			// the PID table is fetched once and then reused
			if (pids == null) {
				pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
			}

			try {
				String echoPid = (String) rpc.execute("getPID", new Object[] {});
				String itemId = getIDfromPID("mpiwg:" + echoPid);
				addECHOObjectToCollection(rpc, itemId);
				organized.put(echoPid, resourceUrl);
			} catch (Exception problem) {
				ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
				problem.printStackTrace(new PrintStream(traceBytes));
				failed.put(resourceUrl, traceBytes.toString());

				problem.printStackTrace();
			}
		}

		System.out.println("SUCCESSFULL ORGANIZED");
		for (java.util.Map.Entry<String, String> done : organized.entrySet()) {
			System.out.println("ID:" + done.getKey() + "     URL:" + done.getValue());
		}

		System.out.println("ERRORS:");
		for (java.util.Map.Entry<String, String> broken : failed.entrySet()) {
			System.out.println("URL:" + broken.getKey());
			System.out.println("Message:" + broken.getValue());
		}
	}



	/**
	 * Fetches the XML stored for a PID in the main context and extracts the
	 * id from it with EScidocBasicHandler.getId().
	 *
	 * @param pid PID to resolve (callers pass it with the "mpiwg:" prefix)
	 * @return the id extracted from the returned XML
	 * @throws ClientProtocolException declared for the XML lookup
	 * @throws IOException declared for the XML lookup and the stream conversion
	 */
	private String getIDfromPID(String pid) throws ClientProtocolException,
			IOException {
		InputStream xmlStream = getXMLfromPID(pid, MAIN_CONTEXT);
		String xmlText = EScidocBasicHandler.convertStreamToString(xmlStream);
		return EScidocBasicHandler.getId(xmlText);
	}

	/**
	 * Inserts the ECHO collection found at the given URL into eSciDoc. The
	 * link to the collection's web page is stored in a separate item which is
	 * then added to the new container as a member.
	 *
	 * If the collection's PID is already known, nothing is created and the
	 * existing id is returned.
	 *
	 * @param url URL of the ECHO collection (also used as XML-RPC endpoint)
	 * @return the id of the existing or newly created container
	 * @throws Exception if any remote call or XML handling step fails
	 */
	private String ingestECHOCollection(String url) throws Exception {

		System.out.println("Processing:" + url);

		// connect to the collection via XML-RPC
		XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
		XmlRpcClient rpc = new XmlRpcClient();
		rpcConfig.setServerURL(new URL(url));
		rpc.setConfig(rpcConfig);

		// take the PID from ECHO or obtain a new one
		String pid = getOrCreatePID(rpc);
		String prefixedPid = "mpiwg:" + pid;

		if (pidAlreadyExists(prefixedPid)) {
			System.out.println("PID:"+pid);
			String existingId = getIDfromPID(prefixedPid);
			System.out.println("------- belongsTo:"+existingId);
			return existingId;
		}

		Object[] noArgs = new Object[] {};
		eSciDocXmlObject container = new eSciDocXmlObject(prefixedPid,
				ECHOCONTAINER_TEMPLATE_XML);

		// collect the Dublin Core metadata from the collection
		HashMap<String, String> dublinCore = new HashMap<String, String>();
		String rawDescription = (String) rpc.execute("getDescription", noArgs);
		String description = new String(rawDescription.getBytes("UTF-8"), ("UTF-8"));
		dublinCore.put("description", description);
		String title = (String) rpc.execute("getTitle", noArgs);
		dublinCore.put("title", title);
		container.insertDC(dublinCore);
		container.addOrigUrlToMPIWGMetaData(url);

		// create the container, then the item that carries the web link
		String containerXml = container.printXML();
		String containerResponse = ingest("/ir/container", containerXml);
		String webItemResponse = ingestCollectionWebSite(title, url);

		// build the task parameter that adds the web item to the container
		String webItemId = EScidocBasicHandler.getId(webItemResponse);
		String lastModified = EScidocBasicHandler.getDateStamp(containerResponse);
		StringBuilder addMember = new StringBuilder();
		addMember.append("<param last-modification-date=\"").append(lastModified).append("\">");
		addMember.append("<id>").append(webItemId).append("</id>");
		addMember.append("</param>");

		String contid = EScidocBasicHandler.getId(containerResponse);

		ByteArrayInputStream body = new ByteArrayInputStream(addMember.toString()
				.getBytes("utf-8"));
		eSciDocHandler.eScidocPost("/ir/container/" + contid + "/members/add",
				body);
		System.out.println("Processed:" + url + "------>" + contid);

		addToCollection(ECHO_CONTAINER_ID, contid);

		// write the PID back to ECHO
		rpc.execute("setPID", new Object[] { pid });

		System.out.println(containerResponse);

		// finally hang the container below its parent collection
		addECHOObjectToCollection(rpc, contid);
		return contid;

	}

	/**
	 * Runs findMissingItemsFromECHOUrls() over all resource URLs.
	 *
	 * @return the resource URLs reported by that check
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if the resource list cannot be read
	 */
	public ArrayList<String> findMissingItems() throws XmlRpcException, IOException{
		ArrayList<String> resourceUrls = getAllResources();
		return findMissingItemsFromECHOUrls(resourceUrls);
	}
		
	/**
	 * Runs findMissingItemsFromECHOUrls() over all collection URLs.
	 *
	 * @return the collection URLs reported by that check
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if the collection list cannot be read
	 */
	public ArrayList<String> findMissingCollections() throws XmlRpcException, IOException{
		ArrayList<String> collectionUrls = getAllCollections();
		return findMissingItemsFromECHOUrls(collectionUrls);
	}
	
	/**
	 * Checks each URL and collects those for which either the XML-RPC call
	 * "getPID" yields no PID (or fails) or the PID cannot be resolved with
	 * getIDfromPID(). Progress is printed to standard output.
	 *
	 * @param urls ECHO object URLs to check
	 * @return the URLs that failed one of the two checks
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if one of the URLs is malformed
	 */
	public ArrayList<String> findMissingItemsFromECHOUrls(List<String> urls) throws XmlRpcException, IOException{
		System.out.println("GOT the collections");
		ArrayList<String> missing = new ArrayList<String>();

		for (String echoUrl : urls) {
			System.out.println("checking:"+echoUrl);

			XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
			XmlRpcClient rpc = new XmlRpcClient();
			rpcConfig.setServerURL(new URL(echoUrl));
			rpc.setConfig(rpcConfig);

			// a failing call is treated like an object without PID
			String echoPid;
			try {
				echoPid = (String) rpc.execute("getPID", new Object[] {});
			} catch (Exception callFailed) {
				echoPid = null;
			}

			if (echoPid == null) {
				missing.add(echoUrl);
				System.out.println("             -- no pid");
				continue;
			}

			String resolvedId;
			try {
				resolvedId = getIDfromPID("mpiwg:" + echoPid);
			} catch (Exception lookupFailed) {
				resolvedId = "NO";
				missing.add(echoUrl);
			}
			System.out.println("             -- id:"+resolvedId);
		}
		return missing;
	}
	/**
	 * Returns the PID reported by the XML-RPC call "getPID" on the given
	 * client; if the call fails or returns null, a new one is obtained from
	 * getID().
	 *
	 * @param client XML-RPC client already configured for the ECHO object
	 * @return the PID from ECHO, or the value of getID()
	 * @throws XmlRpcException declared for callers
	 * @throws MalformedURLException declared for callers
	 */
	private String getOrCreatePID(XmlRpcClient client) throws XmlRpcException,
			MalformedURLException {
		// ask ECHO first; any failure counts as "no PID yet"
		String echoPid;
		try {
			echoPid = (String) client.execute("getPID", new Object[] {});
		} catch (Exception callFailed) {
			echoPid = null;
		}

		if (echoPid != null) {
			System.out.println("PID from ECHO:" + echoPid);
			return echoPid;
		}

		// nothing stored in ECHO yet: obtain a new one
		return getID();
	}

	/**
	 * Adds the object with the given id to the container of its parent
	 * collection. The parent PID is requested with the XML-RPC call
	 * "getParentPID" and looked up in the cached PID table; if that fails with
	 * an exception, the root container ECHO_ROOT_ID is used instead.
	 *
	 * @param client XML-RPC client configured for the ECHO object
	 * @param contid id of the object to add
	 * @throws ClientProtocolException declared for the remote calls
	 * @throws IOException declared for the remote calls
	 * @throws JDOMException declared for the PID table lookup
	 */
	private void addECHOObjectToCollection(XmlRpcClient client, String contid)
			throws ClientProtocolException, IOException, JDOMException {
		// the PID table is fetched once and then reused
		if (pids == null) {
			pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
		}

		String targetContainer;
		try {
			String echoParentPid = (String) client.execute("getParentPID", new Object[] {});
			// NOTE(review): a parent PID missing from the table gives null here, which is
			// passed on unchanged -- confirm this is intended.
			targetContainer = pids.get("mpiwg:" + echoParentPid);
		} catch (Exception noParent) {
			targetContainer = ECHO_ROOT_ID;
		}

		addToCollection(targetContainer, contid);
	}

	/**
	 * Creates an item from the template
	 * "ECHOCollectionWebRepresentationTemplate.xml" that carries the web URL
	 * of a collection and ingests it under "/ir/item".
	 *
	 * @param title title written into the item's Dublin Core data
	 * @param url web URL of the collection
	 * @return the value returned by ingest()
	 * @throws Exception if building or ingesting the item fails
	 */
	private String ingestCollectionWebSite(String title, String url)
			throws Exception {
		eSciDocXmlObject webItem = new eSciDocXmlObject("mpiwg:" + getID(),
				"ECHOCollectionWebRepresentationTemplate.xml");
		webItem.addWebUrl(url);

		// the title of the ECHO collection serves as substitute metadata
		HashMap<String, String> dublinCore = new HashMap<String, String>();
		dublinCore.put("title", title);
		webItem.insertDC(dublinCore);

		return ingest("/ir/item", webItem.printXML());
	}

	/**
	 * Ingests all resources without a URL filter.
	 *
	 * @throws IOException if the resource list cannot be read
	 */
	public void ingestECHOResources() throws IOException {
		Pattern noFilter = null;
		ingestECHOResources(noFilter);
	}
	/**
	 * Ingests every resource URL that matches the given pattern completely
	 * (or every resource if the pattern is null) and prints a report of
	 * successes and failures to standard output. A failure of a single
	 * resource is recorded with its stack trace and does not stop the run.
	 *
	 * @param match pattern a URL has to match as a whole, or null for all URLs
	 * @throws IOException if the resource list cannot be read
	 */
	public void ingestECHOResources(Pattern match) throws IOException {
		ArrayList<String> resourceUrls = getAllResources();
		HashMap<String, String> ingested = new HashMap<String, String>(); // returned id -> URL
		HashMap<String, String> failed = new HashMap<String, String>(); // URL -> stack trace

		for (String resourceUrl : resourceUrls) {
			try {
				boolean selected = (match == null) || match.matcher(resourceUrl).matches();
				if (!selected) {
					continue;
				}
				ingested.put(ingestECHOResource(resourceUrl), resourceUrl);
			} catch (Exception problem) {
				ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
				problem.printStackTrace(new PrintStream(traceBytes));
				failed.put(resourceUrl, traceBytes.toString());

				problem.printStackTrace();
			}
		}

		System.out.println("SUCCESSFULL INGEST");
		for (java.util.Map.Entry<String, String> done : ingested.entrySet()) {
			System.out.println("ID:" + done.getKey() + "     URL:" + done.getValue());
		}

		System.out.println("ERRORS:");
		for (java.util.Map.Entry<String, String> broken : failed.entrySet()) {
			System.out.println("URL:" + broken.getKey());
			System.out.println("Message:" + broken.getValue());
		}
	}

	/**
	 * Reads ECHOURL + "/getResourcesXML" line by line and collects the value
	 * of every echoLink="..." attribute found.
	 *
	 * The reader is closed even when reading fails, and all echoLink
	 * attributes on a line are collected (not only the first one).
	 *
	 * @return the echoLink values in the order in which they appear
	 * @throws IOException if the URL is malformed or cannot be opened or read
	 */
	protected ArrayList<String> getAllResources() throws IOException {
		URL echoUrl = new URL(ECHOURL + "/getResourcesXML");
		Pattern linkPattern = Pattern.compile("echoLink=\"([^\"]*)\"");
		ArrayList<String> ret = new ArrayList<String>();

		// NOTE(review): the stream is decoded with the platform default charset -- confirm
		// that this matches the encoding the server sends.
		BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl
				.openStream()));
		try {
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				Matcher m = linkPattern.matcher(inputLine);
				while (m.find()) {
					ret.add(m.group(1));
				}
			}
		} finally {
			// release the connection on success and on failure alike
			in.close();
		}
		return ret;

	}

	/**
	 * Ingests a single resource without attaching its full text.
	 *
	 * @param url URL of the ECHO resource
	 * @return the value returned by the two-argument variant
	 * @throws Exception if the ingest fails
	 */
	protected String ingestECHOResource(String url) throws Exception {
		final boolean withFullText = false;
		return ingestECHOResource(url, withFullText);
	}
	
	/**
	 * Ingests the ECHO resource found at the given URL as an item.
	 *
	 * The method takes the PID from ECHO (or obtains a new one), returns the
	 * existing id if that PID is already known, and otherwise builds the item
	 * XML from the resource template, enriches it with Dublin Core data and
	 * the "meta" element of the resource's metadata file, ingests it under
	 * "/ir/item", writes the PID back to ECHO and adds the item to the ECHO
	 * container and to its parent collection.
	 *
	 * If the metadata cannot be fetched via XML-RPC or cannot be parsed, the
	 * item gets only the resource title as Dublin Core data.
	 *
	 * @param url URL of the ECHO resource (also used as XML-RPC endpoint)
	 * @param withfullText if true, url + "/getFullTextXML" is registered as full text
	 * @return the id of the existing or newly created item
	 * @throws Exception if a remote call or XML handling step fails, or if the
	 *         metadata file does not contain exactly one meta element
	 */
	protected String ingestECHOResource(String url,boolean withfullText) throws Exception {

		System.out.println("Starting:" + url);

		XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
		XmlRpcClient client = new XmlRpcClient();
		config.setServerURL(new URL(url));
		client.setConfig(config);

		String pid = getOrCreatePID(client);

		// already ingested: report the existing id and stop
		if (pidAlreadyExists("mpiwg:"+pid))
		{
			System.out.println("PID:"+pid);
			String contid=getIDfromPID("mpiwg:"+pid);
			System.out.println("------- belongsTo:"+contid);
			return contid;
		}
		eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid,
				ECHORESOURCE_TEMPLATE_XML);
		Object[] params = new Object[] {};

		String title = (String) client.execute("getTitle", params);
		String ml = (String) client.execute("getMetaDataLink", params);
		if (withfullText){
			String fulltextURL = url+"/getFullTextXML";
			obj.addFullText(fulltextURL);
		}
		ml = correctML(ml);

		obj.addWebUrl(url);
		obj.addOrigUrlToMPIWGMetaData(url);

		// point the client at the metadata provider for the Dublin Core conversion
		config.setServerURL(new URL(ZOPEPROVIDER + "/metadataMain"));
		client.setConfig(config);
		params = new Object[] { ml };

		try {
			String result = (String) client.execute("getDCFormatted", params);
			System.out.println("dC:"+result);
			DocumentBuilderFactory factory = DocumentBuilderFactory
					.newInstance();
			factory.setNamespaceAware(true);
			DocumentBuilder db = factory.newDocumentBuilder();

			InputSource resultStream = new InputSource(new StringReader(result));
			Document dc = db.parse(resultStream);
			obj.insertDC(dc);

			Document indexmeta = db.parse(ml);

			XPath xpath = XPathFactory.newInstance().newXPath();
			xpath.setNamespaceContext(new EScidocNameSpaceContext());

			// look for an unqualified meta element first, then for mpiwg:meta
			NodeList test = (NodeList) xpath.evaluate("//meta", indexmeta,
					XPathConstants.NODESET);
			if (test.getLength() != 1)
			{
				test = (NodeList) xpath.evaluate("//mpiwg:meta", indexmeta,
						XPathConstants.NODESET);

				if (test.getLength() != 1)
					throw new Exception("Expected exactly one meta element in "
							+ ml + " but found " + test.getLength());
			}
			obj.insertMeta(test.item(0));

			obj.addIndexMetaUrl(ml);

		} catch (XmlRpcException e) {
			System.err.println("Ressource:" + url);
			System.err.println("METADATA CANNOT BE PARSED:" + ml);
			// fall back to the title of the ECHO resource
			HashMap<String, String> dc = new HashMap<String, String>();
			dc.put("title", title);
			obj.insertDC(dc);
		} catch (SAXParseException e) {
			System.err.println("METADATA RESULT CANNOT BE PARSED:");
			// fall back to the title of the ECHO resource
			HashMap<String, String> dc = new HashMap<String, String>();
			dc.put("title", title);
			obj.insertDC(dc);
		}

		String xml = obj.printXML();
		System.out.println(xml);

		// A leftover debug 'return "XXX";' used to sit here; it made the following
		// statement unreachable (a compile error) and skipped the actual ingest.
		String result = ingest("/ir/item", xml);
		String contid = EScidocBasicHandler.getId(result);
		System.out.println("------->" + contid);

		// switch the client back to the resource itself and store the PID there
		params = new Object[] { pid };
		config.setServerURL(new URL(url));
		client.setConfig(config);

		client.execute("setPID", params);
		addToCollection(ECHO_CONTAINER_ID, contid);

		addECHOObjectToCollection(client, contid);
		return contid;

	}

	/**
	 * Tells whether getIDfromPID() resolves the given PID to a non-empty id.
	 * A failing lookup, a null id and an empty id all count as "does not exist".
	 *
	 * @param pid PID to check (callers pass it with the "mpiwg:" prefix)
	 * @return true if a non-empty id was found for the PID
	 */
	private boolean pidAlreadyExists(String pid) {
		String id;
		try{
			id = getIDfromPID(pid);
		} catch (Exception e){
			// the lookup failing is the expected outcome for an unknown PID
			return false;
		}
		// guard against a null id so the check itself cannot throw
		return id != null && !id.equals("");
	}

	/**
	 * Rewrites a metadata link so that it goes through SERVLETURL: the part
	 * of the link starting at the first "experimental/" (or, if that does not
	 * occur, at the first "permanent/") is appended to SERVLETURL. Links that
	 * contain neither are returned unchanged.
	 *
	 * @param ml metadata link as delivered by ECHO
	 * @return the rewritten link, or ml itself if no known prefix occurs in it
	 */
	private String correctML(String ml) {
		String[] knownPrefixes = { "experimental/", "permanent/" };
		for (String prefix : knownPrefixes) {
			Matcher hit = Pattern.compile(prefix + "(.*)").matcher(ml);
			if (hit.find()) {
				return SERVLETURL + prefix + hit.group(1);
			}
		}
		return ml;
	}

	/**
	 * Reads ECHOURL + "/getCollectionsXML" line by line and collects the value
	 * of every echoLink="..." attribute found.
	 *
	 * The attribute value is matched up to its own closing quote, so further
	 * attributes on the same line do not end up in the URL. The reader is
	 * closed even when reading fails, and all echoLink attributes on a line
	 * are collected (not only the first one).
	 *
	 * @return the echoLink values in the order in which they appear
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if the URL is malformed or cannot be opened or read
	 */
	protected ArrayList<String> getAllCollections() throws XmlRpcException,
			IOException {
		System.out.println("ECHO:"+ECHOURL);
		URL echoUrl = new URL(ECHOURL + "/getCollectionsXML");
		// [^"]* instead of a greedy .* : stop at the quote that ends this attribute
		Pattern linkPattern = Pattern.compile("echoLink=\"([^\"]*)\"");
		ArrayList<String> ret = new ArrayList<String>();

		// NOTE(review): the stream is decoded with the platform default charset -- confirm
		// that this matches the encoding the server sends.
		BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl
				.openStream()));
		try {
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				Matcher m = linkPattern.matcher(inputLine);
				while (m.find()) {
					ret.add(m.group(1));
				}
			}
		} finally {
			// release the connection on success and on failure alike
			in.close();
		}
		return ret;
	}

	/**
	 * Calls addVersionPid(), submitAnObject() and releaseAnObject() for the
	 * given object in that order and prints the body of the submit and of the
	 * release response to standard output.
	 *
	 * @param href path of the object, e.g. "/ir/item/" followed by its id
	 * @throws ClientProtocolException declared for the remote calls
	 * @throws IOException declared for the remote calls and for reading the responses
	 * @throws JDOMException declared for the remote calls
	 */
	private void submitAndReleaseAnObject(String href) throws ClientProtocolException,
			IOException, JDOMException {

		addVersionPid(href);

		HttpResponse submitResponse = submitAnObject(href, "submit");
		String submitBody = EScidocBasicHandler.convertStreamToString(
				submitResponse.getEntity().getContent());
		System.out.println(submitBody);

		HttpResponse releaseResponse = releaseAnObject(href, "first release");
		String releaseBody = EScidocBasicHandler.convertStreamToString(
				releaseResponse.getEntity().getContent());
		System.out.println(releaseBody);

	}

	/**
	 * Walks over all resource URLs, asks each resource for its PID via the
	 * XML-RPC call "getPID", resolves the PID with getIDfromPID() and submits
	 * and releases the item "/ir/item/" + id. Progress and a final report of
	 * successes and failures are printed to standard output.
	 *
	 * @throws XmlRpcException declared for callers
	 * @throws IOException if the resource list or the PID table cannot be read,
	 *         or a resource URL is malformed
	 * @throws JDOMException declared for the PID table lookup
	 */
	void releaseECHORessources() throws XmlRpcException, IOException,
			JDOMException {
		ArrayList<String> resourceUrls = getAllResources();
		HashMap<String, String> released = new HashMap<String, String>(); // PID -> URL
		HashMap<String, String> failed = new HashMap<String, String>(); // URL -> stack trace
		int total = resourceUrls.size();
		int processed = 0;

		for (String resourceUrl : resourceUrls) {

			XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
			XmlRpcClient rpc = new XmlRpcClient();
			rpcConfig.setServerURL(new URL(resourceUrl));
			rpc.setConfig(rpcConfig);

			// the PID table is fetched once and then reused
			if (pids == null) {
				pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID);
			}

			try {
				String echoPid = (String) rpc.execute("getPID", new Object[] {});
				String itemId = getIDfromPID("mpiwg:" + echoPid);
				submitAndReleaseAnObject("/ir/item/"+itemId);
				released.put(echoPid, resourceUrl);
			} catch (Exception problem) {
				ByteArrayOutputStream traceBytes = new ByteArrayOutputStream();
				problem.printStackTrace(new PrintStream(traceBytes));
				failed.put(resourceUrl, traceBytes.toString());

				problem.printStackTrace();
			}

			processed++;
			System.out.println("DONE:"+processed+" of "+total);
		}

		System.out.println("SUCCESSFULL ORGANIZED");
		for (java.util.Map.Entry<String, String> done : released.entrySet()) {
			System.out.println("ID:" + done.getKey() + "     URL:" + done.getValue());
		}

		System.out.println("ERRORS:");
		for (java.util.Map.Entry<String, String> broken : failed.entrySet()) {
			System.out.println("URL:" + broken.getKey());
			System.out.println("Message:" + broken.getValue());
		}

	}
}