Mercurial > hg > eSciDocImport
view src/de/mpiwg/itgroup/eSciDoc/Tools/IngestECHO.java @ 0:c6929e63b0b8
first import
author | dwinter |
---|---|
date | Wed, 24 Nov 2010 16:52:07 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.itgroup.eSciDoc.Tools; //todo: create context for echo and contentmodell import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintStream; import java.io.StringReader; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathFactory; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.xmlrpc.XmlRpcException; import org.apache.xmlrpc.client.XmlRpcClient; import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; import org.jdom.JDOMException; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXParseException; import sun.misc.Regexp; //import fedora.client.FedoraClient; //import fedora.server.access.FedoraAPIA; //import fedora.server.management.FedoraAPIM; //import fedora.server.types.gen.ComparisonOperator; //import fedora.server.types.gen.Condition; //import fedora.server.types.gen.FieldSearchQuery; //import fedora.server.types.gen.FieldSearchResult; //import fedora.server.types.gen.ListSession; //import fedora.server.types.gen.MIMETypedStream; //import fedora.server.types.gen.ObjectFields; public class IngestECHO extends Ingestor { protected String ECHORESOURCE_TEMPLATE_XML; protected String ECHOCONTAINER_TEMPLATE_XML; private String SERVLETURL; protected String ECHOURL; protected String ECHO_CONTAINER_ID; protected String ECHO_ROOT_ID; protected String MAIN_CONTEXT; private HashMap<String, String> pids; protected static String ESCIDOC_SERVER_URL = "euler.mpiwg-berlin.mpg.de"; protected static String ZOPEPROVIDER = "http://127.0.0.1:18080"; private static int PORT = 8080; IngestECHO(String user, String password){ super(ESCIDOC_SERVER_URL, PORT, ZOPEPROVIDER, user, password); ECHORESOURCE_TEMPLATE_XML = "ECHOResourceTemplate.xml"; SERVLETURL= "http://nausikaa2.mpiwg-berlin.mpg.de/digitallibrary/servlet/Texter?fn="; ECHOURL = "http://echo.mpiwg-berlin.mpg.de"; //ZOPEPROVIDER = "http://127.0.0.1:18080"; ECHO_CONTAINER_ID = "escidoc:3006"; // enthaelt alle ECHO // objecte ECHO_ROOT_ID = "escidoc:3005"; // enthaelt alle Objekte die // keiner ECHO collection // angehoeren MAIN_CONTEXT = "escidoc:3002"; HashMap<String, String> pids = null; } void ingestECHOCollections() throws XmlRpcException, IOException { ArrayList<String> urls = getAllCollections(); HashMap<String, String> success = new HashMap<String, String>(); HashMap<String, String> nosuccess = new HashMap<String, String>(); for (String url : urls) { try { String id = ingestECHOCollection(url); success.put(id, url); } catch (Exception e) { ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream s = new PrintStream(out); e.printStackTrace(s); nosuccess.put(url, out.toString()); e.printStackTrace(); } } System.out.println("SUCCESSFULL INGEST"); for (String id : success.keySet()) System.out.println("ID:" + id + " URL:" + success.get(id)); System.out.println("ERRORS:"); for (String id : nosuccess.keySet()) { System.out.println("URL:" + id); System.out.println("Message:" + nosuccess.get(id)); } } void organizeECHOCollections() throws XmlRpcException, IOException, JDOMException { ArrayList<String> urls = getAllCollections(); HashMap<String, String> success = new HashMap<String, String>(); HashMap<String, String> nosuccess = new HashMap<String, String>(); for (String url : urls) { XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); Object[] params = new Object[] {}; if (pids == null) { pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID); } try { String parentPid; String pid = (String) client.execute("getPID", params); String contid = pids.get("mpiwg:" + pid); addECHOObjectToCollection(client, contid); success.put(pid, url); } catch (Exception e) { ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream s = new PrintStream(out); e.printStackTrace(s); nosuccess.put(url, out.toString()); e.printStackTrace(); } } System.out.println("SUCCESSFULL ORGANIZED"); for (String id : success.keySet()) System.out.println("ID:" + id + " URL:" + success.get(id)); System.out.println("ERRORS:"); for (String id : nosuccess.keySet()) { System.out.println("URL:" + id); System.out.println("Message:" + nosuccess.get(id)); } } void organizeECHORessources() throws XmlRpcException, IOException, JDOMException { ArrayList<String> urls = getAllResources(); HashMap<String, String> success = new HashMap<String, String>(); HashMap<String, String> nosuccess = new HashMap<String, String>(); for (String url : urls) { XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); Object[] params = new Object[] {}; if (pids == null) { pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID); } try { String parentPid; String pid = (String) client.execute("getPID", params); String contid = getIDfromPID("mpiwg:" + pid); addECHOObjectToCollection(client, contid); success.put(pid, url); } catch (Exception e) { ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream s = new PrintStream(out); e.printStackTrace(s); nosuccess.put(url, out.toString()); e.printStackTrace(); } } System.out.println("SUCCESSFULL ORGANIZED"); for (String id : success.keySet()) System.out.println("ID:" + id + " URL:" + success.get(id)); System.out.println("ERRORS:"); for (String id : nosuccess.keySet()) { System.out.println("URL:" + id); System.out.println("Message:" + nosuccess.get(id)); } } private String getIDfromPID(String pid) throws ClientProtocolException, IOException { InputStream res = getXMLfromPID(pid,MAIN_CONTEXT); return EScidocBasicHandler.getId(EScidocBasicHandler .convertStreamToString(res)); } /** * FŸgt die ECHO Collection unter der URL in eScidoc ein. Der Link auf die * Web-Seite wird in einem eigenen item hinterlegt, dass in Collection * eingefŸgt wird. * * @param url * @throws Exception */ private String ingestECHOCollection(String url) throws Exception { // get a PID for the Collection System.out.println("Processing:" + url); HashMap<String, String> dcs = new HashMap<String, String>(); // Store // for // the // metadata // Verbinde dich mit der Collection Ÿber XML-rpc XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); String pid = getOrCreatePID(client); if (pidAlreadyExists("mpiwg:"+pid)) { System.out.println("PID:"+pid); String contid=getIDfromPID("mpiwg:"+pid); System.out.println("------- belongsTo:"+contid); return contid; } Object[] params = new Object[] {}; eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid, ECHOCONTAINER_TEMPLATE_XML); String result = (String) client.execute("getDescription", params); String x = new String(result.getBytes("UTF-8"), ("UTF-8")); // System.out.println("DESCR"+x); dcs.put("description", x); String title = (String) client.execute("getTitle", params); dcs.put("title", title); obj.insertDC(dcs); obj.addOrigUrlToMPIWGMetaData(url); // obj.setRelationship("info:fedora/echo:col1"); String xml = obj.printXML(); // System.out.println(xml); String ret = ingest("/ir/container", xml); String xr = ingestCollectionWebSite(title, url); // System.out.println(xr); String objid = EScidocBasicHandler.getId(xr); String dateStamp = EScidocBasicHandler.getDateStamp(ret); String addTxt = "<param last-modification-date=\"" + dateStamp + "\">"; addTxt += "<id>" + objid + "</id>"; addTxt += "</param>"; String contid = EScidocBasicHandler.getId(ret); ByteArrayInputStream stream = new ByteArrayInputStream(addTxt .getBytes("utf-8")); eSciDocHandler.eScidocPost("/ir/container/" + contid + "/members/add", stream); // System.out.println(response.getStatusLine()); // System.out.println(EScidocBasicHandler.convertStreamToString(response.getEntity().getContent())); System.out.println("Processed:" + url + "------>" + contid); addToCollection(ECHO_CONTAINER_ID, contid); params = new Object[] { pid }; client.execute("setPID", params); System.out.println(ret); addECHOObjectToCollection(client, contid); return contid; } public ArrayList<String> findMissingItems() throws XmlRpcException, IOException{ return findMissingItemsFromECHOUrls(getAllResources()); } public ArrayList<String> findMissingCollections() throws XmlRpcException, IOException{ return findMissingItemsFromECHOUrls(getAllCollections()); } public ArrayList<String> findMissingItemsFromECHOUrls(List<String> urls) throws XmlRpcException, IOException{ //ArrayList<String> urls = getAllCollections(); System.out.println("GOT the collections"); ArrayList<String> ret = new ArrayList<String>(); for (String url : urls) { System.out.println("checking:"+url); XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); String pid; try { Object[] parameters = new Object[] {}; pid = (String) client.execute("getPID", parameters); } catch (Exception e) { pid = null; } if (pid == null){ ret.add(url); System.out.println(" -- no pid"); } else { String id; try { id = getIDfromPID("mpiwg:"+pid); } catch (Exception e) { id = "NO"; ret.add(url); } System.out.println(" -- id:"+id); } } return ret; } private String getOrCreatePID(XmlRpcClient client) throws XmlRpcException, MalformedURLException { Object[] parameters = new Object[] {}; String pid = null; // Hole pid aus ECHO try { pid = (String) client.execute("getPID", parameters); } catch (Exception e) { pid = null; } // Falls dort noch keine ist, erzeuge ein neue if (pid == null) pid = getID(); else System.out.println("PID from ECHO:" + pid); return pid; } private void addECHOObjectToCollection(XmlRpcClient client, String contid) throws ClientProtocolException, IOException, JDOMException { Object[] params; params = new Object[] {}; if (pids == null) { pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID); } String parentId; String parentPid; try { parentPid = (String) client.execute("getParentPID", params); parentId = pids.get("mpiwg:" + parentPid); } catch (Exception e) { parentId = ECHO_ROOT_ID; } addToCollection(parentId, contid); } private String ingestCollectionWebSite(String title, String url) throws Exception { String pid = getID(); eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid, "ECHOCollectionWebRepresentationTemplate.xml"); HashMap<String, String> dcs = new HashMap<String, String>(); obj.addWebUrl(url); // obj.setRelationship("info:fedora/echo:col1"); dcs.put("title", title); // ersatzweise den titel aus der echo // collection obj.insertDC(dcs); String xml = obj.printXML(); // System.out.println(xml); String res = ingest("/ir/item", xml); return res; } public void ingestECHOResources() throws IOException { ingestECHOResources(null); } public void ingestECHOResources(Pattern match) throws IOException { ArrayList<String> urls = getAllResources(); HashMap<String, String> success = new HashMap<String, String>(); HashMap<String, String> nosuccess = new HashMap<String, String>(); for (String url : urls) { try { Boolean ingest=false; if (match == null) ingest=true; else { Matcher m = match.matcher(url); if (m.matches()) ingest=true; } if (ingest){ String id = ingestECHOResource(url); success.put(id, url); } } catch (Exception e) { ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream s = new PrintStream(out); e.printStackTrace(s); nosuccess.put(url, out.toString()); e.printStackTrace(); } } System.out.println("SUCCESSFULL INGEST"); for (String id : success.keySet()) System.out.println("ID:" + id + " URL:" + success.get(id)); System.out.println("ERRORS:"); for (String id : nosuccess.keySet()) { System.out.println("URL:" + id); System.out.println("Message:" + nosuccess.get(id)); } } protected ArrayList<String> getAllResources() throws IOException { URL echoUrl = new URL(ECHOURL + "/getResourcesXML"); Pattern p = Pattern.compile("echoLink=\"([^\"]*)\""); BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl .openStream())); ArrayList<String> ret = new ArrayList<String>(); String inputLine; Matcher m; while ((inputLine = in.readLine()) != null) { m = p.matcher(inputLine); String lit; if (m.find()) { lit = m.group(1); ret.add(lit); } } in.close(); return ret; } protected String ingestECHOResource(String url) throws Exception { return ingestECHOResource(url, false); } protected String ingestECHOResource(String url,boolean withfullText) throws Exception { System.out.println("Starting:" + url); HashMap<String, String> dcs = new HashMap<String, String>(); XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); String pid = getOrCreatePID(client); if (pidAlreadyExists("mpiwg:"+pid)) { System.out.println("PID:"+pid); String contid=getIDfromPID("mpiwg:"+pid); System.out.println("------- belongsTo:"+contid); return contid; } eSciDocXmlObject obj = new eSciDocXmlObject("mpiwg:" + pid, ECHORESOURCE_TEMPLATE_XML); Object[] params = new Object[] {}; String title = (String) client.execute("getTitle", params); String ml = (String) client.execute("getMetaDataLink", params); if (withfullText){ String fulltextURL = url+"/getFullTextXML"; obj.addFullText(fulltextURL); } ml = correctML(ml); obj.addWebUrl(url); obj.addOrigUrlToMPIWGMetaData(url); // obj.setRelationship("info:fedora/echo:col1"); config.setServerURL(new URL(ZOPEPROVIDER + "/metadataMain")); client.setConfig(config); params = new Object[] { ml }; try { String result = (String) client.execute("getDCFormatted", params); System.out.println("dC:"+result); DocumentBuilderFactory factory = DocumentBuilderFactory .newInstance(); factory.setNamespaceAware(true); DocumentBuilder db = factory.newDocumentBuilder(); InputSource resultStream = new InputSource(new StringReader(result)); Document dc = db.parse(resultStream); obj.insertDC(dc); Document indexmeta = db.parse(ml); XPath xpath = XPathFactory.newInstance().newXPath(); xpath.setNamespaceContext(new EScidocNameSpaceContext()); NodeList test = (NodeList) xpath.evaluate("//meta", indexmeta, XPathConstants.NODESET); if (test.getLength() != 1) { test = (NodeList) xpath.evaluate("//mpiwg:meta", indexmeta, XPathConstants.NODESET); if (test.getLength() !=1) throw new Exception(); } obj.insertMeta(test.item(0)); obj.addIndexMetaUrl(ml); } catch (XmlRpcException e) { System.err.println("Ressource:" + url); System.err.println("METADATA CANNOT BE PARSED:" + ml); HashMap<String, String> dc = new HashMap<String, String>(); dc.put("title", title); // ersatzweise den titel aus der echo // collection obj.insertDC(dc); } catch (SAXParseException e) { System.err.println("METADATA RESULT CANNOT BE PARSED:"); HashMap<String, String> dc = new HashMap<String, String>(); dc.put("title", title); // ersatzweise den titel aus der echo // collection obj.insertDC(dc); } String xml = obj.printXML(); System.out.println(xml); return "XXX"; String result = ingest("/ir/item", xml); // String contid = EScidocBasicHandler.getId(result); // //String contid="NNNN"; // System.out.println("------->" + contid); // // params = new Object[] { pid }; // config.setServerURL(new URL(url)); // client.setConfig(config); // // client.execute("setPID", params); // addToCollection(ECHO_CONTAINER_ID, contid); // // addECHOObjectToCollection(client, contid); // return contid; } private boolean pidAlreadyExists(String pid) { String id; try{ id = getIDfromPID(pid); } catch (Exception e){ return false; } if (!id.equals("")) return true; return false; } private String correctML(String ml) { Pattern p = Pattern.compile("experimental/(.*)"); Matcher m = p.matcher(ml); String pf; if (m.find()) pf = "experimental/" + m.group(1); else { p = Pattern.compile("permanent/(.*)"); m = p.matcher(ml); if (m.find()) pf = "permanent/" + m.group(1); else return ml; } return SERVLETURL + pf; } protected ArrayList<String> getAllCollections() throws XmlRpcException, IOException { System.out.println("ECHO:"+ECHOURL); URL echoUrl = new URL(ECHOURL + "/getCollectionsXML"); Pattern p = Pattern.compile("echoLink=\"(.*)\""); BufferedReader in = new BufferedReader(new InputStreamReader(echoUrl .openStream())); ArrayList<String> ret = new ArrayList<String>(); String inputLine; Matcher m; while ((inputLine = in.readLine()) != null) { m = p.matcher(inputLine); String lit; if (m.find()) { lit = m.group(1); ret.add(lit); } } in.close(); return ret; } private void submitAndReleaseAnObject(String href) throws ClientProtocolException, IOException, JDOMException { addVersionPid(href); HttpResponse res = submitAnObject(href, "submit"); System.out.println(EScidocBasicHandler.convertStreamToString(res .getEntity().getContent())); res = releaseAnObject(href, "first release"); System.out.println(EScidocBasicHandler.convertStreamToString(res .getEntity().getContent())); } void releaseECHORessources() throws XmlRpcException, IOException, JDOMException { ArrayList<String> urls = getAllResources(); HashMap<String, String> success = new HashMap<String, String>(); HashMap<String, String> nosuccess = new HashMap<String, String>(); int numOfUrl= urls.size(); int count = 0; for (String url : urls) { XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); XmlRpcClient client = new XmlRpcClient(); config.setServerURL(new URL(url)); client.setConfig(config); Object[] params = new Object[] {}; if (pids == null) { pids = getPIDsAndEscidocIdsOfCollections(ECHO_CONTAINER_ID); } try { String parentPid; String pid = (String) client.execute("getPID", params); String contid = getIDfromPID("mpiwg:" + pid); submitAndReleaseAnObject("/ir/item/"+contid); success.put(pid, url); } catch (Exception e) { ByteArrayOutputStream out = new ByteArrayOutputStream(); PrintStream s = new PrintStream(out); e.printStackTrace(s); nosuccess.put(url, out.toString()); e.printStackTrace(); } count+=1; System.out.println("DONE:"+count+" of "+numOfUrl); } System.out.println("SUCCESSFULL ORGANIZED"); for (String id : success.keySet()) System.out.println("ID:" + id + " URL:" + success.get(id)); System.out.println("ERRORS:"); for (String id : nosuccess.keySet()) { System.out.println("URL:" + id); System.out.println("Message:" + nosuccess.get(id)); } } }