Mercurial > hg > eSciDocImport
view src/de/mpiwg/itgroup/eSciDoc/importer/ECHOImporter.java @ 0:c6929e63b0b8
first import
author | dwinter |
---|---|
date | Wed, 24 Nov 2010 16:52:07 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.itgroup.eSciDoc.importer;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.lang.reflect.Array;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
import org.w3c.dom.Entity;

import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler;
import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
import de.mpiwg.itgroup.eSciDoc.Tools.Html2Text;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOCollection;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;

/**
 * Importer that reads ECHO resources and collections from an RDF document and
 * organizes the imported items into eSciDoc containers.
 */
public class ECHOImporter implements Importer {

    /** Classpath location of the XML template used for every new container. */
    private static final String CONTAINER_TEMPLATE =
            "/de/mpiwg/itgroup/eSciDoc/xmlTemplates/ECHOCollection_container.xml";

    /** Upper bound of processed resources; only enforced at DEBUG level (for debugging). */
    private static final long MAX_RES = 1000000L;

    private final Logger logger = Logger.getRootLogger();

    /** URL of the RDF document describing the ECHO instance. */
    private final URL instanceUrl;

    /** Content model href written into every container. TODO make this configurable. */
    private String collectionCMM = "/cmm/content-model/escidoc:11004";

    /**
     * @param url
     *            URL of the RDF document to import from
     */
    public ECHOImporter(URL url) {
        this.instanceUrl = url;
    }

    /**
     * Returns all objects of the given ECHO type found in the RDF document.
     * Parse and I/O failures are logged and yield an empty list.
     *
     * @param type
     *            ECHO type, {@code "ECHO_resource"} or {@code "ECHO_collection"}
     * @return the objects for which a PID could be obtained; never {@code null}
     */
    @Override
    public Iterable<ECHOObject> getObjectList(String type) {
        try {
            return getObjectListfromRDF(type);
        } catch (JDOMException e) {
            logger.error("Cannot parse RDF from " + instanceUrl, e);
        } catch (IOException e) {
            logger.error("Cannot read RDF from " + instanceUrl, e);
        }
        return new ArrayList<ECHOObject>();
    }

    /**
     * Parses the RDF document and builds one object per
     * {@code rdf:Description} whose {@code echonavigation:type} equals
     * {@code type}. Objects without a PID are logged and left out.
     *
     * @param type
     *            ECHO type to select
     * @return list of objects that have a PID
     * @throws JDOMException
     *             if the RDF cannot be parsed or an XPath is invalid
     * @throws IOException
     *             if the RDF cannot be read
     * @throws IllegalArgumentException
     *             if elements match but {@code type} is not a supported ECHO type
     */
    private ArrayList<ECHOObject> getObjectListfromRDF(String type)
            throws JDOMException, IOException {
        ArrayList<ECHOObject> ret = new ArrayList<ECHOObject>();

        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(instanceUrl);
        Element el = doc.getRootElement();

        // get resources
        XPath xpathResources = XPath
                .newInstance("//rdf:Description[echonavigation:type='" + type + "']");
        xpathResources.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
        xpathResources.addNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
        xpathResources.addNamespace("echonavigation", "http://www.echo.eu/rdf#");

        // JDOM 1.x returns a raw List; the expression selects elements only.
        @SuppressWarnings("unchecked")
        List<Element> paths = xpathResources.selectNodes(el);

        int counter = 0;
        for (Element path : paths) {
            counter++;
            logger.debug("resource counter:" + String.valueOf(counter));
            // Constant-first comparison so an unset logger level cannot cause an NPE.
            if (Level.DEBUG.equals(logger.getLevel()) && (counter > MAX_RES)) {
                break;
            }

            ECHOObject obj = getECHORessourceFromRDF(el, path, type);

            // attach the PID to the object
            String pid;
            try {
                pid = obj.getOrCreatePID();
            } catch (Exception e) {
                logger.error("getOrCreatePID failed for:" + obj.toString(), e);
                pid = null;
            }

            if (pid == null) {
                logger.error("Cannot createOrGetAn a PID for:" + obj.toString());
                logger.error("Object will not be added");
            } else {
                ret.add(obj);
            }
        }
        return ret;
    }

    /**
     * Builds a single ECHO object from its {@code rdf:Description} element and
     * enriches it with description, metadata and full text fetched over HTTP.
     *
     * @param el
     *            root element of the RDF document
     * @param path
     *            the {@code rdf:Description} element of the object
     * @param echotype
     *            {@code "ECHO_resource"} or {@code "ECHO_collection"}
     * @return the populated object, never {@code null}
     * @throws JDOMException
     *             on XPath errors
     * @throws IOException
     *             on I/O errors while parsing the name
     * @throws IllegalArgumentException
     *             if {@code echotype} is not one of the two supported values
     */
    private ECHOObject getECHORessourceFromRDF(Element el, Element path, String echotype)
            throws JDOMException, IOException {
        // Fail early and clearly; previously an unknown type surfaced as an NPE.
        boolean isResource = "ECHO_resource".equals(echotype);
        boolean isCollection = "ECHO_collection".equals(echotype);
        if (!isResource && !isCollection) {
            throw new IllegalArgumentException("Unsupported ECHO type: " + echotype);
        }

        XPath xpath = EScidocTools.getESciDocXpath("./@rdf:about");
        Attribute aboutAttr = (Attribute) xpath.selectSingleNode(path);
        String aboutString = aboutAttr.getValue();

        // fetch the name and strip HTML markup from it
        xpath = EScidocTools.getESciDocXpath(".//echonavigation:name");
        String name = ((Element) xpath.selectSingleNode(path)).getTextTrim();
        Html2Text htmlParser = new Html2Text();
        htmlParser.parse(new StringReader(name));
        name = htmlParser.getText();

        xpath = EScidocTools.getESciDocXpath(".//mpiwg:archive-path");
        Element archiveElement = (Element) xpath.selectSingleNode(path);
        String archivePath = "";
        if (archiveElement != null) {
            archivePath = archiveElement.getTextTrim();
        }

        // fetch the sequence (child references) of the object
        String sequenceString = ("//rdf:Seq[@rdf:about='" + aboutString
                + "']/rdf:li/@rdf:resource");
        xpath = EScidocTools.getESciDocXpath(sequenceString);
        // JDOM 1.x returns a raw List; the expression selects attributes only.
        @SuppressWarnings("unchecked")
        List<Attribute> seqs = xpath.selectNodes(el);

        ECHOObject er;
        if (isResource) {
            er = new ECHORessource(name, archivePath, aboutString);
        } else {
            er = new ECHOCollection(name, aboutString);
        }

        loadDescription(er, htmlParser);

        for (Attribute seq : seqs) {
            String typeString = ("//rdf:Description[@rdf:about='" + seq.getValue()
                    + "']/echonavigation:type");
            xpath = EScidocTools.getESciDocXpath(typeString);
            Element typeNode = (Element) xpath.selectSingleNode(el);
            if (typeNode == null) {
                logger.debug("getRessourceFromRDF, no type in:" + typeString);
                continue;
            }
            String type = typeNode.getTextTrim();

            // Metadata and full text only exist on resources, not on collections.
            if (!(er instanceof ECHORessource)) {
                continue;
            }
            ECHORessource resource = (ECHORessource) er;
            if (type.equals("ECHO_metaData")) {
                loadMetaData(resource,
                        seq.getValue().replace("showMetaDataXML", "getMetaDataLink"));
            } else if (type.equals("ECHO_fulltext")) {
                loadFullText(resource, seq.getValue() + "?noredirect=yes");
            }
        }

        logger.debug(er.toString());
        return er;
    }

    /**
     * Fetches {@code <echoUrl>/getDescription}, strips HTML and stores the
     * result in {@code er.description}. Best effort: failures are only logged.
     */
    private void loadDescription(ECHOObject er, Html2Text htmlParser) {
        DefaultHttpClient hc = new DefaultHttpClient();
        try {
            URI echoUri = new URI(er.echoUrl + "/getDescription");
            HttpResponse resp = hc.execute(new HttpGet(echoUri));
            HttpEntity respEnt = resp.getEntity();
            if (respEnt != null) {
                // filter html codes
                htmlParser.parse(new InputStreamReader(respEnt.getContent()));
                er.description = htmlParser.getText();
            }
        } catch (Exception e1) {
            logger.debug("echoImporter no URI:" + er.echoUrl, e1);
        } finally {
            hc.getConnectionManager().shutdown();
        }
    }

    /**
     * Fetches the metadata of a resource. HTTP 204 and any status of 300 or
     * above are stored as an empty string; request failures are logged and
     * leave the resource untouched.
     */
    private void loadMetaData(ECHORessource resource, String url) throws JDOMException {
        HttpClient client = new DefaultHttpClient();
        try {
            HttpResponse response = client.execute(new HttpGet(url));
            int code = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity();
            if ((code == 204) || (code >= 300) || entity == null) {
                resource.metaData = "";
            } else {
                String str = EScidocBasicHandler.convertStreamToString(entity.getContent());
                resource.metaData = resource.correctML(str);
            }
        } catch (IOException e) {
            logger.warn("Cannot load metadata from " + url, e);
        } finally {
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Fetches the full text of a resource. Request failures and responses
     * without a body are logged and leave the resource untouched.
     */
    private void loadFullText(ECHORessource resource, String url) {
        HttpClient client = new DefaultHttpClient();
        try {
            HttpEntity entity = client.execute(new HttpGet(url)).getEntity();
            if (entity == null) {
                logger.warn("No full text body returned by " + url);
                return;
            }
            resource.fullText = EScidocBasicHandler.convertStreamToString(entity.getContent());
        } catch (IOException e) {
            logger.warn("Cannot load full text from " + url, e);
        } finally {
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Creates containers based on the ECHO URLs stored in the metadata. The
     * URL path is analysed step by step and a tree is built from it.
     *
     * @param handler
     *            eSciDoc server connection
     * @param context
     *            context of the collection (should be a collection that was
     *            built from an ECHO web site)
     * @throws Exception
     *             if reading the links, building a container document or
     *             linking the containers fails
     */
    public void organizeRessourcesInCollections(EScidocBasicHandler handler, String context)
            throws Exception {
        // collection url -> "escidocId,url" entries of its members
        HashMap<String, ArrayList<String>> tree = new HashMap<String, ArrayList<String>>();
        HashMap<String, String> url2escidocId = new HashMap<String, String>();
        HashMap<String, String> containerUrl2escidocId = new HashMap<String, String>();
        // parent container url -> hrefs of the containers created below it
        HashMap<String, ArrayList<String>> containerTree = new HashMap<String, ArrayList<String>>();

        ArrayList<String> urls = handler.getAllLinksOfContext("web_page", context);
        generateTreeAndConversion(urls, tree, url2escidocId);

        dumpContainerUrls(tree.keySet());

        // now create the containers
        for (String containerUrl : tree.keySet()) {
            // Build the container document from the matching collection item;
            // fall back to a bare container when no such item exists.
            Document doc = createContainer(handler, context, url2escidocId, containerUrl);
            if (doc == null) {
                doc = createContainerFromECHO(handler, containerUrl, context);
            }

            // now fill the container
            XPath xp = EScidocTools.getESciDocXpath("//struct-map:struct-map");
            Element structmap = (Element) xp.selectSingleNode(doc);

            // add the collection itself to the container
            putContentInStructMap(structmap, url2escidocId.get(containerUrl));

            // now add the resources only (sub-collections are linked later)
            for (String content : tree.get(containerUrl)) {
                if (!contentIsCollection(handler, content)) {
                    putContentInStructMap(structmap, content);
                }
            }

            String containerXml = printXML(doc);
            logger.debug(containerXml);

            try {
                // create the object in eSciDoc
                String result = handler.createObject("/ir/container", containerXml);
                XPath hrefPath = EScidocTools
                        .getESciDocXpath("//container:container/@xlink:href");
                Document containerDoc = new SAXBuilder().build(EScidocBasicHandler
                        .convertStringToStream(result));
                Attribute containerHref = (Attribute) hrefPath.selectSingleNode(containerDoc);
                logger.debug("added container:" + containerHref);
                Logger.getLogger("addedFilesLogger").debug("added container:" + containerHref);

                // remember the new container below its parent url
                String parentContainer = joinParentSegments(containerUrl.split("/"));
                ArrayList<String> children = containerTree.get(parentContainer);
                if (children == null) {
                    children = new ArrayList<String>();
                    containerTree.put(parentContainer, children);
                }
                children.add(containerHref.getValue());
                containerUrl2escidocId.put(containerUrl, containerHref.getValue());
            } catch (Exception e) {
                Logger.getLogger("notAddedFilesLogger").debug(
                        "notadded container:" + containerUrl);
                logger.debug("notadded container:" + containerUrl, e);
            }
        }

        // add the containers to the struct maps of their parents
        addContainer(handler, containerTree, containerUrl2escidocId, context);
    }

    /**
     * Writes the container URLs to {@code /tmp/list.out}, one per line, as a
     * debugging aid. The writer is closed even if writing fails.
     */
    private void dumpContainerUrls(Iterable<String> containerUrls) throws IOException {
        FileWriter fw = new FileWriter(new File("/tmp/list.out"));
        try {
            for (String containerUrl : containerUrls) {
                fw.write(containerUrl);
                // Separator added so the dump is readable; URLs used to be concatenated.
                fw.write("\n");
            }
        } finally {
            fw.close();
        }
    }

    /**
     * Joins all but the last segment of a path that was split on {@code "/"}.
     *
     * @throws IllegalArgumentException
     *             if there are fewer than two segments, i.e. no parent exists
     */
    private static String joinParentSegments(String[] segments) {
        if (segments.length < 2) {
            throw new IllegalArgumentException("path has no parent segment");
        }
        StringBuilder parent = new StringBuilder();
        for (int i = 0; i < segments.length - 1; i++) {
            if (i > 0) {
                parent.append('/');
            }
            parent.append(segments[i]);
        }
        return parent.toString();
    }

    /**
     * Tests whether {@code content} refers to a collection or to a resource.
     *
     * @param handler
     *            eSciDoc server connection
     * @param content
     *            entry of the form {@code escidocId,echoUrl}
     * @return {@code true} if the content model of the object equals the
     *         collection content model
     * @throws IOException
     *             if the object cannot be fetched
     * @throws JDOMException
     *             if the response cannot be parsed
     */
    private boolean contentIsCollection(EScidocBasicHandler handler, String content)
            throws IOException, JDOMException {
        String url = content.split(",")[0];
        HttpResponse result = handler.eScidocGet(url);
        InputStream xml = result.getEntity().getContent();
        try {
            String cmm = EScidocBasicHandler.getContentModel(xml);
            // Constant-first comparison: a missing content model means "not a collection".
            return collectionCMM.equals(cmm);
        } finally {
            xml.close();
        }
    }

    /**
     * Adds one content entry as an {@code item} element to the struct-map.
     *
     * @param structmap
     *            the struct-map element to extend
     * @param content
     *            entry of the form {@code escidocId,url}; only the part before
     *            the first comma is used. {@code null} is ignored.
     */
    public void putContentInStructMap(Element structmap, String content) {
        if (content == null) { // does not exist
            return;
        }
        // entries from "get all links" always have the form escidoc:1,url
        String[] urlSplit = content.split(",");
        String newItemUrl = urlSplit[0];

        Element newItem = new Element("item", EScidocTools.srel);
        Namespace ns = Namespace.getNamespace("xlink", EScidocTools.xlink);
        newItem.setAttribute("href", newItemUrl, ns);
        structmap.addContent(newItem);
    }

    /**
     * Creates a container document for an existing collection item, copying
     * title and description from that item.
     *
     * @param handler
     *            eSciDoc server connection
     * @param context
     *            context href of the container
     * @param url2escidocId
     *            mapping url -> escidocId
     * @param collectionURL
     *            ECHO url of the collection the container is created for
     * @return the container document, or {@code null} if no collection item
     *         is known for {@code collectionURL}
     * @throws JDOMException
     *             on XML or XPath errors
     * @throws IOException
     *             if the template or the collection item cannot be read
     * @throws ClientProtocolException
     *             on HTTP protocol errors
     */
    public Document createContainer(EScidocBasicHandler handler, String context,
            HashMap<String, String> url2escidocId, String collectionURL)
            throws JDOMException, IOException, ClientProtocolException {
        String cmd = url2escidocId.get(collectionURL);
        // the corresponding collection does not exist
        if (cmd == null) {
            return null;
        }

        Document doc = loadContainerTemplate(context);

        Document ecDoc;
        InputStream in = handler.eScidocGet(cmd).getEntity().getContent();
        try {
            ecDoc = new SAXBuilder().build(in);
        } finally {
            in.close();
        }

        // copy title from collection to container
        XPath xp = EScidocTools.getESciDocXpath(
                "/escidocItem:item//metadata-records:md-record[@name='escidoc']//dc:title");
        Element item = (Element) xp.selectSingleNode(ecDoc);
        String title = "anon";
        if (item != null) {
            title = item.getTextTrim();
        }
        xp = EScidocTools.getESciDocXpath(
                "/container:container//metadata-records:md-record[@name='escidoc']//dc:title");
        item = (Element) xp.selectSingleNode(doc);
        item.setText(title);

        // copy description from collection to container
        XPath containerDescription = EScidocTools.getESciDocXpath(
                "/container:container//metadata-records:md-record[@name='escidoc']//dc:description");
        xp = EScidocTools.getESciDocXpath(
                "/escidocItem:item//metadata-records:md-record[@name='escidoc']//dc:description");
        item = (Element) xp.selectSingleNode(ecDoc);
        if (item != null) {
            String description = item.getTextTrim();
            item = (Element) containerDescription.selectSingleNode(doc);
            item.setText(description);
        } else {
            // get description from ECHO
            XPath url = EScidocTools.getESciDocXpath(
                    ".//escidocComponents:component[escidocComponents:properties/prop:content-category[text()='web_page']]/escidocComponents:content/@xlink:href");
            Attribute hrefECHO = (Attribute) url.selectSingleNode(ecDoc);
            if (hrefECHO != null) {
                DefaultHttpClient hc = new DefaultHttpClient();
                try {
                    HttpGet hg = new HttpGet(hrefECHO.getValue() + "/getDescription");
                    HttpResponse resp = hc.execute(hg);
                    HttpEntity respEnt = resp.getEntity();
                    if (respEnt != null) {
                        // The target is the container's description element. The item
                        // XPath used here before never matches the container document.
                        item = (Element) containerDescription.selectSingleNode(doc);
                        if (item != null) {
                            item.setText(EScidocBasicHandler
                                    .convertStreamToString(respEnt.getContent()));
                        }
                    }
                } finally {
                    hc.getConnectionManager().shutdown();
                }
            }
        }
        return doc;
    }

    /**
     * Loads the container XML template from the classpath and sets its
     * context and content-model hrefs.
     */
    private Document loadContainerTemplate(String context) throws JDOMException, IOException {
        InputStream is = getClass().getResourceAsStream(CONTAINER_TEMPLATE);
        if (is == null) {
            throw new IOException("Container template not found on classpath: "
                    + CONTAINER_TEMPLATE);
        }
        Document doc;
        try {
            doc = new SAXBuilder().build(is);
        } finally {
            is.close();
        }

        XPath xp = EScidocTools.getESciDocXpath("//srel:context/@xlink:href");
        Attribute href = (Attribute) xp.selectSingleNode(doc);
        href.setValue(context);

        xp = EScidocTools.getESciDocXpath("//srel:content-model/@xlink:href");
        href = (Attribute) xp.selectSingleNode(doc);
        // TODO make this configurable; currently the ECHO collection model
        href.setValue(collectionCMM);
        return doc;
    }

    /**
     * Builds a hierarchical tree from a list of entries of the form
     * {@code escidocId,url} and records the url -> escidocId mapping.
     * Entries are split at the first comma only, so URLs containing commas
     * are kept intact. Entries without a comma or without a URL are logged
     * and skipped.
     *
     * @param urls
     *            list of entries of the form {@code "escidocId,url"}
     * @param tree
     *            receives parent url -> list of full entries; should be an
     *            empty map
     * @param url2escidocId
     *            receives the mapping url -> escidocId
     */
    public void generateTreeAndConversion(ArrayList<String> urls,
            HashMap<String, ArrayList<String>> tree, HashMap<String, String> url2escidocId) {
        for (String url : urls) {
            int comma = url.indexOf(',');
            if (comma < 0 || comma == url.length() - 1) {
                logger.warn("generateTreeAndConversion, skipping malformed link:" + url);
                continue;
            }
            String escidocId = url.substring(0, comma);
            String echoUrl = url.substring(comma + 1);
            url2escidocId.put(echoUrl, escidocId);

            // the url without its last part is the url of the collection
            String[] segments = echoUrl.split("/");
            if (segments.length < 2) { // path is too short to have a parent
                continue;
            }
            String collectionUrl = joinParentSegments(segments);

            ArrayList<String> members = tree.get(collectionUrl);
            if (members == null) {
                members = new ArrayList<String>();
                tree.put(collectionUrl, members);
            }
            members.add(url);
        }
    }

    /**
     * Adds the created containers as members to their parent containers. A
     * parent that has no container yet is created from its ECHO url first.
     *
     * @param handler
     *            eSciDoc server connection
     * @param containerTree
     *            parent container url -> hrefs of its sub containers
     * @param containerUrl2escidocId
     *            container url -> escidoc href of the container
     * @param context
     *            context href used for newly created parents
     * @throws Exception
     *             if a parent cannot be created, fetched or updated
     */
    private void addContainer(EScidocBasicHandler handler,
            HashMap<String, ArrayList<String>> containerTree,
            HashMap<String, String> containerUrl2escidocId, String context) throws Exception {
        for (String containerUrl : containerTree.keySet()) {
            String escidocId = containerUrl2escidocId.get(containerUrl);
            if (escidocId == null) {
                // some containers have no ECHO collection or resource; create them here
                logger.debug("addContainer problem not in containerUrl2escidocId:"
                        + containerUrl);
                Document doc = createContainerFromECHO(handler, containerUrl, context);
                String res = handler.createObject("/ir/container", printXML(doc));
                escidocId = "/ir/container/" + EScidocBasicHandler.getId(res);
            }

            HttpResponse result = handler.eScidocGet(escidocId);
            String obj = EScidocBasicHandler.convertStreamToString(result.getEntity()
                    .getContent());
            String datestamp = EScidocBasicHandler.getDateStamp(obj);

            StringBuilder body = new StringBuilder();
            body.append(String.format("<param last-modification-date=\"%s\">", datestamp));
            // now insert the ids of all sub containers
            for (String content : containerTree.get(containerUrl)) {
                String[] tmp = content.split("/");
                String addID = tmp[tmp.length - 1];
                body.append(String.format("<id>%s</id>", addID));
            }
            body.append("</param>");

            result = handler.eScidocPost(escidocId + "/members/add",
                    EScidocBasicHandler.convertStringToStream(body.toString()));
            String retText = EScidocBasicHandler.convertStreamToString(result.getEntity()
                    .getContent());
            logger.debug("adding result:" + retText);
        }
    }

    /**
     * Creates a container document from ECHO data only; the url is used as
     * the title. The document is not sent to the server.
     *
     * @param handler
     *            eSciDoc server connection (currently unused)
     * @param url
     *            ECHO url, used as {@code dc:title}
     * @param context
     *            context href of the container
     * @return the container document
     * @throws Exception
     *             if the template cannot be loaded or modified
     */
    private Document createContainerFromECHO(EScidocBasicHandler handler, String url,
            String context) throws Exception {
        Document doc = loadContainerTemplate(context);

        XPath xp = EScidocTools.getESciDocXpath("//dc:title");
        Element item = (Element) xp.selectSingleNode(doc);
        item.setText(url);
        return doc;
    }

    /** Serializes the document to a string using a default {@link XMLOutputter}. */
    private String printXML(Document doc) {
        XMLOutputter out = new XMLOutputter();
        return out.outputString(doc);
    }
}