/*
 * Source: src/de/mpiwg/itgroup/eSciDoc/importer/ECHOImporter.java @ 0:c6929e63b0b8
 *
 * Commit message: first import
 * Author: dwinter
 * Date: Wed, 24 Nov 2010 16:52:07 +0100
 * Parents: (none listed)
 * Children: (none listed)
 */

package de.mpiwg.itgroup.eSciDoc.importer;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.lang.reflect.Array;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
import org.w3c.dom.Entity;

import de.mpiwg.itgroup.eSciDoc.Tools.EScidocBasicHandler;
import de.mpiwg.itgroup.eSciDoc.Tools.EScidocTools;
import de.mpiwg.itgroup.eSciDoc.Tools.Html2Text;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOCollection;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHOObject;
import de.mpiwg.itgroup.eSciDoc.echoObjects.ECHORessource;
import de.mpiwg.itgroup.eSciDoc.utils.eSciDocXmlObject;

public class ECHOImporter implements Importer {

	/** Root log4j logger; all diagnostics of this importer are written through it. */
	private static final Logger logger = Logger.getRootLogger();

	/**
	 * Debugging aid: when the root logger level is DEBUG, the RDF import stops
	 * after this many resources.
	 */
	private static final long MAX_RES = 1000000L;

	/** Location of the RDF document the objects are read from. */
	private final URL instanceUrl;

	/**
	 * Href of the content model that is written into every container built by
	 * this class and that identifies an object as a collection.
	 */
	private final String collectionCMM = "/cmm/content-model/escidoc:11004";

	/**
	 * Creates an importer that reads its objects from the given RDF document.
	 *
	 * @param url
	 *            location of the RDF document
	 */
	public ECHOImporter(URL url) {
		this.instanceUrl = url;
	}

	/**
	 * Returns all objects of the given ECHO type found in the RDF document of
	 * this importer.
	 *
	 * Parse and I/O failures are logged and answered with an empty list, so
	 * callers never receive {@code null}.
	 *
	 * @param type
	 *            value of echonavigation:type to select, e.g. "ECHO_resource"
	 * @return the objects found, or an empty list if the RDF could not be read
	 */
	@Override
	public Iterable<ECHOObject> getObjectList(String type) {

		try {
			return getObjectListfromRDF(type);
		} catch (JDOMException e) {
			// keep the best-effort contract, but route the failure through the logger
			logger.error("Cannot build object list of type " + type
					+ " from " + instanceUrl, e);
		} catch (IOException e) {
			logger.error("Cannot read object list of type " + type
					+ " from " + instanceUrl, e);
		}
		return new ArrayList<ECHOObject>();
	}

	/**
	 * Parses the RDF document at {@code instanceUrl} and builds one object for
	 * every rdf:Description whose echonavigation:type equals {@code type}.
	 *
	 * Each object is asked for its PID; objects for which no PID can be
	 * obtained are logged and left out of the result.
	 *
	 * @param type
	 *            value of echonavigation:type to select; it is inserted into an
	 *            XPath string literal as is, so it must not contain a single
	 *            quote
	 * @return the objects that have a PID
	 * @throws JDOMException
	 *             if the document cannot be parsed or an XPath fails
	 * @throws IOException
	 *             if the document cannot be read
	 */
	private ArrayList<ECHOObject> getObjectListfromRDF(String type)
			throws JDOMException, IOException {

		ArrayList<ECHOObject> ret = new ArrayList<ECHOObject>();

		SAXBuilder builder = new SAXBuilder();
		Document doc = builder.build(instanceUrl);
		Element el = doc.getRootElement();

		// select the descriptions of the requested type
		XPath xpathResources = XPath
				.newInstance("//rdf:Description[echonavigation:type='" + type
						+ "']");
		xpathResources.addNamespace("MPIWG",
				"http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
		xpathResources.addNamespace("rdf",
				"http://www.w3.org/1999/02/22-rdf-syntax-ns#");
		xpathResources
				.addNamespace("echonavigation", "http://www.echo.eu/rdf#");

		// JDOM 1.x returns a raw List; the expression selects elements only
		@SuppressWarnings("unchecked")
		List<Element> paths = xpathResources.selectNodes(el);

		int counter = 0;
		for (Element path : paths) {
			counter++;
			logger.debug("resource counter:" + counter);
			// the limit only applies while debugging; comparison is null-safe
			if (Level.DEBUG.equals(logger.getLevel()) && (counter > MAX_RES))
				break;

			ECHOObject obj = getECHORessourceFromRDF(el, path, type);
			if (obj == null) {
				logger.error("No object could be built for resource number "
						+ counter);
				continue;
			}

			// make sure the object has a PID before it is handed out
			String pid;
			try {
				pid = obj.getOrCreatePID();
			} catch (Exception e) {
				logger.error("getOrCreatePID failed for:" + obj.toString(), e);
				pid = null;
			}
			if (pid == null) {
				logger.error("Cannot createOrGetAn a PID for:" + obj.toString());
				logger.error("Object will not be added");
			} else {
				ret.add(obj);
			}
		}

		return ret;
	}

	/**
	 * Builds one ECHO object from its rdf:Description element.
	 *
	 * Name, archive path and rdf:about are read from the description. The
	 * description text is fetched from {@code echoUrl + "/getDescription"}.
	 * For resources, every member of the object's rdf:Seq that is typed
	 * ECHO_metaData or ECHO_fulltext is downloaded and stored in the
	 * resource. Download failures are logged and leave the corresponding
	 * field untouched.
	 *
	 * @param el
	 *            root element of the RDF document; used to resolve the rdf:Seq
	 *            of the object and the types of its members
	 * @param path
	 *            the rdf:Description element of the object
	 * @param echotype
	 *            "ECHO_resource" or "ECHO_collection"
	 * @return the populated object
	 * @throws JDOMException
	 *             if an XPath expression cannot be built or evaluated
	 * @throws IOException
	 *             if the name cannot be run through the HTML filter
	 * @throws IllegalArgumentException
	 *             if {@code echotype} is neither of the two supported values
	 */
	private ECHOObject getECHORessourceFromRDF(Element el, Element path,
			String echotype) throws JDOMException, IOException {

		// rdf:about identifies the object and is the key of its rdf:Seq
		XPath xpath = EScidocTools.getESciDocXpath("./@rdf:about");
		Attribute aboutAttr = (Attribute) xpath.selectSingleNode(path);
		String about = aboutAttr.getValue();

		xpath = EScidocTools.getESciDocXpath(".//echonavigation:name");
		String name = ((Element) xpath.selectSingleNode(path)).getTextTrim();

		Html2Text htmlParser = new Html2Text(); // filter html codes
		htmlParser.parse(new StringReader(name));
		name = htmlParser.getText();

		// the archive path is optional
		xpath = EScidocTools.getESciDocXpath(".//mpiwg:archive-path");
		Element archiveElement = (Element) xpath.selectSingleNode(path);
		String archivePath = "";
		if (archiveElement != null)
			archivePath = archiveElement.getTextTrim();

		// members of the object's sequence
		String sequenceString = ("//rdf:Seq[@rdf:about='" + about + "']/rdf:li/@rdf:resource");
		xpath = EScidocTools.getESciDocXpath(sequenceString);

		// JDOM 1.x returns a raw List; the expression selects attributes only
		@SuppressWarnings("unchecked")
		List<Attribute> seqs = xpath.selectNodes(el);

		ECHOObject er;
		if ("ECHO_resource".equals(echotype)) {
			er = new ECHORessource(name, archivePath, about);
		} else if ("ECHO_collection".equals(echotype)) {
			er = new ECHOCollection(name, about);
		} else {
			// previously this surfaced later as a NullPointerException
			throw new IllegalArgumentException("Unsupported ECHO type: "
					+ echotype);
		}

		// set description (best effort: a failure is only logged)
		DefaultHttpClient hc = new DefaultHttpClient();
		try {
			URI echoUri = new URI(er.echoUrl + "/getDescription");
			HttpResponse resp = hc.execute(new HttpGet(echoUri));
			HttpEntity respEnt = resp.getEntity();
			if (respEnt != null) {
				// filter html codes
				// NOTE(review): the reader uses the platform default charset
				// and the parser instance is the one already used for the
				// name; assumes parse() starts with an empty buffer - confirm
				htmlParser.parse(new InputStreamReader(respEnt.getContent()));
				er.description = htmlParser.getText();
			}
		} catch (Exception e1) {
			logger.debug("echoImporter no URI:" + er.echoUrl, e1);
		} finally {
			// release the connection held by this one-shot client
			hc.getConnectionManager().shutdown();
		}

		for (Attribute seq : seqs) {
			String typeString = ("//rdf:Description[@rdf:about='"
					+ seq.getValue() + "']/echonavigation:type");
			xpath = EScidocTools.getESciDocXpath(typeString);

			Element typeNode = (Element) xpath.selectSingleNode(el);
			if (typeNode == null) {
				logger.debug("getRessourceFromRDF, no type in:" + typeString);
				continue;
			}
			if (!(er instanceof ECHORessource)) {
				// only resources carry metadata and full text
				continue;
			}
			ECHORessource resource = (ECHORessource) er;
			String type = typeNode.getTextTrim();

			if (type.equals("ECHO_metaData")) {

				String metaDataUrl = seq.getValue().replace(
						"showMetaDataXML", "getMetaDataLink");
				HttpClient client = new DefaultHttpClient();
				try {
					HttpResponse ret = client.execute(new HttpGet(metaDataUrl));
					int code = ret.getStatusLine().getStatusCode();
					HttpEntity entity = ret.getEntity();
					if ((code == 204) || (code >= 300) || (entity == null)) {
						resource.metaData = "";
					} else {
						String str = EScidocBasicHandler
								.convertStreamToString(entity.getContent());
						resource.metaData = resource.correctML(str);
					}
				} catch (IOException e) {
					// the request or the download failed: keep metaData as it is
					logger.error("Cannot load metadata from " + metaDataUrl, e);
				} finally {
					client.getConnectionManager().shutdown();
				}

			} else if (type.equals("ECHO_fulltext")) {

				String fullTextUrl = seq.getValue() + "?noredirect=yes";
				HttpClient client = new DefaultHttpClient();
				try {
					HttpResponse ret = client.execute(new HttpGet(fullTextUrl));
					HttpEntity entity = ret.getEntity();
					if (entity == null) {
						logger.debug("no full text returned by " + fullTextUrl);
					} else {
						String str = EScidocBasicHandler
								.convertStreamToString(entity.getContent());
						resource.fullText = str;
					}
				} catch (IOException e) {
					// the request or the download failed: keep fullText as it is
					logger.error("Cannot load full text from " + fullTextUrl, e);
				} finally {
					client.getConnectionManager().shutdown();
				}

			}

		}

		logger.debug(er.toString());
		return er;
	}

	/**
	 * Creates containers based on the ECHO urls stored in the metadata. The
	 * url paths are analysed step by step and a tree is built from them.
	 *
	 * For every parent url found, a container is created in eSciDoc. Its
	 * struct-map receives the collection item itself and all direct members
	 * that are not collections. Afterwards the new containers are added as
	 * members to their parent containers.
	 *
	 * A container that cannot be created is logged and skipped.
	 *
	 * @param handler
	 *            connection to the eSciDoc server
	 * @param context
	 *            context of the collection (should be a collection that was
	 *            built from an ECHO web site)
	 * @throws Exception
	 *             if the links cannot be read, a container document cannot be
	 *             built or the containers cannot be linked to their parents
	 */
	public void organizeRessourcesInCollections(EScidocBasicHandler handler,
			String context) throws Exception {
		// parent url -> entries ("escidocId,url") located directly below it
		HashMap<String, ArrayList<String>> tree = new HashMap<String, ArrayList<String>>();
		// echo url -> eSciDoc id of the item
		HashMap<String, String> url2escidocId = new HashMap<String, String>();

		// echo url of a collection -> href of the container created for it
		HashMap<String, String> containerUrl2escidocId = new HashMap<String, String>();
		// parent url -> hrefs of the containers created directly below it
		HashMap<String, ArrayList<String>> containerTree = new HashMap<String, ArrayList<String>>();

		ArrayList<String> urls = handler.getAllLinksOfContext("web_page",
				context);
		generateTreeAndConversion(urls, tree, url2escidocId);

		// debugging dump of all parent urls; the urls are written back to back
		// without a separator, as before
		File tt = new File("/tmp/list.out");
		FileWriter fw = new FileWriter(tt);
		try {
			for (String containerUrl : tree.keySet()) {
				fw.write(containerUrl);
			}
		} finally {
			// close the file even if a write fails
			fw.close();
		}

		// now create the containers
		for (String containerUrl : tree.keySet()) {
			XPath xp;

			// build the container document with the context and the metadata
			// of the collection item
			Document doc = createContainer(handler, context, url2escidocId,
					containerUrl);

			if (doc == null) {
				// no collection item is known for this url
				doc = createContainerFromECHO(handler, containerUrl, context);
			}

			// now fill the container

			xp = EScidocTools.getESciDocXpath("//struct-map:struct-map");
			Element structmap = (Element) xp.selectSingleNode(doc);

			// add the collection itself to the container
			putContentInStructMap(structmap, url2escidocId.get(containerUrl));

			// then add the resources only
			for (String content : tree.get(containerUrl)) {
				if (!contentIsCollection(handler, content))
					putContentInStructMap(structmap, content);
			}
			String containerXml = printXML(doc);
			logger.debug(containerXml);

			try {

				// create the object in eSciDoc
				String result = handler.createObject("/ir/container",
						containerXml);
				xp = EScidocTools
						.getESciDocXpath("//container:container/@xlink:href");
				Document containerDoc = new SAXBuilder().build(EScidocBasicHandler
						.convertStringToStream(result));
				Attribute containerHref = (Attribute) xp.selectSingleNode(containerDoc);
				logger.debug("added container:" + containerHref);
				Logger.getLogger("addedFilesLogger").debug(
						"added container:" + containerHref);

				// remember the new container below its parent: the parent url
				// is the container url without its last path segment
				String[] splitted = containerUrl.split("/");
				StringBuilder buffer = new StringBuilder();
				for (int i = 0; i < splitted.length - 2; i++) {
					buffer.append(splitted[i]);
					buffer.append("/");
				}
				buffer.append(splitted[splitted.length - 2]);

				String parentContainer = buffer.toString();

				if (!containerTree.containsKey(parentContainer)) {
					containerTree.put(parentContainer, new ArrayList<String>());
				}
				containerTree.get(parentContainer).add(containerHref.getValue());

				containerUrl2escidocId.put(containerUrl, containerHref.getValue());

			} catch (Exception e) {
				// the entry of the dedicated logger keeps its format; the cause
				// goes to the root logger only
				Logger.getLogger("notAddedFilesLogger").debug(
						"notadded container:" + containerUrl);
				logger.debug("notadded container:" + containerUrl, e);
			}

		}
		// add the containers to the struct maps of their parents
		addContainer(handler, containerTree, containerUrl2escidocId, context);
	}

	/**
	 * Tests whether the given content is a collection rather than a resource.
	 *
	 * The object is fetched from eSciDoc and its content model is compared
	 * with {@code collectionCMM}.
	 *
	 * @param handler
	 *            connection to the eSciDoc server
	 * @param content
	 *            entry of the form "escidocId,echoUrl"; only the part before
	 *            the first comma is used
	 * @return {@code true} if the content model of the object equals
	 *         {@code collectionCMM}; {@code false} otherwise, also when no
	 *         content model could be determined
	 * @throws IOException
	 *             if the object cannot be read or the response has no body
	 * @throws JDOMException
	 *             if the response cannot be parsed
	 */
	private boolean contentIsCollection(EScidocBasicHandler handler, String content) throws IOException, JDOMException {
		String url = content.split(",")[0];
		HttpResponse result = handler.eScidocGet(url);
		HttpEntity entity = result.getEntity();
		if (entity == null) {
			throw new IOException("eSciDoc returned no content for " + url);
		}
		InputStream xml = entity.getContent();
		try {
			String cmm = EScidocBasicHandler.getContentModel(xml);
			// constant first: a missing content model must not raise an NPE
			return collectionCMM.equals(cmm);
		} finally {
			// closing the content stream releases the connection
			xml.close();
		}
	}

	/**
	 * Appends one item reference to a struct-map.
	 *
	 * @param structmap
	 *            the struct-map element that receives the new item
	 * @param content
	 *            entry of the form "escidocId,url"; the part before the first
	 *            comma becomes the xlink:href of the item. {@code null} means
	 *            the object does not exist and nothing is added.
	 */
	public void putContentInStructMap(Element structmap, String content) {
		if (content != null) {
			// entries delivered by "get all links" always look like
			// escidoc:1,url - only the id part is referenced
			String itemHref = content.split(",")[0];

			Element itemElement = new Element("item", EScidocTools.srel);
			Namespace xlinkNamespace = Namespace.getNamespace("xlink",
					EScidocTools.xlink);

			itemElement.setAttribute("href", itemHref, xlinkNamespace);
			structmap.addContent(itemElement);
		}
	}

	/**
	 * Builds the document of a container for an ECHO collection.
	 *
	 * The container template is filled with the context and the collection
	 * content model. Title and description are copied from the eSciDoc item
	 * of the collection; if the item has no description, the description is
	 * fetched from the ECHO web page referenced by the item, when there is
	 * one.
	 *
	 * @param handler
	 *            connection to the eSciDoc server
	 * @param context
	 *            href of the context the container belongs to
	 * @param url2escidocId
	 *            mapping url -> eSciDoc id
	 * @param collectionURL
	 *            ECHO url of the collection the container is created for
	 * @return the container document, or {@code null} if no eSciDoc id is
	 *         known for {@code collectionURL}
	 * @throws JDOMException
	 *             if the template or the item cannot be parsed
	 * @throws IOException
	 *             if the template is missing or a request fails
	 * @throws ClientProtocolException
	 *             if the description request violates the HTTP protocol
	 */
	public Document createContainer(EScidocBasicHandler handler,
			String context, HashMap<String, String> url2escidocId,
			String collectionURL) throws JDOMException, IOException,
			ClientProtocolException {

		// no matching collection item: nothing to build, so skip the template
		String cmd = url2escidocId.get(collectionURL);
		if (cmd == null) {
			return null;
		}

		String template = "/de/mpiwg/itgroup/eSciDoc/xmlTemplates/ECHOCollection_container.xml";
		InputStream is = getClass().getResourceAsStream(template);
		if (is == null) {
			throw new IOException("container template not found on classpath: "
					+ template);
		}
		Document doc;
		try {
			doc = new SAXBuilder().build(is);
		} finally {
			is.close();
		}

		XPath xp = EScidocTools
				.getESciDocXpath("//srel:context/@xlink:href");
		Attribute href = (Attribute) xp.selectSingleNode(doc);
		href.setValue(context);

		xp = EScidocTools
				.getESciDocXpath("//srel:content-model/@xlink:href");
		href = (Attribute) xp.selectSingleNode(doc);
		// TODO make this configurable; for now the ECHO collection model is used
		href.setValue(collectionCMM);

		// load the item of the collection
		InputStream in = handler
				.eScidocGet(cmd).getEntity()
				.getContent();
		Document ecDoc;
		try {
			ecDoc = new SAXBuilder().build(in);
		} finally {
			in.close();
		}

		// copy the title from the collection item to the container
		xp = EScidocTools.getESciDocXpath("/escidocItem:item//metadata-records:md-record[@name='escidoc']//dc:title");
		Element item = (Element) xp.selectSingleNode(ecDoc);
		String title = "anon";
		if (item != null)
			title = item.getTextTrim();

		xp = EScidocTools.getESciDocXpath("/container:container//metadata-records:md-record[@name='escidoc']//dc:title");
		item = (Element) xp.selectSingleNode(doc);
		item.setText(title);

		// target of the description inside the container document; both
		// branches below write to it
		XPath containerDescription = EScidocTools
				.getESciDocXpath("/container:container//metadata-records:md-record[@name='escidoc']//dc:description");

		xp = EScidocTools.getESciDocXpath("/escidocItem:item//metadata-records:md-record[@name='escidoc']//dc:description");
		item = (Element) xp.selectSingleNode(ecDoc);
		if (item != null) {
			String description = item.getTextTrim();
			item = (Element) containerDescription.selectSingleNode(doc);
			item.setText(description);
		} else {
			// get description from ECHO
			XPath url = EScidocTools
					.getESciDocXpath(".//escidocComponents:component[escidocComponents:properties/prop:content-category[text()='web_page']]/escidocComponents:content/@xlink:href");
			Attribute hrefECHO = (Attribute) url.selectSingleNode(ecDoc);
			if (hrefECHO != null) {
				DefaultHttpClient hc = new DefaultHttpClient();
				try {
					HttpGet hg = new HttpGet(hrefECHO.getValue()
							+ "/getDescription");
					HttpResponse resp = hc.execute(hg);
					HttpEntity respEnt = resp.getEntity();
					if (respEnt != null) {
						// must be looked up with the container expression: the
						// item expression never matches the container document
						item = (Element) containerDescription
								.selectSingleNode(doc);
						item.setText(EScidocBasicHandler
								.convertStreamToString(respEnt.getContent()));
					}
				} finally {
					// release the connection held by this one-shot client
					hc.getConnectionManager().shutdown();
				}
			}
		}
		return doc;
	}

	/**
	 * Turns a flat list of entries of the form "escidocId,url" into a
	 * hierarchy and records which eSciDoc id belongs to which url.
	 *
	 * For every entry the text after the first comma is mapped to the text
	 * before it. If the entry contains at least one "/", the entry is also
	 * filed under its parent url, i.e. the url without its last path segment.
	 *
	 * @param urls
	 *            entries of the form "escidocId,url"
	 * @param tree
	 *            receives parent url -> entries below it; should be an empty
	 *            map
	 * @param url2escidocId
	 *            receives the mapping url -> eSciDoc id
	 */
	public void generateTreeAndConversion(ArrayList<String> urls,
			HashMap<String, ArrayList<String>> tree,
			HashMap<String, String> url2escidocId) {
		for (String entry : urls) {
			String[] idAndUrl = entry.split(",");
			url2escidocId.put(idAndUrl[1], idAndUrl[0]);

			String[] segments = entry.split("/");
			if (segments.length <= 1) {
				// path is too short to have a parent
				continue;
			}

			// everything but the last segment is the entry of the collection
			StringBuilder parentEntry = new StringBuilder(segments[0]);
			for (int i = 1; i <= segments.length - 2; i++) {
				parentEntry.append("/").append(segments[i]);
			}

			// keep the url only, not the escidoc part in front of the comma
			String parentUrl = parentEntry.toString().split(",")[1];

			if (!tree.containsKey(parentUrl)) {
				tree.put(parentUrl, new ArrayList<String>());
			}
			tree.get(parentUrl).add(entry);
		}
	}

	/**
	 * Adds the created containers as members to their parent containers.
	 *
	 * For every parent url in {@code containerTree} the parent container is
	 * looked up in {@code containerUrl2escidocId}. If there is none, a
	 * container is created from the url alone. The ids of all children are
	 * then posted to the "members/add" resource of the parent together with
	 * its current modification date.
	 *
	 * @param handler
	 *            connection to the eSciDoc server
	 * @param containerTree
	 *            parent url -> hrefs of the containers directly below it
	 * @param containerUrl2escidocId
	 *            url of a collection -> href of its container
	 * @param context
	 *            context used when a missing parent container is created
	 * @throws Exception
	 *             if a container cannot be created, read or updated
	 */
	private void addContainer(EScidocBasicHandler handler,
			HashMap<String, ArrayList<String>> containerTree,
			HashMap<String, String> containerUrl2escidocId, String context)
			throws Exception {
		for (String parentUrl : containerTree.keySet()) {
			String parentId = containerUrl2escidocId.get(parentUrl);
			if (parentId == null) {
				// some parents have no ECHO collection of their own: create
				// the container from the url
				logger.debug("addContainer problem not in containerUrl2escidocId:"+parentUrl);
				Document created = createContainerFromECHO(handler, parentUrl, context);
				String createResult = handler.createObject("/ir/container", printXML(created));
				parentId = "/ir/container/" + EScidocBasicHandler.getId(createResult);
			}

			// the update needs the current modification date of the parent
			HttpResponse response = handler.eScidocGet(parentId);
			String parentXml = EScidocBasicHandler.convertStreamToString(response
					.getEntity().getContent());
			String datestamp = EScidocBasicHandler.getDateStamp(parentXml);

			StringBuilder request = new StringBuilder(String.format(
					"<param last-modification-date=\"%s\">", datestamp));

			// append the id of every sub container; the id is the last path
			// segment of its href
			for (String childHref : containerTree.get(parentUrl)) {
				String[] hrefParts = childHref.split("/");
				request.append("<id>").append(hrefParts[hrefParts.length - 1])
						.append("</id>");
			}
			request.append("</param>");

			response = handler.eScidocPost(parentId + "/members/add",
					EScidocBasicHandler.convertStringToStream(request.toString()));
			String answer = EScidocBasicHandler.convertStreamToString(response
					.getEntity().getContent());
			logger.debug("adding result:" + answer);
		}

	}

	/**
	 * Builds the document of a container from ECHO data alone.
	 *
	 * The container template is filled with the context and the collection
	 * content model; the url itself is used as title. The document is only
	 * built here, it is not sent to the server.
	 *
	 * @param handler
	 *            connection to the eSciDoc server (currently not used)
	 * @param url
	 *            ECHO url of the collection; becomes the title
	 * @param context
	 *            href of the context the container belongs to
	 * @return the container document
	 * @throws Exception
	 *             if the template is missing or cannot be parsed
	 */
	private Document createContainerFromECHO(EScidocBasicHandler handler,
			String url, String context) throws Exception {
		String template = "/de/mpiwg/itgroup/eSciDoc/xmlTemplates/ECHOCollection_container.xml";
		InputStream is = getClass().getResourceAsStream(template);
		if (is == null) {
			throw new IOException("container template not found on classpath: "
					+ template);
		}
		Document doc;
		try {
			doc = new SAXBuilder().build(is);
		} finally {
			// the template stream was never closed before
			is.close();
		}

		XPath xp = EScidocTools.getESciDocXpath("//srel:context/@xlink:href");
		Attribute href = (Attribute) xp.selectSingleNode(doc);
		href.setValue(context);

		xp = EScidocTools.getESciDocXpath("//srel:content-model/@xlink:href");
		href = (Attribute) xp.selectSingleNode(doc);
		// TODO make this configurable; for now the ECHO collection model is used
		href.setValue(collectionCMM);

		// no metadata is available, so the url serves as title
		xp = EScidocTools.getESciDocXpath("//dc:title");
		Element item = (Element) xp.selectSingleNode(doc);
		item.setText(url);

		return doc;
	}

	/**
	 * Serialises a document with a default {@code XMLOutputter}.
	 *
	 * @param doc
	 *            the document to serialise
	 * @return the XML text of the document
	 */
	private String printXML(Document doc) {
		return new XMLOutputter().outputString(doc);
	}
}