view src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip

Version mit integrierter Suche über XML-Volltexte
author dwinter
date Wed, 26 Jan 2011 14:41:09 +0100
parents
children
line wrap: on
line source

/**
 * 
 */
package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
import de.mpiwg.dwinter.lucencetools.documents.FileDocument;

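/**
 * Implementation of {@link IFulltextSearchDocsLines} that sends full-text
 * queries to a remote XML search server over HTTP and stores the returned
 * XML below the directory of a {@link TicketWriter} ticket.
 * 
 * @author dwinter
 * 
 */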
public class XMLSearchServerAdapter implements IFulltextSearchDocsLines {

	/** Logger used by this class; note that this is the log4j root logger. */
	protected static Logger logger = Logger.getRootLogger();

	/**
	 * URL prefix (ending in '?') of the query that lists all matching
	 * documents; request parameters are appended to it.
	 */
	public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?";
	// Previously used per-document endpoint, kept for reference:
	//public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?";
	/**
	 * URL prefix (ending in '?') of the query that is run against a single
	 * document; request parameters are appended to it.
	 */
	public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?";
	/** Base URL of the server interface; not referenced anywhere else in this class. */
	public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/";
	
	
	//http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas
	// http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas
	/**
	 * Runs the given query against the remote XML search server and stores the
	 * answers below the ticket directory.
	 * <p>
	 * The method requests the list of matching documents, writes the
	 * <code>result</code> and <code>resultInfo</code> files, commits the
	 * ticket and then, for every matching document, requests the hits inside
	 * that document and stores the returned XML in a folder named after the
	 * document. Finally the ticket is closed for the language folder.
	 * <p>
	 * If the server answer cannot be parsed as XML the problem is logged and
	 * the method returns without closing the ticket.
	 * 
	 * @param query
	 *            Lucene query; the text of its term on the field
	 *            <code>contents</code> is sent to the server
	 * @param calledLanguage
	 *            either <code>lang</code> or <code>lang:XML</code>
	 * @param ticket
	 *            ticket identifier handed to the {@link TicketWriter}
	 * @throws IOException
	 *             if a folder cannot be created, or if reading from the server
	 *             or writing a file fails
	 * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir(org.apache.lucene.search.Query,
	 *      java.lang.String, java.lang.String)
	 */
	@Override
	public void searchInLinesToDir(Query query, String calledLanguage, String ticket)
			throws IOException {
		// The language string is either "lang:XML" or just "lang"; the folder
		// name always carries the ":XML" suffix.
		final String language;
		final String languageFolderName;
		String[] langParts = calledLanguage.split(":");
		if (langParts.length > 1 && langParts[1].equals("XML")) {
			language = langParts[0];
			languageFolderName = calledLanguage;
		} else {
			language = calledLanguage;
			languageFolderName = calledLanguage + ":XML";
		}

		TicketWriter tw = new TicketWriter(ticket, query, languageFolderName);

		File languageFile = new File(tw.ticketFile.getAbsolutePath()
				+ TicketWriter.PATHSEPARATOR + languageFolderName);
		if (!languageFile.exists()) {
			logger.debug("Create Languagefolder:"
					+ languageFile.getCanonicalPath());
			if (!languageFile.mkdirs())
				throw new IOException("Could not create language folder: "
						+ languageFile.getAbsolutePath());
		}

		// Use the text of the term on the field "contents" as search word.
		Set<Term> terms = new HashSet<Term>();
		query.extractTerms(terms);
		String morphQuery = "";
		for (Term t : terms) {
			if (t.field().equals("contents"))
				morphQuery = t.text();
		}
		// The search word may contain blanks, '&' or non-ASCII characters and
		// therefore has to be encoded before it becomes part of a URL.
		String encodedQuery = URLEncoder.encode(morphQuery, "UTF-8");

		// Example:
		// docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50
		String queryString = XMLServerSearchBase
				+ "docbase=archimedes&docbase=echo&queryType=fulltextMorph"
				+ "&language=" + URLEncoder.encode(language, "UTF-8")
				+ "&ftMorphQuery=" + encodedQuery
				+ "&pn=1&output=xml&pageSize=500";

		SAXBuilder parser = new SAXBuilder();
		Document doc;
		try {
			doc = parser.build(queryString);
		} catch (JDOMException e) {
			logger.error("Cannot parse answer of " + queryString, e);
			return;
		}

		int counter = writeResults(tw, languageFolderName, ticket, doc);
		writeResultInfo(tw, doc, counter, languageFolderName);

		tw.commitTicket();

		List<?> docNodes;
		try {
			docNodes = XPath.newInstance("//document").selectNodes(doc);
		} catch (JDOMException e1) {
			logger.error("Cannot select the documents in the answer of "
					+ queryString, e1);
			return;
		}

		XMLOutputter op = new XMLOutputter(Format.getCompactFormat());
		// The same document may be listed several times; fetch it only once.
		Set<String> fetchedIds = new HashSet<String>();

		for (Object node : docNodes) {
			Element e = (Element) node;
			Element textIdElement;
			try {
				textIdElement = (Element) XPath.selectSingleNode(e, "uri");
			} catch (JDOMException e1) {
				logger.warn("Cannot read the uri of a document, skipping it", e1);
				continue;
			}
			if (textIdElement == null) {
				logger.warn("Document without uri in the answer, skipping it");
				continue;
			}
			String textId = textIdElement.getTextTrim();
			if (!fetchedIds.add(textId))
				continue;

			String localName = textId.replace(TicketWriter.PATHSEPARATORCHAR, ':');
			File docFile = new File(languageFile.getAbsolutePath()
					+ TicketWriter.PATHSEPARATOR + localName);
			if (!docFile.exists()) {
				logger.debug("Create Docfolder:" + docFile.getCanonicalPath());
				if (!docFile.mkdirs())
					throw new IOException("Could not create document folder: "
							+ docFile.getAbsolutePath());
			}

			// TODO: one file per page; at the moment there is only one file
			// per document.
			File pageFile = new File(docFile.getAbsolutePath()
					+ TicketWriter.PATHSEPARATOR + localName);

			// Example:
			// http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas
			String docQueryString = XMLDocSearchBase + "document=" + textId
					+ "&mode=text&query-type=fulltext&query=" + encodedQuery;

			Document pageDoc;
			try {
				pageDoc = parser.build(docQueryString);
			} catch (JDOMException e2) {
				logger.error("Cannot parse answer of " + docQueryString, e2);
				return;
			}

			// Write through an OutputStream so that the bytes match the
			// encoding named in the XML declaration, and close the file even
			// if writing fails.
			FileOutputStream pageOut = new FileOutputStream(pageFile);
			try {
				op.output(pageDoc, pageOut);
			} finally {
				pageOut.close();
			}
		}
		tw.closeTicket(languageFolderName);
	}

	/**
	 * Writes the file <code>resultInfo</code> into the language folder of the
	 * ticket. The file holds one <code>resultInfo</code> element with the
	 * language folder name and the number of hits; <code>hits</code> and
	 * <code>totalHits</code> both receive the value of <code>counter</code>.
	 * 
	 * @param tw
	 *            ticket writer whose ticket directory contains the language
	 *            folder
	 * @param doc
	 *            answer of the search server (currently not evaluated here)
	 * @param counter
	 *            number of hits to report
	 * @param languageFolderName
	 *            name of the language folder below the ticket directory
	 * @throws IOException
	 *             if the file cannot be opened or written
	 */
	private void writeResultInfo(TicketWriter tw, Document doc, int counter,
			String languageFolderName) throws IOException {
		File languageFile = new File(tw.ticketFile.getAbsolutePath()
				+ TicketWriter.PATHSEPARATOR + languageFolderName);
		File resultFile = new File(languageFile.getAbsolutePath()
				+ TicketWriter.PATHSEPARATOR + "resultInfo");

		StringBuilder info = new StringBuilder("<resultInfo>");
		// Escape the name so that the file stays well-formed for any value.
		info.append("<lang>").append(formatXML(languageFolderName)).append("</lang>");
		info.append("<hits>").append(counter).append("</hits>");
		// TODO: is there a difference between hits and totalHits in this case?
		info.append("<totalHits>").append(counter).append("</totalHits>");
		info.append("</resultInfo>");

		OutputStreamWriter rw = new OutputStreamWriter(new FileOutputStream(
				resultFile), "utf-8");
		try {
			rw.write(info.toString());
		} finally {
			// Release the file handle even if writing fails.
			rw.close();
		}
	}

	/**
	 * Writes the file <code>result</code> into the language folder of the
	 * ticket. For every <code>document</code> element of the server answer one
	 * <code>result</code> element with the document id and its metadata
	 * (author, title, place, date) is appended; a document that is listed more
	 * than once is written only once.
	 * 
	 * @param tw
	 *            ticket writer whose ticket directory contains the language
	 *            folder
	 * @param languageFolderName
	 *            name of the language folder below the ticket directory
	 * @param ticket
	 *            ticket identifier (currently not evaluated here)
	 * @param doc
	 *            answer of the search server
	 * @return number of written documents, or -1 if the result file cannot be
	 *         opened or the answer cannot be evaluated
	 * @throws IOException
	 *             if the language folder cannot be created or writing fails
	 */
	private int writeResults(TicketWriter tw, String languageFolderName, String ticket,
			Document doc) throws IOException {
		File languageFile = new File(tw.ticketFile.getAbsolutePath()
				+ TicketWriter.PATHSEPARATOR + languageFolderName);
		// The folder has to exist before a file can be opened inside it.
		if (!languageFile.exists()) {
			logger.debug("Create Languagefolder:"
					+ languageFile.getCanonicalPath());
			if (!languageFile.mkdirs())
				throw new IOException("Could not create language folder: "
						+ languageFile.getAbsolutePath());
		}

		File resultFile = new File(languageFile.getAbsolutePath()
				+ TicketWriter.PATHSEPARATOR + "result");
		OutputStreamWriter rw;
		try {
			rw = new OutputStreamWriter(new FileOutputStream(resultFile), "utf-8");
		} catch (FileNotFoundException e) {
			logger.error("Cannot open result file " + resultFile, e);
			return -1;
		} catch (UnsupportedEncodingException e) {
			logger.error("Cannot open result file " + resultFile, e);
			return -1;
		}

		// TODO: for some reason a document can occur several times in the
		// hit list; make sure every hit is written to the file only once.
		Set<String> idsAlreadyDone = new HashSet<String>();
		int counter = 0;
		try {
			List<?> documentNodes = XPath.newInstance("//document").selectNodes(doc);
			for (Object node : documentNodes) {
				Element e = (Element) node;
				String id = childText(e, "uri");
				if (id == null) {
					logger.warn("Document without uri in the answer, skipping it");
					continue;
				}
				if (!idsAlreadyDone.add(id))
					continue;

				StringBuilder ret = new StringBuilder(
						"<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">");
				ret.append("<cleanedPath>").append(formatXML(id)).append("</cleanedPath>");
				ret.append("<textId>").append(formatXML(id.replace("/", ":"))).append("</textId>");
				ret.append("<textIdCleaned>").append(formatXML(id.replace("/", "_")))
						.append("</textIdCleaned>");
				ret.append("<md>");
				ret.append(dcElement("creator", e, "author"));
				ret.append(dcElement("title", e, "title"));
				ret.append(dcElement("place", e, "place"));
				ret.append(dcElement("date", e, "date"));
				ret.append("</md>");
				ret.append("</result>");

				rw.write(ret.toString());
				counter++;
			}
			return counter;
		} catch (JDOMException e1) {
			logger.error("Cannot evaluate the answer of the search server", e1);
			return -1;
		} finally {
			// Close the file on every path, including the error returns.
			rw.close();
		}
	}

	/**
	 * Returns the trimmed text of the child element selected by the given
	 * XPath expression, or <code>null</code> if there is no such child.
	 */
	private String childText(Element parent, String childName)
			throws JDOMException {
		Element child = (Element) XPath.selectSingleNode(parent, childName);
		return child == null ? null : child.getTextTrim();
	}

	/**
	 * Builds one <code>dc:</code> metadata element from the text of the given
	 * child; a missing child yields an empty element.
	 */
	private String dcElement(String dcName, Element parent, String childName)
			throws JDOMException {
		String text = childText(parent, childName);
		return "<dc:" + dcName + ">" + formatXML(text == null ? "" : text)
				+ "</dc:" + dcName + ">";
	}

	/**
	 * Replaces the characters <code>&amp;</code>, <code>&lt;</code> and
	 * <code>&gt;</code> by their XML entities so that the text can be used as
	 * element content.
	 * 
	 * @param string
	 *            text to escape
	 * @return the escaped text
	 */
	private String formatXML(String string) {
		StringBuilder escaped = new StringBuilder(string.length());
		for (int i = 0; i < string.length(); i++) {
			char c = string.charAt(i);
			switch (c) {
			case '&':
				escaped.append("&amp;");
				break;
			case '<':
				escaped.append("&lt;");
				break;
			case '>':
				escaped.append("&gt;");
				break;
			default:
				escaped.append(c);
			}
		}
		return escaped.toString();
	}

	/**
	 * Not implemented in this adapter: the arguments are ignored and
	 * <code>null</code> is returned.
	 * 
	 * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc(java.lang.String,
	 *      org.apache.lucene.search.Query, java.lang.String)
	 */
	@Override
	public OCRDoc searchInLinesDoc(String textId, Query query, String language)
			throws IOException {
		// TODO: not implemented yet, callers always receive null
		return null;
	}

	/**
	 * Not implemented in this adapter: the arguments are ignored and
	 * <code>null</code> is returned.
	 * 
	 * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines(org.apache.lucene.search.Query,
	 *      java.lang.String)
	 */
	@Override
	public List<OCRDoc> searchInLines(Query query, String language)
			throws IOException {
		// TODO: not implemented yet, callers always receive null
		return null;
	}

	/**
	 * Manual test run: searches for the word "quantitas" in the field
	 * "contents" with the language "la" and writes the results to the ticket
	 * "121". An I/O problem is printed to standard error.
	 * 
	 * @param args
	 *            not evaluated
	 */
	public static void main(String[] args) {
		Query sampleQuery = new TermQuery(new Term("contents", "quantitas"));
		XMLSearchServerAdapter adapter = new XMLSearchServerAdapter();
		try {
			adapter.searchInLinesToDir(sampleQuery, "la", "121");
		} catch (IOException failure) {
			failure.printStackTrace();
		}
	}

	/**
	 * Lists the language identifiers this adapter announces as supported.
	 * 
	 * @return a fixed-size list with the entries "la:XML" and "it:XML"
	 */
	public static List<String> getSupportedLanguages() {
		return Arrays.asList("la:XML", "it:XML");
	}
}