Mercurial > hg > fulltextSearch
diff src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip
Version mit integrierter Suche über XML-Volltexte
author | dwinter |
---|---|
date | Wed, 26 Jan 2011 14:41:09 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java Wed Jan 26 14:41:09 2011 +0100 @@ -0,0 +1,364 @@ +/** + * + */ +package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.JDOMException; +import org.jdom.input.SAXBuilder; +import org.jdom.output.Format; +import org.jdom.output.XMLOutputter; +import org.jdom.xpath.XPath; + +import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; +import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; +import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; +import de.mpiwg.dwinter.lucencetools.documents.FileDocument; + +/** + * @author dwinter + * + */ +public class XMLSearchServerAdapter implements IFulltextSearchDocsLines { + + protected static Logger logger = Logger.getRootLogger(); + + public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?"; + //public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?"; + public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?"; + public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/"; + + + 
//http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas + // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir + * (org.apache.lucene.search.Query, java.lang.String, java.lang.String) + */ + + @Override + public void searchInLinesToDir(Query query, String calledLanguage, String ticket) + throws IOException { + String languageFolderName; + String language; + //check format of the language string could be lang:xml or just lang + String[] langsplitted = calledLanguage.split(":"); + if(langsplitted.length>1){ + if(langsplitted[1].equals("XML")){ + language=langsplitted[0]; + languageFolderName=calledLanguage; + } else { + language=calledLanguage; + languageFolderName=calledLanguage+":XML"; + } + + } else { + language=calledLanguage; + languageFolderName=calledLanguage+":XML"; + } + + + + TicketWriter tw = new TicketWriter(ticket, query, languageFolderName); + + File languageFile = new File(tw.ticketFile.getAbsolutePath() + + tw.PATHSEPARATOR + languageFolderName); + if (!languageFile.exists()) { + logger.debug("Create Languagefolder:" + + languageFile.getCanonicalPath()); + if (!languageFile.mkdirs()) + throw new IOException(); + } + + // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50 + SAXBuilder parser = new SAXBuilder(); + + String queryString = XMLServerSearchBase + + "docbase=archimedes&docbase=echo&queryType=fulltextMorph"; + queryString += "&language=" + language; + + Set<Term> terms = new HashSet<Term>(); + query.extractTerms(terms); + String morphQuery = ""; + for (Term t : terms) { + if (t.field().equals("contents")) + morphQuery = t.text(); + } + queryString += 
"&ftMorphQuery=" + morphQuery; + queryString += "&pn=1&output=xml&pageSize=500"; + Document doc; + try { + doc = parser.build(queryString); + } catch (JDOMException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return; + } + + int counter = writeResults(tw, languageFolderName, ticket, doc); + writeResultInfo(tw, doc, counter, languageFolderName); + + tw.commitTicket(); + + List<Element> docElements; + try { + XPath docsXP = XPath.newInstance("//document"); + docElements = docsXP.selectNodes(doc); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return; + } + + for (Element e : docElements) { + Element textIdElement; + try { + textIdElement = (Element) XPath.selectSingleNode(e, "uri"); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + continue; + } + String textId = textIdElement.getTextTrim(); + + File docFile = new File(languageFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); + if (!docFile.exists()) { + logger.debug("Create Docfolder:" + docFile.getCanonicalPath()); + if (!docFile.mkdirs()) + throw new IOException(); + } + + // TODO: jetzt fuer jede seite ein file, zur Zeit jeweils nur ein + // File pro Document! 
+ // for (String page:ocrDoc.linesInPage.keySet()){ + + File pageFile = new File(docFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); + FileWriter pageFileWriter = new FileWriter(pageFile); + // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas + http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas + // for (OCRLine line: ocrDoc.linesInPage.get(page)){ + // pageFileWriter.write("allLines"+"\n"); + // } + queryString = XMLDocSearchBase + "document=" + textId; + //queryString += "&queryType=fulltext&query=" + morphQuery; + queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; + + try { + doc = parser.build(queryString); + } catch (JDOMException e2) { + // TODO Auto-generated catch block + e2.printStackTrace(); + return; + } + + XMLOutputter op = new XMLOutputter(Format.getCompactFormat()); + op.output(doc, pageFileWriter); + pageFileWriter.close(); + } + tw.closeTicket(languageFolderName); + } + + private void writeResultInfo(TicketWriter tw, Document doc, int counter, + String languageFolderName) throws IOException { + //String languageFolderName = language + "_XML"; + + File languageFile = new File(tw.ticketFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + languageFolderName); + File resultFile = new File(languageFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + "resultInfo"); + FileOutputStream fs = new FileOutputStream(resultFile); + OutputStreamWriter rw = new OutputStreamWriter(fs, "utf-8"); + String ret = "<resultInfo>"; + // int hits = docs.totalHits; + ret += "<lang>" + languageFolderName+"</lang>"; + ret += "<hits>" + counter + "</hits>"; + ret += "<totalHits>" + counter + "</totalHits>";// TODO: gibt es in + // diesem fall einen + // unterschied zwischen + // hits und 
totalhits? + + ret += "</resultInfo>"; + rw.write(ret); + rw.close(); + } + + private int writeResults(TicketWriter tw, String languageFolderName, String ticket, + Document doc) throws IOException { + OutputStreamWriter rw = null; + File languageFile; + //String languageFolderName = language + "_XML"; + + try { + languageFile = new File(tw.ticketFile.getAbsolutePath() + + tw.PATHSEPARATOR + languageFolderName); + File resultFile = new File(languageFile.getAbsolutePath() + + tw.PATHSEPARATOR + "result"); + FileOutputStream fs = new FileOutputStream(resultFile); + rw = new OutputStreamWriter(fs, "utf-8"); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return -1; + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return -1; + } + + Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus + // irgendwelche + // gruenden gibt es + // ein Dokument + // mehrfach in den + // Fundstellen + + // FileWriter rw = new FileWriter(resultFile); + + if (!languageFile.exists()) { + logger.debug("Create Languagefolder:" + + languageFile.getCanonicalPath()); + if (!languageFile.mkdirs()) + throw new IOException(); + } + Integer counter = 0; + @SuppressWarnings("unchecked") + List<Element> elements; + try { + XPath xpathDoc = XPath.newInstance("//document"); + elements = xpathDoc.selectNodes(doc); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return -1; + } + + for (Element e : elements) { + try { + XPath xpathUri = XPath.newInstance("uri"); + Element uri = (Element) xpathUri.selectSingleNode(e); + + String id = uri.getTextTrim(); + if (!idsAlreadyDone.contains(id)) { // stelle sicher das alle + // treffer nur einmal in die + // date geschrieben werden. 
+ idsAlreadyDone.add(id); + + String textId = id; + String md = ""; + Element mdEl = (Element) XPath + .selectSingleNode(e, "author"); + md += "<dc:creator>" + formatXML(mdEl.getTextTrim()) + + "</dc:creator>"; + + mdEl = (Element) XPath.selectSingleNode(e, "title"); + md += "<dc:title>" + formatXML(mdEl.getTextTrim()) + + "</dc:title>"; + + mdEl = (Element) XPath.selectSingleNode(e, "place"); + md += "<dc:place>" + formatXML(mdEl.getTextTrim()) + + "</dc:place>"; + + mdEl = (Element) XPath.selectSingleNode(e, "date"); + md += "<dc:date>" + formatXML(mdEl.getTextTrim()) + + "</dc:date>"; + + String ret = "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">"; + ret += "<cleanedPath>" + textId + "</cleanedPath>"; + ret += "<textId>" + textId.replace("/", ":") + "</textId>"; + ret += "<textIdCleaned>" + textId.replace("/", "_") + + "</textIdCleaned>"; + ret += "<md>" + md + "</md>"; + ret += "</result>"; + + rw.write(ret); + counter++; + } + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return -1; + } + } + rw.close(); + return counter; + } + + private String formatXML(String string) { + String retStr = string.replace("&", "&"); + retStr = retStr.replace("<", "<"); + retStr = retStr.replace(">", ">"); + return retStr; + } + + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc + * (java.lang.String, org.apache.lucene.search.Query, java.lang.String) + */ + @Override + public OCRDoc searchInLinesDoc(String textId, Query query, String language) + throws IOException { + // TODO Auto-generated method stub + return null; + } + + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines + * (org.apache.lucene.search.Query, java.lang.String) + */ + @Override + public List<OCRDoc> searchInLines(Query query, String language) + throws IOException { + // TODO Auto-generated method stub + return null; + } + 
+ public static void main(String[] args) { + Term t = new Term("contents", "quantitas"); + Query q = new TermQuery(t); + XMLSearchServerAdapter sa = new XMLSearchServerAdapter(); + try { + sa.searchInLinesToDir(q, "la", "121"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static List<String> getSupportedLanguages() { + String langs[] = new String[] { "la:XML", "it:XML" }; + return Arrays.asList(langs); + + } +}