Mercurial > hg > fulltextSearch
view src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip
Version mit integrierter Suche über XML-Volltexte
author | dwinter |
---|---|
date | Wed, 26 Jan 2011 14:41:09 +0100 |
parents | |
children |
line wrap: on
line source
/**
 * 
 */
package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
import de.mpiwg.dwinter.lucencetools.documents.FileDocument;

/**
 * Search adapter that forwards the "contents" term of a Lucene query to a
 * remote XML full-text search server and stores the XML answers in the ticket
 * directory managed by a {@link TicketWriter}.
 *
 * @author dwinter
 */
public class XMLSearchServerAdapter implements IFulltextSearchDocsLines {

    protected static Logger logger = Logger.getRootLogger();

    /** Base URL of the query that lists all documents containing hits. */
    public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?";

    /**
     * Base URL of the query that returns the hits inside one document.
     * Previously used alternative:
     * http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?
     */
    public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?";

    public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/";

    // Example requests:
    // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas
    // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas

    /**
     * Runs the search on the XML server and writes the outcome below the
     * ticket directory: a "result" file, a "resultInfo" file and, for every
     * document returned by the server, one folder containing one XML file
     * with the server's answer for that document.
     *
     * @param query
     *            Lucene query; the text of its term on the field "contents"
     *            is sent to the server (if there are several such terms, the
     *            last one encountered wins)
     * @param calledLanguage
     *            either "lang:XML" or just "lang"
     * @param ticket
     *            ticket identifier handed to the {@link TicketWriter}
     * @throws IOException
     *             if a directory cannot be created or reading from the
     *             server / writing to disk fails
     *
     * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir
     *      (org.apache.lucene.search.Query, java.lang.String, java.lang.String)
     */
    @Override
    public void searchInLinesToDir(Query query, String calledLanguage, String ticket) throws IOException {
        // The language string may be "lang:XML" or just "lang".
        String language;
        String languageFolderName;
        String[] langSplitted = calledLanguage.split(":");
        if (langSplitted.length > 1 && langSplitted[1].equals("XML")) {
            language = langSplitted[0];
            languageFolderName = calledLanguage;
        } else {
            language = calledLanguage;
            languageFolderName = calledLanguage + ":XML";
        }

        TicketWriter tw = new TicketWriter(ticket, query, languageFolderName);
        File languageFile = new File(tw.ticketFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR
                + languageFolderName);
        ensureDirectory(languageFile, "Languagefolder");

        Set<Term> terms = new HashSet<Term>();
        query.extractTerms(terms);
        String morphQuery = "";
        for (Term t : terms) {
            if (t.field().equals("contents")) {
                morphQuery = t.text();
            }
        }
        // The term ends up in a URL, so it has to be percent-encoded;
        // otherwise non-ASCII or reserved characters corrupt the request.
        String encodedQuery = java.net.URLEncoder.encode(morphQuery, "UTF-8");

        // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50
        String queryString = XMLServerSearchBase + "docbase=archimedes&docbase=echo&queryType=fulltextMorph"
                + "&language=" + language
                + "&ftMorphQuery=" + encodedQuery
                + "&pn=1&output=xml&pageSize=500";

        SAXBuilder parser = new SAXBuilder();
        Document doc;
        try {
            doc = parser.build(queryString);
        } catch (JDOMException e) {
            logger.error("Cannot parse search result of " + queryString, e);
            return;
        }

        int counter = writeResults(tw, languageFolderName, ticket, doc);
        writeResultInfo(tw, doc, counter, languageFolderName);
        tw.commitTicket();

        List<Element> docElements;
        try {
            docElements = selectDocuments(doc);
        } catch (JDOMException e1) {
            logger.error("Cannot select the documents of the search result", e1);
            return;
        }

        for (Element e : docElements) {
            Element textIdElement;
            try {
                textIdElement = (Element) XPath.selectSingleNode(e, "uri");
            } catch (JDOMException e1) {
                logger.warn("Cannot read the uri of a document, skipping it", e1);
                continue;
            }
            if (textIdElement == null) {
                logger.warn("Document without uri element in search result, skipping it");
                continue;
            }

            String textId = textIdElement.getTextTrim();
            String folderName = textId.replace(TicketWriter.PATHSEPARATORCHAR, ':');
            File docFile = new File(languageFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR + folderName);
            ensureDirectory(docFile, "Docfolder");

            // TODO: write one file per page; currently there is only one
            // file per document.
            File pageFile = new File(docFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR + folderName);

            String docQueryString = XMLDocSearchBase + "document=" + textId
                    + "&mode=text&query-type=fulltext&query=" + encodedQuery;

            // Fetch first, open the file afterwards: a failed request then
            // neither leaks a stream nor leaves an empty file behind.
            Document pageDoc;
            try {
                pageDoc = parser.build(docQueryString);
            } catch (JDOMException e2) {
                // NOTE(review): as before, this aborts without calling
                // closeTicket - confirm that this is intended.
                logger.error("Cannot parse document result of " + docQueryString, e2);
                return;
            }

            // Writing to a stream lets the outputter encode the bytes with
            // the encoding of its Format instead of the platform charset a
            // FileWriter would use.
            FileOutputStream pageOut = new FileOutputStream(pageFile);
            try {
                XMLOutputter op = new XMLOutputter(Format.getCompactFormat());
                op.output(pageDoc, pageOut);
            } finally {
                pageOut.close();
            }
        }
        tw.closeTicket(languageFolderName);
    }

    /**
     * Creates the directory (including missing parents) if it does not exist.
     *
     * @param dir
     *            directory that must exist afterwards
     * @param label
     *            name used in the debug message, e.g. "Docfolder"
     * @throws IOException
     *             if the directory cannot be created
     */
    private static void ensureDirectory(File dir, String label) throws IOException {
        if (!dir.exists()) {
            logger.debug("Create " + label + ":" + dir.getCanonicalPath());
            if (!dir.mkdirs()) {
                throw new IOException("Cannot create directory " + dir.getAbsolutePath());
            }
        }
    }

    /**
     * Selects all "document" elements of a search result.
     */
    @SuppressWarnings("unchecked")
    private static List<Element> selectDocuments(Document doc) throws JDOMException {
        // JDOM 1.x returns a raw List here.
        return XPath.newInstance("//document").selectNodes(doc);
    }

    /**
     * Writes the file "resultInfo" containing the language folder name and
     * the number of hits into the language folder of the ticket.
     *
     * @param tw
     *            ticket writer providing the ticket directory
     * @param doc
     *            search result (currently not evaluated)
     * @param counter
     *            number written as both hits and totalHits
     * @param languageFolderName
     *            name of the language folder below the ticket directory
     * @throws IOException
     *             if the file cannot be written
     */
    private void writeResultInfo(TicketWriter tw, Document doc, int counter, String languageFolderName)
            throws IOException {
        File languageFile = new File(tw.ticketFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR
                + languageFolderName);
        File resultFile = new File(languageFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR + "resultInfo");

        StringBuilder ret = new StringBuilder("<resultInfo>");
        ret.append("<lang>").append(languageFolderName).append("</lang>");
        ret.append("<hits>").append(counter).append("</hits>");
        // TODO: is there a difference between hits and totalHits in this
        // case?
        ret.append("<totalHits>").append(counter).append("</totalHits>");
        ret.append("</resultInfo>");

        OutputStreamWriter rw = new OutputStreamWriter(new FileOutputStream(resultFile), "utf-8");
        try {
            rw.write(ret.toString());
        } finally {
            rw.close();
        }
    }

    /**
     * Writes one "result" entry per distinct document of the search result
     * into the file "result" in the language folder of the ticket.
     *
     * @param tw
     *            ticket writer providing the ticket directory
     * @param languageFolderName
     *            name of the language folder below the ticket directory
     * @param ticket
     *            ticket identifier (currently not evaluated)
     * @param doc
     *            search result returned by the server
     * @return number of entries written, or -1 if the file could not be
     *         opened or the search result could not be evaluated
     * @throws IOException
     *             if the language folder cannot be created or writing fails
     */
    private int writeResults(TicketWriter tw, String languageFolderName, String ticket, Document doc)
            throws IOException {
        File languageFile = new File(tw.ticketFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR
                + languageFolderName);
        // The folder has to exist before the result file can be opened.
        ensureDirectory(languageFile, "Languagefolder");
        File resultFile = new File(languageFile.getAbsolutePath() + TicketWriter.PATHSEPARATOR + "result");

        OutputStreamWriter rw;
        try {
            rw = new OutputStreamWriter(new FileOutputStream(resultFile), "utf-8");
        } catch (FileNotFoundException e) {
            logger.error("Cannot open " + resultFile.getAbsolutePath(), e);
            return -1;
        } catch (UnsupportedEncodingException e) {
            logger.error("Encoding utf-8 is not supported", e);
            return -1;
        }

        // TODO: for some reason a document can appear several times in the
        // hit list.
        Set<String> idsAlreadyDone = new HashSet<String>();
        int counter = 0;
        try {
            for (Element e : selectDocuments(doc)) {
                Element uri = (Element) XPath.selectSingleNode(e, "uri");
                if (uri == null) {
                    logger.warn("Document without uri element in search result, skipping it");
                    continue;
                }
                String textId = uri.getTextTrim();
                // Make sure every hit is written to the file only once.
                if (!idsAlreadyDone.add(textId)) {
                    continue;
                }

                StringBuilder md = new StringBuilder();
                md.append("<dc:creator>").append(metadataText(e, "author")).append("</dc:creator>");
                md.append("<dc:title>").append(metadataText(e, "title")).append("</dc:title>");
                md.append("<dc:place>").append(metadataText(e, "place")).append("</dc:place>");
                md.append("<dc:date>").append(metadataText(e, "date")).append("</dc:date>");

                StringBuilder ret = new StringBuilder(
                        "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">");
                ret.append("<cleanedPath>").append(formatXML(textId)).append("</cleanedPath>");
                ret.append("<textId>").append(formatXML(textId.replace("/", ":"))).append("</textId>");
                ret.append("<textIdCleaned>").append(formatXML(textId.replace("/", "_")))
                        .append("</textIdCleaned>");
                ret.append("<md>").append(md).append("</md>");
                ret.append("</result>");

                rw.write(ret.toString());
                counter++;
            }
        } catch (JDOMException e1) {
            logger.error("Cannot evaluate the search result", e1);
            return -1;
        } finally {
            // Close on every path, including the early return above.
            rw.close();
        }
        return counter;
    }

    /**
     * Returns the XML-escaped, trimmed text of the named child element, or an
     * empty string if the element is missing.
     */
    private String metadataText(Element parent, String childName) throws JDOMException {
        Element child = (Element) XPath.selectSingleNode(parent, childName);
        return child == null ? "" : formatXML(child.getTextTrim());
    }

    /**
     * Escapes the characters that must not appear literally in XML text
     * content.
     *
     * @param string
     *            raw text
     * @return text with ampersand, less-than and greater-than replaced by
     *         their entities
     */
    private String formatXML(String string) {
        // The ampersand goes first so the entities added below are not
        // escaped a second time.
        String retStr = string.replace("&", "&amp;");
        retStr = retStr.replace("<", "&lt;");
        retStr = retStr.replace(">", "&gt;");
        return retStr;
    }

    /**
     * Not implemented for the XML server.
     *
     * @return always {@code null}
     *
     * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc
     *      (java.lang.String, org.apache.lucene.search.Query, java.lang.String)
     */
    @Override
    public OCRDoc searchInLinesDoc(String textId, Query query, String language) throws IOException {
        // TODO Auto-generated method stub
        return null;
    }

    /**
     * Not implemented for the XML server.
     *
     * @return always {@code null}
     *
     * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines
     *      (org.apache.lucene.search.Query, java.lang.String)
     */
    @Override
    public List<OCRDoc> searchInLines(Query query, String language) throws IOException {
        // TODO Auto-generated method stub
        return null;
    }

    /**
     * Manual test: searches for "quantitas" in Latin texts using ticket
     * "121".
     */
    public static void main(String[] args) {
        Term t = new Term("contents", "quantitas");
        Query q = new TermQuery(t);
        XMLSearchServerAdapter sa = new XMLSearchServerAdapter();
        try {
            sa.searchInLinesToDir(q, "la", "121");
        } catch (IOException e) {
            logger.error("Search failed", e);
        }
    }

    /**
     * @return the language identifiers this adapter supports
     */
    public static List<String> getSupportedLanguages() {
        String langs[] = new String[] { "la:XML", "it:XML" };
        return Arrays.asList(langs);
    }
}