Mercurial > hg > fulltextSearch
changeset 2:2b29b0b6db16 default tip
Version mit integrierter Suche über XML-Volltexte
author | dwinter |
---|---|
date | Wed, 26 Jan 2011 14:41:09 +0100 |
parents | 5c9c31510f0c |
children | |
files | src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java src/de/mpiwg/dwinter/fulltext/search/IFulltextSearch.java src/de/mpiwg/dwinter/fulltext/search/IFulltextSearchDocsLines.java src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java src/de/mpiwg/dwinter/fulltext/searcher/ILanguageSearcher.java src/de/mpiwg/dwinter/fulltext/searcher/LanguageSearcher.java src/de/mpiwg/dwinter/fulltext/ticket/TicketWriter.java |
diffstat | 8 files changed, 490 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java Wed Nov 03 12:26:29 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltext/search/FulltextSearch.java Wed Jan 26 14:41:09 2011 +0100 @@ -43,7 +43,7 @@ -public class FulltextSearch { +public class FulltextSearch implements IFulltextSearch { protected File index_dir; @@ -82,14 +82,23 @@ languageSearchers.add(new LanguageSearcher("la",new StandardAnalyzer(Version.LUCENE_30),dir)); } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearch#search(org.apache.lucene.search.Query, java.lang.String) + */ public Collector search(Query query,String language) throws IOException{ return languageSearchers.searchLanguage(query, language); } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearch#search(org.apache.lucene.search.Query) + */ public HashMap<String, Collector> search(Query query) throws IOException{ return languageSearchers.searchAllLanguages(query); } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearch#searchMD(java.lang.String, java.lang.String, java.util.ArrayList) + */ public HashMap<String, Collector> searchMD(String searchString, String mdString, ArrayList<String> languages) throws ParseException, IOException { if (mdString==null & languages==null) return languageSearchers.parseAndsearchAllLanguages(searchString); @@ -98,6 +107,9 @@ } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearch#search(java.lang.String) + */ public HashMap<String, Collector> search(String searchString) throws IOException, ParseException { return languageSearchers.parseAndsearchAllLanguages(searchString); @@ -109,6 +121,9 @@ // return languageSearchers.parseAndsearchAndAnalyseAllLanguages(searchString); // } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearch#searchForMorph(java.lang.String, java.lang.String) + */ public ArrayList<String> searchForMorph(String path, String word) throws ParseException, IOException, 
ParserConfigurationException, SAXException, XPathExpressionException { LanguageSearcher searcher = languageSearchers.getSearcherByLanguage("morph");
--- a/src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java Wed Nov 03 12:26:29 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java Wed Jan 26 14:41:09 2011 +0100 @@ -34,7 +34,7 @@ import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; -public class FulltextSearchDocsLines extends FulltextSearch { +public class FulltextSearchDocsLines extends FulltextSearch implements IFulltextSearchDocsLines { protected static Logger logger = Logger.getRootLogger(); @@ -71,16 +71,8 @@ new StandardAnalyzer(Version.LUCENE_30), dir)); } - /** - * Erzeugt Ergebnisliste im Filesystem fuer die Weiterbenutzung übers - * Servlet - * - * @param query - * @param language - * @param ticket - * ticket unter dem auf die Daten zurückgegriffen werden soll. - * @return - * @throws IOException + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir(org.apache.lucene.search.Query, java.lang.String, java.lang.String) */ public void searchInLinesToDir(Query query, String language, String ticket) throws IOException { @@ -167,6 +159,9 @@ } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc(java.lang.String, org.apache.lucene.search.Query, java.lang.String) + */ public OCRDoc searchInLinesDoc(String textId,Query query, String language) throws IOException{ Query textIDQuery = new TermQuery(new Term("textId", textId)); BooleanQuery booleanQuery = new BooleanQuery(); @@ -182,6 +177,9 @@ } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines(org.apache.lucene.search.Query, java.lang.String) + */ public List<OCRDoc> searchInLines(Query query, String language) throws IOException {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltext/search/IFulltextSearch.java Wed Jan 26 14:41:09 2011 +0100 @@ -0,0 +1,34 @@ +package de.mpiwg.dwinter.fulltext.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPathExpressionException; + +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Query; +import org.xml.sax.SAXException; + +public interface IFulltextSearch { + + public abstract Collector search(Query query, String language) + throws IOException; + + public abstract HashMap<String, Collector> search(Query query) + throws IOException; + + public abstract HashMap<String, Collector> searchMD(String searchString, + String mdString, ArrayList<String> languages) + throws ParseException, IOException; + + public abstract HashMap<String, Collector> search(String searchString) + throws IOException, ParseException; + + public abstract ArrayList<String> searchForMorph(String path, String word) + throws ParseException, IOException, ParserConfigurationException, + SAXException, XPathExpressionException; + +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltext/search/IFulltextSearchDocsLines.java Wed Jan 26 14:41:09 2011 +0100 @@ -0,0 +1,32 @@ +package de.mpiwg.dwinter.fulltext.search; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.search.Query; + +import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; + +public interface IFulltextSearchDocsLines { + + /** + * Erzeugt Ergebnisliste im Filesystem fuer die Weiterbenutzung übers + * Servlet + * + * @param query + * @param language + * @param ticket + * ticket unter dem auf die Daten zurückgegriffen werden soll. + * @return + * @throws IOException + */ + public abstract void searchInLinesToDir(Query query, String language, + String ticket) throws IOException; + + public abstract OCRDoc searchInLinesDoc(String textId, Query query, + String language) throws IOException; + + public abstract List<OCRDoc> searchInLines(Query query, String language) + throws IOException; + +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java Wed Jan 26 14:41:09 2011 +0100 @@ -0,0 +1,364 @@ +/** + * + */ +package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.JDOMException; +import org.jdom.input.SAXBuilder; +import org.jdom.output.Format; +import org.jdom.output.XMLOutputter; +import org.jdom.xpath.XPath; + +import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; +import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; +import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; +import de.mpiwg.dwinter.lucencetools.documents.FileDocument; + +/** + * @author dwinter + * + */ +public class XMLSearchServerAdapter implements IFulltextSearchDocsLines { + + protected static Logger logger = Logger.getRootLogger(); + + public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?"; + //public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?"; + public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?"; + public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/"; + + + 
//http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas + // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir + * (org.apache.lucene.search.Query, java.lang.String, java.lang.String) + */ + + @Override + public void searchInLinesToDir(Query query, String calledLanguage, String ticket) + throws IOException { + String languageFolderName; + String language; + //check format of the language string could be lang:xml or just lang + String[] langsplitted = calledLanguage.split(":"); + if(langsplitted.length>1){ + if(langsplitted[1].equals("XML")){ + language=langsplitted[0]; + languageFolderName=calledLanguage; + } else { + language=calledLanguage; + languageFolderName=calledLanguage+":XML"; + } + + } else { + language=calledLanguage; + languageFolderName=calledLanguage+":XML"; + } + + + + TicketWriter tw = new TicketWriter(ticket, query, languageFolderName); + + File languageFile = new File(tw.ticketFile.getAbsolutePath() + + tw.PATHSEPARATOR + languageFolderName); + if (!languageFile.exists()) { + logger.debug("Create Languagefolder:" + + languageFile.getCanonicalPath()); + if (!languageFile.mkdirs()) + throw new IOException(); + } + + // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50 + SAXBuilder parser = new SAXBuilder(); + + String queryString = XMLServerSearchBase + + "docbase=archimedes&docbase=echo&queryType=fulltextMorph"; + queryString += "&language=" + language; + + Set<Term> terms = new HashSet<Term>(); + query.extractTerms(terms); + String morphQuery = ""; + for (Term t : terms) { + if (t.field().equals("contents")) + morphQuery = t.text(); + } + queryString += 
"&ftMorphQuery=" + morphQuery; + queryString += "&pn=1&output=xml&pageSize=500"; + Document doc; + try { + doc = parser.build(queryString); + } catch (JDOMException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return; + } + + int counter = writeResults(tw, languageFolderName, ticket, doc); + writeResultInfo(tw, doc, counter, languageFolderName); + + tw.commitTicket(); + + List<Element> docElements; + try { + XPath docsXP = XPath.newInstance("//document"); + docElements = docsXP.selectNodes(doc); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return; + } + + for (Element e : docElements) { + Element textIdElement; + try { + textIdElement = (Element) XPath.selectSingleNode(e, "uri"); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + continue; + } + String textId = textIdElement.getTextTrim(); + + File docFile = new File(languageFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); + if (!docFile.exists()) { + logger.debug("Create Docfolder:" + docFile.getCanonicalPath()); + if (!docFile.mkdirs()) + throw new IOException(); + } + + // TODO: jetzt fuer jede seite ein file, zur Zeit jeweils nur ein + // File pro Document! 
+ // for (String page:ocrDoc.linesInPage.keySet()){ + + File pageFile = new File(docFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); + FileWriter pageFileWriter = new FileWriter(pageFile); + // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas + http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas + // for (OCRLine line: ocrDoc.linesInPage.get(page)){ + // pageFileWriter.write("allLines"+"\n"); + // } + queryString = XMLDocSearchBase + "document=" + textId; + //queryString += "&queryType=fulltext&query=" + morphQuery; + queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; + + try { + doc = parser.build(queryString); + } catch (JDOMException e2) { + // TODO Auto-generated catch block + e2.printStackTrace(); + return; + } + + XMLOutputter op = new XMLOutputter(Format.getCompactFormat()); + op.output(doc, pageFileWriter); + pageFileWriter.close(); + } + tw.closeTicket(languageFolderName); + } + + private void writeResultInfo(TicketWriter tw, Document doc, int counter, + String languageFolderName) throws IOException { + //String languageFolderName = language + "_XML"; + + File languageFile = new File(tw.ticketFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + languageFolderName); + File resultFile = new File(languageFile.getAbsolutePath() + + TicketWriter.PATHSEPARATOR + "resultInfo"); + FileOutputStream fs = new FileOutputStream(resultFile); + OutputStreamWriter rw = new OutputStreamWriter(fs, "utf-8"); + String ret = "<resultInfo>"; + // int hits = docs.totalHits; + ret += "<lang>" + languageFolderName+"</lang>"; + ret += "<hits>" + counter + "</hits>"; + ret += "<totalHits>" + counter + "</totalHits>";// TODO: gibt es in + // diesem fall einen + // unterschied zwischen + // hits und 
totalhits? + + ret += "</resultInfo>"; + rw.write(ret); + rw.close(); + } + + private int writeResults(TicketWriter tw, String languageFolderName, String ticket, + Document doc) throws IOException { + OutputStreamWriter rw = null; + File languageFile; + //String languageFolderName = language + "_XML"; + + try { + languageFile = new File(tw.ticketFile.getAbsolutePath() + + tw.PATHSEPARATOR + languageFolderName); + File resultFile = new File(languageFile.getAbsolutePath() + + tw.PATHSEPARATOR + "result"); + FileOutputStream fs = new FileOutputStream(resultFile); + rw = new OutputStreamWriter(fs, "utf-8"); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return -1; + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return -1; + } + + Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus + // irgendwelche + // gruenden gibt es + // ein Dokument + // mehrfach in den + // Fundstellen + + // FileWriter rw = new FileWriter(resultFile); + + if (!languageFile.exists()) { + logger.debug("Create Languagefolder:" + + languageFile.getCanonicalPath()); + if (!languageFile.mkdirs()) + throw new IOException(); + } + Integer counter = 0; + @SuppressWarnings("unchecked") + List<Element> elements; + try { + XPath xpathDoc = XPath.newInstance("//document"); + elements = xpathDoc.selectNodes(doc); + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return -1; + } + + for (Element e : elements) { + try { + XPath xpathUri = XPath.newInstance("uri"); + Element uri = (Element) xpathUri.selectSingleNode(e); + + String id = uri.getTextTrim(); + if (!idsAlreadyDone.contains(id)) { // stelle sicher das alle + // treffer nur einmal in die + // date geschrieben werden. 
+ idsAlreadyDone.add(id); + + String textId = id; + String md = ""; + Element mdEl = (Element) XPath + .selectSingleNode(e, "author"); + md += "<dc:creator>" + formatXML(mdEl.getTextTrim()) + + "</dc:creator>"; + + mdEl = (Element) XPath.selectSingleNode(e, "title"); + md += "<dc:title>" + formatXML(mdEl.getTextTrim()) + + "</dc:title>"; + + mdEl = (Element) XPath.selectSingleNode(e, "place"); + md += "<dc:place>" + formatXML(mdEl.getTextTrim()) + + "</dc:place>"; + + mdEl = (Element) XPath.selectSingleNode(e, "date"); + md += "<dc:date>" + formatXML(mdEl.getTextTrim()) + + "</dc:date>"; + + String ret = "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">"; + ret += "<cleanedPath>" + textId + "</cleanedPath>"; + ret += "<textId>" + textId.replace("/", ":") + "</textId>"; + ret += "<textIdCleaned>" + textId.replace("/", "_") + + "</textIdCleaned>"; + ret += "<md>" + md + "</md>"; + ret += "</result>"; + + rw.write(ret); + counter++; + } + } catch (JDOMException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + return -1; + } + } + rw.close(); + return counter; + } + + private String formatXML(String string) { + String retStr = string.replace("&", "&amp;"); + retStr = retStr.replace("<", "&lt;"); + retStr = retStr.replace(">", "&gt;"); + return retStr; + } + + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc + * (java.lang.String, org.apache.lucene.search.Query, java.lang.String) + */ + @Override + public OCRDoc searchInLinesDoc(String textId, Query query, String language) + throws IOException { + // TODO Auto-generated method stub + return null; + } + + /* + * (non-Javadoc) + * + * @see + * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines + * (org.apache.lucene.search.Query, java.lang.String) + */ + @Override + public List<OCRDoc> searchInLines(Query query, String language) + throws IOException { + // TODO Auto-generated method stub + return null; + } + 
+ public static void main(String[] args) { + Term t = new Term("contents", "quantitas"); + Query q = new TermQuery(t); + XMLSearchServerAdapter sa = new XMLSearchServerAdapter(); + try { + sa.searchInLinesToDir(q, "la", "121"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static List<String> getSupportedLanguages() { + String langs[] = new String[] { "la:XML", "it:XML" }; + return Arrays.asList(langs); + + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltext/searcher/ILanguageSearcher.java Wed Jan 26 14:41:09 2011 +0100 @@ -0,0 +1,16 @@ +package de.mpiwg.dwinter.fulltext.searcher; + +import java.io.IOException; + +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Collector; + +public interface ILanguageSearcher { + + public abstract Collector parseAndSearch(String searchString) + throws ParseException, IOException; + + public abstract Collector parseAndSearch(String searchString, + String mdString) throws ParseException, IOException; + +} \ No newline at end of file
--- a/src/de/mpiwg/dwinter/fulltext/searcher/LanguageSearcher.java Wed Nov 03 12:26:29 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltext/searcher/LanguageSearcher.java Wed Jan 26 14:41:09 2011 +0100 @@ -28,7 +28,7 @@ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; -public class LanguageSearcher extends LanguageAnalyzer { +public class LanguageSearcher extends LanguageAnalyzer implements ILanguageSearcher { static final int MAX_RESULTS = 10000; // Maximalanzahl Treffer public Searcher searcher=null; @@ -42,6 +42,9 @@ } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.searcher.ILanguageSearcher#parseAndSearch(java.lang.String) + */ public Collector parseAndSearch(String searchString) throws ParseException, IOException { Query query= parser.parse(searchString); @@ -57,6 +60,9 @@ } + /* (non-Javadoc) + * @see de.mpiwg.dwinter.fulltext.searcher.ILanguageSearcher#parseAndSearch(java.lang.String, java.lang.String) + */ public Collector parseAndSearch(String searchString, String mdString) throws ParseException, IOException { Query query= parser.parse(searchString +" AND dcMetaData:"+mdString); System.out.println("Parse and search:"+query);
--- a/src/de/mpiwg/dwinter/fulltext/ticket/TicketWriter.java Wed Nov 03 12:26:29 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltext/ticket/TicketWriter.java Wed Jan 26 14:41:09 2011 +0100 @@ -53,9 +53,9 @@ private static final String TICKET_PATH = "/tmp/ticketfolder"; - protected static final String PATHSEPARATOR = "/"; - protected static final char PATHSEPARATORCHAR = '/'; - protected File ticketFile; + public static final String PATHSEPARATOR = "/"; + public static final char PATHSEPARATORCHAR = '/'; + public File ticketFile; private Map<String,LanguageWriter> languageFolders = new HashMap<String,LanguageWriter>(); private String ticket; @@ -171,7 +171,7 @@ lastChangeFw.close(); - }// TODO Auto-generated method stub + } /** @@ -349,7 +349,7 @@ return readFileToString(qf); } - private static String getQueryString(String lang,String ticket) throws FileNotFoundException, IOException { + public static String getQueryString(String lang,String ticket) throws FileNotFoundException, IOException { String ticketString = TICKET_PATH+PATHSEPARATOR+ticket+PATHSEPARATOR+lang; File qf = new File(ticketString+PATHSEPARATOR+"query"); @@ -492,6 +492,13 @@ } + public static String getFileContent(String ticket, String lang, String textId, + String pageFileName) throws FileNotFoundException, IOException { + File ticketFile = new File(TICKET_PATH+PATHSEPARATOR+ticket+PATHSEPARATOR+lang+PATHSEPARATOR+textId+PATHSEPARATOR+pageFileName); + return readFileToString(ticketFile); + + } +