# HG changeset patch # User dwinter # Date 1296049294 -3600 # Node ID 83e9a828e794a18f95510829eec3c14ab4371a38 # Parent db87c1b7eb6de2bb2341f3ed53b2f9bb87ef0192 Version mit integrierter Suche über XML-Volltexte diff -r db87c1b7eb6d -r 83e9a828e794 WebContent/WEB-INF/web.xml --- a/WebContent/WEB-INF/web.xml Wed Nov 03 12:18:46 2010 +0100 +++ b/WebContent/WEB-INF/web.xml Wed Jan 26 14:41:34 2011 +0100 @@ -13,7 +13,7 @@ de.mpwig.dwinter.fulltextSearchServer.lineIndex -/Volumes/data/indexLibcollLines +/Volumes/data/indexLibcollLines2 de.mpwig.dwinter.fulltextSearchServer.docIndex diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java --- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java Wed Jan 26 14:41:34 2011 +0100 @@ -50,7 +50,7 @@ import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines; import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; -import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; +import de.mpiwg.dwinter.fulltext.searcher.ILanguageSearcher; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager; import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools; @@ -222,7 +222,7 @@ FulltextSearchDocsLines searcher = new FulltextSearchDocsLines( docIndex, lineDir); - LanguageSearcher ls = searcher.languageSearchers + ILanguageSearcher ls = searcher.languageSearchers .getSearcherByLanguage(lang); if (ls == null) { setStatus(Status.CLIENT_ERROR_NOT_FOUND); diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java --- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java Wed Jan 26 14:41:34 2011 +0100 @@ -5,6 +5,7 @@ 
import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.List; import java.util.Properties; import org.restlet.data.Form; @@ -17,6 +18,7 @@ import org.restlet.resource.Get; import de.mpiwg.dwinter.fulltext.search.FulltextSearchConfig; +import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter; import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager; public class SearchServerInfo extends ServerResource{ //need options for crossdomain scripting @@ -90,13 +92,20 @@ } private Representation getSupportedLanguages() { - // TODO Auto-generated method stub + - ArrayList langs = config.getSupportedLanguages(); + List langs = config.getSupportedLanguages(); String ret=""; for (String lang: langs){ ret+=""+lang+""; } + + langs= XMLSearchServerAdapter.getSupportedLanguages(); + for (String lang: langs){ + ret+=""+lang+""; + } + + ret+=""; return new StringRepresentation(ret, MediaType.TEXT_XML); } @@ -104,12 +113,21 @@ private Representation getSupportedLanguagesHTML() { // TODO Auto-generated method stub - ArrayList langs = config.getSupportedLanguages(); String ret="
"; + + List langs = config.getSupportedLanguages(); for (String lang: langs){ ret+="
"+lang+"
"; } + langs= XMLSearchServerAdapter.getSupportedLanguages(); + for (String lang: langs){ + ret+="
"+lang+"
"; + } + + ret+="
"; + + return new StringRepresentation(ret, MediaType.TEXT_HTML); } } diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java --- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java Wed Jan 26 14:41:34 2011 +0100 @@ -3,11 +3,14 @@ import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.List; import java.util.Properties; import java.util.concurrent.ConcurrentMap; import org.apache.log4j.Logger; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.search.Query; import org.apache.lucene.store.LockObtainFailedException; import org.restlet.Context; import org.restlet.data.Form; @@ -27,6 +30,9 @@ import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; +import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager; import de.mpiwg.dwinter.fulltextSearchServer.searchThreads.SearchInlinesThread; @@ -103,7 +109,14 @@ //String ticket = generateTicket(); String searchString=searchForm.getValues("searchString"); - String languages=searchForm.getValues("languages"); // language der form la1_la2_la3___ + String languages; + try { + languages = URLDecoder.decode(searchForm.getValues("languages"),"utf-8"); + } catch (UnsupportedEncodingException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + languages=""; + } // language der form la1_la2_la3___ String searchMetaData=searchForm.getValues("searchMetaData"); //no Searchstring if (searchString==null || languages==null ) @@ -146,11 +159,14 @@ } + IFulltextSearchDocsLines[] 
fulltextSearchers = new IFulltextSearchDocsLines[]{fulltextSearcher, new XMLSearchServerAdapter()}; + + String[] langs = languages.split("_"); for (String lang: langs){ - SearchInlinesThread st = new SearchInlinesThread(fulltextSearcher, searchString, searchMetaData,lang,ticket); + SearchInlinesThread st = new SearchInlinesThread(fulltextSearchers, searchString, searchMetaData,lang,ticket); st.start(); } diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java --- a/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java Wed Jan 26 14:41:34 2011 +0100 @@ -24,8 +24,11 @@ import javax.xml.transform.stream.StreamSource; import org.apache.log4j.Logger; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.data.Form; import org.restlet.data.MediaType; +import org.restlet.data.Reference; import org.restlet.data.Status; import org.restlet.representation.Representation; import org.restlet.representation.StringRepresentation; @@ -35,6 +38,7 @@ import org.w3c.dom.Document; import org.xml.sax.SAXException; +import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools; import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools; @@ -141,28 +145,28 @@ logger.debug("lang:"+lang); logger.debug("textId:"+textId); logger.debug("restpath:"+restPath); - String xml; - String txt; + String html; + if (restPath.equals("")){ - xml = showContent(ticket,lang,textId); - if (xml==null){ + html = showContent(ticket,lang,textId); + if (html==null){ setStatus(Status.SUCCESS_ACCEPTED); //still waiting return new StringRepresentation("waiting",MediaType.TEXT_HTML); } - txt = 
XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl"); + html = XMLTools.transformToHTML(html,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl"); } else { - xml = processRestPath(ticket,lang,textId,restPath); - if (xml==null){ + html = processRestPath(ticket,lang,textId,restPath); + if (html==null){ setStatus(Status.SUCCESS_ACCEPTED); //still waiting return new StringRepresentation("waiting",MediaType.TEXT_HTML); } - txt = XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); + //txt=xml; } - StringRepresentation representation = new StringRepresentation(txt, + StringRepresentation representation = new StringRepresentation(html, MediaType.TEXT_HTML) ; return representation; @@ -224,13 +228,12 @@ xml=null; } - return xml; + return XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); } /** - * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib unterstuetzt, bzw. egal welcher - * Mode angegeben wird es wird immer das gleiche gemacht, naemlich ein Link auf Digilib ausgegeben. + * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib und generic unterstuetzt. * Der Pfad ist in DIGIVIEWBASICSTRING festgelegt. 
* @param ticket * @param lang @@ -246,26 +249,42 @@ String ret=""; ret+=""+pageFileName+""; ArrayList points = new ArrayList(); - + try { String xml = TicketWriter.getHitsOnPage(ticket,lang,textId,pageFileName); + Pattern linePattern = Pattern.compile("(.*?)",Pattern.MULTILINE); Matcher m = linePattern.matcher(xml); - while(m.find()){ - Double[] point = DigilibTools.calculatePoint(m.group(1)); - points.add(point); - } - - String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points)); + // teste format des Restes wenn noch "/" dann Aufruf einer Seite direkt - ret+=String.format("%s",url,textId); - ret+=""; + if (mode.equals("digilib")){ + String txt=handleDigilib(textId, pageFileName, ret, points, m); + return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); + } + else if (mode.equals("generic")){ + if (m.find()){ // xmlfile ist fuer digilib ok, dann digilib + String txt = handleDigilib(textId, pageFileName, ret, points, m); + return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); + } else { + //assume xml-treffer liste + + + String txt = handleXMLFullText(textId, pageFileName,ret, ticket, lang); + return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); + } + - return ret; + } else if (mode.equals("showXMLhits")){ + //String txt=""+ + //""+ + //""; + String txt= TicketWriter.getFileContent(ticket,lang,textId,pageFileName); + return txt; //+""; + } } catch (FileNotFoundException e) { @@ -282,6 +301,50 @@ +protected String handleXMLFullText(String textId, String pageFileName, String ret, String ticket, + String lang) { + + try { + String[] morphquerySplitted=TicketWriter.getQueryString(lang,ticket).split(":"); + //string has normally the format field:query + String morphQuery= 
morphquerySplitted[morphquerySplitted.length-1]; + + String queryString = XMLSearchServerAdapter.XMLDocSearchBase + "document=" + textId.replace(":","/"); + //queryString += "&queryType=fulltext&query=" + morphQuery; + queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; + + ret+=String.format("%s",queryString,textId); + ret+=""; + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return ret; +} + +protected String handleDigilib(String textId, String pageFileName, String ret, + ArrayList points, Matcher m) { + while(m.find()){ + Double[] point = DigilibTools.calculatePoint(m.group(1)); + points.add(point); + } + + String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points)); + + + ret+=String.format("%s",url,textId); + ret+=""; + + + return ret; +} + + + + diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java --- a/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java Wed Jan 26 14:41:34 2011 +0100 @@ -2,79 +2,122 @@ import java.io.File; import java.io.IOException; +import java.net.URLDecoder; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.LockObtainFailedException; import 
org.apache.lucene.util.Version; import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; +import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter; +import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; public class SearchInlinesThread extends Thread { - //private File docIndex; - //private File lineDir; + // private File docIndex; + // private File lineDir; private String searchString; private String ticket; private String lang; - private FulltextSearchDocsLines fulltextSearcher; + private IFulltextSearchDocsLines[] fulltextSearcher; private Logger logger; private String searchMetaData; - - public SearchInlinesThread(FulltextSearchDocsLines fulltextSearcher, String searchString, String searchMetaData,String lang, String ticket){ - //this.docIndex = docIndex; - //this.lineDir = lineDir; + + public SearchInlinesThread(IFulltextSearchDocsLines[] fulltextSearcher, + String searchString, String searchMetaData, String lang, + String ticket) { + // this.docIndex = docIndex; + // this.lineDir = lineDir; this.fulltextSearcher = fulltextSearcher; this.searchString = searchString; this.searchMetaData = searchMetaData; - this.ticket=ticket; - this.lang =lang; - - - this.logger=Logger.getRootLogger(); + this.ticket = ticket; + this.lang= lang; + this.logger = Logger.getRootLogger(); } - public void run(){ - String text; - - - FulltextSearchDocsLines fulltextSearcher; - try { - //fulltextSearcher = new FulltextSearchDocsLines(docIndex,lineDir); - + + public void run() { + // String text; + + for (int i = 0; i < fulltextSearcher.length; i++) { + IFulltextSearchDocsLines currentSearcher = fulltextSearcher[i]; + + // IFulltextSearchDocsLines fulltextSearcher; + try { + // fulltextSearcher = new + // FulltextSearchDocsLines(docIndex,lineDir); + Query query = null; + if (FulltextSearchDocsLines.class.isInstance(currentSearcher)) {// lucenebased + // searcher + 
FulltextSearchDocsLines ftsdl = (FulltextSearchDocsLines) currentSearcher; + + + LanguageSearcher ls = ftsdl.languageSearchers + .getSearcherByLanguage(lang); + + if(ls==null) // language not supported + continue; + + Analyzer analyzer = ls.analyzer; + QueryParser parser = new QueryParser(Version.LUCENE_30, + "contents", analyzer); + logger.debug(searchString); + query = parser.parse(searchString); - Analyzer analyzer = this.fulltextSearcher.languageSearchers.getSearcherByLanguage(lang).analyzer; - QueryParser parser = new QueryParser(Version.LUCENE_30,"contents",analyzer); - logger.debug(searchString); - Query query= parser.parse(searchString); - - if ((searchMetaData!=null) && !searchMetaData.equals("")){ - QueryParser parserMD = new QueryParser(Version.LUCENE_30,"dcMetaData",analyzer); - Query queryMD= parserMD.parse(searchMetaData); - BooleanQuery booleanQuery = new BooleanQuery(); - booleanQuery.add(queryMD, BooleanClause.Occur.MUST); - booleanQuery.add(query, BooleanClause.Occur.MUST); - - query = booleanQuery; - } - this.fulltextSearcher.searchInLinesToDir(query,lang,ticket); - } catch (CorruptIndexException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (LockObtainFailedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + if ((searchMetaData != null) && !searchMetaData.equals("")) { + QueryParser parserMD = new QueryParser( + Version.LUCENE_30, "dcMetaData", analyzer); + Query queryMD = parserMD.parse(searchMetaData); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(queryMD, BooleanClause.Occur.MUST); + booleanQuery.add(query, BooleanClause.Occur.MUST); + + query = booleanQuery; + } + } else if (XMLSearchServerAdapter.class + .isInstance(currentSearcher)) { + + if 
(!XMLSearchServerAdapter.getSupportedLanguages().contains(lang)){ + continue; // language not supported + } + Term t = new Term("contents", searchString); + query = new TermQuery(t); + + if ((searchMetaData != null) && !searchMetaData.equals("")) { + Term t2 = new Term("dcMetaData", searchMetaData); + Query query2 = new TermQuery(t2); + + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(query2, BooleanClause.Occur.MUST); + booleanQuery.add(query, BooleanClause.Occur.MUST); + + query = booleanQuery; + } + } + currentSearcher.searchInLinesToDir(query, lang, ticket); + } catch (CorruptIndexException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (LockObtainFailedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } } } diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl --- a/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl Wed Nov 03 12:18:46 2010 +0100 +++ b/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl Wed Jan 26 14:41:34 2011 +0100 @@ -8,5 +8,12 @@ + + +
+ +
+
+ \ No newline at end of file