# HG changeset patch
# User dwinter
# Date 1296049294 -3600
# Node ID 83e9a828e794a18f95510829eec3c14ab4371a38
# Parent db87c1b7eb6de2bb2341f3ed53b2f9bb87ef0192
Version mit integrierter Suche über XML-Volltexte
diff -r db87c1b7eb6d -r 83e9a828e794 WebContent/WEB-INF/web.xml
--- a/WebContent/WEB-INF/web.xml Wed Nov 03 12:18:46 2010 +0100
+++ b/WebContent/WEB-INF/web.xml Wed Jan 26 14:41:34 2011 +0100
@@ -13,7 +13,7 @@
de.mpwig.dwinter.fulltextSearchServer.lineIndex
-/Volumes/data/indexLibcollLines
+/Volumes/data/indexLibcollLines2
de.mpwig.dwinter.fulltextSearchServer.docIndex
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java Wed Jan 26 14:41:34 2011 +0100
@@ -50,7 +50,7 @@
import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
-import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
+import de.mpiwg.dwinter.fulltext.searcher.ILanguageSearcher;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
@@ -222,7 +222,7 @@
FulltextSearchDocsLines searcher = new FulltextSearchDocsLines(
docIndex, lineDir);
- LanguageSearcher ls = searcher.languageSearchers
+ ILanguageSearcher ls = searcher.languageSearchers
.getSearcherByLanguage(lang);
if (ls == null) {
setStatus(Status.CLIENT_ERROR_NOT_FOUND);
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java Wed Jan 26 14:41:34 2011 +0100
@@ -5,6 +5,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.List;
import java.util.Properties;
import org.restlet.data.Form;
@@ -17,6 +18,7 @@
import org.restlet.resource.Get;
import de.mpiwg.dwinter.fulltext.search.FulltextSearchConfig;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
public class SearchServerInfo extends ServerResource{
//need options for crossdomain scripting
@@ -90,13 +92,20 @@
}
private Representation getSupportedLanguages() {
- // TODO Auto-generated method stub
+
- ArrayList langs = config.getSupportedLanguages();
+ List langs = config.getSupportedLanguages();
String ret="";
for (String lang: langs){
ret+=""+lang+"";
}
+
+ langs= XMLSearchServerAdapter.getSupportedLanguages();
+ for (String lang: langs){
+ ret+=""+lang+"";
+ }
+
+
ret+="";
return new StringRepresentation(ret, MediaType.TEXT_XML);
}
@@ -104,12 +113,21 @@
private Representation getSupportedLanguagesHTML() {
// TODO Auto-generated method stub
- ArrayList langs = config.getSupportedLanguages();
String ret="";
+
+ List
langs = config.getSupportedLanguages();
for (String lang: langs){
ret+=""+lang+"
";
}
+ langs= XMLSearchServerAdapter.getSupportedLanguages();
+ for (String lang: langs){
+ ret+=""+lang+"
";
+ }
+
+
ret+=" ";
+
+
return new StringRepresentation(ret, MediaType.TEXT_HTML);
}
}
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java Wed Jan 26 14:41:34 2011 +0100
@@ -3,11 +3,14 @@
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.List;
import java.util.Properties;
import java.util.concurrent.ConcurrentMap;
import org.apache.log4j.Logger;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.search.Query;
import org.apache.lucene.store.LockObtainFailedException;
import org.restlet.Context;
import org.restlet.data.Form;
@@ -27,6 +30,9 @@
import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
import de.mpiwg.dwinter.fulltextSearchServer.searchThreads.SearchInlinesThread;
@@ -103,7 +109,14 @@
//String ticket = generateTicket();
String searchString=searchForm.getValues("searchString");
- String languages=searchForm.getValues("languages"); // language der form la1_la2_la3___
+ String languages;
+ try {
+ languages = URLDecoder.decode(searchForm.getValues("languages"),"utf-8");
+ } catch (UnsupportedEncodingException e1) {
+ // TODO Auto-generated catch block
+ e1.printStackTrace();
+ languages="";
+ } // language der form la1_la2_la3___
String searchMetaData=searchForm.getValues("searchMetaData");
//no Searchstring
if (searchString==null || languages==null )
@@ -146,11 +159,14 @@
}
+ IFulltextSearchDocsLines[] fulltextSearchers = new IFulltextSearchDocsLines[]{fulltextSearcher, new XMLSearchServerAdapter()};
+
+
String[] langs = languages.split("_");
for (String lang: langs){
- SearchInlinesThread st = new SearchInlinesThread(fulltextSearcher, searchString, searchMetaData,lang,ticket);
+ SearchInlinesThread st = new SearchInlinesThread(fulltextSearchers, searchString, searchMetaData,lang,ticket);
st.start();
}
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java Wed Jan 26 14:41:34 2011 +0100
@@ -24,8 +24,11 @@
import javax.xml.transform.stream.StreamSource;
import org.apache.log4j.Logger;
+import org.restlet.Request;
+import org.restlet.Response;
import org.restlet.data.Form;
import org.restlet.data.MediaType;
+import org.restlet.data.Reference;
import org.restlet.data.Status;
import org.restlet.representation.Representation;
import org.restlet.representation.StringRepresentation;
@@ -35,6 +38,7 @@
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools;
@@ -141,28 +145,28 @@
logger.debug("lang:"+lang);
logger.debug("textId:"+textId);
logger.debug("restpath:"+restPath);
- String xml;
- String txt;
+ String html;
+
if (restPath.equals("")){
- xml = showContent(ticket,lang,textId);
- if (xml==null){
+ html = showContent(ticket,lang,textId);
+ if (html==null){
setStatus(Status.SUCCESS_ACCEPTED);
//still waiting
return new StringRepresentation("waiting",MediaType.TEXT_HTML);
}
- txt = XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl");
+ html = XMLTools.transformToHTML(html,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl");
} else {
- xml = processRestPath(ticket,lang,textId,restPath);
- if (xml==null){
+ html = processRestPath(ticket,lang,textId,restPath);
+ if (html==null){
setStatus(Status.SUCCESS_ACCEPTED);
//still waiting
return new StringRepresentation("waiting",MediaType.TEXT_HTML);
}
- txt = XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+
//txt=xml;
}
- StringRepresentation representation = new StringRepresentation(txt,
+ StringRepresentation representation = new StringRepresentation(html,
MediaType.TEXT_HTML) ;
return representation;
@@ -224,13 +228,12 @@
xml=null;
}
- return xml;
+ return XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
}
/**
- * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib unterstuetzt, bzw. egal welcher
- * Mode angegeben wird es wird immer das gleiche gemacht, naemlich ein Link auf Digilib ausgegeben.
+ * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib und generic unterstuetzt.
* Der Pfad ist in DIGIVIEWBASICSTRING festgelegt.
* @param ticket
* @param lang
@@ -246,26 +249,42 @@
String ret="";
ret+=""+pageFileName+"";
ArrayList points = new ArrayList();
-
+
try {
String xml = TicketWriter.getHitsOnPage(ticket,lang,textId,pageFileName);
+
Pattern linePattern = Pattern.compile("(.*?)",Pattern.MULTILINE);
Matcher m = linePattern.matcher(xml);
- while(m.find()){
- Double[] point = DigilibTools.calculatePoint(m.group(1));
- points.add(point);
- }
-
- String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points));
+ // teste format des Restes wenn noch "/" dann Aufruf einer Seite direkt
- ret+=String.format("%s",url,textId);
- ret+="";
+ if (mode.equals("digilib")){
+ String txt=handleDigilib(textId, pageFileName, ret, points, m);
+ return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+ }
+ else if (mode.equals("generic")){
+ if (m.find()){ // xmlfile ist fuer digilib ok, dann digilib
+ String txt = handleDigilib(textId, pageFileName, ret, points, m);
+ return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+ } else {
+ //assume xml-treffer liste
+
+
+ String txt = handleXMLFullText(textId, pageFileName,ret, ticket, lang);
+ return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+ }
+
- return ret;
+ } else if (mode.equals("showXMLhits")){
+ //String txt=""+
+ //""+
+ //"";
+ String txt= TicketWriter.getFileContent(ticket,lang,textId,pageFileName);
+ return txt; //+"";
+ }
} catch (FileNotFoundException e) {
@@ -282,6 +301,50 @@
+protected String handleXMLFullText(String textId, String pageFileName, String ret, String ticket,
+ String lang) {
+ // Appends a link into the XML full-text view of textId to ret and returns ret; pageFileName is not used in this method.
+ try {
+ String[] morphquerySplitted=TicketWriter.getQueryString(lang,ticket).split(":"); // query stored for this language/ticket, split at ':'
+ // the stored query normally has the form field:query, so the last segment is taken as the bare query text
+ String morphQuery= morphquerySplitted[morphquerySplitted.length-1];
+ 
+ String queryString = XMLSearchServerAdapter.XMLDocSearchBase + "document=" + textId.replace(":","/"); // every ':' in the text id becomes '/' in the document parameter
+ //queryString += "&queryType=fulltext&query=" + morphQuery;
+ queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; // NOTE(review): morphQuery is appended without URL encoding - confirm this is intended
+ 
+ ret+=String.format("%s",queryString,textId); // NOTE(review): the format has a single %s as written, so textId is ignored - confirm
+ ret+="";
+ } catch (FileNotFoundException e) {
+ // NOTE(review): the error is only printed; ret is then returned below without the link
+ e.printStackTrace();
+ } catch (IOException e) {
+ // NOTE(review): same as above - the error is only printed
+ e.printStackTrace();
+ }
+ return ret;
+}
+
+protected String handleDigilib(String textId, String pageFileName, String ret,
+ ArrayList points, Matcher m) { // appends a link built from DigilibTools.DIGIVIEWBASICSTRING, with marks for the remaining matches of m, to ret and returns ret
+ while(m.find()){ // NOTE(review): the "generic" branch of the caller has already called m.find() once, so the first match never reaches this loop - confirm
+ Double[] point = DigilibTools.calculatePoint(m.group(1)); // group 1 of the current match is converted to a point by DigilibTools
+ points.add(point);
+ }
+
+ String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points)); // fill the URL template with the image path and the collected marks
+
+ 
+ ret+=String.format("%s",url,textId); // NOTE(review): the format has a single %s as written, so textId is ignored - confirm
+ ret+="";
+
+ 
+ return ret;
+}
+
+
+
+
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java Wed Jan 26 14:41:34 2011 +0100
@@ -2,79 +2,122 @@
import java.io.File;
import java.io.IOException;
+import java.net.URLDecoder;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
+import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
public class SearchInlinesThread extends Thread {
- //private File docIndex;
- //private File lineDir;
+ // private File docIndex;
+ // private File lineDir;
private String searchString;
private String ticket;
private String lang;
- private FulltextSearchDocsLines fulltextSearcher;
+ private IFulltextSearchDocsLines[] fulltextSearcher;
private Logger logger;
private String searchMetaData;
-
- public SearchInlinesThread(FulltextSearchDocsLines fulltextSearcher, String searchString, String searchMetaData,String lang, String ticket){
- //this.docIndex = docIndex;
- //this.lineDir = lineDir;
+
+ public SearchInlinesThread(IFulltextSearchDocsLines[] fulltextSearcher,
+ String searchString, String searchMetaData, String lang,
+ String ticket) {
+ // this.docIndex = docIndex;
+ // this.lineDir = lineDir;
this.fulltextSearcher = fulltextSearcher;
this.searchString = searchString;
this.searchMetaData = searchMetaData;
- this.ticket=ticket;
- this.lang =lang;
-
-
- this.logger=Logger.getRootLogger();
+ this.ticket = ticket;
+ this.lang= lang;
+ this.logger = Logger.getRootLogger();
}
- public void run(){
- String text;
-
-
- FulltextSearchDocsLines fulltextSearcher;
- try {
- //fulltextSearcher = new FulltextSearchDocsLines(docIndex,lineDir);
-
+
+ public void run() {
+ // String text;
+
+ for (int i = 0; i < fulltextSearcher.length; i++) {
+ IFulltextSearchDocsLines currentSearcher = fulltextSearcher[i];
+
+ // IFulltextSearchDocsLines fulltextSearcher;
+ try {
+ // fulltextSearcher = new
+ // FulltextSearchDocsLines(docIndex,lineDir);
+ Query query = null;
+ if (FulltextSearchDocsLines.class.isInstance(currentSearcher)) {// lucenebased
+ // searcher
+ FulltextSearchDocsLines ftsdl = (FulltextSearchDocsLines) currentSearcher;
+
+
+ LanguageSearcher ls = ftsdl.languageSearchers
+ .getSearcherByLanguage(lang);
+
+ if(ls==null) // language not supported
+ continue;
+
+ Analyzer analyzer = ls.analyzer;
+ QueryParser parser = new QueryParser(Version.LUCENE_30,
+ "contents", analyzer);
+ logger.debug(searchString);
+ query = parser.parse(searchString);
- Analyzer analyzer = this.fulltextSearcher.languageSearchers.getSearcherByLanguage(lang).analyzer;
- QueryParser parser = new QueryParser(Version.LUCENE_30,"contents",analyzer);
- logger.debug(searchString);
- Query query= parser.parse(searchString);
-
- if ((searchMetaData!=null) && !searchMetaData.equals("")){
- QueryParser parserMD = new QueryParser(Version.LUCENE_30,"dcMetaData",analyzer);
- Query queryMD= parserMD.parse(searchMetaData);
- BooleanQuery booleanQuery = new BooleanQuery();
- booleanQuery.add(queryMD, BooleanClause.Occur.MUST);
- booleanQuery.add(query, BooleanClause.Occur.MUST);
-
- query = booleanQuery;
- }
- this.fulltextSearcher.searchInLinesToDir(query,lang,ticket);
- } catch (CorruptIndexException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (ParseException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
+ if ((searchMetaData != null) && !searchMetaData.equals("")) {
+ QueryParser parserMD = new QueryParser(
+ Version.LUCENE_30, "dcMetaData", analyzer);
+ Query queryMD = parserMD.parse(searchMetaData);
+ BooleanQuery booleanQuery = new BooleanQuery();
+ booleanQuery.add(queryMD, BooleanClause.Occur.MUST);
+ booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+ query = booleanQuery;
+ }
+ } else if (XMLSearchServerAdapter.class
+ .isInstance(currentSearcher)) {
+
+ if (!XMLSearchServerAdapter.getSupportedLanguages().contains(lang)){
+ continue; // language not supported
+ }
+ Term t = new Term("contents", searchString);
+ query = new TermQuery(t);
+
+ if ((searchMetaData != null) && !searchMetaData.equals("")) {
+ Term t2 = new Term("dcMetaData", searchMetaData);
+ Query query2 = new TermQuery(t2);
+
+ BooleanQuery booleanQuery = new BooleanQuery();
+ booleanQuery.add(query2, BooleanClause.Occur.MUST);
+ booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+ query = booleanQuery;
+ }
+ }
+ currentSearcher.searchInLinesToDir(query, lang, ticket);
+ } catch (CorruptIndexException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ } catch (LockObtainFailedException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ } catch (ParseException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
}
}
diff -r db87c1b7eb6d -r 83e9a828e794 src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl Wed Jan 26 14:41:34 2011 +0100
@@ -8,5 +8,12 @@
+
+
+
+
+
+
+
\ No newline at end of file