changeset 1:83e9a828e794

Version mit integrierter Suche über XML-Volltexte
author dwinter
date Wed, 26 Jan 2011 14:41:34 +0100
parents db87c1b7eb6d
children eef69274ef15
files WebContent/WEB-INF/web.xml src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl
diffstat 7 files changed, 224 insertions(+), 77 deletions(-) [+]
line wrap: on
line diff
--- a/WebContent/WEB-INF/web.xml	Wed Nov 03 12:18:46 2010 +0100
+++ b/WebContent/WEB-INF/web.xml	Wed Jan 26 14:41:34 2011 +0100
@@ -13,7 +13,7 @@
 </context-param>
 <context-param>
 <param-name>de.mpwig.dwinter.fulltextSearchServer.lineIndex</param-name>
-<param-value>/Volumes/data/indexLibcollLines</param-value>
+<param-value>/Volumes/data/indexLibcollLines2</param-value>
 </context-param>
 <context-param>
 <param-name>de.mpwig.dwinter.fulltextSearchServer.docIndex</param-name>
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java	Wed Jan 26 14:41:34 2011 +0100
@@ -50,7 +50,7 @@
 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
-import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
+import de.mpiwg.dwinter.fulltext.searcher.ILanguageSearcher;
 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
@@ -222,7 +222,7 @@
 		FulltextSearchDocsLines searcher = new FulltextSearchDocsLines(
 				docIndex, lineDir);
 
-		LanguageSearcher ls = searcher.languageSearchers
+		ILanguageSearcher ls = searcher.languageSearchers
 				.getSearcherByLanguage(lang);
 		if (ls == null) {
 			setStatus(Status.CLIENT_ERROR_NOT_FOUND);
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchServerInfo.java	Wed Jan 26 14:41:34 2011 +0100
@@ -5,6 +5,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.List;
 import java.util.Properties;
 
 import org.restlet.data.Form;
@@ -17,6 +18,7 @@
 import org.restlet.resource.Get;
 
 import de.mpiwg.dwinter.fulltext.search.FulltextSearchConfig;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
 public class SearchServerInfo extends ServerResource{
 	//need options for crossdomain scripting
@@ -90,13 +92,20 @@
 	}
 
 	private Representation getSupportedLanguages() {
-		// TODO Auto-generated method stub
+		
 		
-		ArrayList<String> langs = config.getSupportedLanguages();
+		List<String> langs = config.getSupportedLanguages();
 		String ret="<info><supportedLanguages>";
 		for (String lang: langs){
 			ret+="<lang>"+lang+"</lang>";
 		}
+		
+		langs= XMLSearchServerAdapter.getSupportedLanguages();
+		for (String lang: langs){
+			ret+="<lang>"+lang+"</lang>";
+		}
+		
+		
 		ret+="</supportedLanguages></info>";
 		return new StringRepresentation(ret, MediaType.TEXT_XML);
 	}
@@ -104,12 +113,21 @@
 	private Representation getSupportedLanguagesHTML() {
 		// TODO Auto-generated method stub
 		
-		ArrayList<String> langs = config.getSupportedLanguages();
 		String ret="<div class=\"supportedLanguages\">";
+		
+		List<String> langs = config.getSupportedLanguages();
 		for (String lang: langs){
 			ret+="<div class=\"supportedLanguage\">"+lang+"</div>";
 		}
+		langs= XMLSearchServerAdapter.getSupportedLanguages();
+		for (String lang: langs){
+			ret+="<div class=\"supportedLanguage\">"+lang+"</div>";
+		}
+		
+		
 		ret+="</div>";
+		
+		
 		return new StringRepresentation(ret, MediaType.TEXT_HTML);
 	}
 }
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/SearchTicket.java	Wed Jan 26 14:41:34 2011 +0100
@@ -3,11 +3,14 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.List;
 import java.util.Properties;
 import java.util.concurrent.ConcurrentMap;
 
 import org.apache.log4j.Logger;
 import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.store.LockObtainFailedException;
 import org.restlet.Context;
 import org.restlet.data.Form;
@@ -27,6 +30,9 @@
 
 
 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
 import de.mpiwg.dwinter.fulltextSearchServer.searchThreads.SearchInlinesThread;
@@ -103,7 +109,14 @@
 		
 		//String ticket = generateTicket();
 		String searchString=searchForm.getValues("searchString");
-		String languages=searchForm.getValues("languages"); // language der form la1_la2_la3___
+		String languages;
+		try {
+			languages = URLDecoder.decode(searchForm.getValues("languages"),"utf-8");
+		} catch (UnsupportedEncodingException e1) {
+			// TODO Auto-generated catch block
+			e1.printStackTrace();
+			languages="";
+		} // language der form la1_la2_la3___
 		String searchMetaData=searchForm.getValues("searchMetaData");
 		//no Searchstring
 		if (searchString==null || languages==null )
@@ -146,11 +159,14 @@
 		}
 		
 	
+		IFulltextSearchDocsLines[] fulltextSearchers = new IFulltextSearchDocsLines[]{fulltextSearcher, new XMLSearchServerAdapter()};
+			
+			
 		
 		String[] langs = languages.split("_");
 		
 		for (String lang: langs){
-		SearchInlinesThread st = new SearchInlinesThread(fulltextSearcher, searchString, searchMetaData,lang,ticket);
+		SearchInlinesThread st = new SearchInlinesThread(fulltextSearchers, searchString, searchMetaData,lang,ticket);
 		st.start();
 		}
 		
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/ShowContentOfDocument.java	Wed Jan 26 14:41:34 2011 +0100
@@ -24,8 +24,11 @@
 import javax.xml.transform.stream.StreamSource;
 
 import org.apache.log4j.Logger;
+import org.restlet.Request;
+import org.restlet.Response;
 import org.restlet.data.Form;
 import org.restlet.data.MediaType;
+import org.restlet.data.Reference;
 import org.restlet.data.Status;
 import org.restlet.representation.Representation;
 import org.restlet.representation.StringRepresentation;
@@ -35,6 +38,7 @@
 import org.w3c.dom.Document;
 import org.xml.sax.SAXException;
 
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
 import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools;
@@ -141,28 +145,28 @@
 		logger.debug("lang:"+lang);
 		logger.debug("textId:"+textId);
 		logger.debug("restpath:"+restPath);
-		String xml;
-		String txt;
+		String html;
+		
 		if (restPath.equals("")){
-			xml = showContent(ticket,lang,textId);
-			if (xml==null){
+			html = showContent(ticket,lang,textId);
+			if (html==null){
 				setStatus(Status.SUCCESS_ACCEPTED);
 				//still waiting
 				return new StringRepresentation("<dev>waiting</dev>",MediaType.TEXT_HTML);
 			}
-			txt = XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl");
+			html = XMLTools.transformToHTML(html,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/showContentOfDocumentToHTML.xsl");
 		} else {
-			xml = processRestPath(ticket,lang,textId,restPath);
-			if (xml==null){
+			html = processRestPath(ticket,lang,textId,restPath);
+			if (html==null){
 				setStatus(Status.SUCCESS_ACCEPTED);
 				//still waiting
 				return new StringRepresentation("<dev>waiting</dev>",MediaType.TEXT_HTML);
 			}
-			txt = XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+			
 			//txt=xml;
 		}
 		  
-          StringRepresentation representation = new StringRepresentation(txt, 
+          StringRepresentation representation = new StringRepresentation(html, 
      	            MediaType.TEXT_HTML) ;
           return representation;
     
@@ -224,13 +228,12 @@
 		xml=null;
 	}
 
-	return xml;
+	return XMLTools.transformToHTML(xml,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
    
 }
 
 /**
- * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib unterstuetzt, bzw. egal welcher 
- * Mode angegeben wird es wird immer das gleiche gemacht, naemlich ein Link auf Digilib ausgegeben.
+ * Rueckgabe der Treffer gemaess dem Darstellungsmode z.Z. wird nur digilib und generic unterstuetzt.
  * Der Pfad ist in DIGIVIEWBASICSTRING festgelegt.
  * @param ticket
  * @param lang
@@ -246,26 +249,42 @@
 		String ret="<xml xmlns:xlink=\"http://www.w3.org/1999/xlink\">";
 		ret+="<pageFileName>"+pageFileName+"</pageFileName>";
 		ArrayList<Double[]> points = new ArrayList<Double[]>();
-		
+	
 	
 	try {
 		String xml = TicketWriter.getHitsOnPage(ticket,lang,textId,pageFileName);
+		
 		Pattern linePattern = Pattern.compile("<line>(.*?)</line>",Pattern.MULTILINE);
 		Matcher m = linePattern.matcher(xml);
 		
-		while(m.find()){
-			Double[] point = DigilibTools.calculatePoint(m.group(1));
-			points.add(point);		
-		}
-		
-		String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points));
+		// teste format des Restes wenn noch "/" dann Aufruf einer Seite direkt
 		
 		
-		ret+=String.format("<hitsOnPage xlink:href=\"%s\">%s</hitsOnPage>",url,textId);
-		ret+="</xml>";
+		if (mode.equals("digilib")){
 		
+		String txt=handleDigilib(textId, pageFileName, ret, points, m);
+		return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+		}
+		else if (mode.equals("generic")){
+			if (m.find()){ // xmlfile ist fuer digilib ok, dann digilib
+				String txt = handleDigilib(textId, pageFileName, ret, points, m);
+				return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+			} else {
+				//assume xml-treffer liste
+				
+				
+				String txt = handleXMLFullText(textId, pageFileName,ret, ticket, lang);
+				return XMLTools.transformToHTML(txt,"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
+			}
+			
 		
-		return ret;
+		} else if (mode.equals("showXMLhits")){
+			//String txt="<html><head>"+
+			//"<base href=\""+XMLSearchServerAdapter.XMLServerBase+"\" />"+
+			//"</head><body>";
+			String txt= TicketWriter.getFileContent(ticket,lang,textId,pageFileName);
+			return txt; //+"</body></html>";
+		}
 	
 	
 	} catch (FileNotFoundException e) {
@@ -282,6 +301,50 @@
 
 
 
+protected String handleXMLFullText(String textId, String pageFileName, String ret, String ticket,
+		String lang) {
+	
+	try {
+		String[] morphquerySplitted=TicketWriter.getQueryString(lang,ticket).split(":");
+		//string has normally the format field:query
+		String morphQuery= morphquerySplitted[morphquerySplitted.length-1];
+		
+		String queryString = XMLSearchServerAdapter.XMLDocSearchBase + "document=" + textId.replace(":","/");
+		//queryString += "&queryType=fulltext&query=" + morphQuery;
+		queryString += "&amp;mode=text&amp;query-type=fulltext&amp;query=" + morphQuery;
+
+		ret+=String.format("<hitsOnPage xlink:href=\"%s\">%s</hitsOnPage>",queryString,textId);
+		ret+="</xml>";
+	} catch (FileNotFoundException e) {
+		// TODO Auto-generated catch block
+		e.printStackTrace();
+	} catch (IOException e) {
+		// TODO Auto-generated catch block
+		e.printStackTrace();
+	}
+	return ret;
+}
+
+protected String handleDigilib(String textId, String pageFileName, String ret,
+		ArrayList<Double[]> points, Matcher m) {
+	while(m.find()){
+		Double[] point = DigilibTools.calculatePoint(m.group(1));
+		points.add(point);		
+	}
+	
+	String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId,pageFileName),DigilibTools.generateMarksFromPoints(points));
+	
+	
+	ret+=String.format("<hitsOnPage xlink:href=\"%s\">%s</hitsOnPage>",url,textId);
+	ret+="</xml>";
+	
+	
+	return ret;
+}
+
+
+
+
 				
 	 
 
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/searchThreads/SearchInlinesThread.java	Wed Jan 26 14:41:34 2011 +0100
@@ -2,79 +2,122 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URLDecoder;
 
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.util.Version;
 
 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
+import de.mpiwg.dwinter.fulltext.search.xmlsearchadapter.XMLSearchServerAdapter;
+import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
 
 public class SearchInlinesThread extends Thread {
 
-	//private File docIndex;
-	//private File lineDir;
+	// private File docIndex;
+	// private File lineDir;
 	private String searchString;
 	private String ticket;
 	private String lang;
-	private FulltextSearchDocsLines fulltextSearcher;
+	private IFulltextSearchDocsLines[] fulltextSearcher;
 	private Logger logger;
 	private String searchMetaData;
-	
-	public SearchInlinesThread(FulltextSearchDocsLines fulltextSearcher, String searchString, String searchMetaData,String lang, String ticket){
-		//this.docIndex = docIndex;
-		//this.lineDir = lineDir;
+
+	public SearchInlinesThread(IFulltextSearchDocsLines[] fulltextSearcher,
+			String searchString, String searchMetaData, String lang,
+			String ticket) {
+		// this.docIndex = docIndex;
+		// this.lineDir = lineDir;
 		this.fulltextSearcher = fulltextSearcher;
 		this.searchString = searchString;
 		this.searchMetaData = searchMetaData;
-		this.ticket=ticket;
-		this.lang =lang;
-		
-		
-		this.logger=Logger.getRootLogger();
+		this.ticket = ticket;
+		this.lang= lang;
+		this.logger = Logger.getRootLogger();
 	}
-	public void run(){
-		String text;
-		
-		
-		FulltextSearchDocsLines fulltextSearcher;
-		try {
-			//fulltextSearcher = new FulltextSearchDocsLines(docIndex,lineDir);
-		
+
+	public void run() {
+		// String text;
+
+		for (int i = 0; i < fulltextSearcher.length; i++) {
+			IFulltextSearchDocsLines currentSearcher = fulltextSearcher[i];
+
+			// IFulltextSearchDocsLines fulltextSearcher;
+			try {
+				// fulltextSearcher = new
+				// FulltextSearchDocsLines(docIndex,lineDir);
+				Query query = null;
+				if (FulltextSearchDocsLines.class.isInstance(currentSearcher)) {// lucenebased
+																				// searcher
+					FulltextSearchDocsLines ftsdl = (FulltextSearchDocsLines) currentSearcher;
+
+					
+					LanguageSearcher ls = ftsdl.languageSearchers
+					.getSearcherByLanguage(lang);
+					
+					if(ls==null) // language not supported
+						continue;
+					
+					Analyzer analyzer = ls.analyzer;
+					QueryParser parser = new QueryParser(Version.LUCENE_30,
+							"contents", analyzer);
+					logger.debug(searchString);
+					query = parser.parse(searchString);
 
-		Analyzer analyzer = this.fulltextSearcher.languageSearchers.getSearcherByLanguage(lang).analyzer;
-		QueryParser parser = new QueryParser(Version.LUCENE_30,"contents",analyzer);
-		logger.debug(searchString);
-		Query query= parser.parse(searchString);
-		
-		if ((searchMetaData!=null) && !searchMetaData.equals("")){
-			QueryParser parserMD = new QueryParser(Version.LUCENE_30,"dcMetaData",analyzer);
-			Query queryMD= parserMD.parse(searchMetaData);
-			BooleanQuery booleanQuery = new BooleanQuery();
-			booleanQuery.add(queryMD, BooleanClause.Occur.MUST);
-			booleanQuery.add(query, BooleanClause.Occur.MUST);
-			
-			query = booleanQuery;
-		}
-		this.fulltextSearcher.searchInLinesToDir(query,lang,ticket);
-		} catch (CorruptIndexException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		} catch (LockObtainFailedException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		} catch (IOException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		} catch (ParseException e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
+					if ((searchMetaData != null) && !searchMetaData.equals("")) {
+						QueryParser parserMD = new QueryParser(
+								Version.LUCENE_30, "dcMetaData", analyzer);
+						Query queryMD = parserMD.parse(searchMetaData);
+						BooleanQuery booleanQuery = new BooleanQuery();
+						booleanQuery.add(queryMD, BooleanClause.Occur.MUST);
+						booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+						query = booleanQuery;
+					}
+				} else if (XMLSearchServerAdapter.class
+						.isInstance(currentSearcher)) {
+					
+					if (!XMLSearchServerAdapter.getSupportedLanguages().contains(lang)){
+						continue; // language not supported
+					}
+					Term t = new Term("contents", searchString);
+					query = new TermQuery(t);
+
+					if ((searchMetaData != null) && !searchMetaData.equals("")) {
+						Term t2 = new Term("dcMetaData", searchMetaData);
+						Query query2 = new TermQuery(t2);
+
+						BooleanQuery booleanQuery = new BooleanQuery();
+						booleanQuery.add(query2, BooleanClause.Occur.MUST);
+						booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+						query = booleanQuery;
+					}
+				}
+				currentSearcher.searchInLinesToDir(query, lang, ticket);
+			} catch (CorruptIndexException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			} catch (LockObtainFailedException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			} catch (IOException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			} catch (ParseException e) {
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+			}
 		}
 	}
 }
--- a/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl	Wed Nov 03 12:18:46 2010 +0100
+++ b/src/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl	Wed Jan 26 14:41:34 2011 +0100
@@ -8,5 +8,12 @@
 		   <a><xsl:attribute name="href"><xsl:value-of select="./@xlink:href"/></xsl:attribute>
 		   <xsl:value-of select="//pageFileName"/></a></div>
 		</xsl:for-each>
+		
+		<xsl:for-each select="//line">
+		   <div class="foundPage">
+		   <line><xsl:copy-of select="."/></line>
+		   </div>
+		</xsl:for-each>
+	
 	</xsl:template>
 </xsl:stylesheet>
\ No newline at end of file