diff src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java @ 0:72a015318a6d

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:20 +0100
parents
children 2b29b0b6db16
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java	Wed Nov 03 12:26:20 2010 +0100
@@ -0,0 +1,272 @@
+package de.mpiwg.dwinter.fulltext.search;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TopDocsCollector;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.Version;
+
+import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
+import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
+import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
+import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;
+import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
+
+/**
+ * Two-stage full-text search: documents are looked up through the inherited
+ * document-level search, then the matching lines of every hit are looked up
+ * through a second set of searchers ({@link #languageSearchersLines}) created
+ * for the line index directory.
+ */
+public class FulltextSearchDocsLines extends FulltextSearch {
+
+	protected static Logger logger = Logger.getRootLogger();
+
+	/** Upper bound on the number of line hits collected per document. */
+	private static final int MAX_LINES = 10000;
+
+	/** Directory of the line index. */
+	protected File line_index_dir;
+
+	/** Searchers, one per language key, created for the line index directory. */
+	public LanguageSearchers languageSearchersLines = new LanguageSearchers();
+
+	/**
+	 * Creates the search object and registers the line-level searchers.
+	 * 
+	 * @param index_dir
+	 *            directory handed to the superclass constructor
+	 * @param line_index_dir
+	 *            directory of the line index
+	 * @throws CorruptIndexException
+	 * @throws LockObtainFailedException
+	 * @throws IOException
+	 */
+	public FulltextSearchDocsLines(File index_dir, File line_index_dir)
+			throws CorruptIndexException, LockObtainFailedException,
+			IOException {
+
+		super(index_dir);
+		this.line_index_dir = line_index_dir;
+		// NOTE(review): this calls an overridable method from the constructor;
+		// kept because subclasses may already rely on it.
+		init_language_searchers_lines(line_index_dir);
+
+	}
+
+	/**
+	 * Registers one line-level searcher per supported language key. "de" and
+	 * "fr" get their language-specific analyzer, every other key uses the
+	 * {@link StandardAnalyzer}.
+	 * 
+	 * @param dir
+	 *            directory passed to every {@link LanguageSearcher}
+	 * @throws CorruptIndexException
+	 * @throws LockObtainFailedException
+	 * @throws IOException
+	 */
+	protected void init_language_searchers_lines(File dir)
+			throws CorruptIndexException, LockObtainFailedException,
+			IOException {
+		languageSearchersLines.add(new LanguageSearcher("de",
+				new GermanAnalyzer(Version.LUCENE_30), dir));
+		languageSearchersLines.add(new LanguageSearcher("en",
+				new StandardAnalyzer(Version.LUCENE_30), dir));
+		languageSearchersLines.add(new LanguageSearcher("fr",
+				new FrenchAnalyzer(Version.LUCENE_30), dir));
+		languageSearchersLines.add(new LanguageSearcher("all",
+				new StandardAnalyzer(Version.LUCENE_30), dir));
+		languageSearchersLines.add(new LanguageSearcher("morph",
+				new StandardAnalyzer(Version.LUCENE_30), dir));
+		languageSearchersLines.add(new LanguageSearcher("la",
+				new StandardAnalyzer(Version.LUCENE_30), dir));
+	}
+
+	/**
+	 * Runs the search and writes the result list through a
+	 * {@link TicketWriter} so that it can be picked up later (by the servlet)
+	 * under the given ticket.
+	 * 
+	 * The document hit list is written and committed first; afterwards the
+	 * matching lines of every distinct document are written and committed one
+	 * document at a time.
+	 * 
+	 * @param query
+	 *            query to run against the document index and, per document,
+	 *            against the line index
+	 * @param language
+	 *            language key used to select the searchers
+	 * @param ticket
+	 *            ticket under which the data can be accessed afterwards
+	 * @throws IOException
+	 */
+	public void searchInLinesToDir(Query query, String language, String ticket)
+			throws IOException {
+
+		// first step: search the documents
+		logger.debug("Start searching docs");
+		TopScoreDocCollector col = (TopScoreDocCollector) search(query,
+				language);
+		TopDocs docs = col.topDocs();
+
+		TicketWriter tw = new TicketWriter(ticket, query, language);
+
+		LanguageSearcher searcher = languageSearchers
+				.getSearcherByLanguage(language);
+		logger.debug("Start writing docs");
+		tw.writeResultsForLanguageSearch(language, docs, searcher.reader);
+		tw.commitTicket();
+		logger.debug("Wrote docs");
+		LanguageSearcher lineSearcher = languageSearchersLines
+				.getSearcherByLanguage(language);
+
+		Set<String> textIds = new HashSet<String>();
+
+		for (ScoreDoc doc : docs.scoreDocs) {
+			Document d = searcher.reader.document(doc.doc);
+			String textID = d.get("textId");
+			logger.debug("Start:" + textID);
+
+			// Skip text ids that were handled already.
+			// TODO: why does the same textId sometimes show up several times
+			// in the hit list?
+			if (!textIds.add(textID)) {
+				logger.debug("already done:" + textID);
+				continue;
+			}
+
+			OCRDoc ocrDoc = collectLines(lineSearcher, query, doc, d, textID);
+			tw.writeDoc(language, ocrDoc);
+			tw.commitTicket();
+			logger.debug("written:" + textID);
+		}
+		// NOTE(review): the ticket is not closed when an exception is thrown
+		// above; whether closing a partially written ticket is desirable
+		// depends on TicketWriter and is left unchanged here.
+		tw.closeTicket(language);
+		logger.debug("everything done!");
+	}
+
+	/**
+	 * Searches the lines of a single text.
+	 * 
+	 * @param textId
+	 *            value of the "textId" field the search is restricted to
+	 * @param query
+	 *            query that has to match in addition to the text id
+	 * @param language
+	 *            language key used to select the searchers
+	 * @return the first document found by {@link #searchInLines(Query, String)}
+	 *         for the restricted query, or an empty {@link OCRDoc} if nothing
+	 *         matched
+	 * @throws IOException
+	 */
+	public OCRDoc searchInLinesDoc(String textId, Query query, String language)
+			throws IOException {
+		BooleanQuery booleanQuery = new BooleanQuery();
+		booleanQuery.add(new TermQuery(new Term("textId", textId)),
+				BooleanClause.Occur.MUST);
+		booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+		List<OCRDoc> docs = searchInLines(booleanQuery, language);
+
+		if (docs.isEmpty()) {
+			return new OCRDoc();
+		}
+		return docs.get(0);
+	}
+
+	/**
+	 * Runs the search and returns, for every distinct "textId" in the document
+	 * hit list, the document together with its matching lines grouped by
+	 * page.
+	 * 
+	 * @param query
+	 *            query to run against the document index and, per document,
+	 *            against the line index
+	 * @param language
+	 *            language key used to select the searchers
+	 * @return one {@link OCRDoc} per distinct text id, in hit-list order
+	 * @throws IOException
+	 */
+	public List<OCRDoc> searchInLines(Query query, String language)
+			throws IOException {
+
+		// first step: search the documents
+		logger.debug("Start searching docs.");
+
+		TopScoreDocCollector col = (TopScoreDocCollector) search(query,
+				language);
+		TopDocs docs = col.topDocs();
+		ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>();
+
+		LanguageSearcher searcher = languageSearchers
+				.getSearcherByLanguage(language);
+		LanguageSearcher lineSearcher = languageSearchersLines
+				.getSearcherByLanguage(language);
+
+		logger.debug("found docs.");
+		Set<String> textIds = new HashSet<String>();
+		for (ScoreDoc doc : docs.scoreDocs) {
+			Document d = searcher.reader.document(doc.doc);
+			String textID = d.get("textId");
+
+			// every text id is processed only once
+			if (!textIds.add(textID)) {
+				logger.debug("already done:" + textID);
+				continue;
+			}
+
+			ocrDocs.add(collectLines(lineSearcher, query, doc, d, textID));
+		}
+
+		return ocrDocs;
+	}
+
+	/**
+	 * Looks up the lines of one document that match the query and groups them
+	 * by page name.
+	 * 
+	 * @param lineSearcher
+	 *            searcher and reader used for the line lookup
+	 * @param query
+	 *            query the lines have to match
+	 * @param doc
+	 *            hit of the document search; its id is stored in the result
+	 * @param d
+	 *            stored document belonging to the hit
+	 * @param textID
+	 *            value of the document's "textId" field; the line search is
+	 *            restricted to it
+	 * @return the document with at most {@link #MAX_LINES} matching lines
+	 * @throws IOException
+	 */
+	private OCRDoc collectLines(LanguageSearcher lineSearcher, Query query,
+			ScoreDoc doc, Document d, String textID) throws IOException {
+		// a line has to belong to the text AND match the original query
+		BooleanQuery booleanQuery = new BooleanQuery();
+		booleanQuery.add(new TermQuery(new Term("textId", textID)),
+				BooleanClause.Occur.MUST);
+		booleanQuery.add(query, BooleanClause.Occur.MUST);
+
+		// now search the lines
+		TopScoreDocCollector lineCol = TopScoreDocCollector.create(MAX_LINES,
+				false);
+		lineSearcher.searcher.search(booleanQuery, lineCol);
+		logger.debug("Searched:" + textID);
+
+		OCRDoc ocrDoc = new OCRDoc();
+		ocrDoc.docId = doc.doc;
+		ocrDoc.document = d;
+		ocrDoc.textId = textID;
+
+		Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>();
+
+		for (ScoreDoc line : lineCol.topDocs().scoreDocs) {
+			Document lineD = lineSearcher.reader.document(line.doc);
+
+			// Without a path the line cannot be assigned to a page, and
+			// new File(null) would abort the whole search with an exception.
+			String cleanedPath = lineD.get("cleanedPath");
+			if (cleanedPath == null) {
+				logger.warn("line without cleanedPath skipped:" + textID);
+				continue;
+			}
+
+			OCRLine ocrLine = new OCRLine();
+			ocrLine.pageDimension = lineD.get("pageDimension");
+			ocrLine.bbox = lineD.get("bbox");
+			ocrLine.lineNumber = lineD.get("lineNumber");
+
+			String pageNumber = getPageName(cleanedPath);
+			if (logger.isDebugEnabled()) {
+				logger.debug("collect:" + pageNumber);
+			}
+
+			// collect all lines of the same page in one list
+			ArrayList<OCRLine> page = ocrPages.get(pageNumber);
+			if (page == null) {
+				page = new ArrayList<OCRLine>();
+				ocrPages.put(pageNumber, page);
+			}
+			page.add(ocrLine);
+		}
+		logger.debug("collected:" + textID);
+		ocrDoc.linesInPage = ocrPages;
+		return ocrDoc;
+	}
+
+	/**
+	 * Returns the file name part of the path; it is used as the page name.
+	 * 
+	 * @param path
+	 *            path of the page file, must not be null
+	 * @return last name in the path
+	 */
+	private String getPageName(String path) {
+		return new File(path).getName();
+	}
+
+}