Mercurial > hg > fulltextSearch
diff src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java @ 0:72a015318a6d
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:20 +0100 |
parents | |
children | 2b29b0b6db16 |
line wrap: on
line diff
package de.mpiwg.dwinter.fulltext.search;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;
import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;

/**
 * Full-text search that first finds matching documents and then, for every
 * matching document, looks up the matching lines in a separate line index.
 *
 * <p>
 * The document search itself ({@code search(...)}) and the
 * {@code languageSearchers} field are not declared in this class; they are
 * presumably inherited from {@link FulltextSearch} (confirm there).
 */
public class FulltextSearchDocsLines extends FulltextSearch {

    protected static Logger logger = Logger.getRootLogger();

    /** Upper bound on the number of line hits collected per document. */
    private static final int MAX_LINES = 10000;

    /** Directory of the line index. */
    protected File line_index_dir;

    /** One searcher per supported language, all opened on the line index. */
    public LanguageSearchers languageSearchersLines = new LanguageSearchers();

    /**
     * Creates the search over a document index and a line index.
     *
     * @param index_dir
     *            directory of the document index, handed to the superclass
     * @param line_index_dir
     *            directory of the line index
     * @throws CorruptIndexException
     *             declared by the searcher set-up
     * @throws LockObtainFailedException
     *             declared by the searcher set-up
     * @throws IOException
     *             declared by the searcher set-up
     */
    public FulltextSearchDocsLines(File index_dir, File line_index_dir)
            throws CorruptIndexException, LockObtainFailedException,
            IOException {

        super(index_dir);
        this.line_index_dir = line_index_dir;
        // NOTE(review): this calls an overridable method from the
        // constructor; a subclass override runs before the subclass is
        // fully initialised. Kept because subclasses may rely on the hook.
        init_language_searchers_lines(line_index_dir);
    }

    /**
     * Registers the line-index searchers for the languages "de", "en", "fr",
     * "all", "morph" and "la". German and French get their language-specific
     * analyzer, all others the {@link StandardAnalyzer}.
     *
     * @param dir
     *            directory every searcher is opened on
     * @throws CorruptIndexException
     *             declared by the {@link LanguageSearcher} constructor
     * @throws LockObtainFailedException
     *             declared by the {@link LanguageSearcher} constructor
     * @throws IOException
     *             declared by the {@link LanguageSearcher} constructor
     */
    protected void init_language_searchers_lines(File dir)
            throws CorruptIndexException, LockObtainFailedException,
            IOException {
        languageSearchersLines.add(new LanguageSearcher("de",
                new GermanAnalyzer(Version.LUCENE_30), dir));
        languageSearchersLines.add(new LanguageSearcher("en",
                new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchersLines.add(new LanguageSearcher("fr",
                new FrenchAnalyzer(Version.LUCENE_30), dir));
        languageSearchersLines.add(new LanguageSearcher("all",
                new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchersLines.add(new LanguageSearcher("morph",
                new StandardAnalyzer(Version.LUCENE_30), dir));
        languageSearchersLines.add(new LanguageSearcher("la",
                new StandardAnalyzer(Version.LUCENE_30), dir));
    }

    /**
     * Creates the result list in the file system for later use via the
     * servlet.
     *
     * <p>
     * The document hits are written first; afterwards the line hits of each
     * distinct text are written and committed one document at a time.
     *
     * @param query
     *            query to run against the document index and, restricted to
     *            one text at a time, against the line index
     * @param language
     *            key selecting the document and line searchers
     * @param ticket
     *            ticket under which the data is to be accessed later
     * @throws IOException
     *             if searching, reading a document or writing fails
     */
    public void searchInLinesToDir(Query query, String language, String ticket)
            throws IOException {

        // First step: search the documents.
        logger.debug("Start searching docs");
        TopScoreDocCollector col = (TopScoreDocCollector) search(query,
                language);
        TopDocs docs = col.topDocs();
        ScoreDoc[] scoreDocs = docs.scoreDocs;

        TicketWriter tw = new TicketWriter(ticket, query, language);

        LanguageSearcher searcher = languageSearchers
                .getSearcherByLanguage(language);
        logger.debug("Start writing docs");
        tw.writeResultsForLanguageSearch(language, docs, searcher.reader);
        tw.commitTicket();
        logger.debug("Wrote docs");
        LanguageSearcher lineSearcher = languageSearchersLines
                .getSearcherByLanguage(language);

        Set<String> textIds = new HashSet<String>();

        for (ScoreDoc doc : scoreDocs) {
            Document d = searcher.reader.document(doc.doc);
            String textID = d.get("textId");
            logger.debug("Start:" + textID);

            // Skip texts that were already searched.
            // TODO: why does the same textId sometimes occur several times
            // in the hit list?
            if (!textIds.add(textID)) {
                logger.debug("already done:" + textID);
                continue;
            }

            // Now search the lines of this text.
            ScoreDoc[] lineHits = searchLinesOfText(lineSearcher, query,
                    textID);
            OCRDoc ocrDoc = newOcrDoc(doc, d, textID);

            Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>();
            for (ScoreDoc line : lineHits) {
                Document lineD = lineSearcher.reader.document(line.doc);
                OCRLine ocrLine = toOcrLine(lineD);
                String pageNumber = getPageName(lineD.get("cleanedPath"));
                addLineToPage(ocrPages, pageNumber, ocrLine);
            }
            logger.debug("collected:" + textID);

            ocrDoc.linesInPage = ocrPages;
            tw.writeDoc(language, ocrDoc);
            tw.commitTicket();
            logger.debug("written:" + textID);
        }
        tw.closeTicket(language);
        logger.debug("everything done!");
    }

    /**
     * Searches the lines of a single text.
     *
     * @param textId
     *            value of the "textId" field the search is restricted to
     * @param query
     *            query that must match in addition to the text id
     * @param language
     *            key selecting the document and line searchers
     * @return the first result of {@link #searchInLines(Query, String)} for
     *         the combined query, or an empty {@link OCRDoc} if there is no
     *         hit
     * @throws IOException
     *             if searching or reading a document fails
     */
    public OCRDoc searchInLinesDoc(String textId, Query query, String language)
            throws IOException {
        List<OCRDoc> docs = searchInLines(restrictToText(query, textId),
                language);

        if (docs.isEmpty()) {
            return new OCRDoc();
        }
        return docs.get(0);
    }

    /**
     * Searches the documents and collects, for each distinct text among the
     * hits, the matching lines grouped by page.
     *
     * @param query
     *            query to run against the document index and, restricted to
     *            one text at a time, against the line index
     * @param language
     *            key selecting the document and line searchers
     * @return one {@link OCRDoc} per distinct "textId" among the document
     *         hits, in hit order
     * @throws IOException
     *             if searching or reading a document fails
     */
    public List<OCRDoc> searchInLines(Query query, String language)
            throws IOException {

        // First step: search the documents.
        logger.debug("Start searching docs.");

        TopScoreDocCollector col = (TopScoreDocCollector) search(query,
                language);
        TopDocs docs = col.topDocs();
        ScoreDoc[] scoreDocs = docs.scoreDocs;
        ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>();

        LanguageSearcher searcher = languageSearchers
                .getSearcherByLanguage(language);
        LanguageSearcher lineSearcher = languageSearchersLines
                .getSearcherByLanguage(language);

        logger.debug("found docs.");
        Set<String> textIds = new HashSet<String>();
        for (ScoreDoc doc : scoreDocs) {
            Document d = searcher.reader.document(doc.doc);
            String textID = d.get("textId");

            // Each text is searched only once, even if it is hit repeatedly.
            if (!textIds.add(textID)) {
                logger.debug("already done:" + textID);
                continue;
            }

            // Now search the lines of this text.
            ScoreDoc[] lineHits = searchLinesOfText(lineSearcher, query,
                    textID);
            // The shared factory also fills in textId, which this method
            // previously left unset (unlike searchInLinesToDir).
            OCRDoc ocrDoc = newOcrDoc(doc, d, textID);

            Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>();
            for (ScoreDoc line : lineHits) {
                Document lineD = lineSearcher.reader.document(line.doc);
                OCRLine ocrLine = toOcrLine(lineD);
                String pageNumber = getPageName(lineD.get("cleanedPath"));
                logger.debug("collect:" + pageNumber);
                addLineToPage(ocrPages, pageNumber, ocrLine);
            }
            logger.debug("collected:" + textID);

            ocrDoc.linesInPage = ocrPages;
            ocrDocs.add(ocrDoc);
        }

        return ocrDocs;
    }

    /**
     * Builds a query that requires both the given query and an exact match
     * of the "textId" field.
     */
    private Query restrictToText(Query query, String textId) {
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(new TermQuery(new Term("textId", textId)),
                BooleanClause.Occur.MUST);
        booleanQuery.add(query, BooleanClause.Occur.MUST);
        return booleanQuery;
    }

    /**
     * Runs the query, restricted to one text, against the line index and
     * returns at most {@link #MAX_LINES} hits.
     */
    private ScoreDoc[] searchLinesOfText(LanguageSearcher lineSearcher,
            Query query, String textID) throws IOException {
        TopScoreDocCollector lineCol = TopScoreDocCollector.create(MAX_LINES,
                false);
        lineSearcher.searcher.search(restrictToText(query, textID), lineCol);
        logger.debug("Searched:" + textID);
        return lineCol.topDocs().scoreDocs;
    }

    /** Creates the result object for one document hit, without its lines. */
    private OCRDoc newOcrDoc(ScoreDoc hit, Document document, String textID) {
        OCRDoc ocrDoc = new OCRDoc();
        ocrDoc.docId = hit.doc;
        ocrDoc.document = document;
        ocrDoc.textId = textID;
        return ocrDoc;
    }

    /** Copies the stored line fields of a line-index document. */
    private OCRLine toOcrLine(Document lineD) {
        OCRLine ocrLine = new OCRLine();
        ocrLine.pageDimension = lineD.get("pageDimension");
        ocrLine.bbox = lineD.get("bbox");
        ocrLine.lineNumber = lineD.get("lineNumber");
        return ocrLine;
    }

    /** Appends a line to its page, creating the page list on first use. */
    private void addLineToPage(Map<String, ArrayList<OCRLine>> pages,
            String pageName, OCRLine line) {
        ArrayList<OCRLine> page = pages.get(pageName);
        if (page == null) {
            page = new ArrayList<OCRLine>();
            pages.put(pageName, page);
        }
        page.add(line);
    }

    /**
     * Returns the file name part of the path, which is then used as the page
     * name.
     *
     * @param path
     *            path of the page file; a {@code null} path makes the
     *            {@link File} constructor throw a
     *            {@link NullPointerException}
     * @return the last name element of the path
     */
    private String getPageName(String path) {
        File f = new File(path);

        return f.getName();
    }

}