Mercurial > hg > fulltextSearch
view src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java @ 0:72a015318a6d
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:20 +0100 |
parents | |
children | 2b29b0b6db16 |
line wrap: on
line source
package de.mpiwg.dwinter.fulltext.search; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Collector; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocsCollector; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; public class FulltextSearchDocsLines extends FulltextSearch { protected static Logger logger = Logger.getRootLogger(); private static final int MAX_LINES = 10000; protected File line_index_dir; // Index mit dem Zeilenindex public LanguageSearchers languageSearchersLines = new LanguageSearchers(); public FulltextSearchDocsLines(File index_dir, File line_index_dir) throws CorruptIndexException, LockObtainFailedException, IOException { super(index_dir); this.line_index_dir = line_index_dir; init_language_searchers_lines(line_index_dir); } protected void init_language_searchers_lines(File dir) throws CorruptIndexException, LockObtainFailedException, IOException { languageSearchersLines.add(new LanguageSearcher("de", new GermanAnalyzer(Version.LUCENE_30), dir)); languageSearchersLines.add(new LanguageSearcher("en", new StandardAnalyzer(Version.LUCENE_30), dir)); languageSearchersLines.add(new LanguageSearcher("fr", new FrenchAnalyzer(Version.LUCENE_30), dir)); languageSearchersLines.add(new LanguageSearcher("all", new StandardAnalyzer(Version.LUCENE_30), dir)); languageSearchersLines.add(new LanguageSearcher("morph", new StandardAnalyzer(Version.LUCENE_30), dir)); languageSearchersLines.add(new LanguageSearcher("la", new StandardAnalyzer(Version.LUCENE_30), dir)); } /** * Erzeugt Ergebnisliste im Filesystem fuer die Weiterbenutzung bers * Servlet * * @param query * @param language * @param ticket * ticket unter dem auf die Daten zurckgegriffen werden soll. * @return * @throws IOException */ public void searchInLinesToDir(Query query, String language, String ticket) throws IOException { // first step search docs logger.debug("Start searching docs"); TopScoreDocCollector col = (TopScoreDocCollector) search(query, language); TopDocs docs = col.topDocs(); ScoreDoc[] scoreDocs = docs.scoreDocs; // ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>(); TicketWriter tw = new TicketWriter(ticket, query, language); LanguageSearcher searcher = languageSearchers .getSearcherByLanguage(language); logger.debug("Start writing docs"); tw.writeResultsForLanguageSearch(language, docs, searcher.reader); tw.commitTicket(); logger.debug("Wrote docs"); LanguageSearcher lineSearcher = languageSearchersLines .getSearcherByLanguage(language); Set<String> textIds = new HashSet<String>(); for (ScoreDoc doc : scoreDocs) { Document d = searcher.reader.document(doc.doc); String textID = d.get("textId"); logger.debug("Start:" + textID); // teste ob schon gesucht TODO: warum sind manchmal textid mehrfach // in der treffer liste? if (!textIds.contains(textID)) { textIds.add(textID); Query textIDQuery = new TermQuery(new Term("textId", textID)); //Query[] queries = new Query[] { query, textIDQuery }; //Query lineQuery = query.combine(queries); BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); booleanQuery.add(query, BooleanClause.Occur.MUST); // suche jetzt die Zeilen TopScoreDocCollector lineCol = TopScoreDocCollector.create( MAX_LINES, false); lineSearcher.searcher.search(booleanQuery, lineCol); logger.debug("Searched:" + textID); OCRDoc ocrDoc = new OCRDoc(); ocrDoc.docId = doc.doc; ocrDoc.document = d; ocrDoc.textId = d.get("textId"); Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>(); for (ScoreDoc line : lineCol.topDocs().scoreDocs) { // fuege alle zeile zusammen OCRLine ocrLine = new OCRLine(); Document lineD = lineSearcher.reader.document(line.doc); ocrLine.pageDimension = lineD.get("pageDimension"); ocrLine.bbox = lineD.get("bbox"); ocrLine.lineNumber = lineD.get("lineNumber"); String pageNumber = getPageName(lineD.get("cleanedPath")); if (!ocrPages.containsKey(pageNumber)) { ocrPages.put(pageNumber, new ArrayList<OCRLine>()); } ArrayList<OCRLine> page = ocrPages.get(pageNumber); page.add(ocrLine); } logger.debug("collected:" + textID); ocrDoc.linesInPage = ocrPages; tw.writeDoc(language, ocrDoc); tw.commitTicket(); logger.debug("written:" + textID); } else { logger.debug("already done:" + textID); } } tw.closeTicket(language); logger.debug("everything done!"); } public OCRDoc searchInLinesDoc(String textId,Query query, String language) throws IOException{ Query textIDQuery = new TermQuery(new Term("textId", textId)); BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); booleanQuery.add(query, BooleanClause.Occur.MUST); List<OCRDoc> docs = searchInLines(booleanQuery, language); if (docs.size()==0) return new OCRDoc(); else return docs.get(0); } public List<OCRDoc> searchInLines(Query query, String language) throws IOException { // first step search docs logger.debug("Start searching docs."); TopScoreDocCollector col = (TopScoreDocCollector) search(query, language); TopDocs docs = col.topDocs(); ScoreDoc[] scoreDocs = docs.scoreDocs; ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>(); LanguageSearcher searcher = languageSearchers .getSearcherByLanguage(language); LanguageSearcher lineSearcher = languageSearchersLines .getSearcherByLanguage(language); logger.debug("found docs."); Set<String> textIds = new HashSet<String>(); for (ScoreDoc doc : scoreDocs) { Document d = searcher.reader.document(doc.doc); String textID = d.get("textId"); if (!textIds.contains(textID)) { textIds.add(textID); Query textIDQuery = new TermQuery(new Term("textId", textID)); //Query[] queries = new Query[] { query, textIDQuery }; //Query lineQuery = query.combine(queries); BooleanQuery booleanQuery = new BooleanQuery(); booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); booleanQuery.add(query, BooleanClause.Occur.MUST); // suche jtzt die Zeilen TopScoreDocCollector lineCol = TopScoreDocCollector.create( MAX_LINES, false); lineSearcher.searcher.search(booleanQuery, lineCol); logger.debug("Searched:" + textID); OCRDoc ocrDoc = new OCRDoc(); ocrDoc.docId = doc.doc; ocrDoc.document = d; Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>(); for (ScoreDoc line : lineCol.topDocs().scoreDocs) { // fuege alle zeile zusammen OCRLine ocrLine = new OCRLine(); Document lineD = lineSearcher.reader.document(line.doc); ocrLine.pageDimension = lineD.get("pageDimension"); ocrLine.bbox = lineD.get("bbox"); ocrLine.lineNumber = lineD.get("lineNumber"); String pageNumber = getPageName(lineD.get("cleanedPath")); logger.debug("collect:" + pageNumber); if (!ocrPages.containsKey(pageNumber)) { ocrPages.put(pageNumber, new ArrayList<OCRLine>()); } ArrayList<OCRLine> page = ocrPages.get(pageNumber); page.add(ocrLine); } logger.debug("collected:" + textID); ocrDoc.linesInPage = ocrPages; ocrDocs.add(ocrDoc); } else { logger.debug("already done:" + textID); } } return ocrDocs; } /** * Gibt aus dem Pfad denDateinamen zurueck, der dann als Seitenname benutzt * wird. * * @param path * @return */ private String getPageName(String path) { File f = new File(path); return f.getName(); } }