comparison src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java @ 0:72a015318a6d

CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an. https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author dwinter
date Wed, 03 Nov 2010 12:26:20 +0100
parents
children 2b29b0b6db16
comparison
equal deleted inserted replaced
-1:000000000000 0:72a015318a6d
1 package de.mpiwg.dwinter.fulltext.search;
2
3 import java.io.File;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.HashMap;
7 import java.util.HashSet;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.Set;
11
12 import org.apache.log4j.Logger;
13 import org.apache.lucene.analysis.de.GermanAnalyzer;
14 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
15 import org.apache.lucene.analysis.standard.StandardAnalyzer;
16 import org.apache.lucene.document.Document;
17 import org.apache.lucene.index.CorruptIndexException;
18 import org.apache.lucene.index.Term;
19 import org.apache.lucene.search.BooleanQuery;
20 import org.apache.lucene.search.BooleanClause;
21 import org.apache.lucene.search.Collector;
22 import org.apache.lucene.search.Query;
23 import org.apache.lucene.search.ScoreDoc;
24 import org.apache.lucene.search.TermQuery;
25 import org.apache.lucene.search.TopDocs;
26 import org.apache.lucene.search.TopDocsCollector;
27 import org.apache.lucene.search.TopScoreDocCollector;
28 import org.apache.lucene.store.LockObtainFailedException;
29 import org.apache.lucene.util.Version;
30
31 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
32 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
33 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
34 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers;
35 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
36
37 public class FulltextSearchDocsLines extends FulltextSearch {
38
39 protected static Logger logger = Logger.getRootLogger();
40
41 private static final int MAX_LINES = 10000;
42
43 protected File line_index_dir; // Index mit dem Zeilenindex
44
45 public LanguageSearchers languageSearchersLines = new LanguageSearchers();
46
47 public FulltextSearchDocsLines(File index_dir, File line_index_dir)
48 throws CorruptIndexException, LockObtainFailedException,
49 IOException {
50
51 super(index_dir);
52 this.line_index_dir = line_index_dir;
53 init_language_searchers_lines(line_index_dir);
54
55 }
56
57 protected void init_language_searchers_lines(File dir)
58 throws CorruptIndexException, LockObtainFailedException,
59 IOException {
60 languageSearchersLines.add(new LanguageSearcher("de",
61 new GermanAnalyzer(Version.LUCENE_30), dir));
62 languageSearchersLines.add(new LanguageSearcher("en",
63 new StandardAnalyzer(Version.LUCENE_30), dir));
64 languageSearchersLines.add(new LanguageSearcher("fr",
65 new FrenchAnalyzer(Version.LUCENE_30), dir));
66 languageSearchersLines.add(new LanguageSearcher("all",
67 new StandardAnalyzer(Version.LUCENE_30), dir));
68 languageSearchersLines.add(new LanguageSearcher("morph",
69 new StandardAnalyzer(Version.LUCENE_30), dir));
70 languageSearchersLines.add(new LanguageSearcher("la",
71 new StandardAnalyzer(Version.LUCENE_30), dir));
72 }
73
74 /**
75 * Erzeugt Ergebnisliste im Filesystem fuer die Weiterbenutzung Ÿbers
76 * Servlet
77 *
78 * @param query
79 * @param language
80 * @param ticket
81 * ticket unter dem auf die Daten zurŸckgegriffen werden soll.
82 * @return
83 * @throws IOException
84 */
85 public void searchInLinesToDir(Query query, String language, String ticket)
86 throws IOException {
87
88 // first step search docs
89 logger.debug("Start searching docs");
90 TopScoreDocCollector col = (TopScoreDocCollector) search(query,
91 language);
92 TopDocs docs = col.topDocs();
93 ScoreDoc[] scoreDocs = docs.scoreDocs;
94 // ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>();
95
96 TicketWriter tw = new TicketWriter(ticket, query, language);
97
98 LanguageSearcher searcher = languageSearchers
99 .getSearcherByLanguage(language);
100 logger.debug("Start writing docs");
101 tw.writeResultsForLanguageSearch(language, docs, searcher.reader);
102 tw.commitTicket();
103 logger.debug("Wrote docs");
104 LanguageSearcher lineSearcher = languageSearchersLines
105 .getSearcherByLanguage(language);
106
107 Set<String> textIds = new HashSet<String>();
108
109 for (ScoreDoc doc : scoreDocs) {
110 Document d = searcher.reader.document(doc.doc);
111 String textID = d.get("textId");
112 logger.debug("Start:" + textID);
113
114 // teste ob schon gesucht TODO: warum sind manchmal textid mehrfach
115 // in der treffer liste?
116 if (!textIds.contains(textID)) {
117 textIds.add(textID);
118
119 Query textIDQuery = new TermQuery(new Term("textId", textID));
120 //Query[] queries = new Query[] { query, textIDQuery };
121 //Query lineQuery = query.combine(queries);
122
123 BooleanQuery booleanQuery = new BooleanQuery();
124 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST);
125 booleanQuery.add(query, BooleanClause.Occur.MUST);
126 // suche jetzt die Zeilen
127 TopScoreDocCollector lineCol = TopScoreDocCollector.create(
128 MAX_LINES, false);
129 lineSearcher.searcher.search(booleanQuery, lineCol);
130 logger.debug("Searched:" + textID);
131 OCRDoc ocrDoc = new OCRDoc();
132 ocrDoc.docId = doc.doc;
133 ocrDoc.document = d;
134 ocrDoc.textId = d.get("textId");
135
136 Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>();
137
138 for (ScoreDoc line : lineCol.topDocs().scoreDocs) {
139 // fuege alle zeile zusammen
140 OCRLine ocrLine = new OCRLine();
141 Document lineD = lineSearcher.reader.document(line.doc);
142 ocrLine.pageDimension = lineD.get("pageDimension");
143 ocrLine.bbox = lineD.get("bbox");
144 ocrLine.lineNumber = lineD.get("lineNumber");
145
146 String pageNumber = getPageName(lineD.get("cleanedPath"));
147
148 if (!ocrPages.containsKey(pageNumber)) {
149 ocrPages.put(pageNumber, new ArrayList<OCRLine>());
150 }
151
152 ArrayList<OCRLine> page = ocrPages.get(pageNumber);
153 page.add(ocrLine);
154
155 }
156 logger.debug("collected:" + textID);
157 ocrDoc.linesInPage = ocrPages;
158 tw.writeDoc(language, ocrDoc);
159 tw.commitTicket();
160 logger.debug("written:" + textID);
161 } else {
162 logger.debug("already done:" + textID);
163 }
164 }
165 tw.closeTicket(language);
166 logger.debug("everything done!");
167 }
168
169
170 public OCRDoc searchInLinesDoc(String textId,Query query, String language) throws IOException{
171 Query textIDQuery = new TermQuery(new Term("textId", textId));
172 BooleanQuery booleanQuery = new BooleanQuery();
173 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST);
174 booleanQuery.add(query, BooleanClause.Occur.MUST);
175
176 List<OCRDoc> docs = searchInLines(booleanQuery, language);
177
178 if (docs.size()==0)
179 return new OCRDoc();
180 else
181 return docs.get(0);
182
183
184 }
185 public List<OCRDoc> searchInLines(Query query, String language)
186 throws IOException {
187
188 // first step search docs
189 logger.debug("Start searching docs.");
190
191 TopScoreDocCollector col = (TopScoreDocCollector) search(query,
192 language);
193 TopDocs docs = col.topDocs();
194 ScoreDoc[] scoreDocs = docs.scoreDocs;
195 ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>();
196
197 LanguageSearcher searcher = languageSearchers
198 .getSearcherByLanguage(language);
199 LanguageSearcher lineSearcher = languageSearchersLines
200 .getSearcherByLanguage(language);
201
202 logger.debug("found docs.");
203 Set<String> textIds = new HashSet<String>();
204 for (ScoreDoc doc : scoreDocs) {
205 Document d = searcher.reader.document(doc.doc);
206 String textID = d.get("textId");
207
208 if (!textIds.contains(textID)) {
209 textIds.add(textID);
210
211 Query textIDQuery = new TermQuery(new Term("textId", textID));
212 //Query[] queries = new Query[] { query, textIDQuery };
213 //Query lineQuery = query.combine(queries);
214
215 BooleanQuery booleanQuery = new BooleanQuery();
216 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST);
217 booleanQuery.add(query, BooleanClause.Occur.MUST);
218
219
220 // suche jtzt die Zeilen
221 TopScoreDocCollector lineCol = TopScoreDocCollector.create(
222 MAX_LINES, false);
223 lineSearcher.searcher.search(booleanQuery, lineCol);
224 logger.debug("Searched:" + textID);
225 OCRDoc ocrDoc = new OCRDoc();
226 ocrDoc.docId = doc.doc;
227 ocrDoc.document = d;
228
229 Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>();
230
231 for (ScoreDoc line : lineCol.topDocs().scoreDocs) {
232 // fuege alle zeile zusammen
233 OCRLine ocrLine = new OCRLine();
234 Document lineD = lineSearcher.reader.document(line.doc);
235 ocrLine.pageDimension = lineD.get("pageDimension");
236 ocrLine.bbox = lineD.get("bbox");
237 ocrLine.lineNumber = lineD.get("lineNumber");
238
239 String pageNumber = getPageName(lineD.get("cleanedPath"));
240 logger.debug("collect:" + pageNumber);
241 if (!ocrPages.containsKey(pageNumber)) {
242 ocrPages.put(pageNumber, new ArrayList<OCRLine>());
243 }
244
245 ArrayList<OCRLine> page = ocrPages.get(pageNumber);
246 page.add(ocrLine);
247 }
248 logger.debug("collected:" + textID);
249 ocrDoc.linesInPage = ocrPages;
250 ocrDocs.add(ocrDoc);
251 } else {
252 logger.debug("already done:" + textID);
253 }
254 }
255
256 return ocrDocs;
257 }
258
259 /**
260 * Gibt aus dem Pfad denDateinamen zurueck, der dann als Seitenname benutzt
261 * wird.
262 *
263 * @param path
264 * @return
265 */
266 private String getPageName(String path) {
267 File f = new File(path);
268
269 return f.getName();
270 }
271
272 }