Mercurial > hg > fulltextSearch
comparison src/de/mpiwg/dwinter/fulltext/search/FulltextSearchDocsLines.java @ 0:72a015318a6d
CLOSED - # 16: Zeige nur eine konfigurierbare Anzahl von Treffern an.
https://it-dev.mpiwg-berlin.mpg.de/tracs/pythonOcropusTools/ticket/16
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:26:20 +0100 |
parents | |
children | 2b29b0b6db16 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:72a015318a6d |
---|---|
1 package de.mpiwg.dwinter.fulltext.search; | |
2 | |
3 import java.io.File; | |
4 import java.io.IOException; | |
5 import java.util.ArrayList; | |
6 import java.util.HashMap; | |
7 import java.util.HashSet; | |
8 import java.util.List; | |
9 import java.util.Map; | |
10 import java.util.Set; | |
11 | |
12 import org.apache.log4j.Logger; | |
13 import org.apache.lucene.analysis.de.GermanAnalyzer; | |
14 import org.apache.lucene.analysis.fr.FrenchAnalyzer; | |
15 import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
16 import org.apache.lucene.document.Document; | |
17 import org.apache.lucene.index.CorruptIndexException; | |
18 import org.apache.lucene.index.Term; | |
19 import org.apache.lucene.search.BooleanQuery; | |
20 import org.apache.lucene.search.BooleanClause; | |
21 import org.apache.lucene.search.Collector; | |
22 import org.apache.lucene.search.Query; | |
23 import org.apache.lucene.search.ScoreDoc; | |
24 import org.apache.lucene.search.TermQuery; | |
25 import org.apache.lucene.search.TopDocs; | |
26 import org.apache.lucene.search.TopDocsCollector; | |
27 import org.apache.lucene.search.TopScoreDocCollector; | |
28 import org.apache.lucene.store.LockObtainFailedException; | |
29 import org.apache.lucene.util.Version; | |
30 | |
31 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; | |
32 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; | |
33 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; | |
34 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearchers; | |
35 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; | |
36 | |
37 public class FulltextSearchDocsLines extends FulltextSearch { | |
38 | |
39 protected static Logger logger = Logger.getRootLogger(); | |
40 | |
41 private static final int MAX_LINES = 10000; | |
42 | |
43 protected File line_index_dir; // Index mit dem Zeilenindex | |
44 | |
45 public LanguageSearchers languageSearchersLines = new LanguageSearchers(); | |
46 | |
47 public FulltextSearchDocsLines(File index_dir, File line_index_dir) | |
48 throws CorruptIndexException, LockObtainFailedException, | |
49 IOException { | |
50 | |
51 super(index_dir); | |
52 this.line_index_dir = line_index_dir; | |
53 init_language_searchers_lines(line_index_dir); | |
54 | |
55 } | |
56 | |
57 protected void init_language_searchers_lines(File dir) | |
58 throws CorruptIndexException, LockObtainFailedException, | |
59 IOException { | |
60 languageSearchersLines.add(new LanguageSearcher("de", | |
61 new GermanAnalyzer(Version.LUCENE_30), dir)); | |
62 languageSearchersLines.add(new LanguageSearcher("en", | |
63 new StandardAnalyzer(Version.LUCENE_30), dir)); | |
64 languageSearchersLines.add(new LanguageSearcher("fr", | |
65 new FrenchAnalyzer(Version.LUCENE_30), dir)); | |
66 languageSearchersLines.add(new LanguageSearcher("all", | |
67 new StandardAnalyzer(Version.LUCENE_30), dir)); | |
68 languageSearchersLines.add(new LanguageSearcher("morph", | |
69 new StandardAnalyzer(Version.LUCENE_30), dir)); | |
70 languageSearchersLines.add(new LanguageSearcher("la", | |
71 new StandardAnalyzer(Version.LUCENE_30), dir)); | |
72 } | |
73 | |
74 /** | |
75 * Erzeugt Ergebnisliste im Filesystem fuer die Weiterbenutzung bers | |
76 * Servlet | |
77 * | |
78 * @param query | |
79 * @param language | |
80 * @param ticket | |
81 * ticket unter dem auf die Daten zurckgegriffen werden soll. | |
82 * @return | |
83 * @throws IOException | |
84 */ | |
85 public void searchInLinesToDir(Query query, String language, String ticket) | |
86 throws IOException { | |
87 | |
88 // first step search docs | |
89 logger.debug("Start searching docs"); | |
90 TopScoreDocCollector col = (TopScoreDocCollector) search(query, | |
91 language); | |
92 TopDocs docs = col.topDocs(); | |
93 ScoreDoc[] scoreDocs = docs.scoreDocs; | |
94 // ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>(); | |
95 | |
96 TicketWriter tw = new TicketWriter(ticket, query, language); | |
97 | |
98 LanguageSearcher searcher = languageSearchers | |
99 .getSearcherByLanguage(language); | |
100 logger.debug("Start writing docs"); | |
101 tw.writeResultsForLanguageSearch(language, docs, searcher.reader); | |
102 tw.commitTicket(); | |
103 logger.debug("Wrote docs"); | |
104 LanguageSearcher lineSearcher = languageSearchersLines | |
105 .getSearcherByLanguage(language); | |
106 | |
107 Set<String> textIds = new HashSet<String>(); | |
108 | |
109 for (ScoreDoc doc : scoreDocs) { | |
110 Document d = searcher.reader.document(doc.doc); | |
111 String textID = d.get("textId"); | |
112 logger.debug("Start:" + textID); | |
113 | |
114 // teste ob schon gesucht TODO: warum sind manchmal textid mehrfach | |
115 // in der treffer liste? | |
116 if (!textIds.contains(textID)) { | |
117 textIds.add(textID); | |
118 | |
119 Query textIDQuery = new TermQuery(new Term("textId", textID)); | |
120 //Query[] queries = new Query[] { query, textIDQuery }; | |
121 //Query lineQuery = query.combine(queries); | |
122 | |
123 BooleanQuery booleanQuery = new BooleanQuery(); | |
124 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); | |
125 booleanQuery.add(query, BooleanClause.Occur.MUST); | |
126 // suche jetzt die Zeilen | |
127 TopScoreDocCollector lineCol = TopScoreDocCollector.create( | |
128 MAX_LINES, false); | |
129 lineSearcher.searcher.search(booleanQuery, lineCol); | |
130 logger.debug("Searched:" + textID); | |
131 OCRDoc ocrDoc = new OCRDoc(); | |
132 ocrDoc.docId = doc.doc; | |
133 ocrDoc.document = d; | |
134 ocrDoc.textId = d.get("textId"); | |
135 | |
136 Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>(); | |
137 | |
138 for (ScoreDoc line : lineCol.topDocs().scoreDocs) { | |
139 // fuege alle zeile zusammen | |
140 OCRLine ocrLine = new OCRLine(); | |
141 Document lineD = lineSearcher.reader.document(line.doc); | |
142 ocrLine.pageDimension = lineD.get("pageDimension"); | |
143 ocrLine.bbox = lineD.get("bbox"); | |
144 ocrLine.lineNumber = lineD.get("lineNumber"); | |
145 | |
146 String pageNumber = getPageName(lineD.get("cleanedPath")); | |
147 | |
148 if (!ocrPages.containsKey(pageNumber)) { | |
149 ocrPages.put(pageNumber, new ArrayList<OCRLine>()); | |
150 } | |
151 | |
152 ArrayList<OCRLine> page = ocrPages.get(pageNumber); | |
153 page.add(ocrLine); | |
154 | |
155 } | |
156 logger.debug("collected:" + textID); | |
157 ocrDoc.linesInPage = ocrPages; | |
158 tw.writeDoc(language, ocrDoc); | |
159 tw.commitTicket(); | |
160 logger.debug("written:" + textID); | |
161 } else { | |
162 logger.debug("already done:" + textID); | |
163 } | |
164 } | |
165 tw.closeTicket(language); | |
166 logger.debug("everything done!"); | |
167 } | |
168 | |
169 | |
170 public OCRDoc searchInLinesDoc(String textId,Query query, String language) throws IOException{ | |
171 Query textIDQuery = new TermQuery(new Term("textId", textId)); | |
172 BooleanQuery booleanQuery = new BooleanQuery(); | |
173 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); | |
174 booleanQuery.add(query, BooleanClause.Occur.MUST); | |
175 | |
176 List<OCRDoc> docs = searchInLines(booleanQuery, language); | |
177 | |
178 if (docs.size()==0) | |
179 return new OCRDoc(); | |
180 else | |
181 return docs.get(0); | |
182 | |
183 | |
184 } | |
185 public List<OCRDoc> searchInLines(Query query, String language) | |
186 throws IOException { | |
187 | |
188 // first step search docs | |
189 logger.debug("Start searching docs."); | |
190 | |
191 TopScoreDocCollector col = (TopScoreDocCollector) search(query, | |
192 language); | |
193 TopDocs docs = col.topDocs(); | |
194 ScoreDoc[] scoreDocs = docs.scoreDocs; | |
195 ArrayList<OCRDoc> ocrDocs = new ArrayList<OCRDoc>(); | |
196 | |
197 LanguageSearcher searcher = languageSearchers | |
198 .getSearcherByLanguage(language); | |
199 LanguageSearcher lineSearcher = languageSearchersLines | |
200 .getSearcherByLanguage(language); | |
201 | |
202 logger.debug("found docs."); | |
203 Set<String> textIds = new HashSet<String>(); | |
204 for (ScoreDoc doc : scoreDocs) { | |
205 Document d = searcher.reader.document(doc.doc); | |
206 String textID = d.get("textId"); | |
207 | |
208 if (!textIds.contains(textID)) { | |
209 textIds.add(textID); | |
210 | |
211 Query textIDQuery = new TermQuery(new Term("textId", textID)); | |
212 //Query[] queries = new Query[] { query, textIDQuery }; | |
213 //Query lineQuery = query.combine(queries); | |
214 | |
215 BooleanQuery booleanQuery = new BooleanQuery(); | |
216 booleanQuery.add(textIDQuery, BooleanClause.Occur.MUST); | |
217 booleanQuery.add(query, BooleanClause.Occur.MUST); | |
218 | |
219 | |
220 // suche jtzt die Zeilen | |
221 TopScoreDocCollector lineCol = TopScoreDocCollector.create( | |
222 MAX_LINES, false); | |
223 lineSearcher.searcher.search(booleanQuery, lineCol); | |
224 logger.debug("Searched:" + textID); | |
225 OCRDoc ocrDoc = new OCRDoc(); | |
226 ocrDoc.docId = doc.doc; | |
227 ocrDoc.document = d; | |
228 | |
229 Map<String, ArrayList<OCRLine>> ocrPages = new HashMap<String, ArrayList<OCRLine>>(); | |
230 | |
231 for (ScoreDoc line : lineCol.topDocs().scoreDocs) { | |
232 // fuege alle zeile zusammen | |
233 OCRLine ocrLine = new OCRLine(); | |
234 Document lineD = lineSearcher.reader.document(line.doc); | |
235 ocrLine.pageDimension = lineD.get("pageDimension"); | |
236 ocrLine.bbox = lineD.get("bbox"); | |
237 ocrLine.lineNumber = lineD.get("lineNumber"); | |
238 | |
239 String pageNumber = getPageName(lineD.get("cleanedPath")); | |
240 logger.debug("collect:" + pageNumber); | |
241 if (!ocrPages.containsKey(pageNumber)) { | |
242 ocrPages.put(pageNumber, new ArrayList<OCRLine>()); | |
243 } | |
244 | |
245 ArrayList<OCRLine> page = ocrPages.get(pageNumber); | |
246 page.add(ocrLine); | |
247 } | |
248 logger.debug("collected:" + textID); | |
249 ocrDoc.linesInPage = ocrPages; | |
250 ocrDocs.add(ocrDoc); | |
251 } else { | |
252 logger.debug("already done:" + textID); | |
253 } | |
254 } | |
255 | |
256 return ocrDocs; | |
257 } | |
258 | |
259 /** | |
260 * Gibt aus dem Pfad denDateinamen zurueck, der dann als Seitenname benutzt | |
261 * wird. | |
262 * | |
263 * @param path | |
264 * @return | |
265 */ | |
266 private String getPageName(String path) { | |
267 File f = new File(path); | |
268 | |
269 return f.getName(); | |
270 } | |
271 | |
272 } |