Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/lucene/IndexHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.cms.lucene; | |
2 | |
3 import java.io.File; | |
4 import java.io.FileInputStream; | |
5 import java.io.FileReader; | |
6 import java.io.IOException; | |
7 import java.io.InputStreamReader; | |
8 import java.io.StringReader; | |
9 import java.util.ArrayList; | |
10 import java.util.Date; | |
11 import java.util.HashMap; | |
12 import java.util.HashSet; | |
13 import java.util.Map; | |
14 | |
15 import org.apache.commons.io.FileUtils; | |
16 import org.apache.lucene.analysis.Analyzer; | |
17 import org.apache.lucene.analysis.KeywordAnalyzer; | |
18 import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; | |
19 import org.apache.lucene.analysis.TokenStream; | |
20 import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
21 import org.apache.lucene.document.Document; | |
22 import org.apache.lucene.document.Field; | |
23 import org.apache.lucene.document.FieldSelector; | |
24 import org.apache.lucene.document.Fieldable; | |
25 import org.apache.lucene.document.SetBasedFieldSelector; | |
26 import org.apache.lucene.index.IndexReader; | |
27 import org.apache.lucene.index.IndexWriter; | |
28 import org.apache.lucene.index.IndexWriterConfig; | |
29 import org.apache.lucene.index.IndexWriterConfig.OpenMode; | |
30 import org.apache.lucene.index.Term; | |
31 import org.apache.lucene.index.TermEnum; | |
32 import org.apache.lucene.index.TermFreqVector; | |
33 import org.apache.lucene.queryParser.QueryParser; | |
34 import org.apache.lucene.search.BooleanClause; | |
35 import org.apache.lucene.search.BooleanQuery; | |
36 import org.apache.lucene.search.FuzzyQuery; | |
37 import org.apache.lucene.search.IndexSearcher; | |
38 import org.apache.lucene.search.MatchAllDocsQuery; | |
39 import org.apache.lucene.search.PhraseQuery; | |
40 import org.apache.lucene.search.PrefixQuery; | |
41 import org.apache.lucene.search.Query; | |
42 import org.apache.lucene.search.SearcherManager; | |
43 import org.apache.lucene.search.Sort; | |
44 import org.apache.lucene.search.SortField; | |
45 import org.apache.lucene.search.TermQuery; | |
46 import org.apache.lucene.search.TermRangeQuery; | |
47 import org.apache.lucene.search.TopDocs; | |
48 import org.apache.lucene.search.highlight.Highlighter; | |
49 import org.apache.lucene.search.highlight.QueryScorer; | |
50 import org.apache.lucene.search.highlight.SimpleHTMLFormatter; | |
51 import org.apache.lucene.search.highlight.TextFragment; | |
52 import org.apache.lucene.search.highlight.TokenSources; | |
53 import org.apache.lucene.search.similar.MoreLikeThis; | |
54 import org.apache.lucene.store.FSDirectory; | |
55 import org.apache.lucene.util.Version; | |
56 import org.xml.sax.InputSource; | |
57 import org.xml.sax.SAXException; | |
58 import org.xml.sax.XMLReader; | |
59 | |
60 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
61 | |
62 import de.mpg.mpiwg.berlin.mpdl.cms.confmanager.CollectionReader; | |
63 import de.mpg.mpiwg.berlin.mpdl.cms.confmanager.ConfManagerResultWrapper; | |
64 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler; | |
65 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; | |
66 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; | |
67 import de.mpg.mpiwg.berlin.mpdl.cms.document.Token; | |
68 import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants; | |
69 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation; | |
70 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer; | |
71 import de.mpg.mpiwg.berlin.mpdl.cms.translator.MicrosoftTranslator; | |
72 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
73 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | |
74 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | |
75 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
76 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; | |
77 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler; | |
78 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer; | |
79 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler; | |
80 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; | |
81 import de.mpg.mpiwg.berlin.mpdl.util.Util; | |
82 | |
public class IndexHandler {
  // Lazily-created singleton instance (see getInstance()).
  private static IndexHandler instance;
  // Writer for the per-document index (one Lucene Document per CMS document).
  private IndexWriter documentsIndexWriter;
  // Writer for the per-node index (one Lucene Document per indexed XML element).
  private IndexWriter nodesIndexWriter;
  // SearcherManagers provide refreshable, shared IndexSearchers over the two writers.
  private SearcherManager documentsSearcherManager;
  private SearcherManager nodesSearcherManager;
  // Reader used for term vectors / MoreLikeThis over the documents index.
  private IndexReader documentsIndexReader;
  // Per-field analyzers so e.g. "*Sorted" fields can use a different analyzer than token fields.
  private PerFieldAnalyzerWrapper documentsPerFieldAnalyzer;
  private PerFieldAnalyzerWrapper nodesPerFieldAnalyzer;
93 | |
94 public static IndexHandler getInstance() throws ApplicationException { | |
95 if (instance == null) { | |
96 instance = new IndexHandler(); | |
97 instance.init(); | |
98 } | |
99 return instance; | |
100 } | |
101 | |
  /**
   * Opens both index writers, their searcher managers and the documents reader.
   * Order matters: the SearcherManagers are constructed over the already-open writers.
   */
  private void init() throws ApplicationException {
    documentsIndexWriter = getDocumentsWriter();
    // allow very large documents; default max field length would silently truncate tokens
    documentsIndexWriter.setMaxFieldLength(1000000);
    nodesIndexWriter = getNodesWriter();
    nodesIndexWriter.setMaxFieldLength(1000000);
    documentsSearcherManager = getNewSearcherManager(documentsIndexWriter);
    nodesSearcherManager = getNewSearcherManager(nodesIndexWriter);
    documentsIndexReader = getDocumentsReader();
  }
111 | |
112 public void indexDocument(CmsDocOperation docOperation) throws ApplicationException { | |
113 try { | |
114 // first delete document in documentsIndex and nodesIndex | |
115 deleteDocumentLocal(docOperation); | |
116 indexDocumentLocal(docOperation); | |
117 documentsIndexWriter.commit(); | |
118 nodesIndexWriter.commit(); | |
119 } catch (Exception e) { | |
120 try { | |
121 documentsIndexWriter.rollback(); | |
122 nodesIndexWriter.rollback(); | |
123 } catch (Exception ex) { | |
124 // nothing | |
125 } | |
126 throw new ApplicationException(e); | |
127 } | |
128 } | |
129 | |
130 private void indexDocumentLocal(CmsDocOperation docOperation) throws ApplicationException { | |
131 FileReader fr = null; | |
132 try { | |
133 MetadataRecord mdRecord = docOperation.getMdRecord(); | |
134 String docId = mdRecord.getDocId(); | |
135 DocumentHandler docHandler = new DocumentHandler(); | |
136 String docFileName = docHandler.getDocFullFileName(docId) + ".upgrade"; | |
137 // add document to documentsIndex | |
138 Document doc = new Document(); | |
139 Field docIdField = new Field("docId", docId, Field.Store.YES, Field.Index.ANALYZED); | |
140 doc.add(docIdField); | |
141 String docIdSortedStr = docId.toLowerCase(); // so that sorting is lower case | |
142 Field docIdFieldSorted = new Field("docIdSorted", docIdSortedStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
143 doc.add(docIdFieldSorted); | |
144 String identifier = mdRecord.getIdentifier(); | |
145 if (identifier != null) { | |
146 Field identifierField = new Field("identifier", identifier, Field.Store.YES, Field.Index.ANALYZED); | |
147 doc.add(identifierField); | |
148 } | |
149 String uri = docOperation.getSrcUrl(); | |
150 if (uri != null) { | |
151 Field uriField = new Field("uri", uri, Field.Store.YES, Field.Index.ANALYZED); | |
152 doc.add(uriField); | |
153 } | |
154 String collectionNames = docOperation.getCollectionNames(); | |
155 if (collectionNames != null) { | |
156 Field collectionNamesField = new Field("collectionNames", collectionNames, Field.Store.YES, Field.Index.ANALYZED); | |
157 doc.add(collectionNamesField); | |
158 } | |
159 if (mdRecord.getCreator() != null) { | |
160 String authorStr = mdRecord.getCreator(); | |
161 Field authorField = new Field("author", authorStr, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
162 doc.add(authorField); | |
163 if (authorStr != null) | |
164 authorStr = authorStr.toLowerCase(); // so that sorting is lower case | |
165 Field authorFieldSorted = new Field("authorSorted", authorStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
166 doc.add(authorFieldSorted); | |
167 } | |
168 if (mdRecord.getTitle() != null) { | |
169 String titleStr = mdRecord.getTitle(); | |
170 Field titleField = new Field("title", titleStr, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
171 doc.add(titleField); | |
172 if (titleStr != null) | |
173 titleStr = titleStr.toLowerCase(); // so that sorting is lower case | |
174 Field titleFieldSorted = new Field("titleSorted", titleStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
175 doc.add(titleFieldSorted); | |
176 } | |
177 if (mdRecord.getLanguage() != null) { | |
178 String langStr = mdRecord.getLanguage(); | |
179 if (langStr != null) | |
180 langStr = langStr.toLowerCase(); // all language codes are lower case | |
181 Field languageField = new Field("language",langStr, Field.Store.YES, Field.Index.ANALYZED); | |
182 doc.add(languageField); | |
183 Field languageFieldSorted = new Field("languageSorted", langStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
184 doc.add(languageFieldSorted); | |
185 } | |
186 if (mdRecord.getPublisher() != null) { | |
187 String publisherStr = mdRecord.getPublisher(); | |
188 Field publisherField = new Field("publisher", publisherStr, Field.Store.YES, Field.Index.ANALYZED); | |
189 doc.add(publisherField); | |
190 if (publisherStr != null) | |
191 publisherStr = publisherStr.toLowerCase(); // so that sorting is lower case | |
192 Field publisherFieldSorted = new Field("publisherSorted", publisherStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
193 doc.add(publisherFieldSorted); | |
194 } | |
195 if (mdRecord.getYear() != null) { | |
196 Field dateField = new Field("date", mdRecord.getYear(), Field.Store.YES, Field.Index.ANALYZED); | |
197 doc.add(dateField); | |
198 Field dateFieldSorted = new Field("dateSorted", mdRecord.getYear(), Field.Store.YES, Field.Index.NOT_ANALYZED); | |
199 doc.add(dateFieldSorted); | |
200 } | |
201 if (mdRecord.getSubject() != null) { | |
202 Field subjectField = new Field("subject", mdRecord.getSubject(), Field.Store.YES, Field.Index.ANALYZED); | |
203 doc.add(subjectField); | |
204 } | |
205 if (mdRecord.getRights() != null) { | |
206 Field rightsField = new Field("rights", mdRecord.getRights(), Field.Store.YES, Field.Index.ANALYZED); | |
207 doc.add(rightsField); | |
208 } | |
209 if (mdRecord.getLicense() != null) { | |
210 Field licenseField = new Field("license", mdRecord.getLicense(), Field.Store.YES, Field.Index.ANALYZED); | |
211 doc.add(licenseField); | |
212 } | |
213 if (mdRecord.getAccessRights() != null) { | |
214 Field accessRightsField = new Field("accessRights", mdRecord.getAccessRights(), Field.Store.YES, Field.Index.ANALYZED); | |
215 doc.add(accessRightsField); | |
216 } | |
217 String echoId = mdRecord.getEchoId(); | |
218 if (echoId != null) { | |
219 Field echoIdField = new Field("echoId", echoId, Field.Store.YES, Field.Index.ANALYZED); | |
220 doc.add(echoIdField); | |
221 } | |
222 String echoPageImageDir = mdRecord.getEchoPageImageDir(); | |
223 if (echoPageImageDir != null) { | |
224 Field echoPageImageDirField = new Field("echoPageImageDir", echoPageImageDir, Field.Store.YES, Field.Index.ANALYZED); | |
225 doc.add(echoPageImageDirField); | |
226 } | |
227 String echoFiguresDir = mdRecord.getEchoFiguresDir(); | |
228 if (echoFiguresDir != null) { | |
229 Field echoFiguresDirField = new Field("echoFiguresDir", echoFiguresDir, Field.Store.YES, Field.Index.ANALYZED); | |
230 doc.add(echoFiguresDirField); | |
231 } | |
232 String mpiwgDocId = mdRecord.getMpiwgDocId(); | |
233 if (mpiwgDocId != null) { | |
234 Field mpiwgDocIdField = new Field("mpiwgDocId", mpiwgDocId, Field.Store.YES, Field.Index.ANALYZED); | |
235 doc.add(mpiwgDocIdField); | |
236 } | |
237 if (mdRecord.getLastModified() != null) { | |
238 Date lastModified = mdRecord.getLastModified(); | |
239 String xsDateStr = new Util().toXsDate(lastModified); | |
240 Field lastModifiedField = new Field("lastModified", xsDateStr, Field.Store.YES, Field.Index.ANALYZED); | |
241 doc.add(lastModifiedField); | |
242 long time = lastModified.getTime(); | |
243 String timeStr = String.valueOf(time); | |
244 Field lastModifiedFieldSorted = new Field("lastModifiedSorted", timeStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
245 doc.add(lastModifiedFieldSorted); | |
246 } | |
247 if (mdRecord.getSchemaName() != null) { | |
248 String schemNameStr = mdRecord.getSchemaName(); | |
249 Field schemaField = new Field("schemaName", schemNameStr, Field.Store.YES, Field.Index.ANALYZED); | |
250 doc.add(schemaField); | |
251 if (schemNameStr != null) | |
252 schemNameStr = schemNameStr.toLowerCase(); // so that sorting is lower case | |
253 Field schemaFieldSorted = new Field("schemaNameSorted", schemNameStr, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
254 doc.add(schemaFieldSorted); | |
255 } | |
256 | |
257 String language = mdRecord.getLanguage(); | |
258 InputStreamReader docFileReader = new InputStreamReader(new FileInputStream(docFileName), "utf-8"); | |
259 // to guarantee that utf-8 is used (if not done, it does not work on Tomcat which has another default charset) | |
260 XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader); | |
261 docXmlTokenizer.setDocIdentifier(docId); | |
262 docXmlTokenizer.setLanguage(language); | |
263 docXmlTokenizer.setOutputFormat("string"); | |
264 String[] outputOptionsWithLemmas = { "withLemmas" }; // so all tokens are | |
265 // fetched with lemmas (costs performance) | |
266 docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas); | |
267 String[] normFunctionNone = { "none" }; | |
268 docXmlTokenizer.setNormFunctions(normFunctionNone); | |
269 docXmlTokenizer.tokenize(); | |
270 | |
271 int pageCount = docXmlTokenizer.getPageCount(); | |
272 if (pageCount == 0) | |
273 pageCount = 1; // each document at least has one page | |
274 String pageCountStr = String.valueOf(pageCount); | |
275 Field pageCountField = new Field("pageCount", pageCountStr, Field.Store.YES, Field.Index.ANALYZED); | |
276 doc.add(pageCountField); | |
277 | |
278 String[] outputOptionsEmpty = {}; | |
279 docXmlTokenizer.setOutputOptions(outputOptionsEmpty); | |
280 // must be set to null so that the normalization function works | |
281 String docTokensOrig = docXmlTokenizer.getStringResult(); | |
282 String[] normFunctionReg = { "reg" }; | |
283 docXmlTokenizer.setNormFunctions(normFunctionReg); | |
284 String docTokensReg = docXmlTokenizer.getStringResult(); | |
285 String[] normFunctionNorm = { "norm" }; | |
286 docXmlTokenizer.setNormFunctions(normFunctionNorm); | |
287 String docTokensNorm = docXmlTokenizer.getStringResult(); | |
288 docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas); | |
289 String docTokensMorph = docXmlTokenizer.getStringResult(); | |
290 | |
291 Field tokenOrigField = new Field("tokenOrig", docTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
292 Field tokenRegField = new Field("tokenReg", docTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
293 Field tokenNormField = new Field("tokenNorm", docTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
294 Field tokenMorphField = new Field("tokenMorph", docTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
295 doc.add(tokenOrigField); | |
296 doc.add(tokenRegField); | |
297 doc.add(tokenNormField); | |
298 doc.add(tokenMorphField); | |
299 | |
300 // save original content of the doc file | |
301 File docFile = new File(docFileName); | |
302 String contentXml = FileUtils.readFileToString(docFile, "utf-8"); | |
303 Field contentXmlField = new Field("xmlContent", contentXml, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
304 doc.add(contentXmlField); | |
305 | |
306 // generate original chars content | |
307 XslResourceTransformer charsTransformer = new XslResourceTransformer("chars.xsl"); | |
308 String content = charsTransformer.transform(docFileName); | |
309 Field contentField = new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
310 doc.add(contentField); | |
311 | |
312 documentsIndexWriter.addDocument(doc); | |
313 | |
314 // add all elements with the specified names of the document to nodesIndex | |
315 String[] elementNamesArray = docOperation.getElementNames(); | |
316 String elementNames = ""; | |
317 for (int i = 0; i < elementNamesArray.length; i++) { | |
318 String elemName = elementNamesArray[i]; | |
319 elementNames = elementNames + elemName + " "; | |
320 } | |
321 elementNames = elementNames.substring(0, elementNames.length() - 1); | |
322 ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements(elementNames); | |
323 for (int i = 0; i < elements.size(); i++) { | |
324 XmlTokenizerContentHandler.Element element = elements.get(i); | |
325 Document nodeDoc = new Document(); | |
326 nodeDoc.add(docIdField); | |
327 String nodeLanguage = element.lang; | |
328 if (nodeLanguage == null) | |
329 nodeLanguage = language; | |
330 String nodePageNumber = String.valueOf(element.pageNumber); | |
331 String nodeLineNumber = String.valueOf(element.lineNumber); | |
332 String nodeElementName = String.valueOf(element.name); | |
333 String nodeElementDocPosition = String.valueOf(element.docPosition); | |
334 String nodeElementAbsolutePosition = String.valueOf(element.position); | |
335 String nodeElementPagePosition = String.valueOf(element.pagePosition); | |
336 String nodeElementPosition = String.valueOf(element.elemPosition); | |
337 String nodeXmlId = element.xmlId; | |
338 String nodeXpath = element.xpath; | |
339 String nodeXmlContent = element.toXmlString(); | |
340 String nodeTokensOrig = element.getTokensStr("orig"); | |
341 String nodeTokensReg = element.getTokensStr("reg"); | |
342 String nodeTokensNorm = element.getTokensStr("norm"); | |
343 String nodeTokensMorph = element.getTokensStr("morph"); | |
344 if (nodeLanguage != null) { | |
345 Field nodeLanguageField = new Field("language", nodeLanguage, Field.Store.YES, Field.Index.ANALYZED); | |
346 nodeDoc.add(nodeLanguageField); | |
347 } | |
348 Field nodePageNumberField = new Field("pageNumber", nodePageNumber, Field.Store.YES, Field.Index.ANALYZED); | |
349 nodeDoc.add(nodePageNumberField); | |
350 Field nodeLineNumberField = new Field("lineNumber", nodeLineNumber, Field.Store.YES, Field.Index.ANALYZED); | |
351 nodeDoc.add(nodeLineNumberField); | |
352 Field nodeElementNameField = new Field("elementName", nodeElementName, Field.Store.YES, Field.Index.ANALYZED); | |
353 nodeDoc.add(nodeElementNameField); | |
354 Field nodeElementDocPositionField = new Field("elementDocPosition", nodeElementDocPosition, Field.Store.YES, Field.Index.ANALYZED); | |
355 nodeDoc.add(nodeElementDocPositionField); | |
356 Field nodeElementDocPositionFieldSorted = new Field("elementDocPositionSorted", nodeElementDocPosition, Field.Store.YES, Field.Index.NOT_ANALYZED); | |
357 nodeDoc.add(nodeElementDocPositionFieldSorted); | |
358 Field nodeElementAbsolutePositionField = new Field("elementAbsolutePosition", nodeElementAbsolutePosition, Field.Store.YES, Field.Index.ANALYZED); | |
359 nodeDoc.add(nodeElementAbsolutePositionField); | |
360 Field nodeElementPagePositionField = new Field("elementPagePosition", nodeElementPagePosition, Field.Store.YES, Field.Index.ANALYZED); | |
361 nodeDoc.add(nodeElementPagePositionField); | |
362 Field nodeElementPositionField = new Field("elementPosition", nodeElementPosition, Field.Store.YES, Field.Index.ANALYZED); | |
363 nodeDoc.add(nodeElementPositionField); | |
364 if (nodeXmlId != null) { | |
365 Field nodeXmlIdField = new Field("xmlId", nodeXmlId, Field.Store.YES, Field.Index.ANALYZED); | |
366 nodeDoc.add(nodeXmlIdField); | |
367 } | |
368 if (nodeXpath != null) { | |
369 Field nodeXpathField = new Field("xpath", nodeXpath, Field.Store.YES, Field.Index.ANALYZED); | |
370 nodeDoc.add(nodeXpathField); | |
371 } | |
372 if (nodeXmlContent != null) { | |
373 Field nodeXmlContentField = new Field("xmlContent", nodeXmlContent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
374 nodeDoc.add(nodeXmlContentField); | |
375 } | |
376 if (nodeXmlContent != null) { | |
377 String nodeXmlContentTokenized = toTokenizedXmlString(nodeXmlContent, nodeLanguage); | |
378 byte[] blabla = nodeXmlContentTokenized.getBytes("utf-8"); // TODO why is tokenizedXmlStr not already utf-8 on page 444 Benedetti ? | |
379 nodeXmlContentTokenized = new String(blabla, "utf-8"); | |
380 nodeXmlContentTokenized = enrichWordsOrigRegNorm(nodeXmlContentTokenized); | |
381 Field nodeXmlContentTokenizedField = new Field("xmlContentTokenized", nodeXmlContentTokenized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
382 nodeDoc.add(nodeXmlContentTokenizedField); | |
383 } | |
384 if (nodeTokensOrig != null) { | |
385 Field nodeTokenOrigField = new Field("tokenOrig", nodeTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
386 nodeDoc.add(nodeTokenOrigField); | |
387 } | |
388 if (nodeTokensReg != null) { | |
389 Field nodeTokenRegField = new Field("tokenReg", nodeTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
390 nodeDoc.add(nodeTokenRegField); | |
391 } | |
392 if (nodeTokensNorm != null) { | |
393 Field nodeTokenNormField = new Field("tokenNorm", nodeTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
394 nodeDoc.add(nodeTokenNormField); | |
395 } | |
396 if (nodeTokensMorph != null) { | |
397 Field nodeTokenMorphField = new Field("tokenMorph", nodeTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); | |
398 nodeDoc.add(nodeTokenMorphField); | |
399 } | |
400 | |
401 nodesIndexWriter.addDocument(nodeDoc); | |
402 } | |
403 } catch (Exception e) { | |
404 throw new ApplicationException(e); | |
405 } finally { | |
406 try { | |
407 if (fr != null) | |
408 fr.close(); | |
409 } catch (Exception e) { | |
410 // nothing | |
411 } | |
412 } | |
413 } | |
414 | |
415 public void deleteDocument(CmsDocOperation docOperation) throws ApplicationException { | |
416 try { | |
417 deleteDocumentLocal(docOperation); | |
418 documentsIndexWriter.commit(); | |
419 nodesIndexWriter.commit(); | |
420 } catch (Exception e) { | |
421 try { | |
422 documentsIndexWriter.rollback(); | |
423 nodesIndexWriter.rollback(); | |
424 } catch (Exception ex) { | |
425 // nothing | |
426 } | |
427 throw new ApplicationException(e); | |
428 } | |
429 } | |
430 | |
431 private void deleteDocumentLocal(CmsDocOperation docOperation) throws ApplicationException { | |
432 String docId = docOperation.getDocIdentifier(); | |
433 try { | |
434 Term termIdentifier = new Term("docId", docId); | |
435 documentsIndexWriter.deleteDocuments(termIdentifier); | |
436 nodesIndexWriter.deleteDocuments(termIdentifier); | |
437 } catch (Exception e) { | |
438 throw new ApplicationException(e); | |
439 } | |
440 } | |
441 | |
  /**
   * Searches the documents index and returns the hits in the range [from, to].
   * The query is expanded morphologically via buildMorphQuery; when
   * withHitFragments is set, up to 5 highlighted snippets of the "content"
   * field are attached to each hit.
   *
   * @param queryStr       user query; "*" matches all documents
   * @param sortFieldNames optional sort fields (null = relevance order)
   * @param language       language used for the morphological expansion
   * @param from           first hit index (0-based, inclusive)
   * @param to             last hit index (inclusive; clamped to result size)
   * @param withHitFragments attach highlighted snippets when true
   * @param translate      passed through to buildMorphQuery
   * @throws ApplicationException wrapping any parse/search/highlight error
   */
  public Hits queryDocuments(String queryStr, String[] sortFieldNames, String language, int from, int to, boolean withHitFragments, boolean translate) throws ApplicationException {
    Hits hits = null;
    IndexSearcher searcher = null;
    try {
      makeDocumentsSearcherManagerUpToDate();
      searcher = documentsSearcherManager.acquire();
      String defaultQueryFieldName = "tokenOrig";
      QueryParser queryParser = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, documentsPerFieldAnalyzer);
      Query query = null;
      if (queryStr.equals("*")) {
        query = new MatchAllDocsQuery();
      } else {
        query = queryParser.parse(queryStr);
      }
      // two expansions: one to search with, one (non-translated forms) to highlight with
      Query morphQuery = buildMorphQuery(query, language, false, translate);
      Query highlighterQuery = buildMorphQuery(query, language, true, translate);
      if (query instanceof PhraseQuery || query instanceof PrefixQuery || query instanceof FuzzyQuery || query instanceof TermRangeQuery) {
        highlighterQuery = query; // TODO wenn sie rekursiv enthalten sind (nested occurrences not handled)
      }
      // marks should not contain xml special chars; replaced by <span> tags below
      String beginHitMark = "!!!BEGIN_HIT!!!";
      String endHitMark = "!!!END_HIT!!!";
      SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(beginHitMark, endHitMark);
      QueryScorer queryScorer = new QueryScorer(highlighterQuery);
      Highlighter highlighter = new Highlighter(htmlFormatter, queryScorer);
      TopDocs resultDocs = null;
      if (sortFieldNames != null) {
        Sort sort = buildSort(sortFieldNames, "doc"); // build sort criteria
        resultDocs = searcher.search(morphQuery, 10000, sort);
      } else {
        resultDocs = searcher.search(morphQuery, 10000);
      }
      resultDocs.setMaxScore(1);
      // clamp the requested range to the actual number of hits
      int toTmp = to;
      if (resultDocs.scoreDocs.length <= to)
        toTmp = resultDocs.scoreDocs.length - 1;
      // NOTE(review): this null check is dead — resultDocs was already dereferenced above
      if (resultDocs != null) {
        ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> docs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>();
        for (int i=from; i<=toTmp; i++) {
          int docID = resultDocs.scoreDocs[i].doc;
          FieldSelector docFieldSelector = getDocFieldSelector();
          Document luceneDoc = searcher.doc(docID, docFieldSelector);
          de.mpg.mpiwg.berlin.mpdl.cms.document.Document doc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc);
          if (withHitFragments) {
            ArrayList<String> hitFragments = new ArrayList<String>();
            Fieldable docContentField = luceneDoc.getFieldable("content");
            if (docContentField != null) {
              String docContent = docContentField.stringValue();
              TokenStream tokenStream = TokenSources.getAnyTokenStream(this.documentsIndexReader, docID, docContentField.name(), luceneDoc, documentsPerFieldAnalyzer);
              // highlighter.setMaxDocCharsToAnalyze(100000); // the first 100000 chars are fetched maximal, but performance is not really better
              TextFragment[] textfragments = highlighter.getBestTextFragments(tokenStream, docContent, false, 5);
              if (textfragments.length > 0) {
                for (int j=0; j<textfragments.length; j++) {
                  // de-resolve entities first, then swap the plain-text marks for span tags
                  String textFragment = textfragments[j].toString().trim();
                  textFragment = StringUtils.deresolveXmlEntities(textFragment);
                  textFragment = textFragment.replaceAll(beginHitMark, "<span class=\"highlight hit\">");
                  textFragment = textFragment.replaceAll(endHitMark, "</span>");
                  hitFragments.add(checkHitFragment(textFragment));
                }
              }
            }
            if (! hitFragments.isEmpty())
              doc.setHitFragments(hitFragments);
          }
          docs.add(doc);
        }
        if (docs != null) {
          hits = new Hits(docs, from, to);
          hits.setSize(resultDocs.scoreDocs.length); // total hit count, not just the returned page
          hits.setQuery(morphQuery);
        }
      }
    } catch (Exception e) {
      throw new ApplicationException(e);
    } finally {
      try {
        if (searcher != null)
          documentsSearcherManager.release(searcher);
      } catch (IOException e) {
        // nothing
      }
    }
    // Do not use searcher after this!
    searcher = null;
    return hits;
  }
527 | |
528 public Hits queryDocument(String docId, String queryStr, int from, int to) throws ApplicationException { | |
529 Hits hits = null; | |
530 IndexSearcher searcher = null; | |
531 MetadataRecord docMetadataRecord = getDocMetadata(docId); | |
532 if (docMetadataRecord == null) | |
533 return null; // no document with that docId is in index | |
534 try { | |
535 makeNodesSearcherManagerUpToDate(); | |
536 searcher = nodesSearcherManager.acquire(); | |
537 String fieldNameDocId = "docId"; | |
538 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, nodesPerFieldAnalyzer).parse(docId); | |
539 String defaultQueryFieldName = "tokenOrig"; | |
540 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr); | |
541 String language = docMetadataRecord.getLanguage(); | |
542 if (language == null || language.equals("")) { | |
543 String collectionNames = docMetadataRecord.getCollectionNames(); | |
544 ConfManagerResultWrapper collectionInfo = CollectionReader.getInstance().getResultWrapper(collectionNames); | |
545 if (collectionInfo != null) { | |
546 String mainLang = collectionInfo.getMainLanguage(); | |
547 if (mainLang != null) | |
548 language = mainLang; | |
549 } | |
550 } | |
551 Query morphQuery = buildMorphQuery(query, language); | |
552 BooleanQuery queryDoc = new BooleanQuery(); | |
553 queryDoc.add(queryDocId, BooleanClause.Occur.MUST); | |
554 queryDoc.add(morphQuery, BooleanClause.Occur.MUST); | |
555 String[] sortFieldNames = {"elementDocPosition"}; | |
556 Sort sortByPosition = buildSort(sortFieldNames, "node"); | |
557 TopDocs topDocs = searcher.search(queryDoc, 100000, sortByPosition); | |
558 topDocs.setMaxScore(1); | |
559 int toTmp = to; | |
560 if (topDocs.scoreDocs.length <= to) | |
561 toTmp = topDocs.scoreDocs.length - 1; | |
562 if (topDocs != null) { | |
563 ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> docs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>(); | |
564 for (int i=from; i<=toTmp; i++) { | |
565 int docID = topDocs.scoreDocs[i].doc; | |
566 FieldSelector nodeFieldSelector = getNodeFieldSelector(); | |
567 Document luceneDoc = searcher.doc(docID, nodeFieldSelector); | |
568 de.mpg.mpiwg.berlin.mpdl.cms.document.Document doc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc); | |
569 docs.add(doc); | |
570 } | |
571 if (docs != null) { | |
572 hits = new Hits(docs, from, to); | |
573 hits.setSize(topDocs.scoreDocs.length); | |
574 } | |
575 } | |
576 searcher.close(); | |
577 } catch (Exception e) { | |
578 throw new ApplicationException(e); | |
579 } finally { | |
580 try { | |
581 if (searcher != null) | |
582 documentsSearcherManager.release(searcher); | |
583 } catch (IOException e) { | |
584 // nothing | |
585 } | |
586 } | |
587 // Do not use searcher after this! | |
588 searcher = null; | |
589 return hits; | |
590 } | |
591 | |
592 public Hits moreLikeThis(String docId, int from, int to) throws ApplicationException { | |
593 Hits hits = null; | |
594 ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> wspDocs = null; | |
595 IndexSearcher searcher1 = null; | |
596 IndexSearcher searcher2 = null; | |
597 try { | |
598 makeDocumentsSearcherManagerUpToDate(); | |
599 searcher1 = documentsSearcherManager.acquire(); | |
600 String fieldNameDocId = "docId"; | |
601 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId); | |
602 TopDocs topDocs = searcher1.search(queryDocId, 100000); | |
603 topDocs.setMaxScore(1); | |
604 int docID = -1; | |
605 if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) { | |
606 docID = topDocs.scoreDocs[0].doc; | |
607 } | |
608 makeDocumentsSearcherManagerUpToDate(); | |
609 searcher2 = documentsSearcherManager.acquire(); | |
610 MoreLikeThis mlt = new MoreLikeThis(documentsIndexReader); // TODO documentsIndexReader is ok ? | |
611 mlt.setFieldNames(new String[]{"content"}); // similarity function works against these fields | |
612 mlt.setMinWordLen(2); | |
613 mlt.setBoost(true); | |
614 Query queryMoreLikeThis = mlt.like(docID); | |
615 TopDocs moreLikeThisDocs = searcher2.search(queryMoreLikeThis, 10); | |
616 moreLikeThisDocs.setMaxScore(10); | |
617 if (moreLikeThisDocs != null) { | |
618 if (wspDocs == null) | |
619 wspDocs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>(); | |
620 for (int i=0; i<moreLikeThisDocs.scoreDocs.length; i++) { | |
621 int docIdent = moreLikeThisDocs.scoreDocs[i].doc; | |
622 Document luceneDoc = searcher2.doc(docIdent); | |
623 de.mpg.mpiwg.berlin.mpdl.cms.document.Document wspDoc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc); | |
624 wspDocs.add(wspDoc); | |
625 } | |
626 } | |
627 if (wspDocs != null) { | |
628 hits = new Hits(wspDocs, from, to); | |
629 hits.setSize(moreLikeThisDocs.scoreDocs.length); | |
630 } | |
631 } catch (Exception e) { | |
632 throw new ApplicationException(e); | |
633 } finally { | |
634 try { | |
635 if (searcher1 != null) | |
636 documentsSearcherManager.release(searcher1); | |
637 if (searcher2 != null) | |
638 documentsSearcherManager.release(searcher2); | |
639 } catch (IOException e) { | |
640 // nothing | |
641 } | |
642 } | |
643 // Do not use searcher after this! | |
644 searcher1 = null; | |
645 searcher2 = null; | |
646 return hits; | |
647 } | |
648 | |
649 public MetadataRecord getDocMetadata(String docId) throws ApplicationException { | |
650 MetadataRecord mdRecord = null; | |
651 Document doc = getDocument(docId); | |
652 if (doc != null) { | |
653 String identifier = null; | |
654 Fieldable identifierField = doc.getFieldable("identifier"); | |
655 if (identifierField != null) | |
656 identifier = identifierField.stringValue(); | |
657 String uri = null; | |
658 Fieldable uriField = doc.getFieldable("uri"); | |
659 if (uriField != null) | |
660 uri = uriField.stringValue(); | |
661 String collectionNames = null; | |
662 Fieldable collectionNamesField = doc.getFieldable("collectionNames"); | |
663 if (collectionNamesField != null) | |
664 collectionNames = collectionNamesField.stringValue(); | |
665 String author = null; | |
666 Fieldable authorField = doc.getFieldable("author"); | |
667 if (authorField != null) | |
668 author = authorField.stringValue(); | |
669 String title = null; | |
670 Fieldable titleField = doc.getFieldable("title"); | |
671 if (titleField != null) | |
672 title = titleField.stringValue(); | |
673 String language = null; | |
674 Fieldable languageField = doc.getFieldable("language"); | |
675 if (languageField != null) | |
676 language = languageField.stringValue(); | |
677 else { | |
678 ConfManagerResultWrapper collectionInfo = CollectionReader.getInstance().getResultWrapper(collectionNames); | |
679 if (collectionInfo != null) { | |
680 String mainLang = collectionInfo.getMainLanguage(); | |
681 if (mainLang != null) | |
682 language = mainLang; | |
683 } | |
684 } | |
685 String publisher = null; | |
686 Fieldable publisherField = doc.getFieldable("publisher"); | |
687 if (publisherField != null) | |
688 publisher = publisherField.stringValue(); | |
689 Date yearDate = null; | |
690 Fieldable dateField = doc.getFieldable("date"); | |
691 if (dateField != null) { | |
692 String dateStr = dateField.stringValue(); | |
693 if (dateStr != null && !dateStr.equals("")) { | |
694 dateStr = StringUtils.deresolveXmlEntities(dateStr); | |
695 String yearStr = new Util().toYearStr(dateStr); // test if possible | |
696 // etc | |
697 if (yearStr != null) { | |
698 yearDate = new Util().toDate(yearStr + "-01-01T00:00:00.000Z"); | |
699 } | |
700 } | |
701 } | |
702 String rights = null; | |
703 Fieldable rightsField = doc.getFieldable("rights"); | |
704 if (rightsField != null) | |
705 rights = rightsField.stringValue(); | |
706 String license = null; | |
707 Fieldable licenseField = doc.getFieldable("license"); | |
708 if (licenseField != null) | |
709 license = licenseField.stringValue(); | |
710 String accessRights = null; | |
711 Fieldable accessRightsField = doc.getFieldable("accessRights"); | |
712 if (accessRightsField != null) | |
713 accessRights = accessRightsField.stringValue(); | |
714 String echoId = null; | |
715 Fieldable echoIdField = doc.getFieldable("echoId"); | |
716 if (echoIdField != null) | |
717 echoId = echoIdField.stringValue(); | |
718 String echoPageImageDir = null; | |
719 Fieldable echoPageImageDirField = doc.getFieldable("echoPageImageDir"); | |
720 if (echoPageImageDirField != null) | |
721 echoPageImageDir = echoPageImageDirField.stringValue(); | |
722 String echoFiguresDir = null; | |
723 Fieldable echoFiguresDirField = doc.getFieldable("echoFiguresDir"); | |
724 if (echoFiguresDirField != null) | |
725 echoFiguresDir = echoFiguresDirField.stringValue(); | |
726 String mpiwgDocId = null; | |
727 Fieldable mpiwgDocIdField = doc.getFieldable("mpiwgDocId"); | |
728 if (mpiwgDocIdField != null) | |
729 mpiwgDocId = mpiwgDocIdField.stringValue(); | |
730 int pageCount = -1; | |
731 Fieldable pageCountField = doc.getFieldable("pageCount"); | |
732 if (pageCountField != null) { | |
733 String pageCountStr = pageCountField.stringValue(); | |
734 pageCount = Integer.valueOf(pageCountStr); | |
735 } | |
736 String schemaName = null; | |
737 Fieldable schemaNameField = doc.getFieldable("schemaName"); | |
738 if (schemaNameField != null) | |
739 schemaName = schemaNameField.stringValue(); | |
740 Date lastModified = null; | |
741 Fieldable lastModifiedField = doc.getFieldable("lastModified"); | |
742 if (lastModifiedField != null) { | |
743 String lastModifiedXSDateStr = lastModifiedField.stringValue(); | |
744 lastModified = new Util().toDate(lastModifiedXSDateStr); | |
745 } | |
746 mdRecord = new MetadataRecord(); | |
747 mdRecord.setDocId(docId); | |
748 mdRecord.setUri(uri); | |
749 mdRecord.setIdentifier(identifier); | |
750 mdRecord.setCollectionNames(collectionNames); | |
751 mdRecord.setCreator(author); | |
752 mdRecord.setTitle(title); | |
753 mdRecord.setDate(yearDate); | |
754 mdRecord.setLanguage(language); | |
755 mdRecord.setPublisher(publisher); | |
756 mdRecord.setLicense(license); | |
757 mdRecord.setRights(rights); | |
758 mdRecord.setAccessRights(accessRights); | |
759 mdRecord.setEchoId(echoId); | |
760 mdRecord.setEchoPageImageDir(echoPageImageDir); | |
761 mdRecord.setEchoFiguresDir(echoFiguresDir); | |
762 mdRecord.setMpiwgDocId(mpiwgDocId); | |
763 mdRecord.setPageCount(pageCount); | |
764 mdRecord.setSchemaName(schemaName); | |
765 mdRecord.setLastModified(lastModified); | |
766 } | |
767 return mdRecord; | |
768 } | |
769 | |
770 public ArrayList<Token> getToken(String fieldName, String value, int count) throws ApplicationException { | |
771 ArrayList<Token> retToken = null; | |
772 int counter = 0; | |
773 TermEnum terms = null; | |
774 try { | |
775 if (value == null) | |
776 value = ""; | |
777 Term term = new Term(fieldName, value); | |
778 makeIndexReaderUpToDate(); | |
779 terms = documentsIndexReader.terms(term); | |
780 while (terms != null && fieldName != null && fieldName.equals(terms.term().field()) && counter < count) { | |
781 if (retToken == null) | |
782 retToken = new ArrayList<Token>(); | |
783 Term termContent = terms.term(); | |
784 Token token = new Token(termContent); | |
785 retToken.add(token); | |
786 counter++; | |
787 if (!terms.next()) | |
788 break; | |
789 } | |
790 } catch (Exception e) { | |
791 throw new ApplicationException(e); | |
792 } finally { | |
793 if (terms != null) { | |
794 try { | |
795 terms.close(); | |
796 } catch (IOException e) { | |
797 // nothing | |
798 } | |
799 } | |
800 } | |
801 return retToken; | |
802 } | |
803 | |
804 public ArrayList<Token> getToken(String docId, String fieldName, String value, int count) throws ApplicationException { | |
805 ArrayList<Token> retToken = null; | |
806 if (value == null) | |
807 value = ""; | |
808 int counter = 0; | |
809 IndexSearcher searcher = null; | |
810 try { | |
811 makeDocumentsSearcherManagerUpToDate(); | |
812 makeIndexReaderUpToDate(); | |
813 searcher = documentsSearcherManager.acquire(); | |
814 Query queryDocId = new TermQuery(new Term("docId", docId)); | |
815 TopDocs topDocs = searcher.search(queryDocId, 1); | |
816 if (topDocs != null) { | |
817 int docIdInt = topDocs.scoreDocs[0].doc; | |
818 TermFreqVector termFreqVector = documentsIndexReader.getTermFreqVector(docIdInt, fieldName); | |
819 if (termFreqVector != null) { | |
820 String[] terms = termFreqVector.getTerms(); | |
821 int[] freqs = termFreqVector.getTermFrequencies(); | |
822 boolean success = false; | |
823 if (terms != null) { | |
824 retToken = new ArrayList<Token>(); | |
825 for (int i = 0; i < terms.length; i++) { | |
826 String termStr = terms[i]; | |
827 if (termStr.startsWith(value)) | |
828 success = true; | |
829 if (success) { | |
830 counter++; | |
831 int freq = freqs[i]; | |
832 Term t = new Term(fieldName, termStr); | |
833 Token tok = new Token(t); | |
834 tok.setFreq(freq); | |
835 retToken.add(tok); | |
836 } | |
837 if (counter >= count) | |
838 break; | |
839 } | |
840 } | |
841 } | |
842 } | |
843 } catch (Exception e) { | |
844 throw new ApplicationException(e); | |
845 } finally { | |
846 try { | |
847 if (searcher != null) | |
848 documentsSearcherManager.release(searcher); | |
849 } catch (IOException e) { | |
850 // nothing | |
851 } | |
852 } | |
853 // Do not use searcher after this! | |
854 searcher = null; | |
855 return retToken; | |
856 } | |
857 | |
858 public void end() throws ApplicationException { | |
859 try { | |
860 if (documentsIndexWriter != null) | |
861 documentsIndexWriter.close(); | |
862 if (nodesIndexWriter != null) | |
863 nodesIndexWriter.close(); | |
864 if (documentsSearcherManager != null) | |
865 documentsSearcherManager.close(); | |
866 if (nodesSearcherManager != null) | |
867 nodesSearcherManager.close(); | |
868 if (documentsIndexReader != null) | |
869 documentsIndexReader.close(); | |
870 } catch (IOException e) { | |
871 throw new ApplicationException(e); | |
872 } | |
873 } | |
874 | |
875 private Query buildMorphQuery(Query query, String language) throws ApplicationException { | |
876 return buildMorphQuery(query, language, false, false); | |
877 } | |
878 | |
879 private Query buildMorphQuery(Query query, String language, boolean withAllForms, boolean translate) throws ApplicationException { | |
880 Query morphQuery = null; | |
881 if (query instanceof TermQuery) { | |
882 TermQuery termQuery = (TermQuery) query; | |
883 morphQuery = buildMorphQuery(termQuery, language, withAllForms, translate); | |
884 } else if (query instanceof BooleanQuery) { | |
885 BooleanQuery booleanQuery = (BooleanQuery) query; | |
886 morphQuery = buildMorphQuery(booleanQuery, language, withAllForms, translate); | |
887 } else { | |
888 morphQuery = query; // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ... | |
889 } | |
890 return morphQuery; | |
891 } | |
892 | |
  /**
   * Rewrites a single term query morphologically (and optionally via translation).
   *
   * For the "tokenMorph" field the term is looked up in the lexicon: the query
   * is expanded to the lemma names (or, with withAllForms, to every word form)
   * of the found lemmas, optionally translated into the target languages.
   * If no lemma is found, the term is queried in "tokenOrig" instead.
   * For every other field the query is passed through (optionally translated).
   * All collected term queries are OR-combined by buildBooleanShouldQuery.
   *
   * @param inputTermQuery the term query to rewrite
   * @param fromLang source language; if null it is auto-detected via MicrosoftTranslator
   * @param withAllForms expand to all word forms (needed for fragment highlighting)
   * @param translate also translate terms into the (currently hard-coded) target languages
   * @return a boolean SHOULD query over all expanded/translated terms
   */
  private Query buildMorphQuery(TermQuery inputTermQuery, String fromLang, boolean withAllForms, boolean translate) throws ApplicationException {
    String[] toLanguages = {"deu", "eng", "fra"}; // TODO
    String fromLanguage = null;
    String inputTerm = inputTermQuery.getTerm().text();
    // determine the source language: explicit parameter wins, otherwise detect it
    if (fromLang == null) {
      String detectedLang = MicrosoftTranslator.detectLanguageCode(inputTerm);
      if (detectedLang != null)
        fromLanguage = detectedLang;
    } else {
      fromLanguage = fromLang;
    }
    LexHandler lexHandler = LexHandler.getInstance();
    String fieldName = inputTermQuery.getTerm().field();
    ArrayList<TermQuery> queryTerms = new ArrayList<TermQuery>();
    if (fieldName != null && fieldName.equals("tokenMorph")) {
      ArrayList<Lemma> lemmas = lexHandler.getLemmas(inputTerm, "form", fromLanguage, Normalizer.DICTIONARY, true);
      if (lemmas == null) { // if no lemmas are found then do a query in tokenOrig TODO should this really be done ?
        if (translate) {
          String[] terms = {inputTerm};
          ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
          for (int i=0; i<translatedTerms.size(); i++) {
            String translatedTerm = translatedTerms.get(i);
            Term translatedTermTokenOrig = new Term("tokenOrig", translatedTerm);
            TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
            queryTerms.add(translatedTermQueryInTokenOrig);
          }
        } else {
          Term termTokenOrig = new Term("tokenOrig", inputTerm);
          TermQuery termQueryInTokenOrig = new TermQuery(termTokenOrig);
          queryTerms.add(termQueryInTokenOrig);
        }
      } else {
        if (translate) {
          // collect the morphological terms first, then translate them in one batch call
          ArrayList<String> morphTerms = new ArrayList<String>();
          for (int i=0; i<lemmas.size(); i++) {
            Lemma lemma = lemmas.get(i);
            if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
              ArrayList<Form> forms = lemma.getFormsList();
              for (int j=0; j<forms.size(); j++) {
                Form form = forms.get(j);
                String formName = form.getFormName();
                morphTerms.add(formName);
              }
            } else {
              String lemmaName = lemma.getLemmaName();
              morphTerms.add(lemmaName);
            }
          }
          String[] morphTermsArray = morphTerms.toArray(new String[morphTerms.size()]);
          ArrayList<String> translatedMorphTerms = MicrosoftTranslator.translate(morphTermsArray, fromLanguage, toLanguages);
          for (int i=0; i<translatedMorphTerms.size(); i++) {
            String translatedMorphTermStr = translatedMorphTerms.get(i);
            Term translatedMorphTerm = new Term(fieldName, translatedMorphTermStr);
            TermQuery translatedMorphTermQuery = new TermQuery(translatedMorphTerm);
            queryTerms.add(translatedMorphTermQuery);
          }
        } else {
          for (int i = 0; i < lemmas.size(); i++) {
            Lemma lemma = lemmas.get(i);
            if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
              ArrayList<Form> forms = lemma.getFormsList();
              for (int j=0; j<forms.size(); j++) {
                Form form = forms.get(j);
                Term formTerm = new Term(fieldName, form.getFormName());
                TermQuery morphTermQuery = new TermQuery(formTerm);
                queryTerms.add(morphTermQuery);
              }
            } else {
              Term lemmaTerm = new Term(fieldName, lemma.getLemmaName());
              TermQuery morphTermQuery = new TermQuery(lemmaTerm);
              queryTerms.add(morphTermQuery);
            }
          }
        }
      }
    } else {
      // if it is not the morph field then do a normal query
      if (translate) {
        String inputTermQueryField = inputTermQuery.getTerm().field();
        String inputTermQueryStr = inputTermQuery.getTerm().text();
        String[] terms = {inputTermQueryStr};
        ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
        for (int i=0; i<translatedTerms.size(); i++) {
          String translatedTerm = translatedTerms.get(i);
          Term translatedTermTokenOrig = new Term(inputTermQueryField, translatedTerm);
          TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
          queryTerms.add(translatedTermQueryInTokenOrig);
        }
      } else {
        queryTerms.add(inputTermQuery);
      }
      //TODO ?? perhaps other fields should also be queried morphological e.g. title etc.
    }
    Query retQuery = buildBooleanShouldQuery(queryTerms);
    return retQuery;
  }
989 | |
990 private Query buildBooleanShouldQuery(ArrayList<TermQuery> queryTerms) throws ApplicationException { | |
991 BooleanQuery retBooleanQuery = new BooleanQuery(); | |
992 for (int i = 0; i < queryTerms.size(); i++) { | |
993 TermQuery termQuery = queryTerms.get(i); | |
994 retBooleanQuery.add(termQuery, BooleanClause.Occur.SHOULD); | |
995 } | |
996 return retBooleanQuery; | |
997 } | |
998 | |
999 private Query buildMorphQuery(BooleanQuery query, String language, boolean withAllForms, boolean translate) throws ApplicationException { | |
1000 BooleanQuery morphBooleanQuery = new BooleanQuery(); | |
1001 BooleanClause[] booleanClauses = query.getClauses(); | |
1002 for (int i = 0; i < booleanClauses.length; i++) { | |
1003 BooleanClause boolClause = booleanClauses[i]; | |
1004 Query q = boolClause.getQuery(); | |
1005 Query morphQuery = buildMorphQuery(q, language, withAllForms, translate); | |
1006 BooleanClause.Occur occur = boolClause.getOccur(); | |
1007 morphBooleanQuery.add(morphQuery, occur); | |
1008 } | |
1009 return morphBooleanQuery; | |
1010 } | |
1011 | |
1012 public ArrayList<String> fetchTerms(String queryStr) throws ApplicationException { | |
1013 ArrayList<String> terms = null; | |
1014 String defaultQueryFieldName = "tokenOrig"; | |
1015 try { | |
1016 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr); | |
1017 terms = fetchTerms(query); | |
1018 } catch (Exception e) { | |
1019 throw new ApplicationException(e); | |
1020 } | |
1021 return terms; | |
1022 } | |
1023 | |
1024 /** | |
1025 * recursively fetch all terms of the query | |
1026 * | |
1027 * @param query | |
1028 * @return | |
1029 */ | |
1030 private ArrayList<String> fetchTerms(Query query) throws ApplicationException { | |
1031 ArrayList<String> terms = new ArrayList<String>(); | |
1032 if (query instanceof TermQuery) { | |
1033 TermQuery termQuery = (TermQuery) query; | |
1034 String termQueryStr = termQuery.getTerm().text(); | |
1035 terms.add(termQueryStr); | |
1036 } else if (query instanceof BooleanQuery) { | |
1037 BooleanQuery booleanQuery = (BooleanQuery) query; | |
1038 terms = fetchTerms(booleanQuery); | |
1039 } else { | |
1040 String queryStr = query.toString(); | |
1041 terms.add(queryStr); // all other cases: PrefixQuery, PhraseQuery, | |
1042 // FuzzyQuery, TermRangeQuery, ... | |
1043 } | |
1044 return terms; | |
1045 } | |
1046 | |
1047 private ArrayList<String> fetchTerms(BooleanQuery query) throws ApplicationException { | |
1048 ArrayList<String> terms = new ArrayList<String>(); | |
1049 BooleanClause[] booleanClauses = query.getClauses(); | |
1050 for (int i = 0; i < booleanClauses.length; i++) { | |
1051 BooleanClause boolClause = booleanClauses[i]; | |
1052 Query q = boolClause.getQuery(); | |
1053 ArrayList<String> qTerms = fetchTerms(q); | |
1054 BooleanClause.Occur occur = boolClause.getOccur(); | |
1055 if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST) | |
1056 terms.addAll(qTerms); | |
1057 } | |
1058 return terms; | |
1059 } | |
1060 | |
1061 public ArrayList<String> fetchTerms(String queryStr, String language) throws ApplicationException { | |
1062 ArrayList<String> terms = null; | |
1063 String defaultQueryFieldName = "tokenOrig"; | |
1064 try { | |
1065 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr); | |
1066 terms = fetchTerms(query, language); | |
1067 } catch (Exception e) { | |
1068 throw new ApplicationException(e); | |
1069 } | |
1070 return terms; | |
1071 } | |
1072 | |
1073 /** | |
1074 * recursively fetch all terms of the query | |
1075 * | |
1076 * @param query | |
1077 * @return | |
1078 */ | |
1079 private ArrayList<String> fetchTerms(Query query, String language) throws ApplicationException { | |
1080 ArrayList<String> terms = new ArrayList<String>(); | |
1081 if (query instanceof TermQuery) { | |
1082 TermQuery termQuery = (TermQuery) query; | |
1083 terms = fetchTerms(termQuery, language); | |
1084 } else if (query instanceof BooleanQuery) { | |
1085 BooleanQuery booleanQuery = (BooleanQuery) query; | |
1086 terms = fetchTerms(booleanQuery, language); | |
1087 } else { | |
1088 String queryStr = query.toString(); | |
1089 terms.add(queryStr); | |
1090 // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ... | |
1091 } | |
1092 return terms; | |
1093 } | |
1094 | |
1095 private ArrayList<String> fetchTerms(TermQuery termQuery, String language) throws ApplicationException { | |
1096 if (language == null) | |
1097 language = "eng"; | |
1098 ArrayList<String> terms = new ArrayList<String>(); | |
1099 Term termQueryTerm = termQuery.getTerm(); | |
1100 String term = termQuery.getTerm().text(); | |
1101 String fieldName = termQueryTerm.field(); | |
1102 if (fieldName != null && fieldName.equals("tokenMorph")) { | |
1103 LexHandler lexHandler = LexHandler.getInstance(); | |
1104 ArrayList<Lemma> lemmas = lexHandler.getLemmas(term, "form", language, Normalizer.DICTIONARY, true); | |
1105 // TODO : language über den translator service holen | |
1106 if (lemmas == null) { | |
1107 terms.add(term); | |
1108 } else { | |
1109 for (int i = 0; i < lemmas.size(); i++) { | |
1110 Lemma lemma = lemmas.get(i); | |
1111 ArrayList<Form> forms = lemma.getFormsList(); | |
1112 for (int j = 0; j < forms.size(); j++) { | |
1113 Form form = forms.get(j); | |
1114 String formName = form.getFormName(); | |
1115 terms.add(formName); | |
1116 } | |
1117 } | |
1118 } | |
1119 } else { | |
1120 terms.add(term); | |
1121 } | |
1122 return terms; | |
1123 } | |
1124 | |
1125 private ArrayList<String> fetchTerms(BooleanQuery query, String language) throws ApplicationException { | |
1126 ArrayList<String> terms = new ArrayList<String>(); | |
1127 BooleanClause[] booleanClauses = query.getClauses(); | |
1128 for (int i = 0; i < booleanClauses.length; i++) { | |
1129 BooleanClause boolClause = booleanClauses[i]; | |
1130 Query q = boolClause.getQuery(); | |
1131 ArrayList<String> qTerms = fetchTerms(q, language); | |
1132 BooleanClause.Occur occur = boolClause.getOccur(); | |
1133 if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST) | |
1134 terms.addAll(qTerms); | |
1135 } | |
1136 return terms; | |
1137 } | |
1138 | |
1139 private Document getDocument(String docId) throws ApplicationException { | |
1140 Document doc = null; | |
1141 IndexSearcher searcher = null; | |
1142 try { | |
1143 makeDocumentsSearcherManagerUpToDate(); | |
1144 searcher = documentsSearcherManager.acquire(); | |
1145 String fieldNameDocId = "docId"; | |
1146 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId); | |
1147 TopDocs topDocs = searcher.search(queryDocId, 100000); | |
1148 topDocs.setMaxScore(1); | |
1149 if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) { | |
1150 int docID = topDocs.scoreDocs[0].doc; | |
1151 FieldSelector docFieldSelector = getDocFieldSelector(); | |
1152 doc = searcher.doc(docID, docFieldSelector); | |
1153 } | |
1154 searcher.close(); | |
1155 } catch (Exception e) { | |
1156 throw new ApplicationException(e); | |
1157 } finally { | |
1158 try { | |
1159 if (searcher != null) | |
1160 documentsSearcherManager.release(searcher); | |
1161 } catch (IOException e) { | |
1162 // nothing | |
1163 } | |
1164 } | |
1165 // Do not use searcher after this! | |
1166 searcher = null; | |
1167 return doc; | |
1168 } | |
1169 | |
1170 private IndexWriter getDocumentsWriter() throws ApplicationException { | |
1171 IndexWriter writer = null; | |
1172 String luceneDocsDirectoryStr = Constants.getInstance().getLuceneDocumentsDir(); | |
1173 File luceneDocsDirectory = new File(luceneDocsDirectoryStr); | |
1174 try { | |
1175 Map<String, Analyzer> documentsFieldAnalyzers = new HashMap<String, Analyzer>(); | |
1176 documentsFieldAnalyzers.put("docId", new KeywordAnalyzer()); | |
1177 documentsFieldAnalyzers.put("identifier", new KeywordAnalyzer()); | |
1178 documentsFieldAnalyzers.put("uri", new KeywordAnalyzer()); | |
1179 documentsFieldAnalyzers.put("collectionNames", new StandardAnalyzer(Version.LUCENE_35)); | |
1180 documentsFieldAnalyzers.put("author", new StandardAnalyzer(Version.LUCENE_35)); | |
1181 documentsFieldAnalyzers.put("title", new StandardAnalyzer(Version.LUCENE_35)); | |
1182 documentsFieldAnalyzers.put("language", new StandardAnalyzer(Version.LUCENE_35)); | |
1183 documentsFieldAnalyzers.put("publisher", new StandardAnalyzer(Version.LUCENE_35)); | |
1184 documentsFieldAnalyzers.put("date", new StandardAnalyzer(Version.LUCENE_35)); | |
1185 documentsFieldAnalyzers.put("subject", new StandardAnalyzer(Version.LUCENE_35)); | |
1186 documentsFieldAnalyzers.put("rights", new StandardAnalyzer(Version.LUCENE_35)); | |
1187 documentsFieldAnalyzers.put("license", new StandardAnalyzer(Version.LUCENE_35)); | |
1188 documentsFieldAnalyzers.put("accessRights", new StandardAnalyzer(Version.LUCENE_35)); | |
1189 documentsFieldAnalyzers.put("echoId", new KeywordAnalyzer()); | |
1190 documentsFieldAnalyzers.put("echoPageImageDir", new KeywordAnalyzer()); | |
1191 documentsFieldAnalyzers.put("echoFiguresDir", new KeywordAnalyzer()); | |
1192 documentsFieldAnalyzers.put("mpiwgDocId", new KeywordAnalyzer()); | |
1193 documentsFieldAnalyzers.put("type", new KeywordAnalyzer()); // e.g. mime type "text/xml" | |
1194 documentsFieldAnalyzers.put("pageCount", new KeywordAnalyzer()); | |
1195 documentsFieldAnalyzers.put("schemaName", new StandardAnalyzer(Version.LUCENE_35)); | |
1196 documentsFieldAnalyzers.put("lastModified", new KeywordAnalyzer()); | |
1197 documentsFieldAnalyzers.put("tokenOrig", new StandardAnalyzer(Version.LUCENE_35)); | |
1198 documentsFieldAnalyzers.put("tokenReg", new StandardAnalyzer(Version.LUCENE_35)); | |
1199 documentsFieldAnalyzers.put("tokenNorm", new StandardAnalyzer(Version.LUCENE_35)); | |
1200 documentsFieldAnalyzers.put("tokenMorph", new StandardAnalyzer(Version.LUCENE_35)); | |
1201 documentsFieldAnalyzers.put("xmlContent", new StandardAnalyzer(Version.LUCENE_35)); | |
1202 documentsFieldAnalyzers.put("content", new StandardAnalyzer(Version.LUCENE_35)); | |
1203 documentsPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), documentsFieldAnalyzers); | |
1204 IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, documentsPerFieldAnalyzer); | |
1205 conf.setOpenMode(OpenMode.CREATE_OR_APPEND); | |
1206 conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default | |
1207 FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory); | |
1208 writer = new IndexWriter(fsDirectory, conf); | |
1209 writer.commit(); // when directory is empty this creates init files | |
1210 } catch (IOException e) { | |
1211 throw new ApplicationException(e); | |
1212 } | |
1213 return writer; | |
1214 } | |
1215 | |
1216 private IndexWriter getNodesWriter() throws ApplicationException { | |
1217 IndexWriter writer = null; | |
1218 String luceneNodesDirectoryStr = Constants.getInstance().getLuceneNodesDir(); | |
1219 File luceneNodesDirectory = new File(luceneNodesDirectoryStr); | |
1220 try { | |
1221 Map<String, Analyzer> nodesFieldAnalyzers = new HashMap<String, Analyzer>(); | |
1222 nodesFieldAnalyzers.put("docId", new KeywordAnalyzer()); | |
1223 nodesFieldAnalyzers.put("language", new StandardAnalyzer(Version.LUCENE_35)); // language (through xml:id): e.g. "lat" | |
1224 nodesFieldAnalyzers.put("pageNumber", new KeywordAnalyzer()); // page number (through element pb): e.g. "13" | |
1225 nodesFieldAnalyzers.put("lineNumber", new KeywordAnalyzer()); // line number on the page (through element lb): e.g. "17" | |
1226 nodesFieldAnalyzers.put("elementName", new KeywordAnalyzer()); // element name: e.g. "tei:s" | |
1227 nodesFieldAnalyzers.put("elementDocPosition", new KeywordAnalyzer()); // absolute position in document: e.g. "4711" | |
1228 nodesFieldAnalyzers.put("elementPosition", new KeywordAnalyzer()); // position in parent node (in relation to other nodes of the same name): e.g. "5" | |
1229 nodesFieldAnalyzers.put("elementAbsolutePosition", new KeywordAnalyzer()); // absolute position in document (in relation to other nodes of the same name): e.g. "213" | |
1230 nodesFieldAnalyzers.put("elementPagePosition", new KeywordAnalyzer()); // position in relation to other nodes of the same name: e.g. "213" | |
1231 nodesFieldAnalyzers.put("xmlId", new KeywordAnalyzer()); // xml id: e.g. "4711bla" | |
1232 nodesFieldAnalyzers.put("xpath", new KeywordAnalyzer()); // xpath: e.g. "/echo[1]/text[1]/p[1]/s[5]" | |
1233 nodesFieldAnalyzers.put("tokenOrig", new StandardAnalyzer(Version.LUCENE_35)); | |
1234 nodesFieldAnalyzers.put("tokenReg", new StandardAnalyzer(Version.LUCENE_35)); | |
1235 nodesFieldAnalyzers.put("tokenNorm", new StandardAnalyzer(Version.LUCENE_35)); | |
1236 nodesFieldAnalyzers.put("tokenMorph", new StandardAnalyzer(Version.LUCENE_35)); | |
1237 nodesFieldAnalyzers.put("xmlContent", new StandardAnalyzer(Version.LUCENE_35)); | |
1238 nodesPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), nodesFieldAnalyzers); | |
1239 IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, nodesPerFieldAnalyzer); | |
1240 conf.setOpenMode(OpenMode.CREATE_OR_APPEND); | |
1241 conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default | |
1242 FSDirectory fsDirectory = FSDirectory.open(luceneNodesDirectory); | |
1243 writer = new IndexWriter(fsDirectory, conf); | |
1244 writer.commit(); | |
1245 } catch (IOException e) { | |
1246 throw new ApplicationException(e); | |
1247 } | |
1248 return writer; | |
1249 } | |
1250 | |
1251 private Sort buildSort(String[] sortFieldNames, String type) { | |
1252 Sort sort = new Sort(); | |
1253 ArrayList<SortField> sortFields = new ArrayList<SortField>(); | |
1254 for (int i=0; i<sortFieldNames.length; i++) { | |
1255 String sortFieldName = sortFieldNames[i]; | |
1256 int sortFieldType = getDocSortFieldType(sortFieldName); | |
1257 if (type.equals("node")) | |
1258 sortFieldType = getNodeSortFieldType(sortFieldName); | |
1259 String realSortFieldName = getDocSortFieldName(sortFieldName); | |
1260 SortField sortField = new SortField(realSortFieldName, sortFieldType); | |
1261 sortFields.add(sortField); | |
1262 } | |
1263 if (sortFieldNames.length == 1) { | |
1264 SortField sortField1 = sortFields.get(0); | |
1265 sort.setSort(sortField1); | |
1266 } else if (sortFieldNames.length == 2) { | |
1267 SortField sortField1 = sortFields.get(0); | |
1268 SortField sortField2 = sortFields.get(1); | |
1269 sort.setSort(sortField1, sortField2); | |
1270 } else if (sortFieldNames.length == 2) { | |
1271 SortField sortField1 = sortFields.get(0); | |
1272 SortField sortField2 = sortFields.get(1); | |
1273 SortField sortField3 = sortFields.get(2); | |
1274 sort.setSort(sortField1, sortField2, sortField3); | |
1275 } | |
1276 return sort; | |
1277 } | |
1278 | |
1279 private String getDocSortFieldName(String fieldName) { | |
1280 String sortFieldName = fieldName + "Sorted"; | |
1281 return sortFieldName; | |
1282 } | |
1283 | |
1284 private int getDocSortFieldType(String fieldName) { | |
1285 int type = SortField.STRING; | |
1286 if (fieldName.equals("lastModified")) | |
1287 type = SortField.LONG; | |
1288 return type; | |
1289 } | |
1290 | |
1291 private int getNodeSortFieldType(String fieldName) { | |
1292 int type = SortField.STRING; | |
1293 if (fieldName.equals("pageNumber") || fieldName.equals("lineNumber") || fieldName.equals("elementDocPosition")) | |
1294 type = SortField.INT; | |
1295 return type; | |
1296 } | |
1297 | |
1298 private FieldSelector getDocFieldSelector() { | |
1299 HashSet<String> fields = new HashSet<String>(); | |
1300 fields.add("docId"); | |
1301 fields.add("identifier"); | |
1302 fields.add("uri"); | |
1303 fields.add("collectionNames"); | |
1304 fields.add("author"); | |
1305 fields.add("title"); | |
1306 fields.add("language"); | |
1307 fields.add("publisher"); | |
1308 fields.add("date"); | |
1309 fields.add("subject"); | |
1310 fields.add("rights"); | |
1311 fields.add("license"); | |
1312 fields.add("echoId"); | |
1313 fields.add("echoPageImageDir"); | |
1314 fields.add("echoFiguresDir"); | |
1315 fields.add("mpiwgDocId"); | |
1316 fields.add("type"); | |
1317 fields.add("pageCount"); | |
1318 fields.add("schemaName"); | |
1319 fields.add("lastModified"); | |
1320 fields.add("content"); | |
1321 FieldSelector fieldSelector = new SetBasedFieldSelector(fields, fields); | |
1322 return fieldSelector; | |
1323 } | |
1324 | |
1325 private FieldSelector getNodeFieldSelector() { | |
1326 HashSet<String> fields = new HashSet<String>(); | |
1327 fields.add("docId"); | |
1328 fields.add("language"); | |
1329 fields.add("pageNumber"); | |
1330 fields.add("lineNumber"); | |
1331 fields.add("elementName"); | |
1332 fields.add("elementDocPosition"); | |
1333 fields.add("elementPosition"); | |
1334 fields.add("elementAbsolutePosition"); | |
1335 fields.add("elementPagePosition"); | |
1336 fields.add("xmlId"); | |
1337 fields.add("xpath"); | |
1338 fields.add("xmlContent"); | |
1339 fields.add("xmlContentTokenized"); | |
1340 FieldSelector fieldSelector = new SetBasedFieldSelector(fields, fields); | |
1341 return fieldSelector; | |
1342 } | |
1343 | |
1344 private SearcherManager getNewSearcherManager(IndexWriter indexWriter) throws ApplicationException { | |
1345 SearcherManager searcherManager = null; | |
1346 try { | |
1347 searcherManager = new SearcherManager(indexWriter, true, null, null); | |
1348 } catch (IOException e) { | |
1349 throw new ApplicationException(e); | |
1350 } | |
1351 return searcherManager; | |
1352 } | |
1353 | |
1354 private IndexReader getDocumentsReader() throws ApplicationException { | |
1355 IndexReader reader = null; | |
1356 String luceneDocsDirectoryStr = Constants.getInstance().getLuceneDocumentsDir(); | |
1357 File luceneDocsDirectory = new File(luceneDocsDirectoryStr); | |
1358 try { | |
1359 FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory); | |
1360 reader = IndexReader.open(fsDirectory, true); | |
1361 } catch (IOException e) { | |
1362 throw new ApplicationException(e); | |
1363 } | |
1364 return reader; | |
1365 } | |
1366 | |
1367 private void makeIndexReaderUpToDate() throws ApplicationException { | |
1368 try { | |
1369 boolean isCurrent = documentsIndexReader.isCurrent(); | |
1370 if (!isCurrent) { | |
1371 documentsIndexReader = IndexReader.openIfChanged(documentsIndexReader); | |
1372 } | |
1373 } catch (IOException e) { | |
1374 throw new ApplicationException(e); | |
1375 } | |
1376 } | |
1377 | |
1378 private void makeDocumentsSearcherManagerUpToDate() throws ApplicationException { | |
1379 try { | |
1380 boolean isCurrent = documentsSearcherManager.isSearcherCurrent(); | |
1381 if (!isCurrent) { | |
1382 documentsSearcherManager.maybeReopen(); | |
1383 } | |
1384 } catch (IOException e) { | |
1385 throw new ApplicationException(e); | |
1386 } | |
1387 } | |
1388 | |
1389 private void makeNodesSearcherManagerUpToDate() throws ApplicationException { | |
1390 try { | |
1391 boolean isCurrent = nodesSearcherManager.isSearcherCurrent(); | |
1392 if (!isCurrent) { | |
1393 nodesSearcherManager.maybeReopen(); | |
1394 } | |
1395 } catch (IOException e) { | |
1396 throw new ApplicationException(e); | |
1397 } | |
1398 } | |
1399 | |
1400 private String toTokenizedXmlString(String xmlStr, String language) throws ApplicationException { | |
1401 String xmlPre = "<tokenized xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:de=\"http://www.mpiwg-berlin.mpg.de/ns/de/1.0/\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">"; | |
1402 String xmlPost = "</tokenized>"; | |
1403 String xmlStrTmp = xmlPre + xmlStr + xmlPost; | |
1404 StringReader xmlInputStringReader = new StringReader(xmlStrTmp); | |
1405 XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader); | |
1406 xmlTokenizer.setLanguage(language); | |
1407 String[] outputOptions = { "withLemmas" }; | |
1408 xmlTokenizer.setOutputOptions(outputOptions); | |
1409 xmlTokenizer.tokenize(); | |
1410 String result = xmlTokenizer.getXmlResult(); | |
1411 return result; | |
1412 } | |
1413 | |
1414 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException { | |
1415 try { | |
1416 WordContentHandler wordContentHandler = new WordContentHandler(); | |
1417 XMLReader xmlParser = new SAXParser(); | |
1418 xmlParser.setContentHandler(wordContentHandler); | |
1419 StringReader strReader = new StringReader(xmlStr); | |
1420 InputSource inputSource = new InputSource(strReader); | |
1421 xmlParser.parse(inputSource); | |
1422 String result = wordContentHandler.getResult(); | |
1423 return result; | |
1424 } catch (SAXException e) { | |
1425 throw new ApplicationException(e); | |
1426 } catch (IOException e) { | |
1427 throw new ApplicationException(e); | |
1428 } | |
1429 } | |
1430 | |
1431 private String escapeLuceneChars(String inputStr) { | |
1432 String luceneCharsStr = "+-&|!(){}[]^~*?:\\"; // Lucene escape symbols | |
1433 StringBuilder retStrBuilder = new StringBuilder(); | |
1434 for (int i = 0; i < inputStr.length(); i++) { | |
1435 char c = inputStr.charAt(i); | |
1436 if (luceneCharsStr.contains(String.valueOf(c))) | |
1437 retStrBuilder.append("\\"); | |
1438 retStrBuilder.append(c); | |
1439 } | |
1440 return retStrBuilder.toString(); | |
1441 } | |
1442 | |
1443 /** | |
1444 * sorgt für sinnvolle satzanfänge | |
1445 * | |
1446 * @param fragment | |
1447 */ | |
1448 private String checkHitFragment(String fragment) { | |
1449 if (fragment.startsWith(".") | |
1450 || fragment.startsWith(":") | |
1451 || fragment.startsWith(",") | |
1452 || fragment.startsWith("-") | |
1453 || fragment.startsWith(";") | |
1454 || fragment.startsWith("?") | |
1455 || fragment.startsWith(")") | |
1456 || fragment.startsWith("!")) { | |
1457 fragment = fragment.substring(1, fragment.length()); | |
1458 // finds first occurence of a given string out.println("first index of point : "+StringUtils.indexOfAny(fragment, ".")); | |
1459 } | |
1460 return fragment; | |
1461 } | |
1462 | |
1463 } |