comparison software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/lucene/IndexHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.cms.lucene;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.io.FileReader;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.io.StringReader;
9 import java.util.ArrayList;
10 import java.util.Date;
11 import java.util.HashMap;
12 import java.util.HashSet;
13 import java.util.Map;
14
15 import org.apache.commons.io.FileUtils;
16 import org.apache.lucene.analysis.Analyzer;
17 import org.apache.lucene.analysis.KeywordAnalyzer;
18 import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
19 import org.apache.lucene.analysis.TokenStream;
20 import org.apache.lucene.analysis.standard.StandardAnalyzer;
21 import org.apache.lucene.document.Document;
22 import org.apache.lucene.document.Field;
23 import org.apache.lucene.document.FieldSelector;
24 import org.apache.lucene.document.Fieldable;
25 import org.apache.lucene.document.SetBasedFieldSelector;
26 import org.apache.lucene.index.IndexReader;
27 import org.apache.lucene.index.IndexWriter;
28 import org.apache.lucene.index.IndexWriterConfig;
29 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.index.TermEnum;
32 import org.apache.lucene.index.TermFreqVector;
33 import org.apache.lucene.queryParser.QueryParser;
34 import org.apache.lucene.search.BooleanClause;
35 import org.apache.lucene.search.BooleanQuery;
36 import org.apache.lucene.search.FuzzyQuery;
37 import org.apache.lucene.search.IndexSearcher;
38 import org.apache.lucene.search.MatchAllDocsQuery;
39 import org.apache.lucene.search.PhraseQuery;
40 import org.apache.lucene.search.PrefixQuery;
41 import org.apache.lucene.search.Query;
42 import org.apache.lucene.search.SearcherManager;
43 import org.apache.lucene.search.Sort;
44 import org.apache.lucene.search.SortField;
45 import org.apache.lucene.search.TermQuery;
46 import org.apache.lucene.search.TermRangeQuery;
47 import org.apache.lucene.search.TopDocs;
48 import org.apache.lucene.search.highlight.Highlighter;
49 import org.apache.lucene.search.highlight.QueryScorer;
50 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
51 import org.apache.lucene.search.highlight.TextFragment;
52 import org.apache.lucene.search.highlight.TokenSources;
53 import org.apache.lucene.search.similar.MoreLikeThis;
54 import org.apache.lucene.store.FSDirectory;
55 import org.apache.lucene.util.Version;
56 import org.xml.sax.InputSource;
57 import org.xml.sax.SAXException;
58 import org.xml.sax.XMLReader;
59
60 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
61
62 import de.mpg.mpiwg.berlin.mpdl.cms.confmanager.CollectionReader;
63 import de.mpg.mpiwg.berlin.mpdl.cms.confmanager.ConfManagerResultWrapper;
64 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
65 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
66 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
67 import de.mpg.mpiwg.berlin.mpdl.cms.document.Token;
68 import de.mpg.mpiwg.berlin.mpdl.cms.general.Constants;
69 import de.mpg.mpiwg.berlin.mpdl.cms.scheduler.CmsDocOperation;
70 import de.mpg.mpiwg.berlin.mpdl.cms.transform.XslResourceTransformer;
71 import de.mpg.mpiwg.berlin.mpdl.cms.translator.MicrosoftTranslator;
72 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
73 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
74 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
75 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
76 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
77 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
78 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
79 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
80 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
81 import de.mpg.mpiwg.berlin.mpdl.util.Util;
82
83 public class IndexHandler {
// Lazily created singleton instance (see getInstance()).
private static IndexHandler instance;
// Writer for the per-document index.
private IndexWriter documentsIndexWriter;
// Writer for the per-node (XML element) index.
private IndexWriter nodesIndexWriter;
// SearcherManagers hand out refreshed IndexSearchers over the two indices;
// searchers must be acquire()d and release()d through them.
private SearcherManager documentsSearcherManager;
private SearcherManager nodesSearcherManager;
// Reader over the documents index; used for term vectors / MoreLikeThis.
private IndexReader documentsIndexReader;
// Per-field analyzers for the two indices (field name -> analyzer).
private PerFieldAnalyzerWrapper documentsPerFieldAnalyzer;
private PerFieldAnalyzerWrapper nodesPerFieldAnalyzer;
92
93
94 public static IndexHandler getInstance() throws ApplicationException {
95 if (instance == null) {
96 instance = new IndexHandler();
97 instance.init();
98 }
99 return instance;
100 }
101
/**
 * Opens the two index writers (documents and nodes), wires searcher managers
 * on top of them and opens a reader over the documents index.
 * Called exactly once from getInstance().
 *
 * @throws ApplicationException if an index could not be opened
 */
private void init() throws ApplicationException {
  documentsIndexWriter = getDocumentsWriter();
  // allow very large documents: index up to 1,000,000 terms per field
  documentsIndexWriter.setMaxFieldLength(1000000);
  nodesIndexWriter = getNodesWriter();
  nodesIndexWriter.setMaxFieldLength(1000000);
  documentsSearcherManager = getNewSearcherManager(documentsIndexWriter);
  nodesSearcherManager = getNewSearcherManager(nodesIndexWriter);
  documentsIndexReader = getDocumentsReader();
}
111
112 public void indexDocument(CmsDocOperation docOperation) throws ApplicationException {
113 try {
114 // first delete document in documentsIndex and nodesIndex
115 deleteDocumentLocal(docOperation);
116 indexDocumentLocal(docOperation);
117 documentsIndexWriter.commit();
118 nodesIndexWriter.commit();
119 } catch (Exception e) {
120 try {
121 documentsIndexWriter.rollback();
122 nodesIndexWriter.rollback();
123 } catch (Exception ex) {
124 // nothing
125 }
126 throw new ApplicationException(e);
127 }
128 }
129
130 private void indexDocumentLocal(CmsDocOperation docOperation) throws ApplicationException {
131 FileReader fr = null;
132 try {
133 MetadataRecord mdRecord = docOperation.getMdRecord();
134 String docId = mdRecord.getDocId();
135 DocumentHandler docHandler = new DocumentHandler();
136 String docFileName = docHandler.getDocFullFileName(docId) + ".upgrade";
137 // add document to documentsIndex
138 Document doc = new Document();
139 Field docIdField = new Field("docId", docId, Field.Store.YES, Field.Index.ANALYZED);
140 doc.add(docIdField);
141 String docIdSortedStr = docId.toLowerCase(); // so that sorting is lower case
142 Field docIdFieldSorted = new Field("docIdSorted", docIdSortedStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
143 doc.add(docIdFieldSorted);
144 String identifier = mdRecord.getIdentifier();
145 if (identifier != null) {
146 Field identifierField = new Field("identifier", identifier, Field.Store.YES, Field.Index.ANALYZED);
147 doc.add(identifierField);
148 }
149 String uri = docOperation.getSrcUrl();
150 if (uri != null) {
151 Field uriField = new Field("uri", uri, Field.Store.YES, Field.Index.ANALYZED);
152 doc.add(uriField);
153 }
154 String collectionNames = docOperation.getCollectionNames();
155 if (collectionNames != null) {
156 Field collectionNamesField = new Field("collectionNames", collectionNames, Field.Store.YES, Field.Index.ANALYZED);
157 doc.add(collectionNamesField);
158 }
159 if (mdRecord.getCreator() != null) {
160 String authorStr = mdRecord.getCreator();
161 Field authorField = new Field("author", authorStr, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
162 doc.add(authorField);
163 if (authorStr != null)
164 authorStr = authorStr.toLowerCase(); // so that sorting is lower case
165 Field authorFieldSorted = new Field("authorSorted", authorStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
166 doc.add(authorFieldSorted);
167 }
168 if (mdRecord.getTitle() != null) {
169 String titleStr = mdRecord.getTitle();
170 Field titleField = new Field("title", titleStr, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
171 doc.add(titleField);
172 if (titleStr != null)
173 titleStr = titleStr.toLowerCase(); // so that sorting is lower case
174 Field titleFieldSorted = new Field("titleSorted", titleStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
175 doc.add(titleFieldSorted);
176 }
177 if (mdRecord.getLanguage() != null) {
178 String langStr = mdRecord.getLanguage();
179 if (langStr != null)
180 langStr = langStr.toLowerCase(); // all language codes are lower case
181 Field languageField = new Field("language",langStr, Field.Store.YES, Field.Index.ANALYZED);
182 doc.add(languageField);
183 Field languageFieldSorted = new Field("languageSorted", langStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
184 doc.add(languageFieldSorted);
185 }
186 if (mdRecord.getPublisher() != null) {
187 String publisherStr = mdRecord.getPublisher();
188 Field publisherField = new Field("publisher", publisherStr, Field.Store.YES, Field.Index.ANALYZED);
189 doc.add(publisherField);
190 if (publisherStr != null)
191 publisherStr = publisherStr.toLowerCase(); // so that sorting is lower case
192 Field publisherFieldSorted = new Field("publisherSorted", publisherStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
193 doc.add(publisherFieldSorted);
194 }
195 if (mdRecord.getYear() != null) {
196 Field dateField = new Field("date", mdRecord.getYear(), Field.Store.YES, Field.Index.ANALYZED);
197 doc.add(dateField);
198 Field dateFieldSorted = new Field("dateSorted", mdRecord.getYear(), Field.Store.YES, Field.Index.NOT_ANALYZED);
199 doc.add(dateFieldSorted);
200 }
201 if (mdRecord.getSubject() != null) {
202 Field subjectField = new Field("subject", mdRecord.getSubject(), Field.Store.YES, Field.Index.ANALYZED);
203 doc.add(subjectField);
204 }
205 if (mdRecord.getRights() != null) {
206 Field rightsField = new Field("rights", mdRecord.getRights(), Field.Store.YES, Field.Index.ANALYZED);
207 doc.add(rightsField);
208 }
209 if (mdRecord.getLicense() != null) {
210 Field licenseField = new Field("license", mdRecord.getLicense(), Field.Store.YES, Field.Index.ANALYZED);
211 doc.add(licenseField);
212 }
213 if (mdRecord.getAccessRights() != null) {
214 Field accessRightsField = new Field("accessRights", mdRecord.getAccessRights(), Field.Store.YES, Field.Index.ANALYZED);
215 doc.add(accessRightsField);
216 }
217 String echoId = mdRecord.getEchoId();
218 if (echoId != null) {
219 Field echoIdField = new Field("echoId", echoId, Field.Store.YES, Field.Index.ANALYZED);
220 doc.add(echoIdField);
221 }
222 String echoPageImageDir = mdRecord.getEchoPageImageDir();
223 if (echoPageImageDir != null) {
224 Field echoPageImageDirField = new Field("echoPageImageDir", echoPageImageDir, Field.Store.YES, Field.Index.ANALYZED);
225 doc.add(echoPageImageDirField);
226 }
227 String echoFiguresDir = mdRecord.getEchoFiguresDir();
228 if (echoFiguresDir != null) {
229 Field echoFiguresDirField = new Field("echoFiguresDir", echoFiguresDir, Field.Store.YES, Field.Index.ANALYZED);
230 doc.add(echoFiguresDirField);
231 }
232 String mpiwgDocId = mdRecord.getMpiwgDocId();
233 if (mpiwgDocId != null) {
234 Field mpiwgDocIdField = new Field("mpiwgDocId", mpiwgDocId, Field.Store.YES, Field.Index.ANALYZED);
235 doc.add(mpiwgDocIdField);
236 }
237 if (mdRecord.getLastModified() != null) {
238 Date lastModified = mdRecord.getLastModified();
239 String xsDateStr = new Util().toXsDate(lastModified);
240 Field lastModifiedField = new Field("lastModified", xsDateStr, Field.Store.YES, Field.Index.ANALYZED);
241 doc.add(lastModifiedField);
242 long time = lastModified.getTime();
243 String timeStr = String.valueOf(time);
244 Field lastModifiedFieldSorted = new Field("lastModifiedSorted", timeStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
245 doc.add(lastModifiedFieldSorted);
246 }
247 if (mdRecord.getSchemaName() != null) {
248 String schemNameStr = mdRecord.getSchemaName();
249 Field schemaField = new Field("schemaName", schemNameStr, Field.Store.YES, Field.Index.ANALYZED);
250 doc.add(schemaField);
251 if (schemNameStr != null)
252 schemNameStr = schemNameStr.toLowerCase(); // so that sorting is lower case
253 Field schemaFieldSorted = new Field("schemaNameSorted", schemNameStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
254 doc.add(schemaFieldSorted);
255 }
256
257 String language = mdRecord.getLanguage();
258 InputStreamReader docFileReader = new InputStreamReader(new FileInputStream(docFileName), "utf-8");
259 // to guarantee that utf-8 is used (if not done, it does not work on Tomcat which has another default charset)
260 XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader);
261 docXmlTokenizer.setDocIdentifier(docId);
262 docXmlTokenizer.setLanguage(language);
263 docXmlTokenizer.setOutputFormat("string");
264 String[] outputOptionsWithLemmas = { "withLemmas" }; // so all tokens are
265 // fetched with lemmas (costs performance)
266 docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
267 String[] normFunctionNone = { "none" };
268 docXmlTokenizer.setNormFunctions(normFunctionNone);
269 docXmlTokenizer.tokenize();
270
271 int pageCount = docXmlTokenizer.getPageCount();
272 if (pageCount == 0)
273 pageCount = 1; // each document at least has one page
274 String pageCountStr = String.valueOf(pageCount);
275 Field pageCountField = new Field("pageCount", pageCountStr, Field.Store.YES, Field.Index.ANALYZED);
276 doc.add(pageCountField);
277
278 String[] outputOptionsEmpty = {};
279 docXmlTokenizer.setOutputOptions(outputOptionsEmpty);
280 // must be set to null so that the normalization function works
281 String docTokensOrig = docXmlTokenizer.getStringResult();
282 String[] normFunctionReg = { "reg" };
283 docXmlTokenizer.setNormFunctions(normFunctionReg);
284 String docTokensReg = docXmlTokenizer.getStringResult();
285 String[] normFunctionNorm = { "norm" };
286 docXmlTokenizer.setNormFunctions(normFunctionNorm);
287 String docTokensNorm = docXmlTokenizer.getStringResult();
288 docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
289 String docTokensMorph = docXmlTokenizer.getStringResult();
290
291 Field tokenOrigField = new Field("tokenOrig", docTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
292 Field tokenRegField = new Field("tokenReg", docTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
293 Field tokenNormField = new Field("tokenNorm", docTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
294 Field tokenMorphField = new Field("tokenMorph", docTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
295 doc.add(tokenOrigField);
296 doc.add(tokenRegField);
297 doc.add(tokenNormField);
298 doc.add(tokenMorphField);
299
300 // save original content of the doc file
301 File docFile = new File(docFileName);
302 String contentXml = FileUtils.readFileToString(docFile, "utf-8");
303 Field contentXmlField = new Field("xmlContent", contentXml, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
304 doc.add(contentXmlField);
305
306 // generate original chars content
307 XslResourceTransformer charsTransformer = new XslResourceTransformer("chars.xsl");
308 String content = charsTransformer.transform(docFileName);
309 Field contentField = new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
310 doc.add(contentField);
311
312 documentsIndexWriter.addDocument(doc);
313
314 // add all elements with the specified names of the document to nodesIndex
315 String[] elementNamesArray = docOperation.getElementNames();
316 String elementNames = "";
317 for (int i = 0; i < elementNamesArray.length; i++) {
318 String elemName = elementNamesArray[i];
319 elementNames = elementNames + elemName + " ";
320 }
321 elementNames = elementNames.substring(0, elementNames.length() - 1);
322 ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements(elementNames);
323 for (int i = 0; i < elements.size(); i++) {
324 XmlTokenizerContentHandler.Element element = elements.get(i);
325 Document nodeDoc = new Document();
326 nodeDoc.add(docIdField);
327 String nodeLanguage = element.lang;
328 if (nodeLanguage == null)
329 nodeLanguage = language;
330 String nodePageNumber = String.valueOf(element.pageNumber);
331 String nodeLineNumber = String.valueOf(element.lineNumber);
332 String nodeElementName = String.valueOf(element.name);
333 String nodeElementDocPosition = String.valueOf(element.docPosition);
334 String nodeElementAbsolutePosition = String.valueOf(element.position);
335 String nodeElementPagePosition = String.valueOf(element.pagePosition);
336 String nodeElementPosition = String.valueOf(element.elemPosition);
337 String nodeXmlId = element.xmlId;
338 String nodeXpath = element.xpath;
339 String nodeXmlContent = element.toXmlString();
340 String nodeTokensOrig = element.getTokensStr("orig");
341 String nodeTokensReg = element.getTokensStr("reg");
342 String nodeTokensNorm = element.getTokensStr("norm");
343 String nodeTokensMorph = element.getTokensStr("morph");
344 if (nodeLanguage != null) {
345 Field nodeLanguageField = new Field("language", nodeLanguage, Field.Store.YES, Field.Index.ANALYZED);
346 nodeDoc.add(nodeLanguageField);
347 }
348 Field nodePageNumberField = new Field("pageNumber", nodePageNumber, Field.Store.YES, Field.Index.ANALYZED);
349 nodeDoc.add(nodePageNumberField);
350 Field nodeLineNumberField = new Field("lineNumber", nodeLineNumber, Field.Store.YES, Field.Index.ANALYZED);
351 nodeDoc.add(nodeLineNumberField);
352 Field nodeElementNameField = new Field("elementName", nodeElementName, Field.Store.YES, Field.Index.ANALYZED);
353 nodeDoc.add(nodeElementNameField);
354 Field nodeElementDocPositionField = new Field("elementDocPosition", nodeElementDocPosition, Field.Store.YES, Field.Index.ANALYZED);
355 nodeDoc.add(nodeElementDocPositionField);
356 Field nodeElementDocPositionFieldSorted = new Field("elementDocPositionSorted", nodeElementDocPosition, Field.Store.YES, Field.Index.NOT_ANALYZED);
357 nodeDoc.add(nodeElementDocPositionFieldSorted);
358 Field nodeElementAbsolutePositionField = new Field("elementAbsolutePosition", nodeElementAbsolutePosition, Field.Store.YES, Field.Index.ANALYZED);
359 nodeDoc.add(nodeElementAbsolutePositionField);
360 Field nodeElementPagePositionField = new Field("elementPagePosition", nodeElementPagePosition, Field.Store.YES, Field.Index.ANALYZED);
361 nodeDoc.add(nodeElementPagePositionField);
362 Field nodeElementPositionField = new Field("elementPosition", nodeElementPosition, Field.Store.YES, Field.Index.ANALYZED);
363 nodeDoc.add(nodeElementPositionField);
364 if (nodeXmlId != null) {
365 Field nodeXmlIdField = new Field("xmlId", nodeXmlId, Field.Store.YES, Field.Index.ANALYZED);
366 nodeDoc.add(nodeXmlIdField);
367 }
368 if (nodeXpath != null) {
369 Field nodeXpathField = new Field("xpath", nodeXpath, Field.Store.YES, Field.Index.ANALYZED);
370 nodeDoc.add(nodeXpathField);
371 }
372 if (nodeXmlContent != null) {
373 Field nodeXmlContentField = new Field("xmlContent", nodeXmlContent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
374 nodeDoc.add(nodeXmlContentField);
375 }
376 if (nodeXmlContent != null) {
377 String nodeXmlContentTokenized = toTokenizedXmlString(nodeXmlContent, nodeLanguage);
378 byte[] blabla = nodeXmlContentTokenized.getBytes("utf-8"); // TODO why is tokenizedXmlStr not already utf-8 on page 444 Benedetti ?
379 nodeXmlContentTokenized = new String(blabla, "utf-8");
380 nodeXmlContentTokenized = enrichWordsOrigRegNorm(nodeXmlContentTokenized);
381 Field nodeXmlContentTokenizedField = new Field("xmlContentTokenized", nodeXmlContentTokenized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
382 nodeDoc.add(nodeXmlContentTokenizedField);
383 }
384 if (nodeTokensOrig != null) {
385 Field nodeTokenOrigField = new Field("tokenOrig", nodeTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
386 nodeDoc.add(nodeTokenOrigField);
387 }
388 if (nodeTokensReg != null) {
389 Field nodeTokenRegField = new Field("tokenReg", nodeTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
390 nodeDoc.add(nodeTokenRegField);
391 }
392 if (nodeTokensNorm != null) {
393 Field nodeTokenNormField = new Field("tokenNorm", nodeTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
394 nodeDoc.add(nodeTokenNormField);
395 }
396 if (nodeTokensMorph != null) {
397 Field nodeTokenMorphField = new Field("tokenMorph", nodeTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
398 nodeDoc.add(nodeTokenMorphField);
399 }
400
401 nodesIndexWriter.addDocument(nodeDoc);
402 }
403 } catch (Exception e) {
404 throw new ApplicationException(e);
405 } finally {
406 try {
407 if (fr != null)
408 fr.close();
409 } catch (Exception e) {
410 // nothing
411 }
412 }
413 }
414
415 public void deleteDocument(CmsDocOperation docOperation) throws ApplicationException {
416 try {
417 deleteDocumentLocal(docOperation);
418 documentsIndexWriter.commit();
419 nodesIndexWriter.commit();
420 } catch (Exception e) {
421 try {
422 documentsIndexWriter.rollback();
423 nodesIndexWriter.rollback();
424 } catch (Exception ex) {
425 // nothing
426 }
427 throw new ApplicationException(e);
428 }
429 }
430
431 private void deleteDocumentLocal(CmsDocOperation docOperation) throws ApplicationException {
432 String docId = docOperation.getDocIdentifier();
433 try {
434 Term termIdentifier = new Term("docId", docId);
435 documentsIndexWriter.deleteDocuments(termIdentifier);
436 nodesIndexWriter.deleteDocuments(termIdentifier);
437 } catch (Exception e) {
438 throw new ApplicationException(e);
439 }
440 }
441
442 public Hits queryDocuments(String queryStr, String[] sortFieldNames, String language, int from, int to, boolean withHitFragments, boolean translate) throws ApplicationException {
443 Hits hits = null;
444 IndexSearcher searcher = null;
445 try {
446 makeDocumentsSearcherManagerUpToDate();
447 searcher = documentsSearcherManager.acquire();
448 String defaultQueryFieldName = "tokenOrig";
449 QueryParser queryParser = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, documentsPerFieldAnalyzer);
450 Query query = null;
451 if (queryStr.equals("*")) {
452 query = new MatchAllDocsQuery();
453 } else {
454 query = queryParser.parse(queryStr);
455 }
456 Query morphQuery = buildMorphQuery(query, language, false, translate);
457 Query highlighterQuery = buildMorphQuery(query, language, true, translate);
458 if (query instanceof PhraseQuery || query instanceof PrefixQuery || query instanceof FuzzyQuery || query instanceof TermRangeQuery) {
459 highlighterQuery = query; // TODO wenn sie rekursiv enthalten sind
460 }
461 String beginHitMark = "!!!BEGIN_HIT!!!";
462 String endHitMark = "!!!END_HIT!!!";
463 SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(beginHitMark, endHitMark); // marks should not contain xml special chars
464 QueryScorer queryScorer = new QueryScorer(highlighterQuery);
465 Highlighter highlighter = new Highlighter(htmlFormatter, queryScorer);
466 TopDocs resultDocs = null;
467 if (sortFieldNames != null) {
468 Sort sort = buildSort(sortFieldNames, "doc"); // build sort criteria
469 resultDocs = searcher.search(morphQuery, 10000, sort);
470 } else {
471 resultDocs = searcher.search(morphQuery, 10000);
472 }
473 resultDocs.setMaxScore(1);
474 int toTmp = to;
475 if (resultDocs.scoreDocs.length <= to)
476 toTmp = resultDocs.scoreDocs.length - 1;
477 if (resultDocs != null) {
478 ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> docs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>();
479 for (int i=from; i<=toTmp; i++) {
480 int docID = resultDocs.scoreDocs[i].doc;
481 FieldSelector docFieldSelector = getDocFieldSelector();
482 Document luceneDoc = searcher.doc(docID, docFieldSelector);
483 de.mpg.mpiwg.berlin.mpdl.cms.document.Document doc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc);
484 if (withHitFragments) {
485 ArrayList<String> hitFragments = new ArrayList<String>();
486 Fieldable docContentField = luceneDoc.getFieldable("content");
487 if (docContentField != null) {
488 String docContent = docContentField.stringValue();
489 TokenStream tokenStream = TokenSources.getAnyTokenStream(this.documentsIndexReader, docID, docContentField.name(), luceneDoc, documentsPerFieldAnalyzer);
490 // highlighter.setMaxDocCharsToAnalyze(100000); // the first 100000 chars are fetched maximal, but performance is not really better
491 TextFragment[] textfragments = highlighter.getBestTextFragments(tokenStream, docContent, false, 5);
492 if (textfragments.length > 0) {
493 for (int j=0; j<textfragments.length; j++) {
494 String textFragment = textfragments[j].toString().trim();
495 textFragment = StringUtils.deresolveXmlEntities(textFragment);
496 textFragment = textFragment.replaceAll(beginHitMark, "<span class=\"highlight hit\">");
497 textFragment = textFragment.replaceAll(endHitMark, "</span>");
498 hitFragments.add(checkHitFragment(textFragment));
499 }
500 }
501 }
502 if (! hitFragments.isEmpty())
503 doc.setHitFragments(hitFragments);
504 }
505 docs.add(doc);
506 }
507 if (docs != null) {
508 hits = new Hits(docs, from, to);
509 hits.setSize(resultDocs.scoreDocs.length);
510 hits.setQuery(morphQuery);
511 }
512 }
513 } catch (Exception e) {
514 throw new ApplicationException(e);
515 } finally {
516 try {
517 if (searcher != null)
518 documentsSearcherManager.release(searcher);
519 } catch (IOException e) {
520 // nothing
521 }
522 }
523 // Do not use searcher after this!
524 searcher = null;
525 return hits;
526 }
527
528 public Hits queryDocument(String docId, String queryStr, int from, int to) throws ApplicationException {
529 Hits hits = null;
530 IndexSearcher searcher = null;
531 MetadataRecord docMetadataRecord = getDocMetadata(docId);
532 if (docMetadataRecord == null)
533 return null; // no document with that docId is in index
534 try {
535 makeNodesSearcherManagerUpToDate();
536 searcher = nodesSearcherManager.acquire();
537 String fieldNameDocId = "docId";
538 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, nodesPerFieldAnalyzer).parse(docId);
539 String defaultQueryFieldName = "tokenOrig";
540 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr);
541 String language = docMetadataRecord.getLanguage();
542 if (language == null || language.equals("")) {
543 String collectionNames = docMetadataRecord.getCollectionNames();
544 ConfManagerResultWrapper collectionInfo = CollectionReader.getInstance().getResultWrapper(collectionNames);
545 if (collectionInfo != null) {
546 String mainLang = collectionInfo.getMainLanguage();
547 if (mainLang != null)
548 language = mainLang;
549 }
550 }
551 Query morphQuery = buildMorphQuery(query, language);
552 BooleanQuery queryDoc = new BooleanQuery();
553 queryDoc.add(queryDocId, BooleanClause.Occur.MUST);
554 queryDoc.add(morphQuery, BooleanClause.Occur.MUST);
555 String[] sortFieldNames = {"elementDocPosition"};
556 Sort sortByPosition = buildSort(sortFieldNames, "node");
557 TopDocs topDocs = searcher.search(queryDoc, 100000, sortByPosition);
558 topDocs.setMaxScore(1);
559 int toTmp = to;
560 if (topDocs.scoreDocs.length <= to)
561 toTmp = topDocs.scoreDocs.length - 1;
562 if (topDocs != null) {
563 ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> docs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>();
564 for (int i=from; i<=toTmp; i++) {
565 int docID = topDocs.scoreDocs[i].doc;
566 FieldSelector nodeFieldSelector = getNodeFieldSelector();
567 Document luceneDoc = searcher.doc(docID, nodeFieldSelector);
568 de.mpg.mpiwg.berlin.mpdl.cms.document.Document doc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc);
569 docs.add(doc);
570 }
571 if (docs != null) {
572 hits = new Hits(docs, from, to);
573 hits.setSize(topDocs.scoreDocs.length);
574 }
575 }
576 searcher.close();
577 } catch (Exception e) {
578 throw new ApplicationException(e);
579 } finally {
580 try {
581 if (searcher != null)
582 documentsSearcherManager.release(searcher);
583 } catch (IOException e) {
584 // nothing
585 }
586 }
587 // Do not use searcher after this!
588 searcher = null;
589 return hits;
590 }
591
592 public Hits moreLikeThis(String docId, int from, int to) throws ApplicationException {
593 Hits hits = null;
594 ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document> wspDocs = null;
595 IndexSearcher searcher1 = null;
596 IndexSearcher searcher2 = null;
597 try {
598 makeDocumentsSearcherManagerUpToDate();
599 searcher1 = documentsSearcherManager.acquire();
600 String fieldNameDocId = "docId";
601 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId);
602 TopDocs topDocs = searcher1.search(queryDocId, 100000);
603 topDocs.setMaxScore(1);
604 int docID = -1;
605 if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) {
606 docID = topDocs.scoreDocs[0].doc;
607 }
608 makeDocumentsSearcherManagerUpToDate();
609 searcher2 = documentsSearcherManager.acquire();
610 MoreLikeThis mlt = new MoreLikeThis(documentsIndexReader); // TODO documentsIndexReader is ok ?
611 mlt.setFieldNames(new String[]{"content"}); // similarity function works against these fields
612 mlt.setMinWordLen(2);
613 mlt.setBoost(true);
614 Query queryMoreLikeThis = mlt.like(docID);
615 TopDocs moreLikeThisDocs = searcher2.search(queryMoreLikeThis, 10);
616 moreLikeThisDocs.setMaxScore(10);
617 if (moreLikeThisDocs != null) {
618 if (wspDocs == null)
619 wspDocs = new ArrayList<de.mpg.mpiwg.berlin.mpdl.cms.document.Document>();
620 for (int i=0; i<moreLikeThisDocs.scoreDocs.length; i++) {
621 int docIdent = moreLikeThisDocs.scoreDocs[i].doc;
622 Document luceneDoc = searcher2.doc(docIdent);
623 de.mpg.mpiwg.berlin.mpdl.cms.document.Document wspDoc = new de.mpg.mpiwg.berlin.mpdl.cms.document.Document(luceneDoc);
624 wspDocs.add(wspDoc);
625 }
626 }
627 if (wspDocs != null) {
628 hits = new Hits(wspDocs, from, to);
629 hits.setSize(moreLikeThisDocs.scoreDocs.length);
630 }
631 } catch (Exception e) {
632 throw new ApplicationException(e);
633 } finally {
634 try {
635 if (searcher1 != null)
636 documentsSearcherManager.release(searcher1);
637 if (searcher2 != null)
638 documentsSearcherManager.release(searcher2);
639 } catch (IOException e) {
640 // nothing
641 }
642 }
643 // Do not use searcher after this!
644 searcher1 = null;
645 searcher2 = null;
646 return hits;
647 }
648
649 public MetadataRecord getDocMetadata(String docId) throws ApplicationException {
650 MetadataRecord mdRecord = null;
651 Document doc = getDocument(docId);
652 if (doc != null) {
653 String identifier = null;
654 Fieldable identifierField = doc.getFieldable("identifier");
655 if (identifierField != null)
656 identifier = identifierField.stringValue();
657 String uri = null;
658 Fieldable uriField = doc.getFieldable("uri");
659 if (uriField != null)
660 uri = uriField.stringValue();
661 String collectionNames = null;
662 Fieldable collectionNamesField = doc.getFieldable("collectionNames");
663 if (collectionNamesField != null)
664 collectionNames = collectionNamesField.stringValue();
665 String author = null;
666 Fieldable authorField = doc.getFieldable("author");
667 if (authorField != null)
668 author = authorField.stringValue();
669 String title = null;
670 Fieldable titleField = doc.getFieldable("title");
671 if (titleField != null)
672 title = titleField.stringValue();
673 String language = null;
674 Fieldable languageField = doc.getFieldable("language");
675 if (languageField != null)
676 language = languageField.stringValue();
677 else {
678 ConfManagerResultWrapper collectionInfo = CollectionReader.getInstance().getResultWrapper(collectionNames);
679 if (collectionInfo != null) {
680 String mainLang = collectionInfo.getMainLanguage();
681 if (mainLang != null)
682 language = mainLang;
683 }
684 }
685 String publisher = null;
686 Fieldable publisherField = doc.getFieldable("publisher");
687 if (publisherField != null)
688 publisher = publisherField.stringValue();
689 Date yearDate = null;
690 Fieldable dateField = doc.getFieldable("date");
691 if (dateField != null) {
692 String dateStr = dateField.stringValue();
693 if (dateStr != null && !dateStr.equals("")) {
694 dateStr = StringUtils.deresolveXmlEntities(dateStr);
695 String yearStr = new Util().toYearStr(dateStr); // test if possible
696 // etc
697 if (yearStr != null) {
698 yearDate = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
699 }
700 }
701 }
702 String rights = null;
703 Fieldable rightsField = doc.getFieldable("rights");
704 if (rightsField != null)
705 rights = rightsField.stringValue();
706 String license = null;
707 Fieldable licenseField = doc.getFieldable("license");
708 if (licenseField != null)
709 license = licenseField.stringValue();
710 String accessRights = null;
711 Fieldable accessRightsField = doc.getFieldable("accessRights");
712 if (accessRightsField != null)
713 accessRights = accessRightsField.stringValue();
714 String echoId = null;
715 Fieldable echoIdField = doc.getFieldable("echoId");
716 if (echoIdField != null)
717 echoId = echoIdField.stringValue();
718 String echoPageImageDir = null;
719 Fieldable echoPageImageDirField = doc.getFieldable("echoPageImageDir");
720 if (echoPageImageDirField != null)
721 echoPageImageDir = echoPageImageDirField.stringValue();
722 String echoFiguresDir = null;
723 Fieldable echoFiguresDirField = doc.getFieldable("echoFiguresDir");
724 if (echoFiguresDirField != null)
725 echoFiguresDir = echoFiguresDirField.stringValue();
726 String mpiwgDocId = null;
727 Fieldable mpiwgDocIdField = doc.getFieldable("mpiwgDocId");
728 if (mpiwgDocIdField != null)
729 mpiwgDocId = mpiwgDocIdField.stringValue();
730 int pageCount = -1;
731 Fieldable pageCountField = doc.getFieldable("pageCount");
732 if (pageCountField != null) {
733 String pageCountStr = pageCountField.stringValue();
734 pageCount = Integer.valueOf(pageCountStr);
735 }
736 String schemaName = null;
737 Fieldable schemaNameField = doc.getFieldable("schemaName");
738 if (schemaNameField != null)
739 schemaName = schemaNameField.stringValue();
740 Date lastModified = null;
741 Fieldable lastModifiedField = doc.getFieldable("lastModified");
742 if (lastModifiedField != null) {
743 String lastModifiedXSDateStr = lastModifiedField.stringValue();
744 lastModified = new Util().toDate(lastModifiedXSDateStr);
745 }
746 mdRecord = new MetadataRecord();
747 mdRecord.setDocId(docId);
748 mdRecord.setUri(uri);
749 mdRecord.setIdentifier(identifier);
750 mdRecord.setCollectionNames(collectionNames);
751 mdRecord.setCreator(author);
752 mdRecord.setTitle(title);
753 mdRecord.setDate(yearDate);
754 mdRecord.setLanguage(language);
755 mdRecord.setPublisher(publisher);
756 mdRecord.setLicense(license);
757 mdRecord.setRights(rights);
758 mdRecord.setAccessRights(accessRights);
759 mdRecord.setEchoId(echoId);
760 mdRecord.setEchoPageImageDir(echoPageImageDir);
761 mdRecord.setEchoFiguresDir(echoFiguresDir);
762 mdRecord.setMpiwgDocId(mpiwgDocId);
763 mdRecord.setPageCount(pageCount);
764 mdRecord.setSchemaName(schemaName);
765 mdRecord.setLastModified(lastModified);
766 }
767 return mdRecord;
768 }
769
770 public ArrayList<Token> getToken(String fieldName, String value, int count) throws ApplicationException {
771 ArrayList<Token> retToken = null;
772 int counter = 0;
773 TermEnum terms = null;
774 try {
775 if (value == null)
776 value = "";
777 Term term = new Term(fieldName, value);
778 makeIndexReaderUpToDate();
779 terms = documentsIndexReader.terms(term);
780 while (terms != null && fieldName != null && fieldName.equals(terms.term().field()) && counter < count) {
781 if (retToken == null)
782 retToken = new ArrayList<Token>();
783 Term termContent = terms.term();
784 Token token = new Token(termContent);
785 retToken.add(token);
786 counter++;
787 if (!terms.next())
788 break;
789 }
790 } catch (Exception e) {
791 throw new ApplicationException(e);
792 } finally {
793 if (terms != null) {
794 try {
795 terms.close();
796 } catch (IOException e) {
797 // nothing
798 }
799 }
800 }
801 return retToken;
802 }
803
804 public ArrayList<Token> getToken(String docId, String fieldName, String value, int count) throws ApplicationException {
805 ArrayList<Token> retToken = null;
806 if (value == null)
807 value = "";
808 int counter = 0;
809 IndexSearcher searcher = null;
810 try {
811 makeDocumentsSearcherManagerUpToDate();
812 makeIndexReaderUpToDate();
813 searcher = documentsSearcherManager.acquire();
814 Query queryDocId = new TermQuery(new Term("docId", docId));
815 TopDocs topDocs = searcher.search(queryDocId, 1);
816 if (topDocs != null) {
817 int docIdInt = topDocs.scoreDocs[0].doc;
818 TermFreqVector termFreqVector = documentsIndexReader.getTermFreqVector(docIdInt, fieldName);
819 if (termFreqVector != null) {
820 String[] terms = termFreqVector.getTerms();
821 int[] freqs = termFreqVector.getTermFrequencies();
822 boolean success = false;
823 if (terms != null) {
824 retToken = new ArrayList<Token>();
825 for (int i = 0; i < terms.length; i++) {
826 String termStr = terms[i];
827 if (termStr.startsWith(value))
828 success = true;
829 if (success) {
830 counter++;
831 int freq = freqs[i];
832 Term t = new Term(fieldName, termStr);
833 Token tok = new Token(t);
834 tok.setFreq(freq);
835 retToken.add(tok);
836 }
837 if (counter >= count)
838 break;
839 }
840 }
841 }
842 }
843 } catch (Exception e) {
844 throw new ApplicationException(e);
845 } finally {
846 try {
847 if (searcher != null)
848 documentsSearcherManager.release(searcher);
849 } catch (IOException e) {
850 // nothing
851 }
852 }
853 // Do not use searcher after this!
854 searcher = null;
855 return retToken;
856 }
857
858 public void end() throws ApplicationException {
859 try {
860 if (documentsIndexWriter != null)
861 documentsIndexWriter.close();
862 if (nodesIndexWriter != null)
863 nodesIndexWriter.close();
864 if (documentsSearcherManager != null)
865 documentsSearcherManager.close();
866 if (nodesSearcherManager != null)
867 nodesSearcherManager.close();
868 if (documentsIndexReader != null)
869 documentsIndexReader.close();
870 } catch (IOException e) {
871 throw new ApplicationException(e);
872 }
873 }
874
875 private Query buildMorphQuery(Query query, String language) throws ApplicationException {
876 return buildMorphQuery(query, language, false, false);
877 }
878
879 private Query buildMorphQuery(Query query, String language, boolean withAllForms, boolean translate) throws ApplicationException {
880 Query morphQuery = null;
881 if (query instanceof TermQuery) {
882 TermQuery termQuery = (TermQuery) query;
883 morphQuery = buildMorphQuery(termQuery, language, withAllForms, translate);
884 } else if (query instanceof BooleanQuery) {
885 BooleanQuery booleanQuery = (BooleanQuery) query;
886 morphQuery = buildMorphQuery(booleanQuery, language, withAllForms, translate);
887 } else {
888 morphQuery = query; // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ...
889 }
890 return morphQuery;
891 }
892
893 private Query buildMorphQuery(TermQuery inputTermQuery, String fromLang, boolean withAllForms, boolean translate) throws ApplicationException {
894 String[] toLanguages = {"deu", "eng", "fra"}; // TODO
895 String fromLanguage = null;
896 String inputTerm = inputTermQuery.getTerm().text();
897 if (fromLang == null) {
898 String detectedLang = MicrosoftTranslator.detectLanguageCode(inputTerm);
899 if (detectedLang != null)
900 fromLanguage = detectedLang;
901 } else {
902 fromLanguage = fromLang;
903 }
904 LexHandler lexHandler = LexHandler.getInstance();
905 String fieldName = inputTermQuery.getTerm().field();
906 ArrayList<TermQuery> queryTerms = new ArrayList<TermQuery>();
907 if (fieldName != null && fieldName.equals("tokenMorph")) {
908 ArrayList<Lemma> lemmas = lexHandler.getLemmas(inputTerm, "form", fromLanguage, Normalizer.DICTIONARY, true);
909 if (lemmas == null) { // if no lemmas are found then do a query in tokenOrig TODO should this really be done ?
910 if (translate) {
911 String[] terms = {inputTerm};
912 ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
913 for (int i=0; i<translatedTerms.size(); i++) {
914 String translatedTerm = translatedTerms.get(i);
915 Term translatedTermTokenOrig = new Term("tokenOrig", translatedTerm);
916 TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
917 queryTerms.add(translatedTermQueryInTokenOrig);
918 }
919 } else {
920 Term termTokenOrig = new Term("tokenOrig", inputTerm);
921 TermQuery termQueryInTokenOrig = new TermQuery(termTokenOrig);
922 queryTerms.add(termQueryInTokenOrig);
923 }
924 } else {
925 if (translate) {
926 ArrayList<String> morphTerms = new ArrayList<String>();
927 for (int i=0; i<lemmas.size(); i++) {
928 Lemma lemma = lemmas.get(i);
929 if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
930 ArrayList<Form> forms = lemma.getFormsList();
931 for (int j=0; j<forms.size(); j++) {
932 Form form = forms.get(j);
933 String formName = form.getFormName();
934 morphTerms.add(formName);
935 }
936 } else {
937 String lemmaName = lemma.getLemmaName();
938 morphTerms.add(lemmaName);
939 }
940 }
941 String[] morphTermsArray = morphTerms.toArray(new String[morphTerms.size()]);
942 ArrayList<String> translatedMorphTerms = MicrosoftTranslator.translate(morphTermsArray, fromLanguage, toLanguages);
943 for (int i=0; i<translatedMorphTerms.size(); i++) {
944 String translatedMorphTermStr = translatedMorphTerms.get(i);
945 Term translatedMorphTerm = new Term(fieldName, translatedMorphTermStr);
946 TermQuery translatedMorphTermQuery = new TermQuery(translatedMorphTerm);
947 queryTerms.add(translatedMorphTermQuery);
948 }
949 } else {
950 for (int i = 0; i < lemmas.size(); i++) {
951 Lemma lemma = lemmas.get(i);
952 if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
953 ArrayList<Form> forms = lemma.getFormsList();
954 for (int j=0; j<forms.size(); j++) {
955 Form form = forms.get(j);
956 Term formTerm = new Term(fieldName, form.getFormName());
957 TermQuery morphTermQuery = new TermQuery(formTerm);
958 queryTerms.add(morphTermQuery);
959 }
960 } else {
961 Term lemmaTerm = new Term(fieldName, lemma.getLemmaName());
962 TermQuery morphTermQuery = new TermQuery(lemmaTerm);
963 queryTerms.add(morphTermQuery);
964 }
965 }
966 }
967 }
968 } else {
969 // if it is not the morph field then do a normal query
970 if (translate) {
971 String inputTermQueryField = inputTermQuery.getTerm().field();
972 String inputTermQueryStr = inputTermQuery.getTerm().text();
973 String[] terms = {inputTermQueryStr};
974 ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
975 for (int i=0; i<translatedTerms.size(); i++) {
976 String translatedTerm = translatedTerms.get(i);
977 Term translatedTermTokenOrig = new Term(inputTermQueryField, translatedTerm);
978 TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
979 queryTerms.add(translatedTermQueryInTokenOrig);
980 }
981 } else {
982 queryTerms.add(inputTermQuery);
983 }
984 //TODO ?? perhaps other fields should also be queried morphological e.g. title etc.
985 }
986 Query retQuery = buildBooleanShouldQuery(queryTerms);
987 return retQuery;
988 }
989
990 private Query buildBooleanShouldQuery(ArrayList<TermQuery> queryTerms) throws ApplicationException {
991 BooleanQuery retBooleanQuery = new BooleanQuery();
992 for (int i = 0; i < queryTerms.size(); i++) {
993 TermQuery termQuery = queryTerms.get(i);
994 retBooleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
995 }
996 return retBooleanQuery;
997 }
998
999 private Query buildMorphQuery(BooleanQuery query, String language, boolean withAllForms, boolean translate) throws ApplicationException {
1000 BooleanQuery morphBooleanQuery = new BooleanQuery();
1001 BooleanClause[] booleanClauses = query.getClauses();
1002 for (int i = 0; i < booleanClauses.length; i++) {
1003 BooleanClause boolClause = booleanClauses[i];
1004 Query q = boolClause.getQuery();
1005 Query morphQuery = buildMorphQuery(q, language, withAllForms, translate);
1006 BooleanClause.Occur occur = boolClause.getOccur();
1007 morphBooleanQuery.add(morphQuery, occur);
1008 }
1009 return morphBooleanQuery;
1010 }
1011
1012 public ArrayList<String> fetchTerms(String queryStr) throws ApplicationException {
1013 ArrayList<String> terms = null;
1014 String defaultQueryFieldName = "tokenOrig";
1015 try {
1016 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr);
1017 terms = fetchTerms(query);
1018 } catch (Exception e) {
1019 throw new ApplicationException(e);
1020 }
1021 return terms;
1022 }
1023
1024 /**
1025 * recursively fetch all terms of the query
1026 *
1027 * @param query
1028 * @return
1029 */
1030 private ArrayList<String> fetchTerms(Query query) throws ApplicationException {
1031 ArrayList<String> terms = new ArrayList<String>();
1032 if (query instanceof TermQuery) {
1033 TermQuery termQuery = (TermQuery) query;
1034 String termQueryStr = termQuery.getTerm().text();
1035 terms.add(termQueryStr);
1036 } else if (query instanceof BooleanQuery) {
1037 BooleanQuery booleanQuery = (BooleanQuery) query;
1038 terms = fetchTerms(booleanQuery);
1039 } else {
1040 String queryStr = query.toString();
1041 terms.add(queryStr); // all other cases: PrefixQuery, PhraseQuery,
1042 // FuzzyQuery, TermRangeQuery, ...
1043 }
1044 return terms;
1045 }
1046
1047 private ArrayList<String> fetchTerms(BooleanQuery query) throws ApplicationException {
1048 ArrayList<String> terms = new ArrayList<String>();
1049 BooleanClause[] booleanClauses = query.getClauses();
1050 for (int i = 0; i < booleanClauses.length; i++) {
1051 BooleanClause boolClause = booleanClauses[i];
1052 Query q = boolClause.getQuery();
1053 ArrayList<String> qTerms = fetchTerms(q);
1054 BooleanClause.Occur occur = boolClause.getOccur();
1055 if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST)
1056 terms.addAll(qTerms);
1057 }
1058 return terms;
1059 }
1060
1061 public ArrayList<String> fetchTerms(String queryStr, String language) throws ApplicationException {
1062 ArrayList<String> terms = null;
1063 String defaultQueryFieldName = "tokenOrig";
1064 try {
1065 Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr);
1066 terms = fetchTerms(query, language);
1067 } catch (Exception e) {
1068 throw new ApplicationException(e);
1069 }
1070 return terms;
1071 }
1072
1073 /**
1074 * recursively fetch all terms of the query
1075 *
1076 * @param query
1077 * @return
1078 */
1079 private ArrayList<String> fetchTerms(Query query, String language) throws ApplicationException {
1080 ArrayList<String> terms = new ArrayList<String>();
1081 if (query instanceof TermQuery) {
1082 TermQuery termQuery = (TermQuery) query;
1083 terms = fetchTerms(termQuery, language);
1084 } else if (query instanceof BooleanQuery) {
1085 BooleanQuery booleanQuery = (BooleanQuery) query;
1086 terms = fetchTerms(booleanQuery, language);
1087 } else {
1088 String queryStr = query.toString();
1089 terms.add(queryStr);
1090 // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ...
1091 }
1092 return terms;
1093 }
1094
1095 private ArrayList<String> fetchTerms(TermQuery termQuery, String language) throws ApplicationException {
1096 if (language == null)
1097 language = "eng";
1098 ArrayList<String> terms = new ArrayList<String>();
1099 Term termQueryTerm = termQuery.getTerm();
1100 String term = termQuery.getTerm().text();
1101 String fieldName = termQueryTerm.field();
1102 if (fieldName != null && fieldName.equals("tokenMorph")) {
1103 LexHandler lexHandler = LexHandler.getInstance();
1104 ArrayList<Lemma> lemmas = lexHandler.getLemmas(term, "form", language, Normalizer.DICTIONARY, true);
1105 // TODO : language über den translator service holen
1106 if (lemmas == null) {
1107 terms.add(term);
1108 } else {
1109 for (int i = 0; i < lemmas.size(); i++) {
1110 Lemma lemma = lemmas.get(i);
1111 ArrayList<Form> forms = lemma.getFormsList();
1112 for (int j = 0; j < forms.size(); j++) {
1113 Form form = forms.get(j);
1114 String formName = form.getFormName();
1115 terms.add(formName);
1116 }
1117 }
1118 }
1119 } else {
1120 terms.add(term);
1121 }
1122 return terms;
1123 }
1124
1125 private ArrayList<String> fetchTerms(BooleanQuery query, String language) throws ApplicationException {
1126 ArrayList<String> terms = new ArrayList<String>();
1127 BooleanClause[] booleanClauses = query.getClauses();
1128 for (int i = 0; i < booleanClauses.length; i++) {
1129 BooleanClause boolClause = booleanClauses[i];
1130 Query q = boolClause.getQuery();
1131 ArrayList<String> qTerms = fetchTerms(q, language);
1132 BooleanClause.Occur occur = boolClause.getOccur();
1133 if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST)
1134 terms.addAll(qTerms);
1135 }
1136 return terms;
1137 }
1138
1139 private Document getDocument(String docId) throws ApplicationException {
1140 Document doc = null;
1141 IndexSearcher searcher = null;
1142 try {
1143 makeDocumentsSearcherManagerUpToDate();
1144 searcher = documentsSearcherManager.acquire();
1145 String fieldNameDocId = "docId";
1146 Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId);
1147 TopDocs topDocs = searcher.search(queryDocId, 100000);
1148 topDocs.setMaxScore(1);
1149 if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) {
1150 int docID = topDocs.scoreDocs[0].doc;
1151 FieldSelector docFieldSelector = getDocFieldSelector();
1152 doc = searcher.doc(docID, docFieldSelector);
1153 }
1154 searcher.close();
1155 } catch (Exception e) {
1156 throw new ApplicationException(e);
1157 } finally {
1158 try {
1159 if (searcher != null)
1160 documentsSearcherManager.release(searcher);
1161 } catch (IOException e) {
1162 // nothing
1163 }
1164 }
1165 // Do not use searcher after this!
1166 searcher = null;
1167 return doc;
1168 }
1169
1170 private IndexWriter getDocumentsWriter() throws ApplicationException {
1171 IndexWriter writer = null;
1172 String luceneDocsDirectoryStr = Constants.getInstance().getLuceneDocumentsDir();
1173 File luceneDocsDirectory = new File(luceneDocsDirectoryStr);
1174 try {
1175 Map<String, Analyzer> documentsFieldAnalyzers = new HashMap<String, Analyzer>();
1176 documentsFieldAnalyzers.put("docId", new KeywordAnalyzer());
1177 documentsFieldAnalyzers.put("identifier", new KeywordAnalyzer());
1178 documentsFieldAnalyzers.put("uri", new KeywordAnalyzer());
1179 documentsFieldAnalyzers.put("collectionNames", new StandardAnalyzer(Version.LUCENE_35));
1180 documentsFieldAnalyzers.put("author", new StandardAnalyzer(Version.LUCENE_35));
1181 documentsFieldAnalyzers.put("title", new StandardAnalyzer(Version.LUCENE_35));
1182 documentsFieldAnalyzers.put("language", new StandardAnalyzer(Version.LUCENE_35));
1183 documentsFieldAnalyzers.put("publisher", new StandardAnalyzer(Version.LUCENE_35));
1184 documentsFieldAnalyzers.put("date", new StandardAnalyzer(Version.LUCENE_35));
1185 documentsFieldAnalyzers.put("subject", new StandardAnalyzer(Version.LUCENE_35));
1186 documentsFieldAnalyzers.put("rights", new StandardAnalyzer(Version.LUCENE_35));
1187 documentsFieldAnalyzers.put("license", new StandardAnalyzer(Version.LUCENE_35));
1188 documentsFieldAnalyzers.put("accessRights", new StandardAnalyzer(Version.LUCENE_35));
1189 documentsFieldAnalyzers.put("echoId", new KeywordAnalyzer());
1190 documentsFieldAnalyzers.put("echoPageImageDir", new KeywordAnalyzer());
1191 documentsFieldAnalyzers.put("echoFiguresDir", new KeywordAnalyzer());
1192 documentsFieldAnalyzers.put("mpiwgDocId", new KeywordAnalyzer());
1193 documentsFieldAnalyzers.put("type", new KeywordAnalyzer()); // e.g. mime type "text/xml"
1194 documentsFieldAnalyzers.put("pageCount", new KeywordAnalyzer());
1195 documentsFieldAnalyzers.put("schemaName", new StandardAnalyzer(Version.LUCENE_35));
1196 documentsFieldAnalyzers.put("lastModified", new KeywordAnalyzer());
1197 documentsFieldAnalyzers.put("tokenOrig", new StandardAnalyzer(Version.LUCENE_35));
1198 documentsFieldAnalyzers.put("tokenReg", new StandardAnalyzer(Version.LUCENE_35));
1199 documentsFieldAnalyzers.put("tokenNorm", new StandardAnalyzer(Version.LUCENE_35));
1200 documentsFieldAnalyzers.put("tokenMorph", new StandardAnalyzer(Version.LUCENE_35));
1201 documentsFieldAnalyzers.put("xmlContent", new StandardAnalyzer(Version.LUCENE_35));
1202 documentsFieldAnalyzers.put("content", new StandardAnalyzer(Version.LUCENE_35));
1203 documentsPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), documentsFieldAnalyzers);
1204 IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, documentsPerFieldAnalyzer);
1205 conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
1206 conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default
1207 FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory);
1208 writer = new IndexWriter(fsDirectory, conf);
1209 writer.commit(); // when directory is empty this creates init files
1210 } catch (IOException e) {
1211 throw new ApplicationException(e);
1212 }
1213 return writer;
1214 }
1215
1216 private IndexWriter getNodesWriter() throws ApplicationException {
1217 IndexWriter writer = null;
1218 String luceneNodesDirectoryStr = Constants.getInstance().getLuceneNodesDir();
1219 File luceneNodesDirectory = new File(luceneNodesDirectoryStr);
1220 try {
1221 Map<String, Analyzer> nodesFieldAnalyzers = new HashMap<String, Analyzer>();
1222 nodesFieldAnalyzers.put("docId", new KeywordAnalyzer());
1223 nodesFieldAnalyzers.put("language", new StandardAnalyzer(Version.LUCENE_35)); // language (through xml:id): e.g. "lat"
1224 nodesFieldAnalyzers.put("pageNumber", new KeywordAnalyzer()); // page number (through element pb): e.g. "13"
1225 nodesFieldAnalyzers.put("lineNumber", new KeywordAnalyzer()); // line number on the page (through element lb): e.g. "17"
1226 nodesFieldAnalyzers.put("elementName", new KeywordAnalyzer()); // element name: e.g. "tei:s"
1227 nodesFieldAnalyzers.put("elementDocPosition", new KeywordAnalyzer()); // absolute position in document: e.g. "4711"
1228 nodesFieldAnalyzers.put("elementPosition", new KeywordAnalyzer()); // position in parent node (in relation to other nodes of the same name): e.g. "5"
1229 nodesFieldAnalyzers.put("elementAbsolutePosition", new KeywordAnalyzer()); // absolute position in document (in relation to other nodes of the same name): e.g. "213"
1230 nodesFieldAnalyzers.put("elementPagePosition", new KeywordAnalyzer()); // position in relation to other nodes of the same name: e.g. "213"
1231 nodesFieldAnalyzers.put("xmlId", new KeywordAnalyzer()); // xml id: e.g. "4711bla"
1232 nodesFieldAnalyzers.put("xpath", new KeywordAnalyzer()); // xpath: e.g. "/echo[1]/text[1]/p[1]/s[5]"
1233 nodesFieldAnalyzers.put("tokenOrig", new StandardAnalyzer(Version.LUCENE_35));
1234 nodesFieldAnalyzers.put("tokenReg", new StandardAnalyzer(Version.LUCENE_35));
1235 nodesFieldAnalyzers.put("tokenNorm", new StandardAnalyzer(Version.LUCENE_35));
1236 nodesFieldAnalyzers.put("tokenMorph", new StandardAnalyzer(Version.LUCENE_35));
1237 nodesFieldAnalyzers.put("xmlContent", new StandardAnalyzer(Version.LUCENE_35));
1238 nodesPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), nodesFieldAnalyzers);
1239 IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, nodesPerFieldAnalyzer);
1240 conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
1241 conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default
1242 FSDirectory fsDirectory = FSDirectory.open(luceneNodesDirectory);
1243 writer = new IndexWriter(fsDirectory, conf);
1244 writer.commit();
1245 } catch (IOException e) {
1246 throw new ApplicationException(e);
1247 }
1248 return writer;
1249 }
1250
1251 private Sort buildSort(String[] sortFieldNames, String type) {
1252 Sort sort = new Sort();
1253 ArrayList<SortField> sortFields = new ArrayList<SortField>();
1254 for (int i=0; i<sortFieldNames.length; i++) {
1255 String sortFieldName = sortFieldNames[i];
1256 int sortFieldType = getDocSortFieldType(sortFieldName);
1257 if (type.equals("node"))
1258 sortFieldType = getNodeSortFieldType(sortFieldName);
1259 String realSortFieldName = getDocSortFieldName(sortFieldName);
1260 SortField sortField = new SortField(realSortFieldName, sortFieldType);
1261 sortFields.add(sortField);
1262 }
1263 if (sortFieldNames.length == 1) {
1264 SortField sortField1 = sortFields.get(0);
1265 sort.setSort(sortField1);
1266 } else if (sortFieldNames.length == 2) {
1267 SortField sortField1 = sortFields.get(0);
1268 SortField sortField2 = sortFields.get(1);
1269 sort.setSort(sortField1, sortField2);
1270 } else if (sortFieldNames.length == 2) {
1271 SortField sortField1 = sortFields.get(0);
1272 SortField sortField2 = sortFields.get(1);
1273 SortField sortField3 = sortFields.get(2);
1274 sort.setSort(sortField1, sortField2, sortField3);
1275 }
1276 return sort;
1277 }
1278
1279 private String getDocSortFieldName(String fieldName) {
1280 String sortFieldName = fieldName + "Sorted";
1281 return sortFieldName;
1282 }
1283
1284 private int getDocSortFieldType(String fieldName) {
1285 int type = SortField.STRING;
1286 if (fieldName.equals("lastModified"))
1287 type = SortField.LONG;
1288 return type;
1289 }
1290
1291 private int getNodeSortFieldType(String fieldName) {
1292 int type = SortField.STRING;
1293 if (fieldName.equals("pageNumber") || fieldName.equals("lineNumber") || fieldName.equals("elementDocPosition"))
1294 type = SortField.INT;
1295 return type;
1296 }
1297
1298 private FieldSelector getDocFieldSelector() {
1299 HashSet<String> fields = new HashSet<String>();
1300 fields.add("docId");
1301 fields.add("identifier");
1302 fields.add("uri");
1303 fields.add("collectionNames");
1304 fields.add("author");
1305 fields.add("title");
1306 fields.add("language");
1307 fields.add("publisher");
1308 fields.add("date");
1309 fields.add("subject");
1310 fields.add("rights");
1311 fields.add("license");
1312 fields.add("echoId");
1313 fields.add("echoPageImageDir");
1314 fields.add("echoFiguresDir");
1315 fields.add("mpiwgDocId");
1316 fields.add("type");
1317 fields.add("pageCount");
1318 fields.add("schemaName");
1319 fields.add("lastModified");
1320 fields.add("content");
1321 FieldSelector fieldSelector = new SetBasedFieldSelector(fields, fields);
1322 return fieldSelector;
1323 }
1324
1325 private FieldSelector getNodeFieldSelector() {
1326 HashSet<String> fields = new HashSet<String>();
1327 fields.add("docId");
1328 fields.add("language");
1329 fields.add("pageNumber");
1330 fields.add("lineNumber");
1331 fields.add("elementName");
1332 fields.add("elementDocPosition");
1333 fields.add("elementPosition");
1334 fields.add("elementAbsolutePosition");
1335 fields.add("elementPagePosition");
1336 fields.add("xmlId");
1337 fields.add("xpath");
1338 fields.add("xmlContent");
1339 fields.add("xmlContentTokenized");
1340 FieldSelector fieldSelector = new SetBasedFieldSelector(fields, fields);
1341 return fieldSelector;
1342 }
1343
1344 private SearcherManager getNewSearcherManager(IndexWriter indexWriter) throws ApplicationException {
1345 SearcherManager searcherManager = null;
1346 try {
1347 searcherManager = new SearcherManager(indexWriter, true, null, null);
1348 } catch (IOException e) {
1349 throw new ApplicationException(e);
1350 }
1351 return searcherManager;
1352 }
1353
1354 private IndexReader getDocumentsReader() throws ApplicationException {
1355 IndexReader reader = null;
1356 String luceneDocsDirectoryStr = Constants.getInstance().getLuceneDocumentsDir();
1357 File luceneDocsDirectory = new File(luceneDocsDirectoryStr);
1358 try {
1359 FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory);
1360 reader = IndexReader.open(fsDirectory, true);
1361 } catch (IOException e) {
1362 throw new ApplicationException(e);
1363 }
1364 return reader;
1365 }
1366
1367 private void makeIndexReaderUpToDate() throws ApplicationException {
1368 try {
1369 boolean isCurrent = documentsIndexReader.isCurrent();
1370 if (!isCurrent) {
1371 documentsIndexReader = IndexReader.openIfChanged(documentsIndexReader);
1372 }
1373 } catch (IOException e) {
1374 throw new ApplicationException(e);
1375 }
1376 }
1377
1378 private void makeDocumentsSearcherManagerUpToDate() throws ApplicationException {
1379 try {
1380 boolean isCurrent = documentsSearcherManager.isSearcherCurrent();
1381 if (!isCurrent) {
1382 documentsSearcherManager.maybeReopen();
1383 }
1384 } catch (IOException e) {
1385 throw new ApplicationException(e);
1386 }
1387 }
1388
1389 private void makeNodesSearcherManagerUpToDate() throws ApplicationException {
1390 try {
1391 boolean isCurrent = nodesSearcherManager.isSearcherCurrent();
1392 if (!isCurrent) {
1393 nodesSearcherManager.maybeReopen();
1394 }
1395 } catch (IOException e) {
1396 throw new ApplicationException(e);
1397 }
1398 }
1399
1400 private String toTokenizedXmlString(String xmlStr, String language) throws ApplicationException {
1401 String xmlPre = "<tokenized xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:de=\"http://www.mpiwg-berlin.mpg.de/ns/de/1.0/\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">";
1402 String xmlPost = "</tokenized>";
1403 String xmlStrTmp = xmlPre + xmlStr + xmlPost;
1404 StringReader xmlInputStringReader = new StringReader(xmlStrTmp);
1405 XmlTokenizer xmlTokenizer = new XmlTokenizer(xmlInputStringReader);
1406 xmlTokenizer.setLanguage(language);
1407 String[] outputOptions = { "withLemmas" };
1408 xmlTokenizer.setOutputOptions(outputOptions);
1409 xmlTokenizer.tokenize();
1410 String result = xmlTokenizer.getXmlResult();
1411 return result;
1412 }
1413
1414 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
1415 try {
1416 WordContentHandler wordContentHandler = new WordContentHandler();
1417 XMLReader xmlParser = new SAXParser();
1418 xmlParser.setContentHandler(wordContentHandler);
1419 StringReader strReader = new StringReader(xmlStr);
1420 InputSource inputSource = new InputSource(strReader);
1421 xmlParser.parse(inputSource);
1422 String result = wordContentHandler.getResult();
1423 return result;
1424 } catch (SAXException e) {
1425 throw new ApplicationException(e);
1426 } catch (IOException e) {
1427 throw new ApplicationException(e);
1428 }
1429 }
1430
1431 private String escapeLuceneChars(String inputStr) {
1432 String luceneCharsStr = "+-&|!(){}[]^~*?:\\"; // Lucene escape symbols
1433 StringBuilder retStrBuilder = new StringBuilder();
1434 for (int i = 0; i < inputStr.length(); i++) {
1435 char c = inputStr.charAt(i);
1436 if (luceneCharsStr.contains(String.valueOf(c)))
1437 retStrBuilder.append("\\");
1438 retStrBuilder.append(c);
1439 }
1440 return retStrBuilder.toString();
1441 }
1442
1443 /**
1444 * sorgt für sinnvolle satzanfänge
1445 *
1446 * @param fragment
1447 */
1448 private String checkHitFragment(String fragment) {
1449 if (fragment.startsWith(".")
1450 || fragment.startsWith(":")
1451 || fragment.startsWith(",")
1452 || fragment.startsWith("-")
1453 || fragment.startsWith(";")
1454 || fragment.startsWith("?")
1455 || fragment.startsWith(")")
1456 || fragment.startsWith("!")) {
1457 fragment = fragment.substring(1, fragment.length());
1458 // finds first occurence of a given string out.println("first index of point : "+StringUtils.indexOfAny(fragment, "."));
1459 }
1460 return fragment;
1461 }
1462
1463 }