annotate src/de/mpiwg/dwinter/lucencetools/analyzer/.svn/text-base/LanguageAnalyzers.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 /* */ package de.mpiwg.dwinter.lucencetools.analyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
2 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.donatusAnalyzer.DonatusAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 /* */ import de.mpiwg.dwinter.lucencetools.documents.MorphDocument;
dc7622afcfea initial
dwinter
parents:
diff changeset
5 /* */ import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
6 /* */ import java.io.PrintStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 /* */ import java.util.ArrayList;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 /* */ import java.util.HashMap;
dc7622afcfea initial
dwinter
parents:
diff changeset
9 /* */ import java.util.Iterator;
dc7622afcfea initial
dwinter
parents:
diff changeset
10 /* */ import org.apache.lucene.document.Document;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 /* */ import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
12 /* */ import org.apache.lucene.index.IndexReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
13 /* */ import org.apache.lucene.index.IndexWriter;
dc7622afcfea initial
dwinter
parents:
diff changeset
14 /* */ import org.apache.lucene.index.Term;
dc7622afcfea initial
dwinter
parents:
diff changeset
15 /* */ import org.apache.lucene.search.Collector;
dc7622afcfea initial
dwinter
parents:
diff changeset
16 /* */ import org.apache.lucene.search.IndexSearcher;
dc7622afcfea initial
dwinter
parents:
diff changeset
17 /* */ import org.apache.lucene.search.TermQuery;
dc7622afcfea initial
dwinter
parents:
diff changeset
18 /* */ import org.apache.lucene.search.TopScoreDocCollector;
dc7622afcfea initial
dwinter
parents:
diff changeset
19 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
20 /* */ public class LanguageAnalyzers extends ArrayList<LanguageAnalyzer>
dc7622afcfea initial
dwinter
parents:
diff changeset
21 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
22 /* */ private static final long serialVersionUID = 2L;
dc7622afcfea initial
dwinter
parents:
diff changeset
23 /* */ private static final int MAX_HITS_PER_PAGE = 10;
dc7622afcfea initial
dwinter
parents:
diff changeset
24 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
25 /* */ public void optimize()
dc7622afcfea initial
dwinter
parents:
diff changeset
26 /* */ throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
27 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
28 /* 43 */ Iterator indexWriter = iterator();
dc7622afcfea initial
dwinter
parents:
diff changeset
29 /* 44 */ while (indexWriter.hasNext())
dc7622afcfea initial
dwinter
parents:
diff changeset
30 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
31 /* 46 */ ((LanguageAnalyzer)indexWriter.next()).writer.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
32 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
33 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
34 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
35 /* */ public void close()
dc7622afcfea initial
dwinter
parents:
diff changeset
36 /* */ throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
37 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
38 /* 57 */ Iterator indexWriter = iterator();
dc7622afcfea initial
dwinter
parents:
diff changeset
39 /* 58 */ while (indexWriter.hasNext())
dc7622afcfea initial
dwinter
parents:
diff changeset
40 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
41 /* 60 */ ((LanguageAnalyzer)indexWriter.next()).writer.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
42 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
43 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
44 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
45 /* */ public void addDocument(Document document, String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
46 /* */ throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
47 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
48 /* 74 */ LanguageAnalyzer analyzer = getAnalyzer(lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
49 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
50 /* 76 */ if (analyzer != null)
dc7622afcfea initial
dwinter
parents:
diff changeset
51 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
52 /* 78 */ analyzer.writer.addDocument(document);
dc7622afcfea initial
dwinter
parents:
diff changeset
53 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
54 /* 81 */ if (!DonatusAnalyzer.class.isInstance(analyzer.analyzer))
dc7622afcfea initial
dwinter
parents:
diff changeset
55 /* */ return;
dc7622afcfea initial
dwinter
parents:
diff changeset
56 /* 83 */ StringBuffer mp = DonatusAnalyzer.morphFile;
dc7622afcfea initial
dwinter
parents:
diff changeset
57 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
58 /* 86 */ LanguageAnalyzer mpAnalyzer = getAnalyzer("morph");
dc7622afcfea initial
dwinter
parents:
diff changeset
59 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
60 /* 88 */ mpAnalyzer.writer.addDocument(MorphDocument.Document(mp.toString(), document.get("cleanedPath")));
dc7622afcfea initial
dwinter
parents:
diff changeset
61 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
62 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
63 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
64
dc7622afcfea initial
dwinter
parents:
diff changeset
65 System.err.println("addDocument: cannot add the document language " + lang + " not known!");
dc7622afcfea initial
dwinter
parents:
diff changeset
66 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
67 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
68 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
69 /* */ public LanguageAnalyzer getAnalyzer(String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
70 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
71 /* 106 */ for (LanguageAnalyzer analyzer : this)
dc7622afcfea initial
dwinter
parents:
diff changeset
72 /* 107 */ if (analyzer.lang.equals(lang))
dc7622afcfea initial
dwinter
parents:
diff changeset
73 /* 108 */ return analyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
74 /* 109 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
75 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
76 /* */ public void deleteDocuments(Term term) throws CorruptIndexException, IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
77 /* 112 */ for (LanguageAnalyzer analyzer : this)
dc7622afcfea initial
dwinter
parents:
diff changeset
78 /* 113 */ analyzer.writer.deleteDocuments(term);
dc7622afcfea initial
dwinter
parents:
diff changeset
79 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
80 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
81 /* */ public void deleteDocuments(TermQuery query) throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
82 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
83 /* 118 */ for (LanguageAnalyzer analyzer : this)
dc7622afcfea initial
dwinter
parents:
diff changeset
84 /* 119 */ analyzer.writer.deleteDocuments(query);
dc7622afcfea initial
dwinter
parents:
diff changeset
85 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
86 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
87 /* */ public void deleteDocument(int id) throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
88 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
89 /* 124 */ for (LanguageAnalyzer analyzer : this)
dc7622afcfea initial
dwinter
parents:
diff changeset
90 /* 125 */ analyzer.reader.deleteDocument(id);
dc7622afcfea initial
dwinter
parents:
diff changeset
91 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
92 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
93 /* */ public HashMap<String, Collector> search(TermQuery query) throws IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
94 /* 129 */ HashMap collectors = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
95 /* 130 */ for (LanguageAnalyzer analyzer : this)
dc7622afcfea initial
dwinter
parents:
diff changeset
96 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
97 /* 132 */ if (analyzer.searcher == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
98 /* */ continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
99 /* 134 */ Collector col = TopScoreDocCollector.create(10, false);
dc7622afcfea initial
dwinter
parents:
diff changeset
100 /* 135 */ analyzer.searcher.search(query, col);
dc7622afcfea initial
dwinter
parents:
diff changeset
101 /* 136 */ collectors.put(analyzer.lang, col);
dc7622afcfea initial
dwinter
parents:
diff changeset
102 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
103 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
104 /* 140 */ return collectors;
dc7622afcfea initial
dwinter
parents:
diff changeset
105 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
106 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
107
dc7622afcfea initial
dwinter
parents:
diff changeset
108 /* Location: /private/tmp/fulltextIndexer.jar
dc7622afcfea initial
dwinter
parents:
diff changeset
109 * Qualified Name: de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers
dc7622afcfea initial
dwinter
parents:
diff changeset
110 * JD-Core Version: 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
111 */