annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/ProcessFileThread.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors;
dc7622afcfea initial
dwinter
parents:
diff changeset
2 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* */ import java.io.BufferedReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
5 import java.io.FileNotFoundException;
dc7622afcfea initial
dwinter
parents:
diff changeset
6 import java.io.FileReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 import java.io.Reader;
dc7622afcfea initial
dwinter
parents:
diff changeset
9 import java.io.UnsupportedEncodingException;
dc7622afcfea initial
dwinter
parents:
diff changeset
10 import java.net.URL;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 import java.util.ArrayList;
dc7622afcfea initial
dwinter
parents:
diff changeset
12 import java.util.Arrays;
dc7622afcfea initial
dwinter
parents:
diff changeset
13 import java.util.Date;
dc7622afcfea initial
dwinter
parents:
diff changeset
14 import java.util.HashMap;
dc7622afcfea initial
dwinter
parents:
diff changeset
15 import java.util.regex.Matcher;
dc7622afcfea initial
dwinter
parents:
diff changeset
16 import java.util.regex.Pattern;
dc7622afcfea initial
dwinter
parents:
diff changeset
17
dc7622afcfea initial
dwinter
parents:
diff changeset
18 import javax.xml.parsers.ParserConfigurationException;
dc7622afcfea initial
dwinter
parents:
diff changeset
19
dc7622afcfea initial
dwinter
parents:
diff changeset
20 import org.apache.lucene.document.DateTools;
dc7622afcfea initial
dwinter
parents:
diff changeset
21 import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
22 import org.apache.lucene.index.Term;
dc7622afcfea initial
dwinter
parents:
diff changeset
23 import org.apache.lucene.search.Collector;
dc7622afcfea initial
dwinter
parents:
diff changeset
24 import org.apache.lucene.search.ScoreDoc;
dc7622afcfea initial
dwinter
parents:
diff changeset
25 import org.apache.lucene.search.TermQuery;
dc7622afcfea initial
dwinter
parents:
diff changeset
26 import org.apache.lucene.search.TopDocs;
dc7622afcfea initial
dwinter
parents:
diff changeset
27 import org.apache.lucene.search.TopScoreDocCollector;
dc7622afcfea initial
dwinter
parents:
diff changeset
28 import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
29 import org.apache.xmlrpc.XmlRpcException;
dc7622afcfea initial
dwinter
parents:
diff changeset
30 import org.apache.xmlrpc.client.XmlRpcClient;
dc7622afcfea initial
dwinter
parents:
diff changeset
31 import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
dc7622afcfea initial
dwinter
parents:
diff changeset
32 import org.xml.sax.InputSource;
dc7622afcfea initial
dwinter
parents:
diff changeset
33 import org.xml.sax.SAXException;
dc7622afcfea initial
dwinter
parents:
diff changeset
34 import org.xml.sax.XMLReader;
dc7622afcfea initial
dwinter
parents:
diff changeset
35
dc7622afcfea initial
dwinter
parents:
diff changeset
36 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
dc7622afcfea initial
dwinter
parents:
diff changeset
37
dc7622afcfea initial
dwinter
parents:
diff changeset
38 import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta;
dc7622afcfea initial
dwinter
parents:
diff changeset
39 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
dc7622afcfea initial
dwinter
parents:
diff changeset
40 import de.mpiwg.dwinter.lucencetools.documents.FileDocument;
dc7622afcfea initial
dwinter
parents:
diff changeset
41 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
42 /* */ public class ProcessFileThread extends Thread
dc7622afcfea initial
dwinter
parents:
diff changeset
43 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
44 /* */ private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)";
dc7622afcfea initial
dwinter
parents:
diff changeset
45 /* */ private static final int DELETED_WRONG_LANGUAGE = 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
46 /* */ private static final int DELETED_OLD_VERSION = 2;
dc7622afcfea initial
dwinter
parents:
diff changeset
47 /* */ private static final int NEW_FILE = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
48 /* */ private static final int FILE_EXISTS = -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
49 /* */ protected File docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
50 /* */ protected File index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
51 /* 86 */ protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
52 /* 87 */ protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" }));
dc7622afcfea initial
dwinter
parents:
diff changeset
53 /* 88 */ private HashMap<String, String> textLanguage = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
54 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
55 /* 90 */ protected HashMap<String, String> languageToISO = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
56 /* 91 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
dc7622afcfea initial
dwinter
parents:
diff changeset
57 /* 92 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
58 /* 93 */ private int counter = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
59 /* */ protected String languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
60 /* 95 */ protected boolean indexMetaPriority = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
61 /* 96 */ protected boolean deduceFromFolderPriority = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
62 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
63 /* 101 */ private String specialMode = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
64 /* 102 */ public boolean done = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
65 /* */ private File processThisFile;
dc7622afcfea initial
dwinter
parents:
diff changeset
66 /* 105 */ private String mode = "new"; // if mode is not add, then only modified files and new files will be added.
dc7622afcfea initial
dwinter
parents:
diff changeset
67 /* */ private String mdProviderUrl;
dc7622afcfea initial
dwinter
parents:
diff changeset
68 /* 107 */ private String preferedLanguage = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
69 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
/**
 * Creates a worker that knows the document root and the index directory.
 * The text-language table and the preferred language are left at their defaults.
 *
 * The declared checked exceptions are not raised by any statement in this body; they
 * are part of the published signature and are kept so that existing callers compile.
 *
 * @param docDir                  folder containing the documents
 * @param index_dir               directory of the index
 * @param languageFileName        name of the language file, may be null
 * @param processThisFile         the file handled by {@link #run()}
 * @param mdProviderUrl           URL of the XML-RPC metadata provider, may be null
 * @param languageToISO           language name to language code table
 * @param supportedLanguageFolder folder name to language table
 */
public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException {
    // Locations
    this.docDir = docDir;
    this.index_dir = index_dir;

    // Work item and metadata source
    this.processThisFile = processThisFile;
    this.mdProviderUrl = mdProviderUrl;

    // Language resolution
    this.languageFileName = languageFileName;
    this.languageToISO = languageToISO;
    this.supportedLanguageFolder = supportedLanguageFolder;
}
dc7622afcfea initial
dwinter
parents:
diff changeset
80 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
/**
 * Creates a worker that uses analyzers and a text-language table supplied by the caller.
 * The document root and index directory are not set by this constructor.
 *
 * @param languageAnalyzers2      analyzers to use instead of a freshly created set
 * @param file                    the file handled by {@link #run()}
 * @param lfn                     name of the language file, may be null
 * @param tl                      text id to language name table, may be null (loaded lazily then)
 * @param mdProviderUrl           URL of the XML-RPC metadata provider, may be null
 * @param preferedLanguage        fallback language, may be null or empty
 * @param languageToISO           language name to language code table
 * @param supportedLanguageFolder folder name to language table
 */
public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder)
{
    // Work item, analyzers and metadata source
    this.processThisFile = file;
    this.languageAnalyzers = languageAnalyzers2;
    this.mdProviderUrl = mdProviderUrl;

    // Language resolution
    this.languageFileName = lfn;
    this.textLanguage = tl;
    this.preferedLanguage = preferedLanguage;
    this.languageToISO = languageToISO;
    this.supportedLanguageFolder = supportedLanguageFolder;
}
dc7622afcfea initial
dwinter
parents:
diff changeset
93 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
94 /* */ public void run()
dc7622afcfea initial
dwinter
parents:
diff changeset
95 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
96 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
97 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
98 /* 140 */ processFile(this.processThisFile);
dc7622afcfea initial
dwinter
parents:
diff changeset
99 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
100 /* */ catch (CorruptIndexException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
101 /* 143 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
102 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
103 /* */ catch (FileNotFoundException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
104 /* 146 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
105 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
106 /* */ catch (UnsupportedEncodingException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
107 /* 149 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
108 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
109 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
110 /* 152 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
111 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
112 /* */ catch (InterruptedException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
113 /* 155 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
114 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
115 /* 157 */ this.done = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
116 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
117 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
118 /* */ private String getLanguageOfText(String textId, File file) throws IOException {
dc7622afcfea initial
dwinter
parents:
diff changeset
119 /* 161 */ String lang = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
120 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
121 /* 163 */ if (this.deduceFromFolderPriority)
dc7622afcfea initial
dwinter
parents:
diff changeset
122 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
123 /* 165 */ lang = deduceFromFolderName(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
124 /* 166 */ if (lang != null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
125 /* 167 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
126 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
127 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
128 /* 170 */ if ((this.languageFileName == null | this.indexMetaPriority)) {
dc7622afcfea initial
dwinter
parents:
diff changeset
129 /* 171 */ lang = getLanguageFromIndexMeta(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
130 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
131 /* 177 */ if ((lang != null) &&
dc7622afcfea initial
dwinter
parents:
diff changeset
132 /* 178 */ (lang.equals(""))) {
dc7622afcfea initial
dwinter
parents:
diff changeset
133 /* 179 */ System.out.println("Language for " + file.getAbsolutePath() + " is " + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
134 /* 180 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
135 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
136 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
137 /* 183 */ if (this.languageFileName != null)
dc7622afcfea initial
dwinter
parents:
diff changeset
138 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
139 /* 185 */ if (this.textLanguage == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
140 /* 186 */ this.textLanguage = loadLanguages();
dc7622afcfea initial
dwinter
parents:
diff changeset
141 /* 187 */ if (this.textLanguage == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
142 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
143 /* 189 */ System.out.println("NO LANGUAGE FILES LOADED");
dc7622afcfea initial
dwinter
parents:
diff changeset
144 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
145 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
146 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
147 /* 198 */ String language = (String)this.textLanguage.get(textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
148 /* 199 */ lang = (String)this.languageToISO.get(language);
dc7622afcfea initial
dwinter
parents:
diff changeset
149 /* 200 */ if (lang != null)
dc7622afcfea initial
dwinter
parents:
diff changeset
150 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
151 /* 202 */ System.out.println("GOT language from language file:" + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
152 /* 203 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
153 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
154 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
155 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
156 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
157 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
158 /* 209 */ lang = deduceFromFolderName(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
159 /* 210 */ if (lang != null)
dc7622afcfea initial
dwinter
parents:
diff changeset
160 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
161 /* 212 */ System.out.println("Langugage deduced from Folder:" + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
162 /* 213 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
163 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
164 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
165 /* 216 */ if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) {
dc7622afcfea initial
dwinter
parents:
diff changeset
166 /* 217 */ System.out.println("no language identified from Metadata: prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
167 /* 218 */ return this.preferedLanguage;
dc7622afcfea initial
dwinter
parents:
diff changeset
168 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
169 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
170 /* 221 */ System.out.println("no language identified: language will be generic all:" + file.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
171 /* 222 */ return "all";
dc7622afcfea initial
dwinter
parents:
diff changeset
172 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
173 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
174 /* */ private String deduceFromFolderName(File file) {
dc7622afcfea initial
dwinter
parents:
diff changeset
175 /* 226 */ File parent = file.getParentFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
176 /* 227 */ String name = parent.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
177 /* 228 */ String lang = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
178 /* 229 */ if (this.supportedLanguageFolder.containsKey(name))
dc7622afcfea initial
dwinter
parents:
diff changeset
179 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
180 /* 231 */ lang = (String)this.supportedLanguageFolder.get(name);
dc7622afcfea initial
dwinter
parents:
diff changeset
181 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
182 /* 233 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
183 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
184 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
185 /* */ private String getLanguageFromIndexMeta(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
186 /* */ throws IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
187 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
188 /* 244 */ file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath()));
dc7622afcfea initial
dwinter
parents:
diff changeset
189 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
190 /* 246 */ File pf = file.getParentFile().getParentFile().getParentFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
191 /* 247 */ File indexMeta = new File(pf, "index.meta");
dc7622afcfea initial
dwinter
parents:
diff changeset
192 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
193 /* 249 */ if (!indexMeta.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
194 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
195 /* 251 */ File pf2 = pf.getParentFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
196 /* 252 */ indexMeta = new File(pf2, "index.meta");
dc7622afcfea initial
dwinter
parents:
diff changeset
197 /* 253 */ if (!indexMeta.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
198 /* 254 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
199 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
200 /* 256 */ XMLReader parser = new SAXParser();
dc7622afcfea initial
dwinter
parents:
diff changeset
201 /* 257 */ ParseIndexMeta ch = new ParseIndexMeta();
dc7622afcfea initial
dwinter
parents:
diff changeset
202 /* 258 */ parser.setContentHandler(ch);
dc7622afcfea initial
dwinter
parents:
diff changeset
203 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
204 /* 260 */ Reader reader = new FileReader(indexMeta);
dc7622afcfea initial
dwinter
parents:
diff changeset
205 /* 261 */ InputSource input = new InputSource(reader);
dc7622afcfea initial
dwinter
parents:
diff changeset
206 /* 262 */ parser.parse(input);
dc7622afcfea initial
dwinter
parents:
diff changeset
207 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
208 /* */ catch (SAXException e)
dc7622afcfea initial
dwinter
parents:
diff changeset
209 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
210 /* 266 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
211 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
212 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
213 /* 269 */ String lang = ch.lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
214 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
215 /* 272 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
216 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
217 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
218 /* */ private String getDCFromIndexMeta(String textId)
dc7622afcfea initial
dwinter
parents:
diff changeset
219 /* */ throws IOException, XmlRpcException
dc7622afcfea initial
dwinter
parents:
diff changeset
220 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
221 /* 301 */ XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
dc7622afcfea initial
dwinter
parents:
diff changeset
222 /* 302 */ URL url = new URL(this.mdProviderUrl);
dc7622afcfea initial
dwinter
parents:
diff changeset
223 /* 303 */ config.setServerURL(url);
dc7622afcfea initial
dwinter
parents:
diff changeset
224 /* 304 */ XmlRpcClient client = new XmlRpcClient();
dc7622afcfea initial
dwinter
parents:
diff changeset
225 /* 305 */ client.setConfig(config);
dc7622afcfea initial
dwinter
parents:
diff changeset
226 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
227 /* 307 */ Object[] params = { textId };
dc7622afcfea initial
dwinter
parents:
diff changeset
228 /* 308 */ Object returnVals = client.execute("getDCFormatted", params);
dc7622afcfea initial
dwinter
parents:
diff changeset
229 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
230 /* 311 */ return (String)returnVals;
dc7622afcfea initial
dwinter
parents:
diff changeset
231 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
232 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
233 /* */ protected HashMap<String, String> loadLanguages()
dc7622afcfea initial
dwinter
parents:
diff changeset
234 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
235 /* 320 */ File languageFile = new File(this.docDir + "/" + this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
236 /* 321 */ String languageFilePath = this.docDir + "/" + this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
237 /* 322 */ HashMap languages = new HashMap();
dc7622afcfea initial
dwinter
parents:
diff changeset
238 /* 323 */ boolean relativ = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
239 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
240 /* 325 */ if (!languageFile.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
241 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
242 /* 327 */ languageFile = new File(this.languageFileName);
dc7622afcfea initial
dwinter
parents:
diff changeset
243 /* 328 */ languageFilePath = this.languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
244 /* 329 */ relativ = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
245 /* 330 */ if (!languageFile.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
246 /* 331 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
247 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
248 /* */ BufferedReader in;
dc7622afcfea initial
dwinter
parents:
diff changeset
249 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
250 /* 335 */ in = new BufferedReader(new FileReader(languageFilePath));
dc7622afcfea initial
dwinter
parents:
diff changeset
251 /* */ } catch (FileNotFoundException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
252 /* 337 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
253 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
254 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
255 /* 340 */ String zeile = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
256 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
257 /* 342 */ while ((zeile = in.readLine()) != null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
258 /* 343 */ String[] splitted = zeile.replace("\"", "").split("[,]");
dc7622afcfea initial
dwinter
parents:
diff changeset
259 /* 344 */ if (splitted.length == 2)
dc7622afcfea initial
dwinter
parents:
diff changeset
260 /* 345 */ if (relativ)
dc7622afcfea initial
dwinter
parents:
diff changeset
261 /* 346 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
262 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
263 /* 348 */ languages.put(splitted[0], splitted[1]);
dc7622afcfea initial
dwinter
parents:
diff changeset
264 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
265 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
266 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
267 /* 352 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
268 /* 353 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
269 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
270 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
271 /* 356 */ return languages;
dc7622afcfea initial
dwinter
parents:
diff changeset
272 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
273 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
274 /* */ public void harvestFolder()
dc7622afcfea initial
dwinter
parents:
diff changeset
275 /* */ throws InterruptedException
dc7622afcfea initial
dwinter
parents:
diff changeset
276 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
277 /* 362 */ Date start = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
278 /* 363 */ boolean create = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
279 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
280 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
281 /* 374 */ System.out.println("Indexing to directory '" + this.index_dir + "'...");
dc7622afcfea initial
dwinter
parents:
diff changeset
282 /* 375 */ indexDocs(this.docDir);
dc7622afcfea initial
dwinter
parents:
diff changeset
283 /* 376 */ System.out.println("Optimizing...");
dc7622afcfea initial
dwinter
parents:
diff changeset
284 /* 377 */ this.languageAnalyzers.optimize();
dc7622afcfea initial
dwinter
parents:
diff changeset
285 /* 378 */ this.languageAnalyzers.close();
dc7622afcfea initial
dwinter
parents:
diff changeset
286 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
287 /* 380 */ Date end = new Date();
dc7622afcfea initial
dwinter
parents:
diff changeset
288 /* 381 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds");
dc7622afcfea initial
dwinter
parents:
diff changeset
289 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
290 /* */ catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
291 /* 384 */ System.out.println(" caught a " + e.getClass() +
dc7622afcfea initial
dwinter
parents:
diff changeset
292 /* 385 */ "\n with message: " + e.getMessage());
dc7622afcfea initial
dwinter
parents:
diff changeset
293 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
294 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
295 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
296 /* */ void indexDocs(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
297 /* */ throws IOException, InterruptedException
dc7622afcfea initial
dwinter
parents:
diff changeset
298 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
299 /* 392 */ if (!file.canRead())
dc7622afcfea initial
dwinter
parents:
diff changeset
300 /* */ return;
dc7622afcfea initial
dwinter
parents:
diff changeset
301 /* 394 */ if (file.isDirectory())
dc7622afcfea initial
dwinter
parents:
diff changeset
302 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
303 /* 396 */ if (this.counter > 100000)
dc7622afcfea initial
dwinter
parents:
diff changeset
304 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
305 /* 398 */ return;
dc7622afcfea initial
dwinter
parents:
diff changeset
306 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
307 /* 400 */ String[] files = file.list();
dc7622afcfea initial
dwinter
parents:
diff changeset
308 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
309 /* 402 */ String folderName = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
310 /* 403 */ if ((((files != null) ? 1 : 0) & ((this.excludeFolders.contains(folderName)) ? 0 : 1)) != 0) {
dc7622afcfea initial
dwinter
parents:
diff changeset
311 /* 404 */ for (int i = 0; i < files.length; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
312 /* 405 */ indexDocs(new File(file, files[i]));
dc7622afcfea initial
dwinter
parents:
diff changeset
313 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
314 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
315 /* 408 */ else if (isTextFile(file)) {
dc7622afcfea initial
dwinter
parents:
diff changeset
316 /* 409 */ processFile(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
317 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
318 /* */ else {
dc7622afcfea initial
dwinter
parents:
diff changeset
319 /* 412 */ System.out.println("not adding " + file);
dc7622afcfea initial
dwinter
parents:
diff changeset
320 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
321 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
322 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
323 /* */ private void processFile(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
324 /* */ throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException
dc7622afcfea initial
dwinter
parents:
diff changeset
325 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
326 /* 423 */ String textId = getTextId(file);
dc7622afcfea initial
dwinter
parents:
diff changeset
327 /* 424 */ System.out.println("file:" + this.counter);
dc7622afcfea initial
dwinter
parents:
diff changeset
328 /* 425 */ System.out.println("textId:" + textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
329 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
330 /* 427 */ String lang = getLanguageOfText(textId, file);
dc7622afcfea initial
dwinter
parents:
diff changeset
331 /* 428 */ String dcMetaData = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
332 /* 429 */ if (this.mdProviderUrl != null)
dc7622afcfea initial
dwinter
parents:
diff changeset
333 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
334 /* 431 */ dcMetaData = getDCFromIndexMeta(textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
335 /* */ } catch (XmlRpcException e2) {
dc7622afcfea initial
dwinter
parents:
diff changeset
336 /* 433 */ dcMetaData = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
337 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
338 /* */ int docNr;
dc7622afcfea initial
dwinter
parents:
diff changeset
339 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
340 /* 437 */ if (this.mode == "add")
dc7622afcfea initial
dwinter
parents:
diff changeset
341 /* 438 */ docNr = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
342 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
343 /* 440 */ docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified());
dc7622afcfea initial
dwinter
parents:
diff changeset
344 /* 441 */ if (lang == null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
345 /* 442 */ System.out.println("not adding " + file);
dc7622afcfea initial
dwinter
parents:
diff changeset
346 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
347 /* 444 */ else if (docNr == -1) {
dc7622afcfea initial
dwinter
parents:
diff changeset
348 /* 445 */ System.out.println(" OLD FILE:" + file);
dc7622afcfea initial
dwinter
parents:
diff changeset
349 /* 446 */ } else if (docNr >= 0)
dc7622afcfea initial
dwinter
parents:
diff changeset
350 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
351 /* 448 */ System.out.println("adding " + file + " lang: " + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
352 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
353 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
354 /* 451 */ Boolean ret = addDocument(file, lang, dcMetaData, textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
355 /* 452 */ if (ret.booleanValue())
dc7622afcfea initial
dwinter
parents:
diff changeset
356 /* 453 */ this.counter += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
357 /* */ } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
358 /* 455 */ System.out.println("got an IO eception adding the document - wait a bit");
dc7622afcfea initial
dwinter
parents:
diff changeset
359 /* 456 */ Thread.sleep(10000L);
dc7622afcfea initial
dwinter
parents:
diff changeset
360 /* 457 */ System.out.println("Try again");
dc7622afcfea initial
dwinter
parents:
diff changeset
361 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
362 /* 459 */ Boolean ret = addDocument(file, lang, dcMetaData, textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
363 /* 460 */ if (ret.booleanValue())
dc7622afcfea initial
dwinter
parents:
diff changeset
364 /* 461 */ this.counter += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
365 /* */ } catch (IOException e1) {
dc7622afcfea initial
dwinter
parents:
diff changeset
366 /* 463 */ System.out.println("Couldn't do:" + file.getName());
dc7622afcfea initial
dwinter
parents:
diff changeset
367 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
368 /* */ catch (ParserConfigurationException e2) {
dc7622afcfea initial
dwinter
parents:
diff changeset
369 /* 466 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
370 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
371 /* */ catch (SAXException e2) {
dc7622afcfea initial
dwinter
parents:
diff changeset
372 /* 469 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
373 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
374 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
375 /* */ catch (ParserConfigurationException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
376 /* 473 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
377 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
378 /* */ catch (SAXException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
379 /* 476 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
380 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
381 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
382 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
383 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
384 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
385 /* 482 */ System.out.println(" UPDATE FILE:" + file + " lang: " + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
386 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
387 /* 484 */ this.counter += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
388 /* */ try {
dc7622afcfea initial
dwinter
parents:
diff changeset
389 /* 486 */ addDocument(file, lang, dcMetaData, textId);
dc7622afcfea initial
dwinter
parents:
diff changeset
390 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
391 /* */ catch (ParserConfigurationException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
392 /* 489 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
393 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
394 /* */ catch (SAXException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
395 /* 492 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
396 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
397 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
398 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
399 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
400 /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId)
dc7622afcfea initial
dwinter
parents:
diff changeset
401 /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
402 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
403 /* 509 */ if (dcMetaData != null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
404 /* 510 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, dcMetaData, textId), lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
405 /* 511 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", dcMetaData, textId), "all");
dc7622afcfea initial
dwinter
parents:
diff changeset
406 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
407 /* */ else
dc7622afcfea initial
dwinter
parents:
diff changeset
408 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
409 /* 515 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, textId), lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
410 /* 516 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", textId), "all");
dc7622afcfea initial
dwinter
parents:
diff changeset
411 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
412 /* 518 */ return Boolean.valueOf(true);
dc7622afcfea initial
dwinter
parents:
diff changeset
413 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
414 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
415 /* */ private String getTextId(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
416 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
417 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
418 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
419 /* 529 */ File parent = file.getParentFile();
dc7622afcfea initial
dwinter
parents:
diff changeset
420 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
421 /* 531 */ if (parent.getName().equals("text"))
dc7622afcfea initial
dwinter
parents:
diff changeset
422 /* 532 */ return absPathToTextId(parent.getParentFile().getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
423 /* 533 */ if (parent.getParentFile().getName().equals("text"))
dc7622afcfea initial
dwinter
parents:
diff changeset
424 /* 534 */ return absPathToTextId(parent.getParentFile().getParentFile().getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
425 /* 535 */ if (parent.getParentFile().getParentFile().getName().equals("text")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
426 /* 536 */ return absPathToTextId(parent.getParentFile().getParentFile().getParentFile().getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
427 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
428 /* 538 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
429 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
430 /* */ catch (RuntimeException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
431 /* 541 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
432 /* 542 */ }return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
433 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
434 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
435 protected String absPathToTextId(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
436 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
437 try {
dc7622afcfea initial
dwinter
parents:
diff changeset
438 return absPathToTextId(file.getCanonicalPath());
dc7622afcfea initial
dwinter
parents:
diff changeset
439 } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
440
dc7622afcfea initial
dwinter
parents:
diff changeset
441 e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
442 return "";
dc7622afcfea initial
dwinter
parents:
diff changeset
443 }
dc7622afcfea initial
dwinter
parents:
diff changeset
444 }
dc7622afcfea initial
dwinter
parents:
diff changeset
445
dc7622afcfea initial
dwinter
parents:
diff changeset
446 /* */ protected String absPathToTextId(String absolutePath)
dc7622afcfea initial
dwinter
parents:
diff changeset
447 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
448 /* 555 */ if (this.specialMode.equals("vlp"))
dc7622afcfea initial
dwinter
parents:
diff changeset
449 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
450 /* 557 */ String[] splitted = absolutePath.split("lit");
dc7622afcfea initial
dwinter
parents:
diff changeset
451 /* 558 */ return splitted[1];
dc7622afcfea initial
dwinter
parents:
diff changeset
452 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
453 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
454 /* 562 */ Pattern p = Pattern.compile(TEXTIDFROMPATH_REGEXP);
dc7622afcfea initial
dwinter
parents:
diff changeset
455 /* 563 */ Matcher m = p.matcher(absolutePath);
dc7622afcfea initial
dwinter
parents:
diff changeset
456 /* 564 */ m.matches();
dc7622afcfea initial
dwinter
parents:
diff changeset
457 /* 565 */ if (m.groupCount() > 0) {
dc7622afcfea initial
dwinter
parents:
diff changeset
458 /* 566 */ return m.group(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
459 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
460 /* 568 */ System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath);
dc7622afcfea initial
dwinter
parents:
diff changeset
461 /* 569 */ return absolutePath;
dc7622afcfea initial
dwinter
parents:
diff changeset
462 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
463 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
464 /* */ private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate)
dc7622afcfea initial
dwinter
parents:
diff changeset
465 /* */ throws CorruptIndexException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
466 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
467 /* 577 */ lang = checkSupportedLanguages(lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
468 /* 578 */ System.out.println("lang converted+" + lang);
dc7622afcfea initial
dwinter
parents:
diff changeset
469 /* 579 */ //TermQuery query = new TermQuery(new Term("path", filePath));
dc7622afcfea initial
dwinter
parents:
diff changeset
470 TermQuery query = new TermQuery(new Term("cleanedPath", absPathToTextId(filePath)));
dc7622afcfea initial
dwinter
parents:
diff changeset
471 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
472 /* 582 */ HashMap<String,Collector> results = this.languageAnalyzers.search(query);
dc7622afcfea initial
dwinter
parents:
diff changeset
473 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
474 /* 584 */ if (results == null) {
dc7622afcfea initial
dwinter
parents:
diff changeset
475 /* 585 */ return 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
476 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
477 /* 587 */ for (String resultLang : results.keySet())
dc7622afcfea initial
dwinter
parents:
diff changeset
478 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
479 /* 589 */ TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang);
dc7622afcfea initial
dwinter
parents:
diff changeset
480 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
481 /* 591 */ if ((collector == null) || (collector.getTotalHits() <= 0))
dc7622afcfea initial
dwinter
parents:
diff changeset
482 /* */ continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
483 /* 593 */ if ((!resultLang.equals(lang)) && (deleteWrongLanguage) && (!resultLang.equals("morph")))
dc7622afcfea initial
dwinter
parents:
diff changeset
484 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
485 /* 595 */ this.languageAnalyzers.deleteDocuments(query);
dc7622afcfea initial
dwinter
parents:
diff changeset
486 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
487 /* 603 */ System.out.println("language changed:" + filePath);
dc7622afcfea initial
dwinter
parents:
diff changeset
488 /* 604 */ return 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
489 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
490 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
491 /* 607 */ if (!resultLang.equals(lang))
dc7622afcfea initial
dwinter
parents:
diff changeset
492 /* */ continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
493 /* 609 */ TopDocs docs = collector.topDocs();
dc7622afcfea initial
dwinter
parents:
diff changeset
494 /* */ ScoreDoc[] arrayOfScoreDoc;
dc7622afcfea initial
dwinter
parents:
diff changeset
495 /* 610 */ if ((arrayOfScoreDoc = docs.scoreDocs).length == 0) continue; ScoreDoc doc = arrayOfScoreDoc[0];
dc7622afcfea initial
dwinter
parents:
diff changeset
496 /* 611 */ String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(doc.doc).getField("modified").stringValue();
dc7622afcfea initial
dwinter
parents:
diff changeset
497 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
498 /* 613 */ String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE);
dc7622afcfea initial
dwinter
parents:
diff changeset
499 /* 614 */ if (!fileDate.equals(modDate))
dc7622afcfea initial
dwinter
parents:
diff changeset
500 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
501 /* 618 */ System.out.println("new file:" + filePath);
dc7622afcfea initial
dwinter
parents:
diff changeset
502 /* 619 */ this.languageAnalyzers.deleteDocuments(query);
dc7622afcfea initial
dwinter
parents:
diff changeset
503 /* 620 */ return 2;
dc7622afcfea initial
dwinter
parents:
diff changeset
504 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
505 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
506 /* 623 */ return -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
507 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
508 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
509 /* 631 */ return 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
510 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
511 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
512 /* */ private String checkSupportedLanguages(String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
513 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
514 /* 643 */ if (this.languageAnalyzers.getAnalyzer(lang) == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
515 /* 644 */ return "all";
dc7622afcfea initial
dwinter
parents:
diff changeset
516 /* 645 */ return lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
517 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
518 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
519 /* */ public void setIndexMetaPriority(boolean prio)
dc7622afcfea initial
dwinter
parents:
diff changeset
520 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
521 /* 650 */ this.indexMetaPriority = prio;
dc7622afcfea initial
dwinter
parents:
diff changeset
522 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
523 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
524 /* */ public boolean getIndexMetaPriority() {
dc7622afcfea initial
dwinter
parents:
diff changeset
525 /* 654 */ return this.indexMetaPriority;
dc7622afcfea initial
dwinter
parents:
diff changeset
526 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
527 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
528 /* */ private boolean isTextFile(File file)
dc7622afcfea initial
dwinter
parents:
diff changeset
529 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
530 /* 659 */ String fn = file.getName();
dc7622afcfea initial
dwinter
parents:
diff changeset
531 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
532 /* 661 */ String[] splitted = fn.split("[.]");
dc7622afcfea initial
dwinter
parents:
diff changeset
533 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
534 /* 663 */ String ext = "";
dc7622afcfea initial
dwinter
parents:
diff changeset
535 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
536 /* 665 */ if (splitted.length > 1)
dc7622afcfea initial
dwinter
parents:
diff changeset
537 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
538 /* 667 */ ext = splitted[(splitted.length - 1)];
dc7622afcfea initial
dwinter
parents:
diff changeset
539 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
540 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
541 /* 670 */ return this.fileTypesToIndex.contains(ext);
dc7622afcfea initial
dwinter
parents:
diff changeset
542 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
543 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
544
dc7622afcfea initial
dwinter
parents:
diff changeset
545 /* Location: /private/tmp/fulltextIndexer.jar
dc7622afcfea initial
dwinter
parents:
diff changeset
546 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread
dc7622afcfea initial
dwinter
parents:
diff changeset
547 * JD-Core Version: 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
548 */