Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/ProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; | |
2 /* */ | |
3 /* */ import java.io.BufferedReader; | |
4 import java.io.File; | |
5 import java.io.FileNotFoundException; | |
6 import java.io.FileReader; | |
7 import java.io.IOException; | |
8 import java.io.Reader; | |
9 import java.io.UnsupportedEncodingException; | |
10 import java.net.URL; | |
11 import java.util.ArrayList; | |
12 import java.util.Arrays; | |
13 import java.util.Date; | |
14 import java.util.HashMap; | |
15 import java.util.regex.Matcher; | |
16 import java.util.regex.Pattern; | |
17 | |
18 import javax.xml.parsers.ParserConfigurationException; | |
19 | |
20 import org.apache.lucene.document.DateTools; | |
21 import org.apache.lucene.index.CorruptIndexException; | |
22 import org.apache.lucene.index.Term; | |
23 import org.apache.lucene.search.Collector; | |
24 import org.apache.lucene.search.ScoreDoc; | |
25 import org.apache.lucene.search.TermQuery; | |
26 import org.apache.lucene.search.TopDocs; | |
27 import org.apache.lucene.search.TopScoreDocCollector; | |
28 import org.apache.lucene.store.LockObtainFailedException; | |
29 import org.apache.xmlrpc.XmlRpcException; | |
30 import org.apache.xmlrpc.client.XmlRpcClient; | |
31 import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; | |
32 import org.xml.sax.InputSource; | |
33 import org.xml.sax.SAXException; | |
34 import org.xml.sax.XMLReader; | |
35 | |
36 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
37 | |
38 import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta; | |
39 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; | |
40 import de.mpiwg.dwinter.lucencetools.documents.FileDocument; | |
41 /* */ | |
42 /* */ public class ProcessFileThread extends Thread | |
43 /* */ { | |
44 /* */ private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)"; | |
45 /* */ private static final int DELETED_WRONG_LANGUAGE = 1; | |
46 /* */ private static final int DELETED_OLD_VERSION = 2; | |
47 /* */ private static final int NEW_FILE = 0; | |
48 /* */ private static final int FILE_EXISTS = -1; | |
49 /* */ protected File docDir; | |
50 /* */ protected File index_dir; | |
51 /* 86 */ protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" })); | |
52 /* 87 */ protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" })); | |
53 /* 88 */ private HashMap<String, String> textLanguage = null; | |
54 /* */ | |
55 /* 90 */ protected HashMap<String, String> languageToISO = new HashMap(); | |
56 /* 91 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); | |
57 /* 92 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap(); | |
58 /* 93 */ private int counter = 0; | |
59 /* */ protected String languageFileName; | |
60 /* 95 */ protected boolean indexMetaPriority = false; | |
61 /* 96 */ protected boolean deduceFromFolderPriority = true; | |
62 /* */ | |
63 /* 101 */ private String specialMode = ""; | |
64 /* 102 */ public boolean done = false; | |
65 /* */ private File processThisFile; | |
66 /* 105 */ private String mode = "new"; // if mode is not add, then only modified files and new files will be added. | |
67 /* */ private String mdProviderUrl; | |
68 /* 107 */ private String preferedLanguage = null; | |
69 /* */ | |
70 /* */ public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException { | |
71 /* 110 */ this.docDir = docDir; | |
72 /* 111 */ this.languageFileName = languageFileName; | |
73 /* */ | |
74 /* 114 */ this.index_dir = index_dir; | |
75 /* 115 */ this.processThisFile = processThisFile; | |
76 /* 116 */ this.mdProviderUrl = mdProviderUrl; | |
77 /* 117 */ this.languageToISO = languageToISO; | |
78 /* 118 */ this.supportedLanguageFolder = supportedLanguageFolder; | |
79 /* */ } | |
80 /* */ | |
81 /* */ public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) | |
82 /* */ { | |
83 /* 123 */ this.languageAnalyzers = languageAnalyzers2; | |
84 /* 124 */ this.processThisFile = file; | |
85 /* 125 */ this.textLanguage = tl; | |
86 /* */ | |
87 /* 128 */ this.languageFileName = lfn; | |
88 /* 129 */ this.preferedLanguage = preferedLanguage; | |
89 /* 130 */ this.mdProviderUrl = mdProviderUrl; | |
90 /* 131 */ this.languageToISO = languageToISO; | |
91 /* 132 */ this.supportedLanguageFolder = supportedLanguageFolder; | |
92 /* */ } | |
93 /* */ | |
94 /* */ public void run() | |
95 /* */ { | |
96 /* */ try | |
97 /* */ { | |
98 /* 140 */ processFile(this.processThisFile); | |
99 /* */ } | |
100 /* */ catch (CorruptIndexException e) { | |
101 /* 143 */ e.printStackTrace(); | |
102 /* */ } | |
103 /* */ catch (FileNotFoundException e) { | |
104 /* 146 */ e.printStackTrace(); | |
105 /* */ } | |
106 /* */ catch (UnsupportedEncodingException e) { | |
107 /* 149 */ e.printStackTrace(); | |
108 /* */ } | |
109 /* */ catch (IOException e) { | |
110 /* 152 */ e.printStackTrace(); | |
111 /* */ } | |
112 /* */ catch (InterruptedException e) { | |
113 /* 155 */ e.printStackTrace(); | |
114 /* */ } | |
115 /* 157 */ this.done = true; | |
116 /* */ } | |
117 /* */ | |
118 /* */ private String getLanguageOfText(String textId, File file) throws IOException { | |
119 /* 161 */ String lang = null; | |
120 /* */ | |
121 /* 163 */ if (this.deduceFromFolderPriority) | |
122 /* */ { | |
123 /* 165 */ lang = deduceFromFolderName(file); | |
124 /* 166 */ if (lang != null) { | |
125 /* 167 */ return lang; | |
126 /* */ } | |
127 /* */ } | |
128 /* 170 */ if ((this.languageFileName == null | this.indexMetaPriority)) { | |
129 /* 171 */ lang = getLanguageFromIndexMeta(file); | |
130 /* */ | |
131 /* 177 */ if ((lang != null) && | |
132 /* 178 */ (lang.equals(""))) { | |
133 /* 179 */ System.out.println("Language for " + file.getAbsolutePath() + " is " + lang); | |
134 /* 180 */ return lang; | |
135 /* */ } | |
136 /* */ } | |
137 /* 183 */ if (this.languageFileName != null) | |
138 /* */ { | |
139 /* 185 */ if (this.textLanguage == null) | |
140 /* 186 */ this.textLanguage = loadLanguages(); | |
141 /* 187 */ if (this.textLanguage == null) | |
142 /* */ { | |
143 /* 189 */ System.out.println("NO LANGUAGE FILES LOADED"); | |
144 /* */ } | |
145 /* */ else | |
146 /* */ { | |
147 /* 198 */ String language = (String)this.textLanguage.get(textId); | |
148 /* 199 */ lang = (String)this.languageToISO.get(language); | |
149 /* 200 */ if (lang != null) | |
150 /* */ { | |
151 /* 202 */ System.out.println("GOT language from language file:" + lang); | |
152 /* 203 */ return lang; | |
153 /* */ } | |
154 /* */ } | |
155 /* */ | |
156 /* */ } | |
157 /* */ | |
158 /* 209 */ lang = deduceFromFolderName(file); | |
159 /* 210 */ if (lang != null) | |
160 /* */ { | |
161 /* 212 */ System.out.println("Langugage deduced from Folder:" + lang); | |
162 /* 213 */ return lang; | |
163 /* */ } | |
164 /* */ | |
165 /* 216 */ if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) { | |
166 /* 217 */ System.out.println("no language identified from Metadata: prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath()); | |
167 /* 218 */ return this.preferedLanguage; | |
168 /* */ } | |
169 /* */ | |
170 /* 221 */ System.out.println("no language identified: language will be generic all:" + file.getAbsolutePath()); | |
171 /* 222 */ return "all"; | |
172 /* */ } | |
173 /* */ | |
174 /* */ private String deduceFromFolderName(File file) { | |
175 /* 226 */ File parent = file.getParentFile(); | |
176 /* 227 */ String name = parent.getName(); | |
177 /* 228 */ String lang = null; | |
178 /* 229 */ if (this.supportedLanguageFolder.containsKey(name)) | |
179 /* */ { | |
180 /* 231 */ lang = (String)this.supportedLanguageFolder.get(name); | |
181 /* */ } | |
182 /* 233 */ return lang; | |
183 /* */ } | |
184 /* */ | |
185 /* */ private String getLanguageFromIndexMeta(File file) | |
186 /* */ throws IOException | |
187 /* */ { | |
188 /* 244 */ file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath())); | |
189 /* */ | |
190 /* 246 */ File pf = file.getParentFile().getParentFile().getParentFile(); | |
191 /* 247 */ File indexMeta = new File(pf, "index.meta"); | |
192 /* */ | |
193 /* 249 */ if (!indexMeta.exists()) | |
194 /* */ { | |
195 /* 251 */ File pf2 = pf.getParentFile(); | |
196 /* 252 */ indexMeta = new File(pf2, "index.meta"); | |
197 /* 253 */ if (!indexMeta.exists()) | |
198 /* 254 */ return null; | |
199 /* */ } | |
200 /* 256 */ XMLReader parser = new SAXParser(); | |
201 /* 257 */ ParseIndexMeta ch = new ParseIndexMeta(); | |
202 /* 258 */ parser.setContentHandler(ch); | |
203 /* */ try { | |
204 /* 260 */ Reader reader = new FileReader(indexMeta); | |
205 /* 261 */ InputSource input = new InputSource(reader); | |
206 /* 262 */ parser.parse(input); | |
207 /* */ } | |
208 /* */ catch (SAXException e) | |
209 /* */ { | |
210 /* 266 */ e.printStackTrace(); | |
211 /* */ } | |
212 /* */ | |
213 /* 269 */ String lang = ch.lang; | |
214 /* */ | |
215 /* 272 */ return lang; | |
216 /* */ } | |
217 /* */ | |
218 /* */ private String getDCFromIndexMeta(String textId) | |
219 /* */ throws IOException, XmlRpcException | |
220 /* */ { | |
221 /* 301 */ XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); | |
222 /* 302 */ URL url = new URL(this.mdProviderUrl); | |
223 /* 303 */ config.setServerURL(url); | |
224 /* 304 */ XmlRpcClient client = new XmlRpcClient(); | |
225 /* 305 */ client.setConfig(config); | |
226 /* */ | |
227 /* 307 */ Object[] params = { textId }; | |
228 /* 308 */ Object returnVals = client.execute("getDCFormatted", params); | |
229 /* */ | |
230 /* 311 */ return (String)returnVals; | |
231 /* */ } | |
232 /* */ | |
233 /* */ protected HashMap<String, String> loadLanguages() | |
234 /* */ { | |
235 /* 320 */ File languageFile = new File(this.docDir + "/" + this.languageFileName); | |
236 /* 321 */ String languageFilePath = this.docDir + "/" + this.languageFileName; | |
237 /* 322 */ HashMap languages = new HashMap(); | |
238 /* 323 */ boolean relativ = true; | |
239 /* */ | |
240 /* 325 */ if (!languageFile.exists()) | |
241 /* */ { | |
242 /* 327 */ languageFile = new File(this.languageFileName); | |
243 /* 328 */ languageFilePath = this.languageFileName; | |
244 /* 329 */ relativ = false; | |
245 /* 330 */ if (!languageFile.exists()) | |
246 /* 331 */ return null; | |
247 /* */ } | |
248 /* */ BufferedReader in; | |
249 /* */ try { | |
250 /* 335 */ in = new BufferedReader(new FileReader(languageFilePath)); | |
251 /* */ } catch (FileNotFoundException e) { | |
252 /* 337 */ return null; | |
253 /* */ } | |
254 /* */ | |
255 /* 340 */ String zeile = null; | |
256 /* */ try { | |
257 /* 342 */ while ((zeile = in.readLine()) != null) { | |
258 /* 343 */ String[] splitted = zeile.replace("\"", "").split("[,]"); | |
259 /* 344 */ if (splitted.length == 2) | |
260 /* 345 */ if (relativ) | |
261 /* 346 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]); | |
262 /* */ else | |
263 /* 348 */ languages.put(splitted[0], splitted[1]); | |
264 /* */ } | |
265 /* */ } | |
266 /* */ catch (IOException e) { | |
267 /* 352 */ e.printStackTrace(); | |
268 /* 353 */ return null; | |
269 /* */ } | |
270 /* */ | |
271 /* 356 */ return languages; | |
272 /* */ } | |
273 /* */ | |
274 /* */ public void harvestFolder() | |
275 /* */ throws InterruptedException | |
276 /* */ { | |
277 /* 362 */ Date start = new Date(); | |
278 /* 363 */ boolean create = true; | |
279 /* */ try | |
280 /* */ { | |
281 /* 374 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); | |
282 /* 375 */ indexDocs(this.docDir); | |
283 /* 376 */ System.out.println("Optimizing..."); | |
284 /* 377 */ this.languageAnalyzers.optimize(); | |
285 /* 378 */ this.languageAnalyzers.close(); | |
286 /* */ | |
287 /* 380 */ Date end = new Date(); | |
288 /* 381 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); | |
289 /* */ } | |
290 /* */ catch (IOException e) { | |
291 /* 384 */ System.out.println(" caught a " + e.getClass() + | |
292 /* 385 */ "\n with message: " + e.getMessage()); | |
293 /* */ } | |
294 /* */ } | |
295 /* */ | |
296 /* */ void indexDocs(File file) | |
297 /* */ throws IOException, InterruptedException | |
298 /* */ { | |
299 /* 392 */ if (!file.canRead()) | |
300 /* */ return; | |
301 /* 394 */ if (file.isDirectory()) | |
302 /* */ { | |
303 /* 396 */ if (this.counter > 100000) | |
304 /* */ { | |
305 /* 398 */ return; | |
306 /* */ } | |
307 /* 400 */ String[] files = file.list(); | |
308 /* */ | |
309 /* 402 */ String folderName = file.getName(); | |
310 /* 403 */ if ((((files != null) ? 1 : 0) & ((this.excludeFolders.contains(folderName)) ? 0 : 1)) != 0) { | |
311 /* 404 */ for (int i = 0; i < files.length; ++i) | |
312 /* 405 */ indexDocs(new File(file, files[i])); | |
313 /* */ } | |
314 /* */ } | |
315 /* 408 */ else if (isTextFile(file)) { | |
316 /* 409 */ processFile(file); | |
317 /* */ } | |
318 /* */ else { | |
319 /* 412 */ System.out.println("not adding " + file); | |
320 /* */ } | |
321 /* */ } | |
322 /* */ | |
323 /* */ private void processFile(File file) | |
324 /* */ throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException | |
325 /* */ { | |
326 /* 423 */ String textId = getTextId(file); | |
327 /* 424 */ System.out.println("file:" + this.counter); | |
328 /* 425 */ System.out.println("textId:" + textId); | |
329 /* */ | |
330 /* 427 */ String lang = getLanguageOfText(textId, file); | |
331 /* 428 */ String dcMetaData = null; | |
332 /* 429 */ if (this.mdProviderUrl != null) | |
333 /* */ try { | |
334 /* 431 */ dcMetaData = getDCFromIndexMeta(textId); | |
335 /* */ } catch (XmlRpcException e2) { | |
336 /* 433 */ dcMetaData = null; | |
337 /* */ } | |
338 /* */ int docNr; | |
339 /* */ | |
340 /* 437 */ if (this.mode == "add") | |
341 /* 438 */ docNr = 0; | |
342 /* */ else | |
343 /* 440 */ docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified()); | |
344 /* 441 */ if (lang == null) { | |
345 /* 442 */ System.out.println("not adding " + file); | |
346 /* */ } | |
347 /* 444 */ else if (docNr == -1) { | |
348 /* 445 */ System.out.println(" OLD FILE:" + file); | |
349 /* 446 */ } else if (docNr >= 0) | |
350 /* */ { | |
351 /* 448 */ System.out.println("adding " + file + " lang: " + lang); | |
352 /* */ try | |
353 /* */ { | |
354 /* 451 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); | |
355 /* 452 */ if (ret.booleanValue()) | |
356 /* 453 */ this.counter += 1; | |
357 /* */ } catch (IOException e) { | |
358 /* 455 */ System.out.println("got an IO eception adding the document - wait a bit"); | |
359 /* 456 */ Thread.sleep(10000L); | |
360 /* 457 */ System.out.println("Try again"); | |
361 /* */ try { | |
362 /* 459 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); | |
363 /* 460 */ if (ret.booleanValue()) | |
364 /* 461 */ this.counter += 1; | |
365 /* */ } catch (IOException e1) { | |
366 /* 463 */ System.out.println("Couldn't do:" + file.getName()); | |
367 /* */ } | |
368 /* */ catch (ParserConfigurationException e2) { | |
369 /* 466 */ e.printStackTrace(); | |
370 /* */ } | |
371 /* */ catch (SAXException e2) { | |
372 /* 469 */ e.printStackTrace(); | |
373 /* */ } | |
374 /* */ } | |
375 /* */ catch (ParserConfigurationException e) { | |
376 /* 473 */ e.printStackTrace(); | |
377 /* */ } | |
378 /* */ catch (SAXException e) { | |
379 /* 476 */ e.printStackTrace(); | |
380 /* */ } | |
381 /* */ | |
382 /* */ } | |
383 /* */ else | |
384 /* */ { | |
385 /* 482 */ System.out.println(" UPDATE FILE:" + file + " lang: " + lang); | |
386 /* */ | |
387 /* 484 */ this.counter += 1; | |
388 /* */ try { | |
389 /* 486 */ addDocument(file, lang, dcMetaData, textId); | |
390 /* */ } | |
391 /* */ catch (ParserConfigurationException e) { | |
392 /* 489 */ e.printStackTrace(); | |
393 /* */ } | |
394 /* */ catch (SAXException e) { | |
395 /* 492 */ e.printStackTrace(); | |
396 /* */ } | |
397 /* */ } | |
398 /* */ } | |
399 /* */ | |
400 /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) | |
401 /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException | |
402 /* */ { | |
403 /* 509 */ if (dcMetaData != null) { | |
404 /* 510 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, dcMetaData, textId), lang); | |
405 /* 511 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", dcMetaData, textId), "all"); | |
406 /* */ } | |
407 /* */ else | |
408 /* */ { | |
409 /* 515 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, textId), lang); | |
410 /* 516 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", textId), "all"); | |
411 /* */ } | |
412 /* 518 */ return Boolean.valueOf(true); | |
413 /* */ } | |
414 /* */ | |
415 /* */ private String getTextId(File file) | |
416 /* */ { | |
417 /* */ try | |
418 /* */ { | |
419 /* 529 */ File parent = file.getParentFile(); | |
420 /* */ | |
421 /* 531 */ if (parent.getName().equals("text")) | |
422 /* 532 */ return absPathToTextId(parent.getParentFile().getAbsolutePath()); | |
423 /* 533 */ if (parent.getParentFile().getName().equals("text")) | |
424 /* 534 */ return absPathToTextId(parent.getParentFile().getParentFile().getAbsolutePath()); | |
425 /* 535 */ if (parent.getParentFile().getParentFile().getName().equals("text")) { | |
426 /* 536 */ return absPathToTextId(parent.getParentFile().getParentFile().getParentFile().getAbsolutePath()); | |
427 /* */ } | |
428 /* 538 */ return null; | |
429 /* */ } | |
430 /* */ catch (RuntimeException e) { | |
431 /* 541 */ e.printStackTrace(); | |
432 /* 542 */ }return null; | |
433 /* */ } | |
434 /* */ | |
435 protected String absPathToTextId(File file) | |
436 /* */ { | |
437 try { | |
438 return absPathToTextId(file.getCanonicalPath()); | |
439 } catch (IOException e) { | |
440 | |
441 e.printStackTrace(); | |
442 return ""; | |
443 } | |
444 } | |
445 | |
446 /* */ protected String absPathToTextId(String absolutePath) | |
447 /* */ { | |
448 /* 555 */ if (this.specialMode.equals("vlp")) | |
449 /* */ { | |
450 /* 557 */ String[] splitted = absolutePath.split("lit"); | |
451 /* 558 */ return splitted[1]; | |
452 /* */ } | |
453 /* */ | |
454 /* 562 */ Pattern p = Pattern.compile(TEXTIDFROMPATH_REGEXP); | |
455 /* 563 */ Matcher m = p.matcher(absolutePath); | |
456 /* 564 */ m.matches(); | |
457 /* 565 */ if (m.groupCount() > 0) { | |
458 /* 566 */ return m.group(1); | |
459 /* */ } | |
460 /* 568 */ System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath); | |
461 /* 569 */ return absolutePath; | |
462 /* */ } | |
463 /* */ | |
464 /* */ private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate) | |
465 /* */ throws CorruptIndexException, IOException | |
466 /* */ { | |
467 /* 577 */ lang = checkSupportedLanguages(lang); | |
468 /* 578 */ System.out.println("lang converted+" + lang); | |
469 /* 579 */ //TermQuery query = new TermQuery(new Term("path", filePath)); | |
470 TermQuery query = new TermQuery(new Term("cleanedPath", absPathToTextId(filePath))); | |
471 /* */ | |
472 /* 582 */ HashMap<String,Collector> results = this.languageAnalyzers.search(query); | |
473 /* */ | |
474 /* 584 */ if (results == null) { | |
475 /* 585 */ return 0; | |
476 /* */ } | |
477 /* 587 */ for (String resultLang : results.keySet()) | |
478 /* */ { | |
479 /* 589 */ TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang); | |
480 /* */ | |
481 /* 591 */ if ((collector == null) || (collector.getTotalHits() <= 0)) | |
482 /* */ continue; | |
483 /* 593 */ if ((!resultLang.equals(lang)) && (deleteWrongLanguage) && (!resultLang.equals("morph"))) | |
484 /* */ { | |
485 /* 595 */ this.languageAnalyzers.deleteDocuments(query); | |
486 /* */ | |
487 /* 603 */ System.out.println("language changed:" + filePath); | |
488 /* 604 */ return 1; | |
489 /* */ } | |
490 /* */ | |
491 /* 607 */ if (!resultLang.equals(lang)) | |
492 /* */ continue; | |
493 /* 609 */ TopDocs docs = collector.topDocs(); | |
494 /* */ ScoreDoc[] arrayOfScoreDoc; | |
495 /* 610 */ if ((arrayOfScoreDoc = docs.scoreDocs).length == 0) continue; ScoreDoc doc = arrayOfScoreDoc[0]; | |
496 /* 611 */ String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(doc.doc).getField("modified").stringValue(); | |
497 /* */ | |
498 /* 613 */ String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE); | |
499 /* 614 */ if (!fileDate.equals(modDate)) | |
500 /* */ { | |
501 /* 618 */ System.out.println("new file:" + filePath); | |
502 /* 619 */ this.languageAnalyzers.deleteDocuments(query); | |
503 /* 620 */ return 2; | |
504 /* */ } | |
505 /* */ | |
506 /* 623 */ return -1; | |
507 /* */ } | |
508 /* */ | |
509 /* 631 */ return 0; | |
510 /* */ } | |
511 /* */ | |
512 /* */ private String checkSupportedLanguages(String lang) | |
513 /* */ { | |
514 /* 643 */ if (this.languageAnalyzers.getAnalyzer(lang) == null) | |
515 /* 644 */ return "all"; | |
516 /* 645 */ return lang; | |
517 /* */ } | |
518 /* */ | |
519 /* */ public void setIndexMetaPriority(boolean prio) | |
520 /* */ { | |
521 /* 650 */ this.indexMetaPriority = prio; | |
522 /* */ } | |
523 /* */ | |
524 /* */ public boolean getIndexMetaPriority() { | |
525 /* 654 */ return this.indexMetaPriority; | |
526 /* */ } | |
527 /* */ | |
528 /* */ private boolean isTextFile(File file) | |
529 /* */ { | |
530 /* 659 */ String fn = file.getName(); | |
531 /* */ | |
532 /* 661 */ String[] splitted = fn.split("[.]"); | |
533 /* */ | |
534 /* 663 */ String ext = ""; | |
535 /* */ | |
536 /* 665 */ if (splitted.length > 1) | |
537 /* */ { | |
538 /* 667 */ ext = splitted[(splitted.length - 1)]; | |
539 /* */ } | |
540 /* */ | |
541 /* 670 */ return this.fileTypesToIndex.contains(ext); | |
542 /* */ } | |
543 /* */ } | |
544 | |
545 /* Location: /private/tmp/fulltextIndexer.jar | |
546 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread | |
547 * JD-Core Version: 0.5.4 | |
548 */ |