0
|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors;
|
|
2 /* */
|
|
3 /* */ import java.io.BufferedReader;
|
|
4 import java.io.File;
|
|
5 import java.io.FileNotFoundException;
|
|
6 import java.io.FileReader;
|
|
7 import java.io.IOException;
|
|
8 import java.io.Reader;
|
|
9 import java.io.UnsupportedEncodingException;
|
|
10 import java.net.URL;
|
|
11 import java.util.ArrayList;
|
|
12 import java.util.Arrays;
|
|
13 import java.util.Date;
|
|
14 import java.util.HashMap;
|
|
15 import java.util.regex.Matcher;
|
|
16 import java.util.regex.Pattern;
|
|
17
|
|
18 import javax.xml.parsers.ParserConfigurationException;
|
|
19
|
|
20 import org.apache.lucene.document.DateTools;
|
|
21 import org.apache.lucene.index.CorruptIndexException;
|
|
22 import org.apache.lucene.index.Term;
|
|
23 import org.apache.lucene.search.Collector;
|
|
24 import org.apache.lucene.search.ScoreDoc;
|
|
25 import org.apache.lucene.search.TermQuery;
|
|
26 import org.apache.lucene.search.TopDocs;
|
|
27 import org.apache.lucene.search.TopScoreDocCollector;
|
|
28 import org.apache.lucene.store.LockObtainFailedException;
|
|
29 import org.apache.xmlrpc.XmlRpcException;
|
|
30 import org.apache.xmlrpc.client.XmlRpcClient;
|
|
31 import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
|
|
32 import org.xml.sax.InputSource;
|
|
33 import org.xml.sax.SAXException;
|
|
34 import org.xml.sax.XMLReader;
|
|
35
|
|
36 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
|
|
37
|
|
38 import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta;
|
|
39 import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
|
|
40 import de.mpiwg.dwinter.lucencetools.documents.FileDocument;
|
|
41 /* */
|
|
42 /* */ public class ProcessFileThread extends Thread
|
|
43 /* */ {
|
|
44 /* */ private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)";
|
|
45 /* */ private static final int DELETED_WRONG_LANGUAGE = 1;
|
|
46 /* */ private static final int DELETED_OLD_VERSION = 2;
|
|
47 /* */ private static final int NEW_FILE = 0;
|
|
48 /* */ private static final int FILE_EXISTS = -1;
|
|
49 /* */ protected File docDir;
|
|
50 /* */ protected File index_dir;
|
|
51 /* 86 */ protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" }));
|
|
52 /* 87 */ protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" }));
|
|
53 /* 88 */ private HashMap<String, String> textLanguage = null;
|
|
54 /* */
|
|
55 /* 90 */ protected HashMap<String, String> languageToISO = new HashMap();
|
|
56 /* 91 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
|
|
57 /* 92 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap();
|
|
58 /* 93 */ private int counter = 0;
|
|
59 /* */ protected String languageFileName;
|
|
60 /* 95 */ protected boolean indexMetaPriority = false;
|
|
61 /* 96 */ protected boolean deduceFromFolderPriority = true;
|
|
62 /* */
|
|
63 /* 101 */ private String specialMode = "";
|
|
64 /* 102 */ public boolean done = false;
|
|
65 /* */ private File processThisFile;
|
|
66 /* 105 */ private String mode = "new"; // if mode is not add, then only modified files and new files will be added.
|
|
67 /* */ private String mdProviderUrl;
|
|
68 /* 107 */ private String preferedLanguage = null;
|
|
69 /* */
|
|
/**
 * Creates a worker configured with a document root and an index directory.
 *
 * @param docDir root folder of the documents
 * @param index_dir directory of the index
 * @param languageFileName name of the language file, may be null
 * @param processThisFile file handled when the thread is run
 * @param mdProviderUrl URL of the XML-RPC metadata provider, may be null
 * @param languageToISO mapping from language names to index language codes
 * @param supportedLanguageFolder mapping from folder names to languages
 */
public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException {
    // locations
    this.docDir = docDir;
    this.index_dir = index_dir;

    // what to process
    this.processThisFile = processThisFile;

    // language and metadata configuration
    this.languageFileName = languageFileName;
    this.languageToISO = languageToISO;
    this.supportedLanguageFolder = supportedLanguageFolder;
    this.mdProviderUrl = mdProviderUrl;
}
|
|
80 /* */
|
|
/**
 * Creates a worker that shares an existing set of language analyzers and an
 * already loaded language table.
 *
 * @param languageAnalyzers2 analyzers used for searching and adding documents
 * @param file file handled when the thread is run
 * @param lfn name of the language file, may be null
 * @param tl language table (key -> language name), may be null
 * @param mdProviderUrl URL of the XML-RPC metadata provider, may be null
 * @param preferedLanguage fallback language when none can be determined, may be null
 * @param languageToISO mapping from language names to index language codes
 * @param supportedLanguageFolder mapping from folder names to languages
 */
public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) {
    // shared index access and the file to work on
    this.languageAnalyzers = languageAnalyzers2;
    this.processThisFile = file;

    // language sources
    this.languageFileName = lfn;
    this.textLanguage = tl;
    this.languageToISO = languageToISO;
    this.supportedLanguageFolder = supportedLanguageFolder;
    this.preferedLanguage = preferedLanguage;

    // metadata provider
    this.mdProviderUrl = mdProviderUrl;
}
|
|
93 /* */
|
|
94 /* */ public void run()
|
|
95 /* */ {
|
|
96 /* */ try
|
|
97 /* */ {
|
|
98 /* 140 */ processFile(this.processThisFile);
|
|
99 /* */ }
|
|
100 /* */ catch (CorruptIndexException e) {
|
|
101 /* 143 */ e.printStackTrace();
|
|
102 /* */ }
|
|
103 /* */ catch (FileNotFoundException e) {
|
|
104 /* 146 */ e.printStackTrace();
|
|
105 /* */ }
|
|
106 /* */ catch (UnsupportedEncodingException e) {
|
|
107 /* 149 */ e.printStackTrace();
|
|
108 /* */ }
|
|
109 /* */ catch (IOException e) {
|
|
110 /* 152 */ e.printStackTrace();
|
|
111 /* */ }
|
|
112 /* */ catch (InterruptedException e) {
|
|
113 /* 155 */ e.printStackTrace();
|
|
114 /* */ }
|
|
115 /* 157 */ this.done = true;
|
|
116 /* */ }
|
|
117 /* */
|
|
118 /* */ private String getLanguageOfText(String textId, File file) throws IOException {
|
|
119 /* 161 */ String lang = null;
|
|
120 /* */
|
|
121 /* 163 */ if (this.deduceFromFolderPriority)
|
|
122 /* */ {
|
|
123 /* 165 */ lang = deduceFromFolderName(file);
|
|
124 /* 166 */ if (lang != null) {
|
|
125 /* 167 */ return lang;
|
|
126 /* */ }
|
|
127 /* */ }
|
|
128 /* 170 */ if ((this.languageFileName == null | this.indexMetaPriority)) {
|
|
129 /* 171 */ lang = getLanguageFromIndexMeta(file);
|
|
130 /* */
|
|
131 /* 177 */ if ((lang != null) &&
|
|
132 /* 178 */ (lang.equals(""))) {
|
|
133 /* 179 */ System.out.println("Language for " + file.getAbsolutePath() + " is " + lang);
|
|
134 /* 180 */ return lang;
|
|
135 /* */ }
|
|
136 /* */ }
|
|
137 /* 183 */ if (this.languageFileName != null)
|
|
138 /* */ {
|
|
139 /* 185 */ if (this.textLanguage == null)
|
|
140 /* 186 */ this.textLanguage = loadLanguages();
|
|
141 /* 187 */ if (this.textLanguage == null)
|
|
142 /* */ {
|
|
143 /* 189 */ System.out.println("NO LANGUAGE FILES LOADED");
|
|
144 /* */ }
|
|
145 /* */ else
|
|
146 /* */ {
|
|
147 /* 198 */ String language = (String)this.textLanguage.get(textId);
|
|
148 /* 199 */ lang = (String)this.languageToISO.get(language);
|
|
149 /* 200 */ if (lang != null)
|
|
150 /* */ {
|
|
151 /* 202 */ System.out.println("GOT language from language file:" + lang);
|
|
152 /* 203 */ return lang;
|
|
153 /* */ }
|
|
154 /* */ }
|
|
155 /* */
|
|
156 /* */ }
|
|
157 /* */
|
|
158 /* 209 */ lang = deduceFromFolderName(file);
|
|
159 /* 210 */ if (lang != null)
|
|
160 /* */ {
|
|
161 /* 212 */ System.out.println("Langugage deduced from Folder:" + lang);
|
|
162 /* 213 */ return lang;
|
|
163 /* */ }
|
|
164 /* */
|
|
165 /* 216 */ if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) {
|
|
166 /* 217 */ System.out.println("no language identified from Metadata: prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath());
|
|
167 /* 218 */ return this.preferedLanguage;
|
|
168 /* */ }
|
|
169 /* */
|
|
170 /* 221 */ System.out.println("no language identified: language will be generic all:" + file.getAbsolutePath());
|
|
171 /* 222 */ return "all";
|
|
172 /* */ }
|
|
173 /* */
|
|
174 /* */ private String deduceFromFolderName(File file) {
|
|
175 /* 226 */ File parent = file.getParentFile();
|
|
176 /* 227 */ String name = parent.getName();
|
|
177 /* 228 */ String lang = null;
|
|
178 /* 229 */ if (this.supportedLanguageFolder.containsKey(name))
|
|
179 /* */ {
|
|
180 /* 231 */ lang = (String)this.supportedLanguageFolder.get(name);
|
|
181 /* */ }
|
|
182 /* 233 */ return lang;
|
|
183 /* */ }
|
|
184 /* */
|
|
185 /* */ private String getLanguageFromIndexMeta(File file)
|
|
186 /* */ throws IOException
|
|
187 /* */ {
|
|
188 /* 244 */ file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath()));
|
|
189 /* */
|
|
190 /* 246 */ File pf = file.getParentFile().getParentFile().getParentFile();
|
|
191 /* 247 */ File indexMeta = new File(pf, "index.meta");
|
|
192 /* */
|
|
193 /* 249 */ if (!indexMeta.exists())
|
|
194 /* */ {
|
|
195 /* 251 */ File pf2 = pf.getParentFile();
|
|
196 /* 252 */ indexMeta = new File(pf2, "index.meta");
|
|
197 /* 253 */ if (!indexMeta.exists())
|
|
198 /* 254 */ return null;
|
|
199 /* */ }
|
|
200 /* 256 */ XMLReader parser = new SAXParser();
|
|
201 /* 257 */ ParseIndexMeta ch = new ParseIndexMeta();
|
|
202 /* 258 */ parser.setContentHandler(ch);
|
|
203 /* */ try {
|
|
204 /* 260 */ Reader reader = new FileReader(indexMeta);
|
|
205 /* 261 */ InputSource input = new InputSource(reader);
|
|
206 /* 262 */ parser.parse(input);
|
|
207 /* */ }
|
|
208 /* */ catch (SAXException e)
|
|
209 /* */ {
|
|
210 /* 266 */ e.printStackTrace();
|
|
211 /* */ }
|
|
212 /* */
|
|
213 /* 269 */ String lang = ch.lang;
|
|
214 /* */
|
|
215 /* 272 */ return lang;
|
|
216 /* */ }
|
|
217 /* */
|
|
218 /* */ private String getDCFromIndexMeta(String textId)
|
|
219 /* */ throws IOException, XmlRpcException
|
|
220 /* */ {
|
|
221 /* 301 */ XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
|
|
222 /* 302 */ URL url = new URL(this.mdProviderUrl);
|
|
223 /* 303 */ config.setServerURL(url);
|
|
224 /* 304 */ XmlRpcClient client = new XmlRpcClient();
|
|
225 /* 305 */ client.setConfig(config);
|
|
226 /* */
|
|
227 /* 307 */ Object[] params = { textId };
|
|
228 /* 308 */ Object returnVals = client.execute("getDCFormatted", params);
|
|
229 /* */
|
|
230 /* 311 */ return (String)returnVals;
|
|
231 /* */ }
|
|
232 /* */
|
|
233 /* */ protected HashMap<String, String> loadLanguages()
|
|
234 /* */ {
|
|
235 /* 320 */ File languageFile = new File(this.docDir + "/" + this.languageFileName);
|
|
236 /* 321 */ String languageFilePath = this.docDir + "/" + this.languageFileName;
|
|
237 /* 322 */ HashMap languages = new HashMap();
|
|
238 /* 323 */ boolean relativ = true;
|
|
239 /* */
|
|
240 /* 325 */ if (!languageFile.exists())
|
|
241 /* */ {
|
|
242 /* 327 */ languageFile = new File(this.languageFileName);
|
|
243 /* 328 */ languageFilePath = this.languageFileName;
|
|
244 /* 329 */ relativ = false;
|
|
245 /* 330 */ if (!languageFile.exists())
|
|
246 /* 331 */ return null;
|
|
247 /* */ }
|
|
248 /* */ BufferedReader in;
|
|
249 /* */ try {
|
|
250 /* 335 */ in = new BufferedReader(new FileReader(languageFilePath));
|
|
251 /* */ } catch (FileNotFoundException e) {
|
|
252 /* 337 */ return null;
|
|
253 /* */ }
|
|
254 /* */
|
|
255 /* 340 */ String zeile = null;
|
|
256 /* */ try {
|
|
257 /* 342 */ while ((zeile = in.readLine()) != null) {
|
|
258 /* 343 */ String[] splitted = zeile.replace("\"", "").split("[,]");
|
|
259 /* 344 */ if (splitted.length == 2)
|
|
260 /* 345 */ if (relativ)
|
|
261 /* 346 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]);
|
|
262 /* */ else
|
|
263 /* 348 */ languages.put(splitted[0], splitted[1]);
|
|
264 /* */ }
|
|
265 /* */ }
|
|
266 /* */ catch (IOException e) {
|
|
267 /* 352 */ e.printStackTrace();
|
|
268 /* 353 */ return null;
|
|
269 /* */ }
|
|
270 /* */
|
|
271 /* 356 */ return languages;
|
|
272 /* */ }
|
|
273 /* */
|
|
274 /* */ public void harvestFolder()
|
|
275 /* */ throws InterruptedException
|
|
276 /* */ {
|
|
277 /* 362 */ Date start = new Date();
|
|
278 /* 363 */ boolean create = true;
|
|
279 /* */ try
|
|
280 /* */ {
|
|
281 /* 374 */ System.out.println("Indexing to directory '" + this.index_dir + "'...");
|
|
282 /* 375 */ indexDocs(this.docDir);
|
|
283 /* 376 */ System.out.println("Optimizing...");
|
|
284 /* 377 */ this.languageAnalyzers.optimize();
|
|
285 /* 378 */ this.languageAnalyzers.close();
|
|
286 /* */
|
|
287 /* 380 */ Date end = new Date();
|
|
288 /* 381 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds");
|
|
289 /* */ }
|
|
290 /* */ catch (IOException e) {
|
|
291 /* 384 */ System.out.println(" caught a " + e.getClass() +
|
|
292 /* 385 */ "\n with message: " + e.getMessage());
|
|
293 /* */ }
|
|
294 /* */ }
|
|
295 /* */
|
|
296 /* */ void indexDocs(File file)
|
|
297 /* */ throws IOException, InterruptedException
|
|
298 /* */ {
|
|
299 /* 392 */ if (!file.canRead())
|
|
300 /* */ return;
|
|
301 /* 394 */ if (file.isDirectory())
|
|
302 /* */ {
|
|
303 /* 396 */ if (this.counter > 100000)
|
|
304 /* */ {
|
|
305 /* 398 */ return;
|
|
306 /* */ }
|
|
307 /* 400 */ String[] files = file.list();
|
|
308 /* */
|
|
309 /* 402 */ String folderName = file.getName();
|
|
310 /* 403 */ if ((((files != null) ? 1 : 0) & ((this.excludeFolders.contains(folderName)) ? 0 : 1)) != 0) {
|
|
311 /* 404 */ for (int i = 0; i < files.length; ++i)
|
|
312 /* 405 */ indexDocs(new File(file, files[i]));
|
|
313 /* */ }
|
|
314 /* */ }
|
|
315 /* 408 */ else if (isTextFile(file)) {
|
|
316 /* 409 */ processFile(file);
|
|
317 /* */ }
|
|
318 /* */ else {
|
|
319 /* 412 */ System.out.println("not adding " + file);
|
|
320 /* */ }
|
|
321 /* */ }
|
|
322 /* */
|
|
323 /* */ private void processFile(File file)
|
|
324 /* */ throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException
|
|
325 /* */ {
|
|
326 /* 423 */ String textId = getTextId(file);
|
|
327 /* 424 */ System.out.println("file:" + this.counter);
|
|
328 /* 425 */ System.out.println("textId:" + textId);
|
|
329 /* */
|
|
330 /* 427 */ String lang = getLanguageOfText(textId, file);
|
|
331 /* 428 */ String dcMetaData = null;
|
|
332 /* 429 */ if (this.mdProviderUrl != null)
|
|
333 /* */ try {
|
|
334 /* 431 */ dcMetaData = getDCFromIndexMeta(textId);
|
|
335 /* */ } catch (XmlRpcException e2) {
|
|
336 /* 433 */ dcMetaData = null;
|
|
337 /* */ }
|
|
338 /* */ int docNr;
|
|
339 /* */
|
|
340 /* 437 */ if (this.mode == "add")
|
|
341 /* 438 */ docNr = 0;
|
|
342 /* */ else
|
|
343 /* 440 */ docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified());
|
|
344 /* 441 */ if (lang == null) {
|
|
345 /* 442 */ System.out.println("not adding " + file);
|
|
346 /* */ }
|
|
347 /* 444 */ else if (docNr == -1) {
|
|
348 /* 445 */ System.out.println(" OLD FILE:" + file);
|
|
349 /* 446 */ } else if (docNr >= 0)
|
|
350 /* */ {
|
|
351 /* 448 */ System.out.println("adding " + file + " lang: " + lang);
|
|
352 /* */ try
|
|
353 /* */ {
|
|
354 /* 451 */ Boolean ret = addDocument(file, lang, dcMetaData, textId);
|
|
355 /* 452 */ if (ret.booleanValue())
|
|
356 /* 453 */ this.counter += 1;
|
|
357 /* */ } catch (IOException e) {
|
|
358 /* 455 */ System.out.println("got an IO eception adding the document - wait a bit");
|
|
359 /* 456 */ Thread.sleep(10000L);
|
|
360 /* 457 */ System.out.println("Try again");
|
|
361 /* */ try {
|
|
362 /* 459 */ Boolean ret = addDocument(file, lang, dcMetaData, textId);
|
|
363 /* 460 */ if (ret.booleanValue())
|
|
364 /* 461 */ this.counter += 1;
|
|
365 /* */ } catch (IOException e1) {
|
|
366 /* 463 */ System.out.println("Couldn't do:" + file.getName());
|
|
367 /* */ }
|
|
368 /* */ catch (ParserConfigurationException e2) {
|
|
369 /* 466 */ e.printStackTrace();
|
|
370 /* */ }
|
|
371 /* */ catch (SAXException e2) {
|
|
372 /* 469 */ e.printStackTrace();
|
|
373 /* */ }
|
|
374 /* */ }
|
|
375 /* */ catch (ParserConfigurationException e) {
|
|
376 /* 473 */ e.printStackTrace();
|
|
377 /* */ }
|
|
378 /* */ catch (SAXException e) {
|
|
379 /* 476 */ e.printStackTrace();
|
|
380 /* */ }
|
|
381 /* */
|
|
382 /* */ }
|
|
383 /* */ else
|
|
384 /* */ {
|
|
385 /* 482 */ System.out.println(" UPDATE FILE:" + file + " lang: " + lang);
|
|
386 /* */
|
|
387 /* 484 */ this.counter += 1;
|
|
388 /* */ try {
|
|
389 /* 486 */ addDocument(file, lang, dcMetaData, textId);
|
|
390 /* */ }
|
|
391 /* */ catch (ParserConfigurationException e) {
|
|
392 /* 489 */ e.printStackTrace();
|
|
393 /* */ }
|
|
394 /* */ catch (SAXException e) {
|
|
395 /* 492 */ e.printStackTrace();
|
|
396 /* */ }
|
|
397 /* */ }
|
|
398 /* */ }
|
|
399 /* */
|
|
400 /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId)
|
|
401 /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException
|
|
402 /* */ {
|
|
403 /* 509 */ if (dcMetaData != null) {
|
|
404 /* 510 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, dcMetaData, textId), lang);
|
|
405 /* 511 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", dcMetaData, textId), "all");
|
|
406 /* */ }
|
|
407 /* */ else
|
|
408 /* */ {
|
|
409 /* 515 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, textId), lang);
|
|
410 /* 516 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", textId), "all");
|
|
411 /* */ }
|
|
412 /* 518 */ return Boolean.valueOf(true);
|
|
413 /* */ }
|
|
414 /* */
|
|
415 /* */ private String getTextId(File file)
|
|
416 /* */ {
|
|
417 /* */ try
|
|
418 /* */ {
|
|
419 /* 529 */ File parent = file.getParentFile();
|
|
420 /* */
|
|
421 /* 531 */ if (parent.getName().equals("text"))
|
|
422 /* 532 */ return absPathToTextId(parent.getParentFile().getAbsolutePath());
|
|
423 /* 533 */ if (parent.getParentFile().getName().equals("text"))
|
|
424 /* 534 */ return absPathToTextId(parent.getParentFile().getParentFile().getAbsolutePath());
|
|
425 /* 535 */ if (parent.getParentFile().getParentFile().getName().equals("text")) {
|
|
426 /* 536 */ return absPathToTextId(parent.getParentFile().getParentFile().getParentFile().getAbsolutePath());
|
|
427 /* */ }
|
|
428 /* 538 */ return null;
|
|
429 /* */ }
|
|
430 /* */ catch (RuntimeException e) {
|
|
431 /* 541 */ e.printStackTrace();
|
|
432 /* 542 */ }return null;
|
|
433 /* */ }
|
|
434 /* */
|
|
435 protected String absPathToTextId(File file)
|
|
436 /* */ {
|
|
437 try {
|
|
438 return absPathToTextId(file.getCanonicalPath());
|
|
439 } catch (IOException e) {
|
|
440
|
|
441 e.printStackTrace();
|
|
442 return "";
|
|
443 }
|
|
444 }
|
|
445
|
|
446 /* */ protected String absPathToTextId(String absolutePath)
|
|
447 /* */ {
|
|
448 /* 555 */ if (this.specialMode.equals("vlp"))
|
|
449 /* */ {
|
|
450 /* 557 */ String[] splitted = absolutePath.split("lit");
|
|
451 /* 558 */ return splitted[1];
|
|
452 /* */ }
|
|
453 /* */
|
|
454 /* 562 */ Pattern p = Pattern.compile(TEXTIDFROMPATH_REGEXP);
|
|
455 /* 563 */ Matcher m = p.matcher(absolutePath);
|
|
456 /* 564 */ m.matches();
|
|
457 /* 565 */ if (m.groupCount() > 0) {
|
|
458 /* 566 */ return m.group(1);
|
|
459 /* */ }
|
|
460 /* 568 */ System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath);
|
|
461 /* 569 */ return absolutePath;
|
|
462 /* */ }
|
|
463 /* */
|
|
464 /* */ private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate)
|
|
465 /* */ throws CorruptIndexException, IOException
|
|
466 /* */ {
|
|
467 /* 577 */ lang = checkSupportedLanguages(lang);
|
|
468 /* 578 */ System.out.println("lang converted+" + lang);
|
|
469 /* 579 */ //TermQuery query = new TermQuery(new Term("path", filePath));
|
|
470 TermQuery query = new TermQuery(new Term("cleanedPath", absPathToTextId(filePath)));
|
|
471 /* */
|
|
472 /* 582 */ HashMap<String,Collector> results = this.languageAnalyzers.search(query);
|
|
473 /* */
|
|
474 /* 584 */ if (results == null) {
|
|
475 /* 585 */ return 0;
|
|
476 /* */ }
|
|
477 /* 587 */ for (String resultLang : results.keySet())
|
|
478 /* */ {
|
|
479 /* 589 */ TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang);
|
|
480 /* */
|
|
481 /* 591 */ if ((collector == null) || (collector.getTotalHits() <= 0))
|
|
482 /* */ continue;
|
|
483 /* 593 */ if ((!resultLang.equals(lang)) && (deleteWrongLanguage) && (!resultLang.equals("morph")))
|
|
484 /* */ {
|
|
485 /* 595 */ this.languageAnalyzers.deleteDocuments(query);
|
|
486 /* */
|
|
487 /* 603 */ System.out.println("language changed:" + filePath);
|
|
488 /* 604 */ return 1;
|
|
489 /* */ }
|
|
490 /* */
|
|
491 /* 607 */ if (!resultLang.equals(lang))
|
|
492 /* */ continue;
|
|
493 /* 609 */ TopDocs docs = collector.topDocs();
|
|
494 /* */ ScoreDoc[] arrayOfScoreDoc;
|
|
495 /* 610 */ if ((arrayOfScoreDoc = docs.scoreDocs).length == 0) continue; ScoreDoc doc = arrayOfScoreDoc[0];
|
|
496 /* 611 */ String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(doc.doc).getField("modified").stringValue();
|
|
497 /* */
|
|
498 /* 613 */ String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE);
|
|
499 /* 614 */ if (!fileDate.equals(modDate))
|
|
500 /* */ {
|
|
501 /* 618 */ System.out.println("new file:" + filePath);
|
|
502 /* 619 */ this.languageAnalyzers.deleteDocuments(query);
|
|
503 /* 620 */ return 2;
|
|
504 /* */ }
|
|
505 /* */
|
|
506 /* 623 */ return -1;
|
|
507 /* */ }
|
|
508 /* */
|
|
509 /* 631 */ return 0;
|
|
510 /* */ }
|
|
511 /* */
|
|
512 /* */ private String checkSupportedLanguages(String lang)
|
|
513 /* */ {
|
|
514 /* 643 */ if (this.languageAnalyzers.getAnalyzer(lang) == null)
|
|
515 /* 644 */ return "all";
|
|
516 /* 645 */ return lang;
|
|
517 /* */ }
|
|
518 /* */
|
|
519 /* */ public void setIndexMetaPriority(boolean prio)
|
|
520 /* */ {
|
|
521 /* 650 */ this.indexMetaPriority = prio;
|
|
522 /* */ }
|
|
523 /* */
|
|
524 /* */ public boolean getIndexMetaPriority() {
|
|
525 /* 654 */ return this.indexMetaPriority;
|
|
526 /* */ }
|
|
527 /* */
|
|
528 /* */ private boolean isTextFile(File file)
|
|
529 /* */ {
|
|
530 /* 659 */ String fn = file.getName();
|
|
531 /* */
|
|
532 /* 661 */ String[] splitted = fn.split("[.]");
|
|
533 /* */
|
|
534 /* 663 */ String ext = "";
|
|
535 /* */
|
|
536 /* 665 */ if (splitted.length > 1)
|
|
537 /* */ {
|
|
538 /* 667 */ ext = splitted[(splitted.length - 1)];
|
|
539 /* */ }
|
|
540 /* */
|
|
541 /* 670 */ return this.fileTypesToIndex.contains(ext);
|
|
542 /* */ }
|
|
543 /* */ }
|
|
544
|
|
545 /* Location: /private/tmp/fulltextIndexer.jar
|
|
546 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread
|
|
547 * JD-Core Version: 0.5.4
|
|
548 */ |