Mercurial > hg > fulltextIndexer
view src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/ProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line source
/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; /* */ /* */ import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; import org.apache.lucene.search.Collector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.LockObtainFailedException; import org.apache.xmlrpc.XmlRpcException; import org.apache.xmlrpc.client.XmlRpcClient; import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta; import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; import de.mpiwg.dwinter.lucencetools.documents.FileDocument; /* */ /* */ public class ProcessFileThread extends Thread /* */ { /* */ private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)"; /* */ private static final int DELETED_WRONG_LANGUAGE = 1; /* */ private static final int DELETED_OLD_VERSION = 2; /* */ private static final int NEW_FILE = 0; /* */ private static final int FILE_EXISTS = -1; /* */ protected File docDir; /* */ protected File index_dir; /* 86 */ protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" })); /* 87 */ protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" })); /* 88 */ private HashMap<String, String> textLanguage = null; /* */ /* 90 */ protected HashMap<String, String> languageToISO = new HashMap(); /* 91 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); /* 92 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap(); /* 93 */ private int counter = 0; /* */ protected String languageFileName; /* 95 */ protected boolean indexMetaPriority = false; /* 96 */ protected boolean deduceFromFolderPriority = true; /* */ /* 101 */ private String specialMode = ""; /* 102 */ public boolean done = false; /* */ private File processThisFile; /* 105 */ private String mode = "new"; // if mode is not add, then only modified files and new files will be added. /* */ private String mdProviderUrl; /* 107 */ private String preferedLanguage = null; /* */ /* */ public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException { /* 110 */ this.docDir = docDir; /* 111 */ this.languageFileName = languageFileName; /* */ /* 114 */ this.index_dir = index_dir; /* 115 */ this.processThisFile = processThisFile; /* 116 */ this.mdProviderUrl = mdProviderUrl; /* 117 */ this.languageToISO = languageToISO; /* 118 */ this.supportedLanguageFolder = supportedLanguageFolder; /* */ } /* */ /* */ public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) /* */ { /* 123 */ this.languageAnalyzers = languageAnalyzers2; /* 124 */ this.processThisFile = file; /* 125 */ this.textLanguage = tl; /* */ /* 128 */ this.languageFileName = lfn; /* 129 */ this.preferedLanguage = preferedLanguage; /* 130 */ this.mdProviderUrl = mdProviderUrl; /* 131 */ this.languageToISO = languageToISO; /* 132 */ this.supportedLanguageFolder = supportedLanguageFolder; /* */ } /* */ /* */ public void run() /* */ { /* */ try /* */ { /* 140 */ processFile(this.processThisFile); /* */ } /* */ catch (CorruptIndexException e) { /* 143 */ e.printStackTrace(); /* */ } /* */ catch (FileNotFoundException e) { /* 146 */ e.printStackTrace(); /* */ } /* */ catch (UnsupportedEncodingException e) { /* 149 */ e.printStackTrace(); /* */ } /* */ catch (IOException e) { /* 152 */ e.printStackTrace(); /* */ } /* */ catch (InterruptedException e) { /* 155 */ e.printStackTrace(); /* */ } /* 157 */ this.done = true; /* */ } /* */ /* */ private String getLanguageOfText(String textId, File file) throws IOException { /* 161 */ String lang = null; /* */ /* 163 */ if (this.deduceFromFolderPriority) /* */ { /* 165 */ lang = deduceFromFolderName(file); /* 166 */ if (lang != null) { /* 167 */ return lang; /* */ } /* */ } /* 170 */ if ((this.languageFileName == null | this.indexMetaPriority)) { /* 171 */ lang = getLanguageFromIndexMeta(file); /* */ /* 177 */ if ((lang != null) && /* 178 */ (lang.equals(""))) { /* 179 */ System.out.println("Language for " + file.getAbsolutePath() + " is " + lang); /* 180 */ return lang; /* */ } /* */ } /* 183 */ if (this.languageFileName != null) /* */ { /* 185 */ if (this.textLanguage == null) /* 186 */ this.textLanguage = loadLanguages(); /* 187 */ if (this.textLanguage == null) /* */ { /* 189 */ System.out.println("NO LANGUAGE FILES LOADED"); /* */ } /* */ else /* */ { /* 198 */ String language = (String)this.textLanguage.get(textId); /* 199 */ lang = (String)this.languageToISO.get(language); /* 200 */ if (lang != null) /* */ { /* 202 */ System.out.println("GOT language from language file:" + lang); /* 203 */ return lang; /* */ } /* */ } /* */ /* */ } /* */ /* 209 */ lang = deduceFromFolderName(file); /* 210 */ if (lang != null) /* */ { /* 212 */ System.out.println("Langugage deduced from Folder:" + lang); /* 213 */ return lang; /* */ } /* */ /* 216 */ if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) { /* 217 */ System.out.println("no language identified from Metadata: prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath()); /* 218 */ return this.preferedLanguage; /* */ } /* */ /* 221 */ System.out.println("no language identified: language will be generic all:" + file.getAbsolutePath()); /* 222 */ return "all"; /* */ } /* */ /* */ private String deduceFromFolderName(File file) { /* 226 */ File parent = file.getParentFile(); /* 227 */ String name = parent.getName(); /* 228 */ String lang = null; /* 229 */ if (this.supportedLanguageFolder.containsKey(name)) /* */ { /* 231 */ lang = (String)this.supportedLanguageFolder.get(name); /* */ } /* 233 */ return lang; /* */ } /* */ /* */ private String getLanguageFromIndexMeta(File file) /* */ throws IOException /* */ { /* 244 */ file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath())); /* */ /* 246 */ File pf = file.getParentFile().getParentFile().getParentFile(); /* 247 */ File indexMeta = new File(pf, "index.meta"); /* */ /* 249 */ if (!indexMeta.exists()) /* */ { /* 251 */ File pf2 = pf.getParentFile(); /* 252 */ indexMeta = new File(pf2, "index.meta"); /* 253 */ if (!indexMeta.exists()) /* 254 */ return null; /* */ } /* 256 */ XMLReader parser = new SAXParser(); /* 257 */ ParseIndexMeta ch = new ParseIndexMeta(); /* 258 */ parser.setContentHandler(ch); /* */ try { /* 260 */ Reader reader = new FileReader(indexMeta); /* 261 */ InputSource input = new InputSource(reader); /* 262 */ parser.parse(input); /* */ } /* */ catch (SAXException e) /* */ { /* 266 */ e.printStackTrace(); /* */ } /* */ /* 269 */ String lang = ch.lang; /* */ /* 272 */ return lang; /* */ } /* */ /* */ private String getDCFromIndexMeta(String textId) /* */ throws IOException, XmlRpcException /* */ { /* 301 */ XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); /* 302 */ URL url = new URL(this.mdProviderUrl); /* 303 */ config.setServerURL(url); /* 304 */ XmlRpcClient client = new XmlRpcClient(); /* 305 */ client.setConfig(config); /* */ /* 307 */ Object[] params = { textId }; /* 308 */ Object returnVals = client.execute("getDCFormatted", params); /* */ /* 311 */ return (String)returnVals; /* */ } /* */ /* */ protected HashMap<String, String> loadLanguages() /* */ { /* 320 */ File languageFile = new File(this.docDir + "/" + this.languageFileName); /* 321 */ String languageFilePath = this.docDir + "/" + this.languageFileName; /* 322 */ HashMap languages = new HashMap(); /* 323 */ boolean relativ = true; /* */ /* 325 */ if (!languageFile.exists()) /* */ { /* 327 */ languageFile = new File(this.languageFileName); /* 328 */ languageFilePath = this.languageFileName; /* 329 */ relativ = false; /* 330 */ if (!languageFile.exists()) /* 331 */ return null; /* */ } /* */ BufferedReader in; /* */ try { /* 335 */ in = new BufferedReader(new FileReader(languageFilePath)); /* */ } catch (FileNotFoundException e) { /* 337 */ return null; /* */ } /* */ /* 340 */ String zeile = null; /* */ try { /* 342 */ while ((zeile = in.readLine()) != null) { /* 343 */ String[] splitted = zeile.replace("\"", "").split("[,]"); /* 344 */ if (splitted.length == 2) /* 345 */ if (relativ) /* 346 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]); /* */ else /* 348 */ languages.put(splitted[0], splitted[1]); /* */ } /* */ } /* */ catch (IOException e) { /* 352 */ e.printStackTrace(); /* 353 */ return null; /* */ } /* */ /* 356 */ return languages; /* */ } /* */ /* */ public void harvestFolder() /* */ throws InterruptedException /* */ { /* 362 */ Date start = new Date(); /* 363 */ boolean create = true; /* */ try /* */ { /* 374 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); /* 375 */ indexDocs(this.docDir); /* 376 */ System.out.println("Optimizing..."); /* 377 */ this.languageAnalyzers.optimize(); /* 378 */ this.languageAnalyzers.close(); /* */ /* 380 */ Date end = new Date(); /* 381 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); /* */ } /* */ catch (IOException e) { /* 384 */ System.out.println(" caught a " + e.getClass() + /* 385 */ "\n with message: " + e.getMessage()); /* */ } /* */ } /* */ /* */ void indexDocs(File file) /* */ throws IOException, InterruptedException /* */ { /* 392 */ if (!file.canRead()) /* */ return; /* 394 */ if (file.isDirectory()) /* */ { /* 396 */ if (this.counter > 100000) /* */ { /* 398 */ return; /* */ } /* 400 */ String[] files = file.list(); /* */ /* 402 */ String folderName = file.getName(); /* 403 */ if ((((files != null) ? 1 : 0) & ((this.excludeFolders.contains(folderName)) ? 0 : 1)) != 0) { /* 404 */ for (int i = 0; i < files.length; ++i) /* 405 */ indexDocs(new File(file, files[i])); /* */ } /* */ } /* 408 */ else if (isTextFile(file)) { /* 409 */ processFile(file); /* */ } /* */ else { /* 412 */ System.out.println("not adding " + file); /* */ } /* */ } /* */ /* */ private void processFile(File file) /* */ throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException /* */ { /* 423 */ String textId = getTextId(file); /* 424 */ System.out.println("file:" + this.counter); /* 425 */ System.out.println("textId:" + textId); /* */ /* 427 */ String lang = getLanguageOfText(textId, file); /* 428 */ String dcMetaData = null; /* 429 */ if (this.mdProviderUrl != null) /* */ try { /* 431 */ dcMetaData = getDCFromIndexMeta(textId); /* */ } catch (XmlRpcException e2) { /* 433 */ dcMetaData = null; /* */ } /* */ int docNr; /* */ /* 437 */ if (this.mode == "add") /* 438 */ docNr = 0; /* */ else /* 440 */ docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified()); /* 441 */ if (lang == null) { /* 442 */ System.out.println("not adding " + file); /* */ } /* 444 */ else if (docNr == -1) { /* 445 */ System.out.println(" OLD FILE:" + file); /* 446 */ } else if (docNr >= 0) /* */ { /* 448 */ System.out.println("adding " + file + " lang: " + lang); /* */ try /* */ { /* 451 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); /* 452 */ if (ret.booleanValue()) /* 453 */ this.counter += 1; /* */ } catch (IOException e) { /* 455 */ System.out.println("got an IO eception adding the document - wait a bit"); /* 456 */ Thread.sleep(10000L); /* 457 */ System.out.println("Try again"); /* */ try { /* 459 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); /* 460 */ if (ret.booleanValue()) /* 461 */ this.counter += 1; /* */ } catch (IOException e1) { /* 463 */ System.out.println("Couldn't do:" + file.getName()); /* */ } /* */ catch (ParserConfigurationException e2) { /* 466 */ e.printStackTrace(); /* */ } /* */ catch (SAXException e2) { /* 469 */ e.printStackTrace(); /* */ } /* */ } /* */ catch (ParserConfigurationException e) { /* 473 */ e.printStackTrace(); /* */ } /* */ catch (SAXException e) { /* 476 */ e.printStackTrace(); /* */ } /* */ /* */ } /* */ else /* */ { /* 482 */ System.out.println(" UPDATE FILE:" + file + " lang: " + lang); /* */ /* 484 */ this.counter += 1; /* */ try { /* 486 */ addDocument(file, lang, dcMetaData, textId); /* */ } /* */ catch (ParserConfigurationException e) { /* 489 */ e.printStackTrace(); /* */ } /* */ catch (SAXException e) { /* 492 */ e.printStackTrace(); /* */ } /* */ } /* */ } /* */ /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException /* */ { /* 509 */ if (dcMetaData != null) { /* 510 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, dcMetaData, textId), lang); /* 511 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", dcMetaData, textId), "all"); /* */ } /* */ else /* */ { /* 515 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, textId), lang); /* 516 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", textId), "all"); /* */ } /* 518 */ return Boolean.valueOf(true); /* */ } /* */ /* */ private String getTextId(File file) /* */ { /* */ try /* */ { /* 529 */ File parent = file.getParentFile(); /* */ /* 531 */ if (parent.getName().equals("text")) /* 532 */ return absPathToTextId(parent.getParentFile().getAbsolutePath()); /* 533 */ if (parent.getParentFile().getName().equals("text")) /* 534 */ return absPathToTextId(parent.getParentFile().getParentFile().getAbsolutePath()); /* 535 */ if (parent.getParentFile().getParentFile().getName().equals("text")) { /* 536 */ return absPathToTextId(parent.getParentFile().getParentFile().getParentFile().getAbsolutePath()); /* */ } /* 538 */ return null; /* */ } /* */ catch (RuntimeException e) { /* 541 */ e.printStackTrace(); /* 542 */ }return null; /* */ } /* */ protected String absPathToTextId(File file) /* */ { try { return absPathToTextId(file.getCanonicalPath()); } catch (IOException e) { e.printStackTrace(); return ""; } } /* */ protected String absPathToTextId(String absolutePath) /* */ { /* 555 */ if (this.specialMode.equals("vlp")) /* */ { /* 557 */ String[] splitted = absolutePath.split("lit"); /* 558 */ return splitted[1]; /* */ } /* */ /* 562 */ Pattern p = Pattern.compile(TEXTIDFROMPATH_REGEXP); /* 563 */ Matcher m = p.matcher(absolutePath); /* 564 */ m.matches(); /* 565 */ if (m.groupCount() > 0) { /* 566 */ return m.group(1); /* */ } /* 568 */ System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath); /* 569 */ return absolutePath; /* */ } /* */ /* */ private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate) /* */ throws CorruptIndexException, IOException /* */ { /* 577 */ lang = checkSupportedLanguages(lang); /* 578 */ System.out.println("lang converted+" + lang); /* 579 */ //TermQuery query = new TermQuery(new Term("path", filePath)); TermQuery query = new TermQuery(new Term("cleanedPath", absPathToTextId(filePath))); /* */ /* 582 */ HashMap<String,Collector> results = this.languageAnalyzers.search(query); /* */ /* 584 */ if (results == null) { /* 585 */ return 0; /* */ } /* 587 */ for (String resultLang : results.keySet()) /* */ { /* 589 */ TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang); /* */ /* 591 */ if ((collector == null) || (collector.getTotalHits() <= 0)) /* */ continue; /* 593 */ if ((!resultLang.equals(lang)) && (deleteWrongLanguage) && (!resultLang.equals("morph"))) /* */ { /* 595 */ this.languageAnalyzers.deleteDocuments(query); /* */ /* 603 */ System.out.println("language changed:" + filePath); /* 604 */ return 1; /* */ } /* */ /* 607 */ if (!resultLang.equals(lang)) /* */ continue; /* 609 */ TopDocs docs = collector.topDocs(); /* */ ScoreDoc[] arrayOfScoreDoc; /* 610 */ if ((arrayOfScoreDoc = docs.scoreDocs).length == 0) continue; ScoreDoc doc = arrayOfScoreDoc[0]; /* 611 */ String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(doc.doc).getField("modified").stringValue(); /* */ /* 613 */ String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE); /* 614 */ if (!fileDate.equals(modDate)) /* */ { /* 618 */ System.out.println("new file:" + filePath); /* 619 */ this.languageAnalyzers.deleteDocuments(query); /* 620 */ return 2; /* */ } /* */ /* 623 */ return -1; /* */ } /* */ /* 631 */ return 0; /* */ } /* */ /* */ private String checkSupportedLanguages(String lang) /* */ { /* 643 */ if (this.languageAnalyzers.getAnalyzer(lang) == null) /* 644 */ return "all"; /* 645 */ return lang; /* */ } /* */ /* */ public void setIndexMetaPriority(boolean prio) /* */ { /* 650 */ this.indexMetaPriority = prio; /* */ } /* */ /* */ public boolean getIndexMetaPriority() { /* 654 */ return this.indexMetaPriority; /* */ } /* */ /* */ private boolean isTextFile(File file) /* */ { /* 659 */ String fn = file.getName(); /* */ /* 661 */ String[] splitted = fn.split("[.]"); /* */ /* 663 */ String ext = ""; /* */ /* 665 */ if (splitted.length > 1) /* */ { /* 667 */ ext = splitted[(splitted.length - 1)]; /* */ } /* */ /* 670 */ return this.fileTypesToIndex.contains(ext); /* */ } /* */ } /* Location: /private/tmp/fulltextIndexer.jar * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread * JD-Core Version: 0.5.4 */