Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/ProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/ProcessFileThread.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,548 @@ +/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; +/* */ +/* */ import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.lucene.document.DateTools; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.xmlrpc.XmlRpcException; +import org.apache.xmlrpc.client.XmlRpcClient; +import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta; +import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; +import de.mpiwg.dwinter.lucencetools.documents.FileDocument; +/* */ +/* */ public class ProcessFileThread extends Thread +/* */ { +/* */ private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)"; +/* */ private static final int DELETED_WRONG_LANGUAGE = 1; +/* */ private static final int DELETED_OLD_VERSION = 2; +/* */ private static final int NEW_FILE = 0; +/* */ private static final int FILE_EXISTS = -1; +/* */ protected File docDir; +/* */ protected File index_dir; +/* 86 */ protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" })); +/* 87 */ protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" })); +/* 88 */ private HashMap<String, String> textLanguage = null; +/* */ +/* 90 */ protected HashMap<String, String> languageToISO = new HashMap(); +/* 91 */ protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers(); +/* 92 */ protected HashMap<String, String> supportedLanguageFolder = new HashMap(); +/* 93 */ private int counter = 0; +/* */ protected String languageFileName; +/* 95 */ protected boolean indexMetaPriority = false; +/* 96 */ protected boolean deduceFromFolderPriority = true; +/* */ +/* 101 */ private String specialMode = ""; +/* 102 */ public boolean done = false; +/* */ private File processThisFile; +/* 105 */ private String mode = "new"; // if mode is not add, then only modified files and new files will be added. +/* */ private String mdProviderUrl; +/* 107 */ private String preferedLanguage = null; +/* */ +/* */ public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException { +/* 110 */ this.docDir = docDir; +/* 111 */ this.languageFileName = languageFileName; +/* */ +/* 114 */ this.index_dir = index_dir; +/* 115 */ this.processThisFile = processThisFile; +/* 116 */ this.mdProviderUrl = mdProviderUrl; +/* 117 */ this.languageToISO = languageToISO; +/* 118 */ this.supportedLanguageFolder = supportedLanguageFolder; +/* */ } +/* */ +/* */ public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) +/* */ { +/* 123 */ this.languageAnalyzers = languageAnalyzers2; +/* 124 */ this.processThisFile = file; +/* 125 */ this.textLanguage = tl; +/* */ +/* 128 */ this.languageFileName = lfn; +/* 129 */ this.preferedLanguage = preferedLanguage; +/* 130 */ this.mdProviderUrl = mdProviderUrl; +/* 131 */ this.languageToISO = languageToISO; +/* 132 */ this.supportedLanguageFolder = supportedLanguageFolder; +/* */ } +/* */ +/* */ public void run() +/* */ { +/* */ try +/* */ { +/* 140 */ processFile(this.processThisFile); +/* */ } +/* */ catch (CorruptIndexException e) { +/* 143 */ e.printStackTrace(); +/* */ } +/* */ catch (FileNotFoundException e) { +/* 146 */ e.printStackTrace(); +/* */ } +/* */ catch (UnsupportedEncodingException e) { +/* 149 */ e.printStackTrace(); +/* */ } +/* */ catch (IOException e) { +/* 152 */ e.printStackTrace(); +/* */ } +/* */ catch (InterruptedException e) { +/* 155 */ e.printStackTrace(); +/* */ } +/* 157 */ this.done = true; +/* */ } +/* */ +/* */ private String getLanguageOfText(String textId, File file) throws IOException { +/* 161 */ String lang = null; +/* */ +/* 163 */ if (this.deduceFromFolderPriority) +/* */ { +/* 165 */ lang = deduceFromFolderName(file); +/* 166 */ if (lang != null) { +/* 167 */ return lang; +/* */ } +/* */ } +/* 170 */ if ((this.languageFileName == null | this.indexMetaPriority)) { +/* 171 */ lang = getLanguageFromIndexMeta(file); +/* */ +/* 177 */ if ((lang != null) && +/* 178 */ (lang.equals(""))) { +/* 179 */ System.out.println("Language for " + file.getAbsolutePath() + " is " + lang); +/* 180 */ return lang; +/* */ } +/* */ } +/* 183 */ if (this.languageFileName != null) +/* */ { +/* 185 */ if (this.textLanguage == null) +/* 186 */ this.textLanguage = loadLanguages(); +/* 187 */ if (this.textLanguage == null) +/* */ { +/* 189 */ System.out.println("NO LANGUAGE FILES LOADED"); +/* */ } +/* */ else +/* */ { +/* 198 */ String language = (String)this.textLanguage.get(textId); +/* 199 */ lang = (String)this.languageToISO.get(language); +/* 200 */ if (lang != null) +/* */ { +/* 202 */ System.out.println("GOT language from language file:" + lang); +/* 203 */ return lang; +/* */ } +/* */ } +/* */ +/* */ } +/* */ +/* 209 */ lang = deduceFromFolderName(file); +/* 210 */ if (lang != null) +/* */ { +/* 212 */ System.out.println("Langugage deduced from Folder:" + lang); +/* 213 */ return lang; +/* */ } +/* */ +/* 216 */ if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) { +/* 217 */ System.out.println("no language identified from Metadata: prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath()); +/* 218 */ return this.preferedLanguage; +/* */ } +/* */ +/* 221 */ System.out.println("no language identified: language will be generic all:" + file.getAbsolutePath()); +/* 222 */ return "all"; +/* */ } +/* */ +/* */ private String deduceFromFolderName(File file) { +/* 226 */ File parent = file.getParentFile(); +/* 227 */ String name = parent.getName(); +/* 228 */ String lang = null; +/* 229 */ if (this.supportedLanguageFolder.containsKey(name)) +/* */ { +/* 231 */ lang = (String)this.supportedLanguageFolder.get(name); +/* */ } +/* 233 */ return lang; +/* */ } +/* */ +/* */ private String getLanguageFromIndexMeta(File file) +/* */ throws IOException +/* */ { +/* 244 */ file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath())); +/* */ +/* 246 */ File pf = file.getParentFile().getParentFile().getParentFile(); +/* 247 */ File indexMeta = new File(pf, "index.meta"); +/* */ +/* 249 */ if (!indexMeta.exists()) +/* */ { +/* 251 */ File pf2 = pf.getParentFile(); +/* 252 */ indexMeta = new File(pf2, "index.meta"); +/* 253 */ if (!indexMeta.exists()) +/* 254 */ return null; +/* */ } +/* 256 */ XMLReader parser = new SAXParser(); +/* 257 */ ParseIndexMeta ch = new ParseIndexMeta(); +/* 258 */ parser.setContentHandler(ch); +/* */ try { +/* 260 */ Reader reader = new FileReader(indexMeta); +/* 261 */ InputSource input = new InputSource(reader); +/* 262 */ parser.parse(input); +/* */ } +/* */ catch (SAXException e) +/* */ { +/* 266 */ e.printStackTrace(); +/* */ } +/* */ +/* 269 */ String lang = ch.lang; +/* */ +/* 272 */ return lang; +/* */ } +/* */ +/* */ private String getDCFromIndexMeta(String textId) +/* */ throws IOException, XmlRpcException +/* */ { +/* 301 */ XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); +/* 302 */ URL url = new URL(this.mdProviderUrl); +/* 303 */ config.setServerURL(url); +/* 304 */ XmlRpcClient client = new XmlRpcClient(); +/* 305 */ client.setConfig(config); +/* */ +/* 307 */ Object[] params = { textId }; +/* 308 */ Object returnVals = client.execute("getDCFormatted", params); +/* */ +/* 311 */ return (String)returnVals; +/* */ } +/* */ +/* */ protected HashMap<String, String> loadLanguages() +/* */ { +/* 320 */ File languageFile = new File(this.docDir + "/" + this.languageFileName); +/* 321 */ String languageFilePath = this.docDir + "/" + this.languageFileName; +/* 322 */ HashMap languages = new HashMap(); +/* 323 */ boolean relativ = true; +/* */ +/* 325 */ if (!languageFile.exists()) +/* */ { +/* 327 */ languageFile = new File(this.languageFileName); +/* 328 */ languageFilePath = this.languageFileName; +/* 329 */ relativ = false; +/* 330 */ if (!languageFile.exists()) +/* 331 */ return null; +/* */ } +/* */ BufferedReader in; +/* */ try { +/* 335 */ in = new BufferedReader(new FileReader(languageFilePath)); +/* */ } catch (FileNotFoundException e) { +/* 337 */ return null; +/* */ } +/* */ +/* 340 */ String zeile = null; +/* */ try { +/* 342 */ while ((zeile = in.readLine()) != null) { +/* 343 */ String[] splitted = zeile.replace("\"", "").split("[,]"); +/* 344 */ if (splitted.length == 2) +/* 345 */ if (relativ) +/* 346 */ languages.put(this.docDir + "/" + splitted[0], splitted[1]); +/* */ else +/* 348 */ languages.put(splitted[0], splitted[1]); +/* */ } +/* */ } +/* */ catch (IOException e) { +/* 352 */ e.printStackTrace(); +/* 353 */ return null; +/* */ } +/* */ +/* 356 */ return languages; +/* */ } +/* */ +/* */ public void harvestFolder() +/* */ throws InterruptedException +/* */ { +/* 362 */ Date start = new Date(); +/* 363 */ boolean create = true; +/* */ try +/* */ { +/* 374 */ System.out.println("Indexing to directory '" + this.index_dir + "'..."); +/* 375 */ indexDocs(this.docDir); +/* 376 */ System.out.println("Optimizing..."); +/* 377 */ this.languageAnalyzers.optimize(); +/* 378 */ this.languageAnalyzers.close(); +/* */ +/* 380 */ Date end = new Date(); +/* 381 */ System.out.println(end.getTime() - start.getTime() + " total milliseconds"); +/* */ } +/* */ catch (IOException e) { +/* 384 */ System.out.println(" caught a " + e.getClass() + +/* 385 */ "\n with message: " + e.getMessage()); +/* */ } +/* */ } +/* */ +/* */ void indexDocs(File file) +/* */ throws IOException, InterruptedException +/* */ { +/* 392 */ if (!file.canRead()) +/* */ return; +/* 394 */ if (file.isDirectory()) +/* */ { +/* 396 */ if (this.counter > 100000) +/* */ { +/* 398 */ return; +/* */ } +/* 400 */ String[] files = file.list(); +/* */ +/* 402 */ String folderName = file.getName(); +/* 403 */ if ((((files != null) ? 1 : 0) & ((this.excludeFolders.contains(folderName)) ? 0 : 1)) != 0) { +/* 404 */ for (int i = 0; i < files.length; ++i) +/* 405 */ indexDocs(new File(file, files[i])); +/* */ } +/* */ } +/* 408 */ else if (isTextFile(file)) { +/* 409 */ processFile(file); +/* */ } +/* */ else { +/* 412 */ System.out.println("not adding " + file); +/* */ } +/* */ } +/* */ +/* */ private void processFile(File file) +/* */ throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException +/* */ { +/* 423 */ String textId = getTextId(file); +/* 424 */ System.out.println("file:" + this.counter); +/* 425 */ System.out.println("textId:" + textId); +/* */ +/* 427 */ String lang = getLanguageOfText(textId, file); +/* 428 */ String dcMetaData = null; +/* 429 */ if (this.mdProviderUrl != null) +/* */ try { +/* 431 */ dcMetaData = getDCFromIndexMeta(textId); +/* */ } catch (XmlRpcException e2) { +/* 433 */ dcMetaData = null; +/* */ } +/* */ int docNr; +/* */ +/* 437 */ if (this.mode == "add") +/* 438 */ docNr = 0; +/* */ else +/* 440 */ docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified()); +/* 441 */ if (lang == null) { +/* 442 */ System.out.println("not adding " + file); +/* */ } +/* 444 */ else if (docNr == -1) { +/* 445 */ System.out.println(" OLD FILE:" + file); +/* 446 */ } else if (docNr >= 0) +/* */ { +/* 448 */ System.out.println("adding " + file + " lang: " + lang); +/* */ try +/* */ { +/* 451 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); +/* 452 */ if (ret.booleanValue()) +/* 453 */ this.counter += 1; +/* */ } catch (IOException e) { +/* 455 */ System.out.println("got an IO eception adding the document - wait a bit"); +/* 456 */ Thread.sleep(10000L); +/* 457 */ System.out.println("Try again"); +/* */ try { +/* 459 */ Boolean ret = addDocument(file, lang, dcMetaData, textId); +/* 460 */ if (ret.booleanValue()) +/* 461 */ this.counter += 1; +/* */ } catch (IOException e1) { +/* 463 */ System.out.println("Couldn't do:" + file.getName()); +/* */ } +/* */ catch (ParserConfigurationException e2) { +/* 466 */ e.printStackTrace(); +/* */ } +/* */ catch (SAXException e2) { +/* 469 */ e.printStackTrace(); +/* */ } +/* */ } +/* */ catch (ParserConfigurationException e) { +/* 473 */ e.printStackTrace(); +/* */ } +/* */ catch (SAXException e) { +/* 476 */ e.printStackTrace(); +/* */ } +/* */ +/* */ } +/* */ else +/* */ { +/* 482 */ System.out.println(" UPDATE FILE:" + file + " lang: " + lang); +/* */ +/* 484 */ this.counter += 1; +/* */ try { +/* 486 */ addDocument(file, lang, dcMetaData, textId); +/* */ } +/* */ catch (ParserConfigurationException e) { +/* 489 */ e.printStackTrace(); +/* */ } +/* */ catch (SAXException e) { +/* 492 */ e.printStackTrace(); +/* */ } +/* */ } +/* */ } +/* */ +/* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) +/* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException +/* */ { +/* 509 */ if (dcMetaData != null) { +/* 510 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, dcMetaData, textId), lang); +/* 511 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", dcMetaData, textId), "all"); +/* */ } +/* */ else +/* */ { +/* 515 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),lang, textId), lang); +/* 516 */ this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),"all", textId), "all"); +/* */ } +/* 518 */ return Boolean.valueOf(true); +/* */ } +/* */ +/* */ private String getTextId(File file) +/* */ { +/* */ try +/* */ { +/* 529 */ File parent = file.getParentFile(); +/* */ +/* 531 */ if (parent.getName().equals("text")) +/* 532 */ return absPathToTextId(parent.getParentFile().getAbsolutePath()); +/* 533 */ if (parent.getParentFile().getName().equals("text")) +/* 534 */ return absPathToTextId(parent.getParentFile().getParentFile().getAbsolutePath()); +/* 535 */ if (parent.getParentFile().getParentFile().getName().equals("text")) { +/* 536 */ return absPathToTextId(parent.getParentFile().getParentFile().getParentFile().getAbsolutePath()); +/* */ } +/* 538 */ return null; +/* */ } +/* */ catch (RuntimeException e) { +/* 541 */ e.printStackTrace(); +/* 542 */ }return null; +/* */ } +/* */ + protected String absPathToTextId(File file) + /* */ { + try { + return absPathToTextId(file.getCanonicalPath()); + } catch (IOException e) { + + e.printStackTrace(); + return ""; + } + } + +/* */ protected String absPathToTextId(String absolutePath) +/* */ { +/* 555 */ if (this.specialMode.equals("vlp")) +/* */ { +/* 557 */ String[] splitted = absolutePath.split("lit"); +/* 558 */ return splitted[1]; +/* */ } +/* */ +/* 562 */ Pattern p = Pattern.compile(TEXTIDFROMPATH_REGEXP); +/* 563 */ Matcher m = p.matcher(absolutePath); +/* 564 */ m.matches(); +/* 565 */ if (m.groupCount() > 0) { +/* 566 */ return m.group(1); +/* */ } +/* 568 */ System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath); +/* 569 */ return absolutePath; +/* */ } +/* */ +/* */ private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate) +/* */ throws CorruptIndexException, IOException +/* */ { +/* 577 */ lang = checkSupportedLanguages(lang); +/* 578 */ System.out.println("lang converted+" + lang); +/* 579 */ //TermQuery query = new TermQuery(new Term("path", filePath)); + TermQuery query = new TermQuery(new Term("cleanedPath", absPathToTextId(filePath))); +/* */ +/* 582 */ HashMap<String,Collector> results = this.languageAnalyzers.search(query); +/* */ +/* 584 */ if (results == null) { +/* 585 */ return 0; +/* */ } +/* 587 */ for (String resultLang : results.keySet()) +/* */ { +/* 589 */ TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang); +/* */ +/* 591 */ if ((collector == null) || (collector.getTotalHits() <= 0)) +/* */ continue; +/* 593 */ if ((!resultLang.equals(lang)) && (deleteWrongLanguage) && (!resultLang.equals("morph"))) +/* */ { +/* 595 */ this.languageAnalyzers.deleteDocuments(query); +/* */ +/* 603 */ System.out.println("language changed:" + filePath); +/* 604 */ return 1; +/* */ } +/* */ +/* 607 */ if (!resultLang.equals(lang)) +/* */ continue; +/* 609 */ TopDocs docs = collector.topDocs(); +/* */ ScoreDoc[] arrayOfScoreDoc; +/* 610 */ if ((arrayOfScoreDoc = docs.scoreDocs).length == 0) continue; ScoreDoc doc = arrayOfScoreDoc[0]; +/* 611 */ String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(doc.doc).getField("modified").stringValue(); +/* */ +/* 613 */ String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE); +/* 614 */ if (!fileDate.equals(modDate)) +/* */ { +/* 618 */ System.out.println("new file:" + filePath); +/* 619 */ this.languageAnalyzers.deleteDocuments(query); +/* 620 */ return 2; +/* */ } +/* */ +/* 623 */ return -1; +/* */ } +/* */ +/* 631 */ return 0; +/* */ } +/* */ +/* */ private String checkSupportedLanguages(String lang) +/* */ { +/* 643 */ if (this.languageAnalyzers.getAnalyzer(lang) == null) +/* 644 */ return "all"; +/* 645 */ return lang; +/* */ } +/* */ +/* */ public void setIndexMetaPriority(boolean prio) +/* */ { +/* 650 */ this.indexMetaPriority = prio; +/* */ } +/* */ +/* */ public boolean getIndexMetaPriority() { +/* 654 */ return this.indexMetaPriority; +/* */ } +/* */ +/* */ private boolean isTextFile(File file) +/* */ { +/* 659 */ String fn = file.getName(); +/* */ +/* 661 */ String[] splitted = fn.split("[.]"); +/* */ +/* 663 */ String ext = ""; +/* */ +/* 665 */ if (splitted.length > 1) +/* */ { +/* 667 */ ext = splitted[(splitted.length - 1)]; +/* */ } +/* */ +/* 670 */ return this.fileTypesToIndex.contains(ext); +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread + * JD-Core Version: 0.5.4 + */ \ No newline at end of file