view src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/.svn/text-base/ProcessFileThread.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line source

/*     */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors;
/*     */ 
/*     */ import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpiwg.dwinter.fulltextIndexer.utils.ParseIndexMeta;
import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
import de.mpiwg.dwinter.lucencetools.documents.FileDocument;
/*     */ 
/*     */ public class ProcessFileThread extends Thread
/*     */ {
/*     */   // Regular expression applied by absPathToTextId(String): group 1 is the part of an
/*     */   // absolute path that starts at "/permanent/" or "/experimental/".
/*     */   private static final String TEXTIDFROMPATH_REGEXP = ".*(/(permanent|experimental)/.*)";
/*     */   // Result codes of checkFileAndRemoveOldFile:
/*     */   // index entries stored under another language were deleted.
/*     */   private static final int DELETED_WRONG_LANGUAGE = 1;
/*     */   // the index entry had a different modification date and was deleted.
/*     */   private static final int DELETED_OLD_VERSION = 2;
/*     */   // nothing usable was found in the index for this file.
/*     */   private static final int NEW_FILE = 0;
/*     */   // the index entry carries the same modification date as the file.
/*     */   private static final int FILE_EXISTS = -1;
/*     */   // Root directory walked by harvestFolder(); also the base for a relative language file.
/*     */   protected File docDir;
/*     */   // Index directory; within this class it is only printed by harvestFolder().
/*     */   protected File index_dir;
/*     */   // File name extensions accepted by isTextFile().
/*  86 */   protected ArrayList<String> fileTypesToIndex = new ArrayList(Arrays.asList(new String[] { "xml" }));
/*     */   // Directory names indexDocs() does not descend into.
/*  87 */   protected ArrayList<String> excludeFolders = new ArrayList(Arrays.asList(new String[] { "OCR" }));
/*     */   // Looked up with the text id in getLanguageOfText(); filled lazily by loadLanguages()
/*     */   // or handed in through the second constructor.
/*  88 */   private HashMap<String, String> textLanguage = null;
/*     */ 
/*     */   // Translates the value found in textLanguage into the language returned by getLanguageOfText().
/*  90 */   protected HashMap<String, String> languageToISO = new HashMap();
/*     */   // Index access: used here for search, addDocument, deleteDocuments, optimize and close.
/*  91 */   protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
/*     */   // Maps the name of a file's parent folder to a language (see deduceFromFolderName()).
/*  92 */   protected HashMap<String, String> supportedLanguageFolder = new HashMap();
/*     */   // Incremented by processFile(); indexDocs() stops descending into directories once it exceeds 100000.
/*  93 */   private int counter = 0;
/*     */   // Language file read by loadLanguages(): tried relative to docDir first, then as given.
/*     */   protected String languageFileName;
/*     */   // When true, getLanguageOfText() consults index.meta even if a language file is configured.
/*  95 */   protected boolean indexMetaPriority = false;
/*     */   // When true, getLanguageOfText() tries the parent folder name before any other source.
/*  96 */   protected boolean deduceFromFolderPriority = true;
/*     */ 
/*     */   // "vlp" makes absPathToTextId(String) split the path at "lit"; never assigned elsewhere in this class.
/* 101 */   private String specialMode = "";
/*     */   // Set to true when run() has finished.
/* 102 */   public boolean done = false;
/*     */   // The file handled by run().
/*     */   private File processThisFile;
/* 105 */   private String mode = "new"; // if mode is not add, then only modified files and new files will be added.
/*     */   // URL of the XML-RPC server queried by getDCFromIndexMeta(); null disables the lookup in processFile().
/*     */   private String mdProviderUrl;
/*     */   // Fallback returned by getLanguageOfText() when no other source yields a language.
/* 107 */   private String preferedLanguage = null;
/*     */ 
/*     */   /**
             * Creates a worker that knows the document root and the index directory.
             * <p>
             * The analyzers keep their default instance, the language table stays
             * unloaded and no preferred language is set. The body only stores its
             * arguments; the declared exceptions are part of the published signature.
             *
             * @param docDir                  root directory of the documents
             * @param index_dir               directory of the index
             * @param languageFileName        language file, may be {@code null}
             * @param processThisFile         file handled by {@link #run()}
             * @param mdProviderUrl           XML-RPC URL for metadata, may be {@code null}
             * @param languageToISO           language name to language code table
             * @param supportedLanguageFolder folder name to language table
             */
/*     */   public ProcessFileThread(File docDir, File index_dir, String languageFileName, File processThisFile, String mdProviderUrl, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) throws CorruptIndexException, LockObtainFailedException, IOException {
/*     */     // locations
/*     */     this.docDir = docDir;
/*     */     this.index_dir = index_dir;
/*     */     this.processThisFile = processThisFile;
/*     */     // language detection
/*     */     this.languageFileName = languageFileName;
/*     */     this.languageToISO = languageToISO;
/*     */     this.supportedLanguageFolder = supportedLanguageFolder;
/*     */     // metadata
/*     */     this.mdProviderUrl = mdProviderUrl;
/*     */   }
/*     */ 
/*     */   /**
             * Creates a worker for a single file that shares analyzers and an already
             * loaded language table with its creator.
             * <p>
             * {@code docDir} and {@code index_dir} are not set by this constructor and
             * therefore stay {@code null}.
             *
             * @param languageAnalyzers2      analyzers to index with
             * @param file                    file handled by {@link #run()}
             * @param lfn                     language file name, may be {@code null}
             * @param tl                      text id to language table, may be {@code null}
             * @param mdProviderUrl           XML-RPC URL for metadata, may be {@code null}
             * @param preferedLanguage        fallback language, may be {@code null}
             * @param languageToISO           language name to language code table
             * @param supportedLanguageFolder folder name to language table
             */
/*     */   public ProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder)
/*     */   {
/*     */     // what to process and with which index
/*     */     this.processThisFile = file;
/*     */     this.languageAnalyzers = languageAnalyzers2;
/*     */     // language detection
/*     */     this.languageFileName = lfn;
/*     */     this.textLanguage = tl;
/*     */     this.languageToISO = languageToISO;
/*     */     this.supportedLanguageFolder = supportedLanguageFolder;
/*     */     this.preferedLanguage = preferedLanguage;
/*     */     // metadata
/*     */     this.mdProviderUrl = mdProviderUrl;
/*     */   }
/*     */ 
/*     */   /**
             * Processes the file this thread was created for and then raises
             * {@code done}.
             * <p>
             * {@code done} is set in a {@code finally} block so that it also becomes
             * true when {@code processFile} ends with an unchecked exception;
             * otherwise anyone polling the flag would wait forever. I/O problems are
             * reported on the error stream and do not propagate.
             */
/*     */   @Override
/*     */   public void run()
/*     */   {
/*     */     try
/*     */     {
/*     */       processFile(this.processThisFile);
/*     */     }
/*     */     catch (IOException e) {
/*     */       // Also covers CorruptIndexException, FileNotFoundException and
/*     */       // UnsupportedEncodingException, which are all IOExceptions.
/*     */       e.printStackTrace();
/*     */     }
/*     */     catch (InterruptedException e) {
/*     */       e.printStackTrace();
/*     */       // Keep the interrupt visible to whoever inspects this thread afterwards.
/*     */       Thread.currentThread().interrupt();
/*     */     }
/*     */     finally {
/*     */       this.done = true;
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Determines the language under which a file is indexed.
             * <p>
             * The sources are tried in this order and the first one that yields a
             * value wins:
             * <ol>
             * <li>the parent folder name, if {@code deduceFromFolderPriority} is set,</li>
             * <li>the {@code index.meta} file, if no language file is configured or
             *     {@code indexMetaPriority} is set,</li>
             * <li>the language file (text id to language name, then
             *     {@code languageToISO}),</li>
             * <li>the parent folder name (covers the case that step 1 was skipped),</li>
             * <li>the preferred language handed to the constructor,</li>
             * <li>the generic language {@code "all"}.</li>
             * </ol>
             *
             * @param textId id of the text, used as key into the language table
             * @param file   the file whose language is wanted
             * @return the language; never {@code null}
             * @throws IOException if {@code index.meta} cannot be read
             */
/*     */   private String getLanguageOfText(String textId, File file) throws IOException {
/*     */     String lang = null;
/*     */ 
/*     */     if (this.deduceFromFolderPriority)
/*     */     {
/*     */       lang = deduceFromFolderName(file);
/*     */       if (lang != null) {
/*     */         return lang;
/*     */       }
/*     */     }
/*     */     if ((this.languageFileName == null) || this.indexMetaPriority) {
/*     */       lang = getLanguageFromIndexMeta(file);
/*     */ 
/*     */       // Only a non-empty value is an answer. The previous test accepted exactly
/*     */       // the empty string and discarded every real language.
/*     */       // NOTE(review): the value is returned as found in index.meta - confirm
/*     */       // that it already has the form the analyzers expect.
/*     */       if ((lang != null) && (!lang.equals(""))) {
/*     */         System.out.println("Language for " + file.getAbsolutePath() + " is " + lang);
/*     */         return lang;
/*     */       }
/*     */     }
/*     */     if (this.languageFileName != null)
/*     */     {
/*     */       // Load the table on first use.
/*     */       if (this.textLanguage == null) {
/*     */         this.textLanguage = loadLanguages();
/*     */       }
/*     */       if (this.textLanguage == null)
/*     */       {
/*     */         System.out.println("NO LANGUAGE FILES LOADED");
/*     */       }
/*     */       else
/*     */       {
/*     */         String language = this.textLanguage.get(textId);
/*     */         lang = this.languageToISO.get(language);
/*     */         if (lang != null)
/*     */         {
/*     */           System.out.println("GOT language from language file:" + lang);
/*     */           return lang;
/*     */         }
/*     */       }
/*     */     }
/*     */ 
/*     */     lang = deduceFromFolderName(file);
/*     */     if (lang != null)
/*     */     {
/*     */       System.out.println("Langugage deduced from Folder:" + lang);
/*     */       return lang;
/*     */     }
/*     */ 
/*     */     if ((this.preferedLanguage != null) && (!this.preferedLanguage.equals(""))) {
/*     */       System.out.println("no language identified from Metadata:  prefered language " + this.preferedLanguage + "will be used:" + file.getAbsolutePath());
/*     */       return this.preferedLanguage;
/*     */     }
/*     */ 
/*     */     System.out.println("no language identified:        language will be generic all:" + file.getAbsolutePath());
/*     */     return "all";
/*     */   }
/*     */ 
/*     */   /**
             * Looks up the name of the file's parent folder in
             * {@code supportedLanguageFolder}.
             *
             * @param file the file whose parent folder is inspected
             * @return the language registered for that folder name, or {@code null}
             *         if the name is not registered
             */
/*     */   private String deduceFromFolderName(File file) {
/*     */     String folderName = file.getParentFile().getName();
/*     */     if (!this.supportedLanguageFolder.containsKey(folderName)) {
/*     */       return null;
/*     */     }
/*     */     return this.supportedLanguageFolder.get(folderName);
/*     */   }
/*     */ 
/*     */   /**
             * Reads the language recorded in the {@code index.meta} file that belongs
             * to the given file.
             * <p>
             * The file is first re-rooted below {@code /mpiwg/online/} using its text
             * id. {@code index.meta} is then looked for three directory levels above
             * it and, if it is not there, one level further up. The reader is closed
             * when parsing is finished, whether or not parsing succeeded.
             *
             * @param file the file whose metadata is wanted
             * @return the {@code lang} value collected by the content handler, or
             *         {@code null} if no {@code index.meta} exists at either place
             * @throws IOException if {@code index.meta} cannot be opened or read
             */
/*     */   private String getLanguageFromIndexMeta(File file)
/*     */     throws IOException
/*     */   {
/*     */     file = new File("/mpiwg/online/" + absPathToTextId(file.getAbsolutePath()));
/*     */ 
/*     */     File pf = file.getParentFile().getParentFile().getParentFile();
/*     */     File indexMeta = new File(pf, "index.meta");
/*     */ 
/*     */     if (!indexMeta.exists())
/*     */     {
/*     */       File pf2 = pf.getParentFile();
/*     */       indexMeta = new File(pf2, "index.meta");
/*     */       if (!indexMeta.exists()) {
/*     */         return null;
/*     */       }
/*     */     }
/*     */     XMLReader parser = new SAXParser();
/*     */     ParseIndexMeta ch = new ParseIndexMeta();
/*     */     parser.setContentHandler(ch);
/*     */ 
/*     */     Reader reader = new FileReader(indexMeta);
/*     */     try {
/*     */       parser.parse(new InputSource(reader));
/*     */     }
/*     */     catch (SAXException e)
/*     */     {
/*     */       // A malformed index.meta is reported; whatever the handler collected
/*     */       // before the error is still returned below.
/*     */       e.printStackTrace();
/*     */     }
/*     */     finally {
/*     */       // This method is called once per processed file, so a reader left
/*     */       // open would leak one file handle per file.
/*     */       try {
/*     */         reader.close();
/*     */       } catch (IOException ignored) {
/*     */         // Failing to close a stream that was only read from loses nothing.
/*     */       }
/*     */     }
/*     */ 
/*     */     return ch.lang;
/*     */   }
/*     */ 
/*     */   /**
             * Calls the remote procedure {@code getDCFormatted} on the XML-RPC server
             * at {@code mdProviderUrl}, passing the text id as the only argument.
             *
             * @param textId id of the text
             * @return the value returned by the server, cast to {@code String}
             * @throws IOException     if {@code mdProviderUrl} is not a valid URL
             * @throws XmlRpcException if the remote call fails
             */
/*     */   private String getDCFromIndexMeta(String textId)
/*     */     throws IOException, XmlRpcException
/*     */   {
/*     */     XmlRpcClientConfigImpl rpcConfig = new XmlRpcClientConfigImpl();
/*     */     rpcConfig.setServerURL(new URL(this.mdProviderUrl));
/*     */ 
/*     */     XmlRpcClient rpcClient = new XmlRpcClient();
/*     */     rpcClient.setConfig(rpcConfig);
/*     */ 
/*     */     Object answer = rpcClient.execute("getDCFormatted", new Object[] { textId });
/*     */     return (String)answer;
/*     */   }
/*     */ 
/*     */   /**
             * Reads the language file into a map.
             * <p>
             * The file is looked for at {@code docDir + "/" + languageFileName} first
             * and, if that does not exist, at {@code languageFileName} as given.
             * Double quotes are removed from every line, the line is split at commas,
             * and only lines that yield exactly two fields are used. When the file was
             * found relative to {@code docDir}, the keys are prefixed with
             * {@code docDir + "/"}. The reader is closed before the method returns.
             *
             * @return first field to second field for every usable line, or
             *         {@code null} if the file does not exist or cannot be read
             */
/*     */   protected HashMap<String, String> loadLanguages()
/*     */   {
/*     */     File languageFile = new File(this.docDir + "/" + this.languageFileName);
/*     */     boolean relativ = true;
/*     */ 
/*     */     if (!languageFile.exists())
/*     */     {
/*     */       languageFile = new File(this.languageFileName);
/*     */       relativ = false;
/*     */       if (!languageFile.exists()) {
/*     */         return null;
/*     */       }
/*     */     }
/*     */ 
/*     */     BufferedReader in;
/*     */     try {
/*     */       in = new BufferedReader(new FileReader(languageFile));
/*     */     } catch (FileNotFoundException e) {
/*     */       return null;
/*     */     }
/*     */ 
/*     */     HashMap<String, String> languages = new HashMap<String, String>();
/*     */     try {
/*     */       String zeile;
/*     */       while ((zeile = in.readLine()) != null) {
/*     */         String[] splitted = zeile.replace("\"", "").split("[,]");
/*     */         if (splitted.length != 2) {
/*     */           continue;
/*     */         }
/*     */         String key = relativ ? this.docDir + "/" + splitted[0] : splitted[0];
/*     */         languages.put(key, splitted[1]);
/*     */       }
/*     */       return languages;
/*     */     }
/*     */     catch (IOException e) {
/*     */       e.printStackTrace();
/*     */       return null;
/*     */     }
/*     */     finally {
/*     */       // The reader used to stay open on every path out of this method.
/*     */       try {
/*     */         in.close();
/*     */       } catch (IOException ignored) {
/*     */         // Failing to close a stream that was only read from loses nothing.
/*     */       }
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Indexes everything below {@code docDir}, then optimizes and closes the
             * analyzers and prints the elapsed time.
             * <p>
             * An {@code IOException} ends the run early: it is reported on standard
             * output and the optimize/close steps that had not yet run are skipped.
             *
             * @throws InterruptedException if the thread is interrupted while a file
             *                              is being processed
             */
/*     */   public void harvestFolder()
/*     */     throws InterruptedException
/*     */   {
/*     */     long startMillis = new Date().getTime();
/*     */     try
/*     */     {
/*     */       System.out.println("Indexing to directory '" + this.index_dir + "'...");
/*     */       indexDocs(this.docDir);
/*     */ 
/*     */       System.out.println("Optimizing...");
/*     */       this.languageAnalyzers.optimize();
/*     */       this.languageAnalyzers.close();
/*     */ 
/*     */       long elapsedMillis = new Date().getTime() - startMillis;
/*     */       System.out.println(elapsedMillis + " total milliseconds");
/*     */     }
/*     */     catch (IOException e) {
/*     */       String report = " caught a " + e.getClass() + "\n with message: " + e.getMessage();
/*     */       System.out.println(report);
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Walks a file or directory tree and hands every text file to
             * {@code processFile}.
             * <p>
             * Unreadable entries are skipped silently. A directory is not entered
             * when the counter has passed 100000, when it cannot be listed, or when
             * its name is contained in {@code excludeFolders}. A readable file that is
             * not a text file is reported as "not adding".
             *
             * @param file file or directory to start from
             * @throws IOException          if processing a file fails
             * @throws InterruptedException if the thread is interrupted while a file
             *                              is being processed
             */
/*     */   void indexDocs(File file)
/*     */     throws IOException, InterruptedException
/*     */   {
/*     */     if (!file.canRead()) {
/*     */       return;
/*     */     }
/*     */ 
/*     */     if (!file.isDirectory()) {
/*     */       if (isTextFile(file)) {
/*     */         processFile(file);
/*     */       } else {
/*     */         System.out.println("not adding " + file);
/*     */       }
/*     */       return;
/*     */     }
/*     */ 
/*     */     if (this.counter > 100000) {
/*     */       return;
/*     */     }
/*     */ 
/*     */     String[] children = file.list();
/*     */     if (children == null || this.excludeFolders.contains(file.getName())) {
/*     */       return;
/*     */     }
/*     */     for (String child : children) {
/*     */       indexDocs(new File(file, child));
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Brings the index entry of one file up to date.
             * <p>
             * Steps: derive the text id, determine the language, fetch the formatted
             * metadata if a provider URL is configured, compare the file with the
             * index (skipped in mode "add") and finally add the document unless the
             * index already holds this version. If adding fails with an
             * {@code IOException} the method waits ten seconds and tries once more.
             *
             * @param file the file to process
             * @throws IOException          if language detection or the index lookup fails
             * @throws InterruptedException if the thread is interrupted while waiting
             *                              for the retry
             */
/*     */   private void processFile(File file)
/*     */     throws IOException, CorruptIndexException, InterruptedException, FileNotFoundException, UnsupportedEncodingException
/*     */   {
/*     */     String textId = getTextId(file);
/*     */     System.out.println("file:" + this.counter);
/*     */     System.out.println("textId:" + textId);
/*     */ 
/*     */     String lang = getLanguageOfText(textId, file);
/*     */     String dcMetaData = null;
/*     */     if (this.mdProviderUrl != null) {
/*     */       try {
/*     */         dcMetaData = getDCFromIndexMeta(textId);
/*     */       } catch (XmlRpcException e2) {
/*     */         // Metadata is optional: index without it, but leave a trace of the failure.
/*     */         System.err.println("could not fetch metadata for " + textId + ": " + e2.getMessage());
/*     */         dcMetaData = null;
/*     */       }
/*     */     }
/*     */ 
/*     */     int docNr;
/*     */     // equals instead of ==: the comparison must not depend on string interning.
/*     */     if ("add".equals(this.mode)) {
/*     */       docNr = NEW_FILE;
/*     */     } else {
/*     */       docNr = checkFileAndRemoveOldFile(file.getCanonicalPath(), lang, true, file.lastModified());
/*     */     }
/*     */ 
/*     */     if (lang == null) {
/*     */       System.out.println("not adding " + file);
/*     */     }
/*     */     else if (docNr == FILE_EXISTS) {
/*     */       System.out.println(" OLD FILE:" + file);
/*     */     }
/*     */     else if (docNr >= NEW_FILE)
/*     */     {
/*     */       System.out.println("adding " + file + " lang: " + lang);
/*     */       try
/*     */       {
/*     */         Boolean ret = addDocument(file, lang, dcMetaData, textId);
/*     */         if (ret.booleanValue()) {
/*     */           this.counter += 1;
/*     */         }
/*     */       } catch (IOException e) {
/*     */         System.out.println("got an IO eception adding the document - wait a bit");
/*     */         Thread.sleep(10000L);
/*     */         System.out.println("Try again");
/*     */         try {
/*     */           Boolean ret = addDocument(file, lang, dcMetaData, textId);
/*     */           if (ret.booleanValue()) {
/*     */             this.counter += 1;
/*     */           }
/*     */         } catch (IOException e1) {
/*     */           System.out.println("Couldn't do:" + file.getName());
/*     */         }
/*     */         catch (ParserConfigurationException e2) {
/*     */           // Report the failure of the retry, not the IOException that triggered it.
/*     */           e2.printStackTrace();
/*     */         }
/*     */         catch (SAXException e2) {
/*     */           e2.printStackTrace();
/*     */         }
/*     */       }
/*     */       catch (ParserConfigurationException e) {
/*     */         e.printStackTrace();
/*     */       }
/*     */       catch (SAXException e) {
/*     */         e.printStackTrace();
/*     */       }
/*     */     }
/*     */     else
/*     */     {
/*     */       // Reached only for result codes below FILE_EXISTS.
/*     */       System.out.println(" UPDATE FILE:" + file + " lang: " + lang);
/*     */ 
/*     */       this.counter += 1;
/*     */       try {
/*     */         addDocument(file, lang, dcMetaData, textId);
/*     */       }
/*     */       catch (ParserConfigurationException e) {
/*     */         e.printStackTrace();
/*     */       }
/*     */       catch (SAXException e) {
/*     */         e.printStackTrace();
/*     */       }
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Adds the file to the index twice: first under its own language, then
             * under the generic language {@code "all"}.
             * <p>
             * The metadata string is passed on to the document factory only when it
             * is not {@code null}.
             *
             * @param file       the file to index
             * @param lang       language of the file
             * @param dcMetaData formatted metadata, may be {@code null}
             * @param textId     id of the text
             * @return always {@code Boolean.TRUE}
             */
/*     */   protected Boolean addDocument(File file, String lang, String dcMetaData, String textId)
/*     */     throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException
/*     */   {
/*     */     String[] targets = { lang, "all" };
/*     */     for (String target : targets) {
/*     */       if (dcMetaData == null) {
/*     */         this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),target, textId), target);
/*     */       } else {
/*     */         this.languageAnalyzers.addDocument(FileDocument.Document(file, absPathToTextId(file),target, dcMetaData, textId), target);
/*     */       }
/*     */     }
/*     */     return Boolean.TRUE;
/*     */   }
/*     */ 
/*     */   /**
             * Derives the text id of a file from the position of a folder named
             * {@code text} above it.
             * <p>
             * The parent, grandparent and great-grandparent of the file are checked
             * in that order. For the first one that is called {@code text}, the
             * absolute path of the folder containing it is converted with
             * {@code absPathToTextId}.
             *
             * @param file the file to inspect
             * @return the text id, or {@code null} if none of the three folders is
             *         called {@code text} or a runtime exception occurred (for
             *         example because the path is too short)
             */
/*     */   private String getTextId(File file)
/*     */   {
/*     */     try
/*     */     {
/*     */       File folder = file.getParentFile();
/*     */       for (int level = 0; level < 3; level++) {
/*     */         if (folder.getName().equals("text")) {
/*     */           return absPathToTextId(folder.getParentFile().getAbsolutePath());
/*     */         }
/*     */         folder = folder.getParentFile();
/*     */       }
/*     */     }
/*     */     catch (RuntimeException e) {
/*     */       e.printStackTrace();
/*     */     }
/*     */     return null;
/*     */   }
/*     */ 
	/**
	 * Converts the canonical path of a file into a text id.
	 *
	 * @param file the file to convert
	 * @return the text id, or the empty string if the canonical path cannot be
	 *         determined
	 */
	protected String absPathToTextId(File file)
	{
		String canonicalPath;
		try {
			canonicalPath = file.getCanonicalPath();
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		}
		return absPathToTextId(canonicalPath);
	}
	
/*     */   /** Compiled form of {@code TEXTIDFROMPATH_REGEXP}; a Pattern is immutable and can be shared. */
/*     */   private static final Pattern TEXT_ID_PATTERN = Pattern.compile(TEXTIDFROMPATH_REGEXP);
/*     */ 
/*     */   /**
             * Cuts the text id out of an absolute path.
             * <p>
             * In special mode "vlp" the id is what follows the first occurrence of
             * {@code lit}. Otherwise it is the part of the path that starts at
             * {@code /permanent/} or {@code /experimental/}. A path that contains
             * neither is reported on the error stream and returned unchanged.
             * <p>
             * Earlier the match result was ignored, so a path without
             * {@code /permanent/} or {@code /experimental/} ended in an
             * {@code IllegalStateException} instead of reaching the fallback.
             *
             * @param absolutePath the path to convert
             * @return the text id, or {@code absolutePath} itself if no id can be
             *         found in it
             */
/*     */   protected String absPathToTextId(String absolutePath)
/*     */   {
/*     */     if (this.specialMode.equals("vlp"))
/*     */     {
/*     */       String[] splitted = absolutePath.split("lit");
/*     */       // Without "lit" in the path there is no second element to return.
/*     */       if (splitted.length > 1) {
/*     */         return splitted[1];
/*     */       }
/*     */     }
/*     */     else
/*     */     {
/*     */       Matcher m = TEXT_ID_PATTERN.matcher(absolutePath);
/*     */       // group(1) may only be read after a successful match.
/*     */       if (m.matches()) {
/*     */         return m.group(1);
/*     */       }
/*     */     }
/*     */     System.err.println("correctPath: not a mpiwg path / no changes done" + absolutePath);
/*     */     return absolutePath;
/*     */   }
/*     */ 
/*     */   /**
             * Compares a file with what the index holds for it and deletes index
             * entries that are out of date.
             * <p>
             * The index is searched by the field {@code cleanedPath} using the text
             * id of the path. The result sets are visited per language; sets without
             * hits are skipped. The first set that leads to a decision ends the
             * method.
             *
             * @param filePath            path of the file
             * @param lang                language of the file; replaced by
             *                            {@code "all"} if no analyzer exists for it
             * @param deleteWrongLanguage whether hits under another language (other
             *                            than {@code "morph"}) cause a deletion
             * @param fileModDate         modification time of the file in milliseconds
             * @return {@code DELETED_WRONG_LANGUAGE} if hits under another language
             *         were deleted, {@code DELETED_OLD_VERSION} if the entry had a
             *         different modification date and was deleted,
             *         {@code FILE_EXISTS} if the dates agree (minute resolution),
             *         {@code NEW_FILE} otherwise
             * @throws IOException if the index cannot be read or changed
             */
/*     */   private int checkFileAndRemoveOldFile(String filePath, String lang, boolean deleteWrongLanguage, long fileModDate)
/*     */     throws CorruptIndexException, IOException
/*     */   {
/*     */     lang = checkSupportedLanguages(lang);
/*     */     System.out.println("lang converted+" + lang);
/*     */     Term pathTerm = new Term("cleanedPath", absPathToTextId(filePath));
/*     */     TermQuery query = new TermQuery(pathTerm);
/*     */ 
/*     */     HashMap<String,Collector> results = this.languageAnalyzers.search(query);
/*     */     if (results == null) {
/*     */       return NEW_FILE;
/*     */     }
/*     */ 
/*     */     for (String resultLang : results.keySet())
/*     */     {
/*     */       TopScoreDocCollector collector = (TopScoreDocCollector)results.get(resultLang);
/*     */       boolean hasHits = (collector != null) && (collector.getTotalHits() > 0);
/*     */       if (!hasHits) {
/*     */         continue;
/*     */       }
/*     */ 
/*     */       if (!resultLang.equals(lang))
/*     */       {
/*     */         if (deleteWrongLanguage && !resultLang.equals("morph")) {
/*     */           this.languageAnalyzers.deleteDocuments(query);
/*     */           System.out.println("language changed:" + filePath);
/*     */           return DELETED_WRONG_LANGUAGE;
/*     */         }
/*     */         continue;
/*     */       }
/*     */ 
/*     */       // Same language: only the first hit is compared.
/*     */       ScoreDoc[] hits = collector.topDocs().scoreDocs;
/*     */       if (hits.length == 0) {
/*     */         continue;
/*     */       }
/*     */       String modDate = this.languageAnalyzers.getAnalyzer(resultLang).reader.document(hits[0].doc).getField("modified").stringValue();
/*     */       String fileDate = DateTools.timeToString(fileModDate, DateTools.Resolution.MINUTE);
/*     */ 
/*     */       if (fileDate.equals(modDate)) {
/*     */         return FILE_EXISTS;
/*     */       }
/*     */       System.out.println("new file:" + filePath);
/*     */       this.languageAnalyzers.deleteDocuments(query);
/*     */       return DELETED_OLD_VERSION;
/*     */     }
/*     */ 
/*     */     return NEW_FILE;
/*     */   }
/*     */ 
/*     */   /**
             * Replaces a language for which no analyzer exists by {@code "all"}.
             *
             * @param lang the language to check
             * @return {@code lang} if {@code languageAnalyzers} has an analyzer for
             *         it, otherwise {@code "all"}
             */
/*     */   private String checkSupportedLanguages(String lang)
/*     */   {
/*     */     boolean supported = this.languageAnalyzers.getAnalyzer(lang) != null;
/*     */     return supported ? lang : "all";
/*     */   }
/*     */ 
/*     */   /**
             * Sets the flag that makes language detection consult {@code index.meta}
             * even when a language file is configured.
             *
             * @param prio the new value of the flag
             */
/*     */   public void setIndexMetaPriority(boolean prio)
/*     */   {
/*     */     indexMetaPriority = prio;
/*     */   }
/*     */ 
/*     */   /**
             * Returns the flag set by {@code setIndexMetaPriority}.
             *
             * @return the current value of {@code indexMetaPriority}
             */
/*     */   public boolean getIndexMetaPriority() {
/*     */     return indexMetaPriority;
/*     */   }
/*     */ 
/*     */   /**
             * Tells whether a file has one of the extensions listed in
             * {@code fileTypesToIndex}.
             * <p>
             * The extension is the last piece of the file name after splitting it at
             * dots; a name that yields fewer than two pieces has the empty extension.
             *
             * @param file the file to check
             * @return {@code true} if the extension is contained in
             *         {@code fileTypesToIndex}
             */
/*     */   private boolean isTextFile(File file)
/*     */   {
/*     */     String[] pieces = file.getName().split("[.]");
/*     */     String extension = (pieces.length > 1) ? pieces[pieces.length - 1] : "";
/*     */     return this.fileTypesToIndex.contains(extension);
/*     */   }
/*     */ }

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread
 * JD-Core Version:    0.5.4
 */