view src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/HarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line source

/*     */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
/*     */ 
/*     */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
/*     */ import java.io.BufferedReader;
/*     */ import java.io.File;
/*     */ import java.io.FileNotFoundException;
/*     */ import java.io.FileReader;
/*     */ import java.io.IOException;
/*     */ import java.io.PrintStream;
/*     */ import java.util.ArrayList;
/*     */ import java.util.Arrays;
/*     */ import java.util.Date;
/*     */ import java.util.HashMap;
/*     */ import java.util.List;
/*     */ import org.apache.lucene.analysis.de.GermanAnalyzer;
/*     */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
/*     */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
/*     */ import org.apache.lucene.index.CorruptIndexException;
/*     */ import org.apache.lucene.store.LockObtainFailedException;
/*     */ import org.apache.lucene.util.Version;
/*     */ import org.jdom.Document;
/*     */ import org.jdom.Element;
/*     */ import org.jdom.JDOMException;
/*     */ import org.jdom.input.SAXBuilder;
/*     */ import org.jdom.xpath.XPath;
/*     */ 
/*     */ public class HarvesterThreaded
/*     */ {
/*     */   /** Debug switch: when true, harvesting stops once filecount exceeds MAXFILES. */
/*     */   private static final boolean DEBUG = false;
/*     */   /** File-count limit; only consulted when DEBUG is true. */
/*     */   private static final int MAXFILES = 100;
/*     */
/*     */   /** File name extensions (without the dot) that isTextFile accepts. */
/*     */   protected static ArrayList<String> fileTypesToIndex = new ArrayList<String>(Arrays.asList("xml"));
/*     */
/*     */   /** Names of directories whose contents are skipped while walking the tree. */
/*     */   protected static ArrayList<String> excludeFolders = new ArrayList<String>(Arrays.asList("OCR"));
/*     */   /** Not referenced anywhere in this class. */
/*     */   protected static boolean indexMetaPriority = false;
/*     */
/*     */   /** Not referenced anywhere in this class. */
/*     */   private static String specialMode = "";
/*     */   /** Number of worker slots; sizes mythreads and bounds every scan over it. */
/*     */   protected static int maxThread = 30;
/*     */   /** Root of the document tree; also the prefix for RDF archive paths and the language file. */
/*     */   protected File docDir;
/*     */   /** Directory handed to every LanguageAnalyzer created in init_languages. */
/*     */   protected File index_dir;
/*     */   /** Path-to-language table; loaded on demand by processFile via loadLanguages. */
/*     */   protected HashMap<String, String> textLanguage = null;
/*     */   /** Language name (e.g. "German") to code (e.g. "de"); filled by init_languages. */
/*     */   protected HashMap<String, String> languageToISO = new HashMap<String, String>();
/*     */   /** Registry of the per-language analyzers; optimized and closed at the end of a harvest. */
/*     */   protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
/*     */
/*     */   /** Not referenced anywhere in this class. */
/*     */   private int counter = 0;
/*     */   /** Name of the language table file; may be null, in which case no table is loaded. */
/*     */   protected String languageFileName;
/*     */   /** Worker slots; a slot is free when it is null or its worker reports done. */
/*     */   protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
/*     */   /** Incremented by waitForFreeThread each time the slot of a finished worker is reused. */
/*     */   private int filecount = 0;
/*     */
/*     */   /** Passed unchanged to every ProcessFileThread. */
/*     */   protected String mdProviderUrl = null;
/*     */   /** Passed unchanged to every ProcessFileThread. */
/*     */   private String preferedLanguage;
/*     */   /** Folder code (e.g. "deu") to language code (e.g. "de"); filled by init_languages. */
/*     */   protected HashMap<String, String> supportedLanguageFolder = new HashMap<String, String>();
/*     */ 
/*     */   /**
             * Creates a harvester without any configuration.
             *
             * Unlike the other constructors this one does not call init_languages(), so
             * docDir and index_dir stay null and no language mappings or analyzers are
             * registered.
             */
/*     */   public HarvesterThreaded()
/*     */   {
/*     */   }
/*     */ 
/*     */   /**
             * Creates a fully configured harvester and registers the language analyzers.
             *
             * @param docDir           root of the document tree to harvest
             * @param index_dir        directory handed to every language analyzer
             * @param languageFileName name of the language table file, or null for none
             * @param mdProviderUrl    value passed on to every worker thread
             * @param lang             preferred language, passed on to every worker thread
             * @throws CorruptIndexException     declared for callers; not thrown by the visible body
             * @throws LockObtainFailedException declared for callers; not thrown by the visible body
             * @throws IOException               declared for callers; init_languages() handles its
             *                                   own I/O failures by terminating the JVM
             */
/*     */   public HarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
/*     */     throws CorruptIndexException, LockObtainFailedException, IOException
/*     */   {
/*     */     this.docDir = docDir;
/*     */     this.index_dir = index_dir;
/*     */     this.languageFileName = languageFileName;
/*     */     this.mdProviderUrl = mdProviderUrl;
/*     */     this.preferedLanguage = lang;
/*     */
/*     */     // Mark the first maxThread worker slots as free.
/*     */     Arrays.fill(this.mythreads, 0, maxThread, null);
/*     */
/*     */     // Needs index_dir, so it has to run after the assignments above.
/*     */     init_languages();
/*     */   }
/*     */ 
/*     */   /**
             * Fills the language lookup tables and registers one analyzer per language code.
             *
             * Any I/O failure while creating an analyzer is fatal: the stack trace is
             * printed and the JVM exits with status 1.
             */
/*     */   private void init_languages() {
/*     */     // Language name -> language code.
/*     */     this.languageToISO.put("German", "de");
/*     */     this.languageToISO.put("French", "fr");
/*     */     this.languageToISO.put("English", "en");
/*     */     this.languageToISO.put("German-f", "de-f");
/*     */
/*     */     // Folder code -> language code.
/*     */     this.supportedLanguageFolder.put("deu", "de");
/*     */     this.supportedLanguageFolder.put("deu-f", "de");
/*     */     this.supportedLanguageFolder.put("fra", "fr");
/*     */     this.supportedLanguageFolder.put("eng", "en");
/*     */     this.supportedLanguageFolder.put("lic", "la");
/*     */
/*     */     final File indexRoot = this.index_dir;
/*     */     try {
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), indexRoot));
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), indexRoot));
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), indexRoot));
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), indexRoot));
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), indexRoot));
/*     */
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), indexRoot));
/*     */       this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), indexRoot));
/*     */     } catch (IOException e) {
/*     */       // CorruptIndexException and LockObtainFailedException are IOException
/*     */       // subclasses, so this single handler covers all three failure kinds.
/*     */       e.printStackTrace();
/*     */       System.exit(1);
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Convenience constructor: delegates to the five-argument constructor with
             * neither a language table file nor a preferred language (both null).
             *
             * @param docDir        root of the document tree to harvest
             * @param index_dir     directory handed to every language analyzer
             * @param mdProviderUrl value passed on to every worker thread
             */
/*     */   public HarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException
/*     */   {
/* 180 */     this(docDir, index_dir, null, mdProviderUrl, null);
/*     */   }
/*     */ 
/*     */   /**
             * Reads the language table file into a map.
             *
             * The file is looked up first as docDir + "/" + languageFileName and, if that
             * does not exist, as languageFileName on its own. Double quotes are stripped
             * from every line and the line is split on commas; only lines with exactly
             * two fields are used (first field = key, second = value). When the file was
             * found below docDir, each key is prefixed with docDir + "/".
             *
             * @return the table, or null when no file name is configured, the file does
             *         not exist or cannot be opened, or reading it fails
             */
/*     */   protected HashMap<String, String> loadLanguages()
/*     */   {
/*     */     if (this.languageFileName == null) {
/*     */       return null;
/*     */     }
/*     */
/*     */     String languageFilePath = this.docDir + "/" + this.languageFileName;
/*     */     boolean relativ = true;
/*     */     if (!new File(languageFilePath).exists()) {
/*     */       // Fall back to treating the configured name as a path of its own.
/*     */       languageFilePath = this.languageFileName;
/*     */       relativ = false;
/*     */       if (!new File(languageFilePath).exists()) {
/*     */         return null;
/*     */       }
/*     */     }
/*     */
/*     */     BufferedReader in;
/*     */     try {
/*     */       in = new BufferedReader(new FileReader(languageFilePath));
/*     */     } catch (FileNotFoundException e) {
/*     */       return null;
/*     */     }
/*     */
/*     */     HashMap<String, String> languages = new HashMap<String, String>();
/*     */     try {
/*     */       String zeile;
/*     */       while ((zeile = in.readLine()) != null) {
/*     */         String[] splitted = zeile.replace("\"", "").split("[,]");
/*     */         if (splitted.length != 2) {
/*     */           continue;
/*     */         }
/*     */         String key = relativ ? this.docDir + "/" + splitted[0] : splitted[0];
/*     */         languages.put(key, splitted[1]);
/*     */       }
/*     */     } catch (IOException e) {
/*     */       e.printStackTrace();
/*     */       return null;
/*     */     } finally {
/*     */       // Always release the file handle; the original never closed the reader.
/*     */       try {
/*     */         in.close();
/*     */       } catch (IOException ignored) {
/*     */         // Nothing useful can be done if closing a reader fails.
/*     */       }
/*     */     }
/*     */
/*     */     return languages;
/*     */   }
/*     */ 
/*     */   /**
             * Indexes every archive path listed in an RDF file, then optimizes and closes
             * the language analyzers and prints the elapsed time.
             *
             * An IOException from any step is reported on standard output and not
             * rethrown; in that case the analyzers are neither optimized nor closed.
             *
             * NOTE(review): indexDocs only starts worker threads (see processFile) and
             * nothing visible here waits for them before optimize()/close() - confirm
             * that LanguageAnalyzers or ProcessFileThread take care of that.
             *
             * @param rdffilepath location of the RDF file, passed to the JDOM SAXBuilder
             * @throws InterruptedException propagated from indexDocs
             * @throws JDOMException        if the RDF file cannot be parsed or queried
             */
/*     */   public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException {
/*     */     final long startMillis = System.currentTimeMillis();
/*     */     try {
/*     */       System.out.println("Indexing to directory '" + this.index_dir + "'...");
/*     */       ArrayList<String> archivePaths = getFileListFromRDF(rdffilepath);
/*     */       indexDocs(archivePaths);
/*     */       System.out.println("Optimizing...");
/*     */       this.languageAnalyzers.optimize();
/*     */       this.languageAnalyzers.close();
/*     */
/*     */       final long elapsedMillis = System.currentTimeMillis() - startMillis;
/*     */       System.out.println(elapsedMillis + " total milliseconds");
/*     */     } catch (IOException e) {
/*     */       System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Collects the text of every archive-path element (namespace
             * http://www.mpiwg-berlin.mpg.de/ns/mpiwg) found anywhere in the RDF document.
             *
             * @param rdffilepath location of the RDF file, passed to the JDOM SAXBuilder
             * @return the element texts in the order the XPath query returns them
             * @throws JDOMException if the document cannot be parsed or the query fails
             * @throws IOException   if the document cannot be read
             */
/*     */   private ArrayList<String> getFileListFromRDF(String rdffilepath)
/*     */     throws JDOMException, IOException
/*     */   {
/*     */     ArrayList<String> archivePaths = new ArrayList<String>();
/*     */
/*     */     Document rdf = new SAXBuilder().build(rdffilepath);
/*     */     Element root = rdf.getRootElement();
/*     */
/*     */     XPath query = XPath.newInstance("//MPIWG:archive-path");
/*     */     query.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
/*     */
/*     */     for (Object node : query.selectNodes(root)) {
/*     */       archivePaths.add(((Element) node).getText());
/*     */     }
/*     */     return archivePaths;
/*     */   }
/*     */ 
/*     */   /**
             * Indexes the whole tree below docDir, then optimizes and closes the language
             * analyzers and prints the elapsed time.
             *
             * An IOException from any step is reported on standard output and not
             * rethrown; in that case the analyzers are neither optimized nor closed.
             *
             * NOTE(review): indexDocs only starts worker threads (see processFile) and
             * nothing visible here waits for them before optimize()/close() - confirm
             * that LanguageAnalyzers or ProcessFileThread take care of that.
             *
             * @throws InterruptedException propagated from indexDocs
             */
/*     */   public void harvestFolder() throws InterruptedException {
/*     */     final long startMillis = System.currentTimeMillis();
/*     */     try {
/*     */       System.out.println("Indexing to directory '" + this.index_dir + "'...");
/*     */       indexDocs(this.docDir);
/*     */       System.out.println("Optimizing...");
/*     */       this.languageAnalyzers.optimize();
/*     */       this.languageAnalyzers.close();
/*     */
/*     */       final long elapsedMillis = System.currentTimeMillis() - startMillis;
/*     */       System.out.println(elapsedMillis + " total milliseconds");
/*     */     } catch (IOException e) {
/*     */       System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Indexes each listed path. Every entry is appended directly (no separator is
             * inserted) to the absolute path of docDir before being processed.
             *
             * @param files paths to index, relative to docDir
             * @throws IOException          propagated from indexDocs(File)
             * @throws InterruptedException propagated from indexDocs(File)
             */
/*     */   private void indexDocs(ArrayList<String> files)
/*     */     throws IOException, InterruptedException
/*     */   {
/*     */     final String rootPath = this.docDir.getAbsolutePath();
/*     */     for (String relativePath : files) {
/*     */       indexDocs(new File(rootPath + relativePath));
/*     */       // Debug builds stop once the file limit has been passed.
/*     */       if (DEBUG && this.filecount > MAXFILES) {
/*     */         return;
/*     */       }
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Recursively walks a file or directory and hands accepted files to
             * processFile.
             *
             * Unreadable entries are ignored silently. Directories whose name is listed
             * in excludeFolders, or whose content cannot be listed, are not descended
             * into. Regular files that isTextFile rejects are reported on standard output
             * as "not adding".
             *
             * @param file file or directory to index
             * @throws IOException          propagated from processFile
             * @throws InterruptedException declared for callers
             */
/*     */   void indexDocs(File file)
/*     */     throws IOException, InterruptedException
/*     */   {
/*     */     if (!file.canRead()) {
/*     */       return;
/*     */     }
/*     */
/*     */     if (!file.isDirectory()) {
/*     */       if (isTextFile(file)) {
/*     */         processFile(file);
/*     */       } else {
/*     */         System.out.println("not adding " + file);
/*     */       }
/*     */       return;
/*     */     }
/*     */
/*     */     // Debug builds stop descending once the file limit has been passed.
/*     */     if (DEBUG && this.filecount > MAXFILES) {
/*     */       return;
/*     */     }
/*     */
/*     */     String[] children = file.list();
/*     */     boolean excluded = excludeFolders.contains(file.getName());
/*     */     if (children == null || excluded) {
/*     */       return;
/*     */     }
/*     */
/*     */     for (String child : children) {
/*     */       indexDocs(new File(file, child));
/*     */       if (DEBUG && this.filecount > MAXFILES) {
/*     */         return;
/*     */       }
/*     */     }
/*     */   }
/*     */ 
/*     */   /**
             * Starts a worker thread for one file, waiting until a worker slot is free.
             *
             * While all slots are busy the slots are polled with a short pause between
             * attempts rather than in a tight loop, so the dispatching thread does not
             * occupy a CPU core. An interrupt received while waiting does not abort the
             * wait; it is remembered and the thread's interrupt status is restored
             * before the worker is started.
             *
             * The language table is loaded on first use; if loadLanguages() returns null
             * the load is attempted again for the next file.
             *
             * @param file the file to hand to a new ProcessFileThread
             * @throws CorruptIndexException     declared for callers and overriding classes
             * @throws LockObtainFailedException declared for callers and overriding classes
             * @throws IOException               declared for callers and overriding classes
             */
/*     */   protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
/*     */   {
/*     */     final long pollMillis = 10L;
/*     */     boolean interrupted = false;
/*     */
/*     */     int freeThread = waitForFreeThread();
/*     */     while (freeThread == -1) {
/*     */       try {
/*     */         Thread.sleep(pollMillis);
/*     */       } catch (InterruptedException e) {
/*     */         // Keep waiting (as the original spin did) but remember the request.
/*     */         interrupted = true;
/*     */       }
/*     */       freeThread = waitForFreeThread();
/*     */     }
/*     */     if (interrupted) {
/*     */       Thread.currentThread().interrupt();
/*     */     }
/*     */
/*     */     if (this.textLanguage == null) {
/*     */       this.textLanguage = loadLanguages();
/*     */     }
/*     */     this.mythreads[freeThread] = new ProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
/*     */     this.mythreads[freeThread].start();
/*     */     System.out.println("New process started:" + freeThread);
/*     */   }
/*     */ 
/*     */   /**
             * Scans the worker slots once for one that can be (re)used. Despite its name
             * this method does not block.
             *
             * A slot that was never used is returned as is. A slot whose worker reports
             * done is returned after filecount has been incremented and printed.
             *
             * @return index of the first usable slot, or -1 if every worker is still busy
             */
/*     */   protected int waitForFreeThread()
/*     */   {
/*     */     for (int slot = 0; slot < maxThread; slot++) {
/*     */       ProcessFileThread worker = this.mythreads[slot];
/*     */       if (worker == null) {
/*     */         return slot;
/*     */       }
/*     */       if (worker.done) {
/*     */         this.filecount++;
/*     */         System.out.println("filecount:" + this.filecount);
/*     */         return slot;
/*     */       }
/*     */     }
/*     */     return -1;
/*     */   }
/*     */ 
/*     */   /**
             * Tells whether a file should be indexed, judged by its extension only.
             *
             * The extension is the last piece of the file name when split on dots; a name
             * that yields a single piece (or none) has the empty extension. The comparison
             * with fileTypesToIndex is case-sensitive.
             *
             * @param file the file to check; only its name is inspected
             * @return true if the extension is contained in fileTypesToIndex
             */
/*     */   private boolean isTextFile(File file)
/*     */   {
/*     */     String[] pieces = file.getName().split("[.]");
/*     */     String ext = (pieces.length > 1) ? pieces[pieces.length - 1] : "";
/*     */     return fileTypesToIndex.contains(ext);
/*     */   }
/*     */ }

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded
 * JD-Core Version:    0.5.4
 */