diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/HarvesterThreaded.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/HarvesterThreaded.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,311 @@
+/*     */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
+/*     */ import java.io.BufferedReader;
+/*     */ import java.io.File;
+/*     */ import java.io.FileNotFoundException;
+/*     */ import java.io.FileReader;
+/*     */ import java.io.IOException;
+/*     */ import java.io.PrintStream;
+/*     */ import java.util.ArrayList;
+/*     */ import java.util.Arrays;
+/*     */ import java.util.Date;
+/*     */ import java.util.HashMap;
+/*     */ import java.util.List;
+/*     */ import org.apache.lucene.analysis.de.GermanAnalyzer;
+/*     */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+/*     */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
+/*     */ import org.apache.lucene.index.CorruptIndexException;
+/*     */ import org.apache.lucene.store.LockObtainFailedException;
+/*     */ import org.apache.lucene.util.Version;
+/*     */ import org.jdom.Document;
+/*     */ import org.jdom.Element;
+/*     */ import org.jdom.JDOMException;
+/*     */ import org.jdom.input.SAXBuilder;
+/*     */ import org.jdom.xpath.XPath;
+/*     */ 
+/*     */ public class HarvesterThreaded
+/*     */ {
+            // Debug switch: when true, traversal stops once filecount exceeds MAXFILES.
+            private static final boolean DEBUG = false;
+            private static final int MAXFILES = 100; // only used if DEBUG is true
+
+            // File name extensions (without the dot) that isTextFile() accepts.
+            protected static ArrayList<String> fileTypesToIndex = new ArrayList<String>(Arrays.asList(new String[] { "xml" }));
+
+            // Directory names whose entries indexDocs(File) does not descend into.
+            protected static ArrayList<String> excludeFolders = new ArrayList<String>(Arrays.asList(new String[] { "OCR" }));
+            // Not read anywhere in this class.
+            protected static boolean indexMetaPriority = false;
+
+            // Not read anywhere in this class.
+            private static String specialMode = "";
+            // Number of worker slots; sizes mythreads and bounds the slot scans.
+            protected static int maxThread = 30;
+
+            // Root directory that harvestFolder() walks and that RDF archive paths are appended to.
+            protected File docDir;
+            // Passed to every LanguageAnalyzer created in init_languages().
+            protected File index_dir;
+
+            // Filled lazily by processFile() via loadLanguages(); stays null if no language file is available.
+            protected HashMap<String, String> textLanguage = null;
+            // Language name -> short code (e.g. "German" -> "de"); populated in init_languages().
+            protected HashMap<String, String> languageToISO = new HashMap<String, String>();
+            protected LanguageAnalyzers languageAnalyzers = new LanguageAnalyzers();
+
+            // Not read anywhere in this class.
+            private int counter = 0;
+            // Name or path of the language mapping file read by loadLanguages(); may be null.
+            protected String languageFileName;
+            // Worker slots; a null entry or an entry whose done flag is set can be reused.
+            protected ProcessFileThread[] mythreads = new ProcessFileThread[maxThread];
+            // Incremented by waitForFreeThread() each time a finished worker's slot is handed out.
+            private int filecount = 0;
+
+            // Handed unchanged to every ProcessFileThread.
+            protected String mdProviderUrl = null;
+            // Handed unchanged to every ProcessFileThread.
+            private String preferedLanguage;
+            // Three-letter key -> short language code (e.g. "deu" -> "de"); populated in init_languages().
+            // NOTE(review): presumably the keys are language folder names - confirm against ProcessFileThread.
+            protected HashMap<String, String> supportedLanguageFolder = new HashMap<String, String>();
+/*     */ 
+            /**
+             * Creates a harvester without assigning docDir, index_dir, languageFileName,
+             * mdProviderUrl or preferedLanguage.
+             *
+             * Unlike the five-argument constructor, this one does not call init_languages(),
+             * so languageToISO, supportedLanguageFolder and languageAnalyzers keep the empty
+             * state given by their field initialisers.
+             */
+/*     */   public HarvesterThreaded()
+/*     */   {
+/*     */   }
+/*     */ 
+            /**
+             * Creates a harvester for the given document root and index directory.
+             *
+             * Stores all arguments in the corresponding fields, clears the first maxThread
+             * worker slots and then registers the language tables and analyzers through
+             * init_languages().
+             *
+             * @param docDir           root directory of the documents to index
+             * @param index_dir        directory handed to every LanguageAnalyzer
+             * @param languageFileName name or path of the language mapping file, may be null
+             * @param mdProviderUrl    value passed on to each ProcessFileThread
+             * @param lang             preferred language, passed on to each ProcessFileThread
+             * @throws CorruptIndexException     declared for callers; init_languages() handles it itself
+             * @throws LockObtainFailedException declared for callers; init_languages() handles it itself
+             * @throws IOException               declared for callers; init_languages() handles it itself
+             */
+            public HarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
+              throws CorruptIndexException, LockObtainFailedException, IOException
+            {
+              this.docDir = docDir;
+              this.index_dir = index_dir;
+              this.languageFileName = languageFileName;
+              this.mdProviderUrl = mdProviderUrl;
+              this.preferedLanguage = lang;
+
+              // Explicitly mark every worker slot as unused.
+              int slot = 0;
+              while (slot < maxThread)
+              {
+                this.mythreads[slot] = null;
+                slot++;
+              }
+
+              // Must run after index_dir is set: the analyzers are created against it.
+              init_languages();
+            }
+/*     */ 
+/*     */   private void init_languages() {
+/* 146 */     this.languageToISO.put("German", "de");
+/* 147 */     this.languageToISO.put("French", "fr");
+/* 148 */     this.languageToISO.put("English", "en");
+/* 149 */     this.languageToISO.put("German-f", "de-f");
+/*     */ 
+/* 151 */     this.supportedLanguageFolder.put("deu", "de");
+/* 152 */     this.supportedLanguageFolder.put("deu-f", "de");
+/* 153 */     this.supportedLanguageFolder.put("fra", "fr");
+/* 154 */     this.supportedLanguageFolder.put("eng", "en");
+/* 155 */     this.supportedLanguageFolder.put("lic", "la");
+/*     */     try
+/*     */     {
+/* 158 */       this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
+/* 159 */       this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
+/* 160 */       this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/* 161 */       this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
+/* 162 */       this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*     */ 
+/* 164 */       this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/* 165 */       this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*     */     } catch (CorruptIndexException e) {
+/* 167 */       e.printStackTrace();
+/* 168 */       System.exit(1);
+/*     */     } catch (LockObtainFailedException e) {
+/* 170 */       e.printStackTrace();
+/* 171 */       System.exit(1);
+/*     */     } catch (IOException e) {
+/* 173 */       e.printStackTrace();
+/* 174 */       System.exit(1);
+/*     */     }
+/*     */   }
+/*     */ 
+            /**
+             * Convenience constructor: delegates to the five-argument constructor with
+             * null for both the language file name and the preferred language.
+             *
+             * @param docDir        root directory of the documents to index
+             * @param index_dir     directory handed to every LanguageAnalyzer
+             * @param mdProviderUrl value passed on to each ProcessFileThread
+             */
+/*     */   public HarvesterThreaded(File docDir, File index_dir, String mdProviderUrl) throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/* 180 */     this(docDir, index_dir, null, mdProviderUrl, null);
+/*     */   }
+/*     */ 
+            /**
+             * Reads the language mapping file into a map.
+             *
+             * The file is looked up first as docDir + "/" + languageFileName. If it exists
+             * there, every key is stored as docDir + "/" + first field. Otherwise
+             * languageFileName is tried as given and keys are stored unchanged.
+             *
+             * Each line has its double quotes removed and is split on commas; only lines
+             * that yield exactly two fields are stored (first field -> second field).
+             *
+             * @return the parsed map, or null when languageFileName is null, the file
+             *         exists in neither location, it cannot be opened, or reading fails
+             */
+            protected HashMap<String, String> loadLanguages()
+            {
+              if (this.languageFileName == null)
+                return null;
+
+              File languageFile = new File(this.docDir + "/" + this.languageFileName);
+              boolean relativ = true;
+              if (!languageFile.exists())
+              {
+                languageFile = new File(this.languageFileName);
+                relativ = false;
+                if (!languageFile.exists())
+                  return null;
+              }
+
+              BufferedReader in;
+              try {
+                in = new BufferedReader(new FileReader(languageFile));
+              } catch (FileNotFoundException e) {
+                return null;
+              }
+
+              HashMap<String, String> languages = new HashMap<String, String>();
+              try {
+                String zeile;
+                while ((zeile = in.readLine()) != null) {
+                  String[] splitted = zeile.replace("\"", "").split("[,]");
+                  if (splitted.length == 2) {
+                    String key = relativ ? this.docDir + "/" + splitted[0] : splitted[0];
+                    languages.put(key, splitted[1]);
+                  }
+                }
+              }
+              catch (IOException e) {
+                e.printStackTrace();
+                return null;
+              }
+              finally {
+                // Always release the file handle; the original never closed the reader.
+                try {
+                  in.close();
+                } catch (IOException ignored) {
+                  // Nothing useful can be done if closing a read-only stream fails.
+                }
+              }
+
+              return languages;
+            }
+/*     */ 
+            /**
+             * Indexes every archive path listed in the given RDF file, then optimizes and
+             * closes the analyzers and prints the elapsed time in milliseconds.
+             *
+             * An IOException from any of these steps is reported on standard output and
+             * not rethrown; in that case optimize/close and the timing line are skipped.
+             *
+             * NOTE(review): nothing in this method waits for the worker threads started
+             * by processFile() before optimize()/close() are called - confirm that
+             * LanguageAnalyzers copes with workers that are still running.
+             *
+             * @param rdffilepath location of the RDF file, passed to the JDOM SAXBuilder
+             * @throws JDOMException if the RDF file cannot be parsed or the XPath fails
+             */
+            public void harvestFromRDF(String rdffilepath) throws InterruptedException, JDOMException {
+              final long startMillis = new Date().getTime();
+              try
+              {
+                System.out.println("Indexing to directory '" + this.index_dir + "'...");
+                indexDocs(getFileListFromRDF(rdffilepath));
+
+                System.out.println("Optimizing...");
+                this.languageAnalyzers.optimize();
+                this.languageAnalyzers.close();
+
+                final long elapsedMillis = new Date().getTime() - startMillis;
+                System.out.println(elapsedMillis + " total milliseconds");
+              }
+              catch (IOException e)
+              {
+                System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
+              }
+            }
+/*     */ 
+            /**
+             * Parses the RDF file and collects the text of every archive-path element in
+             * the http://www.mpiwg-berlin.mpg.de/ns/mpiwg namespace, in the order the
+             * XPath query returns them.
+             *
+             * @param rdffilepath location of the RDF file, passed to the JDOM SAXBuilder
+             * @return the element texts; empty if the document has no such element
+             * @throws JDOMException if the document cannot be parsed or the XPath fails
+             * @throws IOException   if the document cannot be read
+             */
+            private ArrayList<String> getFileListFromRDF(String rdffilepath)
+              throws JDOMException, IOException
+            {
+              Document rdf = new SAXBuilder().build(rdffilepath);
+
+              XPath archivePathQuery = XPath.newInstance("//MPIWG:archive-path");
+              archivePathQuery.addNamespace("MPIWG", "http://www.mpiwg-berlin.mpg.de/ns/mpiwg");
+
+              ArrayList<String> archivePaths = new ArrayList<String>();
+              for (Object node : archivePathQuery.selectNodes(rdf.getRootElement()))
+              {
+                archivePaths.add(((Element) node).getText());
+              }
+              return archivePaths;
+            }
+/*     */ 
+            /**
+             * Indexes everything below docDir, then optimizes and closes the analyzers
+             * and prints the elapsed time in milliseconds.
+             *
+             * An IOException from any of these steps is reported on standard output and
+             * not rethrown; in that case optimize/close and the timing line are skipped.
+             *
+             * NOTE(review): nothing in this method waits for the worker threads started
+             * by processFile() before optimize()/close() are called - confirm that
+             * LanguageAnalyzers copes with workers that are still running.
+             */
+            public void harvestFolder() throws InterruptedException {
+              final long startMillis = new Date().getTime();
+              try
+              {
+                System.out.println("Indexing to directory '" + this.index_dir + "'...");
+                indexDocs(this.docDir);
+
+                System.out.println("Optimizing...");
+                this.languageAnalyzers.optimize();
+                this.languageAnalyzers.close();
+
+                final long elapsedMillis = new Date().getTime() - startMillis;
+                System.out.println(elapsedMillis + " total milliseconds");
+              }
+              catch (IOException e)
+              {
+                System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
+              }
+            }
+/*     */ 
+            /**
+             * Indexes each listed path. Every entry is appended directly to the absolute
+             * path of docDir (no separator is inserted) and handed to indexDocs(File).
+             *
+             * When DEBUG is enabled the loop stops once filecount exceeds MAXFILES.
+             *
+             * @param files paths relative to docDir
+             */
+            private void indexDocs(ArrayList<String> files)
+              throws IOException, InterruptedException
+            {
+              for (int i = 0; i < files.size(); i++)
+              {
+                String relativePath = files.get(i);
+                indexDocs(new File(this.docDir.getAbsolutePath() + relativePath));
+
+                if (DEBUG && this.filecount > MAXFILES)
+                {
+                  break;
+                }
+              }
+            }
+/*     */ 
+            /**
+             * Recursively indexes a file or directory.
+             *
+             * Unreadable entries are ignored. A regular file is passed to processFile()
+             * if isTextFile() accepts it; otherwise a "not adding" line is printed. A
+             * directory is descended into unless its name is in excludeFolders or its
+             * listing is unavailable.
+             *
+             * When DEBUG is enabled, directories are skipped and the child loop stops
+             * once filecount exceeds MAXFILES.
+             *
+             * @param file file or directory to index
+             */
+            void indexDocs(File file)
+              throws IOException, InterruptedException
+            {
+              if (!file.canRead())
+              {
+                return;
+              }
+
+              if (!file.isDirectory())
+              {
+                if (isTextFile(file))
+                {
+                  processFile(file);
+                }
+                else
+                {
+                  System.out.println("not adding " + file);
+                }
+                return;
+              }
+
+              if (DEBUG && this.filecount > MAXFILES)
+              {
+                return;
+              }
+
+              String[] children = file.list();
+              if (children == null || excludeFolders.contains(file.getName()))
+              {
+                return;
+              }
+
+              for (String child : children)
+              {
+                indexDocs(new File(file, child));
+                if (DEBUG && this.filecount > MAXFILES)
+                {
+                  break;
+                }
+              }
+            }
+/*     */ 
+/*     */   protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/* 348 */     int freeThread = -1;
+/* 349 */     while (freeThread == -1)
+/*     */     {
+/* 351 */       freeThread = waitForFreeThread();
+/*     */     }
+/*     */ 
+/* 355 */     if (this.textLanguage == null)
+/* 356 */       this.textLanguage = loadLanguages();
+/* 357 */     this.mythreads[freeThread] = new ProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
+/* 358 */     this.mythreads[freeThread].start();
+/* 359 */     System.out.println("New process started:" + freeThread);
+/*     */   }
+/*     */ 
+            /**
+             * Scans the worker slots once, in index order, for one that can be reused.
+             *
+             * A slot that is still null is returned immediately. A slot whose worker
+             * reports done is also returned, after filecount has been incremented and
+             * printed. Despite its name, this method does not block.
+             *
+             * @return the index of a reusable slot, or -1 if all workers are still busy
+             */
+            protected int waitForFreeThread()
+            {
+              for (int slot = 0; slot < maxThread; slot++)
+              {
+                ProcessFileThread worker = this.mythreads[slot];
+                if (worker == null)
+                {
+                  return slot;
+                }
+                if (worker.done)
+                {
+                  this.filecount++;
+                  System.out.println("filecount:" + this.filecount);
+                  return slot;
+                }
+              }
+              return -1;
+            }
+/*     */ 
+            /**
+             * Tells whether the file's extension is one of fileTypesToIndex.
+             *
+             * The extension is the text after the last dot of the file name; a name
+             * without a dot, or one ending in a dot, has the empty extension. The
+             * comparison is case-sensitive.
+             *
+             * @param file the file to classify
+             * @return true if the extension is listed in fileTypesToIndex
+             */
+            private boolean isTextFile(File file)
+            {
+              String name = file.getName();
+
+              // lastIndexOf instead of split("[.]"): split discards trailing empty
+              // strings, so "foo.xml." used to be classified as "xml".
+              int lastDot = name.lastIndexOf('.');
+              String ext = (lastDot >= 0) ? name.substring(lastDot + 1) : "";
+
+              return fileTypesToIndex.contains(ext);
+            }
+/*     */ }
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.harvester.HarvesterThreaded
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file