diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/OCRHarvesterThreaded.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/OCRHarvesterThreaded.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,102 @@
+
+/*     */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread;
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
+/*     */ import java.io.File;
+/*     */ import java.io.IOException;
+/*     */ import java.io.PrintStream;
+/*     */ import java.util.HashMap;
+/*     */ import org.apache.lucene.analysis.de.GermanAnalyzer;
+/*     */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+/*     */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
+/*     */ import org.apache.lucene.index.CorruptIndexException;
+/*     */ import org.apache.lucene.store.LockObtainFailedException;
+/*     */ import org.apache.lucene.util.Version;
+/*     */ 
+           /**
+            * Threaded harvester that hands every file it is given to a new
+            * OCRProcessFileThread.
+            *
+            * The fields used here but not declared in this class (index_dir,
+            * languageFileName, docDir, mdProviderUrl, mythreads, maxThread,
+            * languageToISO, supportedLanguageFolder, languageAnalyzers, textLanguage)
+            * as well as waitForFreeThread() and loadLanguages() are inherited from
+            * HarvesterThreaded or one of its ancestors.
+            */
+/*     */ public class OCRHarvesterThreaded extends HarvesterThreaded
+/*     */ {
+             /** Language value passed unchanged to every OCRProcessFileThread that is started. */
+/*     */   private String preferedLanguage;
+/*     */ 
+             /**
+              * Creates an instance without assigning any field and without
+              * registering any language mapping or analyzer.
+              */
+/*     */   public OCRHarvesterThreaded()
+/*     */   {
+/*     */   }
+/*     */ 
+             /**
+              * Creates a harvester, stores the given settings, clears all thread slots
+              * and registers the language mappings and analyzers.
+              *
+              * @param docDir           stored in the inherited docDir field
+              *                         (presumably the directory to harvest; confirm in HarvesterThreaded)
+              * @param index_dir        stored in the inherited index_dir field and passed to
+              *                         every LanguageAnalyzer that is created
+              * @param languageFileName stored in the inherited languageFileName field; may be
+              *                         null (the four-argument constructor passes null)
+              * @param mdProviderUrl    stored in the inherited mdProviderUrl field
+              * @param lang             preferred language handed to each processing thread
+              * @throws CorruptIndexException     if creating a LanguageAnalyzer fails with it
+              * @throws LockObtainFailedException if creating a LanguageAnalyzer fails with it
+              * @throws IOException               if creating a LanguageAnalyzer fails with it
+              */
+/*     */   public OCRHarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
+/*     */     throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/*  41 */     this.index_dir = index_dir;
+/*  42 */     this.languageFileName = languageFileName;
+/*  43 */     this.docDir = docDir;
+/*  44 */     this.preferedLanguage = lang;
+/*     */ 
+/*  46 */     this.mdProviderUrl = mdProviderUrl;
+/*     */     // Mark every thread slot as unused before any work is scheduled.
+/*  47 */     for (int i = 0; i < maxThread; ++i)
+/*     */     {
+/*  49 */       this.mythreads[i] = null;
+/*     */     }
+/*     */ 
+/*     */     // Must run after index_dir is assigned: the analyzers are created with it.
+/*  52 */     init_languages();
+/*     */   }
+/*     */ 
+             /**
+              * Fills the language lookup tables and registers one LanguageAnalyzer per
+              * supported key ("de", "de-f", "en", "fr", "la", "all", "morph").
+              *
+              * Failures are propagated to the caller instead of terminating the JVM;
+              * the only caller is the five-argument constructor, which already declares
+              * these exceptions.
+              *
+              * @throws CorruptIndexException     if creating a LanguageAnalyzer fails with it
+              * @throws LockObtainFailedException if creating a LanguageAnalyzer fails with it
+              * @throws IOException               if creating a LanguageAnalyzer fails with it
+              */
+/*     */   private void init_languages()
+/*     */     throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/*     */     // Language name -> short code.
+/*  56 */     this.languageToISO.put("German", "de");
+/*  57 */     this.languageToISO.put("French", "fr");
+/*  58 */     this.languageToISO.put("English", "en");
+/*  59 */     this.languageToISO.put("German-f", "de-f");
+/*     */ 
+/*     */     // Folder name -> short code.
+/*     */     // NOTE(review): "deu-f" maps to "de" here although "German-f" maps to "de-f"
+/*     */     // above and a separate "de-f" analyzer is registered below - confirm intended.
+/*  61 */     this.supportedLanguageFolder.put("deu", "de");
+/*  62 */     this.supportedLanguageFolder.put("deu-f", "de");
+/*  63 */     this.supportedLanguageFolder.put("fra", "fr");
+/*  64 */     this.supportedLanguageFolder.put("eng", "en");
+/*  65 */     this.supportedLanguageFolder.put("lic", "la");
+/*     */ 
+/*     */     // One analyzer per key, all created with the same index_dir.
+/*  68 */     this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
+/*  69 */     this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
+/*  70 */     this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*  71 */     this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
+/*  72 */     this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*     */ 
+/*  74 */     this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*  75 */     this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
+/*     */   }
+/*     */ 
+             /**
+              * Convenience constructor that delegates to the five-argument constructor
+              * with a null languageFileName.
+              *
+              * @param docDir           see the five-argument constructor
+              * @param index_dir        see the five-argument constructor
+              * @param mdProviderUrl    see the five-argument constructor
+              * @param preferedLanguage preferred language handed to each processing thread
+              * @throws CorruptIndexException     if creating a LanguageAnalyzer fails with it
+              * @throws LockObtainFailedException if creating a LanguageAnalyzer fails with it
+              * @throws IOException               if creating a LanguageAnalyzer fails with it
+              */
+/*     */   public OCRHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl, String preferedLanguage)
+/*     */     throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/*  92 */     this(docDir, index_dir, null, mdProviderUrl, preferedLanguage);
+/*     */   }
+/*     */ 
+             /**
+              * Starts a new OCRProcessFileThread for the given file in a free thread slot.
+              *
+              * @param file the file handed to the new processing thread
+              * @throws CorruptIndexException     declared; not thrown by code visible in this method
+              * @throws LockObtainFailedException declared; not thrown by code visible in this method
+              * @throws IOException               declared; presumably raised by loadLanguages() or
+              *                                   the thread constructor - confirm
+              */
+/*     */   protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
+/*     */   {
+/*     */     // Keep asking until a slot index other than -1 is returned.
+/*     */     // NOTE(review): if waitForFreeThread() does not block, this loop spins - confirm.
+/*  97 */     int freeThread = -1;
+/*  98 */     while (freeThread == -1)
+/*     */     {
+/* 100 */       freeThread = waitForFreeThread();
+/*     */     }
+/*     */ 
+/*     */     // Load the language data once, on first use.
+/* 104 */     if (this.textLanguage == null)
+/*     */     {
+/* 105 */       this.textLanguage = loadLanguages();
+/*     */     }
+/* 106 */     this.mythreads[freeThread] = new OCRProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
+/* 107 */     this.mythreads[freeThread].start();
+/* 108 */     System.out.println("New process started:" + freeThread);
+/*     */   }
+/*     */ }
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file