Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/OCRHarvesterThreaded.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/OCRHarvesterThreaded.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,102 @@ + +/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester; +/* */ +/* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread; +/* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread; +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer; +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; +/* */ import java.io.File; +/* */ import java.io.IOException; +/* */ import java.io.PrintStream; +/* */ import java.util.HashMap; +/* */ import org.apache.lucene.analysis.de.GermanAnalyzer; +/* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer; +/* */ import org.apache.lucene.analysis.standard.StandardAnalyzer; +/* */ import org.apache.lucene.index.CorruptIndexException; +/* */ import org.apache.lucene.store.LockObtainFailedException; +/* */ import org.apache.lucene.util.Version; +/* */ +/* */ public class OCRHarvesterThreaded extends HarvesterThreaded +/* */ { +/* */ private String preferedLanguage; +/* */ +/* */ public OCRHarvesterThreaded() +/* */ { +/* */ } +/* */ +/* */ public OCRHarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang) +/* */ throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 41 */ this.index_dir = index_dir; +/* 42 */ this.languageFileName = languageFileName; +/* 43 */ this.docDir = docDir; +/* 44 */ this.preferedLanguage = lang; +/* */ +/* 46 */ this.mdProviderUrl = mdProviderUrl; +/* 47 */ for (int i = 0; i < maxThread; ++i) +/* */ { +/* 49 */ this.mythreads[i] = null; +/* */ } +/* */ +/* 52 */ init_languages(); +/* */ } +/* */ +/* */ private void init_languages() { +/* 56 */ this.languageToISO.put("German", "de"); +/* 57 */ this.languageToISO.put("French", "fr"); +/* 58 */ this.languageToISO.put("English", "en"); +/* 59 */ this.languageToISO.put("German-f", "de-f"); +/* */ +/* 61 */ this.supportedLanguageFolder.put("deu", "de"); +/* 62 */ this.supportedLanguageFolder.put("deu-f", "de"); +/* 63 */ this.supportedLanguageFolder.put("fra", "fr"); +/* 64 */ this.supportedLanguageFolder.put("eng", "en"); +/* 65 */ this.supportedLanguageFolder.put("lic", "la"); +/* */ try +/* */ { +/* 68 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 69 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 70 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 71 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 72 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* */ +/* 74 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* 75 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir)); +/* */ } catch (CorruptIndexException e) { +/* 77 */ e.printStackTrace(); +/* 78 */ System.exit(1); +/* */ } catch (LockObtainFailedException e) { +/* 80 */ e.printStackTrace(); +/* 81 */ System.exit(1); +/* */ } catch (IOException e) { +/* 83 */ e.printStackTrace(); +/* 84 */ System.exit(1); +/* */ } +/* */ } +/* */ +/* */ public OCRHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl, String preferedLanguage) +/* */ throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 92 */ this(docDir, index_dir, null, mdProviderUrl, preferedLanguage); +/* */ } +/* */ +/* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException +/* */ { +/* 97 */ int freeThread = -1; +/* 98 */ while (freeThread == -1) +/* */ { +/* 100 */ freeThread = waitForFreeThread(); +/* */ } +/* */ +/* 104 */ if (this.textLanguage == null) +/* 105 */ this.textLanguage = loadLanguages(); +/* 106 */ this.mythreads[freeThread] = new OCRProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder); +/* 107 */ this.mythreads[freeThread].start(); +/* 108 */ System.out.println("New process started:" + freeThread); +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded + * JD-Core Version: 0.5.4 + */ \ No newline at end of file