0
|
1
|
|
2 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
|
|
3 /* */
|
|
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread;
|
|
5 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
|
|
6 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
|
|
7 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
|
|
8 /* */ import java.io.File;
|
|
9 /* */ import java.io.IOException;
|
|
10 /* */ import java.io.PrintStream;
|
|
11 /* */ import java.util.HashMap;
|
|
12 /* */ import org.apache.lucene.analysis.de.GermanAnalyzer;
|
|
13 /* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
|
14 /* */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
15 /* */ import org.apache.lucene.index.CorruptIndexException;
|
|
16 /* */ import org.apache.lucene.store.LockObtainFailedException;
|
|
17 /* */ import org.apache.lucene.util.Version;
|
|
18 /* */
|
|
19 /* */ public class OCRHarvesterThreaded extends HarvesterThreaded
|
|
20 /* */ {
|
|
21 /* */ private String preferedLanguage;
|
|
22 /* */
|
|
23 /* */ public OCRHarvesterThreaded()
|
|
24 /* */ {
|
|
25 /* */ }
|
|
26 /* */
|
|
27 /* */ public OCRHarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
|
|
28 /* */ throws CorruptIndexException, LockObtainFailedException, IOException
|
|
29 /* */ {
|
|
30 /* 41 */ this.index_dir = index_dir;
|
|
31 /* 42 */ this.languageFileName = languageFileName;
|
|
32 /* 43 */ this.docDir = docDir;
|
|
33 /* 44 */ this.preferedLanguage = lang;
|
|
34 /* */
|
|
35 /* 46 */ this.mdProviderUrl = mdProviderUrl;
|
|
36 /* 47 */ for (int i = 0; i < maxThread; ++i)
|
|
37 /* */ {
|
|
38 /* 49 */ this.mythreads[i] = null;
|
|
39 /* */ }
|
|
40 /* */
|
|
41 /* 52 */ init_languages();
|
|
42 /* */ }
|
|
43 /* */
|
|
44 /* */ private void init_languages() {
|
|
45 /* 56 */ this.languageToISO.put("German", "de");
|
|
46 /* 57 */ this.languageToISO.put("French", "fr");
|
|
47 /* 58 */ this.languageToISO.put("English", "en");
|
|
48 /* 59 */ this.languageToISO.put("German-f", "de-f");
|
|
49 /* */
|
|
50 /* 61 */ this.supportedLanguageFolder.put("deu", "de");
|
|
51 /* 62 */ this.supportedLanguageFolder.put("deu-f", "de");
|
|
52 /* 63 */ this.supportedLanguageFolder.put("fra", "fr");
|
|
53 /* 64 */ this.supportedLanguageFolder.put("eng", "en");
|
|
54 /* 65 */ this.supportedLanguageFolder.put("lic", "la");
|
|
55 /* */ try
|
|
56 /* */ {
|
|
57 /* 68 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
58 /* 69 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
59 /* 70 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
60 /* 71 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
61 /* 72 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
62 /* */
|
|
63 /* 74 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
64 /* 75 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
|
|
65 /* */ } catch (CorruptIndexException e) {
|
|
66 /* 77 */ e.printStackTrace();
|
|
67 /* 78 */ System.exit(1);
|
|
68 /* */ } catch (LockObtainFailedException e) {
|
|
69 /* 80 */ e.printStackTrace();
|
|
70 /* 81 */ System.exit(1);
|
|
71 /* */ } catch (IOException e) {
|
|
72 /* 83 */ e.printStackTrace();
|
|
73 /* 84 */ System.exit(1);
|
|
74 /* */ }
|
|
75 /* */ }
|
|
76 /* */
|
|
77 /* */ public OCRHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl, String preferedLanguage)
|
|
78 /* */ throws CorruptIndexException, LockObtainFailedException, IOException
|
|
79 /* */ {
|
|
80 /* 92 */ this(docDir, index_dir, null, mdProviderUrl, preferedLanguage);
|
|
81 /* */ }
|
|
82 /* */
|
|
83 /* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
|
|
84 /* */ {
|
|
85 /* 97 */ int freeThread = -1;
|
|
86 /* 98 */ while (freeThread == -1)
|
|
87 /* */ {
|
|
88 /* 100 */ freeThread = waitForFreeThread();
|
|
89 /* */ }
|
|
90 /* */
|
|
91 /* 104 */ if (this.textLanguage == null)
|
|
92 /* 105 */ this.textLanguage = loadLanguages();
|
|
93 /* 106 */ this.mythreads[freeThread] = new OCRProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
|
|
94 /* 107 */ this.mythreads[freeThread].start();
|
|
95 /* 108 */ System.out.println("New process started:" + freeThread);
|
|
96 /* */ }
|
|
97 /* */ }
|
|
98
|
|
99 /* Location: /private/tmp/fulltextIndexer.jar
|
|
100 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded
|
|
101 * JD-Core Version: 0.5.4
|
|
102 */ |