annotate src/de/mpiwg/dwinter/fulltextIndexer/harvester/.svn/text-base/OCRHarvesterThreaded.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1
dc7622afcfea initial
dwinter
parents:
diff changeset
2 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester;
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread;
dc7622afcfea initial
dwinter
parents:
diff changeset
5 /* */ import de.mpiwg.dwinter.fulltextIndexer.harvester.processors.ProcessFileThread;
dc7622afcfea initial
dwinter
parents:
diff changeset
6 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 /* */ import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
9 /* */ import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
10 /* */ import java.io.PrintStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 /* */ import java.util.HashMap;
dc7622afcfea initial
dwinter
parents:
diff changeset
12 /* */ import org.apache.lucene.analysis.de.GermanAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
13 /* */ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
14 /* */ import org.apache.lucene.analysis.standard.StandardAnalyzer;
dc7622afcfea initial
dwinter
parents:
diff changeset
15 /* */ import org.apache.lucene.index.CorruptIndexException;
dc7622afcfea initial
dwinter
parents:
diff changeset
16 /* */ import org.apache.lucene.store.LockObtainFailedException;
dc7622afcfea initial
dwinter
parents:
diff changeset
17 /* */ import org.apache.lucene.util.Version;
dc7622afcfea initial
dwinter
parents:
diff changeset
18 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
19 /* */ public class OCRHarvesterThreaded extends HarvesterThreaded
dc7622afcfea initial
dwinter
parents:
diff changeset
20 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
21 /* */ private String preferedLanguage;
dc7622afcfea initial
dwinter
parents:
diff changeset
22 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
23 /* */ public OCRHarvesterThreaded()
dc7622afcfea initial
dwinter
parents:
diff changeset
24 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
25 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
26 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
27 /* */ public OCRHarvesterThreaded(File docDir, File index_dir, String languageFileName, String mdProviderUrl, String lang)
dc7622afcfea initial
dwinter
parents:
diff changeset
28 /* */ throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
29 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
30 /* 41 */ this.index_dir = index_dir;
dc7622afcfea initial
dwinter
parents:
diff changeset
31 /* 42 */ this.languageFileName = languageFileName;
dc7622afcfea initial
dwinter
parents:
diff changeset
32 /* 43 */ this.docDir = docDir;
dc7622afcfea initial
dwinter
parents:
diff changeset
33 /* 44 */ this.preferedLanguage = lang;
dc7622afcfea initial
dwinter
parents:
diff changeset
34 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
35 /* 46 */ this.mdProviderUrl = mdProviderUrl;
dc7622afcfea initial
dwinter
parents:
diff changeset
36 /* 47 */ for (int i = 0; i < maxThread; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
37 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
38 /* 49 */ this.mythreads[i] = null;
dc7622afcfea initial
dwinter
parents:
diff changeset
39 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
40 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
41 /* 52 */ init_languages();
dc7622afcfea initial
dwinter
parents:
diff changeset
42 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
43 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
44 /* */ private void init_languages() {
dc7622afcfea initial
dwinter
parents:
diff changeset
45 /* 56 */ this.languageToISO.put("German", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
46 /* 57 */ this.languageToISO.put("French", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
47 /* 58 */ this.languageToISO.put("English", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
48 /* 59 */ this.languageToISO.put("German-f", "de-f");
dc7622afcfea initial
dwinter
parents:
diff changeset
49 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
50 /* 61 */ this.supportedLanguageFolder.put("deu", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
51 /* 62 */ this.supportedLanguageFolder.put("deu-f", "de");
dc7622afcfea initial
dwinter
parents:
diff changeset
52 /* 63 */ this.supportedLanguageFolder.put("fra", "fr");
dc7622afcfea initial
dwinter
parents:
diff changeset
53 /* 64 */ this.supportedLanguageFolder.put("eng", "en");
dc7622afcfea initial
dwinter
parents:
diff changeset
54 /* 65 */ this.supportedLanguageFolder.put("lic", "la");
dc7622afcfea initial
dwinter
parents:
diff changeset
55 /* */ try
dc7622afcfea initial
dwinter
parents:
diff changeset
56 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
57 /* 68 */ this.languageAnalyzers.add(new LanguageAnalyzer("de", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
58 /* 69 */ this.languageAnalyzers.add(new LanguageAnalyzer("de-f", new GermanAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
59 /* 70 */ this.languageAnalyzers.add(new LanguageAnalyzer("en", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
60 /* 71 */ this.languageAnalyzers.add(new LanguageAnalyzer("fr", new FrenchAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
61 /* 72 */ this.languageAnalyzers.add(new LanguageAnalyzer("la", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
62 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
63 /* 74 */ this.languageAnalyzers.add(new LanguageAnalyzer("all", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
64 /* 75 */ this.languageAnalyzers.add(new LanguageAnalyzer("morph", new StandardAnalyzer(Version.LUCENE_30), this.index_dir));
dc7622afcfea initial
dwinter
parents:
diff changeset
65 /* */ } catch (CorruptIndexException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
66 /* 77 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
67 /* 78 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
68 /* */ } catch (LockObtainFailedException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
69 /* 80 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
70 /* 81 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
71 /* */ } catch (IOException e) {
dc7622afcfea initial
dwinter
parents:
diff changeset
72 /* 83 */ e.printStackTrace();
dc7622afcfea initial
dwinter
parents:
diff changeset
73 /* 84 */ System.exit(1);
dc7622afcfea initial
dwinter
parents:
diff changeset
74 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
75 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
76 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
77 /* */ public OCRHarvesterThreaded(File docDir, File index_dir, String mdProviderUrl, String preferedLanguage)
dc7622afcfea initial
dwinter
parents:
diff changeset
78 /* */ throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
79 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
80 /* 92 */ this(docDir, index_dir, null, mdProviderUrl, preferedLanguage);
dc7622afcfea initial
dwinter
parents:
diff changeset
81 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
82 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
83 /* */ protected void processFile(File file) throws CorruptIndexException, LockObtainFailedException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
84 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
85 /* 97 */ int freeThread = -1;
dc7622afcfea initial
dwinter
parents:
diff changeset
86 /* 98 */ while (freeThread == -1)
dc7622afcfea initial
dwinter
parents:
diff changeset
87 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
88 /* 100 */ freeThread = waitForFreeThread();
dc7622afcfea initial
dwinter
parents:
diff changeset
89 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
90 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
91 /* 104 */ if (this.textLanguage == null)
dc7622afcfea initial
dwinter
parents:
diff changeset
92 /* 105 */ this.textLanguage = loadLanguages();
dc7622afcfea initial
dwinter
parents:
diff changeset
93 /* 106 */ this.mythreads[freeThread] = new OCRProcessFileThread(this.languageAnalyzers, file, this.languageFileName, this.textLanguage, this.mdProviderUrl, this.preferedLanguage, this.languageToISO, this.supportedLanguageFolder);
dc7622afcfea initial
dwinter
parents:
diff changeset
94 /* 107 */ this.mythreads[freeThread].start();
dc7622afcfea initial
dwinter
parents:
diff changeset
95 /* 108 */ System.out.println("New process started:" + freeThread);
dc7622afcfea initial
dwinter
parents:
diff changeset
96 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
97 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
98
dc7622afcfea initial
dwinter
parents:
diff changeset
99 /* Location: /private/tmp/fulltextIndexer.jar
dc7622afcfea initial
dwinter
parents:
diff changeset
100 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.OCRHarvesterThreaded
dc7622afcfea initial
dwinter
parents:
diff changeset
101 * JD-Core Version: 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
102 */