Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; | |
2 /* */ | |
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument; | |
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine; | |
5 /* */ import de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument; | |
6 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; | |
7 /* */ import de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument; | |
8 /* */ import java.io.File; | |
9 /* */ import java.io.FileNotFoundException; | |
10 /* */ import java.io.FileReader; | |
11 /* */ import java.io.IOException; | |
12 /* */ import java.io.PrintStream; | |
13 /* */ import java.io.Reader; | |
14 /* */ import java.io.UnsupportedEncodingException; | |
15 /* */ import java.util.HashMap; | |
16 /* */ import javax.xml.parsers.ParserConfigurationException; | |
17 /* */ import javax.xml.parsers.SAXParser; | |
18 /* */ import javax.xml.parsers.SAXParserFactory; | |
19 /* */ import org.apache.lucene.index.CorruptIndexException; | |
20 /* */ import org.xml.sax.InputSource; | |
21 /* */ import org.xml.sax.SAXException; | |
22 /* */ | |
23 /* */ public class OCRProcessFileThread extends ProcessFileThread | |
24 /* */ { | |
25 /* */ public OCRProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) | |
26 /* */ { | |
27 /* 43 */ super(languageAnalyzers2, file, lfn, tl, mdProviderUrl, preferedLanguage, languageToISO, supportedLanguageFolder); | |
28 /* */ } | |
29 /* */ | |
30 /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) | |
31 /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException | |
32 /* */ { | |
33 /* 52 */ ParseOcrDocument ch = new ParseOcrDocument(); | |
34 /* */ | |
35 /* 59 */ SAXParserFactory factory = SAXParserFactory.newInstance(); | |
36 /* 60 */ factory.setNamespaceAware(true); | |
37 /* 61 */ factory.setValidating(false); | |
38 /* */ | |
39 /* 64 */ SAXParser parser = factory.newSAXParser(); | |
40 /* */ try | |
41 /* */ { | |
42 /* 67 */ Reader reader = new FileReader(file); | |
43 /* 68 */ InputSource input = new InputSource(reader); | |
44 /* */ | |
45 /* 73 */ parser.parse(input, ch); | |
46 /* */ } | |
47 /* */ catch (SAXException e) | |
48 /* */ { | |
49 /* 78 */ e.printStackTrace(); | |
50 /* 79 */ return Boolean.valueOf(false); | |
51 /* */ } | |
52 /* */ catch (IOException e) { | |
53 /* 82 */ e.printStackTrace(); | |
54 /* */ try { | |
55 /* 84 */ sleep(1L); | |
56 /* 85 */ System.out.println("retry"); | |
57 /* 86 */ addDocument(file, lang, dcMetaData, textId); | |
58 /* */ } | |
59 /* */ catch (InterruptedException e1) { | |
60 /* 89 */ e1.printStackTrace(); | |
61 /* */ } | |
62 /* 91 */ return Boolean.valueOf(false); | |
63 /* */ } | |
64 /* */ | |
65 /* 94 */ OCRDocument doc = ch.ocrDocument; | |
66 /* */ | |
67 /* 96 */ for (OCRDocument.OCRLine line : doc.OCRLines) | |
68 /* */ { | |
69 /* 98 */ if (dcMetaData == null) | |
70 /* 99 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, textId), lang); | |
71 /* */ else { | |
72 /* 101 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, dcMetaData, textId), lang); | |
73 /* */ } | |
74 /* */ } | |
75 /* 104 */ return Boolean.valueOf(true); | |
76 /* */ } | |
77 /* */ } | |
78 | |
79 /* Location: /private/tmp/fulltextIndexer.jar | |
80 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread | |
81 * JD-Core Version: 0.5.4 | |
82 */ |