0
|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors;
|
|
2 /* */
|
|
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
|
|
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
|
|
5 /* */ import de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument;
|
|
6 /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
|
|
7 /* */ import de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument;
|
|
8 /* */ import java.io.File;
|
|
9 /* */ import java.io.FileNotFoundException;
|
|
10 /* */ import java.io.FileReader;
|
|
11 /* */ import java.io.IOException;
|
|
12 /* */ import java.io.PrintStream;
|
|
13 /* */ import java.io.Reader;
|
|
14 /* */ import java.io.UnsupportedEncodingException;
|
|
15 /* */ import java.util.HashMap;
|
|
16 /* */ import javax.xml.parsers.ParserConfigurationException;
|
|
17 /* */ import javax.xml.parsers.SAXParser;
|
|
18 /* */ import javax.xml.parsers.SAXParserFactory;
|
|
19 /* */ import org.apache.lucene.index.CorruptIndexException;
|
|
20 /* */ import org.xml.sax.InputSource;
|
|
21 /* */ import org.xml.sax.SAXException;
|
|
22 /* */
|
|
23 /* */ public class OCRProcessFileThread extends ProcessFileThread
|
|
24 /* */ {
|
|
25 /* */ public OCRProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder)
|
|
26 /* */ {
|
|
27 /* 43 */ super(languageAnalyzers2, file, lfn, tl, mdProviderUrl, preferedLanguage, languageToISO, supportedLanguageFolder);
|
|
28 /* */ }
|
|
29 /* */
|
|
30 /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId)
|
|
31 /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException
|
|
32 /* */ {
|
|
33 /* 52 */ ParseOcrDocument ch = new ParseOcrDocument();
|
|
34 /* */
|
|
35 /* 59 */ SAXParserFactory factory = SAXParserFactory.newInstance();
|
|
36 /* 60 */ factory.setNamespaceAware(true);
|
|
37 /* 61 */ factory.setValidating(false);
|
|
38 /* */
|
|
39 /* 64 */ SAXParser parser = factory.newSAXParser();
|
|
40 /* */ try
|
|
41 /* */ {
|
|
42 /* 67 */ Reader reader = new FileReader(file);
|
|
43 /* 68 */ InputSource input = new InputSource(reader);
|
|
44 /* */
|
|
45 /* 73 */ parser.parse(input, ch);
|
|
46 /* */ }
|
|
47 /* */ catch (SAXException e)
|
|
48 /* */ {
|
|
49 /* 78 */ e.printStackTrace();
|
|
50 /* 79 */ return Boolean.valueOf(false);
|
|
51 /* */ }
|
|
52 /* */ catch (IOException e) {
|
|
53 /* 82 */ e.printStackTrace();
|
|
54 /* */ try {
|
|
55 /* 84 */ sleep(1L);
|
|
56 /* 85 */ System.out.println("retry");
|
|
57 /* 86 */ addDocument(file, lang, dcMetaData, textId);
|
|
58 /* */ }
|
|
59 /* */ catch (InterruptedException e1) {
|
|
60 /* 89 */ e1.printStackTrace();
|
|
61 /* */ }
|
|
62 /* 91 */ return Boolean.valueOf(false);
|
|
63 /* */ }
|
|
64 /* */
|
|
65 /* 94 */ OCRDocument doc = ch.ocrDocument;
|
|
66 /* */
|
|
67 /* 96 */ for (OCRDocument.OCRLine line : doc.OCRLines)
|
|
68 /* */ {
|
|
69 /* 98 */ if (dcMetaData == null)
|
|
70 /* 99 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, textId), lang);
|
|
71 /* */ else {
|
|
72 /* 101 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, dcMetaData, textId), lang);
|
|
73 /* */ }
|
|
74 /* */ }
|
|
75 /* 104 */ return Boolean.valueOf(true);
|
|
76 /* */ }
|
|
77 /* */ }
|
|
78
|
|
79 /* Location: /private/tmp/fulltextIndexer.jar
|
|
80 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread
|
|
81 * JD-Core Version: 0.5.4
|
|
82 */ |