Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,82 @@ +/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; +/* */ +/* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument; +/* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine; +/* */ import de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument; +/* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; +/* */ import de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument; +/* */ import java.io.File; +/* */ import java.io.FileNotFoundException; +/* */ import java.io.FileReader; +/* */ import java.io.IOException; +/* */ import java.io.PrintStream; +/* */ import java.io.Reader; +/* */ import java.io.UnsupportedEncodingException; +/* */ import java.util.HashMap; +/* */ import javax.xml.parsers.ParserConfigurationException; +/* */ import javax.xml.parsers.SAXParser; +/* */ import javax.xml.parsers.SAXParserFactory; +/* */ import org.apache.lucene.index.CorruptIndexException; +/* */ import org.xml.sax.InputSource; +/* */ import org.xml.sax.SAXException; +/* */ +/* */ public class OCRProcessFileThread extends ProcessFileThread +/* */ { +/* */ public OCRProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) +/* */ { +/* 43 */ super(languageAnalyzers2, file, lfn, tl, mdProviderUrl, preferedLanguage, languageToISO, supportedLanguageFolder); +/* */ } +/* */ +/* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) +/* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException +/* */ { +/* 52 */ ParseOcrDocument ch = new ParseOcrDocument(); +/* */ +/* 59 */ SAXParserFactory factory = SAXParserFactory.newInstance(); +/* 60 */ factory.setNamespaceAware(true); +/* 61 */ factory.setValidating(false); +/* */ +/* 64 */ SAXParser parser = factory.newSAXParser(); +/* */ try +/* */ { +/* 67 */ Reader reader = new FileReader(file); +/* 68 */ InputSource input = new InputSource(reader); +/* */ +/* 73 */ parser.parse(input, ch); +/* */ } +/* */ catch (SAXException e) +/* */ { +/* 78 */ e.printStackTrace(); +/* 79 */ return Boolean.valueOf(false); +/* */ } +/* */ catch (IOException e) { +/* 82 */ e.printStackTrace(); +/* */ try { +/* 84 */ sleep(1L); +/* 85 */ System.out.println("retry"); +/* 86 */ addDocument(file, lang, dcMetaData, textId); +/* */ } +/* */ catch (InterruptedException e1) { +/* 89 */ e1.printStackTrace(); +/* */ } +/* 91 */ return Boolean.valueOf(false); +/* */ } +/* */ +/* 94 */ OCRDocument doc = ch.ocrDocument; +/* */ +/* 96 */ for (OCRDocument.OCRLine line : doc.OCRLines) +/* */ { +/* 98 */ if (dcMetaData == null) +/* 99 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, textId), lang); +/* */ else { +/* 101 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, dcMetaData, textId), lang); +/* */ } +/* */ } +/* 104 */ return Boolean.valueOf(true); +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread + * JD-Core Version: 0.5.4 + */ \ No newline at end of file