Mercurial > hg > fulltextIndexer
view src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line source
/* */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors; /* */ /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument; /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine; /* */ import de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument; /* */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers; /* */ import de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument; /* */ import java.io.File; /* */ import java.io.FileNotFoundException; /* */ import java.io.FileReader; /* */ import java.io.IOException; /* */ import java.io.PrintStream; /* */ import java.io.Reader; /* */ import java.io.UnsupportedEncodingException; /* */ import java.util.HashMap; /* */ import javax.xml.parsers.ParserConfigurationException; /* */ import javax.xml.parsers.SAXParser; /* */ import javax.xml.parsers.SAXParserFactory; /* */ import org.apache.lucene.index.CorruptIndexException; /* */ import org.xml.sax.InputSource; /* */ import org.xml.sax.SAXException; /* */ /* */ public class OCRProcessFileThread extends ProcessFileThread /* */ { /* */ public OCRProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder) /* */ { /* 43 */ super(languageAnalyzers2, file, lfn, tl, mdProviderUrl, preferedLanguage, languageToISO, supportedLanguageFolder); /* */ } /* */ /* */ protected Boolean addDocument(File file, String lang, String dcMetaData, String textId) /* */ throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException /* */ { /* 52 */ ParseOcrDocument ch = new ParseOcrDocument(); /* */ /* 59 */ SAXParserFactory factory = SAXParserFactory.newInstance(); /* 60 */ factory.setNamespaceAware(true); /* 61 */ factory.setValidating(false); /* */ /* 64 */ SAXParser parser = factory.newSAXParser(); /* */ try /* */ { /* 67 */ Reader reader = new FileReader(file); /* 68 */ InputSource input = new InputSource(reader); /* */ /* 73 */ parser.parse(input, ch); /* */ } /* */ catch (SAXException e) /* */ { /* 78 */ e.printStackTrace(); /* 79 */ return Boolean.valueOf(false); /* */ } /* */ catch (IOException e) { /* 82 */ e.printStackTrace(); /* */ try { /* 84 */ sleep(1L); /* 85 */ System.out.println("retry"); /* 86 */ addDocument(file, lang, dcMetaData, textId); /* */ } /* */ catch (InterruptedException e1) { /* 89 */ e1.printStackTrace(); /* */ } /* 91 */ return Boolean.valueOf(false); /* */ } /* */ /* 94 */ OCRDocument doc = ch.ocrDocument; /* */ /* 96 */ for (OCRDocument.OCRLine line : doc.OCRLines) /* */ { /* 98 */ if (dcMetaData == null) /* 99 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, textId), lang); /* */ else { /* 101 */ this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, dcMetaData, textId), lang); /* */ } /* */ } /* 104 */ return Boolean.valueOf(true); /* */ } /* */ } /* Location: /private/tmp/fulltextIndexer.jar * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread * JD-Core Version: 0.5.4 */