diff src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/harvester/processors/OCRProcessFileThread.java	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,82 @@
+/*     */ package de.mpiwg.dwinter.fulltextIndexer.harvester.processors;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument;
+/*     */ import de.mpiwg.dwinter.lucencetools.analyzer.LanguageAnalyzers;
+/*     */ import de.mpiwg.dwinter.lucencetools.documents.OcropusLineDocument;
+/*     */ import java.io.File;
+/*     */ import java.io.FileNotFoundException;
+/*     */ import java.io.FileReader;
+/*     */ import java.io.IOException;
+/*     */ import java.io.PrintStream;
+/*     */ import java.io.Reader;
+/*     */ import java.io.UnsupportedEncodingException;
+/*     */ import java.util.HashMap;
+/*     */ import javax.xml.parsers.ParserConfigurationException;
+/*     */ import javax.xml.parsers.SAXParser;
+/*     */ import javax.xml.parsers.SAXParserFactory;
+/*     */ import org.apache.lucene.index.CorruptIndexException;
+/*     */ import org.xml.sax.InputSource;
+/*     */ import org.xml.sax.SAXException;
+/*     */ 
+/*     */ public class OCRProcessFileThread extends ProcessFileThread
+/*     */ {
+/*     */   public OCRProcessFileThread(LanguageAnalyzers languageAnalyzers2, File file, String lfn, HashMap<String, String> tl, String mdProviderUrl, String preferedLanguage, HashMap<String, String> languageToISO, HashMap<String, String> supportedLanguageFolder)
+/*     */   {
+/*  43 */     super(languageAnalyzers2, file, lfn, tl, mdProviderUrl, preferedLanguage, languageToISO, supportedLanguageFolder);
+/*     */   }
+/*     */ 
+/*     */   protected Boolean addDocument(File file, String lang, String dcMetaData, String textId)
+/*     */     throws CorruptIndexException, IOException, FileNotFoundException, UnsupportedEncodingException, ParserConfigurationException, SAXException
+/*     */   {
+/*  52 */     ParseOcrDocument ch = new ParseOcrDocument();
+/*     */ 
+/*  59 */     SAXParserFactory factory = SAXParserFactory.newInstance();
+/*  60 */     factory.setNamespaceAware(true);
+/*  61 */     factory.setValidating(false);
+/*     */ 
+/*  64 */     SAXParser parser = factory.newSAXParser();
+/*     */     try
+/*     */     {
+/*  67 */       Reader reader = new FileReader(file);
+/*  68 */       InputSource input = new InputSource(reader);
+/*     */ 
+/*  73 */       parser.parse(input, ch);
+/*     */     }
+/*     */     catch (SAXException e)
+/*     */     {
+/*  78 */       e.printStackTrace();
+/*  79 */       return Boolean.valueOf(false);
+/*     */     }
+/*     */     catch (IOException e) {
+/*  82 */       e.printStackTrace();
+/*     */       try {
+/*  84 */         sleep(1L);
+/*  85 */         System.out.println("retry");
+/*  86 */         addDocument(file, lang, dcMetaData, textId);
+/*     */       }
+/*     */       catch (InterruptedException e1) {
+/*  89 */         e1.printStackTrace();
+/*     */       }
+/*  91 */       return Boolean.valueOf(false);
+/*     */     }
+/*     */ 
+/*  94 */     OCRDocument doc = ch.ocrDocument;
+/*     */ 
+/*  96 */     for (OCRDocument.OCRLine line : doc.OCRLines)
+/*     */     {
+/*  98 */       if (dcMetaData == null)
+/*  99 */         this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file, absPathToTextId(file),lang, line, doc.pageDimension, textId), lang);
+/*     */       else {
+/* 101 */         this.languageAnalyzers.addDocument(OcropusLineDocument.Document(file,  absPathToTextId(file),lang, line, doc.pageDimension, dcMetaData, textId), lang);
+/*     */       }
+/*     */     }
+/* 104 */     return Boolean.valueOf(true);
+/*     */   }
+/*     */ }
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.harvester.processors.OCRProcessFileThread
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file