view src/de/mpiwg/dwinter/fulltextIndexer/utils/ParseOcrDocument.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line source

/*     */ package de.mpiwg.dwinter.fulltextIndexer.utils;
/*     */ 
/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
/*     */ import java.io.File;
/*     */ import java.io.IOException;
/*     */ import java.io.PrintStream;
/*     */ import java.util.ArrayList;
/*     */ import org.xml.sax.Attributes;
/*     */ import org.xml.sax.InputSource;
/*     */ import org.xml.sax.Locator;
/*     */ import org.xml.sax.SAXException;
/*     */ import org.xml.sax.helpers.DefaultHandler;
/*     */ 
/**
 * SAX handler that reads an XHTML OCR result and collects its text lines in
 * {@link #ocrDocument}.
 *
 * <p>The handler reacts to two kinds of elements:
 * <ul>
 *   <li>{@code <div class="ocr_page" title="...">} starts a page: the last two
 *       space-separated tokens of the {@code title} attribute are stored as the
 *       page dimension and the line counter restarts at 0;</li>
 *   <li>{@code <span class="ocr_line" title="...">} starts a line: its character
 *       data is accumulated until the matching end tag, and the {@code title}
 *       attribute (with {@code "bbox "} removed) is stored as the bounding box.</li>
 * </ul>
 *
 * <p>The handler also acts as entity resolver and maps the XHTML 1.0
 * Transitional public identifier to a local copy of the DTD.
 */
public class ParseOcrDocument extends DefaultHandler
{
  /** Public identifier that is redirected to the local DTD copy. */
  private static final String XHTML_TRANSITIONAL_PUBLIC_ID =
    "-//W3C//DTD XHTML 1.0 Transitional//EN";

  /** Candidate locations of the local DTD copy, tried in order. */
  private static final String[] DTD_LOCATIONS = {
    "/Users/dwinter/text-tools/fulltextsearch/catalog/xhtml1-transitional.dtd",
    "/usr/local/apache-tomcat-6.0.16/webapps/fulltextsearch/catalog/xhtml1-transitional.dtd"
  };

  /** Document that receives the page dimension and the parsed lines. */
  public OCRDocument ocrDocument = new OCRDocument();

  /** Number assigned to the next line; reset to 0 at every page start. */
  private int lineCounter;

  /** Line currently being filled; only meaningful while {@link #inLine} is set. */
  private OCRDocument.OCRLine currentLine;

  /** True between the start and the end tag of an {@code ocr_line} span. */
  private boolean inLine;

  /** Number of currently open spans nested inside the current line span. */
  private int nestedSpans;

  /** No-op. */
  @Override
  public void startDocument()
    throws SAXException
  {
  }

  /** No-op. */
  @Override
  public void endDocument()
    throws SAXException
  {
  }

  /**
   * Appends character data to the current line; data outside a line is ignored.
   *
   * @param c      buffer holding the characters
   * @param start  offset of the first character in {@code c}
   * @param length number of characters to use
   */
  @Override
  public void characters(char[] c, int start, int length)
    throws SAXException
  {
    if (this.inLine) {
      this.currentLine.content += new String(c, start, length);
    }
  }

  /** No-op. */
  @Override
  public void ignorableWhitespace(char[] c, int start, int length)
    throws SAXException
  {
  }

  /** No-op. */
  @Override
  public void processingInstruction(String target, String data)
    throws SAXException
  {
  }

  /** No-op. */
  @Override
  public void setDocumentLocator(Locator arg1)
  {
  }

  /**
   * Closes the current line when the end tag of its {@code span} is reached
   * and appends it to the document.
   *
   * @param uri       namespace URI (unused)
   * @param localName local name (unused)
   * @param name      qualified element name
   */
  @Override
  public void endElement(String uri, String localName, String name)
    throws SAXException
  {
    if (!"span".equals(name) || !this.inLine) {
      return;
    }
    if (this.nestedSpans > 0) {
      // End tag of a span nested inside the line: the line itself stays open.
      this.nestedSpans--;
      return;
    }
    this.ocrDocument.OCRLines.add(this.currentLine);
    this.inLine = false;
  }

  /** No-op. */
  @Override
  public void endPrefixMapping(String prefix)
    throws SAXException
  {
  }

  /** No-op. */
  @Override
  public void skippedEntity(String name)
    throws SAXException
  {
  }

  /**
   * Dispatches page and line start tags to {@link #doPage} and {@link #doLine}.
   *
   * @param uri       namespace URI (unused)
   * @param localName local name (unused)
   * @param name      qualified element name
   * @param attrs     attributes of the element, may be {@code null}
   */
  @Override
  public void startElement(String uri, String localName, String name, Attributes attrs)
    throws SAXException
  {
    if ("div".equals(name)) {
      if ("ocr_page".equals(attributeValue(attrs, "class"))) {
        doPage(attrs);
      }
      return;
    }

    if (!"span".equals(name)) {
      return;
    }
    if (this.inLine) {
      // A span inside an open line: remember it so that its end tag is not
      // mistaken for the end of the line.
      this.nestedSpans++;
      return;
    }
    if ("ocr_line".equals(attributeValue(attrs, "class"))) {
      doLine(attrs);
    }
  }

  /**
   * Starts a new page: stores the page dimension and restarts line numbering.
   *
   * <p>The dimension is built from the last two space-separated tokens of the
   * {@code title} attribute. A title with fewer than two tokens leaves the
   * stored dimension unchanged.
   *
   * @param attrs attributes of the page {@code div}
   */
  private void doPage(Attributes attrs)
  {
    String title = attributeValue(attrs, "title");
    if (title != null) {
      String[] tokens = title.split(" ");
      if (tokens.length >= 2) {
        this.ocrDocument.pageDimension =
          tokens[tokens.length - 2] + " " + tokens[tokens.length - 1];
      }
    }
    this.lineCounter = 0;
  }

  /**
   * Starts a new line with empty content, the next line number and the
   * bounding box taken from the {@code title} attribute ({@code "0 0"} when
   * there is no such attribute).
   *
   * @param attrs attributes of the line {@code span}
   */
  private void doLine(Attributes attrs)
  {
    // Create the line from the document that will hold it, not from a
    // throwaway OCRDocument instance.
    this.currentLine = this.ocrDocument.new OCRLine();
    this.inLine = true;
    this.nestedSpans = 0;
    this.currentLine.content = "";

    this.currentLine.lineNumber = String.valueOf(this.lineCounter);
    this.lineCounter += 1;

    String title = attributeValue(attrs, "title");
    this.currentLine.bbox = (title == null) ? "0 0" : title.replace("bbox ", "");
  }

  /**
   * Returns the value of the first attribute with the given name.
   *
   * <p>The local name is compared first; when it is empty (no namespace
   * processing) the qualified name is used instead.
   *
   * @param attrs    attribute list, may be {@code null}
   * @param attrName attribute name to look for
   * @return the attribute value, or {@code null} if there is no such attribute
   */
  private static String attributeValue(Attributes attrs, String attrName)
  {
    if (attrs == null) {
      return null;
    }
    int count = attrs.getLength();
    for (int i = 0; i < count; ++i) {
      String candidate = attrs.getLocalName(i);
      if (candidate == null || candidate.length() == 0) {
        candidate = attrs.getQName(i);
      }
      if (attrName.equals(candidate)) {
        return attrs.getValue(i);
      }
    }
    return null;
  }

  /** No-op. */
  @Override
  public void startPrefixMapping(String prefix, String uri)
    throws SAXException
  {
  }

  /**
   * Redirects the XHTML 1.0 Transitional DTD to a local copy.
   *
   * @param publicId public identifier of the entity, may be {@code null}
   * @param systemId system identifier of the entity
   * @return {@code null} when no local DTD copy exists, an input source for
   *         the local DTD when {@code publicId} is the XHTML 1.0 Transitional
   *         identifier, otherwise an input source for {@code systemId}
   */
  @Override
  public InputSource resolveEntity(String publicId, String systemId)
    throws SAXException, IOException
  {
    File dtd = null;
    for (int i = 0; i < DTD_LOCATIONS.length; ++i) {
      File candidate = new File(DTD_LOCATIONS[i]);
      if (candidate.exists()) {
        dtd = candidate;
        break;
      }
    }
    if (dtd == null) {
      System.err.println("Cant't find xhtml-dtd: MyResolver");
      return null;
    }
    // Constant-first comparison: the public identifier may be null.
    if (XHTML_TRANSITIONAL_PUBLIC_ID.equals(publicId)) {
      return new InputSource(dtd.getAbsolutePath());
    }
    return new InputSource(systemId);
  }
}

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument
 * JD-Core Version:    0.5.4
 */