diff src/de/mpiwg/dwinter/fulltextIndexer/utils/.svn/text-base/ParseOcrDocument.java.svn-base @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/de/mpiwg/dwinter/fulltextIndexer/utils/.svn/text-base/ParseOcrDocument.java.svn-base	Wed Nov 03 12:33:16 2010 +0100
@@ -0,0 +1,163 @@
+/*     */ package de.mpiwg.dwinter.fulltextIndexer.utils;
+/*     */ 
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
+/*     */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
+/*     */ import java.io.File;
+/*     */ import java.io.IOException;
+/*     */ import java.io.PrintStream;
+/*     */ import java.util.ArrayList;
+/*     */ import org.xml.sax.Attributes;
+/*     */ import org.xml.sax.InputSource;
+/*     */ import org.xml.sax.Locator;
+/*     */ import org.xml.sax.SAXException;
+/*     */ import org.xml.sax.helpers.DefaultHandler;
+/*     */ 
+/**
+ * SAX handler that collects OCR lines from an XHTML document into an
+ * {@link OCRDocument}.
+ *
+ * <p>Two kinds of elements are recognised, both by an exact match on their
+ * {@code class} attribute:
+ * <ul>
+ *   <li>{@code <div class="ocr_page">} - the last two space-separated tokens
+ *       of its {@code title} attribute are stored in
+ *       {@code ocrDocument.pageDimension}, and the line counter restarts at 0;</li>
+ *   <li>{@code <span class="ocr_line">} - starts a new {@code OCRLine}; the
+ *       character data up to the matching end tag becomes its {@code content},
+ *       and the line is appended to {@code ocrDocument.OCRLines}.</li>
+ * </ul>
+ *
+ * <p>All other SAX callbacks keep the no-op behaviour inherited from
+ * {@link DefaultHandler}. The handler keeps per-parse state in instance
+ * fields and does no synchronisation of its own.
+ */
+public class ParseOcrDocument extends DefaultHandler
+{
+  /** Public identifier of the DTD that is served from a local copy. */
+  private static final String XHTML_TRANSITIONAL_PUBLIC_ID =
+    "-//W3C//DTD XHTML 1.0 Transitional//EN";
+
+  /** Candidate locations of the local DTD copy, tried in this order. */
+  private static final String[] XHTML_DTD_LOCATIONS = {
+    "/Users/dwinter/text-tools/fulltextsearch/catalog/xhtml1-transitional.dtd",
+    "/usr/local/apache-tomcat-6.0.16/webapps/fulltextsearch/catalog/xhtml1-transitional.dtd"
+  };
+
+  /** Result of the parse; lines and page dimension are written into it. */
+  public OCRDocument ocrDocument = new OCRDocument();
+
+  /** Number assigned to the next line; reset whenever a page element starts. */
+  private int lineCounter;
+
+  /** Line currently being collected; only meaningful while {@link #inLine} is true. */
+  private OCRDocument.OCRLine currentLine;
+
+  /** True between the start tag of a line span and its matching end tag. */
+  private boolean inLine;
+
+  /** Number of spans currently open inside the line span. */
+  private int spanDepth;
+
+  /** Text collected for the current line; avoids repeated String concatenation. */
+  private final StringBuilder lineText = new StringBuilder();
+
+  /**
+   * Appends character data to the current line. Character data outside of a
+   * line span is ignored.
+   */
+  @Override
+  public void characters(char[] c, int start, int length)
+    throws SAXException
+  {
+    if (this.inLine) {
+      this.lineText.append(c, start, length);
+    }
+  }
+
+  /**
+   * Finishes the current line when the end tag matching its start tag is seen.
+   * End tags of spans nested inside the line only reduce the nesting depth, so
+   * text following a nested span still belongs to the line.
+   */
+  @Override
+  public void endElement(String uri, String localName, String name)
+    throws SAXException
+  {
+    if (!this.inLine || !"span".equals(name)) {
+      return;
+    }
+    if (this.spanDepth > 0) {
+      // End of a span nested inside the line, not of the line itself.
+      this.spanDepth -= 1;
+      return;
+    }
+    this.currentLine.content = this.lineText.toString();
+    this.ocrDocument.OCRLines.add(this.currentLine);
+    this.inLine = false;
+  }
+
+  /**
+   * Dispatches page divs and line spans to {@link #doPage} and {@link #doLine}.
+   * A span that opens while a line is already being collected is counted as
+   * nested content of that line and does not start a new line.
+   */
+  @Override
+  public void startElement(String uri, String localName, String name, Attributes attrs)
+    throws SAXException
+  {
+    if ("div".equals(name)) {
+      if ("ocr_page".equals(attributeValue(attrs, "class"))) {
+        doPage(attrs);
+      }
+      return;
+    }
+    if (!"span".equals(name)) {
+      return;
+    }
+    if (this.inLine) {
+      this.spanDepth += 1;
+      return;
+    }
+    if ("ocr_line".equals(attributeValue(attrs, "class"))) {
+      doLine(attrs);
+    }
+  }
+
+  /**
+   * Handles the start of a page: stores the last two space-separated tokens of
+   * the {@code title} attribute as the page dimension and restarts the line
+   * numbering. A missing title, or one with fewer than two tokens, leaves the
+   * stored page dimension unchanged.
+   */
+  private void doPage(Attributes attrs)
+  {
+    String title = attributeValue(attrs, "title");
+    if (title != null) {
+      String[] tokens = title.split(" ");
+      if (tokens.length >= 2) {
+        this.ocrDocument.pageDimension =
+          tokens[tokens.length - 2] + " " + tokens[tokens.length - 1];
+      }
+    }
+    this.lineCounter = 0;
+  }
+
+  /**
+   * Handles the start of a line: creates a new line with the next line number
+   * and takes its bounding box from the {@code title} attribute with every
+   * occurrence of {@code "bbox "} removed. The bounding box is {@code "0 0"}
+   * when the span has no title.
+   */
+  private void doLine(Attributes attrs)
+  {
+    // Bind the line to the document that stores it, not to a throwaway instance.
+    OCRDocument.OCRLine line = this.ocrDocument.new OCRLine();
+    line.content = "";
+    line.lineNumber = String.valueOf(this.lineCounter);
+    this.lineCounter += 1;
+
+    String title = attributeValue(attrs, "title");
+    line.bbox = (title != null) ? title.replace("bbox ", "") : "0 0";
+
+    this.currentLine = line;
+    this.lineText.setLength(0);
+    this.spanDepth = 0;
+    this.inLine = true;
+  }
+
+  /**
+   * Returns the value of the attribute called {@code attrName}, or
+   * {@code null} if there is no such attribute or {@code attrs} is null.
+   * The local name is compared first; when the parser reports no local name
+   * (it is empty unless namespace processing is enabled) the qualified name is
+   * used instead.
+   */
+  private static String attributeValue(Attributes attrs, String attrName)
+  {
+    if (attrs == null) {
+      return null;
+    }
+    int count = attrs.getLength();
+    for (int i = 0; i < count; ++i) {
+      String candidate = attrs.getLocalName(i);
+      if (candidate == null || candidate.length() == 0) {
+        candidate = attrs.getQName(i);
+      }
+      if (attrName.equals(candidate)) {
+        return attrs.getValue(i);
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Resolves the XHTML 1.0 Transitional DTD to a local copy so that parsing
+   * does not have to fetch it through its system identifier.
+   *
+   * @param publicId public identifier of the entity; may be null
+   * @param systemId system identifier of the entity
+   * @return an input source for the local DTD when {@code publicId} names the
+   *         XHTML 1.0 Transitional DTD, an input source for {@code systemId}
+   *         for any other entity, or {@code null} (parser default resolution)
+   *         when no local DTD copy exists
+   */
+  @Override
+  public InputSource resolveEntity(String publicId, String systemId)
+    throws SAXException, IOException
+  {
+    File dtd = null;
+    for (int i = 0; i < XHTML_DTD_LOCATIONS.length; ++i) {
+      File candidate = new File(XHTML_DTD_LOCATIONS[i]);
+      if (candidate.exists()) {
+        dtd = candidate;
+        break;
+      }
+    }
+    if (dtd == null) {
+      System.err.println("Cant't find xhtml-dtd: MyResolver");
+      return null;
+    }
+    // Constant-first comparison: publicId is null for SYSTEM-only declarations.
+    if (XHTML_TRANSITIONAL_PUBLIC_ID.equals(publicId)) {
+      // InputSource expects a URI as system identifier, not a bare file path.
+      return new InputSource(dtd.toURI().toString());
+    }
+    return new InputSource(systemId);
+  }
+}
+
+/* Location:           /private/tmp/fulltextIndexer.jar
+ * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument
+ * JD-Core Version:    0.5.4
+ */
\ No newline at end of file