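/*
 * Repository view: src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/ParseOcrDocument.java @ 0:dc7622afcfea (default, tip)
 *
 * Commit message: initial
 * Author:         dwinter
 * Date:           Wed, 03 Nov 2010 12:33:16 +0100
 * Parents:        (none listed)
 * Children:       (none listed)
 *
 * Viewer settings at capture time: line wrap on, line source.
 */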

/*     */ package de.mpiwg.dwinter.fulltextIndexer.OCRutils;
/*     */ 
/*     */ import java.util.ArrayList;
/*     */ import org.xml.sax.Attributes;
/*     */ import org.xml.sax.Locator;
/*     */ import org.xml.sax.SAXException;
/*     */ import org.xml.sax.helpers.DefaultHandler;
/*     */ 
/*     */ public class ParseOcrDocument extends DefaultHandler
/*     */ {
/*  31 */   public OCRDocument ocrDocument = new OCRDocument();
/*     */   private int lineCounter;
/*     */   private OCRDocument.OCRLine currentLine;
/*     */   private boolean inLine;
/*     */ 
/*     */   public void startDocument()
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void endDocument()
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void characters(char[] c, int start, int length)
/*     */     throws SAXException
/*     */   {
/*  52 */     if (this.inLine)
/*  53 */       this.currentLine.content += new String(c, start, length);
/*     */   }
/*     */ 
/*     */   public void ignorableWhitespace(char[] c, int start, int length)
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void processingInstruction(String target, String data)
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void setDocumentLocator(Locator arg1)
/*     */   {
/*     */   }
/*     */ 
/*     */   public void endElement(String uri, String localName, String name)
/*     */     throws SAXException
/*     */   {
/*  76 */     if ((!name.equals("span")) || 
/*  78 */       (!this.inLine))
/*     */       return;
/*  80 */     this.ocrDocument.OCRLines.add(this.currentLine);
/*  81 */     this.inLine = false;
/*     */   }
/*     */ 
/*     */   public void endPrefixMapping(String prefix)
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void skippedEntity(String name)
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ 
/*     */   public void startElement(String uri, String localName, String name, Attributes attrs)
/*     */     throws SAXException
/*     */   {
/* 100 */     if ((name.equals("div")) && 
/* 101 */       (attrs != null))
/*     */     {
/* 103 */       int length = attrs.getLength();
/*     */ 
/* 106 */       for (int i = 0; i < length; ++i)
/*     */       {
/* 108 */         if ((!attrs.getLocalName(i).equals("class")) || 
/* 110 */           (!attrs.getValue(i).equals("ocr_page"))) continue;
/* 111 */         doPage(attrs);
/*     */       }
/*     */ 
/*     */     }
/*     */ 
/* 117 */     if ((!name.equals("span")) || 
/* 118 */       (attrs == null))
/*     */       return;
/* 120 */     int length = attrs.getLength();
/*     */ 
/* 123 */     for (int i = 0; i < length; ++i)
/*     */     {
/* 125 */       if ((!attrs.getLocalName(i).equals("class")) || 
/* 127 */         (!attrs.getValue(i).equals("ocr_line"))) continue;
/* 128 */       doLine(attrs);
/*     */     }
/*     */   }
/*     */ 
/*     */   private void doPage(Attributes attrs)
/*     */   {
/* 136 */     int length = attrs.getLength();
/*     */ 
/* 139 */     for (int i = 0; i < length; ++i)
/*     */     {
/* 141 */       if (!attrs.getLocalName(i).equals("title"))
/*     */         continue;
/* 143 */       String title = attrs.getValue(i);
/* 144 */       String[] splitted = title.split(" ");
/* 145 */       String dimension = splitted[(splitted.length - 2)] + " " + splitted[(splitted.length - 1)];
/* 146 */       this.ocrDocument.pageDimension = dimension;
/*     */     }
/*     */ 
/* 149 */     this.lineCounter = 0;
/*     */   }
/*     */ 
/*     */   private void doLine(Attributes attrs)
/*     */   {
/*     */     OCRDocument doc = this.ocrDocument;
			this.currentLine = doc.new OCRLine();
/*     */ 
/* 156 */     this.inLine = true;
/* 157 */     this.currentLine.content = new String();
/*     */ 
/* 159 */     int length = attrs.getLength();
/* 160 */     this.currentLine.lineNumber = String.valueOf(this.lineCounter);
/* 161 */     this.lineCounter += 1;
/* 162 */     this.currentLine.bbox = "0 0";
/* 163 */     for (int i = 0; i < length; ++i)
/*     */     {
/* 165 */       if (!attrs.getLocalName(i).equals("title"))
/*     */         continue;
/* 167 */       String title = attrs.getValue(i);
/*     */ 
/* 169 */       String dimension = title.replace("bbox ", "");
/* 170 */       this.currentLine.bbox = dimension;
/*     */     }
/*     */   }
/*     */ 
/*     */   public void startPrefixMapping(String prefix, String uri)
/*     */     throws SAXException
/*     */   {
/*     */   }
/*     */ }

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.OCRutils.ParseOcrDocument
 * JD-Core Version:    0.5.4
 */