Mercurial > hg > fulltextIndexer
view src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/ParseOcrDocument.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.dwinter.fulltextIndexer.OCRutils;

import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that fills an {@link OCRDocument} from an hOCR-style XML document.
 *
 * <p>Two kinds of elements are recognised, both by their qualified element name:
 * <ul>
 *   <li>{@code <div class="ocr_page" title="...">} - the last two space-separated tokens of
 *       the {@code title} become {@code ocrDocument.pageDimension}, and the line counter is
 *       reset to 0;</li>
 *   <li>{@code <span class="ocr_line" title="...">} - starts a new {@code OCRLine}. Its
 *       {@code bbox} is the {@code title} with every {@code "bbox "} occurrence removed
 *       ({@code "0 0"} when there is no title), its {@code lineNumber} is the running counter,
 *       and its {@code content} is all character data up to the matching end tag.</li>
 * </ul>
 *
 * <p>Results are read from the public {@link #ocrDocument} field after parsing. The handler
 * keeps per-parse state in instance fields, so one instance should not be shared between
 * concurrently running parses.
 */
public class ParseOcrDocument extends DefaultHandler {

    private static final String PAGE_ELEMENT = "div";
    private static final String LINE_ELEMENT = "span";
    private static final String PAGE_CLASS = "ocr_page";
    private static final String LINE_CLASS = "ocr_line";
    private static final String DEFAULT_BBOX = "0 0";

    /** Document being populated; lines are appended to {@code ocrDocument.OCRLines}. */
    public OCRDocument ocrDocument = new OCRDocument();

    /** Number assigned to the next line; reset whenever a new page starts. */
    private int lineCounter;

    /** Line currently being collected, meaningful only while {@link #inLine} is true. */
    private OCRDocument.OCRLine currentLine;

    /** True between the start tag of an {@code ocr_line} span and its matching end tag. */
    private boolean inLine;

    /** Number of spans currently open inside the line span being collected. */
    private int nestedSpanDepth;

    /** Text of the current line; copied into {@code currentLine.content} when the line ends. */
    private final StringBuilder lineText = new StringBuilder();

    @Override
    public void startDocument() throws SAXException {
    }

    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Appends character data to the current line; text outside a line is ignored.
     *
     * @param c      buffer holding the characters
     * @param start  index of the first character to use
     * @param length number of characters to use
     */
    @Override
    public void characters(char[] c, int start, int length) throws SAXException {
        if (this.inLine) {
            // A parser may report one text node in several calls, so accumulate in a
            // builder instead of re-copying the whole string on every call.
            this.lineText.append(c, start, length);
        }
    }

    @Override
    public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
    }

    @Override
    public void processingInstruction(String target, String data) throws SAXException {
    }

    @Override
    public void setDocumentLocator(Locator arg1) {
    }

    /**
     * Finishes the current line when the end tag of its {@code span} is reached.
     *
     * <p>End tags of spans nested inside the line only reduce the nesting depth, so the text
     * following a nested span still belongs to the line.
     *
     * @param uri       namespace URI (unused)
     * @param localName local name (unused)
     * @param name      qualified element name
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if (!LINE_ELEMENT.equals(name) || !this.inLine) {
            return;
        }
        if (this.nestedSpanDepth > 0) {
            // Closing a span that was opened inside the line, not the line itself.
            this.nestedSpanDepth--;
            return;
        }
        this.currentLine.content = this.lineText.toString();
        this.ocrDocument.OCRLines.add(this.currentLine);
        this.inLine = false;
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Detects page and line start tags and dispatches to {@link #doPage} / {@link #doLine}.
     *
     * <p>A {@code span} opened while a line is already being collected is treated as part of
     * that line, whatever its class, and does not start a new line.
     *
     * @param uri       namespace URI (unused)
     * @param localName local name (unused)
     * @param name      qualified element name
     * @param attrs     attributes of the element; may be {@code null}
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attrs)
            throws SAXException {
        boolean isSpan = LINE_ELEMENT.equals(name);
        if (isSpan && this.inLine) {
            // Remember the nested span so its end tag is not mistaken for the line's end tag.
            this.nestedSpanDepth++;
            return;
        }
        if (attrs == null) {
            return;
        }
        String cssClass = attributeValue(attrs, "class");
        if (PAGE_ELEMENT.equals(name)) {
            if (PAGE_CLASS.equals(cssClass)) {
                doPage(attrs);
            }
        } else if (isSpan && LINE_CLASS.equals(cssClass)) {
            doLine(attrs);
        }
    }

    /**
     * Starts a new page: stores the page dimension taken from the {@code title} attribute
     * and resets the line counter.
     *
     * <p>The dimension is the last two space-separated tokens of the title. A title with
     * fewer than two tokens leaves {@code pageDimension} unchanged instead of failing with
     * an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param attrs attributes of the page element
     */
    private void doPage(Attributes attrs) {
        String title = attributeValue(attrs, "title");
        if (title != null) {
            String[] tokens = title.split(" ");
            int count = tokens.length;
            if (count >= 2) {
                this.ocrDocument.pageDimension = tokens[count - 2] + " " + tokens[count - 1];
            }
        }
        this.lineCounter = 0;
    }

    /**
     * Starts collecting a new line: creates the {@code OCRLine}, numbers it and records
     * its bounding box from the {@code title} attribute.
     *
     * @param attrs attributes of the line element
     */
    private void doLine(Attributes attrs) {
        this.currentLine = this.ocrDocument.new OCRLine();
        this.inLine = true;
        this.nestedSpanDepth = 0;
        this.lineText.setLength(0);

        this.currentLine.content = "";
        this.currentLine.lineNumber = String.valueOf(this.lineCounter);
        this.lineCounter += 1;

        String title = attributeValue(attrs, "title");
        this.currentLine.bbox = (title == null) ? DEFAULT_BBOX : title.replace("bbox ", "");
    }

    /**
     * Returns the value of the first attribute with the given name, or {@code null} if
     * there is none.
     *
     * <p>The local name is used when the parser supplies one; otherwise the qualified name
     * is used, because SAX reports an empty local name when namespace processing is off.
     *
     * @param attrs  attributes to search; must not be {@code null}
     * @param wanted attribute name to look for
     * @return the attribute value, or {@code null} when no attribute matches
     */
    private static String attributeValue(Attributes attrs, String wanted) {
        int count = attrs.getLength();
        for (int i = 0; i < count; i++) {
            String attrName = attrs.getLocalName(i);
            if (attrName == null || attrName.length() == 0) {
                attrName = attrs.getQName(i);
            }
            if (wanted.equals(attrName)) {
                return attrs.getValue(i);
            }
        }
        return null;
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
    }
}

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.OCRutils.ParseOcrDocument
 * JD-Core Version:    0.5.4
 */