Mercurial > hg > fulltextIndexer
diff src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/.svn/text-base/ParseOcrDocument.java.svn-base @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/.svn/text-base/ParseOcrDocument.java.svn-base Wed Nov 03 12:33:16 2010 +0100 @@ -0,0 +1,146 @@ +/* */ package de.mpiwg.dwinter.fulltextIndexer.OCRutils; +/* */ +/* */ import java.util.ArrayList; +/* */ import org.xml.sax.Attributes; +/* */ import org.xml.sax.Locator; +/* */ import org.xml.sax.SAXException; +/* */ import org.xml.sax.helpers.DefaultHandler; +/* */ +/* */ public class ParseOcrDocument extends DefaultHandler +/* */ { +/* 31 */ public OCRDocument ocrDocument = new OCRDocument(); +/* */ private int lineCounter; +/* */ private OCRDocument.OCRLine currentLine; +/* */ private boolean inLine; +/* */ +/* */ public void startDocument() +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void endDocument() +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void characters(char[] c, int start, int length) +/* */ throws SAXException +/* */ { +/* 52 */ if (this.inLine) +/* 53 */ this.currentLine.content += new String(c, start, length); +/* */ } +/* */ +/* */ public void ignorableWhitespace(char[] c, int start, int length) +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void processingInstruction(String target, String data) +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void setDocumentLocator(Locator arg1) +/* */ { +/* */ } +/* */ +/* */ public void endElement(String uri, String localName, String name) +/* */ throws SAXException +/* */ { +/* 76 */ if ((!name.equals("span")) || +/* 78 */ (!this.inLine)) +/* */ return; +/* 80 */ this.ocrDocument.OCRLines.add(this.currentLine); +/* 81 */ this.inLine = false; +/* */ } +/* */ +/* */ public void endPrefixMapping(String prefix) +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void skippedEntity(String name) +/* */ throws SAXException +/* */ { +/* */ } +/* */ +/* */ public void startElement(String uri, String localName, String name, Attributes attrs) +/* */ throws SAXException +/* */ { +/* 100 */ if ((name.equals("div")) && +/* 101 */ (attrs != null)) +/* */ { +/* 103 */ int length = attrs.getLength(); +/* */ +/* 106 */ for (int i = 0; i < length; ++i) +/* */ { +/* 108 */ if ((!attrs.getLocalName(i).equals("class")) || +/* 110 */ (!attrs.getValue(i).equals("ocr_page"))) continue; +/* 111 */ doPage(attrs); +/* */ } +/* */ +/* */ } +/* */ +/* 117 */ if ((!name.equals("span")) || +/* 118 */ (attrs == null)) +/* */ return; +/* 120 */ int length = attrs.getLength(); +/* */ +/* 123 */ for (int i = 0; i < length; ++i) +/* */ { +/* 125 */ if ((!attrs.getLocalName(i).equals("class")) || +/* 127 */ (!attrs.getValue(i).equals("ocr_line"))) continue; +/* 128 */ doLine(attrs); +/* */ } +/* */ } +/* */ +/* */ private void doPage(Attributes attrs) +/* */ { +/* 136 */ int length = attrs.getLength(); +/* */ +/* 139 */ for (int i = 0; i < length; ++i) +/* */ { +/* 141 */ if (!attrs.getLocalName(i).equals("title")) +/* */ continue; +/* 143 */ String title = attrs.getValue(i); +/* 144 */ String[] splitted = title.split(" "); +/* 145 */ String dimension = splitted[(splitted.length - 2)] + " " + splitted[(splitted.length - 1)]; +/* 146 */ this.ocrDocument.pageDimension = dimension; +/* */ } +/* */ +/* 149 */ this.lineCounter = 0; +/* */ } +/* */ +/* */ private void doLine(Attributes attrs) +/* */ { +/* */ OCRDocument doc = this.ocrDocument; + this.currentLine = doc.new OCRLine(); +/* */ +/* 156 */ this.inLine = true; +/* 157 */ this.currentLine.content = new String(); +/* */ +/* 159 */ int length = attrs.getLength(); +/* 160 */ this.currentLine.lineNumber = String.valueOf(this.lineCounter); +/* 161 */ this.lineCounter += 1; +/* 162 */ this.currentLine.bbox = "0 0"; +/* 163 */ for (int i = 0; i < length; ++i) +/* */ { +/* 165 */ if (!attrs.getLocalName(i).equals("title")) +/* */ continue; +/* 167 */ String title = attrs.getValue(i); +/* */ +/* 169 */ String dimension = title.replace("bbox ", ""); +/* 170 */ this.currentLine.bbox = dimension; +/* */ } +/* */ } +/* */ +/* */ public void startPrefixMapping(String prefix, String uri) +/* */ throws SAXException +/* */ { +/* */ } +/* */ } + +/* Location: /private/tmp/fulltextIndexer.jar + * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.OCRutils.ParseOcrDocument + * JD-Core Version: 0.5.4 + */ \ No newline at end of file