Mercurial > hg > fulltextIndexer
view src/de/mpiwg/dwinter/fulltextIndexer/utils/ParseOcrDocument.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpiwg.dwinter.fulltextIndexer.utils;

import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that collects OCR lines from an XHTML document into an
 * {@link OCRDocument}.
 *
 * <p>Recognised markup:
 * <ul>
 *   <li>{@code <div class="ocr_page" title="...">} - the last two
 *       space-separated tokens of the title are stored as
 *       {@code ocrDocument.pageDimension} and the line counter restarts at 0.</li>
 *   <li>{@code <span class="ocr_line" title="...">} - starts a new line whose
 *       text is all character data up to the matching end tag; the title, with
 *       every {@code "bbox "} occurrence removed, becomes the line's bbox.</li>
 * </ul>
 *
 * <p>NOTE(review): the class names look like hOCR output; the exact title
 * format is presumably {@code ... bbox x0 y0 x1 y1} - confirm against the
 * OCR tool that produces the input.
 */
public class ParseOcrDocument extends DefaultHandler
{
    /** Public identifier for which a local copy of the DTD is served. */
    private static final String XHTML_TRANSITIONAL_PUBLIC_ID =
        "-//W3C//DTD XHTML 1.0 Transitional//EN";

    /** Candidate locations of the local XHTML DTD, tried in order. */
    private static final String[] DTD_LOCATIONS = {
        "/Users/dwinter/text-tools/fulltextsearch/catalog/xhtml1-transitional.dtd",
        "/usr/local/apache-tomcat-6.0.16/webapps/fulltextsearch/catalog/xhtml1-transitional.dtd"
    };

    /** Result of the parse; completed lines are appended to its {@code OCRLines}. */
    public OCRDocument ocrDocument = new OCRDocument();

    /** Number assigned to the next line; reset whenever a page div starts. */
    private int lineCounter;

    /** Line currently being filled, valid while {@link #inLine} is true. */
    private OCRDocument.OCRLine currentLine;

    /** True between the start tag and the matching end tag of an ocr_line span. */
    private boolean inLine;

    /** Number of currently open spans nested inside the current line span. */
    private int nestedSpanDepth;

    /** Text collected for the current line; copied to the line when it closes. */
    private final StringBuilder lineText = new StringBuilder();

    @Override
    public void startDocument()
        throws SAXException
    {
    }

    @Override
    public void endDocument()
        throws SAXException
    {
    }

    /**
     * Appends character data to the current line, if one is open.
     * Text outside of a line span is ignored.
     */
    @Override
    public void characters(char[] c, int start, int length)
        throws SAXException
    {
        if (this.inLine) {
            this.lineText.append(c, start, length);
        }
    }

    @Override
    public void ignorableWhitespace(char[] c, int start, int length)
        throws SAXException
    {
    }

    @Override
    public void processingInstruction(String target, String data)
        throws SAXException
    {
    }

    @Override
    public void setDocumentLocator(Locator arg1)
    {
    }

    /**
     * Dispatches page divs and line spans to {@link #doPage} / {@link #doLine}.
     *
     * @param name  qualified element name; only {@code "div"} and
     *              {@code "span"} are handled
     * @param attrs attributes of the element, may be {@code null}
     */
    @Override
    public void startElement(String uri, String localName, String name,
                             Attributes attrs)
        throws SAXException
    {
        if ("div".equals(name)) {
            if (attrs != null && hasClass(attrs, "ocr_page")) {
                doPage(attrs);
            }
            return;
        }
        if (!"span".equals(name)) {
            return;
        }
        if (attrs != null && hasClass(attrs, "ocr_line")) {
            doLine(attrs);
        } else if (this.inLine) {
            // A span nested inside the line (e.g. a per-word span): remember
            // it so that its end tag does not terminate the line early.
            this.nestedSpanDepth++;
        }
    }

    /**
     * Closes the current line when the end tag of its span is reached and
     * appends it to {@code ocrDocument.OCRLines}. End tags of spans nested
     * inside the line only decrease the nesting depth.
     */
    @Override
    public void endElement(String uri, String localName, String name)
        throws SAXException
    {
        if (!this.inLine || !"span".equals(name)) {
            return;
        }
        if (this.nestedSpanDepth > 0) {
            this.nestedSpanDepth--;
            return;
        }
        this.currentLine.content = this.lineText.toString();
        this.ocrDocument.OCRLines.add(this.currentLine);
        this.inLine = false;
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException
    {
    }

    @Override
    public void endPrefixMapping(String prefix)
        throws SAXException
    {
    }

    @Override
    public void skippedEntity(String name)
        throws SAXException
    {
    }

    /**
     * Serves the XHTML 1.0 Transitional DTD from a local file.
     *
     * @param publicId public identifier of the entity, may be {@code null}
     * @param systemId system identifier of the entity
     * @return {@code null} if no local DTD file exists (a message is printed
     *         to {@code System.err}); a source for the local DTD if
     *         {@code publicId} is the XHTML 1.0 Transitional identifier;
     *         otherwise a source for {@code systemId}
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId)
        throws SAXException, IOException
    {
        File dtd = null;
        for (String location : DTD_LOCATIONS) {
            File candidate = new File(location);
            if (candidate.exists()) {
                dtd = candidate;
                break;
            }
        }
        if (dtd == null) {
            System.err.println("Cant't find xhtml-dtd: MyResolver");
            return null;
        }
        // Constant-first comparison: SAX may pass a null publicId.
        if (XHTML_TRANSITIONAL_PUBLIC_ID.equals(publicId)) {
            // InputSource expects a URI as system identifier, not a bare path.
            return new InputSource(dtd.toURI().toASCIIString());
        }
        return new InputSource(systemId);
    }

    /**
     * Handles the start of a page: stores the page dimension taken from the
     * {@code title} attribute and restarts line numbering.
     * A title with fewer than two tokens leaves the dimension untouched.
     */
    private void doPage(Attributes attrs)
    {
        String title = attributeValue(attrs, "title");
        if (title != null) {
            String[] tokens = title.split(" ");
            if (tokens.length >= 2) {
                this.ocrDocument.pageDimension =
                    tokens[tokens.length - 2] + " " + tokens[tokens.length - 1];
            }
        }
        this.lineCounter = 0;
    }

    /**
     * Starts a new line: assigns the next line number and the bbox taken from
     * the {@code title} attribute ({@code "0 0"} if there is no title).
     */
    private void doLine(Attributes attrs)
    {
        // The line is an inner-class instance; bind it to the document it
        // will be added to instead of a throwaway OCRDocument.
        OCRDocument.OCRLine line = this.ocrDocument.new OCRLine();
        line.content = "";
        line.lineNumber = String.valueOf(this.lineCounter);
        this.lineCounter += 1;

        String title = attributeValue(attrs, "title");
        line.bbox = (title == null) ? "0 0" : title.replace("bbox ", "");

        this.currentLine = line;
        this.lineText.setLength(0);
        this.nestedSpanDepth = 0;
        this.inLine = true;
    }

    /** Returns true if a {@code class} attribute has exactly the given value. */
    private static boolean hasClass(Attributes attrs, String cssClass)
    {
        int count = attrs.getLength();
        for (int i = 0; i < count; ++i) {
            if ("class".equals(attributeName(attrs, i))
                    && cssClass.equals(attrs.getValue(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the value of the last attribute with the given name, or
     * {@code null} if there is none.
     */
    private static String attributeValue(Attributes attrs, String wanted)
    {
        String value = null;
        int count = attrs.getLength();
        for (int i = 0; i < count; ++i) {
            if (wanted.equals(attributeName(attrs, i))) {
                value = attrs.getValue(i);
            }
        }
        return value;
    }

    /**
     * Returns the local name of the attribute, falling back to the qualified
     * name when the parser does not report local names (the local name is
     * empty if namespace processing is not being performed).
     */
    private static String attributeName(Attributes attrs, int index)
    {
        String local = attrs.getLocalName(index);
        if (local == null || local.length() == 0) {
            return attrs.getQName(index);
        }
        return local;
    }
}

/* Location:           /private/tmp/fulltextIndexer.jar
 * Qualified Name:     de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument
 * JD-Core Version:    0.5.4
 */