annotate src/de/mpiwg/dwinter/fulltextIndexer/utils/ParseOcrDocument.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dc7622afcfea initial
dwinter
parents:
diff changeset
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.utils;
dc7622afcfea initial
dwinter
parents:
diff changeset
2 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument;
dc7622afcfea initial
dwinter
parents:
diff changeset
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine;
dc7622afcfea initial
dwinter
parents:
diff changeset
5 /* */ import java.io.File;
dc7622afcfea initial
dwinter
parents:
diff changeset
6 /* */ import java.io.IOException;
dc7622afcfea initial
dwinter
parents:
diff changeset
7 /* */ import java.io.PrintStream;
dc7622afcfea initial
dwinter
parents:
diff changeset
8 /* */ import java.util.ArrayList;
dc7622afcfea initial
dwinter
parents:
diff changeset
9 /* */ import org.xml.sax.Attributes;
dc7622afcfea initial
dwinter
parents:
diff changeset
10 /* */ import org.xml.sax.InputSource;
dc7622afcfea initial
dwinter
parents:
diff changeset
11 /* */ import org.xml.sax.Locator;
dc7622afcfea initial
dwinter
parents:
diff changeset
12 /* */ import org.xml.sax.SAXException;
dc7622afcfea initial
dwinter
parents:
diff changeset
13 /* */ import org.xml.sax.helpers.DefaultHandler;
dc7622afcfea initial
dwinter
parents:
diff changeset
14 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
15 /* */ public class ParseOcrDocument extends DefaultHandler
dc7622afcfea initial
dwinter
parents:
diff changeset
16 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
17 /* 24 */ public OCRDocument ocrDocument = new OCRDocument();
dc7622afcfea initial
dwinter
parents:
diff changeset
18 /* */ private int lineCounter;
dc7622afcfea initial
dwinter
parents:
diff changeset
19 /* */ private OCRDocument.OCRLine currentLine;
dc7622afcfea initial
dwinter
parents:
diff changeset
20 /* */ private boolean inLine;
dc7622afcfea initial
dwinter
parents:
diff changeset
21 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
22 /* */ public void startDocument()
dc7622afcfea initial
dwinter
parents:
diff changeset
23 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
24 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
25 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
26 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
27 /* */ public void endDocument()
dc7622afcfea initial
dwinter
parents:
diff changeset
28 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
29 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
30 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
31 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
32 /* */ public void characters(char[] c, int start, int length)
dc7622afcfea initial
dwinter
parents:
diff changeset
33 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
34 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
35 /* 38 */ if (this.inLine)
dc7622afcfea initial
dwinter
parents:
diff changeset
36 /* 39 */ this.currentLine.content += new String(c, start, length);
dc7622afcfea initial
dwinter
parents:
diff changeset
37 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
38 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
39 /* */ public void ignorableWhitespace(char[] c, int start, int length)
dc7622afcfea initial
dwinter
parents:
diff changeset
40 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
41 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
42 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
43 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
44 /* */ public void processingInstruction(String target, String data)
dc7622afcfea initial
dwinter
parents:
diff changeset
45 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
46 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
47 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
48 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
49 /* */ public void setDocumentLocator(Locator arg1)
dc7622afcfea initial
dwinter
parents:
diff changeset
50 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
51 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
52 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
53 /* */ public void endElement(String uri, String localName, String name)
dc7622afcfea initial
dwinter
parents:
diff changeset
54 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
55 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
56 /* 61 */ if ((!name.equals("span")) ||
dc7622afcfea initial
dwinter
parents:
diff changeset
57 /* 63 */ (!this.inLine)) return;
dc7622afcfea initial
dwinter
parents:
diff changeset
58 /* 64 */ this.ocrDocument.OCRLines.add(this.currentLine);
dc7622afcfea initial
dwinter
parents:
diff changeset
59 /* 65 */ this.inLine = false;
dc7622afcfea initial
dwinter
parents:
diff changeset
60 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
61 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
62 /* */ public void endPrefixMapping(String prefix)
dc7622afcfea initial
dwinter
parents:
diff changeset
63 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
64 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
65 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
66 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
67 /* */ public void skippedEntity(String name)
dc7622afcfea initial
dwinter
parents:
diff changeset
68 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
69 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
70 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
71 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
72 /* */ public void startElement(String uri, String localName, String name, Attributes attrs)
dc7622afcfea initial
dwinter
parents:
diff changeset
73 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
74 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
75 /* 83 */ if ((name.equals("div")) &&
dc7622afcfea initial
dwinter
parents:
diff changeset
76 /* 84 */ (attrs != null)) {
dc7622afcfea initial
dwinter
parents:
diff changeset
77 /* 85 */ int length = attrs.getLength();
dc7622afcfea initial
dwinter
parents:
diff changeset
78 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
79 /* 87 */ for (int i = 0; i < length; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
80 /* 88 */ if ((!attrs.getLocalName(i).equals("class")) ||
dc7622afcfea initial
dwinter
parents:
diff changeset
81 /* 89 */ (!attrs.getValue(i).equals("ocr_page"))) continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
82 /* 90 */ doPage(attrs);
dc7622afcfea initial
dwinter
parents:
diff changeset
83 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
84 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
85 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
86 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
87 /* 96 */ if ((!name.equals("span")) ||
dc7622afcfea initial
dwinter
parents:
diff changeset
88 /* 97 */ (attrs == null)) return;
dc7622afcfea initial
dwinter
parents:
diff changeset
89 /* 98 */ int length = attrs.getLength();
dc7622afcfea initial
dwinter
parents:
diff changeset
90 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
91 /* 100 */ for (int i = 0; i < length; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
92 /* 101 */ if ((!attrs.getLocalName(i).equals("class")) ||
dc7622afcfea initial
dwinter
parents:
diff changeset
93 /* 102 */ (!attrs.getValue(i).equals("ocr_line"))) continue;
dc7622afcfea initial
dwinter
parents:
diff changeset
94 /* 103 */ doLine(attrs);
dc7622afcfea initial
dwinter
parents:
diff changeset
95 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
96 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
97 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
98 /* */ private void doPage(Attributes attrs)
dc7622afcfea initial
dwinter
parents:
diff changeset
99 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
100 /* 111 */ int length = attrs.getLength();
dc7622afcfea initial
dwinter
parents:
diff changeset
101 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
102 /* 113 */ for (int i = 0; i < length; ++i) {
dc7622afcfea initial
dwinter
parents:
diff changeset
103 /* 114 */ if (attrs.getLocalName(i).equals("title")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
104 /* 115 */ String title = attrs.getValue(i);
dc7622afcfea initial
dwinter
parents:
diff changeset
105 /* 116 */ String[] splitted = title.split(" ");
dc7622afcfea initial
dwinter
parents:
diff changeset
106 /* 117 */ String dimension = splitted[(splitted.length - 2)] + " " +
dc7622afcfea initial
dwinter
parents:
diff changeset
107 /* 118 */ splitted[(splitted.length - 1)];
dc7622afcfea initial
dwinter
parents:
diff changeset
108 /* 119 */ this.ocrDocument.pageDimension = dimension;
dc7622afcfea initial
dwinter
parents:
diff changeset
109 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
110 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
111 /* 122 */ this.lineCounter = 0;
dc7622afcfea initial
dwinter
parents:
diff changeset
112 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
113 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
114 /* */ private void doLine(Attributes attrs)
dc7622afcfea initial
dwinter
parents:
diff changeset
115 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
116 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
117 OCRDocument doc = new OCRDocument();
dc7622afcfea initial
dwinter
parents:
diff changeset
118
dc7622afcfea initial
dwinter
parents:
diff changeset
119 this.currentLine = doc.new OCRLine();
dc7622afcfea initial
dwinter
parents:
diff changeset
120 /* 128 */ this.inLine = true;
dc7622afcfea initial
dwinter
parents:
diff changeset
121 /* 129 */ this.currentLine.content = new String();
dc7622afcfea initial
dwinter
parents:
diff changeset
122 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
123 /* 131 */ int length = attrs.getLength();
dc7622afcfea initial
dwinter
parents:
diff changeset
124 /* 132 */ this.currentLine.lineNumber = String.valueOf(this.lineCounter);
dc7622afcfea initial
dwinter
parents:
diff changeset
125 /* 133 */ this.lineCounter += 1;
dc7622afcfea initial
dwinter
parents:
diff changeset
126 /* 134 */ this.currentLine.bbox = "0 0";
dc7622afcfea initial
dwinter
parents:
diff changeset
127 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
128 /* 137 */ for (int i = 0; i < length; ++i)
dc7622afcfea initial
dwinter
parents:
diff changeset
129 /* 138 */ if (attrs.getLocalName(i).equals("title")) {
dc7622afcfea initial
dwinter
parents:
diff changeset
130 /* 139 */ String title = attrs.getValue(i);
dc7622afcfea initial
dwinter
parents:
diff changeset
131 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
132 /* 141 */ String dimension = title.replace("bbox ", "");
dc7622afcfea initial
dwinter
parents:
diff changeset
133 /* 142 */ this.currentLine.bbox = dimension;
dc7622afcfea initial
dwinter
parents:
diff changeset
134 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
135 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
136 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
137 /* */ public void startPrefixMapping(String prefix, String uri)
dc7622afcfea initial
dwinter
parents:
diff changeset
138 /* */ throws SAXException
dc7622afcfea initial
dwinter
parents:
diff changeset
139 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
140 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
141 /* */
dc7622afcfea initial
dwinter
parents:
diff changeset
142 /* */ public InputSource resolveEntity(String publicId, String systemId)
dc7622afcfea initial
dwinter
parents:
diff changeset
143 /* */ throws SAXException, IOException
dc7622afcfea initial
dwinter
parents:
diff changeset
144 /* */ {
dc7622afcfea initial
dwinter
parents:
diff changeset
145 /* 160 */ File f = new File(
dc7622afcfea initial
dwinter
parents:
diff changeset
146 /* 161 */ "/Users/dwinter/text-tools/fulltextsearch/catalog/xhtml1-transitional.dtd");
dc7622afcfea initial
dwinter
parents:
diff changeset
147 /* 162 */ if (!f.exists())
dc7622afcfea initial
dwinter
parents:
diff changeset
148 /* 163 */ f = new File(
dc7622afcfea initial
dwinter
parents:
diff changeset
149 /* 164 */ "/usr/local/apache-tomcat-6.0.16/webapps/fulltextsearch/catalog/xhtml1-transitional.dtd");
dc7622afcfea initial
dwinter
parents:
diff changeset
150 /* 165 */ if (!f.exists()) {
dc7622afcfea initial
dwinter
parents:
diff changeset
151 /* 166 */ System.err.println("Cant't find xhtml-dtd: MyResolver");
dc7622afcfea initial
dwinter
parents:
diff changeset
152 /* 167 */ return null;
dc7622afcfea initial
dwinter
parents:
diff changeset
153 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
154 /* 169 */ if (publicId.equals("-//W3C//DTD XHTML 1.0 Transitional//EN"))
dc7622afcfea initial
dwinter
parents:
diff changeset
155 /* 170 */ return new InputSource(f.getAbsolutePath());
dc7622afcfea initial
dwinter
parents:
diff changeset
156 /* 171 */ return new InputSource(systemId);
dc7622afcfea initial
dwinter
parents:
diff changeset
157 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
158 /* */ }
dc7622afcfea initial
dwinter
parents:
diff changeset
159
dc7622afcfea initial
dwinter
parents:
diff changeset
160 /* Location: /private/tmp/fulltextIndexer.jar
dc7622afcfea initial
dwinter
parents:
diff changeset
161 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument
dc7622afcfea initial
dwinter
parents:
diff changeset
162 * JD-Core Version: 0.5.4
dc7622afcfea initial
dwinter
parents:
diff changeset
163 */