Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/utils/ParseOcrDocument.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.utils; | |
2 /* */ | |
3 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument; | |
4 /* */ import de.mpiwg.dwinter.fulltextIndexer.OCRutils.OCRDocument.OCRLine; | |
5 /* */ import java.io.File; | |
6 /* */ import java.io.IOException; | |
7 /* */ import java.io.PrintStream; | |
8 /* */ import java.util.ArrayList; | |
9 /* */ import org.xml.sax.Attributes; | |
10 /* */ import org.xml.sax.InputSource; | |
11 /* */ import org.xml.sax.Locator; | |
12 /* */ import org.xml.sax.SAXException; | |
13 /* */ import org.xml.sax.helpers.DefaultHandler; | |
14 /* */ | |
15 /* */ public class ParseOcrDocument extends DefaultHandler | |
16 /* */ { | |
17 /* 24 */ public OCRDocument ocrDocument = new OCRDocument(); | |
18 /* */ private int lineCounter; | |
19 /* */ private OCRDocument.OCRLine currentLine; | |
20 /* */ private boolean inLine; | |
21 /* */ | |
22 /* */ public void startDocument() | |
23 /* */ throws SAXException | |
24 /* */ { | |
25 /* */ } | |
26 /* */ | |
27 /* */ public void endDocument() | |
28 /* */ throws SAXException | |
29 /* */ { | |
30 /* */ } | |
31 /* */ | |
32 /* */ public void characters(char[] c, int start, int length) | |
33 /* */ throws SAXException | |
34 /* */ { | |
35 /* 38 */ if (this.inLine) | |
36 /* 39 */ this.currentLine.content += new String(c, start, length); | |
37 /* */ } | |
38 /* */ | |
39 /* */ public void ignorableWhitespace(char[] c, int start, int length) | |
40 /* */ throws SAXException | |
41 /* */ { | |
42 /* */ } | |
43 /* */ | |
44 /* */ public void processingInstruction(String target, String data) | |
45 /* */ throws SAXException | |
46 /* */ { | |
47 /* */ } | |
48 /* */ | |
49 /* */ public void setDocumentLocator(Locator arg1) | |
50 /* */ { | |
51 /* */ } | |
52 /* */ | |
53 /* */ public void endElement(String uri, String localName, String name) | |
54 /* */ throws SAXException | |
55 /* */ { | |
56 /* 61 */ if ((!name.equals("span")) || | |
57 /* 63 */ (!this.inLine)) return; | |
58 /* 64 */ this.ocrDocument.OCRLines.add(this.currentLine); | |
59 /* 65 */ this.inLine = false; | |
60 /* */ } | |
61 /* */ | |
62 /* */ public void endPrefixMapping(String prefix) | |
63 /* */ throws SAXException | |
64 /* */ { | |
65 /* */ } | |
66 /* */ | |
67 /* */ public void skippedEntity(String name) | |
68 /* */ throws SAXException | |
69 /* */ { | |
70 /* */ } | |
71 /* */ | |
72 /* */ public void startElement(String uri, String localName, String name, Attributes attrs) | |
73 /* */ throws SAXException | |
74 /* */ { | |
75 /* 83 */ if ((name.equals("div")) && | |
76 /* 84 */ (attrs != null)) { | |
77 /* 85 */ int length = attrs.getLength(); | |
78 /* */ | |
79 /* 87 */ for (int i = 0; i < length; ++i) { | |
80 /* 88 */ if ((!attrs.getLocalName(i).equals("class")) || | |
81 /* 89 */ (!attrs.getValue(i).equals("ocr_page"))) continue; | |
82 /* 90 */ doPage(attrs); | |
83 /* */ } | |
84 /* */ | |
85 /* */ } | |
86 /* */ | |
87 /* 96 */ if ((!name.equals("span")) || | |
88 /* 97 */ (attrs == null)) return; | |
89 /* 98 */ int length = attrs.getLength(); | |
90 /* */ | |
91 /* 100 */ for (int i = 0; i < length; ++i) { | |
92 /* 101 */ if ((!attrs.getLocalName(i).equals("class")) || | |
93 /* 102 */ (!attrs.getValue(i).equals("ocr_line"))) continue; | |
94 /* 103 */ doLine(attrs); | |
95 /* */ } | |
96 /* */ } | |
97 /* */ | |
98 /* */ private void doPage(Attributes attrs) | |
99 /* */ { | |
100 /* 111 */ int length = attrs.getLength(); | |
101 /* */ | |
102 /* 113 */ for (int i = 0; i < length; ++i) { | |
103 /* 114 */ if (attrs.getLocalName(i).equals("title")) { | |
104 /* 115 */ String title = attrs.getValue(i); | |
105 /* 116 */ String[] splitted = title.split(" "); | |
106 /* 117 */ String dimension = splitted[(splitted.length - 2)] + " " + | |
107 /* 118 */ splitted[(splitted.length - 1)]; | |
108 /* 119 */ this.ocrDocument.pageDimension = dimension; | |
109 /* */ } | |
110 /* */ } | |
111 /* 122 */ this.lineCounter = 0; | |
112 /* */ } | |
113 /* */ | |
114 /* */ private void doLine(Attributes attrs) | |
115 /* */ { | |
116 /* */ | |
117 OCRDocument doc = new OCRDocument(); | |
118 | |
119 this.currentLine = doc.new OCRLine(); | |
120 /* 128 */ this.inLine = true; | |
121 /* 129 */ this.currentLine.content = new String(); | |
122 /* */ | |
123 /* 131 */ int length = attrs.getLength(); | |
124 /* 132 */ this.currentLine.lineNumber = String.valueOf(this.lineCounter); | |
125 /* 133 */ this.lineCounter += 1; | |
126 /* 134 */ this.currentLine.bbox = "0 0"; | |
127 /* */ | |
128 /* 137 */ for (int i = 0; i < length; ++i) | |
129 /* 138 */ if (attrs.getLocalName(i).equals("title")) { | |
130 /* 139 */ String title = attrs.getValue(i); | |
131 /* */ | |
132 /* 141 */ String dimension = title.replace("bbox ", ""); | |
133 /* 142 */ this.currentLine.bbox = dimension; | |
134 /* */ } | |
135 /* */ } | |
136 /* */ | |
137 /* */ public void startPrefixMapping(String prefix, String uri) | |
138 /* */ throws SAXException | |
139 /* */ { | |
140 /* */ } | |
141 /* */ | |
142 /* */ public InputSource resolveEntity(String publicId, String systemId) | |
143 /* */ throws SAXException, IOException | |
144 /* */ { | |
145 /* 160 */ File f = new File( | |
146 /* 161 */ "/Users/dwinter/text-tools/fulltextsearch/catalog/xhtml1-transitional.dtd"); | |
147 /* 162 */ if (!f.exists()) | |
148 /* 163 */ f = new File( | |
149 /* 164 */ "/usr/local/apache-tomcat-6.0.16/webapps/fulltextsearch/catalog/xhtml1-transitional.dtd"); | |
150 /* 165 */ if (!f.exists()) { | |
151 /* 166 */ System.err.println("Cant't find xhtml-dtd: MyResolver"); | |
152 /* 167 */ return null; | |
153 /* */ } | |
154 /* 169 */ if (publicId.equals("-//W3C//DTD XHTML 1.0 Transitional//EN")) | |
155 /* 170 */ return new InputSource(f.getAbsolutePath()); | |
156 /* 171 */ return new InputSource(systemId); | |
157 /* */ } | |
158 /* */ } | |
159 | |
160 /* Location: /private/tmp/fulltextIndexer.jar | |
161 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.utils.ParseOcrDocument | |
162 * JD-Core Version: 0.5.4 | |
163 */ |