comparison src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/ParseOcrDocument.java @ 0:dc7622afcfea default tip

initial
author dwinter
date Wed, 03 Nov 2010 12:33:16 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dc7622afcfea
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.OCRutils;
2 /* */
3 /* */ import java.util.ArrayList;
4 /* */ import org.xml.sax.Attributes;
5 /* */ import org.xml.sax.Locator;
6 /* */ import org.xml.sax.SAXException;
7 /* */ import org.xml.sax.helpers.DefaultHandler;
8 /* */
9 /* */ public class ParseOcrDocument extends DefaultHandler
10 /* */ {
11 /* 31 */ public OCRDocument ocrDocument = new OCRDocument();
12 /* */ private int lineCounter;
13 /* */ private OCRDocument.OCRLine currentLine;
14 /* */ private boolean inLine;
15 /* */
16 /* */ public void startDocument()
17 /* */ throws SAXException
18 /* */ {
19 /* */ }
20 /* */
21 /* */ public void endDocument()
22 /* */ throws SAXException
23 /* */ {
24 /* */ }
25 /* */
26 /* */ public void characters(char[] c, int start, int length)
27 /* */ throws SAXException
28 /* */ {
29 /* 52 */ if (this.inLine)
30 /* 53 */ this.currentLine.content += new String(c, start, length);
31 /* */ }
32 /* */
33 /* */ public void ignorableWhitespace(char[] c, int start, int length)
34 /* */ throws SAXException
35 /* */ {
36 /* */ }
37 /* */
38 /* */ public void processingInstruction(String target, String data)
39 /* */ throws SAXException
40 /* */ {
41 /* */ }
42 /* */
43 /* */ public void setDocumentLocator(Locator arg1)
44 /* */ {
45 /* */ }
46 /* */
47 /* */ public void endElement(String uri, String localName, String name)
48 /* */ throws SAXException
49 /* */ {
50 /* 76 */ if ((!name.equals("span")) ||
51 /* 78 */ (!this.inLine))
52 /* */ return;
53 /* 80 */ this.ocrDocument.OCRLines.add(this.currentLine);
54 /* 81 */ this.inLine = false;
55 /* */ }
56 /* */
57 /* */ public void endPrefixMapping(String prefix)
58 /* */ throws SAXException
59 /* */ {
60 /* */ }
61 /* */
62 /* */ public void skippedEntity(String name)
63 /* */ throws SAXException
64 /* */ {
65 /* */ }
66 /* */
67 /* */ public void startElement(String uri, String localName, String name, Attributes attrs)
68 /* */ throws SAXException
69 /* */ {
70 /* 100 */ if ((name.equals("div")) &&
71 /* 101 */ (attrs != null))
72 /* */ {
73 /* 103 */ int length = attrs.getLength();
74 /* */
75 /* 106 */ for (int i = 0; i < length; ++i)
76 /* */ {
77 /* 108 */ if ((!attrs.getLocalName(i).equals("class")) ||
78 /* 110 */ (!attrs.getValue(i).equals("ocr_page"))) continue;
79 /* 111 */ doPage(attrs);
80 /* */ }
81 /* */
82 /* */ }
83 /* */
84 /* 117 */ if ((!name.equals("span")) ||
85 /* 118 */ (attrs == null))
86 /* */ return;
87 /* 120 */ int length = attrs.getLength();
88 /* */
89 /* 123 */ for (int i = 0; i < length; ++i)
90 /* */ {
91 /* 125 */ if ((!attrs.getLocalName(i).equals("class")) ||
92 /* 127 */ (!attrs.getValue(i).equals("ocr_line"))) continue;
93 /* 128 */ doLine(attrs);
94 /* */ }
95 /* */ }
96 /* */
97 /* */ private void doPage(Attributes attrs)
98 /* */ {
99 /* 136 */ int length = attrs.getLength();
100 /* */
101 /* 139 */ for (int i = 0; i < length; ++i)
102 /* */ {
103 /* 141 */ if (!attrs.getLocalName(i).equals("title"))
104 /* */ continue;
105 /* 143 */ String title = attrs.getValue(i);
106 /* 144 */ String[] splitted = title.split(" ");
107 /* 145 */ String dimension = splitted[(splitted.length - 2)] + " " + splitted[(splitted.length - 1)];
108 /* 146 */ this.ocrDocument.pageDimension = dimension;
109 /* */ }
110 /* */
111 /* 149 */ this.lineCounter = 0;
112 /* */ }
113 /* */
114 /* */ private void doLine(Attributes attrs)
115 /* */ {
116 /* */ OCRDocument doc = this.ocrDocument;
117 this.currentLine = doc.new OCRLine();
118 /* */
119 /* 156 */ this.inLine = true;
120 /* 157 */ this.currentLine.content = new String();
121 /* */
122 /* 159 */ int length = attrs.getLength();
123 /* 160 */ this.currentLine.lineNumber = String.valueOf(this.lineCounter);
124 /* 161 */ this.lineCounter += 1;
125 /* 162 */ this.currentLine.bbox = "0 0";
126 /* 163 */ for (int i = 0; i < length; ++i)
127 /* */ {
128 /* 165 */ if (!attrs.getLocalName(i).equals("title"))
129 /* */ continue;
130 /* 167 */ String title = attrs.getValue(i);
131 /* */
132 /* 169 */ String dimension = title.replace("bbox ", "");
133 /* 170 */ this.currentLine.bbox = dimension;
134 /* */ }
135 /* */ }
136 /* */
137 /* */ public void startPrefixMapping(String prefix, String uri)
138 /* */ throws SAXException
139 /* */ {
140 /* */ }
141 /* */ }
142
143 /* Location: /private/tmp/fulltextIndexer.jar
144 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.OCRutils.ParseOcrDocument
145 * JD-Core Version: 0.5.4
146 */