Mercurial > hg > fulltextIndexer
comparison src/de/mpiwg/dwinter/fulltextIndexer/OCRutils/ParseOcrDocument.java @ 0:dc7622afcfea default tip
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:33:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dc7622afcfea |
---|---|
1 /* */ package de.mpiwg.dwinter.fulltextIndexer.OCRutils; | |
2 /* */ | |
3 /* */ import java.util.ArrayList; | |
4 /* */ import org.xml.sax.Attributes; | |
5 /* */ import org.xml.sax.Locator; | |
6 /* */ import org.xml.sax.SAXException; | |
7 /* */ import org.xml.sax.helpers.DefaultHandler; | |
8 /* */ | |
9 /* */ public class ParseOcrDocument extends DefaultHandler | |
10 /* */ { | |
11 /* 31 */ public OCRDocument ocrDocument = new OCRDocument(); | |
12 /* */ private int lineCounter; | |
13 /* */ private OCRDocument.OCRLine currentLine; | |
14 /* */ private boolean inLine; | |
15 /* */ | |
16 /* */ public void startDocument() | |
17 /* */ throws SAXException | |
18 /* */ { | |
19 /* */ } | |
20 /* */ | |
21 /* */ public void endDocument() | |
22 /* */ throws SAXException | |
23 /* */ { | |
24 /* */ } | |
25 /* */ | |
26 /* */ public void characters(char[] c, int start, int length) | |
27 /* */ throws SAXException | |
28 /* */ { | |
29 /* 52 */ if (this.inLine) | |
30 /* 53 */ this.currentLine.content += new String(c, start, length); | |
31 /* */ } | |
32 /* */ | |
33 /* */ public void ignorableWhitespace(char[] c, int start, int length) | |
34 /* */ throws SAXException | |
35 /* */ { | |
36 /* */ } | |
37 /* */ | |
38 /* */ public void processingInstruction(String target, String data) | |
39 /* */ throws SAXException | |
40 /* */ { | |
41 /* */ } | |
42 /* */ | |
43 /* */ public void setDocumentLocator(Locator arg1) | |
44 /* */ { | |
45 /* */ } | |
46 /* */ | |
47 /* */ public void endElement(String uri, String localName, String name) | |
48 /* */ throws SAXException | |
49 /* */ { | |
50 /* 76 */ if ((!name.equals("span")) || | |
51 /* 78 */ (!this.inLine)) | |
52 /* */ return; | |
53 /* 80 */ this.ocrDocument.OCRLines.add(this.currentLine); | |
54 /* 81 */ this.inLine = false; | |
55 /* */ } | |
56 /* */ | |
57 /* */ public void endPrefixMapping(String prefix) | |
58 /* */ throws SAXException | |
59 /* */ { | |
60 /* */ } | |
61 /* */ | |
62 /* */ public void skippedEntity(String name) | |
63 /* */ throws SAXException | |
64 /* */ { | |
65 /* */ } | |
66 /* */ | |
67 /* */ public void startElement(String uri, String localName, String name, Attributes attrs) | |
68 /* */ throws SAXException | |
69 /* */ { | |
70 /* 100 */ if ((name.equals("div")) && | |
71 /* 101 */ (attrs != null)) | |
72 /* */ { | |
73 /* 103 */ int length = attrs.getLength(); | |
74 /* */ | |
75 /* 106 */ for (int i = 0; i < length; ++i) | |
76 /* */ { | |
77 /* 108 */ if ((!attrs.getLocalName(i).equals("class")) || | |
78 /* 110 */ (!attrs.getValue(i).equals("ocr_page"))) continue; | |
79 /* 111 */ doPage(attrs); | |
80 /* */ } | |
81 /* */ | |
82 /* */ } | |
83 /* */ | |
84 /* 117 */ if ((!name.equals("span")) || | |
85 /* 118 */ (attrs == null)) | |
86 /* */ return; | |
87 /* 120 */ int length = attrs.getLength(); | |
88 /* */ | |
89 /* 123 */ for (int i = 0; i < length; ++i) | |
90 /* */ { | |
91 /* 125 */ if ((!attrs.getLocalName(i).equals("class")) || | |
92 /* 127 */ (!attrs.getValue(i).equals("ocr_line"))) continue; | |
93 /* 128 */ doLine(attrs); | |
94 /* */ } | |
95 /* */ } | |
96 /* */ | |
97 /* */ private void doPage(Attributes attrs) | |
98 /* */ { | |
99 /* 136 */ int length = attrs.getLength(); | |
100 /* */ | |
101 /* 139 */ for (int i = 0; i < length; ++i) | |
102 /* */ { | |
103 /* 141 */ if (!attrs.getLocalName(i).equals("title")) | |
104 /* */ continue; | |
105 /* 143 */ String title = attrs.getValue(i); | |
106 /* 144 */ String[] splitted = title.split(" "); | |
107 /* 145 */ String dimension = splitted[(splitted.length - 2)] + " " + splitted[(splitted.length - 1)]; | |
108 /* 146 */ this.ocrDocument.pageDimension = dimension; | |
109 /* */ } | |
110 /* */ | |
111 /* 149 */ this.lineCounter = 0; | |
112 /* */ } | |
113 /* */ | |
114 /* */ private void doLine(Attributes attrs) | |
115 /* */ { | |
116 /* */ OCRDocument doc = this.ocrDocument; | |
117 this.currentLine = doc.new OCRLine(); | |
118 /* */ | |
119 /* 156 */ this.inLine = true; | |
120 /* 157 */ this.currentLine.content = new String(); | |
121 /* */ | |
122 /* 159 */ int length = attrs.getLength(); | |
123 /* 160 */ this.currentLine.lineNumber = String.valueOf(this.lineCounter); | |
124 /* 161 */ this.lineCounter += 1; | |
125 /* 162 */ this.currentLine.bbox = "0 0"; | |
126 /* 163 */ for (int i = 0; i < length; ++i) | |
127 /* */ { | |
128 /* 165 */ if (!attrs.getLocalName(i).equals("title")) | |
129 /* */ continue; | |
130 /* 167 */ String title = attrs.getValue(i); | |
131 /* */ | |
132 /* 169 */ String dimension = title.replace("bbox ", ""); | |
133 /* 170 */ this.currentLine.bbox = dimension; | |
134 /* */ } | |
135 /* */ } | |
136 /* */ | |
137 /* */ public void startPrefixMapping(String prefix, String uri) | |
138 /* */ throws SAXException | |
139 /* */ { | |
140 /* */ } | |
141 /* */ } | |
142 | |
143 /* Location: /private/tmp/fulltextIndexer.jar | |
144 * Qualified Name: de.mpiwg.dwinter.fulltextIndexer.OCRutils.ParseOcrDocument | |
145 * JD-Core Version: 0.5.4 | |
146 */ |