Mercurial > hg > mpdl-group
annotate software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 10:59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Fri, 11 Mar 2011 13:33:26 +0100 |
parents | 2396a569e446 |
children | fba5577e49d9 |
rev | line source |
---|---|
0 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.doc; |
2 | |
3 import java.util.ArrayList; | |
4 | |
5 import org.apache.lucene.analysis.Token; | |
6 import org.xml.sax.*; | |
7 | |
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; | |
10 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; | |
11 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; | |
12 | |
13 public class DictionarizerContentHandler implements ContentHandler { | |
14 private static String MARK = "COMPLEXELEMENTTTTT"; | |
15 private static int MARK_SIZE = MARK.length(); | |
16 private static int ELEMENT_TYPE_CHARACTERS = 1; | |
17 private static int ELEMENT_TYPE_COMPLEX = 2; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
18 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); |
0 | 19 private String xmlnsString = ""; |
20 private String language; | |
21 private String outputXmlFragment = ""; | |
22 private Element rootElement; | |
23 private Element currentElement; | |
24 private ArrayList<Element> elementQueue; | |
25 | |
/**
 * Creates a handler that uses the given language for every element that does not carry its own.
 *
 * @param language default language of the document/page
 * @throws ApplicationException declared in the signature; nothing in this constructor raises it
 */
public DictionarizerContentHandler(String language) throws ApplicationException {
  this.language = language;
}
29 | |
30 public String getXmlFragment() { | |
31 return outputXmlFragment; | |
32 } | |
33 | |
34 public void startDocument() throws SAXException { | |
35 } | |
36 | |
37 public void endDocument() throws SAXException { | |
38 String rootElemToStr = rootElement.toXmlString(); | |
39 write(rootElemToStr); | |
40 write("\n"); | |
41 } | |
42 | |
43 public void characters(char[] c, int start, int length) throws SAXException { | |
44 char[] cCopy = new char[length]; | |
45 System.arraycopy(c, start, cCopy, 0, length); | |
46 String charactersStr = String.valueOf(cCopy); | |
47 if (charactersStr != null && ! charactersStr.equals("")) { | |
10
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
48 // cause there are problems during xsl transformations with ideographic characters without zwsp characters we put always a zwsp between ideographic characters |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
49 charactersStr = zwsp(charactersStr); |
0 | 50 if (currentElement != null) { |
51 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); | |
52 charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); | |
53 if (currentElement.composites == null) | |
54 currentElement.composites = new ArrayList<Element>(); | |
55 currentElement.composites.add(charElement); | |
56 } | |
57 } | |
58 } | |
59 | |
60 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { | |
61 } | |
62 | |
63 public void processingInstruction(String target, String data) throws SAXException { | |
64 } | |
65 | |
66 public void setDocumentLocator(Locator locator) { | |
67 } | |
68 | |
69 public void startPrefixMapping(String prefix, String uri) throws SAXException { | |
70 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; | |
71 } | |
72 | |
73 public void endPrefixMapping(String prefix) throws SAXException { | |
74 } | |
75 | |
76 public void skippedEntity(String name) throws SAXException { | |
77 } | |
78 | |
79 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { | |
80 if (elementQueue == null) | |
81 elementQueue = new ArrayList<Element>(); | |
82 Element newElement = new Element(name); // element of type: complex | |
83 if (currentElement != null) { | |
84 if (currentElement.composites == null) | |
85 currentElement.composites = new ArrayList<Element>(); | |
86 if (currentElement.lang != null) | |
87 newElement.lang = currentElement.lang; // language wird an Kinder vererbt | |
88 currentElement.composites.add(newElement); | |
89 } | |
90 currentElement = newElement; | |
91 int attrSize = attrs.getLength(); | |
92 String attrString = ""; | |
93 for (int i=0; i<attrSize; i++) { | |
94 String attrQName = attrs.getQName(i); | |
95 String attrValue = attrs.getValue(i); | |
96 attrValue = StringUtilEscapeChars.forXML(attrValue); | |
97 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; | |
98 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) | |
99 currentElement.lang = attrValue; // wenn xml:lang belegt ist, wird es an das neue Element gesetzt und überschreibt vom Vater geerbte Werte | |
100 } | |
101 currentElement.attrString = attrString; | |
102 if (! xmlnsString.equals("")) { | |
103 currentElement.xmlnsString = xmlnsString; | |
104 } | |
105 xmlnsString = ""; | |
106 elementQueue.add(currentElement); | |
107 // only the first element is the root element | |
108 if(rootElement == null) | |
109 rootElement = currentElement; | |
110 } | |
111 | |
112 public void endElement(String uri, String localName, String name) throws SAXException { | |
113 if (elementQueue != null && elementQueue.size() > 0) { | |
114 int lastIndex = elementQueue.size() - 1; | |
115 elementQueue.remove(lastIndex); | |
116 } | |
117 if (elementQueue != null && elementQueue.size() > 0) { | |
118 int lastIndex = elementQueue.size() - 1; | |
119 currentElement = elementQueue.get(lastIndex); | |
120 } else { | |
121 currentElement = null; | |
122 } | |
123 } | |
124 | |
125 public int getCharIndex(String compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) { | |
126 if (indexComplexElemCompositesCharsWithMarks == 0) | |
127 return -1; | |
128 int size = compositesCharsDictionarized.length(); | |
129 if (size == 0) | |
130 return -1; | |
131 int index = 0; | |
132 int counter = 0; | |
133 boolean isInTag = false; | |
134 boolean success = false; | |
135 while (!success) { | |
136 if (counter > size) | |
137 return -1; | |
138 char c = compositesCharsDictionarized.charAt(counter); | |
139 if (c == '<') | |
140 isInTag = true; | |
141 if (! isInTag) | |
142 index++; | |
143 if (index == indexComplexElemCompositesCharsWithMarks) | |
144 success = true; | |
145 if (c == '>') | |
146 isInTag = false; | |
147 counter++; | |
148 } | |
149 return counter + 1; | |
150 } | |
151 | |
152 private void write(String outStr) throws SAXException { | |
153 outputXmlFragment += outStr; | |
154 } | |
155 | |
10
59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
156 /** |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
157 * Puts a zwsp between two ideographic characters (e.g. in CJK Unified Ideographs) |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
158 * @param str |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
159 * @return |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
160 */ |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
161 private String zwsp(String str) { |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
162 // based on Unicode 3.2 |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
163 String ideographic = "[\u3300-\u33ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"; |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
164 String regex = "(" + ideographic + ")(" + ideographic + ")"; |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
165 String retStr = str.replaceAll(regex, "$1\u200b$2"); |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
166 retStr = retStr.replaceAll(regex, "$1\u200b$2"); |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
167 return retStr; |
59ff47d1e237
TEI Unterst?tzung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
168 } |
59ff47d1e237
TEI Unterstützung, Fehlerbehebungen, externe Objekte
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
6
diff
changeset
|
169 |
0 | 170 private class Element { |
171 private int type; | |
172 private String name; | |
173 private String xmlnsString; | |
174 private String attrString; | |
175 private String value; | |
176 private String lang; // normalerweise mit dem Wert aus dem Attribut xml:lang belegt bzw. mit dem aus dem Vaterknoten wererbten xml:lang-Wert | |
177 private ArrayList<Element> composites; | |
178 | |
/** Creates a complex element (a tag that may carry attributes and children). */
private Element(String name) {
  this(name, ELEMENT_TYPE_COMPLEX);
}
183 | |
/** Creates a node of the given kind (ELEMENT_TYPE_CHARACTERS or ELEMENT_TYPE_COMPLEX). */
private Element(String name, int type) {
  this.name = name;
  this.type = type;
}
188 | |
189 private boolean isComplex() { | |
190 boolean isComplex = false; | |
191 if (type == ELEMENT_TYPE_COMPLEX) | |
192 isComplex = true; | |
193 return isComplex; | |
194 } | |
195 | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
196 /** |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
197 * feel free to add/remove some element names |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
198 * @return true if element is a word delimiter element else false |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
199 */ |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
200 private boolean isWordDelimiterElement() { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
201 boolean isWordDelimiterElement = true; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
202 if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor")) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
203 isWordDelimiterElement = false; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
204 return isWordDelimiterElement; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
205 } |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
206 |
0 | 207 private String toXmlString() throws SAXException { |
208 String retString = ""; | |
209 String elemLanguage = language; // default value for the document/page | |
210 if (lang != null) | |
211 elemLanguage = lang; // der Wert des Elements falls vorhanden | |
212 // write this element | |
213 if (! isComplex()) { | |
214 retString += value; | |
215 } else { | |
216 String xmlNsString = this.xmlnsString; | |
217 if (xmlNsString == null || xmlNsString.equals("")) { | |
218 retString = retString + "<" + name + attrString + ">"; | |
219 } else { | |
220 retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; | |
221 } | |
222 if (composites != null) { | |
223 String compositesChars = ""; | |
224 String compositesCharsWithMarks = ""; | |
225 ArrayList<Element> complexElements = new ArrayList<Element>(); | |
226 for (int i=0; i<composites.size(); i++) { | |
227 Element composite = composites.get(i); | |
228 if (! composite.isComplex()) { | |
229 if (composite.value != null && ! composite.value.equals("")) { | |
230 String compositeValueStr = composite.value; | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
231 compositesChars = compositesChars + compositeValueStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
232 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; |
0 | 233 } |
234 } else { | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
235 if (! composite.isWordDelimiterElement()) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
236 compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL; // add a special symbol at the position of the "not word delimiter element" (e.g. line break) |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
237 } |
0 | 238 complexElements.add(composite); |
239 compositesCharsWithMarks += MARK; | |
240 } | |
241 } | |
242 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
243 compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
244 compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); |
0 | 245 if (complexElements.size() > 0) { |
246 for (int i=0; i<complexElements.size(); i++) { | |
247 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK); | |
248 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks); | |
249 Element complexElem = complexElements.get(i); | |
250 String complexElementStr = complexElem.toXmlString(); | |
251 String firstPiece = ""; | |
252 if (indexComplexElemCompositesCharsDictionarized > 0) { | |
253 firstPiece = compositesCharsDictionarized.substring(0, indexComplexElemCompositesCharsDictionarized - 1); | |
254 compositesCharsDictionarized = compositesCharsDictionarized.substring(indexComplexElemCompositesCharsDictionarized - 1); | |
255 } | |
256 retString = retString + firstPiece + complexElementStr; | |
257 compositesCharsWithMarks = compositesCharsWithMarks.substring(indexComplexElemCompositesCharsWithMarks + MARK_SIZE); | |
258 } | |
259 retString = retString + compositesCharsDictionarized; // last one must also be added | |
260 } else { | |
261 retString = retString + compositesCharsDictionarized; // last one must also be added | |
262 } | |
263 } | |
264 retString = retString + "</" + name + ">"; | |
265 } | |
266 return retString; | |
267 } | |
268 | |
269 private String characters2DictWords(String charactersStrDeresolved, String language) throws SAXException { | |
270 String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved); | |
271 String retStr = ""; | |
272 try { | |
273 MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language); | |
274 ArrayList<Token> wordTokens = dictionarizerAnalyzer.getToken(charactersStr); | |
275 int endPos = 0; | |
276 for (int i=0; i < wordTokens.size(); i++) { | |
277 Token wordToken = wordTokens.get(i); | |
278 int startPos = wordToken.startOffset(); | |
279 String beforeStr = charactersStr.substring(endPos, startPos); | |
280 endPos = wordToken.endOffset(); | |
281 String wordStr = charactersStr.substring(startPos, endPos); | |
282 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); | |
283 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr); | |
284 String wordTokenText = wordToken.termText(); | |
285 LexHandler lexHandler = LexHandler.getInstance(); | |
286 // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form) | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
287 String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
288 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false); |
0 | 289 if (lexEntryKeys != null) { |
290 String lexForms = ""; | |
291 for (int j=0; j<lexEntryKeys.size(); j++) { | |
292 String lexEntryKey = lexEntryKeys.get(j); | |
293 lexForms = lexForms + lexEntryKey + " "; | |
294 } | |
295 lexForms = lexForms.substring(0, lexForms.length() - 1); | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
296 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenTextWithoutSpecialSymbols + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>"; |
0 | 297 } else { |
298 retStr = retStr + beforeStrDeresolved + wordStrDeresolved; | |
299 } | |
300 } | |
301 String lastAfterStr = charactersStr.substring(endPos); | |
302 String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr); | |
303 retStr = retStr + lastAfterStrDeresolved; | |
304 } catch (ApplicationException e) { | |
305 throw new SAXException(e); | |
306 } | |
307 return retStr; | |
308 } | |
6
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
309 |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
310 private String removeSpecialSymbols(String inputStr) { |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
311 String retStr = inputStr.replaceAll(" ", ""); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
312 retStr = retStr.replaceAll("\n", ""); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
313 retStr = retStr.replaceAll("-", ""); |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
314 return retStr; |
2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
0
diff
changeset
|
315 } |
0 | 316 } |
317 } |