Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | |
2 | |
3 import java.io.StringReader; | |
4 import java.util.ArrayList; | |
5 import java.util.Collections; | |
6 import java.util.Hashtable; | |
7 | |
8 import org.xml.sax.*; | |
9 | |
10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
11 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | |
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
15 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; | |
16 | |
17 public class XmlTokenizerContentHandler implements ContentHandler { | |
18 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element | |
19 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | |
20 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); | |
21 private static int ELEMENT_TYPE_CHARACTERS = 1; | |
22 private static int ELEMENT_TYPE_COMPLEX = 2; | |
23 private String[] normalizeFunctions = {}; // default: without normalize functions | |
24 private String[] nwbElements = {}; // non word breaking elements, default: these elements | |
25 private String[] stopElements = {}; // default: no stop elements | |
26 private String[] outputOptions = {}; | |
27 private String xmlnsString = ""; | |
28 private String language; | |
29 private String outputXmlFragment = ""; | |
30 private Element rootElement; | |
31 private Element currentElement; | |
32 private ArrayList<Element> elementQueue; | |
33 | |
34 public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { | |
35 if (normalizeFunctions == null) { | |
36 String[] emptyFunctions = {}; | |
37 this.normalizeFunctions = emptyFunctions; | |
38 } else { | |
39 this.normalizeFunctions = normalizeFunctions; | |
40 } | |
41 this.language = language; | |
42 } | |
43 | |
44 public void setNWBElements(String[] nwbElements) { | |
45 this.nwbElements = nwbElements; | |
46 } | |
47 | |
48 public void setStopElements(String[] stopElements) { | |
49 this.stopElements = stopElements; | |
50 } | |
51 | |
52 public void setOutputOptions(String[] outputOptions) { | |
53 this.outputOptions = outputOptions; | |
54 } | |
55 | |
56 public String getXmlFragment() { | |
57 return outputXmlFragment; | |
58 } | |
59 | |
60 public void startDocument() throws SAXException { | |
61 } | |
62 | |
63 public void endDocument() throws SAXException { | |
64 try { | |
65 String rootElemToStr = rootElement.toXmlString(); | |
66 write(rootElemToStr); | |
67 write("\n"); | |
68 } catch (NullPointerException e) { | |
69 throw new SAXException(e); | |
70 } | |
71 } | |
72 | |
73 public void characters(char[] c, int start, int length) throws SAXException { | |
74 char[] cCopy = new char[length]; | |
75 System.arraycopy(c, start, cCopy, 0, length); | |
76 String charactersStr = String.valueOf(cCopy); | |
77 if (charactersStr != null && ! charactersStr.equals("")) { | |
78 if (currentElement != null) { | |
79 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); | |
80 charElement.value = StringUtils.deresolveXmlEntities(charactersStr); | |
81 if (currentElement.composites == null) | |
82 currentElement.composites = new ArrayList<Element>(); | |
83 currentElement.composites.add(charElement); | |
84 } | |
85 } | |
86 } | |
87 | |
88 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { | |
89 } | |
90 | |
91 public void processingInstruction(String target, String data) throws SAXException { | |
92 } | |
93 | |
94 public void setDocumentLocator(Locator locator) { | |
95 } | |
96 | |
97 public void startPrefixMapping(String prefix, String uri) throws SAXException { | |
98 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; | |
99 if (prefix != null && prefix.equals("")) | |
100 xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; | |
101 } | |
102 | |
103 public void endPrefixMapping(String prefix) throws SAXException { | |
104 } | |
105 | |
106 public void skippedEntity(String name) throws SAXException { | |
107 } | |
108 | |
109 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { | |
110 if (elementQueue == null) | |
111 elementQueue = new ArrayList<Element>(); | |
112 Element newElement = new Element(name); // element of type: complex | |
113 if (currentElement != null) { | |
114 if (currentElement.composites == null) | |
115 currentElement.composites = new ArrayList<Element>(); | |
116 if (currentElement.lang != null) | |
117 newElement.lang = currentElement.lang; // language is inherited to childs | |
118 currentElement.composites.add(newElement); | |
119 } | |
120 currentElement = newElement; | |
121 int attrSize = attrs.getLength(); | |
122 String attrString = ""; | |
123 for (int i=0; i<attrSize; i++) { | |
124 String attrQName = attrs.getQName(i); | |
125 String attrValue = attrs.getValue(i); | |
126 attrValue = StringUtils.forXML(attrValue); | |
127 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; | |
128 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) | |
129 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father | |
130 } | |
131 currentElement.attrString = attrString; | |
132 if (! xmlnsString.equals("")) { | |
133 currentElement.xmlnsString = xmlnsString; | |
134 } | |
135 xmlnsString = ""; | |
136 elementQueue.add(currentElement); | |
137 // only the first element is the root element | |
138 if(rootElement == null) | |
139 rootElement = currentElement; | |
140 } | |
141 | |
142 public void endElement(String uri, String localName, String name) throws SAXException { | |
143 if (elementQueue != null && elementQueue.size() > 0) { | |
144 int lastIndex = elementQueue.size() - 1; | |
145 elementQueue.remove(lastIndex); | |
146 } | |
147 if (elementQueue != null && elementQueue.size() > 0) { | |
148 int lastIndex = elementQueue.size() - 1; | |
149 currentElement = elementQueue.get(lastIndex); | |
150 } else { | |
151 currentElement = null; | |
152 } | |
153 } | |
154 | |
155 private boolean withForms() { | |
156 boolean result = false; | |
157 for (int i=0; i< outputOptions.length; i++) { | |
158 String function = outputOptions[i]; | |
159 if (function.equals("withForms")) | |
160 return true; | |
161 } | |
162 return result; | |
163 } | |
164 | |
165 private boolean withLemmas() { | |
166 boolean result = false; | |
167 for (int i=0; i< outputOptions.length; i++) { | |
168 String function = outputOptions[i]; | |
169 if (function.equals("withLemmas")) | |
170 return true; | |
171 } | |
172 return result; | |
173 } | |
174 | |
175 private void write(String outStr) throws SAXException { | |
176 outputXmlFragment += outStr; | |
177 } | |
178 | |
179 private class Element { | |
180 private int type; | |
181 private String name; | |
182 private String xmlnsString; | |
183 private String attrString; | |
184 private String value; | |
185 private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node | |
186 private ArrayList<Element> composites; | |
187 | |
188 private Element(String name) { | |
189 this.type = ELEMENT_TYPE_COMPLEX; | |
190 this.name = name; | |
191 } | |
192 | |
193 private Element(String name, int type) { | |
194 this.type = type; | |
195 this.name = name; | |
196 } | |
197 | |
198 private boolean isComplex() { | |
199 boolean isComplex = false; | |
200 if (type == ELEMENT_TYPE_COMPLEX) | |
201 isComplex = true; | |
202 return isComplex; | |
203 } | |
204 | |
205 private boolean isWordDelimiterElement() { | |
206 boolean isWordDelimiterElement = true; | |
207 for (int i=0; i<nwbElements.length; i++) { | |
208 String nwbElementName = nwbElements[i]; | |
209 if (name.equals(nwbElementName)) { | |
210 isWordDelimiterElement = false; | |
211 break; | |
212 } | |
213 } | |
214 return isWordDelimiterElement; | |
215 } | |
216 | |
217 private boolean isStopElement() { | |
218 boolean isStopElement = false; | |
219 for (int i=0; i<stopElements.length; i++) { | |
220 String stopElementName = stopElements[i]; | |
221 if (name.equals(stopElementName)) { | |
222 isStopElement = true; | |
223 break; | |
224 } | |
225 } | |
226 return isStopElement; | |
227 } | |
228 | |
229 private String toXmlString() throws SAXException { | |
230 String retString = ""; | |
231 String elemLanguage = language; // default value for the document/page | |
232 if (lang != null) | |
233 elemLanguage = lang; // value of the element if available | |
234 // write this element | |
235 if (! isComplex()) { | |
236 retString += value; | |
237 } else { | |
238 String xmlNsString = this.xmlnsString; | |
239 if (xmlNsString == null || xmlNsString.equals("")) { | |
240 retString = retString + "<" + name + attrString + ">"; | |
241 } else { | |
242 retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; | |
243 } | |
244 if (composites != null) { | |
245 String compositesCharsWithMarks = ""; | |
246 ArrayList<Element> complexElements = new ArrayList<Element>(); | |
247 for (int i=0; i<composites.size(); i++) { | |
248 Element composite = composites.get(i); | |
249 if (! composite.isComplex()) { | |
250 if (composite.value != null && ! composite.value.equals("")) { | |
251 String compositeValueStr = composite.value; | |
252 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. | |
253 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank | |
254 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; | |
255 } | |
256 } else { | |
257 if (! composite.isWordDelimiterElement()) { | |
258 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) | |
259 } else { | |
260 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) | |
261 } | |
262 complexElements.add(composite); | |
263 } | |
264 } | |
265 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") | |
266 String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); | |
267 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values | |
268 if (complexElements.size() > 0) { | |
269 for (int i=0; i<complexElements.size(); i++) { | |
270 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); | |
271 Element complexElem = complexElements.get(i); | |
272 String complexElementStr = complexElem.toXmlString(); | |
273 String firstPiece = ""; | |
274 if (indexComplexElemCompositesCharsWithMarks > 0) { | |
275 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); | |
276 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); | |
277 } | |
278 retString = retString + firstPiece + complexElementStr; | |
279 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); | |
280 } | |
281 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added | |
282 } else { | |
283 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added | |
284 } | |
285 } | |
286 retString = retString + "</" + name + ">"; | |
287 } | |
288 return retString; | |
289 } | |
290 | |
291 private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { | |
292 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); | |
293 String retStr = ""; | |
294 try { | |
295 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); | |
296 tokenizer.setLanguage(language); | |
297 tokenizer.setNormFunctions(normalizeFunctions); | |
298 ArrayList<Token> tokens = tokenizer.getTokens(); | |
299 int endPos = 0; | |
300 for (int i=0; i < tokens.size(); i++) { | |
301 Token token = tokens.get(i); | |
302 String wordForm = token.getContent(); | |
303 int startPos = token.getStart(); | |
304 String beforeStr = charactersStr.substring(endPos, startPos); | |
305 endPos = token.getEnd(); | |
306 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); | |
307 String origWordForm = charactersStr.substring(startPos, endPos); | |
308 String wordTag = insertWordTags(wordForm, language, origWordForm); | |
309 retStr = retStr + beforeStrDeresolved + wordTag; | |
310 } | |
311 String lastAfterStr = charactersStr.substring(endPos); | |
312 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); | |
313 retStr = retStr + lastAfterStrDeresolved; | |
314 } catch (ApplicationException e) { | |
315 throw new SAXException(e); | |
316 } | |
317 return retStr; | |
318 } | |
319 | |
320 private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { | |
321 String wordTag = null; | |
322 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) | |
323 return origWordForm; | |
324 if (isStopElement()) | |
325 return origWordForm; | |
326 wordForm = removeSpecialSymbols(wordForm); | |
327 wordForm = wordForm.toLowerCase(); | |
328 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); | |
329 ArrayList<Lemma> lemmas = null; | |
330 if (withForms() || withLemmas()) { | |
331 LexHandler lexHandler = LexHandler.getInstance(); | |
332 lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); | |
333 } | |
334 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); | |
335 return wordTag; | |
336 } | |
337 | |
338 /** | |
339 * | |
340 * @param origWordToken could contain nwd marks | |
341 * @param wordForm contains no nwd marks | |
342 * @param language | |
343 * @param origWordFormNormalized | |
344 * @param lemmas | |
345 * @return for each substring between nwd marks create a word tag | |
346 */ | |
347 private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { | |
348 if (origWordToken.isEmpty()) | |
349 return origWordToken; | |
350 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) | |
351 return COMPLEX_ELEMENT_NWD_MARK; | |
352 String retWordTags = ""; | |
353 String origWordTokenTmp = origWordToken; | |
354 while (! origWordTokenTmp.isEmpty()) { | |
355 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark | |
356 origWordTokenTmp = origWordTokenTmp.substring(1); | |
357 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; | |
358 } else { | |
359 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); | |
360 if (indexUpToNWD != -1) { // not end of string reached | |
361 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); | |
362 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); | |
363 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; | |
364 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); | |
365 } else { // end of string reached | |
366 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); | |
367 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); | |
368 retWordTags = retWordTags + origWordTokenFragmentWithTags; | |
369 origWordTokenTmp = ""; // finente | |
370 } | |
371 } | |
372 } | |
373 return retWordTags; | |
374 } | |
375 | |
376 private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { | |
377 if (origWordForm == null || origWordForm.isEmpty()) | |
378 return ""; | |
379 String langISOCode = Language.getInstance().getISO639Code(language); | |
380 String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; | |
381 if (origWordFormNormalized != null) | |
382 retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; | |
383 if (lemmas != null) { | |
384 String lemmasStr = ""; | |
385 String formsStr = ""; | |
386 Collections.sort(lemmas); | |
387 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); | |
388 for (int i=0; i < lemmas.size(); i++) { | |
389 Lemma lemma = lemmas.get(i); | |
390 ArrayList<Form> lemmaForms = lemma.getFormsList(); | |
391 for (int j=0; j < lemmaForms.size(); j++) { | |
392 Form form = lemmaForms.get(j); | |
393 formsHashtable.put(form.getFormName(), form); | |
394 } | |
395 String lemmaName = lemma.getLemmaName(); | |
396 lemmasStr = lemmasStr + lemmaName + " "; | |
397 } | |
398 ArrayList<Form> forms = new ArrayList<Form>(); | |
399 forms.addAll(formsHashtable.values()); | |
400 Collections.sort(forms); | |
401 for (int i=0; i < forms.size(); i++) { | |
402 Form form = forms.get(i); | |
403 String formName = form.getFormName(); | |
404 formName = StringUtils.forXML(formName); | |
405 formsStr = formsStr + formName + " "; | |
406 } | |
407 if (formsStr.endsWith(" ")) | |
408 formsStr = formsStr.substring(0, formsStr.length() - 1); | |
409 if (lemmasStr.endsWith(" ")) | |
410 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); | |
411 if (withForms()) | |
412 retStr = retStr + " forms=\"" + formsStr + "\""; | |
413 if (withLemmas()) | |
414 retStr = retStr + " lemmas=\"" + lemmasStr + "\""; | |
415 } | |
416 retStr = retStr + ">" + origWordForm + "</w>"; | |
417 return retStr; | |
418 } | |
419 | |
420 private String removeSpecialSymbols(String inputStr) { | |
421 String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); | |
422 return retStr; | |
423 } | |
424 | |
425 } | |
426 } |