comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 7d6d969b10cf
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2 2
3 import java.io.StringReader; 3 import java.io.StringReader;
4 import java.util.ArrayList; 4 import java.util.ArrayList;
5 import java.util.Collections; 5 import java.util.Collections;
6 import java.util.Enumeration;
6 import java.util.Hashtable; 7 import java.util.Hashtable;
7 8
8 import org.xml.sax.*; 9 import org.xml.sax.*;
9 10
10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 11 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
19 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element 20 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element
20 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element 21 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element
21 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); 22 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
22 private static int ELEMENT_TYPE_CHARACTERS = 1; 23 private static int ELEMENT_TYPE_CHARACTERS = 1;
23 private static int ELEMENT_TYPE_COMPLEX = 2; 24 private static int ELEMENT_TYPE_COMPLEX = 2;
24 private String[] normalizeFunctions = {}; // default: without normalize functions 25 private String docId;
25 private String[] nwbElements = {}; // non word breaking elements, default: these elements 26 private String language;
27 private String[] nwbElements = {}; // non word breaking elements, default: no nwb elements
26 private String[] stopElements = {}; // default: no stop elements 28 private String[] stopElements = {}; // default: no stop elements
29 private String outputFormat = "xml"; // default: xml
27 private String[] outputOptions = {}; 30 private String[] outputOptions = {};
31 private boolean withForms = false;
32 private boolean withLemmas = false;
33 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms
34 private String[] normFunctions = {}; // default: no norm function
35 private boolean useNormFunction = false;
36 private boolean useRegFunction = false;
28 private String xmlnsString = ""; 37 private String xmlnsString = "";
29 private String language; 38 private StringBuilder result = new StringBuilder();
30 private String outputXmlFragment = ""; 39 private ArrayList<Token> resultTokens = new ArrayList<Token>();
40 private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>();
31 private Element rootElement; 41 private Element rootElement;
32 private Element currentElement; 42 private Element currentElement;
43 private int currentPosition = 0;
44 private int currentPageNumber = 0;
45 private int currentLineNumber = 0;
46 private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>();
47 private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>();
33 private ArrayList<Element> elementQueue; 48 private ArrayList<Element> elementQueue;
34 49
35 public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { 50 public XmlTokenizerContentHandler(String language) throws ApplicationException {
36 if (normalizeFunctions == null) {
37 String[] emptyFunctions = {};
38 this.normalizeFunctions = emptyFunctions;
39 } else {
40 this.normalizeFunctions = normalizeFunctions;
41 }
42 this.language = language; 51 this.language = language;
52 }
53
54 public void setDocIdentifier(String docId) {
55 this.docId = docId;
43 } 56 }
44 57
45 public void setNWBElements(String[] nwbElements) { 58 public void setNWBElements(String[] nwbElements) {
46 this.nwbElements = nwbElements; 59 this.nwbElements = nwbElements;
47 } 60 }
48 61
49 public void setStopElements(String[] stopElements) { 62 public void setStopElements(String[] stopElements) {
50 this.stopElements = stopElements; 63 this.stopElements = stopElements;
51 } 64 }
52 65
66 public void setHighlightTerms(String[] highlightTerms) {
67 this.highlightTerms = highlightTerms;
68 }
69
70 public void setNormFunctions(String[] normFunctions) {
71 this.normFunctions = normFunctions;
72 if (this.normFunctions != null) {
73 for (int i=0; i< this.normFunctions.length; i++) {
74 String function = normFunctions[i];
75 if (function.equals("norm"))
76 this.useNormFunction = true;
77 else if (function.equals("reg"))
78 this.useRegFunction = true;
79 }
80 }
81 }
82
83 public void setOutputFormat(String outputFormat) {
84 this.outputFormat = outputFormat;
85 }
86
53 public void setOutputOptions(String[] outputOptions) { 87 public void setOutputOptions(String[] outputOptions) {
54 this.outputOptions = outputOptions; 88 this.outputOptions = outputOptions;
55 } 89 for (int i=0; i< this.outputOptions.length; i++) {
56 90 String function = outputOptions[i];
57 public String getXmlFragment() { 91 if (function.equals("withForms"))
58 return outputXmlFragment; 92 this.withForms = true;
93 else if (function.equals("withLemmas"))
94 this.withLemmas = true;
95 }
96 }
97
98 public String getResultString() {
99 return result.toString();
100 }
101
102 public ArrayList<Token> getResultTokens() {
103 return resultTokens;
59 } 104 }
60 105
106 public ArrayList<Element> getElements(String elementName) {
107 return elements.get(elementName);
108 }
109
110 public int getPageCount() {
111 return currentPageNumber;
112 }
113
61 public void startDocument() throws SAXException { 114 public void startDocument() throws SAXException {
62 } 115 }
63 116
64 public void endDocument() throws SAXException { 117 public void endDocument() throws SAXException {
65 try { 118 try {
66 String rootElemToStr = rootElement.toXmlString(); 119 String rootElemToStr = rootElement.buildString();
67 write(rootElemToStr); 120 write(rootElemToStr);
68 write("\n"); 121 write("\n");
69 } catch (NullPointerException e) { 122 } catch (NullPointerException e) {
123 throw new SAXException(e);
124 } catch (ApplicationException e) {
70 throw new SAXException(e); 125 throw new SAXException(e);
71 } 126 }
72 } 127 }
73 128
74 public void characters(char[] c, int start, int length) throws SAXException { 129 public void characters(char[] c, int start, int length) throws SAXException {
76 System.arraycopy(c, start, cCopy, 0, length); 131 System.arraycopy(c, start, cCopy, 0, length);
77 String charactersStr = String.valueOf(cCopy); 132 String charactersStr = String.valueOf(cCopy);
78 if (charactersStr != null && ! charactersStr.equals("")) { 133 if (charactersStr != null && ! charactersStr.equals("")) {
79 if (currentElement != null) { 134 if (currentElement != null) {
80 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); 135 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
136 charElement.pageNumber = currentPageNumber;
81 charElement.value = StringUtils.deresolveXmlEntities(charactersStr); 137 charElement.value = StringUtils.deresolveXmlEntities(charactersStr);
82 if (currentElement.composites == null) 138 if (currentElement.composites == null)
83 currentElement.composites = new ArrayList<Element>(); 139 currentElement.composites = new ArrayList<Element>();
84 currentElement.composites.add(charElement); 140 currentElement.composites.add(charElement);
85 } 141 }
94 150
95 public void setDocumentLocator(Locator locator) { 151 public void setDocumentLocator(Locator locator) {
96 } 152 }
97 153
98 public void startPrefixMapping(String prefix, String uri) throws SAXException { 154 public void startPrefixMapping(String prefix, String uri) throws SAXException {
99 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
100 if (prefix != null && prefix.equals("")) 155 if (prefix != null && prefix.equals(""))
101 xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; 156 xmlnsString += "xmlns" + "=\"" + uri + "\" ";
157 else
158 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
102 } 159 }
103 160
104 public void endPrefixMapping(String prefix) throws SAXException { 161 public void endPrefixMapping(String prefix) throws SAXException {
105 } 162 }
106 163
115 if (currentElement.composites == null) 172 if (currentElement.composites == null)
116 currentElement.composites = new ArrayList<Element>(); 173 currentElement.composites = new ArrayList<Element>();
117 if (currentElement.lang != null) 174 if (currentElement.lang != null)
118 newElement.lang = currentElement.lang; // language is inherited to childs 175 newElement.lang = currentElement.lang; // language is inherited to childs
119 currentElement.composites.add(newElement); 176 currentElement.composites.add(newElement);
177 newElement.parent = currentElement;
120 } 178 }
121 currentElement = newElement; 179 currentElement = newElement;
180 if (localName != null && localName.equals("pb")) {
181 currentPageNumber++;
182 setCurrentPagePosition(localName, 0);
183 }
184 currentElement.pageNumber = currentPageNumber;
185 if (localName != null && localName.equals("lb")) {
186 currentLineNumber++;
187 }
188 currentElement.lineNumber = currentLineNumber;
189 currentPosition++;
190 currentElement.docPosition = currentPosition;
191 int newElemPosition = incrementCurrentPosition(localName);
192 currentElement.position = newElemPosition;
193
194 currentElement.elemPosition = getElementPosition(currentElement);
195 Element parent = currentElement.parent;
196 if (parent == null) {
197 currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
198 } else {
199 currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
200 }
201 int newElemPagePosition = incrementCurrentPagePosition(localName);
202 currentElement.pagePosition = newElemPagePosition;
122 int attrSize = attrs.getLength(); 203 int attrSize = attrs.getLength();
123 String attrString = ""; 204 String attrString = "";
124 for (int i=0; i<attrSize; i++) { 205 for (int i=0; i<attrSize; i++) {
125 String attrQName = attrs.getQName(i); 206 String attrQName = attrs.getQName(i);
126 String attrValue = attrs.getValue(i); 207 String attrValue = attrs.getValue(i);
127 attrValue = StringUtils.forXML(attrValue); 208 attrValue = StringUtils.forXML(attrValue);
128 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; 209 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
129 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) 210 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) {
130 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father 211 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father
212 }
213 if (attrQName != null && (attrQName.toLowerCase().equals("xml:id") || attrQName.toLowerCase().equals("id"))) {
214 currentElement.xmlId = attrValue;
215 }
131 } 216 }
132 currentElement.attrString = attrString; 217 currentElement.attrString = attrString;
133 if (! xmlnsString.equals("")) { 218 if (! xmlnsString.equals("")) {
134 currentElement.xmlnsString = xmlnsString; 219 currentElement.xmlnsString = xmlnsString;
135 } 220 }
151 } else { 236 } else {
152 currentElement = null; 237 currentElement = null;
153 } 238 }
154 } 239 }
155 240
156 private boolean withForms() { 241 private int incrementCurrentPosition(String elemName) {
242 Integer currentElemPos = currentPositions.get(elemName);
243 if (currentElemPos == null) {
244 currentElemPos = new Integer(0);
245 }
246 currentElemPos++;
247 currentPositions.put(elemName, currentElemPos);
248 return currentElemPos.intValue();
249 }
250
251 private int getElementPosition(Element elem) {
252 int pos = 0;
253 Element parent = elem.parent;
254 if (parent == null) {
255 pos = 1;
256 } else {
257 pos = 0;
258 ArrayList<Element> composites = parent.composites;
259 if (composites != null) {
260 for (int i=0; i<composites.size(); i++) {
261 Element e = composites.get(i);
262 if (e.isComplex() && e.name.equals(elem.name)) {
263 pos++;
264 }
265 if (e == elem)
266 break;
267 }
268 } else {
269 pos = 1;
270 }
271 }
272 return pos;
273 }
274
275 private int incrementCurrentPagePosition(String elemName) {
276 Integer currentElemPagePos = currentPagePositions.get(elemName);
277 if (currentElemPagePos == null) {
278 currentElemPagePos = new Integer(0);
279 }
280 currentElemPagePos++;
281 currentPagePositions.put(elemName, currentElemPagePos);
282 return currentElemPagePos.intValue();
283 }
284
285 private void setCurrentPagePosition(String elemName, int pos) {
286 Integer newPagePosition = new Integer(pos);
287 Enumeration<String> elemKeys = currentPagePositions.keys();
288 while (elemKeys.hasMoreElements()) {
289 String elemKey = elemKeys.nextElement();
290 currentPagePositions.put(elemKey, newPagePosition);
291 }
292 }
293
294 private boolean isHighlightTerm(String term) {
295 if (term == null)
296 return false;
157 boolean result = false; 297 boolean result = false;
158 for (int i=0; i< outputOptions.length; i++) { 298 for (int i=0; i< highlightTerms.length; i++) {
159 String function = outputOptions[i]; 299 String t = highlightTerms[i].toLowerCase();
160 if (function.equals("withForms")) 300 String termLowerCase = term.toLowerCase();
301 if (t.equals(termLowerCase))
161 return true; 302 return true;
162 } 303 }
163 return result; 304 return result;
164 } 305 }
165 306
166 private boolean withLemmas() { 307 private boolean isHighlightTerm(String[] terms) {
308 if (terms == null)
309 return false;
167 boolean result = false; 310 boolean result = false;
168 for (int i=0; i< outputOptions.length; i++) { 311 for (int i=0; i< highlightTerms.length; i++) {
169 String function = outputOptions[i]; 312 String t = highlightTerms[i].toLowerCase();
170 if (function.equals("withLemmas")) 313 for (int j=0; j<terms.length; j++) {
171 return true; 314 String termLowerCase = terms[j].toLowerCase();
315 if (t.equals(termLowerCase))
316 return true;
317 }
172 } 318 }
173 return result; 319 return result;
174 } 320 }
175 321
176 private void write(String outStr) throws SAXException { 322 private void write(String outStr) throws SAXException {
177 outputXmlFragment += outStr; 323 result.append(outStr);
178 } 324 }
179 325
180 private class Element { 326 public class Element implements Comparable<Element> {
181 private int type; 327 private int type;
182 private String name; 328 public String name;
183 private String xmlnsString; 329 private String xmlnsString;
184 private String attrString; 330 private String attrString;
185 private String value; 331 private String value;
186 private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node 332 public String lang; // value of attribute xml:lang or the inherited xml:lang value of the father node
333 public String xmlId;
334 public String xpath;
335 public int pageNumber;
336 public int lineNumber;
337 public int docPosition; // absolute position in document
338 public int position; // position within all elements with this name
339 public int elemPosition; // position in element e.g. the 6 sentence in paragraph
340 public int pagePosition; // position in page
341 private ArrayList<Token> tokens = new ArrayList<Token>();
187 private ArrayList<Element> composites; 342 private ArrayList<Element> composites;
343 private Element parent;
344 private boolean isStopElement = false;
345 private boolean isWordDelimiterElement = true; // default: is word delimiter element
188 346
189 private Element(String name) { 347 private Element(String name) {
190 this.type = ELEMENT_TYPE_COMPLEX; 348 this.type = ELEMENT_TYPE_COMPLEX;
191 this.name = name; 349 setName(name);
192 } 350 }
193 351
194 private Element(String name, int type) { 352 private Element(String name, int type) {
195 this.type = type; 353 this.type = type;
354 setName(name);
355 }
356
357 private void setName(String name) {
196 this.name = name; 358 this.name = name;
197 } 359 for (int i=0; i<stopElements.length; i++) {
198 360 String stopElementName = stopElements[i];
199 private boolean isComplex() { 361 if (name.equals(stopElementName)) {
362 this.isStopElement = true;
363 break;
364 }
365 }
366 for (int i=0; i<nwbElements.length; i++) {
367 String nwbElementName = nwbElements[i];
368 if (name.equals(nwbElementName)) {
369 this.isWordDelimiterElement = false;
370 break;
371 }
372 }
373 }
374
375 public int compareTo(Element elem) {
376 return (new Integer(position)).compareTo(new Integer(elem.position));
377 }
378
379 private boolean isComplex() {
200 boolean isComplex = false; 380 boolean isComplex = false;
201 if (type == ELEMENT_TYPE_COMPLEX) 381 if (type == ELEMENT_TYPE_COMPLEX)
202 isComplex = true; 382 isComplex = true;
203 return isComplex; 383 return isComplex;
204 } 384 }
205 385
206 private boolean isWordDelimiterElement() { 386 public ArrayList<Token> getTokens() {
207 boolean isWordDelimiterElement = true; 387 ArrayList<Token> retTokens = new ArrayList<Token>();
208 for (int i=0; i<nwbElements.length; i++) { 388 if (isComplex()) {
209 String nwbElementName = nwbElements[i]; 389 if (composites != null) {
210 if (name.equals(nwbElementName)) { 390 for (int i=0; i<composites.size(); i++) {
211 isWordDelimiterElement = false; 391 Element elem = composites.get(i);
212 break; 392 if (elem.tokens != null)
213 } 393 retTokens.addAll(elem.tokens);
214 } 394 }
215 return isWordDelimiterElement; 395 }
216 } 396 }
217 397 if (tokens != null)
218 private boolean isStopElement() { 398 retTokens.addAll(tokens);
219 boolean isStopElement = false; 399 return retTokens;
220 for (int i=0; i<stopElements.length; i++) { 400 }
221 String stopElementName = stopElements[i]; 401
222 if (name.equals(stopElementName)) { 402 public String getTokensStr(String type) {
223 isStopElement = true; 403 ArrayList<Token> elementTokens = getTokens();
224 break; 404 String tokenStr = getTokensStr(type, elementTokens);
225 } 405 return tokenStr;
226 } 406 }
227 return isStopElement; 407
228 } 408 private String getTokensStr(String type, ArrayList<Token> tokens) {
229 409 StringBuilder tokenStr = new StringBuilder();
230 private String toXmlString() throws SAXException { 410 for (int j=0; j<tokens.size(); j++) {
231 String retString = ""; 411 Token token = tokens.get(j);
412 String content = null;
413 if (type.equals("orig"))
414 content = token.getContentOrig();
415 else if (type.equals("reg"))
416 content = token.getContentReg();
417 else if (type.equals("norm"))
418 content = token.getContentNorm();
419 else if (type.equals("morph"))
420 content = token.getContentMorph();
421 if (content != null)
422 tokenStr.append(content + " ");
423 }
424 return tokenStr.toString();
425 }
426
427 public String toXmlString() throws ApplicationException {
428 StringBuilder retStrBuilder = new StringBuilder();
429 if (! isComplex()) {
430 retStrBuilder.append(value);
431 } else {
432 String xmlNsString = this.xmlnsString;
433 if (xmlNsString == null || xmlNsString.equals("")) {
434 retStrBuilder.append("<" + name + attrString + ">");
435 } else {
436 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">");
437 }
438 if (composites != null) {
439 for (int i=0; i<composites.size(); i++) {
440 Element composite = composites.get(i);
441 if (! composite.isComplex()) {
442 if (composite.value != null && ! composite.value.equals("")) {
443 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank
444 retStrBuilder.append(compositeValueStr);
445 }
446 } else {
447 retStrBuilder.append(composite.toXmlString());
448 }
449 }
450 }
451 retStrBuilder.append("</" + name + ">");
452 }
453 return retStrBuilder.toString();
454 }
455
456 private String buildString() throws ApplicationException {
457 StringBuilder retStrBuilder = new StringBuilder();
232 String elemLanguage = language; // default value for the document/page 458 String elemLanguage = language; // default value for the document/page
233 if (lang != null) 459 if (lang != null)
234 elemLanguage = lang; // value of the element if available 460 elemLanguage = lang; // value of the element if available
235 // write this element 461 // write this element
236 if (! isComplex()) { 462 if (! isComplex()) {
237 retString += value; 463 retStrBuilder.append(value);
238 } else { 464 } else {
239 String xmlNsString = this.xmlnsString; 465 if (outputFormat != null && outputFormat.equals("xml")) {
240 if (xmlNsString == null || xmlNsString.equals("")) { 466 String xmlNsString = this.xmlnsString;
241 retString = retString + "<" + name + attrString + ">"; 467 if (xmlNsString == null || xmlNsString.equals("")) {
242 } else { 468 retStrBuilder.append("<" + name + attrString + ">");
243 retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; 469 } else {
470 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">");
471 }
472 } else { // outputFormat == string
473 // nothing
244 } 474 }
245 if (composites != null) { 475 if (composites != null) {
246 String compositesCharsWithMarks = ""; 476 StringBuilder compositesCharsWithMarks = new StringBuilder();
247 ArrayList<Element> complexElements = new ArrayList<Element>(); 477 ArrayList<Element> complexElements = new ArrayList<Element>();
248 for (int i=0; i<composites.size(); i++) { 478 for (int i=0; i<composites.size(); i++) {
249 Element composite = composites.get(i); 479 Element composite = composites.get(i);
250 if (! composite.isComplex()) { 480 if (! composite.isComplex()) {
251 if (composite.value != null && ! composite.value.equals("")) { 481 if (composite.value != null && ! composite.value.equals("")) {
252 String compositeValueStr = composite.value; 482 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank
253 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. 483 compositesCharsWithMarks.append(compositeValueStr);
254 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank
255 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
256 } 484 }
257 } else { 485 } else {
258 if (! composite.isWordDelimiterElement()) { 486 if (! composite.isWordDelimiterElement) {
259 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) 487 compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK); // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
260 } else { 488 } else {
261 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) 489 compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK); // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
262 } 490 }
263 complexElements.add(composite); 491 complexElements.add(composite);
264 } 492 }
265 } 493 }
266 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") 494 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
268 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values 496 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values
269 if (complexElements.size() > 0) { 497 if (complexElements.size() > 0) {
270 for (int i=0; i<complexElements.size(); i++) { 498 for (int i=0; i<complexElements.size(); i++) {
271 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); 499 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK);
272 Element complexElem = complexElements.get(i); 500 Element complexElem = complexElements.get(i);
273 String complexElementStr = complexElem.toXmlString(); 501 String complexElementStr = complexElem.buildString();
274 String firstPiece = ""; 502 String firstPiece = "";
275 if (indexComplexElemCompositesCharsWithMarks > 0) { 503 if (indexComplexElemCompositesCharsWithMarks > 0) {
276 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); 504 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks);
277 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); 505 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks);
278 } 506 }
279 retString = retString + firstPiece + complexElementStr; 507 retStrBuilder.append(firstPiece + complexElementStr);
280 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); 508 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE);
281 } 509 }
282 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added 510 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
283 } else { 511 } else {
284 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added 512 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
285 } 513 }
286 } 514 }
287 retString = retString + "</" + name + ">"; 515 if (outputFormat != null && outputFormat.equals("xml")) {
516 retStrBuilder.append("</" + name + ">");
517 } else { // outputFormat == string
518 // nothing
519 }
520 // put element into elements name hashtable
521 ArrayList<Element> elems = elements.get(name);
522 if (elems == null) {
523 elems = new ArrayList<Element>();
524 elements.put(name, elems);
525 }
526 elems.add(this);
288 } 527 }
289 return retString; 528 return retStrBuilder.toString();
290 } 529 }
291 530
292 private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { 531 private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException {
532 String charactersStrDeresolved = charactersStrDeresolvedBuilder.toString();
293 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); 533 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved);
294 String retStr = ""; 534 StringBuilder retStrBuilder = new StringBuilder();
295 try { 535 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr));
296 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); 536 tokenizer.setLanguage(language);
297 tokenizer.setLanguage(language); 537 String[] normFunction = {"norm"};
298 tokenizer.setNormFunctions(normalizeFunctions); 538 tokenizer.setNormFunctions(normFunction);
299 ArrayList<Token> tokens = tokenizer.getTokens(); 539 ArrayList<Token> tokens = tokenizer.getTokens();
300 int endPos = 0; 540 int endPos = 0;
301 for (int i=0; i < tokens.size(); i++) { 541 for (int i=0; i < tokens.size(); i++) {
302 Token token = tokens.get(i); 542 Token token = tokens.get(i);
303 String wordForm = token.getContent(); 543 int startPos = token.getStart();
304 int startPos = token.getStart(); 544 String beforeStr = charactersStr.substring(endPos, startPos);
305 String beforeStr = charactersStr.substring(endPos, startPos); 545 endPos = token.getEnd();
306 endPos = token.getEnd(); 546 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr);
307 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); 547 String origWordForm = charactersStr.substring(startPos, endPos);
308 String origWordForm = charactersStr.substring(startPos, endPos); 548 String wordTag = insertWordTags(token, language, origWordForm);
309 String wordTag = insertWordTags(wordForm, language, origWordForm); 549 if (outputFormat != null && outputFormat.equals("xml")) {
310 retStr = retStr + beforeStrDeresolved + wordTag; 550 retStrBuilder.append(beforeStrDeresolved + wordTag);
311 } 551 } else { // outputFormat == string
312 String lastAfterStr = charactersStr.substring(endPos); 552 String beforeStrDeresolvedToBlanks = toBlanks(beforeStrDeresolved);
313 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); 553 retStrBuilder.append(beforeStrDeresolvedToBlanks + wordTag);
314 retStr = retStr + lastAfterStrDeresolved; 554 }
315 } catch (ApplicationException e) { 555 }
316 throw new SAXException(e); 556 String lastAfterStr = charactersStr.substring(endPos);
317 } 557 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr);
318 return retStr; 558 if (outputFormat != null && outputFormat.equals("xml")) {
319 } 559 retStrBuilder.append(lastAfterStrDeresolved);
320 560 } else { // outputFormat == string
321 private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { 561 String lastAfterStrDeresolvedToBlanks = toBlanks(lastAfterStrDeresolved);
562 retStrBuilder.append(lastAfterStrDeresolvedToBlanks);
563 }
564 return retStrBuilder.toString();
565 }
566
567 private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException {
568 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) {
569 return origWordForm;
570 }
322 String wordTag = null; 571 String wordTag = null;
323 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) 572 token.setDocId(docId);
573 token.setLanguage(lang);
574 token.setPageNumber(pageNumber);
575 token.setLineNumber(lineNumber);
576 token.setElementPosition(position);
577 token.setElementPagePosition(pagePosition);
578 token.setElementName(name);
579 token.setXmlId(xmlId);
580 token.setXpath("xpath"); // TODO
581 if (name != null && name.equals("reg")) {
582 if (attrString != null && attrString.contains("norm=\"")) {
583 int regIndexBegin = attrString.indexOf("norm=\"");
584 int regIndexEnd = attrString.indexOf("\"", regIndexBegin + 7);
585 String reg = attrString.substring(regIndexBegin + 6, regIndexEnd);
586 token.setContentReg(reg);
587 String[] normFunction = {"norm"};
588 Normalizer normalizer = new Normalizer(normFunction, language);
589 String normStr = normalizer.normalize(reg);
590 token.setContentNorm(normStr);
591 }
592 }
593 if (language == null) {
594 token.setContentOrig(origWordForm); // TODO necessary ?
595 tokens.add(token);
596 resultTokens.add(token);
324 return origWordForm; 597 return origWordForm;
325 if (isStopElement()) 598 }
599 if (isStopElement && outputFormat != null && outputFormat.equals("xml"))
326 return origWordForm; 600 return origWordForm;
327 wordForm = removeSpecialSymbols(wordForm); 601 if (isStopElement && outputFormat != null && outputFormat.equals("string"))
328 wordForm = wordForm.toLowerCase(); 602 return toBlanks(origWordForm);
603 String wordFormNorm = token.getContentNorm();
329 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); 604 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm);
330 ArrayList<Lemma> lemmas = null; 605 ArrayList<Lemma> lemmas = null;
331 if (withForms() || withLemmas()) { 606 Boolean hasDctionaryEntries = null;
607 String lemmasStr = "";
608 if (withForms || withLemmas) {
332 LexHandler lexHandler = LexHandler.getInstance(); 609 LexHandler lexHandler = LexHandler.getInstance();
333 lemmas = lexHandler.getLemmas(wordForm, "form", language, Normalizer.NONE); 610 lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false); // Performance: needs 15 % of the indexing time
334 } 611 if (lemmas != null) {
335 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); 612 for (int i=0; i < lemmas.size(); i++) {
613 Lemma lemma = lemmas.get(i);
614 String lemmaName = lemma.getLemmaName();
615 lemmasStr = lemmasStr + lemmaName + " ";
616 }
617 }
618 lemmasStr = lemmasStr.trim();
619 token.setContentMorph(lemmasStr);
620 hasDctionaryEntries = false;
621 ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY); // Performance: needs 15 % of the indexing time
622 if (lexEntries != null)
623 hasDctionaryEntries = true;
624 }
625 if (outputFormat != null && outputFormat.equals("xml")) {
626 wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDctionaryEntries); // Performance: needs 10 % of the indexing time
627 String tokenWordForm = token.getContentOrig(); // word form is in contentOrig
628 if (useRegFunction)
629 tokenWordForm = token.getContentReg();
630 else if (useNormFunction)
631 tokenWordForm = token.getContentNorm();
632 else if (withLemmas)
633 tokenWordForm = token.getContentMorph();
634 boolean isHighlightTerm = false;
635 if (highlightTerms.length > 0 && ! withLemmas) {
636 isHighlightTerm = isHighlightTerm(tokenWordForm);
637 } else {
638 if (highlightTerms.length > 0 && lemmas != null) {
639 String[] lemmasArray = lemmasStr.split(" ");
640 isHighlightTerm = isHighlightTerm(lemmasArray);
641 }
642 }
643 if (isHighlightTerm) {
644 wordTag = "<hi>" + wordTag + "</hi>";
645 }
646 } else { // outputFormat == string
647 String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab
648 if (withLemmas) {
649 if (lemmas != null) {
650 String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved); // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr
651 wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr;
652 token.setContentMorph(lemmasStr);
653 } else {
654 wordTag = inWordFormWithoutSpecialSymbols;
655 }
656 } else {
657 wordTag = inWordFormWithoutSpecialSymbols;
658 }
659 tokens.add(token);
660 resultTokens.add(token);
661 }
336 return wordTag; 662 return wordTag;
337 } 663 }
338 664
665 private String removeSpecialSymbols(String inputStr) {
666 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen
667 return retStr;
668 }
669
339 /** 670 /**
340 * 671 *
341 * @param origWordToken could contain nwd marks 672 * @param origWordToken could contain nwd marks
342 * @param wordForm contains no nwd marks 673 * @param token
343 * @param language 674 * @param language
344 * @param origWordFormNormalized
345 * @param lemmas 675 * @param lemmas
346 * @return for each substring between nwd marks create a word tag 676 * @return for each substring between nwd marks create a word tag
347 */ 677 */
348 private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { 678 private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
349 if (origWordToken.isEmpty()) 679 if (origWordToken.isEmpty())
350 return origWordToken; 680 return origWordToken;
351 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) 681 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK))
352 return COMPLEX_ELEMENT_NWD_MARK; 682 return COMPLEX_ELEMENT_NWD_MARK;
353 String retWordTags = ""; 683 String retWordTags = "";
354 String origWordTokenTmp = origWordToken; 684 String origWordTokenTmp = origWordToken;
355 while (! origWordTokenTmp.isEmpty()) { 685 if (outputFormat != null && outputFormat.equals("xml")) {
356 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark 686 retWordTags = getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries);
357 origWordTokenTmp = origWordTokenTmp.substring(1); 687 /*
358 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; 688 while (! origWordTokenTmp.isEmpty()) {
359 } else { 689 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark
360 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); 690 origWordTokenTmp = origWordTokenTmp.substring(1);
361 if (indexUpToNWD != -1) { // not end of string reached 691 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK;
362 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); 692 } else {
363 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); 693 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK);
364 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; 694 if (indexUpToNWD != -1) { // not end of string reached
365 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); 695 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD);
366 } else { // end of string reached 696 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries);
367 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); 697 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK;
368 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); 698 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1);
369 retWordTags = retWordTags + origWordTokenFragmentWithTags; 699 } else { // end of string reached
370 origWordTokenTmp = ""; // finente 700 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length());
371 } 701 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries);
372 } 702 retWordTags = retWordTags + origWordTokenFragmentWithTags;
703 origWordTokenTmp = ""; // finente
704 }
705 }
706 }
707 */
708 } else {
709 // nothing
373 } 710 }
374 return retWordTags; 711 return retWordTags;
375 } 712 }
376 713
377 private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { 714 private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
378 if (origWordForm == null || origWordForm.isEmpty()) 715 if (origWordForm == null || origWordForm.isEmpty())
379 return ""; 716 return "";
717 String wordForm = token.getContentOrig(); // word form is in contentOrig
718 String regularizedWordForm = token.getContentReg();
719 String normalizedWordForm = token.getContentNorm();
380 String langISOCode = Language.getInstance().getISO639Code(language); 720 String langISOCode = Language.getInstance().getISO639Code(language);
381 String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; 721 StringBuilder retStrBuilder = new StringBuilder();
382 if (origWordFormNormalized != null) 722 retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\"");
383 retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; 723 if (regularizedWordForm != null)
724 retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\"");
725 if (normalizedWordForm != null)
726 retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\"");
384 if (lemmas != null) { 727 if (lemmas != null) {
385 String lemmasStr = ""; 728 String lemmasStr = "";
386 String formsStr = ""; 729 StringBuilder formsStrBuilder = new StringBuilder();
387 Collections.sort(lemmas); 730 Collections.sort(lemmas);
388 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); 731 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>();
389 for (int i=0; i < lemmas.size(); i++) { 732 for (int i=0; i < lemmas.size(); i++) {
390 Lemma lemma = lemmas.get(i); 733 Lemma lemma = lemmas.get(i);
391 ArrayList<Form> lemmaForms = lemma.getFormsList(); 734 ArrayList<Form> lemmaForms = lemma.getFormsList();
401 Collections.sort(forms); 744 Collections.sort(forms);
402 for (int i=0; i < forms.size(); i++) { 745 for (int i=0; i < forms.size(); i++) {
403 Form form = forms.get(i); 746 Form form = forms.get(i);
404 String formName = form.getFormName(); 747 String formName = form.getFormName();
405 formName = StringUtils.forXML(formName); 748 formName = StringUtils.forXML(formName);
406 formsStr = formsStr + formName + " "; 749 formsStrBuilder.append(formName + " ");
407 } 750 }
751 String formsStr = formsStrBuilder.toString();
408 if (formsStr.endsWith(" ")) 752 if (formsStr.endsWith(" "))
409 formsStr = formsStr.substring(0, formsStr.length() - 1); 753 formsStr = formsStr.substring(0, formsStr.length() - 1);
410 if (lemmasStr.endsWith(" ")) 754 if (lemmasStr.endsWith(" "))
411 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); 755 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1);
412 if (withForms()) 756 if (withForms)
413 retStr = retStr + " forms=\"" + formsStr + "\""; 757 retStrBuilder.append(" forms=\"" + formsStr + "\"");
414 if (withLemmas()) 758 if (withLemmas)
415 retStr = retStr + " lemmas=\"" + lemmasStr + "\""; 759 retStrBuilder.append(" lemmas=\"" + lemmasStr + "\"");
416 } 760 }
417 retStr = retStr + ">" + origWordForm + "</w>"; 761 if (hasDictionaryEntries != null && hasDictionaryEntries) {
418 return retStr; 762 retStrBuilder.append(" dictionary=\"" + "true" + "\"");
419 } 763 } else if (hasDictionaryEntries != null && ! hasDictionaryEntries) {
420 764 retStrBuilder.append(" dictionary=\"" + "false" + "\"");
421 private String removeSpecialSymbols(String inputStr) { 765 }
422 String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); 766 retStrBuilder.append(">");
423 return retStr; 767 retStrBuilder.append(origWordForm); // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString)
768 retStrBuilder.append("</w>");
769 return retStrBuilder.toString();
770 }
771
772 private String toBlanks(String inputStr) {
773 int size = inputStr.length();
774 StringBuilder retStrBuilder = new StringBuilder();
775 for (int j=0; j < size; j++) {
776 char c = inputStr.charAt(j);
777 if (c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0))
778 retStrBuilder.append(c);
779 else
780 retStrBuilder.append(" ");
781 }
782 return retStrBuilder.toString();
424 } 783 }
425 784
426 } 785 }
427 } 786 }