Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; |
2 | 2 |
3 import java.io.StringReader; | 3 import java.io.StringReader; |
4 import java.util.ArrayList; | 4 import java.util.ArrayList; |
5 import java.util.Collections; | 5 import java.util.Collections; |
6 import java.util.Enumeration; | |
6 import java.util.Hashtable; | 7 import java.util.Hashtable; |
7 | 8 |
8 import org.xml.sax.*; | 9 import org.xml.sax.*; |
9 | 10 |
10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 11 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
19 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element | 20 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element |
20 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | 21 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element |
21 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); | 22 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); |
22 private static int ELEMENT_TYPE_CHARACTERS = 1; | 23 private static int ELEMENT_TYPE_CHARACTERS = 1; |
23 private static int ELEMENT_TYPE_COMPLEX = 2; | 24 private static int ELEMENT_TYPE_COMPLEX = 2; |
24 private String[] normalizeFunctions = {}; // default: without normalize functions | 25 private String docId; |
25 private String[] nwbElements = {}; // non word breaking elements, default: these elements | 26 private String language; |
27 private String[] nwbElements = {}; // non word breaking elements, default: no nwb elements | |
26 private String[] stopElements = {}; // default: no stop elements | 28 private String[] stopElements = {}; // default: no stop elements |
29 private String outputFormat = "xml"; // default: xml | |
27 private String[] outputOptions = {}; | 30 private String[] outputOptions = {}; |
31 private boolean withForms = false; | |
32 private boolean withLemmas = false; | |
33 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms | |
34 private String[] normFunctions = {}; // default: no norm function | |
35 private boolean useNormFunction = false; | |
36 private boolean useRegFunction = false; | |
28 private String xmlnsString = ""; | 37 private String xmlnsString = ""; |
29 private String language; | 38 private StringBuilder result = new StringBuilder(); |
30 private String outputXmlFragment = ""; | 39 private ArrayList<Token> resultTokens = new ArrayList<Token>(); |
40 private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>(); | |
31 private Element rootElement; | 41 private Element rootElement; |
32 private Element currentElement; | 42 private Element currentElement; |
43 private int currentPosition = 0; | |
44 private int currentPageNumber = 0; | |
45 private int currentLineNumber = 0; | |
46 private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>(); | |
47 private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>(); | |
33 private ArrayList<Element> elementQueue; | 48 private ArrayList<Element> elementQueue; |
34 | 49 |
35 public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { | 50 public XmlTokenizerContentHandler(String language) throws ApplicationException { |
36 if (normalizeFunctions == null) { | |
37 String[] emptyFunctions = {}; | |
38 this.normalizeFunctions = emptyFunctions; | |
39 } else { | |
40 this.normalizeFunctions = normalizeFunctions; | |
41 } | |
42 this.language = language; | 51 this.language = language; |
52 } | |
53 | |
54 public void setDocIdentifier(String docId) { | |
55 this.docId = docId; | |
43 } | 56 } |
44 | 57 |
45 public void setNWBElements(String[] nwbElements) { | 58 public void setNWBElements(String[] nwbElements) { |
46 this.nwbElements = nwbElements; | 59 this.nwbElements = nwbElements; |
47 } | 60 } |
48 | 61 |
49 public void setStopElements(String[] stopElements) { | 62 public void setStopElements(String[] stopElements) { |
50 this.stopElements = stopElements; | 63 this.stopElements = stopElements; |
51 } | 64 } |
52 | 65 |
66 public void setHighlightTerms(String[] highlightTerms) { | |
67 this.highlightTerms = highlightTerms; | |
68 } | |
69 | |
70 public void setNormFunctions(String[] normFunctions) { | |
71 this.normFunctions = normFunctions; | |
72 if (this.normFunctions != null) { | |
73 for (int i=0; i< this.normFunctions.length; i++) { | |
74 String function = normFunctions[i]; | |
75 if (function.equals("norm")) | |
76 this.useNormFunction = true; | |
77 else if (function.equals("reg")) | |
78 this.useRegFunction = true; | |
79 } | |
80 } | |
81 } | |
82 | |
83 public void setOutputFormat(String outputFormat) { | |
84 this.outputFormat = outputFormat; | |
85 } | |
86 | |
53 public void setOutputOptions(String[] outputOptions) { | 87 public void setOutputOptions(String[] outputOptions) { |
54 this.outputOptions = outputOptions; | 88 this.outputOptions = outputOptions; |
55 } | 89 for (int i=0; i< this.outputOptions.length; i++) { |
56 | 90 String function = outputOptions[i]; |
57 public String getXmlFragment() { | 91 if (function.equals("withForms")) |
58 return outputXmlFragment; | 92 this.withForms = true; |
93 else if (function.equals("withLemmas")) | |
94 this.withLemmas = true; | |
95 } | |
96 } | |
97 | |
98 public String getResultString() { | |
99 return result.toString(); | |
100 } | |
101 | |
102 public ArrayList<Token> getResultTokens() { | |
103 return resultTokens; | |
59 } | 104 } |
60 | 105 |
106 public ArrayList<Element> getElements(String elementName) { | |
107 return elements.get(elementName); | |
108 } | |
109 | |
110 public int getPageCount() { | |
111 return currentPageNumber; | |
112 } | |
113 | |
61 public void startDocument() throws SAXException { | 114 public void startDocument() throws SAXException { |
62 } | 115 } |
63 | 116 |
64 public void endDocument() throws SAXException { | 117 public void endDocument() throws SAXException { |
65 try { | 118 try { |
66 String rootElemToStr = rootElement.toXmlString(); | 119 String rootElemToStr = rootElement.buildString(); |
67 write(rootElemToStr); | 120 write(rootElemToStr); |
68 write("\n"); | 121 write("\n"); |
69 } catch (NullPointerException e) { | 122 } catch (NullPointerException e) { |
123 throw new SAXException(e); | |
124 } catch (ApplicationException e) { | |
70 throw new SAXException(e); | 125 throw new SAXException(e); |
71 } | 126 } |
72 } | 127 } |
73 | 128 |
74 public void characters(char[] c, int start, int length) throws SAXException { | 129 public void characters(char[] c, int start, int length) throws SAXException { |
76 System.arraycopy(c, start, cCopy, 0, length); | 131 System.arraycopy(c, start, cCopy, 0, length); |
77 String charactersStr = String.valueOf(cCopy); | 132 String charactersStr = String.valueOf(cCopy); |
78 if (charactersStr != null && ! charactersStr.equals("")) { | 133 if (charactersStr != null && ! charactersStr.equals("")) { |
79 if (currentElement != null) { | 134 if (currentElement != null) { |
80 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); | 135 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); |
136 charElement.pageNumber = currentPageNumber; | |
81 charElement.value = StringUtils.deresolveXmlEntities(charactersStr); | 137 charElement.value = StringUtils.deresolveXmlEntities(charactersStr); |
82 if (currentElement.composites == null) | 138 if (currentElement.composites == null) |
83 currentElement.composites = new ArrayList<Element>(); | 139 currentElement.composites = new ArrayList<Element>(); |
84 currentElement.composites.add(charElement); | 140 currentElement.composites.add(charElement); |
85 } | 141 } |
94 | 150 |
95 public void setDocumentLocator(Locator locator) { | 151 public void setDocumentLocator(Locator locator) { |
96 } | 152 } |
97 | 153 |
98 public void startPrefixMapping(String prefix, String uri) throws SAXException { | 154 public void startPrefixMapping(String prefix, String uri) throws SAXException { |
99 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; | |
100 if (prefix != null && prefix.equals("")) | 155 if (prefix != null && prefix.equals("")) |
101 xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; | 156 xmlnsString += "xmlns" + "=\"" + uri + "\" "; |
157 else | |
158 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; | |
102 } | 159 } |
103 | 160 |
104 public void endPrefixMapping(String prefix) throws SAXException { | 161 public void endPrefixMapping(String prefix) throws SAXException { |
105 } | 162 } |
106 | 163 |
115 if (currentElement.composites == null) | 172 if (currentElement.composites == null) |
116 currentElement.composites = new ArrayList<Element>(); | 173 currentElement.composites = new ArrayList<Element>(); |
117 if (currentElement.lang != null) | 174 if (currentElement.lang != null) |
118 newElement.lang = currentElement.lang; // language is inherited to childs | 175 newElement.lang = currentElement.lang; // language is inherited to childs |
119 currentElement.composites.add(newElement); | 176 currentElement.composites.add(newElement); |
177 newElement.parent = currentElement; | |
120 } | 178 } |
121 currentElement = newElement; | 179 currentElement = newElement; |
180 if (localName != null && localName.equals("pb")) { | |
181 currentPageNumber++; | |
182 setCurrentPagePosition(localName, 0); | |
183 } | |
184 currentElement.pageNumber = currentPageNumber; | |
185 if (localName != null && localName.equals("lb")) { | |
186 currentLineNumber++; | |
187 } | |
188 currentElement.lineNumber = currentLineNumber; | |
189 currentPosition++; | |
190 currentElement.docPosition = currentPosition; | |
191 int newElemPosition = incrementCurrentPosition(localName); | |
192 currentElement.position = newElemPosition; | |
193 | |
194 currentElement.elemPosition = getElementPosition(currentElement); | |
195 Element parent = currentElement.parent; | |
196 if (parent == null) { | |
197 currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; | |
198 } else { | |
199 currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; | |
200 } | |
201 int newElemPagePosition = incrementCurrentPagePosition(localName); | |
202 currentElement.pagePosition = newElemPagePosition; | |
122 int attrSize = attrs.getLength(); | 203 int attrSize = attrs.getLength(); |
123 String attrString = ""; | 204 String attrString = ""; |
124 for (int i=0; i<attrSize; i++) { | 205 for (int i=0; i<attrSize; i++) { |
125 String attrQName = attrs.getQName(i); | 206 String attrQName = attrs.getQName(i); |
126 String attrValue = attrs.getValue(i); | 207 String attrValue = attrs.getValue(i); |
127 attrValue = StringUtils.forXML(attrValue); | 208 attrValue = StringUtils.forXML(attrValue); |
128 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; | 209 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; |
129 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) | 210 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) { |
130 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father | 211 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father |
212 } | |
213 if (attrQName != null && (attrQName.toLowerCase().equals("xml:id") || attrQName.toLowerCase().equals("id"))) { | |
214 currentElement.xmlId = attrValue; | |
215 } | |
131 } | 216 } |
132 currentElement.attrString = attrString; | 217 currentElement.attrString = attrString; |
133 if (! xmlnsString.equals("")) { | 218 if (! xmlnsString.equals("")) { |
134 currentElement.xmlnsString = xmlnsString; | 219 currentElement.xmlnsString = xmlnsString; |
135 } | 220 } |
151 } else { | 236 } else { |
152 currentElement = null; | 237 currentElement = null; |
153 } | 238 } |
154 } | 239 } |
155 | 240 |
156 private boolean withForms() { | 241 private int incrementCurrentPosition(String elemName) { |
242 Integer currentElemPos = currentPositions.get(elemName); | |
243 if (currentElemPos == null) { | |
244 currentElemPos = new Integer(0); | |
245 } | |
246 currentElemPos++; | |
247 currentPositions.put(elemName, currentElemPos); | |
248 return currentElemPos.intValue(); | |
249 } | |
250 | |
251 private int getElementPosition(Element elem) { | |
252 int pos = 0; | |
253 Element parent = elem.parent; | |
254 if (parent == null) { | |
255 pos = 1; | |
256 } else { | |
257 pos = 0; | |
258 ArrayList<Element> composites = parent.composites; | |
259 if (composites != null) { | |
260 for (int i=0; i<composites.size(); i++) { | |
261 Element e = composites.get(i); | |
262 if (e.isComplex() && e.name.equals(elem.name)) { | |
263 pos++; | |
264 } | |
265 if (e == elem) | |
266 break; | |
267 } | |
268 } else { | |
269 pos = 1; | |
270 } | |
271 } | |
272 return pos; | |
273 } | |
274 | |
275 private int incrementCurrentPagePosition(String elemName) { | |
276 Integer currentElemPagePos = currentPagePositions.get(elemName); | |
277 if (currentElemPagePos == null) { | |
278 currentElemPagePos = new Integer(0); | |
279 } | |
280 currentElemPagePos++; | |
281 currentPagePositions.put(elemName, currentElemPagePos); | |
282 return currentElemPagePos.intValue(); | |
283 } | |
284 | |
285 private void setCurrentPagePosition(String elemName, int pos) { | |
286 Integer newPagePosition = new Integer(pos); | |
287 Enumeration<String> elemKeys = currentPagePositions.keys(); | |
288 while (elemKeys.hasMoreElements()) { | |
289 String elemKey = elemKeys.nextElement(); | |
290 currentPagePositions.put(elemKey, newPagePosition); | |
291 } | |
292 } | |
293 | |
294 private boolean isHighlightTerm(String term) { | |
295 if (term == null) | |
296 return false; | |
157 boolean result = false; | 297 boolean result = false; |
158 for (int i=0; i< outputOptions.length; i++) { | 298 for (int i=0; i< highlightTerms.length; i++) { |
159 String function = outputOptions[i]; | 299 String t = highlightTerms[i].toLowerCase(); |
160 if (function.equals("withForms")) | 300 String termLowerCase = term.toLowerCase(); |
301 if (t.equals(termLowerCase)) | |
161 return true; | 302 return true; |
162 } | 303 } |
163 return result; | 304 return result; |
164 } | 305 } |
165 | 306 |
166 private boolean withLemmas() { | 307 private boolean isHighlightTerm(String[] terms) { |
308 if (terms == null) | |
309 return false; | |
167 boolean result = false; | 310 boolean result = false; |
168 for (int i=0; i< outputOptions.length; i++) { | 311 for (int i=0; i< highlightTerms.length; i++) { |
169 String function = outputOptions[i]; | 312 String t = highlightTerms[i].toLowerCase(); |
170 if (function.equals("withLemmas")) | 313 for (int j=0; j<terms.length; j++) { |
171 return true; | 314 String termLowerCase = terms[j].toLowerCase(); |
315 if (t.equals(termLowerCase)) | |
316 return true; | |
317 } | |
172 } | 318 } |
173 return result; | 319 return result; |
174 } | 320 } |
175 | 321 |
176 private void write(String outStr) throws SAXException { | 322 private void write(String outStr) throws SAXException { |
177 outputXmlFragment += outStr; | 323 result.append(outStr); |
178 } | 324 } |
179 | 325 |
180 private class Element { | 326 public class Element implements Comparable<Element> { |
181 private int type; | 327 private int type; |
182 private String name; | 328 public String name; |
183 private String xmlnsString; | 329 private String xmlnsString; |
184 private String attrString; | 330 private String attrString; |
185 private String value; | 331 private String value; |
186 private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node | 332 public String lang; // value of attribute xml:lang or the inherited xml:lang value of the father node |
333 public String xmlId; | |
334 public String xpath; | |
335 public int pageNumber; | |
336 public int lineNumber; | |
337 public int docPosition; // absolute position in document | |
338 public int position; // position within all elements with this name | |
339 public int elemPosition; // position in element e.g. the 6 sentence in paragraph | |
340 public int pagePosition; // position in page | |
341 private ArrayList<Token> tokens = new ArrayList<Token>(); | |
187 private ArrayList<Element> composites; | 342 private ArrayList<Element> composites; |
343 private Element parent; | |
344 private boolean isStopElement = false; | |
345 private boolean isWordDelimiterElement = true; // default: is word delimiter element | |
188 | 346 |
189 private Element(String name) { | 347 private Element(String name) { |
190 this.type = ELEMENT_TYPE_COMPLEX; | 348 this.type = ELEMENT_TYPE_COMPLEX; |
191 this.name = name; | 349 setName(name); |
192 } | 350 } |
193 | 351 |
194 private Element(String name, int type) { | 352 private Element(String name, int type) { |
195 this.type = type; | 353 this.type = type; |
354 setName(name); | |
355 } | |
356 | |
357 private void setName(String name) { | |
196 this.name = name; | 358 this.name = name; |
197 } | 359 for (int i=0; i<stopElements.length; i++) { |
198 | 360 String stopElementName = stopElements[i]; |
199 private boolean isComplex() { | 361 if (name.equals(stopElementName)) { |
362 this.isStopElement = true; | |
363 break; | |
364 } | |
365 } | |
366 for (int i=0; i<nwbElements.length; i++) { | |
367 String nwbElementName = nwbElements[i]; | |
368 if (name.equals(nwbElementName)) { | |
369 this.isWordDelimiterElement = false; | |
370 break; | |
371 } | |
372 } | |
373 } | |
374 | |
375 public int compareTo(Element elem) { | |
376 return (new Integer(position)).compareTo(new Integer(elem.position)); | |
377 } | |
378 | |
379 private boolean isComplex() { | |
200 boolean isComplex = false; | 380 boolean isComplex = false; |
201 if (type == ELEMENT_TYPE_COMPLEX) | 381 if (type == ELEMENT_TYPE_COMPLEX) |
202 isComplex = true; | 382 isComplex = true; |
203 return isComplex; | 383 return isComplex; |
204 } | 384 } |
205 | 385 |
206 private boolean isWordDelimiterElement() { | 386 public ArrayList<Token> getTokens() { |
207 boolean isWordDelimiterElement = true; | 387 ArrayList<Token> retTokens = new ArrayList<Token>(); |
208 for (int i=0; i<nwbElements.length; i++) { | 388 if (isComplex()) { |
209 String nwbElementName = nwbElements[i]; | 389 if (composites != null) { |
210 if (name.equals(nwbElementName)) { | 390 for (int i=0; i<composites.size(); i++) { |
211 isWordDelimiterElement = false; | 391 Element elem = composites.get(i); |
212 break; | 392 if (elem.tokens != null) |
213 } | 393 retTokens.addAll(elem.tokens); |
214 } | 394 } |
215 return isWordDelimiterElement; | 395 } |
216 } | 396 } |
217 | 397 if (tokens != null) |
218 private boolean isStopElement() { | 398 retTokens.addAll(tokens); |
219 boolean isStopElement = false; | 399 return retTokens; |
220 for (int i=0; i<stopElements.length; i++) { | 400 } |
221 String stopElementName = stopElements[i]; | 401 |
222 if (name.equals(stopElementName)) { | 402 public String getTokensStr(String type) { |
223 isStopElement = true; | 403 ArrayList<Token> elementTokens = getTokens(); |
224 break; | 404 String tokenStr = getTokensStr(type, elementTokens); |
225 } | 405 return tokenStr; |
226 } | 406 } |
227 return isStopElement; | 407 |
228 } | 408 private String getTokensStr(String type, ArrayList<Token> tokens) { |
229 | 409 StringBuilder tokenStr = new StringBuilder(); |
230 private String toXmlString() throws SAXException { | 410 for (int j=0; j<tokens.size(); j++) { |
231 String retString = ""; | 411 Token token = tokens.get(j); |
412 String content = null; | |
413 if (type.equals("orig")) | |
414 content = token.getContentOrig(); | |
415 else if (type.equals("reg")) | |
416 content = token.getContentReg(); | |
417 else if (type.equals("norm")) | |
418 content = token.getContentNorm(); | |
419 else if (type.equals("morph")) | |
420 content = token.getContentMorph(); | |
421 if (content != null) | |
422 tokenStr.append(content + " "); | |
423 } | |
424 return tokenStr.toString(); | |
425 } | |
426 | |
427 public String toXmlString() throws ApplicationException { | |
428 StringBuilder retStrBuilder = new StringBuilder(); | |
429 if (! isComplex()) { | |
430 retStrBuilder.append(value); | |
431 } else { | |
432 String xmlNsString = this.xmlnsString; | |
433 if (xmlNsString == null || xmlNsString.equals("")) { | |
434 retStrBuilder.append("<" + name + attrString + ">"); | |
435 } else { | |
436 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); | |
437 } | |
438 if (composites != null) { | |
439 for (int i=0; i<composites.size(); i++) { | |
440 Element composite = composites.get(i); | |
441 if (! composite.isComplex()) { | |
442 if (composite.value != null && ! composite.value.equals("")) { | |
443 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank | |
444 retStrBuilder.append(compositeValueStr); | |
445 } | |
446 } else { | |
447 retStrBuilder.append(composite.toXmlString()); | |
448 } | |
449 } | |
450 } | |
451 retStrBuilder.append("</" + name + ">"); | |
452 } | |
453 return retStrBuilder.toString(); | |
454 } | |
455 | |
456 private String buildString() throws ApplicationException { | |
457 StringBuilder retStrBuilder = new StringBuilder(); | |
232 String elemLanguage = language; // default value for the document/page | 458 String elemLanguage = language; // default value for the document/page |
233 if (lang != null) | 459 if (lang != null) |
234 elemLanguage = lang; // value of the element if available | 460 elemLanguage = lang; // value of the element if available |
235 // write this element | 461 // write this element |
236 if (! isComplex()) { | 462 if (! isComplex()) { |
237 retString += value; | 463 retStrBuilder.append(value); |
238 } else { | 464 } else { |
239 String xmlNsString = this.xmlnsString; | 465 if (outputFormat != null && outputFormat.equals("xml")) { |
240 if (xmlNsString == null || xmlNsString.equals("")) { | 466 String xmlNsString = this.xmlnsString; |
241 retString = retString + "<" + name + attrString + ">"; | 467 if (xmlNsString == null || xmlNsString.equals("")) { |
242 } else { | 468 retStrBuilder.append("<" + name + attrString + ">"); |
243 retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; | 469 } else { |
470 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); | |
471 } | |
472 } else { // outputFormat == string | |
473 // nothing | |
244 } | 474 } |
245 if (composites != null) { | 475 if (composites != null) { |
246 String compositesCharsWithMarks = ""; | 476 StringBuilder compositesCharsWithMarks = new StringBuilder(); |
247 ArrayList<Element> complexElements = new ArrayList<Element>(); | 477 ArrayList<Element> complexElements = new ArrayList<Element>(); |
248 for (int i=0; i<composites.size(); i++) { | 478 for (int i=0; i<composites.size(); i++) { |
249 Element composite = composites.get(i); | 479 Element composite = composites.get(i); |
250 if (! composite.isComplex()) { | 480 if (! composite.isComplex()) { |
251 if (composite.value != null && ! composite.value.equals("")) { | 481 if (composite.value != null && ! composite.value.equals("")) { |
252 String compositeValueStr = composite.value; | 482 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank |
253 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. | 483 compositesCharsWithMarks.append(compositeValueStr); |
254 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank | |
255 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; | |
256 } | 484 } |
257 } else { | 485 } else { |
258 if (! composite.isWordDelimiterElement()) { | 486 if (! composite.isWordDelimiterElement) { |
259 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) | 487 compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK); // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) |
260 } else { | 488 } else { |
261 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) | 489 compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK); // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) |
262 } | 490 } |
263 complexElements.add(composite); | 491 complexElements.add(composite); |
264 } | 492 } |
265 } | 493 } |
266 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") | 494 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") |
268 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values | 496 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values |
269 if (complexElements.size() > 0) { | 497 if (complexElements.size() > 0) { |
270 for (int i=0; i<complexElements.size(); i++) { | 498 for (int i=0; i<complexElements.size(); i++) { |
271 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); | 499 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); |
272 Element complexElem = complexElements.get(i); | 500 Element complexElem = complexElements.get(i); |
273 String complexElementStr = complexElem.toXmlString(); | 501 String complexElementStr = complexElem.buildString(); |
274 String firstPiece = ""; | 502 String firstPiece = ""; |
275 if (indexComplexElemCompositesCharsWithMarks > 0) { | 503 if (indexComplexElemCompositesCharsWithMarks > 0) { |
276 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); | 504 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); |
277 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); | 505 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); |
278 } | 506 } |
279 retString = retString + firstPiece + complexElementStr; | 507 retStrBuilder.append(firstPiece + complexElementStr); |
280 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); | 508 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); |
281 } | 509 } |
282 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added | 510 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added |
283 } else { | 511 } else { |
284 retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added | 512 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added |
285 } | 513 } |
286 } | 514 } |
287 retString = retString + "</" + name + ">"; | 515 if (outputFormat != null && outputFormat.equals("xml")) { |
516 retStrBuilder.append("</" + name + ">"); | |
517 } else { // outputFormat == string | |
518 // nothing | |
519 } | |
520 // put element into elements name hashtable | |
521 ArrayList<Element> elems = elements.get(name); | |
522 if (elems == null) { | |
523 elems = new ArrayList<Element>(); | |
524 elements.put(name, elems); | |
525 } | |
526 elems.add(this); | |
288 } | 527 } |
289 return retString; | 528 return retStrBuilder.toString(); |
290 } | 529 } |
291 | 530 |
292 private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { | 531 private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException { |
532 String charactersStrDeresolved = charactersStrDeresolvedBuilder.toString(); | |
293 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); | 533 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); |
294 String retStr = ""; | 534 StringBuilder retStrBuilder = new StringBuilder(); |
295 try { | 535 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); |
296 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); | 536 tokenizer.setLanguage(language); |
297 tokenizer.setLanguage(language); | 537 String[] normFunction = {"norm"}; |
298 tokenizer.setNormFunctions(normalizeFunctions); | 538 tokenizer.setNormFunctions(normFunction); |
299 ArrayList<Token> tokens = tokenizer.getTokens(); | 539 ArrayList<Token> tokens = tokenizer.getTokens(); |
300 int endPos = 0; | 540 int endPos = 0; |
301 for (int i=0; i < tokens.size(); i++) { | 541 for (int i=0; i < tokens.size(); i++) { |
302 Token token = tokens.get(i); | 542 Token token = tokens.get(i); |
303 String wordForm = token.getContent(); | 543 int startPos = token.getStart(); |
304 int startPos = token.getStart(); | 544 String beforeStr = charactersStr.substring(endPos, startPos); |
305 String beforeStr = charactersStr.substring(endPos, startPos); | 545 endPos = token.getEnd(); |
306 endPos = token.getEnd(); | 546 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); |
307 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); | 547 String origWordForm = charactersStr.substring(startPos, endPos); |
308 String origWordForm = charactersStr.substring(startPos, endPos); | 548 String wordTag = insertWordTags(token, language, origWordForm); |
309 String wordTag = insertWordTags(wordForm, language, origWordForm); | 549 if (outputFormat != null && outputFormat.equals("xml")) { |
310 retStr = retStr + beforeStrDeresolved + wordTag; | 550 retStrBuilder.append(beforeStrDeresolved + wordTag); |
311 } | 551 } else { // outputFormat == string |
312 String lastAfterStr = charactersStr.substring(endPos); | 552 String beforeStrDeresolvedToBlanks = toBlanks(beforeStrDeresolved); |
313 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); | 553 retStrBuilder.append(beforeStrDeresolvedToBlanks + wordTag); |
314 retStr = retStr + lastAfterStrDeresolved; | 554 } |
315 } catch (ApplicationException e) { | 555 } |
316 throw new SAXException(e); | 556 String lastAfterStr = charactersStr.substring(endPos); |
317 } | 557 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); |
318 return retStr; | 558 if (outputFormat != null && outputFormat.equals("xml")) { |
319 } | 559 retStrBuilder.append(lastAfterStrDeresolved); |
320 | 560 } else { // outputFormat == string |
321 private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { | 561 String lastAfterStrDeresolvedToBlanks = toBlanks(lastAfterStrDeresolved); |
562 retStrBuilder.append(lastAfterStrDeresolvedToBlanks); | |
563 } | |
564 return retStrBuilder.toString(); | |
565 } | |
566 | |
567 private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException { | |
568 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) { | |
569 return origWordForm; | |
570 } | |
322 String wordTag = null; | 571 String wordTag = null; |
323 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) | 572 token.setDocId(docId); |
573 token.setLanguage(lang); | |
574 token.setPageNumber(pageNumber); | |
575 token.setLineNumber(lineNumber); | |
576 token.setElementPosition(position); | |
577 token.setElementPagePosition(pagePosition); | |
578 token.setElementName(name); | |
579 token.setXmlId(xmlId); | |
580 token.setXpath("xpath"); // TODO | |
581 if (name != null && name.equals("reg")) { | |
582 if (attrString != null && attrString.contains("norm=\"")) { | |
583 int regIndexBegin = attrString.indexOf("norm=\""); | |
584 int regIndexEnd = attrString.indexOf("\"", regIndexBegin + 7); | |
585 String reg = attrString.substring(regIndexBegin + 6, regIndexEnd); | |
586 token.setContentReg(reg); | |
587 String[] normFunction = {"norm"}; | |
588 Normalizer normalizer = new Normalizer(normFunction, language); | |
589 String normStr = normalizer.normalize(reg); | |
590 token.setContentNorm(normStr); | |
591 } | |
592 } | |
593 if (language == null) { | |
594 token.setContentOrig(origWordForm); // TODO necessary ? | |
595 tokens.add(token); | |
596 resultTokens.add(token); | |
324 return origWordForm; | 597 return origWordForm; |
325 if (isStopElement()) | 598 } |
599 if (isStopElement && outputFormat != null && outputFormat.equals("xml")) | |
326 return origWordForm; | 600 return origWordForm; |
327 wordForm = removeSpecialSymbols(wordForm); | 601 if (isStopElement && outputFormat != null && outputFormat.equals("string")) |
328 wordForm = wordForm.toLowerCase(); | 602 return toBlanks(origWordForm); |
603 String wordFormNorm = token.getContentNorm(); | |
329 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); | 604 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); |
330 ArrayList<Lemma> lemmas = null; | 605 ArrayList<Lemma> lemmas = null; |
331 if (withForms() || withLemmas()) { | 606 Boolean hasDctionaryEntries = null; |
607 String lemmasStr = ""; | |
608 if (withForms || withLemmas) { | |
332 LexHandler lexHandler = LexHandler.getInstance(); | 609 LexHandler lexHandler = LexHandler.getInstance(); |
333 lemmas = lexHandler.getLemmas(wordForm, "form", language, Normalizer.NONE); | 610 lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false); // Performance: needs 15 % of the indexing time |
334 } | 611 if (lemmas != null) { |
335 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); | 612 for (int i=0; i < lemmas.size(); i++) { |
613 Lemma lemma = lemmas.get(i); | |
614 String lemmaName = lemma.getLemmaName(); | |
615 lemmasStr = lemmasStr + lemmaName + " "; | |
616 } | |
617 } | |
618 lemmasStr = lemmasStr.trim(); | |
619 token.setContentMorph(lemmasStr); | |
620 hasDctionaryEntries = false; | |
621 ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY); // Performance: needs 15 % of the indexing time | |
622 if (lexEntries != null) | |
623 hasDctionaryEntries = true; | |
624 } | |
625 if (outputFormat != null && outputFormat.equals("xml")) { | |
626 wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDctionaryEntries); // Performance: needs 10 % of the indexing time | |
627 String tokenWordForm = token.getContentOrig(); // word form is in contentOrig | |
628 if (useRegFunction) | |
629 tokenWordForm = token.getContentReg(); | |
630 else if (useNormFunction) | |
631 tokenWordForm = token.getContentNorm(); | |
632 else if (withLemmas) | |
633 tokenWordForm = token.getContentMorph(); | |
634 boolean isHighlightTerm = false; | |
635 if (highlightTerms.length > 0 && ! withLemmas) { | |
636 isHighlightTerm = isHighlightTerm(tokenWordForm); | |
637 } else { | |
638 if (highlightTerms.length > 0 && lemmas != null) { | |
639 String[] lemmasArray = lemmasStr.split(" "); | |
640 isHighlightTerm = isHighlightTerm(lemmasArray); | |
641 } | |
642 } | |
643 if (isHighlightTerm) { | |
644 wordTag = "<hi>" + wordTag + "</hi>"; | |
645 } | |
646 } else { // outputFormat == string | |
647 String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab | |
648 if (withLemmas) { | |
649 if (lemmas != null) { | |
650 String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved); // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr | |
651 wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr; | |
652 token.setContentMorph(lemmasStr); | |
653 } else { | |
654 wordTag = inWordFormWithoutSpecialSymbols; | |
655 } | |
656 } else { | |
657 wordTag = inWordFormWithoutSpecialSymbols; | |
658 } | |
659 tokens.add(token); | |
660 resultTokens.add(token); | |
661 } | |
336 return wordTag; | 662 return wordTag; |
337 } | 663 } |
338 | 664 |
665 private String removeSpecialSymbols(String inputStr) { | |
666 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen | |
667 return retStr; | |
668 } | |
669 | |
339 /** | 670 /** |
340 * | 671 * |
341 * @param origWordToken could contain nwd marks | 672 * @param origWordToken could contain nwd marks |
342 * @param wordForm contains no nwd marks | 673 * @param token |
343 * @param language | 674 * @param language |
344 * @param origWordFormNormalized | |
345 * @param lemmas | 675 * @param lemmas |
346 * @return for each substring between nwd marks create a word tag | 676 * @return for each substring between nwd marks create a word tag |
347 */ | 677 */ |
348 private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { | 678 private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { |
349 if (origWordToken.isEmpty()) | 679 if (origWordToken.isEmpty()) |
350 return origWordToken; | 680 return origWordToken; |
351 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) | 681 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) |
352 return COMPLEX_ELEMENT_NWD_MARK; | 682 return COMPLEX_ELEMENT_NWD_MARK; |
353 String retWordTags = ""; | 683 String retWordTags = ""; |
354 String origWordTokenTmp = origWordToken; | 684 String origWordTokenTmp = origWordToken; |
355 while (! origWordTokenTmp.isEmpty()) { | 685 if (outputFormat != null && outputFormat.equals("xml")) { |
356 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark | 686 retWordTags = getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries); |
357 origWordTokenTmp = origWordTokenTmp.substring(1); | 687 /* |
358 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; | 688 while (! origWordTokenTmp.isEmpty()) { |
359 } else { | 689 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark |
360 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); | 690 origWordTokenTmp = origWordTokenTmp.substring(1); |
361 if (indexUpToNWD != -1) { // not end of string reached | 691 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; |
362 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); | 692 } else { |
363 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); | 693 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); |
364 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; | 694 if (indexUpToNWD != -1) { // not end of string reached |
365 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); | 695 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); |
366 } else { // end of string reached | 696 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); |
367 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); | 697 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; |
368 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); | 698 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); |
369 retWordTags = retWordTags + origWordTokenFragmentWithTags; | 699 } else { // end of string reached |
370 origWordTokenTmp = ""; // finente | 700 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); |
371 } | 701 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); |
372 } | 702 retWordTags = retWordTags + origWordTokenFragmentWithTags; |
703 origWordTokenTmp = ""; // finente | |
704 } | |
705 } | |
706 } | |
707 */ | |
708 } else { | |
709 // nothing | |
373 } | 710 } |
374 return retWordTags; | 711 return retWordTags; |
375 } | 712 } |
376 | 713 |
377 private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { | 714 private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { |
378 if (origWordForm == null || origWordForm.isEmpty()) | 715 if (origWordForm == null || origWordForm.isEmpty()) |
379 return ""; | 716 return ""; |
717 String wordForm = token.getContentOrig(); // word form is in contentOrig | |
718 String regularizedWordForm = token.getContentReg(); | |
719 String normalizedWordForm = token.getContentNorm(); | |
380 String langISOCode = Language.getInstance().getISO639Code(language); | 720 String langISOCode = Language.getInstance().getISO639Code(language); |
381 String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; | 721 StringBuilder retStrBuilder = new StringBuilder(); |
382 if (origWordFormNormalized != null) | 722 retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\""); |
383 retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; | 723 if (regularizedWordForm != null) |
724 retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\""); | |
725 if (normalizedWordForm != null) | |
726 retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\""); | |
384 if (lemmas != null) { | 727 if (lemmas != null) { |
385 String lemmasStr = ""; | 728 String lemmasStr = ""; |
386 String formsStr = ""; | 729 StringBuilder formsStrBuilder = new StringBuilder(); |
387 Collections.sort(lemmas); | 730 Collections.sort(lemmas); |
388 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); | 731 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); |
389 for (int i=0; i < lemmas.size(); i++) { | 732 for (int i=0; i < lemmas.size(); i++) { |
390 Lemma lemma = lemmas.get(i); | 733 Lemma lemma = lemmas.get(i); |
391 ArrayList<Form> lemmaForms = lemma.getFormsList(); | 734 ArrayList<Form> lemmaForms = lemma.getFormsList(); |
401 Collections.sort(forms); | 744 Collections.sort(forms); |
402 for (int i=0; i < forms.size(); i++) { | 745 for (int i=0; i < forms.size(); i++) { |
403 Form form = forms.get(i); | 746 Form form = forms.get(i); |
404 String formName = form.getFormName(); | 747 String formName = form.getFormName(); |
405 formName = StringUtils.forXML(formName); | 748 formName = StringUtils.forXML(formName); |
406 formsStr = formsStr + formName + " "; | 749 formsStrBuilder.append(formName + " "); |
407 } | 750 } |
751 String formsStr = formsStrBuilder.toString(); | |
408 if (formsStr.endsWith(" ")) | 752 if (formsStr.endsWith(" ")) |
409 formsStr = formsStr.substring(0, formsStr.length() - 1); | 753 formsStr = formsStr.substring(0, formsStr.length() - 1); |
410 if (lemmasStr.endsWith(" ")) | 754 if (lemmasStr.endsWith(" ")) |
411 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); | 755 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); |
412 if (withForms()) | 756 if (withForms) |
413 retStr = retStr + " forms=\"" + formsStr + "\""; | 757 retStrBuilder.append(" forms=\"" + formsStr + "\""); |
414 if (withLemmas()) | 758 if (withLemmas) |
415 retStr = retStr + " lemmas=\"" + lemmasStr + "\""; | 759 retStrBuilder.append(" lemmas=\"" + lemmasStr + "\""); |
416 } | 760 } |
417 retStr = retStr + ">" + origWordForm + "</w>"; | 761 if (hasDictionaryEntries != null && hasDictionaryEntries) { |
418 return retStr; | 762 retStrBuilder.append(" dictionary=\"" + "true" + "\""); |
419 } | 763 } else if (hasDictionaryEntries != null && ! hasDictionaryEntries) { |
420 | 764 retStrBuilder.append(" dictionary=\"" + "false" + "\""); |
421 private String removeSpecialSymbols(String inputStr) { | 765 } |
422 String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); | 766 retStrBuilder.append(">"); |
423 return retStr; | 767 retStrBuilder.append(origWordForm); // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString) |
768 retStrBuilder.append("</w>"); | |
769 return retStrBuilder.toString(); | |
770 } | |
771 | |
772 private String toBlanks(String inputStr) { | |
773 int size = inputStr.length(); | |
774 StringBuilder retStrBuilder = new StringBuilder(); | |
775 for (int j=0; j < size; j++) { | |
776 char c = inputStr.charAt(j); | |
777 if (c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0)) | |
778 retStrBuilder.append(c); | |
779 else | |
780 retStrBuilder.append(" "); | |
781 } | |
782 return retStrBuilder.toString(); | |
424 } | 783 } |
425 | 784 |
426 } | 785 } |
427 } | 786 } |