Mercurial > hg > mpdl-group
annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
rev | line source |
---|---|
19 | 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; |
2 | |
3 import java.io.StringReader; | |
4 import java.util.ArrayList; | |
5 import java.util.Collections; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
6 import java.util.Enumeration; |
19 | 7 import java.util.Hashtable; |
8 | |
9 import org.xml.sax.*; | |
10 | |
11 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
12 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | |
15 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | |
20
7d6d969b10cf
little corrections
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
19
diff
changeset
|
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; |
19 | 17 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; |
18 | |
19 public class XmlTokenizerContentHandler implements ContentHandler { | |
20 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element | |
21 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | |
22 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); | |
23 private static int ELEMENT_TYPE_CHARACTERS = 1; | |
24 private static int ELEMENT_TYPE_COMPLEX = 2; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
25 private String docId; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
26 private String language; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
27 private String[] nwbElements = {}; // non word breaking elements, default: no nwb elements |
19 | 28 private String[] stopElements = {}; // default: no stop elements |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
29 private String outputFormat = "xml"; // default: xml |
19 | 30 private String[] outputOptions = {}; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
31 private boolean withForms = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
32 private boolean withLemmas = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
33 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
34 private String[] normFunctions = {}; // default: no norm function |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
35 private boolean useNormFunction = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
36 private boolean useRegFunction = false; |
19 | 37 private String xmlnsString = ""; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
38 private StringBuilder result = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
39 private ArrayList<Token> resultTokens = new ArrayList<Token>(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
40 private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>(); |
19 | 41 private Element rootElement; |
42 private Element currentElement; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
43 private int currentPosition = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
44 private int currentPageNumber = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
45 private int currentLineNumber = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
46 private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
47 private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>(); |
19 | 48 private ArrayList<Element> elementQueue; |
49 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
/**
 * Creates a content handler for the given language.
 *
 * @param language language value stored on this handler
 * @throws ApplicationException declared in the signature for callers
 */
public XmlTokenizerContentHandler(String language) throws ApplicationException {
  this.language = language;
}
53 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
54 public void setDocIdentifier(String docId) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
55 this.docId = docId; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
56 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
57 |
19 | 58 public void setNWBElements(String[] nwbElements) { |
59 this.nwbElements = nwbElements; | |
60 } | |
61 | |
62 public void setStopElements(String[] stopElements) { | |
63 this.stopElements = stopElements; | |
64 } | |
65 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
66 public void setHighlightTerms(String[] highlightTerms) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
67 this.highlightTerms = highlightTerms; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
68 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
69 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
70 public void setNormFunctions(String[] normFunctions) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
71 this.normFunctions = normFunctions; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
72 if (this.normFunctions != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
73 for (int i=0; i< this.normFunctions.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
74 String function = normFunctions[i]; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
75 if (function.equals("norm")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
76 this.useNormFunction = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
77 else if (function.equals("reg")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
78 this.useRegFunction = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
79 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
80 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
81 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
82 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
83 public void setOutputFormat(String outputFormat) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
84 this.outputFormat = outputFormat; |
19 | 85 } |
86 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
87 public void setOutputOptions(String[] outputOptions) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
88 this.outputOptions = outputOptions; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
89 for (int i=0; i< this.outputOptions.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
90 String function = outputOptions[i]; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
91 if (function.equals("withForms")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
92 this.withForms = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
93 else if (function.equals("withLemmas")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
94 this.withLemmas = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
95 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
96 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
97 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
98 public String getResultString() { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
99 return result.toString(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
100 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
101 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
102 public ArrayList<Token> getResultTokens() { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
103 return resultTokens; |
19 | 104 } |
105 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
106 public ArrayList<Element> getElements(String elementName) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
107 return elements.get(elementName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
108 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
109 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
110 public int getPageCount() { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
111 return currentPageNumber; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
112 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
113 |
19 | 114 public void startDocument() throws SAXException { |
115 } | |
116 | |
117 public void endDocument() throws SAXException { | |
118 try { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
119 String rootElemToStr = rootElement.buildString(); |
19 | 120 write(rootElemToStr); |
121 write("\n"); | |
122 } catch (NullPointerException e) { | |
123 throw new SAXException(e); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
124 } catch (ApplicationException e) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
125 throw new SAXException(e); |
19 | 126 } |
127 } | |
128 | |
129 public void characters(char[] c, int start, int length) throws SAXException { | |
130 char[] cCopy = new char[length]; | |
131 System.arraycopy(c, start, cCopy, 0, length); | |
132 String charactersStr = String.valueOf(cCopy); | |
133 if (charactersStr != null && ! charactersStr.equals("")) { | |
134 if (currentElement != null) { | |
135 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
136 charElement.pageNumber = currentPageNumber; |
19 | 137 charElement.value = StringUtils.deresolveXmlEntities(charactersStr); |
138 if (currentElement.composites == null) | |
139 currentElement.composites = new ArrayList<Element>(); | |
140 currentElement.composites.add(charElement); | |
141 } | |
142 } | |
143 } | |
144 | |
145 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { | |
146 } | |
147 | |
148 public void processingInstruction(String target, String data) throws SAXException { | |
149 } | |
150 | |
151 public void setDocumentLocator(Locator locator) { | |
152 } | |
153 | |
154 public void startPrefixMapping(String prefix, String uri) throws SAXException { | |
155 if (prefix != null && prefix.equals("")) | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
156 xmlnsString += "xmlns" + "=\"" + uri + "\" "; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
157 else |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
158 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; |
19 | 159 } |
160 | |
161 public void endPrefixMapping(String prefix) throws SAXException { | |
162 } | |
163 | |
164 public void skippedEntity(String name) throws SAXException { | |
165 } | |
166 | |
167 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { | |
168 if (elementQueue == null) | |
169 elementQueue = new ArrayList<Element>(); | |
170 Element newElement = new Element(name); // element of type: complex | |
171 if (currentElement != null) { | |
172 if (currentElement.composites == null) | |
173 currentElement.composites = new ArrayList<Element>(); | |
174 if (currentElement.lang != null) | |
175 newElement.lang = currentElement.lang; // language is inherited to childs | |
176 currentElement.composites.add(newElement); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
177 newElement.parent = currentElement; |
19 | 178 } |
179 currentElement = newElement; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
180 if (localName != null && localName.equals("pb")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
181 currentPageNumber++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
182 setCurrentPagePosition(localName, 0); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
183 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
184 currentElement.pageNumber = currentPageNumber; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
185 if (localName != null && localName.equals("lb")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
186 currentLineNumber++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
187 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
188 currentElement.lineNumber = currentLineNumber; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
189 currentPosition++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
190 currentElement.docPosition = currentPosition; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
191 int newElemPosition = incrementCurrentPosition(localName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
192 currentElement.position = newElemPosition; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
193 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
194 currentElement.elemPosition = getElementPosition(currentElement); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
195 Element parent = currentElement.parent; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
196 if (parent == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
197 currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
198 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
199 currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
200 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
201 int newElemPagePosition = incrementCurrentPagePosition(localName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
202 currentElement.pagePosition = newElemPagePosition; |
19 | 203 int attrSize = attrs.getLength(); |
204 String attrString = ""; | |
205 for (int i=0; i<attrSize; i++) { | |
206 String attrQName = attrs.getQName(i); | |
207 String attrValue = attrs.getValue(i); | |
208 attrValue = StringUtils.forXML(attrValue); | |
209 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
210 if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) { |
19 | 211 currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
212 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
213 if (attrQName != null && (attrQName.toLowerCase().equals("xml:id") || attrQName.toLowerCase().equals("id"))) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
214 currentElement.xmlId = attrValue; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
215 } |
19 | 216 } |
217 currentElement.attrString = attrString; | |
218 if (! xmlnsString.equals("")) { | |
219 currentElement.xmlnsString = xmlnsString; | |
220 } | |
221 xmlnsString = ""; | |
222 elementQueue.add(currentElement); | |
223 // only the first element is the root element | |
224 if(rootElement == null) | |
225 rootElement = currentElement; | |
226 } | |
227 | |
228 public void endElement(String uri, String localName, String name) throws SAXException { | |
229 if (elementQueue != null && elementQueue.size() > 0) { | |
230 int lastIndex = elementQueue.size() - 1; | |
231 elementQueue.remove(lastIndex); | |
232 } | |
233 if (elementQueue != null && elementQueue.size() > 0) { | |
234 int lastIndex = elementQueue.size() - 1; | |
235 currentElement = elementQueue.get(lastIndex); | |
236 } else { | |
237 currentElement = null; | |
238 } | |
239 } | |
240 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
241 private int incrementCurrentPosition(String elemName) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
242 Integer currentElemPos = currentPositions.get(elemName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
243 if (currentElemPos == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
244 currentElemPos = new Integer(0); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
245 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
246 currentElemPos++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
247 currentPositions.put(elemName, currentElemPos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
248 return currentElemPos.intValue(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
249 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
250 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
251 private int getElementPosition(Element elem) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
252 int pos = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
253 Element parent = elem.parent; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
254 if (parent == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
255 pos = 1; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
256 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
257 pos = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
258 ArrayList<Element> composites = parent.composites; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
259 if (composites != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
260 for (int i=0; i<composites.size(); i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
261 Element e = composites.get(i); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
262 if (e.isComplex() && e.name.equals(elem.name)) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
263 pos++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
264 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
265 if (e == elem) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
266 break; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
267 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
268 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
269 pos = 1; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
270 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
271 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
272 return pos; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
273 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
274 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
275 private int incrementCurrentPagePosition(String elemName) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
276 Integer currentElemPagePos = currentPagePositions.get(elemName); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
277 if (currentElemPagePos == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
278 currentElemPagePos = new Integer(0); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
279 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
280 currentElemPagePos++; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
281 currentPagePositions.put(elemName, currentElemPagePos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
282 return currentElemPagePos.intValue(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
283 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
284 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
285 private void setCurrentPagePosition(String elemName, int pos) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
286 Integer newPagePosition = new Integer(pos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
287 Enumeration<String> elemKeys = currentPagePositions.keys(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
288 while (elemKeys.hasMoreElements()) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
289 String elemKey = elemKeys.nextElement(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
290 currentPagePositions.put(elemKey, newPagePosition); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
291 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
292 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
293 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
294 private boolean isHighlightTerm(String term) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
295 if (term == null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
296 return false; |
19 | 297 boolean result = false; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
298 for (int i=0; i< highlightTerms.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
299 String t = highlightTerms[i].toLowerCase(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
300 String termLowerCase = term.toLowerCase(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
301 if (t.equals(termLowerCase)) |
19 | 302 return true; |
303 } | |
304 return result; | |
305 } | |
306 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
307 private boolean isHighlightTerm(String[] terms) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
308 if (terms == null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
309 return false; |
19 | 310 boolean result = false; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
311 for (int i=0; i< highlightTerms.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
312 String t = highlightTerms[i].toLowerCase(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
313 for (int j=0; j<terms.length; j++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
314 String termLowerCase = terms[j].toLowerCase(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
315 if (t.equals(termLowerCase)) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
316 return true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
317 } |
19 | 318 } |
319 return result; | |
320 } | |
321 | |
322 private void write(String outStr) throws SAXException { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
323 result.append(outStr); |
19 | 324 } |
325 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
326 public class Element implements Comparable<Element> { |
19 | 327 private int type; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
328 public String name; |
19 | 329 private String xmlnsString; |
330 private String attrString; | |
331 private String value; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
332 public String lang; // value of attribute xml:lang or the inherited xml:lang value of the father node |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
333 public String xmlId; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
334 public String xpath; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
335 public int pageNumber; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
336 public int lineNumber; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
337 public int docPosition; // absolute position in document |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
338 public int position; // position within all elements with this name |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
339 public int elemPosition; // position in element e.g. the 6 sentence in paragraph |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
340 public int pagePosition; // position in page |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
341 private ArrayList<Token> tokens = new ArrayList<Token>(); |
19 | 342 private ArrayList<Element> composites; |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
343 private Element parent; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
344 private boolean isStopElement = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
345 private boolean isWordDelimiterElement = true; // default: is word delimiter element |
19 | 346 |
/**
 * Creates a complex element with the given name.
 *
 * @param name the element name
 */
private Element(String name) {
  // same work as the two-argument constructor with the complex type
  this(name, ELEMENT_TYPE_COMPLEX);
}
351 | |
/**
 * Creates an element of the given type; the name is applied through
 * setName so that the stop element and word delimiter flags get derived from it.
 *
 * @param name the element name
 * @param type one of the ELEMENT_TYPE_* constants
 */
private Element(String name, int type) {
  this.type = type;
  this.setName(name);
}
356 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
357 private void setName(String name) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
358 this.name = name; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
359 for (int i=0; i<stopElements.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
360 String stopElementName = stopElements[i]; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
361 if (name.equals(stopElementName)) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
362 this.isStopElement = true; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
363 break; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
364 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
365 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
366 for (int i=0; i<nwbElements.length; i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
367 String nwbElementName = nwbElements[i]; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
368 if (name.equals(nwbElementName)) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
369 this.isWordDelimiterElement = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
370 break; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
371 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
372 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
373 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
374 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
375 public int compareTo(Element elem) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
376 return (new Integer(position)).compareTo(new Integer(elem.position)); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
377 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
378 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
379 private boolean isComplex() { |
19 | 380 boolean isComplex = false; |
381 if (type == ELEMENT_TYPE_COMPLEX) | |
382 isComplex = true; | |
383 return isComplex; | |
384 } | |
385 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
386 public ArrayList<Token> getTokens() { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
387 ArrayList<Token> retTokens = new ArrayList<Token>(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
388 if (isComplex()) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
389 if (composites != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
390 for (int i=0; i<composites.size(); i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
391 Element elem = composites.get(i); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
392 if (elem.tokens != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
393 retTokens.addAll(elem.tokens); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
394 } |
19 | 395 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
396 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
397 if (tokens != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
398 retTokens.addAll(tokens); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
399 return retTokens; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
400 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
401 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
402 public String getTokensStr(String type) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
403 ArrayList<Token> elementTokens = getTokens(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
404 String tokenStr = getTokensStr(type, elementTokens); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
405 return tokenStr; |
19 | 406 } |
407 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
408 private String getTokensStr(String type, ArrayList<Token> tokens) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
409 StringBuilder tokenStr = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
410 for (int j=0; j<tokens.size(); j++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
411 Token token = tokens.get(j); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
412 String content = null; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
413 if (type.equals("orig")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
414 content = token.getContentOrig(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
415 else if (type.equals("reg")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
416 content = token.getContentReg(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
417 else if (type.equals("norm")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
418 content = token.getContentNorm(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
419 else if (type.equals("morph")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
420 content = token.getContentMorph(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
421 if (content != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
422 tokenStr.append(content + " "); |
19 | 423 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
424 return tokenStr.toString(); |
19 | 425 } |
426 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
427 public String toXmlString() throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
428 StringBuilder retStrBuilder = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
429 if (! isComplex()) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
430 retStrBuilder.append(value); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
431 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
432 String xmlNsString = this.xmlnsString; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
433 if (xmlNsString == null || xmlNsString.equals("")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
434 retStrBuilder.append("<" + name + attrString + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
435 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
436 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
437 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
438 if (composites != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
439 for (int i=0; i<composites.size(); i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
440 Element composite = composites.get(i); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
441 if (! composite.isComplex()) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
442 if (composite.value != null && ! composite.value.equals("")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
443 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
444 retStrBuilder.append(compositeValueStr); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
445 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
446 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
447 retStrBuilder.append(composite.toXmlString()); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
448 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
449 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
450 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
451 retStrBuilder.append("</" + name + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
452 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
453 return retStrBuilder.toString(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
454 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
455 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
456 private String buildString() throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
457 StringBuilder retStrBuilder = new StringBuilder(); |
19 | 458 String elemLanguage = language; // default value for the document/page |
459 if (lang != null) | |
460 elemLanguage = lang; // value of the element if available | |
461 // write this element | |
462 if (! isComplex()) { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
463 retStrBuilder.append(value); |
19 | 464 } else { |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
465 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
466 String xmlNsString = this.xmlnsString; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
467 if (xmlNsString == null || xmlNsString.equals("")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
468 retStrBuilder.append("<" + name + attrString + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
469 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
470 retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
471 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
472 } else { // outputFormat == string |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
473 // nothing |
19 | 474 } |
475 if (composites != null) { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
476 StringBuilder compositesCharsWithMarks = new StringBuilder(); |
19 | 477 ArrayList<Element> complexElements = new ArrayList<Element>(); |
478 for (int i=0; i<composites.size(); i++) { | |
479 Element composite = composites.get(i); | |
480 if (! composite.isComplex()) { | |
481 if (composite.value != null && ! composite.value.equals("")) { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
482 String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
483 compositesCharsWithMarks.append(compositeValueStr); |
19 | 484 } |
485 } else { | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
486 if (! composite.isWordDelimiterElement) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
487 compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK); // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) |
19 | 488 } else { |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
489 compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK); // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) |
19 | 490 } |
491 complexElements.add(composite); | |
492 } | |
493 } | |
494 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") | |
495 String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); | |
496 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values | |
497 if (complexElements.size() > 0) { | |
498 for (int i=0; i<complexElements.size(); i++) { | |
499 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); | |
500 Element complexElem = complexElements.get(i); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
501 String complexElementStr = complexElem.buildString(); |
19 | 502 String firstPiece = ""; |
503 if (indexComplexElemCompositesCharsWithMarks > 0) { | |
504 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); | |
505 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); | |
506 } | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
507 retStrBuilder.append(firstPiece + complexElementStr); |
19 | 508 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); |
509 } | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
510 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added |
19 | 511 } else { |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
512 retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added |
19 | 513 } |
514 } | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
515 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
516 retStrBuilder.append("</" + name + ">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
517 } else { // outputFormat == string |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
518 // nothing |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
519 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
520 // put element into elements name hashtable |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
521 ArrayList<Element> elems = elements.get(name); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
522 if (elems == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
523 elems = new ArrayList<Element>(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
524 elements.put(name, elems); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
525 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
526 elems.add(this); |
19 | 527 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
528 return retStrBuilder.toString(); |
19 | 529 } |
530 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
531 private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
532 String charactersStrDeresolved = charactersStrDeresolvedBuilder.toString(); |
19 | 533 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
534 StringBuilder retStrBuilder = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
535 Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
536 tokenizer.setLanguage(language); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
537 String[] normFunction = {"norm"}; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
538 tokenizer.setNormFunctions(normFunction); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
539 ArrayList<Token> tokens = tokenizer.getTokens(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
540 int endPos = 0; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
541 for (int i=0; i < tokens.size(); i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
542 Token token = tokens.get(i); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
543 int startPos = token.getStart(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
544 String beforeStr = charactersStr.substring(endPos, startPos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
545 endPos = token.getEnd(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
546 String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
547 String origWordForm = charactersStr.substring(startPos, endPos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
548 String wordTag = insertWordTags(token, language, origWordForm); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
549 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
550 retStrBuilder.append(beforeStrDeresolved + wordTag); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
551 } else { // outputFormat == string |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
552 String beforeStrDeresolvedToBlanks = toBlanks(beforeStrDeresolved); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
553 retStrBuilder.append(beforeStrDeresolvedToBlanks + wordTag); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
554 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
555 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
556 String lastAfterStr = charactersStr.substring(endPos); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
557 String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
558 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
559 retStrBuilder.append(lastAfterStrDeresolved); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
560 } else { // outputFormat == string |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
561 String lastAfterStrDeresolvedToBlanks = toBlanks(lastAfterStrDeresolved); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
562 retStrBuilder.append(lastAfterStrDeresolvedToBlanks); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
563 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
564 return retStrBuilder.toString(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
565 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
566 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
567 private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
568 if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
569 return origWordForm; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
570 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
571 String wordTag = null; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
572 token.setDocId(docId); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
573 token.setLanguage(lang); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
574 token.setPageNumber(pageNumber); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
575 token.setLineNumber(lineNumber); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
576 token.setElementPosition(position); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
577 token.setElementPagePosition(pagePosition); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
578 token.setElementName(name); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
579 token.setXmlId(xmlId); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
580 token.setXpath("xpath"); // TODO |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
581 if (name != null && name.equals("reg")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
582 if (attrString != null && attrString.contains("norm=\"")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
583 int regIndexBegin = attrString.indexOf("norm=\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
584 int regIndexEnd = attrString.indexOf("\"", regIndexBegin + 7); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
585 String reg = attrString.substring(regIndexBegin + 6, regIndexEnd); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
586 token.setContentReg(reg); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
587 String[] normFunction = {"norm"}; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
588 Normalizer normalizer = new Normalizer(normFunction, language); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
589 String normStr = normalizer.normalize(reg); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
590 token.setContentNorm(normStr); |
19 | 591 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
592 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
593 if (language == null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
594 token.setContentOrig(origWordForm); // TODO necessary ? |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
595 tokens.add(token); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
596 resultTokens.add(token); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
597 return origWordForm; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
598 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
599 if (isStopElement && outputFormat != null && outputFormat.equals("xml")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
600 return origWordForm; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
601 if (isStopElement && outputFormat != null && outputFormat.equals("string")) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
602 return toBlanks(origWordForm); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
603 String wordFormNorm = token.getContentNorm(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
604 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
605 ArrayList<Lemma> lemmas = null; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
606 Boolean hasDctionaryEntries = null; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
607 String lemmasStr = ""; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
608 if (withForms || withLemmas) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
609 LexHandler lexHandler = LexHandler.getInstance(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
610 lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false); // Performance: needs 15 % of the indexing time |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
611 if (lemmas != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
612 for (int i=0; i < lemmas.size(); i++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
613 Lemma lemma = lemmas.get(i); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
614 String lemmaName = lemma.getLemmaName(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
615 lemmasStr = lemmasStr + lemmaName + " "; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
616 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
617 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
618 lemmasStr = lemmasStr.trim(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
619 token.setContentMorph(lemmasStr); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
620 hasDctionaryEntries = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
621 ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY); // Performance: needs 15 % of the indexing time |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
622 if (lexEntries != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
623 hasDctionaryEntries = true; |
19 | 624 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
625 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
626 wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDctionaryEntries); // Performance: needs 10 % of the indexing time |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
627 String tokenWordForm = token.getContentOrig(); // word form is in contentOrig |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
628 if (useRegFunction) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
629 tokenWordForm = token.getContentReg(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
630 else if (useNormFunction) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
631 tokenWordForm = token.getContentNorm(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
632 else if (withLemmas) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
633 tokenWordForm = token.getContentMorph(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
634 boolean isHighlightTerm = false; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
635 if (highlightTerms.length > 0 && ! withLemmas) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
636 isHighlightTerm = isHighlightTerm(tokenWordForm); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
637 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
638 if (highlightTerms.length > 0 && lemmas != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
639 String[] lemmasArray = lemmasStr.split(" "); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
640 isHighlightTerm = isHighlightTerm(lemmasArray); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
641 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
642 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
643 if (isHighlightTerm) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
644 wordTag = "<hi>" + wordTag + "</hi>"; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
645 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
646 } else { // outputFormat == string |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
647 String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
648 if (withLemmas) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
649 if (lemmas != null) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
650 String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved); // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
651 wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
652 token.setContentMorph(lemmasStr); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
653 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
654 wordTag = inWordFormWithoutSpecialSymbols; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
655 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
656 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
657 wordTag = inWordFormWithoutSpecialSymbols; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
658 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
659 tokens.add(token); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
660 resultTokens.add(token); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
661 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
662 return wordTag; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
663 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
664 |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
665 private String removeSpecialSymbols(String inputStr) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
666 String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen |
19 | 667 return retStr; |
668 } | |
669 | |
670 /** | |
671 * | |
672 * @param origWordToken could contain nwd marks | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
673 * @param token |
19 | 674 * @param language |
675 * @param lemmas | |
676 * @return for each substring between nwd marks create a word tag | |
677 */ | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
678 private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { |
19 | 679 if (origWordToken.isEmpty()) |
680 return origWordToken; | |
681 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) | |
682 return COMPLEX_ELEMENT_NWD_MARK; | |
683 String retWordTags = ""; | |
684 String origWordTokenTmp = origWordToken; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
685 if (outputFormat != null && outputFormat.equals("xml")) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
686 retWordTags = getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
687 /* |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
688 while (! origWordTokenTmp.isEmpty()) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
689 if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
690 origWordTokenTmp = origWordTokenTmp.substring(1); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
691 retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
692 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
693 int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
694 if (indexUpToNWD != -1) { // not end of string reached |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
695 String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
696 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
697 retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
698 origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
699 } else { // end of string reached |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
700 String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
701 String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
702 retWordTags = retWordTags + origWordTokenFragmentWithTags; |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
703 origWordTokenTmp = ""; // finente |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
704 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
705 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
706 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
707 */ |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
708 } else { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
709 // nothing |
19 | 710 } |
711 return retWordTags; | |
712 } | |
713 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
714 private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { |
19 | 715 if (origWordForm == null || origWordForm.isEmpty()) |
716 return ""; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
717 String wordForm = token.getContentOrig(); // word form is in contentOrig |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
718 String regularizedWordForm = token.getContentReg(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
719 String normalizedWordForm = token.getContentNorm(); |
19 | 720 String langISOCode = Language.getInstance().getISO639Code(language); |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
721 StringBuilder retStrBuilder = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
722 retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
723 if (regularizedWordForm != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
724 retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
725 if (normalizedWordForm != null) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
726 retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\""); |
19 | 727 if (lemmas != null) { |
728 String lemmasStr = ""; | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
729 StringBuilder formsStrBuilder = new StringBuilder(); |
19 | 730 Collections.sort(lemmas); |
731 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); | |
732 for (int i=0; i < lemmas.size(); i++) { | |
733 Lemma lemma = lemmas.get(i); | |
734 ArrayList<Form> lemmaForms = lemma.getFormsList(); | |
735 for (int j=0; j < lemmaForms.size(); j++) { | |
736 Form form = lemmaForms.get(j); | |
737 formsHashtable.put(form.getFormName(), form); | |
738 } | |
739 String lemmaName = lemma.getLemmaName(); | |
740 lemmasStr = lemmasStr + lemmaName + " "; | |
741 } | |
742 ArrayList<Form> forms = new ArrayList<Form>(); | |
743 forms.addAll(formsHashtable.values()); | |
744 Collections.sort(forms); | |
745 for (int i=0; i < forms.size(); i++) { | |
746 Form form = forms.get(i); | |
747 String formName = form.getFormName(); | |
748 formName = StringUtils.forXML(formName); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
749 formsStrBuilder.append(formName + " "); |
19 | 750 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
751 String formsStr = formsStrBuilder.toString(); |
19 | 752 if (formsStr.endsWith(" ")) |
753 formsStr = formsStr.substring(0, formsStr.length() - 1); | |
754 if (lemmasStr.endsWith(" ")) | |
755 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
756 if (withForms) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
757 retStrBuilder.append(" forms=\"" + formsStr + "\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
758 if (withLemmas) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
759 retStrBuilder.append(" lemmas=\"" + lemmasStr + "\""); |
19 | 760 } |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
761 if (hasDictionaryEntries != null && hasDictionaryEntries) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
762 retStrBuilder.append(" dictionary=\"" + "true" + "\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
763 } else if (hasDictionaryEntries != null && ! hasDictionaryEntries) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
764 retStrBuilder.append(" dictionary=\"" + "false" + "\""); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
765 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
766 retStrBuilder.append(">"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
767 retStrBuilder.append(origWordForm); // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
768 retStrBuilder.append("</w>"); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
769 return retStrBuilder.toString(); |
19 | 770 } |
771 | |
23
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
772 private String toBlanks(String inputStr) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
773 int size = inputStr.length(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
774 StringBuilder retStrBuilder = new StringBuilder(); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
775 for (int j=0; j < size; j++) { |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
776 char c = inputStr.charAt(j); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
777 if (c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0)) |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
778 retStrBuilder.append(c); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
779 else |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
780 retStrBuilder.append(" "); |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
781 } |
e845310098ba
diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
20
diff
changeset
|
782 return retStrBuilder.toString(); |
19 | 783 } |
784 | |
785 } | |
786 } |