Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | 59ff47d1e237 |
comparison
equal
deleted
inserted
replaced
5:94305c504178 | 6:2396a569e446 |
---|---|
13 public class DictionarizerContentHandler implements ContentHandler { | 13 public class DictionarizerContentHandler implements ContentHandler { |
14 private static String MARK = "COMPLEXELEMENTTTTT"; | 14 private static String MARK = "COMPLEXELEMENTTTTT"; |
15 private static int MARK_SIZE = MARK.length(); | 15 private static int MARK_SIZE = MARK.length(); |
16 private static int ELEMENT_TYPE_CHARACTERS = 1; | 16 private static int ELEMENT_TYPE_CHARACTERS = 1; |
17 private static int ELEMENT_TYPE_COMPLEX = 2; | 17 private static int ELEMENT_TYPE_COMPLEX = 2; |
18 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString(); | |
18 private String xmlnsString = ""; | 19 private String xmlnsString = ""; |
19 private String language; | 20 private String language; |
20 private String outputXmlFragment = ""; | 21 private String outputXmlFragment = ""; |
21 private Element rootElement; | 22 private Element rootElement; |
22 private Element currentElement; | 23 private Element currentElement; |
172 private boolean isComplex() { | 173 private boolean isComplex() { |
173 boolean isComplex = false; | 174 boolean isComplex = false; |
174 if (type == ELEMENT_TYPE_COMPLEX) | 175 if (type == ELEMENT_TYPE_COMPLEX) |
175 isComplex = true; | 176 isComplex = true; |
176 return isComplex; | 177 return isComplex; |
178 } | |
179 | |
180 /** | |
181 * feel free to add/remove some element names | |
182 * @return true if element is a word delimiter element else false | |
183 */ | |
184 private boolean isWordDelimiterElement() { | |
185 boolean isWordDelimiterElement = true; | |
186 if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor")) | |
187 isWordDelimiterElement = false; | |
188 return isWordDelimiterElement; | |
177 } | 189 } |
178 | 190 |
179 private String toXmlString() throws SAXException { | 191 private String toXmlString() throws SAXException { |
180 String retString = ""; | 192 String retString = ""; |
181 String elemLanguage = language; // default value for the document/page | 193 String elemLanguage = language; // default value for the document/page |
198 for (int i=0; i<composites.size(); i++) { | 210 for (int i=0; i<composites.size(); i++) { |
199 Element composite = composites.get(i); | 211 Element composite = composites.get(i); |
200 if (! composite.isComplex()) { | 212 if (! composite.isComplex()) { |
201 if (composite.value != null && ! composite.value.equals("")) { | 213 if (composite.value != null && ! composite.value.equals("")) { |
202 String compositeValueStr = composite.value; | 214 String compositeValueStr = composite.value; |
203 compositesChars += compositeValueStr; | 215 compositesChars = compositesChars + compositeValueStr; |
204 compositesCharsWithMarks += compositeValueStr; | 216 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; |
205 } | 217 } |
206 } else { | 218 } else { |
219 if (! composite.isWordDelimiterElement()) { | |
220 compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL; // add a special symbol at the position of the "not word delimiter element" (e.g. line break) | |
221 } | |
207 complexElements.add(composite); | 222 complexElements.add(composite); |
208 compositesCharsWithMarks += MARK; | 223 compositesCharsWithMarks += MARK; |
209 } | 224 } |
210 } | 225 } |
211 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); | 226 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); |
227 compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); | |
228 compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, ""); | |
212 if (complexElements.size() > 0) { | 229 if (complexElements.size() > 0) { |
213 for (int i=0; i<complexElements.size(); i++) { | 230 for (int i=0; i<complexElements.size(); i++) { |
214 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK); | 231 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK); |
215 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks); | 232 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks); |
216 Element complexElem = complexElements.get(i); | 233 Element complexElem = complexElements.get(i); |
249 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); | 266 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); |
250 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr); | 267 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr); |
251 String wordTokenText = wordToken.termText(); | 268 String wordTokenText = wordToken.termText(); |
252 LexHandler lexHandler = LexHandler.getInstance(); | 269 LexHandler lexHandler = LexHandler.getInstance(); |
253 // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form) | 270 // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form) |
254 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false); | 271 String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText); |
272 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false); | |
255 if (lexEntryKeys != null) { | 273 if (lexEntryKeys != null) { |
256 String lexForms = ""; | 274 String lexForms = ""; |
257 for (int j=0; j<lexEntryKeys.size(); j++) { | 275 for (int j=0; j<lexEntryKeys.size(); j++) { |
258 String lexEntryKey = lexEntryKeys.get(j); | 276 String lexEntryKey = lexEntryKeys.get(j); |
259 lexForms = lexForms + lexEntryKey + " "; | 277 lexForms = lexForms + lexEntryKey + " "; |
260 } | 278 } |
261 lexForms = lexForms.substring(0, lexForms.length() - 1); | 279 lexForms = lexForms.substring(0, lexForms.length() - 1); |
262 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenText + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>"; | 280 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenTextWithoutSpecialSymbols + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>"; |
263 } else { | 281 } else { |
264 retStr = retStr + beforeStrDeresolved + wordStrDeresolved; | 282 retStr = retStr + beforeStrDeresolved + wordStrDeresolved; |
265 } | 283 } |
266 } | 284 } |
267 String lastAfterStr = charactersStr.substring(endPos); | 285 String lastAfterStr = charactersStr.substring(endPos); |
270 } catch (ApplicationException e) { | 288 } catch (ApplicationException e) { |
271 throw new SAXException(e); | 289 throw new SAXException(e); |
272 } | 290 } |
273 return retStr; | 291 return retStr; |
274 } | 292 } |
293 | |
294 private String removeSpecialSymbols(String inputStr) { | |
295 String retStr = inputStr.replaceAll(" ", ""); | |
296 retStr = retStr.replaceAll("\n", ""); | |
297 retStr = retStr.replaceAll("-", ""); | |
298 return retStr; | |
299 } | |
275 } | 300 } |
276 } | 301 } |