Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 20:7d6d969b10cf
little corrections
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 14 Dec 2011 12:48:43 +0100 |
parents | 4a3641ae14d2 |
children | e845310098ba |
comparison
equal
deleted
inserted
replaced
19:4a3641ae14d2 | 20:7d6d969b10cf |
---|---|
10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 10 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
11 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; | 11 import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; |
12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | 12 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; | 13 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; |
14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; | 14 import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; |
 | 15 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; |
15 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; | 16 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; |
16 | 17 |
17 public class XmlTokenizerContentHandler implements ContentHandler { | 18 public class XmlTokenizerContentHandler implements ContentHandler { |
18 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element | 19 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element |
19 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element | 20 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element |
327 wordForm = wordForm.toLowerCase(); | 328 wordForm = wordForm.toLowerCase(); |
328 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); | 329 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); |
329 ArrayList<Lemma> lemmas = null; | 330 ArrayList<Lemma> lemmas = null; |
330 if (withForms() || withLemmas()) { | 331 if (withForms() || withLemmas()) { |
331 LexHandler lexHandler = LexHandler.getInstance(); | 332 LexHandler lexHandler = LexHandler.getInstance(); |
332 lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); | 333 lemmas = lexHandler.getLemmas(wordForm, "form", language, Normalizer.NONE); |
333 } | 334 } |
334 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); | 335 wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); |
335 return wordTag; | 336 return wordTag; |
336 } | 337 } |
337 | 338 |