Mercurial > hg > mpdl-group
diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | fba5577e49d9 |
line wrap: on
line diff
```diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java Tue Feb 08 14:54:09 2011 +0100
@@ -102,6 +102,7 @@
  String retStr = "";
  try {
  MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
+ tokenizerAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later
  ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
  int endPos = 0;
  for (int i=0; i < wordTokens.size(); i++) {
@@ -111,10 +112,9 @@
  String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
  endPos = wordToken.endOffset();
  String wordStr = charactersStr.substring(startPos, endPos);
- MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
+ mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS);
  String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
- String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr);
  // String wordTokenText = wordToken.termText();
  retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved;
```