diff software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children fba5577e49d9
line wrap: on
line diff
--- a/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Feb 08 14:36:38 2011 +0100
+++ b/software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java	Tue Feb 08 14:54:09 2011 +0100
@@ -102,6 +102,7 @@
     String retStr = "";
     try {
       MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
+      tokenizerAnalyzer.setRegWithoutSemicolon(true);  // hack: feel free to remove it later
       ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
       int endPos = 0;
       for (int i=0; i < wordTokens.size(); i++) {
@@ -111,10 +112,9 @@
         String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
         endPos = wordToken.endOffset();
         String wordStr = charactersStr.substring(startPos, endPos);
-
         MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
+        mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS);
         String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
-
         String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr);
         // String wordTokenText = wordToken.termText();
         retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved;