comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children fba5577e49d9
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
100 100
101 private String normalize(String charactersStr) throws SAXException { 101 private String normalize(String charactersStr) throws SAXException {
102 String retStr = ""; 102 String retStr = "";
103 try { 103 try {
104 MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); 104 MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
105 tokenizerAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later
105 ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr); 106 ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
106 int endPos = 0; 107 int endPos = 0;
107 for (int i=0; i < wordTokens.size(); i++) { 108 for (int i=0; i < wordTokens.size(); i++) {
108 Token wordToken = wordTokens.get(i); 109 Token wordToken = wordTokens.get(i);
109 int startPos = wordToken.startOffset(); 110 int startPos = wordToken.startOffset();
110 String beforeStr = charactersStr.substring(endPos, startPos); 111 String beforeStr = charactersStr.substring(endPos, startPos);
111 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); 112 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
112 endPos = wordToken.endOffset(); 113 endPos = wordToken.endOffset();
113 String wordStr = charactersStr.substring(startPos, endPos); 114 String wordStr = charactersStr.substring(startPos, endPos);
114
115 MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); 115 MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
116 mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS);
116 String normalizedWordStr = mpdlNormalizer.normalize(wordStr); 117 String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
117
118 String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); 118 String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr);
119 // String wordTokenText = wordToken.termText(); 119 // String wordTokenText = wordToken.termText();
120 retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; 120 retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved;
121 } 121 }
122 String lastAfterStr = charactersStr.substring(endPos); 122 String lastAfterStr = charactersStr.substring(endPos);