Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 6:2396a569e446
new functions: externalObjects, normalizer, Unicode2Betacode
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 08 Feb 2011 14:54:09 +0100 |
parents | 408254cf2f1d |
children | fba5577e49d9 |
comparison
Legend: equal | deleted | inserted | replaced
5:94305c504178 | 6:2396a569e446 |
---|---|
100 | 100 |
101 private String normalize(String charactersStr) throws SAXException { | 101 private String normalize(String charactersStr) throws SAXException { |
102 String retStr = ""; | 102 String retStr = ""; |
103 try { | 103 try { |
104 MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); | 104 MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); |
105 tokenizerAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later | |
105 ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr); | 106 ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr); |
106 int endPos = 0; | 107 int endPos = 0; |
107 for (int i=0; i < wordTokens.size(); i++) { | 108 for (int i=0; i < wordTokens.size(); i++) { |
108 Token wordToken = wordTokens.get(i); | 109 Token wordToken = wordTokens.get(i); |
109 int startPos = wordToken.startOffset(); | 110 int startPos = wordToken.startOffset(); |
110 String beforeStr = charactersStr.substring(endPos, startPos); | 111 String beforeStr = charactersStr.substring(endPos, startPos); |
111 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); | 112 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); |
112 endPos = wordToken.endOffset(); | 113 endPos = wordToken.endOffset(); |
113 String wordStr = charactersStr.substring(startPos, endPos); | 114 String wordStr = charactersStr.substring(startPos, endPos); |
114 | |
115 MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); | 115 MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); |
116 mpdlNormalizer.setNormMode(MpdlNormalizer.MODE_4HUMAN_READERS); | |
116 String normalizedWordStr = mpdlNormalizer.normalize(wordStr); | 117 String normalizedWordStr = mpdlNormalizer.normalize(wordStr); |
117 | |
118 String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); | 118 String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); |
119 // String wordTokenText = wordToken.termText(); | 119 // String wordTokenText = wordToken.termText(); |
120 retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; | 120 retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; |
121 } | 121 } |
122 String lastAfterStr = charactersStr.substring(endPos); | 122 String lastAfterStr = charactersStr.substring(endPos); |