comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 6:2396a569e446

new functions: externalObjects, normalizer, Unicode2Betacode
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 08 Feb 2011 14:54:09 +0100
parents 408254cf2f1d
children 59ff47d1e237
comparison
equal deleted inserted replaced
5:94305c504178 6:2396a569e446
13 public class DictionarizerContentHandler implements ContentHandler { 13 public class DictionarizerContentHandler implements ContentHandler {
14 private static String MARK = "COMPLEXELEMENTTTTT"; 14 private static String MARK = "COMPLEXELEMENTTTTT";
15 private static int MARK_SIZE = MARK.length(); 15 private static int MARK_SIZE = MARK.length();
16 private static int ELEMENT_TYPE_CHARACTERS = 1; 16 private static int ELEMENT_TYPE_CHARACTERS = 1;
17 private static int ELEMENT_TYPE_COMPLEX = 2; 17 private static int ELEMENT_TYPE_COMPLEX = 2;
18 private static String SPECIAL_NOT_WORD_DELIM_SYMBOL = new Character('\u2424').toString();
18 private String xmlnsString = ""; 19 private String xmlnsString = "";
19 private String language; 20 private String language;
20 private String outputXmlFragment = ""; 21 private String outputXmlFragment = "";
21 private Element rootElement; 22 private Element rootElement;
22 private Element currentElement; 23 private Element currentElement;
172 private boolean isComplex() { 173 private boolean isComplex() {
173 boolean isComplex = false; 174 boolean isComplex = false;
174 if (type == ELEMENT_TYPE_COMPLEX) 175 if (type == ELEMENT_TYPE_COMPLEX)
175 isComplex = true; 176 isComplex = true;
176 return isComplex; 177 return isComplex;
178 }
179
180 /**
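181 * Tells whether this element separates words. The elements named in the check below (lb, cb, gap, figure, image, note, handwritten, anchor) do not separate words; feel free to add or remove element names in that list.
182 * @return true if the element is a word delimiter element, otherwise false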
183 */
184 private boolean isWordDelimiterElement() {
185 boolean isWordDelimiterElement = true;
186 if (name.equals("lb") || name.equals("cb") || name.equals("gap") || name.equals("figure") || name.equals("image") || name.equals("note") || name.equals("handwritten") || name.equals("anchor"))
187 isWordDelimiterElement = false;
188 return isWordDelimiterElement;
177 } 189 }
178 190
179 private String toXmlString() throws SAXException { 191 private String toXmlString() throws SAXException {
180 String retString = ""; 192 String retString = "";
181 String elemLanguage = language; // default value for the document/page 193 String elemLanguage = language; // default value for the document/page
198 for (int i=0; i<composites.size(); i++) { 210 for (int i=0; i<composites.size(); i++) {
199 Element composite = composites.get(i); 211 Element composite = composites.get(i);
200 if (! composite.isComplex()) { 212 if (! composite.isComplex()) {
201 if (composite.value != null && ! composite.value.equals("")) { 213 if (composite.value != null && ! composite.value.equals("")) {
202 String compositeValueStr = composite.value; 214 String compositeValueStr = composite.value;
203 compositesChars += compositeValueStr; 215 compositesChars = compositesChars + compositeValueStr;
204 compositesCharsWithMarks += compositeValueStr; 216 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
205 } 217 }
206 } else { 218 } else {
219 if (! composite.isWordDelimiterElement()) {
220 compositesChars = compositesChars + SPECIAL_NOT_WORD_DELIM_SYMBOL; // add a special symbol at the position of the "not word delimiter element" (e.g. line break)
221 }
207 complexElements.add(composite); 222 complexElements.add(composite);
208 compositesCharsWithMarks += MARK; 223 compositesCharsWithMarks += MARK;
209 } 224 }
210 } 225 }
211 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); 226 String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage);
227 compositesChars = compositesChars.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
228 compositesCharsDictionarized = compositesCharsDictionarized.replaceAll(SPECIAL_NOT_WORD_DELIM_SYMBOL, "");
212 if (complexElements.size() > 0) { 229 if (complexElements.size() > 0) {
213 for (int i=0; i<complexElements.size(); i++) { 230 for (int i=0; i<complexElements.size(); i++) {
214 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK); 231 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK);
215 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks); 232 int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks);
216 Element complexElem = complexElements.get(i); 233 Element complexElem = complexElements.get(i);
249 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); 266 String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
250 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr); 267 String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr);
251 String wordTokenText = wordToken.termText(); 268 String wordTokenText = wordToken.termText();
252 LexHandler lexHandler = LexHandler.getInstance(); 269 LexHandler lexHandler = LexHandler.getInstance();
253 // delivers lex entries with the help of the morphology component (lex entry of the stem of the normalized word form) 270 // delivers lex entries with the help of the morphology component (lex entry of the stem of the normalized word form)
254 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false); 271 String wordTokenTextWithoutSpecialSymbols = removeSpecialSymbols(wordTokenText);
272 ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenTextWithoutSpecialSymbols, language, false);
255 if (lexEntryKeys != null) { 273 if (lexEntryKeys != null) {
256 String lexForms = ""; 274 String lexForms = "";
257 for (int j=0; j<lexEntryKeys.size(); j++) { 275 for (int j=0; j<lexEntryKeys.size(); j++) {
258 String lexEntryKey = lexEntryKeys.get(j); 276 String lexEntryKey = lexEntryKeys.get(j);
259 lexForms = lexForms + lexEntryKey + " "; 277 lexForms = lexForms + lexEntryKey + " ";
260 } 278 }
261 lexForms = lexForms.substring(0, lexForms.length() - 1); 279 lexForms = lexForms.substring(0, lexForms.length() - 1);
262 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenText + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>"; 280 retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenTextWithoutSpecialSymbols + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>";
263 } else { 281 } else {
264 retStr = retStr + beforeStrDeresolved + wordStrDeresolved; 282 retStr = retStr + beforeStrDeresolved + wordStrDeresolved;
265 } 283 }
266 } 284 }
267 String lastAfterStr = charactersStr.substring(endPos); 285 String lastAfterStr = charactersStr.substring(endPos);
270 } catch (ApplicationException e) { 288 } catch (ApplicationException e) {
271 throw new SAXException(e); 289 throw new SAXException(e);
272 } 290 }
273 return retStr; 291 return retStr;
274 } 292 }
293
294 private String removeSpecialSymbols(String inputStr) {
295 String retStr = inputStr.replaceAll(" ", "");
296 retStr = retStr.replaceAll("\n", "");
297 retStr = retStr.replaceAll("-", "");
298 return retStr;
299 }
275 } 300 }
276 } 301 }