Mercurial > hg > mpdl-group
comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java @ 16:257f67be5c00
diverse Fehlerbehebungen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Sep 2011 16:40:57 +0200 |
parents | 5df60f24e997 |
children |
comparison
equal
deleted
inserted
replaced
15:e99964f390e4 | 16:257f67be5c00 |
---|---|
6 import org.xml.sax.*; | 6 import org.xml.sax.*; |
7 | 7 |
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; | 9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; |
10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; | 10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; |
| | 11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
11 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; | 12 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; |
12 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; | 13 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; |
13 | 14 |
14 public class NormDictContentHandler implements ContentHandler { | 15 public class NormDictContentHandler implements ContentHandler { |
15 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element | 16 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element |
46 | 47 |
47 public void startDocument() throws SAXException { | 48 public void startDocument() throws SAXException { |
48 } | 49 } |
49 | 50 |
50 public void endDocument() throws SAXException { | 51 public void endDocument() throws SAXException { |
51 String rootElemToStr = rootElement.toXmlString(); | 52 try { |
52 write(rootElemToStr); | 53 String rootElemToStr = rootElement.toXmlString(); |
53 write("\n"); | 54 // hack: in echo documents the spaces between sentences should be removed |
| | 55 if (rootElemToStr != null && rootElemToStr.startsWith("<echo") && Language.getInstance().isChinese(language)) { |
| | 56 rootElemToStr = rootElemToStr.replaceAll("</s>[ \n\t]+<s", "</s><s"); |
| | 57 } |
| | 58 write(rootElemToStr); |
| | 59 write("\n"); |
| | 60 } catch (NullPointerException e) { |
| | 61 throw new SAXException(e); |
| | 62 } |
54 } | 63 } |
55 | 64 |
56 public void characters(char[] c, int start, int length) throws SAXException { | 65 public void characters(char[] c, int start, int length) throws SAXException { |
57 char[] cCopy = new char[length]; | 66 char[] cCopy = new char[length]; |
58 System.arraycopy(c, start, cCopy, 0, length); | 67 System.arraycopy(c, start, cCopy, 0, length); |
171 */ | 180 */ |
172 private boolean isWordDelimiterElement() { | 181 private boolean isWordDelimiterElement() { |
173 boolean isWordDelimiterElement = true; | 182 boolean isWordDelimiterElement = true; |
174 // "note" causes problems: word after the note is not recognized | 183 // "note" causes problems: word after the note is not recognized |
175 // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte" | 184 // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte" |
176 if (name.equals("lb") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor")) | 185 if (name.equals("lb") || name.equals("br") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor")) |
177 isWordDelimiterElement = false; | 186 isWordDelimiterElement = false; |
178 return isWordDelimiterElement; | 187 return isWordDelimiterElement; |
179 } | 188 } |
180 | 189 |
181 private String toXmlString() throws SAXException { | 190 private String toXmlString() throws SAXException { |
200 Element composite = composites.get(i); | 209 Element composite = composites.get(i); |
201 if (! composite.isComplex()) { | 210 if (! composite.isComplex()) { |
202 if (composite.value != null && ! composite.value.equals("")) { | 211 if (composite.value != null && ! composite.value.equals("")) { |
203 String compositeValueStr = composite.value; | 212 String compositeValueStr = composite.value; |
204 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. | 213 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. |
205 compositeValueStr = compositeValueStr.replaceAll(" +", " "); // if there are many Blanks make them to one | 214 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one |
206 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; | 215 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; |
207 } | 216 } |
208 } else { | 217 } else { |
209 if (! composite.isWordDelimiterElement()) { | 218 if (! composite.isWordDelimiterElement()) { |
210 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) | 219 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) |
249 mpdlNormalizer.setNormMode(MpdlNormalizer.DICTIONARY); | 258 mpdlNormalizer.setNormMode(MpdlNormalizer.DICTIONARY); |
250 } else { | 259 } else { |
251 mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY); | 260 mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY); |
252 } | 261 } |
253 MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language); | 262 MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language); |
254 tokenAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later | |
255 ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr); | 263 ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr); |
256 int endPos = 0; | 264 int endPos = 0; |
257 for (int i=0; i < wordTokens.size(); i++) { | 265 for (int i=0; i < wordTokens.size(); i++) { |
258 Token wordToken = wordTokens.get(i); | 266 Token wordToken = wordTokens.get(i); |
259 int startPos = wordToken.startOffset(); | 267 int startPos = wordToken.startOffset(); |
333 for (int j=0; j<lexEntryKeys.size(); j++) { | 341 for (int j=0; j<lexEntryKeys.size(); j++) { |
334 String lexEntryKey = lexEntryKeys.get(j); | 342 String lexEntryKey = lexEntryKeys.get(j); |
335 lexForms = lexForms + lexEntryKey + " "; | 343 lexForms = lexForms + lexEntryKey + " "; |
336 } | 344 } |
337 lexForms = lexForms.substring(0, lexForms.length() - 1); | 345 lexForms = lexForms.substring(0, lexForms.length() - 1); |
338 lexWord = "<w lang=\"" + language + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>"; | 346 lexWord = "<w lang=\"" + lang + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>"; |
339 } else { | 347 } else { |
340 lexWord = displayWordDeresolved; | 348 lexWord = displayWordDeresolved; |
341 } | 349 } |
342 return lexWord; | 350 return lexWord; |
343 } | 351 } |