comparison software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormDictContentHandler.java @ 16:257f67be5c00

diverse Fehlerbehebungen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Sep 2011 16:40:57 +0200
parents 5df60f24e997
children
comparison
equal deleted inserted replaced
15:e99964f390e4 16:257f67be5c00
6 import org.xml.sax.*; 6 import org.xml.sax.*;
7 7
8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 8 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; 9 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; 10 import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
11 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
11 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; 12 import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
12 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; 13 import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;
13 14
14 public class NormDictContentHandler implements ContentHandler { 15 public class NormDictContentHandler implements ContentHandler {
15 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element 16 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element
46 47
47 public void startDocument() throws SAXException { 48 public void startDocument() throws SAXException {
48 } 49 }
49 50
50 public void endDocument() throws SAXException { 51 public void endDocument() throws SAXException {
51 String rootElemToStr = rootElement.toXmlString(); 52 try {
52 write(rootElemToStr); 53 String rootElemToStr = rootElement.toXmlString();
53 write("\n"); 54 // hack: in echo documents the spaces between sentences should be removed
55 if (rootElemToStr != null && rootElemToStr.startsWith("<echo") && Language.getInstance().isChinese(language)) {
56 rootElemToStr = rootElemToStr.replaceAll("</s>[ \n\t]+<s", "</s><s");
57 }
58 write(rootElemToStr);
59 write("\n");
60 } catch (NullPointerException e) {
61 throw new SAXException(e);
62 }
54 } 63 }
55 64
56 public void characters(char[] c, int start, int length) throws SAXException { 65 public void characters(char[] c, int start, int length) throws SAXException {
57 char[] cCopy = new char[length]; 66 char[] cCopy = new char[length];
58 System.arraycopy(c, start, cCopy, 0, length); 67 System.arraycopy(c, start, cCopy, 0, length);
171 */ 180 */
172 private boolean isWordDelimiterElement() { 181 private boolean isWordDelimiterElement() {
173 boolean isWordDelimiterElement = true; 182 boolean isWordDelimiterElement = true;
174 // "note" causes problems: word after the note is not recognized 183 // "note" causes problems: word after the note is not recognized
175 // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte" 184 // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte"
176 if (name.equals("lb") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor")) 185 if (name.equals("lb") || name.equals("br") || name.equals("cb") || name.equals("figure") || name.equals("image") || name.equals("handwritten") || name.equals("anchor"))
177 isWordDelimiterElement = false; 186 isWordDelimiterElement = false;
178 return isWordDelimiterElement; 187 return isWordDelimiterElement;
179 } 188 }
180 189
181 private String toXmlString() throws SAXException { 190 private String toXmlString() throws SAXException {
200 Element composite = composites.get(i); 209 Element composite = composites.get(i);
201 if (! composite.isComplex()) { 210 if (! composite.isComplex()) {
202 if (composite.value != null && ! composite.value.equals("")) { 211 if (composite.value != null && ! composite.value.equals("")) {
203 String compositeValueStr = composite.value; 212 String compositeValueStr = composite.value;
204 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. 213 compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words.
205 compositeValueStr = compositeValueStr.replaceAll(" +", " "); // if there are many Blanks make them to one 214 compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one
206 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; 215 compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
207 } 216 }
208 } else { 217 } else {
209 if (! composite.isWordDelimiterElement()) { 218 if (! composite.isWordDelimiterElement()) {
210 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) 219 compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
249 mpdlNormalizer.setNormMode(MpdlNormalizer.DICTIONARY); 258 mpdlNormalizer.setNormMode(MpdlNormalizer.DICTIONARY);
250 } else { 259 } else {
251 mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY); 260 mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY);
252 } 261 }
253 MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language); 262 MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language);
254 tokenAnalyzer.setRegWithoutSemicolon(true); // hack: feel free to remove it later
255 ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr); 263 ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr);
256 int endPos = 0; 264 int endPos = 0;
257 for (int i=0; i < wordTokens.size(); i++) { 265 for (int i=0; i < wordTokens.size(); i++) {
258 Token wordToken = wordTokens.get(i); 266 Token wordToken = wordTokens.get(i);
259 int startPos = wordToken.startOffset(); 267 int startPos = wordToken.startOffset();
333 for (int j=0; j<lexEntryKeys.size(); j++) { 341 for (int j=0; j<lexEntryKeys.size(); j++) {
334 String lexEntryKey = lexEntryKeys.get(j); 342 String lexEntryKey = lexEntryKeys.get(j);
335 lexForms = lexForms + lexEntryKey + " "; 343 lexForms = lexForms + lexEntryKey + " ";
336 } 344 }
337 lexForms = lexForms.substring(0, lexForms.length() - 1); 345 lexForms = lexForms.substring(0, lexForms.length() - 1);
338 lexWord = "<w lang=\"" + language + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>"; 346 lexWord = "<w lang=\"" + lang + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>";
339 } else { 347 } else {
340 lexWord = displayWordDeresolved; 348 lexWord = displayWordDeresolved;
341 } 349 }
342 return lexWord; 350 return lexWord;
343 } 351 }