diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java	Tue Nov 27 12:35:19 2012 +0100
@@ -0,0 +1,159 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
+
+import org.xml.sax.*;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
+import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
+
+public class WordContentHandler implements ContentHandler {
+  private static String DEFAULT_LANGUAGE = "eng";
+  private String xmlnsString = "";
+  private StringBuilder resultStrBuilder = new StringBuilder();
+  private String language = DEFAULT_LANGUAGE;
+  private String formRegularized;
+  private int wordLevelCounter = 0;
+  private String wordElemContent = "";
+  private String wordElementName = "w";
+  
+  public String getResult() {
+    return resultStrBuilder.toString();  
+  }
+
+  public void startDocument() throws SAXException {
+  }
+
+  public void endDocument() throws SAXException {
+  }
+  
+  public void characters(char[] c, int start, int length) throws SAXException {
+    char[] cCopy = new char[length];
+    System.arraycopy(c, start, cCopy, 0, length);
+    String charactersStr = String.valueOf(cCopy);
+    if (charactersStr != null && ! charactersStr.equals("")) {
+      charactersStr = StringUtils.deresolveXmlEntities(charactersStr);
+      write(charactersStr);
+    }
+  }
+
+  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
+  }
+
+  public void processingInstruction(String target, String data) throws SAXException {
+  }
+
+  public void setDocumentLocator(Locator locator) {
+  }
+
+  public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    if (prefix != null && prefix.equals(""))  
+      xmlnsString += "xmlns" + "=\"" + uri + "\" ";
+    else
+      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
+  }
+  
+  public void endPrefixMapping(String prefix) throws SAXException {
+  }
+
+  public void skippedEntity(String name) throws SAXException {
+  }
+
+  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
+    int attrSize = attrs.getLength();
+    String attrString = "";
+    for (int i=0; i<attrSize; i++) {
+      String attrQName = attrs.getQName(i);
+      String attrValue = attrs.getValue(i);
+      attrValue = StringUtils.forXML(attrValue);
+      if (localName != null && localName.equals(wordElementName) && (attrQName.equals("lang") || attrQName.equals("xml:lang")))
+        language = attrValue;
+      if (localName != null && localName.equals(wordElementName) && attrQName.equals("formRegularized") && attrValue != null && ! attrValue.isEmpty())
+        formRegularized = attrValue;
+      attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
+    }
+    if (attrString != null && ! attrString.isEmpty()) {
+      attrString = attrString.trim();
+    }
+    if (xmlnsString != null && ! xmlnsString.isEmpty()) {
+      xmlnsString = xmlnsString.trim();
+    }
+    // start all elements but no word elements within word elements (level > 0)
+    if (localName != null && (! localName.equals(wordElementName) || (localName.equals(wordElementName) && wordLevelCounter == 0))) {
+      write("<" + name);
+      if (xmlnsString != null && ! xmlnsString.isEmpty())
+        write(" " + xmlnsString);
+      if (attrString != null && ! attrString.isEmpty())
+        write(" " + attrString);
+      write(">");
+    }
+    xmlnsString = "";
+    if (localName != null && localName.equals(wordElementName)) {
+      wordLevelCounter++;
+    }
+  }
+
+  public void endElement(String uri, String localName, String name) throws SAXException {
+    try {
+      if (localName != null && localName.equals(wordElementName)) {
+        wordLevelCounter--;
+      }
+      // special handling of word elements (with level 0): insert orig, reg and norm attributes
+      if (localName != null && localName.equals(wordElementName) && wordLevelCounter == 0) {
+        // handle formRegularized
+        String newWordElemContentReg = "";
+        if (formRegularized == null || formRegularized.isEmpty()) {
+          newWordElemContentReg = wordElemContent;
+        } else if (formRegularized.contains("- ")) {
+          String[] wordParts = formRegularized.split("- ");
+          for (int i=0; i<wordParts.length - 1; i++) {
+            String wp = wordParts[i];
+            newWordElemContentReg = newWordElemContentReg + wp + "-<lb/>";
+          }
+          newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one
+        } else if (formRegularized.contains(" ")) {
+          String[] wordParts = formRegularized.split(" ");
+          for (int i=0; i<wordParts.length - 1; i++) {
+            String wp = wordParts[i];
+            newWordElemContentReg = newWordElemContentReg + wp + "<lb/>";
+          }
+          newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one
+        } else {
+          newWordElemContentReg = formRegularized;
+        }
+        // handle normalized word form
+        String[] norm = {"norm"};
+        Normalizer normNormalizer = new Normalizer(norm, language);
+        String newWordElemContentNorm = null; 
+        if (formRegularized == null)
+          newWordElemContentNorm = normNormalizer.normalize(wordElemContent);
+        else 
+          newWordElemContentNorm = normNormalizer.normalize(newWordElemContentReg);
+        // write full word content (including lb etc.) into elements orig, reg and norm
+        write("<orig>" + wordElemContent + "</orig>");
+        write("<reg>" + newWordElemContentReg + "</reg>");
+        write("<norm>" + newWordElemContentNorm + "</norm>");
+        write("</" + name + ">");
+        formRegularized = null;
+        wordElemContent = "";
+      } else if (localName != null && localName.equals(wordElementName) && wordLevelCounter > 0) {
+        // nothing: remove word elements within word elements (level > 0)
+      } else {  
+        write("</" + name + ">");
+      }
+    } catch (ApplicationException e) {
+      throw new SAXException(e);
+    }
+  }
+
+  private void write(String outStr) throws SAXException {
+    if (wordLevelCounter > 0)
+      writeWordElemContent(outStr);
+    else 
+      resultStrBuilder.append(outStr);
+  }
+  
+  private void writeWordElemContent(String outStr) throws SAXException {
+    wordElemContent = wordElemContent + outStr;
+  }
+
+}