view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;

/**
 * SAX content handler that re-serializes the parsed document into a string and rewrites
 * word elements (local name {@code w}) on the way.
 *
 * <p>For every outermost word element the start tag is copied as is, and the collected
 * content of the element is emitted three times, wrapped in {@code <orig>}, {@code <reg>}
 * and {@code <norm>} child elements. Word elements nested inside another word element lose
 * their own tags; only their content is kept as part of the outer word.
 *
 * <p>The handler keeps mutable parsing state; use one instance per parse and read the
 * serialized document with {@link #getResult()} afterwards.
 */
public class WordContentHandler implements ContentHandler {
  private static final String DEFAULT_LANGUAGE = "eng";
  /** Local name of the elements that are treated as words. */
  private static final String WORD_ELEMENT_NAME = "w";

  /** Namespace declarations collected since the last start tag, each followed by a blank. */
  private String xmlnsString = "";
  /** Serialized output outside of word elements (plus the rewritten word elements). */
  private final StringBuilder resultStrBuilder = new StringBuilder();
  /**
   * Language handed to the normalizer. It is overwritten by a {@code lang} / {@code xml:lang}
   * attribute of a word element and then stays in effect for all following words until the
   * next such attribute is seen.
   */
  private String language = DEFAULT_LANGUAGE;
  /** Non-empty {@code formRegularized} attribute of the current word, or {@code null}. */
  private String formRegularized;
  /** Nesting depth of open word elements; 0 means "not inside a word". */
  private int wordLevelCounter = 0;
  /** Serialized content of the currently open outermost word element. */
  private final StringBuilder wordElemContent = new StringBuilder();

  /**
   * Returns everything that has been serialized so far.
   *
   * @return the serialized document; content of a still open word element is not included
   */
  public String getResult() {
    return resultStrBuilder.toString();
  }

  /** No-op. */
  @Override
  public void startDocument() throws SAXException {
  }

  /** No-op. */
  @Override
  public void endDocument() throws SAXException {
  }

  /**
   * Copies character data to the output after passing it through
   * {@code StringUtils.deresolveXmlEntities} (presumably re-escaping XML special characters;
   * see that helper for the exact contract). Empty chunks are skipped.
   *
   * @param c buffer holding the characters
   * @param start index of the first character to use
   * @param length number of characters to use
   */
  @Override
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length == 0) {
      return;
    }
    String charactersStr = new String(c, start, length);
    write(StringUtils.deresolveXmlEntities(charactersStr));
  }

  /** No-op: ignorable whitespace is not copied to the output. */
  @Override
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  /** No-op: processing instructions are not copied to the output. */
  @Override
  public void processingInstruction(String target, String data) throws SAXException {
  }

  /** No-op. */
  @Override
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers a namespace declaration so that the next start tag can carry it.
   *
   * @param prefix namespace prefix; empty (or {@code null}) denotes the default namespace
   * @param uri namespace URI, written into the declaration unchanged
   */
  @Override
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // A null prefix is treated like the default namespace instead of producing "xmlns:null".
    if (prefix == null || prefix.isEmpty()) {
      xmlnsString += "xmlns=\"" + uri + "\" ";
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  /** No-op. */
  @Override
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  /** No-op. */
  @Override
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Serializes a start tag, including pending namespace declarations and all attributes
   * (values passed through {@code StringUtils.forXML}). For word elements the
   * {@code lang} / {@code xml:lang} and {@code formRegularized} attributes are remembered.
   * The start tag of a word element nested inside another word element is not written.
   *
   * @param uri namespace URI (unused)
   * @param localName local name; decides whether this is a word element
   * @param name qualified name, used in the serialized tag
   * @param attrs attributes of the element
   */
  @Override
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    boolean isWordElem = WORD_ELEMENT_NAME.equals(localName);
    StringBuilder attrBuilder = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i = 0; i < attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtils.forXML(attrs.getValue(i));
      if (isWordElem) {
        if ("lang".equals(attrQName) || "xml:lang".equals(attrQName)) {
          language = attrValue;
        } else if ("formRegularized".equals(attrQName) && attrValue != null && ! attrValue.isEmpty()) {
          formRegularized = attrValue;
        }
      }
      // every attribute is preceded by exactly one blank, so no trimming is needed later
      attrBuilder.append(' ').append(attrQName).append("=\"").append(attrValue).append('"');
    }
    // pending namespace declarations are consumed by this element, written or not
    String pendingXmlns = xmlnsString.trim();
    xmlnsString = "";
    // start all elements but no word elements within word elements (level > 0)
    if (localName != null && (! isWordElem || wordLevelCounter == 0)) {
      StringBuilder tag = new StringBuilder();
      tag.append('<').append(name);
      if (! pendingXmlns.isEmpty()) {
        tag.append(' ').append(pendingXmlns);
      }
      tag.append(attrBuilder).append('>');
      write(tag.toString());
    }
    if (isWordElem) {
      wordLevelCounter++;
    }
  }

  /**
   * Serializes an end tag. Closing the outermost word element first emits the
   * {@code <orig>}, {@code <reg>} and {@code <norm>} children; end tags of nested word
   * elements are dropped.
   *
   * @param uri namespace URI (unused)
   * @param localName local name; decides whether this is a word element
   * @param name qualified name, used in the serialized tag
   * @throws SAXException wrapping an {@code ApplicationException} raised during normalization
   */
  @Override
  public void endElement(String uri, String localName, String name) throws SAXException {
    boolean isWordElem = WORD_ELEMENT_NAME.equals(localName);
    if (isWordElem) {
      wordLevelCounter--;
    }
    if (isWordElem && wordLevelCounter == 0) {
      // special handling of word elements (with level 0): insert orig, reg and norm elements
      closeOutermostWord(name);
    } else if (isWordElem && wordLevelCounter > 0) {
      // nothing: remove word elements within word elements (level > 0)
    } else {
      write("</" + name + ">");
    }
  }

  /** Writes the orig/reg/norm children and the end tag of an outermost word element, then resets the word state. */
  private void closeOutermostWord(String name) throws SAXException {
    String origContent = wordElemContent.toString();
    String regContent = buildRegularizedContent(origContent);
    String normContent;
    try {
      String[] norm = {"norm"};
      Normalizer normNormalizer = new Normalizer(norm, language);
      // regContent equals origContent whenever no formRegularized attribute was given
      normContent = normNormalizer.normalize(regContent);
    } catch (ApplicationException e) {
      throw new SAXException(e);
    }
    // write full word content (including lb etc.) into elements orig, reg and norm
    write("<orig>" + origContent + "</orig>");
    write("<reg>" + regContent + "</reg>");
    write("<norm>" + normContent + "</norm>");
    write("</" + name + ">");
    formRegularized = null;
    wordElemContent.setLength(0);
  }

  /**
   * Derives the regularized word content: the original content when no
   * {@code formRegularized} attribute is present, otherwise the attribute value with line
   * break markers turned into {@code <lb/>} elements ("- " becomes "-<lb/>" if the value
   * contains "- ", else every blank becomes "<lb/>").
   */
  private String buildRegularizedContent(String origContent) {
    if (formRegularized == null || formRegularized.isEmpty()) {
      return origContent;
    }
    // Literal replacement instead of split/join: split() drops trailing empty parts, which
    // lost a trailing separator and failed with an ArrayIndexOutOfBoundsException when the
    // value consisted of separators only (e.g. " " or "- ").
    if (formRegularized.contains("- ")) {
      return formRegularized.replace("- ", "-<lb/>");
    }
    return formRegularized.replace(" ", "<lb/>");
  }

  /** Appends to the word buffer while inside a word element, otherwise to the result. */
  private void write(String outStr) throws SAXException {
    if (wordLevelCounter > 0) {
      wordElemContent.append(outStr);
    } else {
      resultStrBuilder.append(outStr);
    }
  }

}