Mercurial > hg > mpdl-group

package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.util.ArrayList;

import org.apache.lucene.analysis.Token;
import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

/**
 * SAX content handler which rebuilds the parsed document as an XML string whose text content
 * has been run through the {@link MpdlNormalizer}.
 *
 * <p>While parsing, the SAX events are collected into a tree of {@link Element} nodes. In
 * {@link #endDocument()} the tree is serialized: for every element the text of its direct
 * children is concatenated, child elements are temporarily replaced by single mark characters,
 * the resulting string is tokenized and normalized word by word, and finally the marks are
 * replaced by the serialized child elements again.
 *
 * <p>In display mode (default) each word is replaced by its normalized form. In dictionary mode
 * (see {@link #setDictMode(boolean)}) each word for which the {@link LexHandler} delivers lex
 * entry keys is wrapped into a <code>&lt;w lang=".." form=".." lexForms=".."&gt;</code> element.
 *
 * <p>Comments and processing instructions are not reproduced in the output.
 */
public class NormDictContentHandler implements ContentHandler {
  /** Placeholder character standing for a word delimiting child element (e.g. {@code <var>}). */
  private static final String COMPLEX_ELEMENT_MARK = "\u2425";
  /** Placeholder character standing for a not word delimiting child element (e.g. {@code <lb>}). */
  private static final String COMPLEX_ELEMENT_NWD_MARK = "\u2424";
  private static final int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
  private static final int ELEMENT_TYPE_CHARACTERS = 1;
  private static final int ELEMENT_TYPE_COMPLEX = 2;
  private String[] normalizeFunctions = {};  // default: without normalize functions
  private boolean dictMode = false;  // default: not in dictionary mode
  private String xmlnsString = "";  // namespace declarations collected for the next start tag
  private String language;  // default language of the document/page
  private String outputXmlFragment = "";
  private Element rootElement;
  private Element currentElement;
  private ArrayList<Element> elementQueue;  // stack of the currently open elements

  /**
   * @param normalizeFunctions normalize functions handed to the {@link MpdlNormalizer};
   *        {@code null} is treated as "no functions"
   * @param language default language used for elements without own or inherited lang attribute
   * @throws ApplicationException declared for callers; not thrown by this constructor itself
   */
  public NormDictContentHandler(String[] normalizeFunctions, String language) throws ApplicationException {
    if (normalizeFunctions == null) {
      this.normalizeFunctions = new String[0];
    } else {
      this.normalizeFunctions = normalizeFunctions;
    }
    this.language = language;
  }

  /**
   * Switches between dictionary mode ({@code true}) and display mode ({@code false}).
   */
  public void setDictMode(boolean dictMode) {
    this.dictMode = dictMode;
  }

  /**
   * @return the XML string written by {@link #endDocument()}; empty before a parse has finished
   */
  public String getXmlFragment() {
    return outputXmlFragment;
  }

  /**
   * Resets all per-document state so that the handler can be used for more than one parse.
   * Without the reset the root element of the first document would be kept and written again.
   */
  @Override
  public void startDocument() throws SAXException {
    xmlnsString = "";
    outputXmlFragment = "";
    rootElement = null;
    currentElement = null;
    elementQueue = null;
  }

  /**
   * Serializes the collected element tree (including word normalization) into the output.
   *
   * @throws SAXException if no root element was seen, if normalization fails, or if a
   *         NullPointerException occurs during serialization
   */
  @Override
  public void endDocument() throws SAXException {
    if (rootElement == null)
      throw new SAXException("No root element found: nothing to write");
    try {
      String rootElemToStr = rootElement.toXmlString();
      // hack: in echo documents the spaces between sentences should be removed
      if (rootElemToStr.startsWith("<echo") && Language.getInstance().isChinese(language)) {
        rootElemToStr = rootElemToStr.replaceAll("</s>[ \n\t]+<s", "</s><s");
      }
      write(rootElemToStr);
      write("\n");
    } catch (NullPointerException e) {
      // kept for backward compatibility: callers get a SAXException instead of a raw NPE
      throw new SAXException(e);
    }
  }

  /**
   * Stores the reported text as a characters node below the currently open element.
   * Text outside of any element and empty chunks are ignored.
   */
  @Override
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length <= 0 || currentElement == null)
      return;
    String charactersStr = new String(c, start, length);
    Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
    charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
    currentElement.addComposite(charElement);
  }

  @Override
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  @Override
  public void processingInstruction(String target, String data) throws SAXException {
  }

  @Override
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Collects the namespace declaration; it is attached to the element of the next
   * {@link #startElement} call. All declarations of one element are kept, independent of the
   * order in which the default namespace and the prefixed namespaces are reported.
   */
  @Override
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (prefix == null || prefix.equals("")) {
      xmlnsString += "xmlns=\"" + uri + "\" ";  // default namespace
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  @Override
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  @Override
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Creates a new complex element, appends it to the currently open element (if any) and makes
   * it the current element. The first element of the document becomes the root element.
   */
  @Override
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    if (elementQueue == null)
      elementQueue = new ArrayList<Element>();
    Element newElement = new Element(name); // element of type: complex
    if (currentElement != null) {
      if (currentElement.lang != null)
        newElement.lang = currentElement.lang;  // language is inherited by the children
      currentElement.addComposite(newElement);
    }
    currentElement = newElement;
    StringBuilder attrString = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i=0; i<attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtilEscapeChars.forXML(attrs.getValue(i));
      attrString.append(" ").append(attrQName).append("=\"").append(attrValue).append("\"");
      // if xml:lang is set, it is set to the new element and overwrites the value inherited from the father
      if ("xml:lang".equalsIgnoreCase(attrQName) || "lang".equalsIgnoreCase(attrQName))
        currentElement.lang = attrValue;
    }
    currentElement.attrString = attrString.toString();
    if (! xmlnsString.equals("")) {
      currentElement.xmlnsString = xmlnsString;
    }
    xmlnsString = "";  // declarations are consumed by this element
    elementQueue.add(currentElement);
    // only the first element is the root element
    if (rootElement == null)
      rootElement = currentElement;
  }

  /**
   * Closes the current element: it is removed from the stack and its father (if any) becomes
   * the current element again.
   */
  @Override
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (elementQueue != null && ! elementQueue.isEmpty())
      elementQueue.remove(elementQueue.size() - 1);
    if (elementQueue != null && ! elementQueue.isEmpty()) {
      currentElement = elementQueue.get(elementQueue.size() - 1);
    } else {
      currentElement = null;
    }
  }

  private void write(String outStr) throws SAXException {
    outputXmlFragment += outStr;
  }

  /**
   * Node of the collected document tree: either a complex node (an XML element with attributes
   * and children) or a characters node (text, stored in {@code value}).
   */
  private class Element {
    private int type;
    private String name;
    private String xmlnsString;
    private String attrString;
    private String value;
    private String lang;  // normally value of attribute xml:lang or the inherited xml:lang value of the father node
    private ArrayList<Element> composites;

    private Element(String name) {
      this.type = ELEMENT_TYPE_COMPLEX;
      this.name = name;
    }

    private Element(String name, int type) {
      this.type = type;
      this.name = name;
    }

    /** Appends a child node; the child list is created on first use. */
    private void addComposite(Element child) {
      if (composites == null)
        composites = new ArrayList<Element>();
      composites.add(child);
    }

    private boolean isComplex() {
      return type == ELEMENT_TYPE_COMPLEX;
    }

    /**
     * feel free to add/remove some element names; element content must be empty
     * @return true if element is a word delimiter element else false
     */
    private boolean isWordDelimiterElement() {
      // "note" causes problems: word after the note is not recognized
      // "emph" causes problems: e.g. "Natur<emph>ereignis</emph> enthüllte" is replaced by "Natur<emph><w>ereignis</w></emph>enthüllte"
      boolean notWordDelimiting = name.equals("lb") || name.equals("br") || name.equals("cb") || name.equals("figure")
          || name.equals("image") || name.equals("handwritten") || name.equals("anchor");
      return ! notWordDelimiting;
    }

    /**
     * Serializes this node and (recursively) its children. The text of the direct children is
     * normalized as one string in which every child element is represented by one mark
     * character, so that words which are interrupted by elements like {@code <lb/>} are
     * handled as one word.
     *
     * @return the XML string of this node
     * @throws SAXException if the normalization fails
     */
    private String toXmlString() throws SAXException {
      if (! isComplex())
        return value == null ? "" : value;
      String elemLanguage = language;  // default value for the document/page
      if (lang != null)
        elemLanguage = lang;  // value of the element if available
      StringBuilder xml = new StringBuilder();
      // start tag
      xml.append("<").append(name);
      if (this.xmlnsString != null && ! this.xmlnsString.equals(""))
        xml.append(" ").append(this.xmlnsString);
      xml.append(attrString).append(">");
      if (composites != null) {
        // build the text of this element with one mark symbol per child element
        StringBuilder charsWithMarks = new StringBuilder();
        ArrayList<Element> complexElements = new ArrayList<Element>();
        for (Element composite : composites) {
          if (composite.isComplex()) {
            if (composite.isWordDelimiterElement()) {
              charsWithMarks.append(COMPLEX_ELEMENT_MARK);  // position of a "word delimiter element" (e.g. <var>)
            } else {
              charsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK);  // position of a "not word delimiter element" (e.g. <lb>)
            }
            complexElements.add(composite);
          } else if (composite.value != null && ! composite.value.equals("")) {
            String compositeValueStr = composite.value.replaceAll("\n", ""); // remove all newlines, they are no separators for words
            compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // many blanks/tabs are reduced to one blank
            charsWithMarks.append(compositeValueStr);
          }
        }
        // remove blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
        String charsWithMarksStr = charsWithMarks.toString().replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK);
        String normalized = normalizeWords(charsWithMarksStr, elemLanguage);
        // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values
        normalized = normalized.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK);
        int pos = 0;  // start of the not yet written part of the normalized text
        for (Element complexElem : complexElements) {
          int markIndex = normalized.indexOf(COMPLEX_ELEMENT_MARK, pos);
          if (markIndex >= 0) {
            xml.append(normalized, pos, markIndex);  // text before the element
            pos = markIndex + COMPLEX_ELEMENT_MARK_SIZE;  // skip the mark itself
          }
          // if the mark got lost during normalization the element is written at the current
          // position and no character of the text is consumed
          xml.append(complexElem.toXmlString());
        }
        xml.append(normalized.substring(pos)); // the rest after the last element must also be added
      }
      xml.append("</").append(name).append(">");
      return xml.toString();
    }

    /**
     * Tokenizes the given text and replaces every word token by its normalized form (display
     * mode) or its lex word (dictionary mode). The text between the tokens is kept.
     *
     * @param charactersStrDeresolved text with deresolved XML entities
     * @param language language used for normalizer and tokenizer
     * @return the text with replaced words; XML entities are deresolved again
     * @throws SAXException wraps an ApplicationException of the normalizer/tokenizer/lex handler
     */
    private String normalizeWords(String charactersStrDeresolved, String language) throws SAXException {
      String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved);
      StringBuilder retStr = new StringBuilder();
      try {
        MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
        if (dictMode) {
          mpdlNormalizer.setNormMode(MpdlNormalizer.DICTIONARY);
        } else {
          mpdlNormalizer.setNormMode(MpdlNormalizer.DISPLAY);
        }
        MpdlTokenizerAnalyzer tokenAnalyzer = new MpdlTokenizerAnalyzer(mpdlNormalizer, language);
        ArrayList<Token> wordTokens = tokenAnalyzer.getToken(charactersStr);
        int endPos = 0;  // end offset of the previous token
        for (int i=0; i < wordTokens.size(); i++) {
          Token wordToken = wordTokens.get(i);
          int startPos = wordToken.startOffset();
          String beforeStr = charactersStr.substring(endPos, startPos);
          endPos = wordToken.endOffset();
          String displayWordStr = charactersStr.substring(startPos, endPos);
          String normalizedWord;
          if (! dictMode) {
            normalizedWord = normalize(mpdlNormalizer, displayWordStr);  // normalizer in DISPLAY mode
            normalizedWord = StringUtilEscapeChars.deresolveXmlEntities(normalizedWord);
          } else {
            normalizedWord = getLexWord(mpdlNormalizer, displayWordStr); // normalizer in DICTIONARY mode
          }
          retStr.append(StringUtilEscapeChars.deresolveXmlEntities(beforeStr)).append(normalizedWord);
        }
        String lastAfterStr = charactersStr.substring(endPos);
        retStr.append(StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr));
      } catch (ApplicationException e) {
        throw new SAXException(e);
      }
      return retStr.toString();
    }

    /**
     * if word contains "not word delimiting symbol" (e.g. for line break) it is replaced
     * by a "Blank" so that the Lex normalizer could handle it. Other cases see below.
     * The Lex normalizer then e.g. gets "præbi- ta" and normalize it to "praebi- ta".
     *
     * <p>A leading mark is removed and every run of consecutive marks is reduced to one blank
     * before normalization. After normalization the leading mark is added again and the blanks
     * are replaced by the original runs of marks, so that the number of marks stays the same.
     *
     * @param mpdlNormalizer Lex normalizer
     * @param word
     * @return normalized word
     * @throws ApplicationException
     */
    private String normalize(MpdlNormalizer mpdlNormalizer, String word) throws ApplicationException {
      if (word.trim().isEmpty())
        return word;
      String cleanedWord = word;
      boolean startsWithNWDMark = cleanedWord.startsWith(COMPLEX_ELEMENT_NWD_MARK);
      if (startsWithNWDMark)
        cleanedWord = cleanedWord.substring(COMPLEX_ELEMENT_NWD_MARK.length());
      // replace every run of marks by one blank and remember the length of each run
      char nwdMark = COMPLEX_ELEMENT_NWD_MARK.charAt(0);
      ArrayList<Integer> markRunLengths = new ArrayList<Integer>();
      int countNWDMarks = 0;
      StringBuilder inputWord = new StringBuilder(cleanedWord.length());
      int pos = 0;
      while (pos < cleanedWord.length()) {
        char ch = cleanedWord.charAt(pos);
        if (ch == nwdMark) {
          int runStart = pos;
          while (pos < cleanedWord.length() && cleanedWord.charAt(pos) == nwdMark)
            pos++;
          int runLength = pos - runStart;
          markRunLengths.add(Integer.valueOf(runLength));
          countNWDMarks += runLength;
          inputWord.append(' ');
        } else {
          inputWord.append(ch);
          pos++;
        }
      }
      String normalizedWordStr = mpdlNormalizer.normalize(inputWord.toString());
      // the runs can only be restored by position if the normalizer kept the number of blanks
      int blankCount = 0;
      for (int i=0; i<normalizedWordStr.length(); i++) {
        if (normalizedWordStr.charAt(i) == ' ')
          blankCount++;
      }
      boolean restoreByPosition = blankCount == markRunLengths.size();
      StringBuilder restored = new StringBuilder(normalizedWordStr.length() + countNWDMarks + 1);
      if (startsWithNWDMark)
        restored.append(COMPLEX_ELEMENT_NWD_MARK);
      int runIndex = 0;
      for (int i=0; i<normalizedWordStr.length(); i++) {
        char ch = normalizedWordStr.charAt(i);
        if (ch != ' ') {
          restored.append(ch);
          continue;
        }
        int marksToWrite;
        if (restoreByPosition) {
          marksToWrite = markRunLengths.get(runIndex).intValue();
          runIndex++;
        } else if (countNWDMarks > 1) {
          marksToWrite = countNWDMarks;  // fallback: former behaviour, every blank gets all marks
        } else {
          marksToWrite = 1;
        }
        for (int j=0; j<marksToWrite; j++)
          restored.append(COMPLEX_ELEMENT_NWD_MARK);
      }
      return restored.toString();
    }

    /**
     * Delivers the word as it is written in dictionary mode.
     *
     * @param mpdlNormalizer normalizer (used for normalization of the word form and for the language)
     * @param displayWord word as it appears in the text
     * @return a "w" element with the attributes lang, form and lexForms around the word if the
     *         lex handler delivers at least one lex entry key, else the word itself (XML
     *         entities deresolved in both cases)
     * @throws ApplicationException
     */
    private String getLexWord(MpdlNormalizer mpdlNormalizer, String displayWord) throws ApplicationException {
      String wordForm = removeSpecialSymbols(displayWord);
      wordForm = wordForm.toLowerCase();
      wordForm = normalize(mpdlNormalizer, wordForm);
      // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form)
      LexHandler lexHandler = LexHandler.getInstance();
      String lang = mpdlNormalizer.getLanguage();
      ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordForm, lang, false);
      String displayWordDeresolved = StringUtilEscapeChars.deresolveXmlEntities(displayWord);
      // an empty key list is handled like "no keys" (formerly it caused a StringIndexOutOfBoundsException)
      if (lexEntryKeys == null || lexEntryKeys.isEmpty())
        return displayWordDeresolved;
      StringBuilder lexForms = new StringBuilder();
      for (int j=0; j<lexEntryKeys.size(); j++) {
        if (j > 0)
          lexForms.append(" ");
        lexForms.append(lexEntryKeys.get(j));
      }
      return "<w lang=\"" + lang + "\"" + " form=\"" + wordForm + "\"" + " lexForms=\"" + lexForms + "\">" + displayWordDeresolved + "</w>";
    }

    /** Removes blanks, newlines, tabs, hyphens and both mark symbols from the given string. */
    private String removeSpecialSymbols(String inputStr) {
      String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", "");
      return retStr;
    }

  }

}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 27 Sep 2011 16:40:57 +0200
parents	5df60f24e997
children