view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.util.ArrayList;

import org.apache.lucene.analysis.Token;
import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

/**
 * SAX content handler that rebuilds the parsed document as an XML string in
 * which the words of the text content are wrapped into {@code <w>} elements
 * carrying the language, the token form and the lexicon entry keys found for
 * that word.
 *
 * <p>During parsing the handler only builds an in-memory tree of
 * {@link Element} nodes. The output string is produced in
 * {@link #endDocument()} and can be fetched with {@link #getXmlFragment()}.
 *
 * <p>The handler keeps mutable parse state in its fields and is written for a
 * single parse per instance.
 */
public class DictionarizerContentHandler implements ContentHandler {
  /**
   * Placeholder inserted into the concatenated text of an element at every
   * position where a child element occurs. Text that itself contains this
   * literal would be mistaken for a child position.
   */
  private static final String MARK = "COMPLEXELEMENTTTTT";
  private static final int MARK_SIZE = MARK.length();
  /** Node type of a text chunk reported by {@link #characters}. */
  private static final int ELEMENT_TYPE_CHARACTERS = 1;
  /** Node type of a real XML element reported by {@link #startElement}. */
  private static final int ELEMENT_TYPE_COMPLEX = 2;

  /** Namespace declarations collected since the last start tag; attached to the next element. */
  private String xmlnsString = "";
  /** Default language used for elements without an own or inherited lang value. */
  private String language;
  private StringBuilder outputXmlFragment = new StringBuilder();
  /** First element that was started; serialization begins here. */
  private Element rootElement;
  /** Innermost element that is currently open, or null outside of any element. */
  private Element currentElement;
  /** Stack of the currently open elements; the last entry is the innermost one. */
  private ArrayList<Element> elementQueue = new ArrayList<Element>();

  /**
   * Creates a handler for a document in the given default language.
   *
   * @param language default language, used for all text whose elements carry
   *        no (inherited) {@code xml:lang}/{@code lang} attribute
   * @throws ApplicationException declared for callers; not thrown by this constructor itself
   */
  public DictionarizerContentHandler(String language) throws ApplicationException {
    this.language = language;
  }

  /**
   * Returns everything written so far; this is empty until
   * {@link #endDocument()} has run.
   *
   * @return the generated XML string
   */
  public String getXmlFragment() {
    return outputXmlFragment.toString();
  }

  public void startDocument() throws SAXException {
  }

  /**
   * Serializes the collected element tree, followed by a newline, into the
   * output. Writes nothing if no element was ever started.
   */
  public void endDocument() throws SAXException {
    if (rootElement == null) {
      return;  // nothing was parsed; avoids a NullPointerException
    }
    write(rootElement.toXmlString());
    write("\n");
  }

  /**
   * Stores the reported text chunk as a character node below the currently
   * open element. Empty chunks and text outside of any element are dropped.
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length == 0 || currentElement == null) {
      return;
    }
    String charactersStr = new String(c, start, length);
    Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
    charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr);
    if (currentElement.composites == null) {
      currentElement.composites = new ArrayList<Element>();
    }
    currentElement.composites.add(charElement);
  }

  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  public void processingInstruction(String target, String data) throws SAXException {
  }

  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers a namespace declaration so that it is written on the next start
   * tag. SAX reports the default namespace with an empty prefix; that case is
   * written as {@code xmlns="uri"} because {@code xmlns:="uri"} is not
   * well-formed.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (prefix == null || prefix.length() == 0) {
      xmlnsString += "xmlns=\"" + uri + "\" ";
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  public void endPrefixMapping(String prefix) throws SAXException {
  }

  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Creates a node for the started element, links it to its parent and makes
   * it the current element. The element's attributes are stored as a ready to
   * print string; pending namespace declarations are attached to the node.
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    Element newElement = new Element(name); // element of type: complex
    if (currentElement != null) {
      if (currentElement.composites == null) {
        currentElement.composites = new ArrayList<Element>();
      }
      if (currentElement.lang != null) {
        newElement.lang = currentElement.lang;  // the language is inherited by the children
      }
      currentElement.composites.add(newElement);
    }
    currentElement = newElement;
    StringBuilder attrString = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i = 0; i < attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtilEscapeChars.forXML(attrs.getValue(i));
      attrString.append(' ').append(attrQName).append("=\"").append(attrValue).append('"');
      if (attrQName != null && (attrQName.equalsIgnoreCase("xml:lang") || attrQName.equalsIgnoreCase("lang"))) {
        // an own language attribute overrides the value inherited from the parent
        currentElement.lang = attrValue;
      }
    }
    currentElement.attrString = attrString.toString();
    if (! xmlnsString.equals("")) {
      currentElement.xmlnsString = xmlnsString;
    }
    xmlnsString = "";
    elementQueue.add(currentElement);
    // only the first element is the root element
    if (rootElement == null) {
      rootElement = currentElement;
    }
  }

  /**
   * Closes the innermost open element and makes its parent (if any) the
   * current element again.
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (! elementQueue.isEmpty()) {
      elementQueue.remove(elementQueue.size() - 1);
    }
    if (elementQueue.isEmpty()) {
      currentElement = null;
    } else {
      currentElement = elementQueue.get(elementQueue.size() - 1);
    }
  }

  /**
   * Locates, in a string that mixes text and tags, the position that
   * corresponds to a given number of text characters. Everything from a
   * {@code '<'} up to and including the following {@code '>'} counts as tag
   * and is not counted.
   *
   * @param compositesCharsDictionarized text with embedded tags
   * @param indexComplexElemCompositesCharsWithMarks number of text characters
   *        that have to be passed (1 = first text character)
   * @return the exclusive end offset of the matching text character plus one
   *         (callers subtract one to get a {@code substring} boundary), or -1
   *         if the requested count is 0 or negative, the string is empty, or
   *         the string holds fewer text characters than requested
   */
  public int getCharIndex(String compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) {
    if (indexComplexElemCompositesCharsWithMarks <= 0) {
      return -1;
    }
    int size = compositesCharsDictionarized.length();
    int index = 0;       // number of text (non tag) characters seen so far
    int counter = 0;     // offset of the character inspected next
    boolean isInTag = false;
    // bounded by size: running past the end reports "not found" instead of
    // failing in charAt
    while (counter < size) {
      char c = compositesCharsDictionarized.charAt(counter);
      if (c == '<') {
        isInTag = true;
      }
      if (! isInTag) {
        index++;
      }
      counter++;
      if (index == indexComplexElemCompositesCharsWithMarks) {
        return counter + 1;
      }
      if (c == '>') {
        isInTag = false;
      }
    }
    return -1;
  }

  private void write(String outStr) throws SAXException {
    outputXmlFragment.append(outStr);
  }

  /**
   * Node of the in-memory document tree: either a real XML element
   * ("complex") with attributes and children, or a chunk of character data.
   */
  private class Element {
    private int type;
    private String name;
    private String xmlnsString;
    private String attrString;
    /** Text of a character node; unused for complex elements. */
    private String value;
    /** Value of the xml:lang/lang attribute, or the value inherited from the parent node. */
    private String lang;
    /** Children in document order (character nodes and complex elements). */
    private ArrayList<Element> composites;

    private Element(String name) {
      this.type = ELEMENT_TYPE_COMPLEX;
      this.name = name;
    }

    private Element(String name, int type) {
      this.type = type;
      this.name = name;
    }

    private boolean isComplex() {
      return type == ELEMENT_TYPE_COMPLEX;
    }

    /**
     * Serializes this node and its subtree.
     *
     * <p>For a complex element the text of all direct character children is
     * concatenated and dictionarized in one go, so that the tokenizer sees the
     * complete text of the element. The child elements are then re-inserted at
     * the text positions where they occurred; those positions are tracked by
     * a second copy of the text in which every child is represented by
     * {@link #MARK}.
     *
     * @return the XML string of this subtree
     * @throws SAXException if dictionarizing the text fails
     */
    private String toXmlString() throws SAXException {
      StringBuilder retString = new StringBuilder();
      if (! isComplex()) {
        retString.append(value);
        return retString.toString();
      }
      // language of this element if present, else the default of the document/page
      String elemLanguage = (lang != null) ? lang : language;
      retString.append('<').append(name);
      if (xmlnsString != null && ! xmlnsString.equals("")) {
        retString.append(' ').append(xmlnsString);
      }
      retString.append(attrString).append('>');
      if (composites != null) {
        StringBuilder compositesChars = new StringBuilder();
        StringBuilder compositesCharsWithMarks = new StringBuilder();
        ArrayList<Element> complexElements = new ArrayList<Element>();
        for (int i = 0; i < composites.size(); i++) {
          Element composite = composites.get(i);
          if (composite.isComplex()) {
            complexElements.add(composite);
            compositesCharsWithMarks.append(MARK);
          } else if (composite.value != null && ! composite.value.equals("")) {
            compositesChars.append(composite.value);
            compositesCharsWithMarks.append(composite.value);
          }
        }
        // both strings are consumed from the left, one child element at a time
        String remainingDictionarized = characters2DictWords(compositesChars.toString(), elemLanguage);
        String remainingWithMarks = compositesCharsWithMarks.toString();
        for (int i = 0; i < complexElements.size(); i++) {
          int markIndex = remainingWithMarks.indexOf(MARK);
          int splitIndex = getCharIndex(remainingDictionarized, markIndex);
          if (splitIndex > 0) {
            // text (including word tags) that precedes this child element
            retString.append(remainingDictionarized.substring(0, splitIndex - 1));
            remainingDictionarized = remainingDictionarized.substring(splitIndex - 1);
          }
          retString.append(complexElements.get(i).toXmlString());
          remainingWithMarks = remainingWithMarks.substring(markIndex + MARK_SIZE);
        }
        retString.append(remainingDictionarized); // text after the last child must also be added
      }
      retString.append("</").append(name).append('>');
      return retString.toString();
    }

    /**
     * Wraps every token of the given text for which lexicon entry keys exist
     * into a {@code <w lang=".." form=".." lexForms="..">} element. Text
     * between tokens and tokens without lexicon entries are copied through.
     *
     * @param charactersStrDeresolved the text as stored in the character
     *        nodes (result of {@code deresolveXmlEntities})
     * @param language language passed to the tokenizer and the lexicon lookup
     * @return the text with word tags inserted
     * @throws SAXException wrapping any {@link ApplicationException} raised
     *         by the tokenizer or the lexicon lookup
     */
    private String characters2DictWords(String charactersStrDeresolved, String language) throws SAXException {
      // the token offsets below are used as indices into this resolved form
      String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved);
      StringBuilder retStr = new StringBuilder();
      try {
        MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language);
        ArrayList<Token> wordTokens = dictionarizerAnalyzer.getToken(charactersStr);
        int endPos = 0;
        for (int i = 0; i < wordTokens.size(); i++) {
          Token wordToken = wordTokens.get(i);
          int startPos = wordToken.startOffset();
          String beforeStr = charactersStr.substring(endPos, startPos);
          endPos = wordToken.endOffset();
          String wordStr = charactersStr.substring(startPos, endPos);
          String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr);
          String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr);
          String wordTokenText = wordToken.termText();
          LexHandler lexHandler = LexHandler.getInstance();
          // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form)
          ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false);
          retStr.append(beforeStrDeresolved);
          // an empty key list is treated like "no entries": the word stays untagged
          if (lexEntryKeys != null && ! lexEntryKeys.isEmpty()) {
            StringBuilder lexForms = new StringBuilder();
            for (int j = 0; j < lexEntryKeys.size(); j++) {
              if (j > 0) {
                lexForms.append(' ');
              }
              lexForms.append(lexEntryKeys.get(j));
            }
            // NOTE(review): form and lexForms are written unescaped - confirm
            // that token texts and lexicon keys cannot contain '"', '<' or '&'
            retStr.append("<w lang=\"").append(language).append("\"")
                  .append(" form=\"").append(wordTokenText).append("\"")
                  .append(" lexForms=\"").append(lexForms).append("\">")
                  .append(wordStrDeresolved).append("</w>");
          } else {
            retStr.append(wordStrDeresolved);
          }
        }
        String lastAfterStr = charactersStr.substring(endPos);
        retStr.append(StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr));
      } catch (ApplicationException e) {
        throw new SAXException(e);
      }
      return retStr.toString();
    }
  }
}