view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 0:408254cf2f1d

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 24 Nov 2010 17:24:23 +0100
parents
children 2396a569e446
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.doc;

import java.util.ArrayList;

import org.apache.lucene.analysis.Token;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer;
import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars;

/**
 * SAX content handler that re-serializes the incoming event stream as an XML
 * fragment while passing every piece of character data through the MPDL
 * normalizer.
 *
 * <p>Element tags, attributes and namespace declarations are written back out;
 * text content is tokenized with {@link MpdlTokenizerAnalyzer}, each word token is
 * normalized with {@link MpdlNormalizer}, and the text between tokens is kept.
 * Ignorable whitespace, processing instructions and skipped entities are dropped.
 * The accumulated result is available through {@link #getXmlFragment()}.
 *
 * <p>NOTE(review): the {@code lang} attribute of an element is recorded on the
 * current element, but tokenizing and normalizing always use the language passed
 * to the constructor - confirm whether per-element language was intended.
 */
public class NormalizeCharsContentHandler implements ContentHandler {
  /** Namespace declarations collected since the last start tag was written. */
  private String xmlnsString = "";
  private String[] normalizeFunctions = {};  // default: without normalize functions
  private String language;
  /** Serialized output; a builder avoids quadratic copying on large documents. */
  private StringBuilder outputXmlFragment = new StringBuilder();
  /** Element whose start tag was seen last and that has not been closed since; may be null. */
  private Element currentElement;
  
  /**
   * Creates a handler.
   *
   * @param normalizeFunctions names of the normalize functions handed to {@link MpdlNormalizer}
   * @param language language handed to the tokenizer and the normalizer
   * @throws ApplicationException declared for callers; not thrown by this constructor itself
   */
  public NormalizeCharsContentHandler(String[] normalizeFunctions, String language) throws ApplicationException {
    this.normalizeFunctions = normalizeFunctions;
    this.language = language;
  }

  /**
   * Returns the XML text written so far.
   *
   * @return the serialized fragment, empty if no events were received yet
   */
  public String getXmlFragment() {
    return outputXmlFragment.toString();
  }
  
  public void startDocument() throws SAXException {
  }

  public void endDocument() throws SAXException {
  }
  
  /**
   * Normalizes the reported character data and appends it to the output.
   * Empty chunks are ignored.
   *
   * @param c buffer holding the characters
   * @param start index of the first character in {@code c}
   * @param length number of characters to read
   * @throws SAXException if tokenizing or normalizing fails
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length == 0) {
      return;
    }
    String charactersStr = normalize(new String(c, start, length));
    if (currentElement != null) {
      // holds only the most recent chunk; a parser may split one text node into several calls
      currentElement.value = charactersStr;
    }
    write(charactersStr);
  }

  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  public void processingInstruction(String target, String data) throws SAXException {
  }

  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers a namespace declaration so that the next start tag can carry it.
   *
   * @param prefix namespace prefix; SAX reports the default namespace as an empty string
   * @param uri namespace URI the prefix is bound to
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // The default namespace must be written as xmlns="..."; "xmlns:=" is not well-formed.
    String declarationName = (prefix == null || prefix.length() == 0) ? "xmlns" : "xmlns:" + prefix;
    // The URI ends up inside an attribute value, so escape it like any other attribute value.
    xmlnsString += declarationName + "=\"" + StringUtilEscapeChars.forXML(uri) + "\" ";
  }
  
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Writes the start tag of an element, including pending namespace declarations
   * and all attributes (values escaped via {@code StringUtilEscapeChars.forXML}).
   *
   * @param uri namespace URI (unused)
   * @param localName local name (unused)
   * @param name qualified name, used as the tag name
   * @param attrs attributes of the element
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    currentElement = new Element(language, name);
    int attrSize = attrs.getLength();
    StringBuilder attrBuffer = new StringBuilder();
    for (int i=0; i<attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtilEscapeChars.forXML(attrs.getValue(i));
      attrBuffer.append(" ").append(attrQName).append("=\"").append(attrValue).append("\"");
      if (attrQName != null && attrQName.equals("lang") && attrValue != null) {
        currentElement.language = attrValue;
      }
    }
    String attrString = attrBuffer.toString();
    currentElement.attrString = attrString;
    if (xmlnsString.equals("")) {
      write("<" + name + attrString + ">");
    } else { 
      currentElement.xmlnsString = xmlnsString;
      write("<" + name + " " + xmlnsString + attrString + ">");
    }
    // declarations belong to this element only; start collecting afresh
    xmlnsString = "";
  }

  /**
   * Writes the end tag of an element and clears the current element.
   *
   * @param uri namespace URI (unused)
   * @param localName local name (unused)
   * @param name qualified name, used as the tag name
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    currentElement = null;
    write("</" + name + ">");
  }

  /** Appends the given text to the output fragment. */
  private void write(String outStr) throws SAXException {
    outputXmlFragment.append(outStr);
  }

  /**
   * Normalizes every word token of the given text and leaves the text between
   * tokens in place. All pieces are passed through
   * {@code StringUtilEscapeChars.deresolveXmlEntities} before being joined
   * (presumably re-escaping XML special characters - verify against that helper).
   *
   * @param charactersStr raw character data as delivered by the parser
   * @return the text with normalized words
   * @throws SAXException wrapping any {@link ApplicationException} raised by the tokenizer or normalizer
   */
  private String normalize(String charactersStr) throws SAXException {
    StringBuilder result = new StringBuilder();
    try {
      MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language);
      ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr);
      // constructor arguments do not change per word, so build the normalizer once
      // NOTE(review): assumes MpdlNormalizer keeps no per-word state - confirm
      MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language);
      int endPos = 0;  // offset just behind the previously handled token
      for (int i=0; i < wordTokens.size(); i++) {
        Token wordToken = wordTokens.get(i);
        int startPos = wordToken.startOffset();
        // text between the previous token and this one (whitespace, punctuation, ...)
        String beforeStr = charactersStr.substring(endPos, startPos);
        endPos = wordToken.endOffset();
        String wordStr = charactersStr.substring(startPos, endPos);
        String normalizedWordStr = mpdlNormalizer.normalize(wordStr);
        result.append(StringUtilEscapeChars.deresolveXmlEntities(beforeStr));
        result.append(StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr));
      }
      // remainder behind the last token (or the whole text if there were no tokens)
      String lastAfterStr = charactersStr.substring(endPos);
      result.append(StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr));
    } catch (ApplicationException e) {
      throw new SAXException(e);
    }
    return result.toString();
  }  
  
  /**
   * Bookkeeping record for the element currently being processed. The fields are
   * filled in while handling events; this class itself does not read them back.
   */
  private static class Element {
    String name;
    String language;
    String xmlnsString;
    String attrString;
    String value;
    
    Element(String language, String name) {
      this.language = language;
      this.name = name;
    }
    
  }
  
}