view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 4a3641ae14d2
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element;

/**
 * Tokenizes an XML document read from a {@link Reader}.
 *
 * <p>Usage: construct with the input, optionally configure via the setters,
 * call {@link #tokenize()}, then read the results through the getters. The
 * underlying content handler is only created in {@link #tokenize()}, so every
 * result getter fails with a {@link NullPointerException} when it is called
 * before {@link #tokenize()}.
 */
public class XmlTokenizer {
  // created by tokenize(); all result getters delegate to it
  private XmlTokenizerContentHandler xmlTokenizerContentHandler;
  private Reader input;
  private String docId;
  private String language = "eng";  // default: english
  private String[] normFunctions = {"specialNorm"};  // default: use special norm function
  private String[] nwbElements = {"lb", "br", "cb", "hi"};  // non word breaking elements, default: these elements
  // stop elements: their tokens should not get word tags (output format "xml")
  // or should be removed (output format "string")
  private String[] stopElements = {};
  // element names whose tokens getStringResult() is restricted to; empty: all tokens
  private String[] elements = {};
  private String[] highlightTerms = {};  // highlight terms, default: no highlight terms
  private String outputFormat = "xml";  // default: xml
  private String[] outputOptions = {};

  /**
   * Creates a tokenizer for the given XML input.
   *
   * @param input reader delivering the XML document; it is consumed by {@link #tokenize()}
   */
  public XmlTokenizer(Reader input) {
    this.input = input;
  }

  /**
   * Sets the document identifier that is handed to the content handler.
   *
   * @param docId document identifier
   */
  public void setDocIdentifier(String docId) {
    this.docId = docId;
  }

  /**
   * Sets the language, stored as the id that
   * {@code Language.getInstance().getLanguageId(lang)} returns for it.
   *
   * @param lang language name or code to resolve
   */
  public void setLanguage(String lang) {
    this.language = Language.getInstance().getLanguageId(lang);
  }

  /**
   * Sets the normalization functions. They are handed to the content handler;
   * in addition {@link #getStringResult()} reacts to the entries {@code "norm"}
   * and {@code "reg"}.
   *
   * @param normFunctions names of the normalization functions
   */
  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = normFunctions;
  }

  /**
   * Sets the non word breaking elements.
   *
   * @param nwbElements element names
   */
  public void setNWBElements(String[] nwbElements) {
    this.nwbElements = nwbElements;
  }

  /**
   * Sets the stop elements.
   *
   * @param stopElements element names
   */
  public void setStopElements(String[] stopElements) {
    this.stopElements = stopElements;
  }

  /**
   * Sets the elements to which {@link #getStringResult()} restricts its output.
   * Each entry is passed to {@link #getElements(String)} and may therefore hold
   * several element names separated by single blanks.
   *
   * @param elements element names; {@code null} or empty means all tokens
   */
  public void setElements(String[] elements) {
    this.elements = elements;
  }

  /**
   * Sets the output format that is handed to the content handler.
   *
   * @param outputFormat output format, default is {@code "xml"}
   */
  public void setOutputFormat(String outputFormat) {
    this.outputFormat = outputFormat;
  }

  /**
   * Sets the output options. They are handed to the content handler; in
   * addition the option {@code "withLemmas"} switches
   * {@link #getStringResult()} to the morphological token content.
   *
   * @param outputOptions output options
   */
  public void setOutputOptions(String[] outputOptions) {
    this.outputOptions = outputOptions;
  }

  /**
   * Sets the terms to highlight.
   *
   * @param highlightTerms highlight terms
   */
  public void setHighlightTerms(String[] highlightTerms) {
    this.highlightTerms = highlightTerms;
  }

  /**
   * Parses the input with a freshly configured content handler. The handler
   * (and with it all results) is replaced on every call.
   *
   * @throws ApplicationException wrapping any {@link SAXException} or
   *         {@link IOException} raised while parsing
   */
  public void tokenize() throws ApplicationException {
    try {
      xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language);
      xmlTokenizerContentHandler.setDocIdentifier(docId);
      xmlTokenizerContentHandler.setStopElements(stopElements);
      xmlTokenizerContentHandler.setNWBElements(nwbElements);
      xmlTokenizerContentHandler.setHighlightTerms(highlightTerms);
      xmlTokenizerContentHandler.setNormFunctions(normFunctions);
      xmlTokenizerContentHandler.setOutputOptions(outputOptions);
      xmlTokenizerContentHandler.setOutputFormat(outputFormat);
      // NOTE(review): SAXParser is the JDK-internal Xerces class (see the
      // com.sun...internal import); consider obtaining the reader through the
      // public JAXP API - confirm that the parser features stay equivalent.
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(xmlTokenizerContentHandler);
      xmlParser.parse(new InputSource(input));
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Returns the result string built by the content handler.
   *
   * @return the handler's result string
   * @throws ApplicationException declared for callers; see the content handler
   */
  public String getXmlResult() throws ApplicationException {
    return xmlTokenizerContentHandler.getResultString();
  }

  /**
   * Returns all tokens collected by the content handler.
   *
   * @return the handler's token list (not copied)
   */
  public ArrayList<Token> getResultTokens() {
    return xmlTokenizerContentHandler.getResultTokens();
  }

  /**
   * Returns the page count reported by the content handler.
   *
   * @return page count
   */
  public int getPageCount() {
    return xmlTokenizerContentHandler.getPageCount();
  }

  /**
   * Collects the elements recorded by the content handler for the given names.
   *
   * @param elementNamesStr element names separated by single blanks;
   *        {@code null} yields an empty list
   * @return a new list with the elements of all names, sorted by their natural
   *         ordering; names unknown to the handler contribute nothing
   */
  public ArrayList<Element> getElements(String elementNamesStr) {
    ArrayList<Element> retElements = new ArrayList<Element>();
    if (elementNamesStr == null) {
      return retElements;
    }
    for (String elementName : elementNamesStr.split(" ")) {
      ArrayList<Element> found = xmlTokenizerContentHandler.getElements(elementName);
      if (found != null) {
        retElements.addAll(found);
      }
    }
    // Sort once after everything is collected: Collections.sort is stable, so
    // this yields the same order as re-sorting after every single name.
    Collections.sort(retElements);
    return retElements;
  }

  /**
   * Builds a plain string from the tokens, each token text followed by one
   * blank (so a non-empty result ends with a blank).
   *
   * <p>Tokens are taken from the configured elements if any are set, otherwise
   * from all result tokens. The text taken from each token is:
   * <ul>
   *   <li>the morphological content if the output option {@code "withLemmas"} is set,</li>
   *   <li>else the normalized content if the norm function {@code "norm"} is set,</li>
   *   <li>else the regularized content (falling back to the original content)
   *       if the norm function {@code "reg"} is set,</li>
   *   <li>else the original content.</li>
   * </ul>
   * Tokens whose selected text is {@code null} are skipped.
   *
   * @return the concatenated token texts
   * @throws ApplicationException declared for callers; not raised by this implementation itself
   */
  public String getStringResult() throws ApplicationException {
    ArrayList<Token> resultTokens;
    if (elements != null && elements.length > 0) {
      resultTokens = new ArrayList<Token>();
      for (String elemName : elements) {
        for (Element elem : getElements(elemName)) {
          resultTokens.addAll(elem.getTokens());
        }
      }
    } else {
      resultTokens = xmlTokenizerContentHandler.getResultTokens();  // all tokens
    }
    StringBuilder result = new StringBuilder();
    if (resultTokens != null) {
      // the option lookups do not depend on the token: evaluate them once
      boolean lemmas = withLemmas();
      boolean norm = useNormFunction();
      boolean reg = useRegFunction();
      for (Token token : resultTokens) {
        String text = selectTokenText(token, lemmas, norm, reg);
        if (text != null) {
          result.append(text).append(' ');
        }
      }
    }
    return result.toString();
  }

  /** Picks the token text for getStringResult(); may return null when the selected content is missing. */
  private static String selectTokenText(Token token, boolean lemmas, boolean norm, boolean reg) {
    if (lemmas) {
      return token.getContentMorph();
    }
    if (norm) {
      return token.getContentNorm();
    }
    if (reg) {
      String contentReg = token.getContentReg();
      if (contentReg != null) {
        return contentReg;
      }
      // no regularized form: fall through to the original content
    }
    return token.getContentOrig();
  }

  /** Tells whether the output option "withLemmas" is set. */
  private boolean withLemmas() {
    return containsValue(outputOptions, "withLemmas");
  }

  /** Tells whether the norm function "norm" is set. */
  private boolean useNormFunction() {
    return containsValue(normFunctions, "norm");
  }

  /** Tells whether the norm function "reg" is set. */
  private boolean useRegFunction() {
    return containsValue(normFunctions, "reg");
  }

  /** Null-safe search for wanted in values; a null array or null entries never match. */
  private static boolean containsValue(String[] values, String wanted) {
    if (values == null) {
      return false;
    }
    for (String value : values) {
      if (wanted.equals(value)) {
        return true;
      }
    }
    return false;
  }

}