view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.IOException;
import java.io.Reader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;

/**
 * Runs a SAX parse over an XML document supplied as a {@link Reader} and
 * returns the XML fragment produced by an {@code XmlTokenizerContentHandler}.
 *
 * <p>Typical use: construct with the input, optionally adjust the settings
 * through the setters, then call {@link #tokenize()}. The setters store the
 * passed arrays by reference (no defensive copy), so later changes to those
 * arrays by the caller are visible to a subsequent {@link #tokenize()} call.
 *
 * <p>The supplied reader is handed to the parser as is; this class does not
 * explicitly close it.
 */
public class XmlTokenizer {
  private final Reader input;
  private String language = "eng";  // default: english
  private String[] normFunctions = {"specialNorm"};  // default: use special norm function
  private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"};  // non word breaking elements, default: these elements
  private String[] stopElements = {};  // default: no stop elements
  private String[] outputOptions = {};  // default: no output options

  /**
   * Creates a tokenizer for the given XML input.
   *
   * @param input character stream containing the XML document to tokenize
   */
  public XmlTokenizer(Reader input) {
    this.input = input;
  }

  /**
   * Sets the language used for tokenizing. The given value is translated via
   * {@code Language.getInstance().getLanguageId(lang)} and the result is stored.
   *
   * @param lang language name or code as understood by {@code Language#getLanguageId}
   */
  public void setLanguage(String lang) {
    // NOTE(review): whatever getLanguageId returns is stored unchecked - confirm
    // how it reports an unknown language.
    String languageId = Language.getInstance().getLanguageId(lang);
    this.language = languageId;
  }

  /**
   * Sets the names of the normalization functions passed to the content handler.
   *
   * @param normFunctions normalization function names (default: {@code "specialNorm"})
   */
  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = normFunctions;
  }

  /**
   * Sets the names of the non word breaking elements passed to the content handler.
   *
   * @param nwbElements element names that should not break a word
   */
  public void setNWBElements(String[] nwbElements) {
    this.nwbElements = nwbElements;
  }

  /**
   * Sets the names of the stop elements passed to the content handler.
   *
   * @param stopElements stop element names (default: none)
   */
  public void setStopElements(String[] stopElements) {
    this.stopElements = stopElements;
  }

  /**
   * Sets the output options passed to the content handler.
   *
   * @param outputOptions output option names (default: none)
   */
  public void setOutputOptions(String[] outputOptions) {
    this.outputOptions = outputOptions;
  }

  /**
   * Parses the input with a SAX parser driving an {@code XmlTokenizerContentHandler}
   * that is configured with the current settings of this tokenizer.
   *
   * @return the XML fragment reported by the content handler after parsing
   * @throws ApplicationException if the parser cannot be created, the input is
   *         not parseable XML, or reading the input fails; the underlying
   *         exception is passed to the {@code ApplicationException} constructor
   */
  public String tokenize() throws ApplicationException {
    try {
      XmlTokenizerContentHandler contentHandler = new XmlTokenizerContentHandler(normFunctions, language);
      contentHandler.setStopElements(stopElements);
      contentHandler.setNWBElements(nwbElements);
      contentHandler.setOutputOptions(outputOptions);
      XMLReader xmlParser = createXmlReader();
      xmlParser.setContentHandler(contentHandler);
      xmlParser.parse(new InputSource(input));
      return contentHandler.getXmlFragment();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Obtains a namespace aware SAX {@link XMLReader} through the standard JAXP
   * factory instead of instantiating a JDK-internal parser class, which is not
   * a supported API and is not accessible on current Java runtimes.
   *
   * @return a new, namespace aware XML reader
   * @throws SAXException if no parser matching the configuration can be created
   */
  private static XMLReader createXmlReader() throws SAXException {
    try {
      SAXParserFactory factory = SAXParserFactory.newInstance();
      // JAXP factories are namespace unaware by default, while a directly
      // constructed SAX2 parser processes namespaces; enable it explicitly so
      // that element names are reported to the handler the same way as before.
      factory.setNamespaceAware(true);
      // NOTE(review): DTDs and external entities stay enabled, as before. If the
      // input can come from untrusted sources, consider restricting them (XXE).
      return factory.newSAXParser().getXMLReader();
    } catch (ParserConfigurationException e) {
      // Reported as SAXException so that tokenize() handles it like any other
      // parser failure.
      throw new SAXException(e);
    }
  }

}