Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | e845310098ba |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,68 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.IOException; +import java.io.Reader; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; + +public class XmlTokenizer { + private Reader input; + private String language = "eng"; // default: english + private String[] normFunctions = {"specialNorm"}; // default: use special norm function + private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + + public XmlTokenizer(Reader input) { + this.input = input; + } + + public void setLanguage(String lang) { + String language = Language.getInstance().getLanguageId(lang); + this.language = language; + } + + public void setNormFunctions(String[] normFunctions) { + this.normFunctions = normFunctions; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String tokenize() throws ApplicationException { + String retString = null; + try { + XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); + dictContentHandler.setStopElements(stopElements); + dictContentHandler.setNWBElements(nwbElements); + dictContentHandler.setOutputOptions(outputOptions); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(dictContentHandler); + InputSource inputSource = new InputSource(input); + xmlParser.parse(inputSource); + retString = dictContentHandler.getXmlFragment(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return retString; + } + +} \ No newline at end of file