Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | e845310098ba |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; import java.io.IOException; import java.io.Reader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; public class XmlTokenizer { private Reader input; private String language = "eng"; // default: english private String[] normFunctions = {"specialNorm"}; // default: use special norm function private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements private String[] stopElements = {}; // default: no stop elements private String[] outputOptions = {}; public XmlTokenizer(Reader input) { this.input = input; } public void setLanguage(String lang) { String language = Language.getInstance().getLanguageId(lang); this.language = language; } public void setNormFunctions(String[] normFunctions) { this.normFunctions = normFunctions; } public void setNWBElements(String[] nwbElements) { this.nwbElements = nwbElements; } public void setStopElements(String[] stopElements) { this.stopElements = stopElements; } public void setOutputOptions(String[] outputOptions) { this.outputOptions = outputOptions; } public String tokenize() throws ApplicationException { String retString = null; try { XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); dictContentHandler.setStopElements(stopElements); dictContentHandler.setNWBElements(nwbElements); dictContentHandler.setOutputOptions(outputOptions); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(dictContentHandler); InputSource inputSource = new InputSource(input); xmlParser.parse(inputSource); retString = dictContentHandler.getXmlFragment(); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } return retString; } }