diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java	Wed Nov 09 15:32:05 2011 +0100
@@ -0,0 +1,68 @@
+package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
+
+/**
+ * Runs a SAX parse over an XML document and returns the XML fragment that an
+ * {@link XmlTokenizerContentHandler} builds from it.
+ *
+ * <p>Typical use: construct with a {@link Reader}, optionally adjust the
+ * settings through the setters, then call {@link #tokenize()}. The reader is
+ * handed to the parser as is; this class does not explicitly close it.
+ * NOTE(review): the reader is consumed by the parse, so calling
+ * {@link #tokenize()} a second time on the same instance presumably fails or
+ * yields nothing useful; confirm before relying on it.
+ */
+public class XmlTokenizer {
+  /** XML source that is handed to the SAX parser. */
+  private Reader input;
+  private String language = "eng";  // default: english
+  private String[] normFunctions = {"specialNorm"};  // default: use special norm function
+  private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"};  // non word breaking elements, default: these elements
+  private String[] stopElements = {};  // default: no stop elements
+  private String[] outputOptions = {};  // default: no output options
+
+  /**
+   * Creates a tokenizer for the given XML source with the default settings.
+   *
+   * @param input reader delivering the XML document to tokenize
+   */
+  public XmlTokenizer(Reader input) {
+    this.input = input;
+  }
+
+  /**
+   * Sets the language passed to the content handler. The given value is first
+   * mapped through {@code Language.getInstance().getLanguageId(lang)} and the
+   * result of that lookup is what gets stored.
+   *
+   * @param lang language name or code as accepted by {@code Language#getLanguageId}
+   */
+  public void setLanguage(String lang) {
+    this.language = Language.getInstance().getLanguageId(lang);
+  }
+
+  /**
+   * Replaces the names of the normalization functions passed to the content handler.
+   *
+   * @param normFunctions normalization function names; stored by reference, not copied
+   */
+  public void setNormFunctions(String[] normFunctions) {
+    this.normFunctions = normFunctions;
+  }
+
+  /**
+   * Replaces the list of non word breaking element names.
+   *
+   * @param nwbElements element names; stored by reference, not copied
+   */
+  public void setNWBElements(String[] nwbElements) {
+    this.nwbElements = nwbElements;
+  }
+
+  /**
+   * Replaces the list of stop element names.
+   *
+   * @param stopElements element names; stored by reference, not copied
+   */
+  public void setStopElements(String[] stopElements) {
+    this.stopElements = stopElements;
+  }
+
+  /**
+   * Replaces the output options passed to the content handler.
+   *
+   * @param outputOptions option names; stored by reference, not copied
+   */
+  public void setOutputOptions(String[] outputOptions) {
+    this.outputOptions = outputOptions;
+  }
+
+  /**
+   * Parses the input with a SAX parser, using an {@link XmlTokenizerContentHandler}
+   * configured with the current settings, and returns what the handler collected.
+   *
+   * @return the value of {@code getXmlFragment()} of the content handler after the parse
+   * @throws ApplicationException if the parser cannot be created, the document
+   *         cannot be parsed, or reading the input fails; the original exception
+   *         is passed on as the cause
+   */
+  public String tokenize() throws ApplicationException {
+    try {
+      XmlTokenizerContentHandler contentHandler = new XmlTokenizerContentHandler(normFunctions, language);
+      contentHandler.setStopElements(stopElements);
+      contentHandler.setNWBElements(nwbElements);
+      contentHandler.setOutputOptions(outputOptions);
+      XMLReader xmlParser = createXmlReader();
+      xmlParser.setContentHandler(contentHandler);
+      xmlParser.parse(new InputSource(input));
+      return contentHandler.getXmlFragment();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Creates the SAX reader through the public JAXP factory instead of
+   * instantiating the JDK-internal {@code com.sun.org.apache.xerces.internal}
+   * parser class, which is not part of the supported API and is not available
+   * on every Java runtime. Once nothing else in this file uses that internal
+   * class, its import can be dropped as well.
+   *
+   * @return a namespace-aware XML reader
+   * @throws SAXException if no parser can be created; a configuration error is
+   *         wrapped so that callers only have to deal with one exception type
+   */
+  private XMLReader createXmlReader() throws SAXException {
+    SAXParserFactory factory = SAXParserFactory.newInstance();
+    // JAXP factories are not namespace-aware by default, whereas a directly
+    // constructed SAX2 XMLReader is; switch it on so the content handler keeps
+    // receiving namespace URIs and local names.
+    factory.setNamespaceAware(true);
+    try {
+      return factory.newSAXParser().getXMLReader();
+    } catch (ParserConfigurationException e) {
+      throw new SAXException("Could not create SAX parser", e);
+    }
+  }
+
+}
\ No newline at end of file