comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2

Erstellung
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Wed, 09 Nov 2011 15:32:05 +0100
parents
children e845310098ba
comparison
equal deleted inserted replaced
18:dc5e9fcb3fdc 19:4a3641ae14d2
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2
import java.io.IOException;
import java.io.Reader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
14
15 public class XmlTokenizer {
16 private Reader input;
17 private String language = "eng"; // default: english
18 private String[] normFunctions = {"specialNorm"}; // default: use special norm function
19 private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements
20 private String[] stopElements = {}; // default: no stop elements
21 private String[] outputOptions = {};
22
23 public XmlTokenizer(Reader input) {
24 this.input = input;
25 }
26
27 public void setLanguage(String lang) {
28 String language = Language.getInstance().getLanguageId(lang);
29 this.language = language;
30 }
31
32 public void setNormFunctions(String[] normFunctions) {
33 this.normFunctions = normFunctions;
34 }
35
36 public void setNWBElements(String[] nwbElements) {
37 this.nwbElements = nwbElements;
38 }
39
40 public void setStopElements(String[] stopElements) {
41 this.stopElements = stopElements;
42 }
43
44 public void setOutputOptions(String[] outputOptions) {
45 this.outputOptions = outputOptions;
46 }
47
48 public String tokenize() throws ApplicationException {
49 String retString = null;
50 try {
51 XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language);
52 dictContentHandler.setStopElements(stopElements);
53 dictContentHandler.setNWBElements(nwbElements);
54 dictContentHandler.setOutputOptions(outputOptions);
55 XMLReader xmlParser = new SAXParser();
56 xmlParser.setContentHandler(dictContentHandler);
57 InputSource inputSource = new InputSource(input);
58 xmlParser.parse(inputSource);
59 retString = dictContentHandler.getXmlFragment();
60 } catch (SAXException e) {
61 throw new ApplicationException(e);
62 } catch (IOException e) {
63 throw new ApplicationException(e);
64 }
65 return retString;
66 }
67
68 }