Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | e845310098ba |
comparison
equal
deleted
inserted
replaced
18:dc5e9fcb3fdc | 19:4a3641ae14d2 |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.Reader; | |
5 | |
6 import org.xml.sax.InputSource; | |
7 import org.xml.sax.SAXException; | |
8 import org.xml.sax.XMLReader; | |
9 | |
10 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
11 | |
12 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
13 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | |
14 | |
15 public class XmlTokenizer { | |
16 private Reader input; | |
17 private String language = "eng"; // default: english | |
18 private String[] normFunctions = {"specialNorm"}; // default: use special norm function | |
19 private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements | |
20 private String[] stopElements = {}; // default: no stop elements | |
21 private String[] outputOptions = {}; | |
22 | |
23 public XmlTokenizer(Reader input) { | |
24 this.input = input; | |
25 } | |
26 | |
27 public void setLanguage(String lang) { | |
28 String language = Language.getInstance().getLanguageId(lang); | |
29 this.language = language; | |
30 } | |
31 | |
32 public void setNormFunctions(String[] normFunctions) { | |
33 this.normFunctions = normFunctions; | |
34 } | |
35 | |
36 public void setNWBElements(String[] nwbElements) { | |
37 this.nwbElements = nwbElements; | |
38 } | |
39 | |
40 public void setStopElements(String[] stopElements) { | |
41 this.stopElements = stopElements; | |
42 } | |
43 | |
44 public void setOutputOptions(String[] outputOptions) { | |
45 this.outputOptions = outputOptions; | |
46 } | |
47 | |
48 public String tokenize() throws ApplicationException { | |
49 String retString = null; | |
50 try { | |
51 XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); | |
52 dictContentHandler.setStopElements(stopElements); | |
53 dictContentHandler.setNWBElements(nwbElements); | |
54 dictContentHandler.setOutputOptions(outputOptions); | |
55 XMLReader xmlParser = new SAXParser(); | |
56 xmlParser.setContentHandler(dictContentHandler); | |
57 InputSource inputSource = new InputSource(input); | |
58 xmlParser.parse(inputSource); | |
59 retString = dictContentHandler.getXmlFragment(); | |
60 } catch (SAXException e) { | |
61 throw new ApplicationException(e); | |
62 } catch (IOException e) { | |
63 throw new ApplicationException(e); | |
64 } | |
65 return retString; | |
66 } | |
67 | |
68 } |