Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 4a3641ae14d2 |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; |
2 | 2 |
3 import java.io.IOException; | 3 import java.io.IOException; |
4 import java.io.Reader; | 4 import java.io.Reader; |
5 import java.util.ArrayList; | |
6 import java.util.Collections; | |
5 | 7 |
6 import org.xml.sax.InputSource; | 8 import org.xml.sax.InputSource; |
7 import org.xml.sax.SAXException; | 9 import org.xml.sax.SAXException; |
8 import org.xml.sax.XMLReader; | 10 import org.xml.sax.XMLReader; |
9 | 11 |
10 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | 12 import com.sun.org.apache.xerces.internal.parsers.SAXParser; |
11 | 13 |
12 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | 14 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; |
13 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; | 15 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; |
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element; | |
14 | 17 |
15 public class XmlTokenizer { | 18 public class XmlTokenizer { |
19 private XmlTokenizerContentHandler xmlTokenizerContentHandler; | |
16 private Reader input; | 20 private Reader input; |
21 private String docId; | |
17 private String language = "eng"; // default: english | 22 private String language = "eng"; // default: english |
18 private String[] normFunctions = {"specialNorm"}; // default: use special norm function | 23 private String[] normFunctions = {"specialNorm"}; // default: use special norm function |
19 private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements | 24 private String[] nwbElements = {"lb", "br", "cb", "hi"}; // non word breaking elements, default: these elements |
20 private String[] stopElements = {}; // default: no stop elements | 25 private String[] stopElements = {}; // stop elements: their tokens should not get word tags (when output format is "xml") or should be removed (if output format is "string") |
26 private String[] elements = {}; | |
27 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms | |
28 private String outputFormat = "xml"; // default: xml | |
21 private String[] outputOptions = {}; | 29 private String[] outputOptions = {}; |
22 | 30 |
23 public XmlTokenizer(Reader input) { | 31 public XmlTokenizer(Reader input) { |
24 this.input = input; | 32 this.input = input; |
33 } | |
34 | |
35 public void setDocIdentifier(String docId) { | |
36 this.docId = docId; | |
25 } | 37 } |
26 | 38 |
27 public void setLanguage(String lang) { | 39 public void setLanguage(String lang) { |
28 String language = Language.getInstance().getLanguageId(lang); | 40 String language = Language.getInstance().getLanguageId(lang); |
29 this.language = language; | 41 this.language = language; |
39 | 51 |
40 public void setStopElements(String[] stopElements) { | 52 public void setStopElements(String[] stopElements) { |
41 this.stopElements = stopElements; | 53 this.stopElements = stopElements; |
42 } | 54 } |
43 | 55 |
56 public void setElements(String[] elements) { | |
57 this.elements = elements; | |
58 } | |
59 | |
60 public void setOutputFormat(String outputFormat) { | |
61 this.outputFormat = outputFormat; | |
62 } | |
63 | |
44 public void setOutputOptions(String[] outputOptions) { | 64 public void setOutputOptions(String[] outputOptions) { |
45 this.outputOptions = outputOptions; | 65 this.outputOptions = outputOptions; |
46 } | 66 } |
47 | 67 |
48 public String tokenize() throws ApplicationException { | 68 public void setHighlightTerms(String[] highlightTerms) { |
49 String retString = null; | 69 this.highlightTerms = highlightTerms; |
70 } | |
71 | |
72 public void tokenize() throws ApplicationException { | |
50 try { | 73 try { |
51 XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); | 74 xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language); |
52 dictContentHandler.setStopElements(stopElements); | 75 xmlTokenizerContentHandler.setDocIdentifier(docId); |
53 dictContentHandler.setNWBElements(nwbElements); | 76 xmlTokenizerContentHandler.setStopElements(stopElements); |
54 dictContentHandler.setOutputOptions(outputOptions); | 77 xmlTokenizerContentHandler.setNWBElements(nwbElements); |
78 xmlTokenizerContentHandler.setHighlightTerms(highlightTerms); | |
79 xmlTokenizerContentHandler.setNormFunctions(normFunctions); | |
80 xmlTokenizerContentHandler.setOutputOptions(outputOptions); | |
81 xmlTokenizerContentHandler.setOutputFormat(outputFormat); | |
55 XMLReader xmlParser = new SAXParser(); | 82 XMLReader xmlParser = new SAXParser(); |
56 xmlParser.setContentHandler(dictContentHandler); | 83 xmlParser.setContentHandler(xmlTokenizerContentHandler); |
57 InputSource inputSource = new InputSource(input); | 84 InputSource inputSource = new InputSource(input); |
58 xmlParser.parse(inputSource); | 85 xmlParser.parse(inputSource); |
59 retString = dictContentHandler.getXmlFragment(); | |
60 } catch (SAXException e) { | 86 } catch (SAXException e) { |
61 throw new ApplicationException(e); | 87 throw new ApplicationException(e); |
62 } catch (IOException e) { | 88 } catch (IOException e) { |
63 throw new ApplicationException(e); | 89 throw new ApplicationException(e); |
64 } | 90 } |
65 return retString; | 91 } |
92 | |
93 public String getXmlResult() throws ApplicationException { | |
94 return xmlTokenizerContentHandler.getResultString(); | |
95 } | |
96 | |
97 public ArrayList<Token> getResultTokens() { | |
98 return xmlTokenizerContentHandler.getResultTokens(); | |
99 } | |
100 | |
101 public int getPageCount() { | |
102 return xmlTokenizerContentHandler.getPageCount(); | |
103 } | |
104 | |
105 public ArrayList<Element> getElements(String elementNamesStr) { | |
106 ArrayList<Element> retElements = new ArrayList<Element>(); | |
107 String[] elementNames = elementNamesStr.split(" "); | |
108 for (int i=0; i<elementNames.length; i++) { | |
109 String elementName = elementNames[i]; | |
110 ArrayList<Element> elements = xmlTokenizerContentHandler.getElements(elementName); | |
111 if (elements != null) | |
112 retElements.addAll(elements); | |
113 Collections.sort(retElements); | |
114 } | |
115 return retElements; | |
116 } | |
117 | |
118 public String getStringResult() throws ApplicationException { | |
119 StringBuilder result = new StringBuilder(); | |
120 ArrayList<Token> resultTokens = new ArrayList<Token>(); | |
121 if (elements != null && elements.length > 0) { | |
122 for (int i=0; i<elements.length; i++) { | |
123 String elemName = elements[i]; | |
124 ArrayList<XmlTokenizerContentHandler.Element> elems = getElements(elemName); | |
125 for (int j=0; j<elems.size(); j++) { | |
126 XmlTokenizerContentHandler.Element elem = elems.get(j); | |
127 resultTokens.addAll(elem.getTokens()); | |
128 } | |
129 } | |
130 } else { | |
131 resultTokens = xmlTokenizerContentHandler.getResultTokens(); // all tokens | |
132 } | |
133 if (resultTokens != null) { | |
134 for (int i=0; i<resultTokens.size(); i++) { | |
135 Token token = resultTokens.get(i); | |
136 if (! withLemmas(outputOptions)) { | |
137 if (useNormFunction()) { | |
138 String contentNorm = token.getContentNorm(); | |
139 if (contentNorm != null) | |
140 result.append(contentNorm + " "); | |
141 } else if (useRegFunction()) { | |
142 String contentReg = token.getContentReg(); | |
143 if (contentReg != null) | |
144 result.append(contentReg + " "); | |
145 else { | |
146 String contentOrig = token.getContentOrig(); | |
147 if (contentOrig != null) | |
148 result.append(contentOrig + " "); | |
149 } | |
150 } else { | |
151 String contentOrig = token.getContentOrig(); | |
152 if (contentOrig != null) | |
153 result.append(contentOrig + " "); | |
154 } | |
155 } else { | |
156 String contentMorph = token.getContentMorph(); | |
157 if (contentMorph != null) | |
158 result.append(contentMorph + " "); | |
159 } | |
160 } | |
161 } | |
162 return result.toString(); | |
163 } | |
164 | |
165 private boolean withLemmas(String[] outputOptions) { | |
166 boolean result = false; | |
167 if (outputOptions != null) { | |
168 for (int i=0; i< outputOptions.length; i++) { | |
169 String function = outputOptions[i]; | |
170 if (function.equals("withLemmas")) | |
171 return true; | |
172 } | |
173 } | |
174 return result; | |
175 } | |
176 | |
177 private boolean useNormFunction() { | |
178 boolean useNorm = false; | |
179 if (normFunctions != null) { | |
180 for (int i=0; i< normFunctions.length; i++) { | |
181 String function = normFunctions[i]; | |
182 if (function.equals("norm")) | |
183 return true; | |
184 } | |
185 } | |
186 return useNorm; | |
187 } | |
188 | |
189 private boolean useRegFunction() { | |
190 boolean useReg = false; | |
191 if (normFunctions != null) { | |
192 for (int i=0; i< normFunctions.length; i++) { | |
193 String function = normFunctions[i]; | |
194 if (function.equals("reg")) | |
195 return true; | |
196 } | |
197 } | |
198 return useReg; | |
66 } | 199 } |
67 | 200 |
68 } | 201 } |