comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 4a3641ae14d2
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; 1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2 2
3 import java.io.IOException; 3 import java.io.IOException;
4 import java.io.Reader; 4 import java.io.Reader;
5 import java.util.ArrayList;
6 import java.util.Collections;
5 7
6 import org.xml.sax.InputSource; 8 import org.xml.sax.InputSource;
7 import org.xml.sax.SAXException; 9 import org.xml.sax.SAXException;
8 import org.xml.sax.XMLReader; 10 import org.xml.sax.XMLReader;
9 11
10 import com.sun.org.apache.xerces.internal.parsers.SAXParser; 12 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
11 13
12 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; 14 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
13 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; 15 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element;
14 17
15 public class XmlTokenizer { 18 public class XmlTokenizer {
19 private XmlTokenizerContentHandler xmlTokenizerContentHandler;
16 private Reader input; 20 private Reader input;
21 private String docId;
17 private String language = "eng"; // default: english 22 private String language = "eng"; // default: english
18 private String[] normFunctions = {"specialNorm"}; // default: use special norm function 23 private String[] normFunctions = {"specialNorm"}; // default: use special norm function
19 private String[] nwbElements = {"lb", "br", "cb", "figure", "image", "handwritten", "anchor", "emph", "note"}; // non word breaking elements, default: these elements 24 private String[] nwbElements = {"lb", "br", "cb", "hi"}; // non word breaking elements, default: these elements
20 private String[] stopElements = {}; // default: no stop elements 25 private String[] stopElements = {}; // stop elements: its tokens should not get word tags (when output format is "xml") or its tokens should be removed (if output format is "string")
26 private String[] elements = {};
27 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms
28 private String outputFormat = "xml"; // default: xml
21 private String[] outputOptions = {}; 29 private String[] outputOptions = {};
22 30
23 public XmlTokenizer(Reader input) { 31 public XmlTokenizer(Reader input) {
24 this.input = input; 32 this.input = input;
33 }
34
35 public void setDocIdentifier(String docId) {
36 this.docId = docId;
25 } 37 }
26 38
27 public void setLanguage(String lang) { 39 public void setLanguage(String lang) {
28 String language = Language.getInstance().getLanguageId(lang); 40 String language = Language.getInstance().getLanguageId(lang);
29 this.language = language; 41 this.language = language;
39 51
40 public void setStopElements(String[] stopElements) { 52 public void setStopElements(String[] stopElements) {
41 this.stopElements = stopElements; 53 this.stopElements = stopElements;
42 } 54 }
43 55
56 public void setElements(String[] elements) {
57 this.elements = elements;
58 }
59
60 public void setOutputFormat(String outputFormat) {
61 this.outputFormat = outputFormat;
62 }
63
44 public void setOutputOptions(String[] outputOptions) { 64 public void setOutputOptions(String[] outputOptions) {
45 this.outputOptions = outputOptions; 65 this.outputOptions = outputOptions;
46 } 66 }
47 67
48 public String tokenize() throws ApplicationException { 68 public void setHighlightTerms(String[] highlightTerms) {
49 String retString = null; 69 this.highlightTerms = highlightTerms;
70 }
71
72 public void tokenize() throws ApplicationException {
50 try { 73 try {
51 XmlTokenizerContentHandler dictContentHandler = new XmlTokenizerContentHandler(normFunctions, language); 74 xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language);
52 dictContentHandler.setStopElements(stopElements); 75 xmlTokenizerContentHandler.setDocIdentifier(docId);
53 dictContentHandler.setNWBElements(nwbElements); 76 xmlTokenizerContentHandler.setStopElements(stopElements);
54 dictContentHandler.setOutputOptions(outputOptions); 77 xmlTokenizerContentHandler.setNWBElements(nwbElements);
78 xmlTokenizerContentHandler.setHighlightTerms(highlightTerms);
79 xmlTokenizerContentHandler.setNormFunctions(normFunctions);
80 xmlTokenizerContentHandler.setOutputOptions(outputOptions);
81 xmlTokenizerContentHandler.setOutputFormat(outputFormat);
55 XMLReader xmlParser = new SAXParser(); 82 XMLReader xmlParser = new SAXParser();
56 xmlParser.setContentHandler(dictContentHandler); 83 xmlParser.setContentHandler(xmlTokenizerContentHandler);
57 InputSource inputSource = new InputSource(input); 84 InputSource inputSource = new InputSource(input);
58 xmlParser.parse(inputSource); 85 xmlParser.parse(inputSource);
59 retString = dictContentHandler.getXmlFragment();
60 } catch (SAXException e) { 86 } catch (SAXException e) {
61 throw new ApplicationException(e); 87 throw new ApplicationException(e);
62 } catch (IOException e) { 88 } catch (IOException e) {
63 throw new ApplicationException(e); 89 throw new ApplicationException(e);
64 } 90 }
65 return retString; 91 }
92
93 public String getXmlResult() throws ApplicationException {
94 return xmlTokenizerContentHandler.getResultString();
95 }
96
97 public ArrayList<Token> getResultTokens() {
98 return xmlTokenizerContentHandler.getResultTokens();
99 }
100
101 public int getPageCount() {
102 return xmlTokenizerContentHandler.getPageCount();
103 }
104
105 public ArrayList<Element> getElements(String elementNamesStr) {
106 ArrayList<Element> retElements = new ArrayList<Element>();
107 String[] elementNames = elementNamesStr.split(" ");
108 for (int i=0; i<elementNames.length; i++) {
109 String elementName = elementNames[i];
110 ArrayList<Element> elements = xmlTokenizerContentHandler.getElements(elementName);
111 if (elements != null)
112 retElements.addAll(elements);
113 Collections.sort(retElements);
114 }
115 return retElements;
116 }
117
118 public String getStringResult() throws ApplicationException {
119 StringBuilder result = new StringBuilder();
120 ArrayList<Token> resultTokens = new ArrayList<Token>();
121 if (elements != null && elements.length > 0) {
122 for (int i=0; i<elements.length; i++) {
123 String elemName = elements[i];
124 ArrayList<XmlTokenizerContentHandler.Element> elems = getElements(elemName);
125 for (int j=0; j<elems.size(); j++) {
126 XmlTokenizerContentHandler.Element elem = elems.get(j);
127 resultTokens.addAll(elem.getTokens());
128 }
129 }
130 } else {
131 resultTokens = xmlTokenizerContentHandler.getResultTokens(); // all tokens
132 }
133 if (resultTokens != null) {
134 for (int i=0; i<resultTokens.size(); i++) {
135 Token token = resultTokens.get(i);
136 if (! withLemmas(outputOptions)) {
137 if (useNormFunction()) {
138 String contentNorm = token.getContentNorm();
139 if (contentNorm != null)
140 result.append(contentNorm + " ");
141 } else if (useRegFunction()) {
142 String contentReg = token.getContentReg();
143 if (contentReg != null)
144 result.append(contentReg + " ");
145 else {
146 String contentOrig = token.getContentOrig();
147 if (contentOrig != null)
148 result.append(contentOrig + " ");
149 }
150 } else {
151 String contentOrig = token.getContentOrig();
152 if (contentOrig != null)
153 result.append(contentOrig + " ");
154 }
155 } else {
156 String contentMorph = token.getContentMorph();
157 if (contentMorph != null)
158 result.append(contentMorph + " ");
159 }
160 }
161 }
162 return result.toString();
163 }
164
165 private boolean withLemmas(String[] outputOptions) {
166 boolean result = false;
167 if (outputOptions != null) {
168 for (int i=0; i< outputOptions.length; i++) {
169 String function = outputOptions[i];
170 if (function.equals("withLemmas"))
171 return true;
172 }
173 }
174 return result;
175 }
176
177 private boolean useNormFunction() {
178 boolean useNorm = false;
179 if (normFunctions != null) {
180 for (int i=0; i< normFunctions.length; i++) {
181 String function = normFunctions[i];
182 if (function.equals("norm"))
183 return true;
184 }
185 }
186 return useNorm;
187 }
188
189 private boolean useRegFunction() {
190 boolean useReg = false;
191 if (normFunctions != null) {
192 for (int i=0; i< normFunctions.length; i++) {
193 String function = normFunctions[i];
194 if (function.equals("reg"))
195 return true;
196 }
197 }
198 return useReg;
66 } 199 }
67 200
68 } 201 }