annotate software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizer.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 4a3641ae14d2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
2
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
3 import java.io.IOException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
4 import java.io.Reader;
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
5 import java.util.ArrayList;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
6 import java.util.Collections;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
7
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
8 import org.xml.sax.InputSource;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
9 import org.xml.sax.SAXException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
10 import org.xml.sax.XMLReader;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
11
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
12 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
13
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
14 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
15 import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
16 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler.Element;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
17
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
18 public class XmlTokenizer {
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
19 private XmlTokenizerContentHandler xmlTokenizerContentHandler;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
20 private Reader input;
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
21 private String docId;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
22 private String language = "eng"; // default: english
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
23 private String[] normFunctions = {"specialNorm"}; // default: use special norm function
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
24 private String[] nwbElements = {"lb", "br", "cb", "hi"}; // non word breaking elements, default: these elements
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
25 private String[] stopElements = {}; // stop elements: its tokens should not get word tags (when output format is "xml") or its tokens should be removed (if output format is "string")
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
26 private String[] elements = {};
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
27 private String[] highlightTerms = {}; // highlight terms, default: no highlight terms
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
28 private String outputFormat = "xml"; // default: xml
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
29 private String[] outputOptions = {};
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
30
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
31 public XmlTokenizer(Reader input) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
32 this.input = input;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
33 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
34
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
35 public void setDocIdentifier(String docId) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
36 this.docId = docId;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
37 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
38
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
39 public void setLanguage(String lang) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
40 String language = Language.getInstance().getLanguageId(lang);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
41 this.language = language;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
42 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
43
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
44 public void setNormFunctions(String[] normFunctions) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
45 this.normFunctions = normFunctions;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
46 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
47
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
48 public void setNWBElements(String[] nwbElements) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
49 this.nwbElements = nwbElements;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
50 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
51
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
52 public void setStopElements(String[] stopElements) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
53 this.stopElements = stopElements;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
54 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
55
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
56 public void setElements(String[] elements) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
57 this.elements = elements;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
58 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
59
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
60 public void setOutputFormat(String outputFormat) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
61 this.outputFormat = outputFormat;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
62 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
63
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
64 public void setOutputOptions(String[] outputOptions) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
65 this.outputOptions = outputOptions;
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
66 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
67
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
68 public void setHighlightTerms(String[] highlightTerms) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
69 this.highlightTerms = highlightTerms;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
70 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
71
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
72 public void tokenize() throws ApplicationException {
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
73 try {
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
74 xmlTokenizerContentHandler = new XmlTokenizerContentHandler(language);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
75 xmlTokenizerContentHandler.setDocIdentifier(docId);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
76 xmlTokenizerContentHandler.setStopElements(stopElements);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
77 xmlTokenizerContentHandler.setNWBElements(nwbElements);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
78 xmlTokenizerContentHandler.setHighlightTerms(highlightTerms);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
79 xmlTokenizerContentHandler.setNormFunctions(normFunctions);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
80 xmlTokenizerContentHandler.setOutputOptions(outputOptions);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
81 xmlTokenizerContentHandler.setOutputFormat(outputFormat);
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
82 XMLReader xmlParser = new SAXParser();
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
83 xmlParser.setContentHandler(xmlTokenizerContentHandler);
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
84 InputSource inputSource = new InputSource(input);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
85 xmlParser.parse(inputSource);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
86 } catch (SAXException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
87 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
88 } catch (IOException e) {
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
89 throw new ApplicationException(e);
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
90 }
23
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
91 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
92
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
93 public String getXmlResult() throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
94 return xmlTokenizerContentHandler.getResultString();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
95 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
96
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
97 public ArrayList<Token> getResultTokens() {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
98 return xmlTokenizerContentHandler.getResultTokens();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
99 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
100
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
101 public int getPageCount() {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
102 return xmlTokenizerContentHandler.getPageCount();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
103 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
104
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
105 public ArrayList<Element> getElements(String elementNamesStr) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
106 ArrayList<Element> retElements = new ArrayList<Element>();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
107 String[] elementNames = elementNamesStr.split(" ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
108 for (int i=0; i<elementNames.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
109 String elementName = elementNames[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
110 ArrayList<Element> elements = xmlTokenizerContentHandler.getElements(elementName);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
111 if (elements != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
112 retElements.addAll(elements);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
113 Collections.sort(retElements);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
114 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
115 return retElements;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
116 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
117
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
118 public String getStringResult() throws ApplicationException {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
119 StringBuilder result = new StringBuilder();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
120 ArrayList<Token> resultTokens = new ArrayList<Token>();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
121 if (elements != null && elements.length > 0) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
122 for (int i=0; i<elements.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
123 String elemName = elements[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
124 ArrayList<XmlTokenizerContentHandler.Element> elems = getElements(elemName);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
125 for (int j=0; j<elems.size(); j++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
126 XmlTokenizerContentHandler.Element elem = elems.get(j);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
127 resultTokens.addAll(elem.getTokens());
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
128 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
129 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
130 } else {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
131 resultTokens = xmlTokenizerContentHandler.getResultTokens(); // all tokens
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
132 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
133 if (resultTokens != null) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
134 for (int i=0; i<resultTokens.size(); i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
135 Token token = resultTokens.get(i);
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
136 if (! withLemmas(outputOptions)) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
137 if (useNormFunction()) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
138 String contentNorm = token.getContentNorm();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
139 if (contentNorm != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
140 result.append(contentNorm + " ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
141 } else if (useRegFunction()) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
142 String contentReg = token.getContentReg();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
143 if (contentReg != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
144 result.append(contentReg + " ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
145 else {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
146 String contentOrig = token.getContentOrig();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
147 if (contentOrig != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
148 result.append(contentOrig + " ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
149 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
150 } else {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
151 String contentOrig = token.getContentOrig();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
152 if (contentOrig != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
153 result.append(contentOrig + " ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
154 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
155 } else {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
156 String contentMorph = token.getContentMorph();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
157 if (contentMorph != null)
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
158 result.append(contentMorph + " ");
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
159 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
160 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
161 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
162 return result.toString();
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
163 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
164
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
165 private boolean withLemmas(String[] outputOptions) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
166 boolean result = false;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
167 if (outputOptions != null) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
168 for (int i=0; i< outputOptions.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
169 String function = outputOptions[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
170 if (function.equals("withLemmas"))
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
171 return true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
172 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
173 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
174 return result;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
175 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
176
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
177 private boolean useNormFunction() {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
178 boolean useNorm = false;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
179 if (normFunctions != null) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
180 for (int i=0; i< normFunctions.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
181 String function = normFunctions[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
182 if (function.equals("norm"))
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
183 return true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
184 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
185 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
186 return useNorm;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
187 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
188
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
189 private boolean useRegFunction() {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
190 boolean useReg = false;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
191 if (normFunctions != null) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
192 for (int i=0; i< normFunctions.length; i++) {
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
193 String function = normFunctions[i];
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
194 if (function.equals("reg"))
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
195 return true;
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
196 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
197 }
e845310098ba diverse Korrekturen
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents: 19
diff changeset
198 return useReg;
19
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
199 }
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
200
4a3641ae14d2 Erstellung
Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
parents:
diff changeset
201 }