comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
comparison
equal deleted inserted replaced
22:6a45a982c333 23:e845310098ba
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
2
3 import org.xml.sax.*;
4
5 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
6 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
7 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
8
9 public class WordContentHandler implements ContentHandler {
10 private static String DEFAULT_LANGUAGE = "eng";
11 private String xmlnsString = "";
12 private StringBuilder resultStrBuilder = new StringBuilder();
13 private String language = DEFAULT_LANGUAGE;
14 private String formRegularized;
15 private int wordLevelCounter = 0;
16 private String wordElemContent = "";
17 private String wordElementName = "w";
18
19 public String getResult() {
20 return resultStrBuilder.toString();
21 }
22
23 public void startDocument() throws SAXException {
24 }
25
26 public void endDocument() throws SAXException {
27 }
28
29 public void characters(char[] c, int start, int length) throws SAXException {
30 char[] cCopy = new char[length];
31 System.arraycopy(c, start, cCopy, 0, length);
32 String charactersStr = String.valueOf(cCopy);
33 if (charactersStr != null && ! charactersStr.equals("")) {
34 charactersStr = StringUtils.deresolveXmlEntities(charactersStr);
35 write(charactersStr);
36 }
37 }
38
39 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
40 }
41
42 public void processingInstruction(String target, String data) throws SAXException {
43 }
44
45 public void setDocumentLocator(Locator locator) {
46 }
47
48 public void startPrefixMapping(String prefix, String uri) throws SAXException {
49 if (prefix != null && prefix.equals(""))
50 xmlnsString += "xmlns" + "=\"" + uri + "\" ";
51 else
52 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
53 }
54
55 public void endPrefixMapping(String prefix) throws SAXException {
56 }
57
58 public void skippedEntity(String name) throws SAXException {
59 }
60
61 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
62 int attrSize = attrs.getLength();
63 String attrString = "";
64 for (int i=0; i<attrSize; i++) {
65 String attrQName = attrs.getQName(i);
66 String attrValue = attrs.getValue(i);
67 attrValue = StringUtils.forXML(attrValue);
68 if (localName != null && localName.equals(wordElementName) && (attrQName.equals("lang") || attrQName.equals("xml:lang")))
69 language = attrValue;
70 if (localName != null && localName.equals(wordElementName) && attrQName.equals("formRegularized") && attrValue != null && ! attrValue.isEmpty())
71 formRegularized = attrValue;
72 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
73 }
74 if (attrString != null && ! attrString.isEmpty()) {
75 attrString = attrString.trim();
76 }
77 if (xmlnsString != null && ! xmlnsString.isEmpty()) {
78 xmlnsString = xmlnsString.trim();
79 }
80 // start all elements but no word elements within word elements (level > 0)
81 if (localName != null && (! localName.equals(wordElementName) || (localName.equals(wordElementName) && wordLevelCounter == 0))) {
82 write("<" + name);
83 if (xmlnsString != null && ! xmlnsString.isEmpty())
84 write(" " + xmlnsString);
85 if (attrString != null && ! attrString.isEmpty())
86 write(" " + attrString);
87 write(">");
88 }
89 xmlnsString = "";
90 if (localName != null && localName.equals(wordElementName)) {
91 wordLevelCounter++;
92 }
93 }
94
95 public void endElement(String uri, String localName, String name) throws SAXException {
96 try {
97 if (localName != null && localName.equals(wordElementName)) {
98 wordLevelCounter--;
99 }
100 // special handling of word elements (with level 0): insert orig, reg and norm attributes
101 if (localName != null && localName.equals(wordElementName) && wordLevelCounter == 0) {
102 // handle formRegularized
103 String newWordElemContentReg = "";
104 if (formRegularized == null || formRegularized.isEmpty()) {
105 newWordElemContentReg = wordElemContent;
106 } else if (formRegularized.contains("- ")) {
107 String[] wordParts = formRegularized.split("- ");
108 for (int i=0; i<wordParts.length - 1; i++) {
109 String wp = wordParts[i];
110 newWordElemContentReg = newWordElemContentReg + wp + "-<lb/>";
111 }
112 newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one
113 } else if (formRegularized.contains(" ")) {
114 String[] wordParts = formRegularized.split(" ");
115 for (int i=0; i<wordParts.length - 1; i++) {
116 String wp = wordParts[i];
117 newWordElemContentReg = newWordElemContentReg + wp + "<lb/>";
118 }
119 newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one
120 } else {
121 newWordElemContentReg = formRegularized;
122 }
123 // handle normalized word form
124 String[] norm = {"norm"};
125 Normalizer normNormalizer = new Normalizer(norm, language);
126 String newWordElemContentNorm = null;
127 if (formRegularized == null)
128 newWordElemContentNorm = normNormalizer.normalize(wordElemContent);
129 else
130 newWordElemContentNorm = normNormalizer.normalize(newWordElemContentReg);
131 // write full word content (including lb etc.) into elements orig, reg and norm
132 write("<orig>" + wordElemContent + "</orig>");
133 write("<reg>" + newWordElemContentReg + "</reg>");
134 write("<norm>" + newWordElemContentNorm + "</norm>");
135 write("</" + name + ">");
136 formRegularized = null;
137 wordElemContent = "";
138 } else if (localName != null && localName.equals(wordElementName) && wordLevelCounter > 0) {
139 // nothing: remove word elements within word elements (level > 0)
140 } else {
141 write("</" + name + ">");
142 }
143 } catch (ApplicationException e) {
144 throw new SAXException(e);
145 }
146 }
147
148 private void write(String outStr) throws SAXException {
149 if (wordLevelCounter > 0)
150 writeWordElemContent(outStr);
151 else
152 resultStrBuilder.append(outStr);
153 }
154
155 private void writeWordElemContent(String outStr) throws SAXException {
156 wordElemContent = wordElemContent + outStr;
157 }
158
159 }