Mercurial > hg > mpdl-group
comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
22:6a45a982c333 | 23:e845310098ba |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; | |
2 | |
3 import org.xml.sax.*; | |
4 | |
5 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
6 import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; | |
7 import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; | |
8 | |
9 public class WordContentHandler implements ContentHandler { | |
10 private static String DEFAULT_LANGUAGE = "eng"; | |
11 private String xmlnsString = ""; | |
12 private StringBuilder resultStrBuilder = new StringBuilder(); | |
13 private String language = DEFAULT_LANGUAGE; | |
14 private String formRegularized; | |
15 private int wordLevelCounter = 0; | |
16 private String wordElemContent = ""; | |
17 private String wordElementName = "w"; | |
18 | |
19 public String getResult() { | |
20 return resultStrBuilder.toString(); | |
21 } | |
22 | |
23 public void startDocument() throws SAXException { | |
24 } | |
25 | |
26 public void endDocument() throws SAXException { | |
27 } | |
28 | |
29 public void characters(char[] c, int start, int length) throws SAXException { | |
30 char[] cCopy = new char[length]; | |
31 System.arraycopy(c, start, cCopy, 0, length); | |
32 String charactersStr = String.valueOf(cCopy); | |
33 if (charactersStr != null && ! charactersStr.equals("")) { | |
34 charactersStr = StringUtils.deresolveXmlEntities(charactersStr); | |
35 write(charactersStr); | |
36 } | |
37 } | |
38 | |
39 public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { | |
40 } | |
41 | |
42 public void processingInstruction(String target, String data) throws SAXException { | |
43 } | |
44 | |
45 public void setDocumentLocator(Locator locator) { | |
46 } | |
47 | |
48 public void startPrefixMapping(String prefix, String uri) throws SAXException { | |
49 if (prefix != null && prefix.equals("")) | |
50 xmlnsString += "xmlns" + "=\"" + uri + "\" "; | |
51 else | |
52 xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; | |
53 } | |
54 | |
55 public void endPrefixMapping(String prefix) throws SAXException { | |
56 } | |
57 | |
58 public void skippedEntity(String name) throws SAXException { | |
59 } | |
60 | |
61 public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { | |
62 int attrSize = attrs.getLength(); | |
63 String attrString = ""; | |
64 for (int i=0; i<attrSize; i++) { | |
65 String attrQName = attrs.getQName(i); | |
66 String attrValue = attrs.getValue(i); | |
67 attrValue = StringUtils.forXML(attrValue); | |
68 if (localName != null && localName.equals(wordElementName) && (attrQName.equals("lang") || attrQName.equals("xml:lang"))) | |
69 language = attrValue; | |
70 if (localName != null && localName.equals(wordElementName) && attrQName.equals("formRegularized") && attrValue != null && ! attrValue.isEmpty()) | |
71 formRegularized = attrValue; | |
72 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; | |
73 } | |
74 if (attrString != null && ! attrString.isEmpty()) { | |
75 attrString = attrString.trim(); | |
76 } | |
77 if (xmlnsString != null && ! xmlnsString.isEmpty()) { | |
78 xmlnsString = xmlnsString.trim(); | |
79 } | |
80 // start all elements but no word elements within word elements (level > 0) | |
81 if (localName != null && (! localName.equals(wordElementName) || (localName.equals(wordElementName) && wordLevelCounter == 0))) { | |
82 write("<" + name); | |
83 if (xmlnsString != null && ! xmlnsString.isEmpty()) | |
84 write(" " + xmlnsString); | |
85 if (attrString != null && ! attrString.isEmpty()) | |
86 write(" " + attrString); | |
87 write(">"); | |
88 } | |
89 xmlnsString = ""; | |
90 if (localName != null && localName.equals(wordElementName)) { | |
91 wordLevelCounter++; | |
92 } | |
93 } | |
94 | |
95 public void endElement(String uri, String localName, String name) throws SAXException { | |
96 try { | |
97 if (localName != null && localName.equals(wordElementName)) { | |
98 wordLevelCounter--; | |
99 } | |
100 // special handling of word elements (with level 0): insert orig, reg and norm attributes | |
101 if (localName != null && localName.equals(wordElementName) && wordLevelCounter == 0) { | |
102 // handle formRegularized | |
103 String newWordElemContentReg = ""; | |
104 if (formRegularized == null || formRegularized.isEmpty()) { | |
105 newWordElemContentReg = wordElemContent; | |
106 } else if (formRegularized.contains("- ")) { | |
107 String[] wordParts = formRegularized.split("- "); | |
108 for (int i=0; i<wordParts.length - 1; i++) { | |
109 String wp = wordParts[i]; | |
110 newWordElemContentReg = newWordElemContentReg + wp + "-<lb/>"; | |
111 } | |
112 newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one | |
113 } else if (formRegularized.contains(" ")) { | |
114 String[] wordParts = formRegularized.split(" "); | |
115 for (int i=0; i<wordParts.length - 1; i++) { | |
116 String wp = wordParts[i]; | |
117 newWordElemContentReg = newWordElemContentReg + wp + "<lb/>"; | |
118 } | |
119 newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one | |
120 } else { | |
121 newWordElemContentReg = formRegularized; | |
122 } | |
123 // handle normalized word form | |
124 String[] norm = {"norm"}; | |
125 Normalizer normNormalizer = new Normalizer(norm, language); | |
126 String newWordElemContentNorm = null; | |
127 if (formRegularized == null) | |
128 newWordElemContentNorm = normNormalizer.normalize(wordElemContent); | |
129 else | |
130 newWordElemContentNorm = normNormalizer.normalize(newWordElemContentReg); | |
131 // write full word content (including lb etc.) into elements orig, reg and norm | |
132 write("<orig>" + wordElemContent + "</orig>"); | |
133 write("<reg>" + newWordElemContentReg + "</reg>"); | |
134 write("<norm>" + newWordElemContentNorm + "</norm>"); | |
135 write("</" + name + ">"); | |
136 formRegularized = null; | |
137 wordElemContent = ""; | |
138 } else if (localName != null && localName.equals(wordElementName) && wordLevelCounter > 0) { | |
139 // nothing: remove word elements within word elements (level > 0) | |
140 } else { | |
141 write("</" + name + ">"); | |
142 } | |
143 } catch (ApplicationException e) { | |
144 throw new SAXException(e); | |
145 } | |
146 } | |
147 | |
148 private void write(String outStr) throws SAXException { | |
149 if (wordLevelCounter > 0) | |
150 writeWordElemContent(outStr); | |
151 else | |
152 resultStrBuilder.append(outStr); | |
153 } | |
154 | |
155 private void writeWordElemContent(String outStr) throws SAXException { | |
156 wordElemContent = wordElemContent + outStr; | |
157 } | |
158 | |
159 } |