Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java Tue Nov 27 12:35:19 2012 +0100 @@ -0,0 +1,159 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; + +public class WordContentHandler implements ContentHandler { + private static String DEFAULT_LANGUAGE = "eng"; + private String xmlnsString = ""; + private StringBuilder resultStrBuilder = new StringBuilder(); + private String language = DEFAULT_LANGUAGE; + private String formRegularized; + private int wordLevelCounter = 0; + private String wordElemContent = ""; + private String wordElementName = "w"; + + public String getResult() { + return resultStrBuilder.toString(); + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! charactersStr.equals("")) { + charactersStr = StringUtils.deresolveXmlEntities(charactersStr); + write(charactersStr); + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + if (prefix != null && prefix.equals("")) + xmlnsString += "xmlns" + "=\"" + uri + "\" "; + else + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i<attrSize; i++) { + String attrQName = attrs.getQName(i); + String attrValue = attrs.getValue(i); + attrValue = StringUtils.forXML(attrValue); + if (localName != null && localName.equals(wordElementName) && (attrQName.equals("lang") || attrQName.equals("xml:lang"))) + language = attrValue; + if (localName != null && localName.equals(wordElementName) && attrQName.equals("formRegularized") && attrValue != null && ! attrValue.isEmpty()) + formRegularized = attrValue; + attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; + } + if (attrString != null && ! attrString.isEmpty()) { + attrString = attrString.trim(); + } + if (xmlnsString != null && ! xmlnsString.isEmpty()) { + xmlnsString = xmlnsString.trim(); + } + // start all elements but no word elements within word elements (level > 0) + if (localName != null && (! localName.equals(wordElementName) || (localName.equals(wordElementName) && wordLevelCounter == 0))) { + write("<" + name); + if (xmlnsString != null && ! xmlnsString.isEmpty()) + write(" " + xmlnsString); + if (attrString != null && ! attrString.isEmpty()) + write(" " + attrString); + write(">"); + } + xmlnsString = ""; + if (localName != null && localName.equals(wordElementName)) { + wordLevelCounter++; + } + } + + public void endElement(String uri, String localName, String name) throws SAXException { + try { + if (localName != null && localName.equals(wordElementName)) { + wordLevelCounter--; + } + // special handling of word elements (with level 0): insert orig, reg and norm attributes + if (localName != null && localName.equals(wordElementName) && wordLevelCounter == 0) { + // handle formRegularized + String newWordElemContentReg = ""; + if (formRegularized == null || formRegularized.isEmpty()) { + newWordElemContentReg = wordElemContent; + } else if (formRegularized.contains("- ")) { + String[] wordParts = formRegularized.split("- "); + for (int i=0; i<wordParts.length - 1; i++) { + String wp = wordParts[i]; + newWordElemContentReg = newWordElemContentReg + wp + "-<lb/>"; + } + newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one + } else if (formRegularized.contains(" ")) { + String[] wordParts = formRegularized.split(" "); + for (int i=0; i<wordParts.length - 1; i++) { + String wp = wordParts[i]; + newWordElemContentReg = newWordElemContentReg + wp + "<lb/>"; + } + newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one + } else { + newWordElemContentReg = formRegularized; + } + // handle normalized word form + String[] norm = {"norm"}; + Normalizer normNormalizer = new Normalizer(norm, language); + String newWordElemContentNorm = null; + if (formRegularized == null) + newWordElemContentNorm = normNormalizer.normalize(wordElemContent); + else + newWordElemContentNorm = normNormalizer.normalize(newWordElemContentReg); + // write full word content (including lb etc.) into elements orig, reg and norm + write("<orig>" + wordElemContent + "</orig>"); + write("<reg>" + newWordElemContentReg + "</reg>"); + write("<norm>" + newWordElemContentNorm + "</norm>"); + write("</" + name + ">"); + formRegularized = null; + wordElemContent = ""; + } else if (localName != null && localName.equals(wordElementName) && wordLevelCounter > 0) { + // nothing: remove word elements within word elements (level > 0) + } else { + write("</" + name + ">"); + } + } catch (ApplicationException e) { + throw new SAXException(e); + } + } + + private void write(String outStr) throws SAXException { + if (wordLevelCounter > 0) + writeWordElemContent(outStr); + else + resultStrBuilder.append(outStr); + } + + private void writeWordElemContent(String outStr) throws SAXException { + wordElemContent = wordElemContent + outStr; + } + +}