Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/WordContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; import org.xml.sax.*; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer; import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; public class WordContentHandler implements ContentHandler { private static String DEFAULT_LANGUAGE = "eng"; private String xmlnsString = ""; private StringBuilder resultStrBuilder = new StringBuilder(); private String language = DEFAULT_LANGUAGE; private String formRegularized; private int wordLevelCounter = 0; private String wordElemContent = ""; private String wordElementName = "w"; public String getResult() { return resultStrBuilder.toString(); } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { } public void characters(char[] c, int start, int length) throws SAXException { char[] cCopy = new char[length]; System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! charactersStr.equals("")) { charactersStr = StringUtils.deresolveXmlEntities(charactersStr); write(charactersStr); } } public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void startPrefixMapping(String prefix, String uri) throws SAXException { if (prefix != null && prefix.equals("")) xmlnsString += "xmlns" + "=\"" + uri + "\" "; else xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; } public void endPrefixMapping(String prefix) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { int attrSize = attrs.getLength(); String attrString = ""; for (int i=0; i<attrSize; i++) { String attrQName = attrs.getQName(i); String attrValue = attrs.getValue(i); attrValue = StringUtils.forXML(attrValue); if (localName != null && localName.equals(wordElementName) && (attrQName.equals("lang") || attrQName.equals("xml:lang"))) language = attrValue; if (localName != null && localName.equals(wordElementName) && attrQName.equals("formRegularized") && attrValue != null && ! attrValue.isEmpty()) formRegularized = attrValue; attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; } if (attrString != null && ! attrString.isEmpty()) { attrString = attrString.trim(); } if (xmlnsString != null && ! xmlnsString.isEmpty()) { xmlnsString = xmlnsString.trim(); } // start all elements but no word elements within word elements (level > 0) if (localName != null && (! localName.equals(wordElementName) || (localName.equals(wordElementName) && wordLevelCounter == 0))) { write("<" + name); if (xmlnsString != null && ! xmlnsString.isEmpty()) write(" " + xmlnsString); if (attrString != null && ! attrString.isEmpty()) write(" " + attrString); write(">"); } xmlnsString = ""; if (localName != null && localName.equals(wordElementName)) { wordLevelCounter++; } } public void endElement(String uri, String localName, String name) throws SAXException { try { if (localName != null && localName.equals(wordElementName)) { wordLevelCounter--; } // special handling of word elements (with level 0): insert orig, reg and norm attributes if (localName != null && localName.equals(wordElementName) && wordLevelCounter == 0) { // handle formRegularized String newWordElemContentReg = ""; if (formRegularized == null || formRegularized.isEmpty()) { newWordElemContentReg = wordElemContent; } else if (formRegularized.contains("- ")) { String[] wordParts = formRegularized.split("- "); for (int i=0; i<wordParts.length - 1; i++) { String wp = wordParts[i]; newWordElemContentReg = newWordElemContentReg + wp + "-<lb/>"; } newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one } else if (formRegularized.contains(" ")) { String[] wordParts = formRegularized.split(" "); for (int i=0; i<wordParts.length - 1; i++) { String wp = wordParts[i]; newWordElemContentReg = newWordElemContentReg + wp + "<lb/>"; } newWordElemContentReg = newWordElemContentReg + wordParts[wordParts.length - 1]; // last one } else { newWordElemContentReg = formRegularized; } // handle normalized word form String[] norm = {"norm"}; Normalizer normNormalizer = new Normalizer(norm, language); String newWordElemContentNorm = null; if (formRegularized == null) newWordElemContentNorm = normNormalizer.normalize(wordElemContent); else newWordElemContentNorm = normNormalizer.normalize(newWordElemContentReg); // write full word content (including lb etc.) into elements orig, reg and norm write("<orig>" + wordElemContent + "</orig>"); write("<reg>" + newWordElemContentReg + "</reg>"); write("<norm>" + newWordElemContentNorm + "</norm>"); write("</" + name + ">"); formRegularized = null; wordElemContent = ""; } else if (localName != null && localName.equals(wordElementName) && wordLevelCounter > 0) { // nothing: remove word elements within word elements (level > 0) } else { write("</" + name + ">"); } } catch (ApplicationException e) { throw new SAXException(e); } } private void write(String outStr) throws SAXException { if (wordLevelCounter > 0) writeWordElemContent(outStr); else resultStrBuilder.append(outStr); } private void writeWordElemContent(String outStr) throws SAXException { wordElemContent = wordElemContent + outStr; } }