Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/NormalizeCharsContentHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.doc; import java.util.ArrayList; import org.apache.lucene.analysis.Token; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlNormalizer; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; public class NormalizeCharsContentHandler implements ContentHandler { private String xmlnsString = ""; private String[] normalizeFunctions = {}; // default: without normalize functions private String language; private String outputXmlFragment = ""; private Element currentElement; public NormalizeCharsContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { this.normalizeFunctions = normalizeFunctions; this.language = language; } public String getXmlFragment() { return outputXmlFragment; } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { } public void characters(char[] c, int start, int length) throws SAXException { char[] cCopy = new char[length]; System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! charactersStr.equals("")) { charactersStr = normalize(charactersStr); if (currentElement != null) currentElement.value = charactersStr; write(charactersStr); } } public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void startPrefixMapping(String prefix, String uri) throws SAXException { xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; } public void endPrefixMapping(String prefix) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { currentElement = new Element(language, name); int attrSize = attrs.getLength(); String attrString = ""; for (int i=0; i<attrSize; i++) { String attrQName = attrs.getQName(i); String attrValue = attrs.getValue(i); attrValue = StringUtilEscapeChars.forXML(attrValue); attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; if (attrQName != null && attrQName.equals("lang") && attrValue != null) { currentElement.language = attrValue; } } currentElement.attrString = attrString; if (xmlnsString.equals("")) { write("<" + name + attrString + ">"); } else { currentElement.xmlnsString = xmlnsString; write("<" + name + " " + xmlnsString + attrString + ">"); } xmlnsString = ""; } public void endElement(String uri, String localName, String name) throws SAXException { currentElement = null; write("</" + name + ">"); } private void write(String outStr) throws SAXException { outputXmlFragment += outStr; } private String normalize(String charactersStr) throws SAXException { String retStr = ""; try { MpdlTokenizerAnalyzer tokenizerAnalyzer = new MpdlTokenizerAnalyzer(language); ArrayList<Token> wordTokens = tokenizerAnalyzer.getToken(charactersStr); int endPos = 0; for (int i=0; i < wordTokens.size(); i++) { Token wordToken = wordTokens.get(i); int startPos = wordToken.startOffset(); String beforeStr = charactersStr.substring(endPos, startPos); String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); endPos = wordToken.endOffset(); String wordStr = charactersStr.substring(startPos, endPos); MpdlNormalizer mpdlNormalizer = new MpdlNormalizer(normalizeFunctions, language); String normalizedWordStr = mpdlNormalizer.normalize(wordStr); String normalizedWordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(normalizedWordStr); // String wordTokenText = wordToken.termText(); retStr = retStr + beforeStrDeresolved + normalizedWordStrDeresolved; } String lastAfterStr = charactersStr.substring(endPos); String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr); retStr = retStr + lastAfterStrDeresolved; } catch (ApplicationException e) { throw new SAXException(e); } return retStr; } private class Element { String name; String language; String xmlnsString; String attrString; String value; Element(String language, String name) { this.language = language; this.name = name; } } }