Mercurial > hg > mpdl-group
view software/eXist/mpdl-modules/src/de/mpg/mpiwg/berlin/mpdl/lt/doc/DictionarizerContentHandler.java @ 0:408254cf2f1d
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 24 Nov 2010 17:24:23 +0100 |
parents | |
children | 2396a569e446 |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.doc; import java.util.ArrayList; import org.apache.lucene.analysis.Token; import org.xml.sax.*; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.analyzer.MpdlTokenizerAnalyzer; import de.mpg.mpiwg.berlin.mpdl.lt.lex.db.LexHandler; import de.mpg.mpiwg.berlin.mpdl.util.StringUtilEscapeChars; public class DictionarizerContentHandler implements ContentHandler { private static String MARK = "COMPLEXELEMENTTTTT"; private static int MARK_SIZE = MARK.length(); private static int ELEMENT_TYPE_CHARACTERS = 1; private static int ELEMENT_TYPE_COMPLEX = 2; private String xmlnsString = ""; private String language; private String outputXmlFragment = ""; private Element rootElement; private Element currentElement; private ArrayList<Element> elementQueue; public DictionarizerContentHandler(String language) throws ApplicationException { this.language = language; } public String getXmlFragment() { return outputXmlFragment; } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { String rootElemToStr = rootElement.toXmlString(); write(rootElemToStr); write("\n"); } public void characters(char[] c, int start, int length) throws SAXException { char[] cCopy = new char[length]; System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! charactersStr.equals("")) { if (currentElement != null) { Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); charElement.value = StringUtilEscapeChars.deresolveXmlEntities(charactersStr); if (currentElement.composites == null) currentElement.composites = new ArrayList<Element>(); currentElement.composites.add(charElement); } } } public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void startPrefixMapping(String prefix, String uri) throws SAXException { xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; } public void endPrefixMapping(String prefix) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { if (elementQueue == null) elementQueue = new ArrayList<Element>(); Element newElement = new Element(name); // element of type: complex if (currentElement != null) { if (currentElement.composites == null) currentElement.composites = new ArrayList<Element>(); if (currentElement.lang != null) newElement.lang = currentElement.lang; // language wird an Kinder vererbt currentElement.composites.add(newElement); } currentElement = newElement; int attrSize = attrs.getLength(); String attrString = ""; for (int i=0; i<attrSize; i++) { String attrQName = attrs.getQName(i); String attrValue = attrs.getValue(i); attrValue = StringUtilEscapeChars.forXML(attrValue); attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) currentElement.lang = attrValue; // wenn xml:lang belegt ist, wird es an das neue Element gesetzt und überschreibt vom Vater geerbte Werte } currentElement.attrString = attrString; if (! xmlnsString.equals("")) { currentElement.xmlnsString = xmlnsString; } xmlnsString = ""; elementQueue.add(currentElement); // only the first element is the root element if(rootElement == null) rootElement = currentElement; } public void endElement(String uri, String localName, String name) throws SAXException { if (elementQueue != null && elementQueue.size() > 0) { int lastIndex = elementQueue.size() - 1; elementQueue.remove(lastIndex); } if (elementQueue != null && elementQueue.size() > 0) { int lastIndex = elementQueue.size() - 1; currentElement = elementQueue.get(lastIndex); } else { currentElement = null; } } public int getCharIndex(String compositesCharsDictionarized, int indexComplexElemCompositesCharsWithMarks) { if (indexComplexElemCompositesCharsWithMarks == 0) return -1; int size = compositesCharsDictionarized.length(); if (size == 0) return -1; int index = 0; int counter = 0; boolean isInTag = false; boolean success = false; while (!success) { if (counter > size) return -1; char c = compositesCharsDictionarized.charAt(counter); if (c == '<') isInTag = true; if (! isInTag) index++; if (index == indexComplexElemCompositesCharsWithMarks) success = true; if (c == '>') isInTag = false; counter++; } return counter + 1; } private void write(String outStr) throws SAXException { outputXmlFragment += outStr; } private class Element { private int type; private String name; private String xmlnsString; private String attrString; private String value; private String lang; // normalerweise mit dem Wert aus dem Attribut xml:lang belegt bzw. mit dem aus dem Vaterknoten wererbten xml:lang-Wert private ArrayList<Element> composites; private Element(String name) { this.type = ELEMENT_TYPE_COMPLEX; this.name = name; } private Element(String name, int type) { this.type = type; this.name = name; } private boolean isComplex() { boolean isComplex = false; if (type == ELEMENT_TYPE_COMPLEX) isComplex = true; return isComplex; } private String toXmlString() throws SAXException { String retString = ""; String elemLanguage = language; // default value for the document/page if (lang != null) elemLanguage = lang; // der Wert des Elements falls vorhanden // write this element if (! isComplex()) { retString += value; } else { String xmlNsString = this.xmlnsString; if (xmlNsString == null || xmlNsString.equals("")) { retString = retString + "<" + name + attrString + ">"; } else { retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; } if (composites != null) { String compositesChars = ""; String compositesCharsWithMarks = ""; ArrayList<Element> complexElements = new ArrayList<Element>(); for (int i=0; i<composites.size(); i++) { Element composite = composites.get(i); if (! composite.isComplex()) { if (composite.value != null && ! composite.value.equals("")) { String compositeValueStr = composite.value; compositesChars += compositeValueStr; compositesCharsWithMarks += compositeValueStr; } } else { complexElements.add(composite); compositesCharsWithMarks += MARK; } } String compositesCharsDictionarized = characters2DictWords(compositesChars, elemLanguage); if (complexElements.size() > 0) { for (int i=0; i<complexElements.size(); i++) { int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarks.indexOf(MARK); int indexComplexElemCompositesCharsDictionarized = getCharIndex(compositesCharsDictionarized, indexComplexElemCompositesCharsWithMarks); Element complexElem = complexElements.get(i); String complexElementStr = complexElem.toXmlString(); String firstPiece = ""; if (indexComplexElemCompositesCharsDictionarized > 0) { firstPiece = compositesCharsDictionarized.substring(0, indexComplexElemCompositesCharsDictionarized - 1); compositesCharsDictionarized = compositesCharsDictionarized.substring(indexComplexElemCompositesCharsDictionarized - 1); } retString = retString + firstPiece + complexElementStr; compositesCharsWithMarks = compositesCharsWithMarks.substring(indexComplexElemCompositesCharsWithMarks + MARK_SIZE); } retString = retString + compositesCharsDictionarized; // last one must also be added } else { retString = retString + compositesCharsDictionarized; // last one must also be added } } retString = retString + "</" + name + ">"; } return retString; } private String characters2DictWords(String charactersStrDeresolved, String language) throws SAXException { String charactersStr = StringUtilEscapeChars.resolveXmlEntities(charactersStrDeresolved); String retStr = ""; try { MpdlTokenizerAnalyzer dictionarizerAnalyzer = new MpdlTokenizerAnalyzer(language); ArrayList<Token> wordTokens = dictionarizerAnalyzer.getToken(charactersStr); int endPos = 0; for (int i=0; i < wordTokens.size(); i++) { Token wordToken = wordTokens.get(i); int startPos = wordToken.startOffset(); String beforeStr = charactersStr.substring(endPos, startPos); endPos = wordToken.endOffset(); String wordStr = charactersStr.substring(startPos, endPos); String beforeStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(beforeStr); String wordStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(wordStr); String wordTokenText = wordToken.termText(); LexHandler lexHandler = LexHandler.getInstance(); // delivers lex entries by help of the morphology component (lex entry of the stem of the normalized word form) ArrayList<String> lexEntryKeys = lexHandler.getLexEntryKeys(wordTokenText, language, false); if (lexEntryKeys != null) { String lexForms = ""; for (int j=0; j<lexEntryKeys.size(); j++) { String lexEntryKey = lexEntryKeys.get(j); lexForms = lexForms + lexEntryKey + " "; } lexForms = lexForms.substring(0, lexForms.length() - 1); retStr = retStr + beforeStrDeresolved + "<w lang=\"" + language + "\"" + " form=\"" + wordTokenText + "\"" + " lexForms=\"" + lexForms + "\">" + wordStrDeresolved + "</w>"; } else { retStr = retStr + beforeStrDeresolved + wordStrDeresolved; } } String lastAfterStr = charactersStr.substring(endPos); String lastAfterStrDeresolved = StringUtilEscapeChars.deresolveXmlEntities(lastAfterStr); retStr = retStr + lastAfterStrDeresolved; } catch (ApplicationException e) { throw new SAXException(e); } return retStr; } } }