Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.Hashtable; import org.xml.sax.*; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; public class XmlTokenizerContentHandler implements ContentHandler { private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); private static int ELEMENT_TYPE_CHARACTERS = 1; private static int ELEMENT_TYPE_COMPLEX = 2; private String[] normalizeFunctions = {}; // default: without normalize functions private String[] nwbElements = {}; // non word breaking elements, default: these elements private String[] stopElements = {}; // default: no stop elements private String[] outputOptions = {}; private String xmlnsString = ""; private String language; private String outputXmlFragment = ""; private Element rootElement; private Element currentElement; private ArrayList<Element> elementQueue; public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { if (normalizeFunctions == null) { String[] emptyFunctions = {}; this.normalizeFunctions = emptyFunctions; } else { this.normalizeFunctions = normalizeFunctions; } this.language = language; } public void setNWBElements(String[] nwbElements) { this.nwbElements = nwbElements; } public void setStopElements(String[] stopElements) { this.stopElements = stopElements; } public void setOutputOptions(String[] outputOptions) { this.outputOptions = outputOptions; } public String getXmlFragment() { return outputXmlFragment; } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { try { String rootElemToStr = rootElement.toXmlString(); write(rootElemToStr); write("\n"); } catch (NullPointerException e) { throw new SAXException(e); } } public void characters(char[] c, int start, int length) throws SAXException { char[] cCopy = new char[length]; System.arraycopy(c, start, cCopy, 0, length); String charactersStr = String.valueOf(cCopy); if (charactersStr != null && ! charactersStr.equals("")) { if (currentElement != null) { Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); charElement.value = StringUtils.deresolveXmlEntities(charactersStr); if (currentElement.composites == null) currentElement.composites = new ArrayList<Element>(); currentElement.composites.add(charElement); } } } public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void startPrefixMapping(String prefix, String uri) throws SAXException { xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; if (prefix != null && prefix.equals("")) xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; } public void endPrefixMapping(String prefix) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { if (elementQueue == null) elementQueue = new ArrayList<Element>(); Element newElement = new Element(name); // element of type: complex if (currentElement != null) { if (currentElement.composites == null) currentElement.composites = new ArrayList<Element>(); if (currentElement.lang != null) newElement.lang = currentElement.lang; // language is inherited to childs currentElement.composites.add(newElement); } currentElement = newElement; int attrSize = attrs.getLength(); String attrString = ""; for (int i=0; i<attrSize; i++) { String attrQName = attrs.getQName(i); String attrValue = attrs.getValue(i); attrValue = StringUtils.forXML(attrValue); attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father } currentElement.attrString = attrString; if (! xmlnsString.equals("")) { currentElement.xmlnsString = xmlnsString; } xmlnsString = ""; elementQueue.add(currentElement); // only the first element is the root element if(rootElement == null) rootElement = currentElement; } public void endElement(String uri, String localName, String name) throws SAXException { if (elementQueue != null && elementQueue.size() > 0) { int lastIndex = elementQueue.size() - 1; elementQueue.remove(lastIndex); } if (elementQueue != null && elementQueue.size() > 0) { int lastIndex = elementQueue.size() - 1; currentElement = elementQueue.get(lastIndex); } else { currentElement = null; } } private boolean withForms() { boolean result = false; for (int i=0; i< outputOptions.length; i++) { String function = outputOptions[i]; if (function.equals("withForms")) return true; } return result; } private boolean withLemmas() { boolean result = false; for (int i=0; i< outputOptions.length; i++) { String function = outputOptions[i]; if (function.equals("withLemmas")) return true; } return result; } private void write(String outStr) throws SAXException { outputXmlFragment += outStr; } private class Element { private int type; private String name; private String xmlnsString; private String attrString; private String value; private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node private ArrayList<Element> composites; private Element(String name) { this.type = ELEMENT_TYPE_COMPLEX; this.name = name; } private Element(String name, int type) { this.type = type; this.name = name; } private boolean isComplex() { boolean isComplex = false; if (type == ELEMENT_TYPE_COMPLEX) isComplex = true; return isComplex; } private boolean isWordDelimiterElement() { boolean isWordDelimiterElement = true; for (int i=0; i<nwbElements.length; i++) { String nwbElementName = nwbElements[i]; if (name.equals(nwbElementName)) { isWordDelimiterElement = false; break; } } return isWordDelimiterElement; } private boolean isStopElement() { boolean isStopElement = false; for (int i=0; i<stopElements.length; i++) { String stopElementName = stopElements[i]; if (name.equals(stopElementName)) { isStopElement = true; break; } } return isStopElement; } private String toXmlString() throws SAXException { String retString = ""; String elemLanguage = language; // default value for the document/page if (lang != null) elemLanguage = lang; // value of the element if available // write this element if (! isComplex()) { retString += value; } else { String xmlNsString = this.xmlnsString; if (xmlNsString == null || xmlNsString.equals("")) { retString = retString + "<" + name + attrString + ">"; } else { retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; } if (composites != null) { String compositesCharsWithMarks = ""; ArrayList<Element> complexElements = new ArrayList<Element>(); for (int i=0; i<composites.size(); i++) { Element composite = composites.get(i); if (! composite.isComplex()) { if (composite.value != null && ! composite.value.equals("")) { String compositeValueStr = composite.value; compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; } } else { if (! composite.isWordDelimiterElement()) { compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) } else { compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) } complexElements.add(composite); } } // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values if (complexElements.size() > 0) { for (int i=0; i<complexElements.size(); i++) { int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); Element complexElem = complexElements.get(i); String complexElementStr = complexElem.toXmlString(); String firstPiece = ""; if (indexComplexElemCompositesCharsWithMarks > 0) { firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); } retString = retString + firstPiece + complexElementStr; compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); } retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added } else { retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added } } retString = retString + "</" + name + ">"; } return retString; } private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); String retStr = ""; try { Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); tokenizer.setLanguage(language); tokenizer.setNormFunctions(normalizeFunctions); ArrayList<Token> tokens = tokenizer.getTokens(); int endPos = 0; for (int i=0; i < tokens.size(); i++) { Token token = tokens.get(i); String wordForm = token.getContent(); int startPos = token.getStart(); String beforeStr = charactersStr.substring(endPos, startPos); endPos = token.getEnd(); String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); String origWordForm = charactersStr.substring(startPos, endPos); String wordTag = insertWordTags(wordForm, language, origWordForm); retStr = retStr + beforeStrDeresolved + wordTag; } String lastAfterStr = charactersStr.substring(endPos); String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); retStr = retStr + lastAfterStrDeresolved; } catch (ApplicationException e) { throw new SAXException(e); } return retStr; } private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { String wordTag = null; if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) return origWordForm; if (isStopElement()) return origWordForm; wordForm = removeSpecialSymbols(wordForm); wordForm = wordForm.toLowerCase(); String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); ArrayList<Lemma> lemmas = null; if (withForms() || withLemmas()) { LexHandler lexHandler = LexHandler.getInstance(); lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); } wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); return wordTag; } /** * * @param origWordToken could contain nwd marks * @param wordForm contains no nwd marks * @param language * @param origWordFormNormalized * @param lemmas * @return for each substring between nwd marks create a word tag */ private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { if (origWordToken.isEmpty()) return origWordToken; if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) return COMPLEX_ELEMENT_NWD_MARK; String retWordTags = ""; String origWordTokenTmp = origWordToken; while (! origWordTokenTmp.isEmpty()) { if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark origWordTokenTmp = origWordTokenTmp.substring(1); retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; } else { int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); if (indexUpToNWD != -1) { // not end of string reached String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); } else { // end of string reached String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); retWordTags = retWordTags + origWordTokenFragmentWithTags; origWordTokenTmp = ""; // finente } } } return retWordTags; } private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { if (origWordForm == null || origWordForm.isEmpty()) return ""; String langISOCode = Language.getInstance().getISO639Code(language); String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; if (origWordFormNormalized != null) retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; if (lemmas != null) { String lemmasStr = ""; String formsStr = ""; Collections.sort(lemmas); Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); for (int i=0; i < lemmas.size(); i++) { Lemma lemma = lemmas.get(i); ArrayList<Form> lemmaForms = lemma.getFormsList(); for (int j=0; j < lemmaForms.size(); j++) { Form form = lemmaForms.get(j); formsHashtable.put(form.getFormName(), form); } String lemmaName = lemma.getLemmaName(); lemmasStr = lemmasStr + lemmaName + " "; } ArrayList<Form> forms = new ArrayList<Form>(); forms.addAll(formsHashtable.values()); Collections.sort(forms); for (int i=0; i < forms.size(); i++) { Form form = forms.get(i); String formName = form.getFormName(); formName = StringUtils.forXML(formName); formsStr = formsStr + formName + " "; } if (formsStr.endsWith(" ")) formsStr = formsStr.substring(0, formsStr.length() - 1); if (lemmasStr.endsWith(" ")) lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); if (withForms()) retStr = retStr + " forms=\"" + formsStr + "\""; if (withLemmas()) retStr = retStr + " lemmas=\"" + lemmasStr + "\""; } retStr = retStr + ">" + origWordForm + "</w>"; return retStr; } private String removeSpecialSymbols(String inputStr) { String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); return retStr; } } }