Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 19:4a3641ae14d2
Erstellung
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Wed, 09 Nov 2011 15:32:05 +0100 |
parents | |
children | 7d6d969b10cf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java Wed Nov 09 15:32:05 2011 +0100 @@ -0,0 +1,426 @@ +package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Hashtable; + +import org.xml.sax.*; + +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; +import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler; +import de.mpg.mpiwg.berlin.mpdl.lt.general.Language; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form; +import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma; +import de.mpg.mpiwg.berlin.mpdl.util.StringUtils; + +public class XmlTokenizerContentHandler implements ContentHandler { + private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString(); // word delimiting element + private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString(); // not word delimiting element + private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); + private static int ELEMENT_TYPE_CHARACTERS = 1; + private static int ELEMENT_TYPE_COMPLEX = 2; + private String[] normalizeFunctions = {}; // default: without normalize functions + private String[] nwbElements = {}; // non word breaking elements, default: these elements + private String[] stopElements = {}; // default: no stop elements + private String[] outputOptions = {}; + private String xmlnsString = ""; + private String language; + private String outputXmlFragment = ""; + private Element rootElement; + private Element currentElement; + private ArrayList<Element> elementQueue; + + public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { + if (normalizeFunctions == null) { + String[] emptyFunctions = {}; + this.normalizeFunctions = emptyFunctions; + } else { + this.normalizeFunctions = normalizeFunctions; + } + this.language = language; + } + + public void setNWBElements(String[] nwbElements) { + this.nwbElements = nwbElements; + } + + public void setStopElements(String[] stopElements) { + this.stopElements = stopElements; + } + + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + } + + public String getXmlFragment() { + return outputXmlFragment; + } + + public void startDocument() throws SAXException { + } + + public void endDocument() throws SAXException { + try { + String rootElemToStr = rootElement.toXmlString(); + write(rootElemToStr); + write("\n"); + } catch (NullPointerException e) { + throw new SAXException(e); + } + } + + public void characters(char[] c, int start, int length) throws SAXException { + char[] cCopy = new char[length]; + System.arraycopy(c, start, cCopy, 0, length); + String charactersStr = String.valueOf(cCopy); + if (charactersStr != null && ! charactersStr.equals("")) { + if (currentElement != null) { + Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); + charElement.value = StringUtils.deresolveXmlEntities(charactersStr); + if (currentElement.composites == null) + currentElement.composites = new ArrayList<Element>(); + currentElement.composites.add(charElement); + } + } + } + + public void ignorableWhitespace(char[] c, int start, int length) throws SAXException { + } + + public void processingInstruction(String target, String data) throws SAXException { + } + + public void setDocumentLocator(Locator locator) { + } + + public void startPrefixMapping(String prefix, String uri) throws SAXException { + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; + if (prefix != null && prefix.equals("")) + xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; + } + + public void endPrefixMapping(String prefix) throws SAXException { + } + + public void skippedEntity(String name) throws SAXException { + } + + public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException { + if (elementQueue == null) + elementQueue = new ArrayList<Element>(); + Element newElement = new Element(name); // element of type: complex + if (currentElement != null) { + if (currentElement.composites == null) + currentElement.composites = new ArrayList<Element>(); + if (currentElement.lang != null) + newElement.lang = currentElement.lang; // language is inherited to childs + currentElement.composites.add(newElement); + } + currentElement = newElement; + int attrSize = attrs.getLength(); + String attrString = ""; + for (int i=0; i<attrSize; i++) { + String attrQName = attrs.getQName(i); + String attrValue = attrs.getValue(i); + attrValue = StringUtils.forXML(attrValue); + attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; + if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) + currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father + } + currentElement.attrString = attrString; + if (! xmlnsString.equals("")) { + currentElement.xmlnsString = xmlnsString; + } + xmlnsString = ""; + elementQueue.add(currentElement); + // only the first element is the root element + if(rootElement == null) + rootElement = currentElement; + } + + public void endElement(String uri, String localName, String name) throws SAXException { + if (elementQueue != null && elementQueue.size() > 0) { + int lastIndex = elementQueue.size() - 1; + elementQueue.remove(lastIndex); + } + if (elementQueue != null && elementQueue.size() > 0) { + int lastIndex = elementQueue.size() - 1; + currentElement = elementQueue.get(lastIndex); + } else { + currentElement = null; + } + } + + private boolean withForms() { + boolean result = false; + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withForms")) + return true; + } + return result; + } + + private boolean withLemmas() { + boolean result = false; + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withLemmas")) + return true; + } + return result; + } + + private void write(String outStr) throws SAXException { + outputXmlFragment += outStr; + } + + private class Element { + private int type; + private String name; + private String xmlnsString; + private String attrString; + private String value; + private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node + private ArrayList<Element> composites; + + private Element(String name) { + this.type = ELEMENT_TYPE_COMPLEX; + this.name = name; + } + + private Element(String name, int type) { + this.type = type; + this.name = name; + } + + private boolean isComplex() { + boolean isComplex = false; + if (type == ELEMENT_TYPE_COMPLEX) + isComplex = true; + return isComplex; + } + + private boolean isWordDelimiterElement() { + boolean isWordDelimiterElement = true; + for (int i=0; i<nwbElements.length; i++) { + String nwbElementName = nwbElements[i]; + if (name.equals(nwbElementName)) { + isWordDelimiterElement = false; + break; + } + } + return isWordDelimiterElement; + } + + private boolean isStopElement() { + boolean isStopElement = false; + for (int i=0; i<stopElements.length; i++) { + String stopElementName = stopElements[i]; + if (name.equals(stopElementName)) { + isStopElement = true; + break; + } + } + return isStopElement; + } + + private String toXmlString() throws SAXException { + String retString = ""; + String elemLanguage = language; // default value for the document/page + if (lang != null) + elemLanguage = lang; // value of the element if available + // write this element + if (! isComplex()) { + retString += value; + } else { + String xmlNsString = this.xmlnsString; + if (xmlNsString == null || xmlNsString.equals("")) { + retString = retString + "<" + name + attrString + ">"; + } else { + retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; + } + if (composites != null) { + String compositesCharsWithMarks = ""; + ArrayList<Element> complexElements = new ArrayList<Element>(); + for (int i=0; i<composites.size(); i++) { + Element composite = composites.get(i); + if (! composite.isComplex()) { + if (composite.value != null && ! composite.value.equals("")) { + String compositeValueStr = composite.value; + compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. + compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank + compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; + } + } else { + if (! composite.isWordDelimiterElement()) { + compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) + } else { + compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) + } + complexElements.add(composite); + } + } + // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK); // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta") + String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK); // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values + if (complexElements.size() > 0) { + for (int i=0; i<complexElements.size(); i++) { + int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); + Element complexElem = complexElements.get(i); + String complexElementStr = complexElem.toXmlString(); + String firstPiece = ""; + if (indexComplexElemCompositesCharsWithMarks > 0) { + firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); + } + retString = retString + firstPiece + complexElementStr; + compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); + } + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } else { + retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + } + } + retString = retString + "</" + name + ">"; + } + return retString; + } + + private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { + String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); + String retStr = ""; + try { + Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); + tokenizer.setLanguage(language); + tokenizer.setNormFunctions(normalizeFunctions); + ArrayList<Token> tokens = tokenizer.getTokens(); + int endPos = 0; + for (int i=0; i < tokens.size(); i++) { + Token token = tokens.get(i); + String wordForm = token.getContent(); + int startPos = token.getStart(); + String beforeStr = charactersStr.substring(endPos, startPos); + endPos = token.getEnd(); + String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); + String origWordForm = charactersStr.substring(startPos, endPos); + String wordTag = insertWordTags(wordForm, language, origWordForm); + retStr = retStr + beforeStrDeresolved + wordTag; + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); + retStr = retStr + lastAfterStrDeresolved; + } catch (ApplicationException e) { + throw new SAXException(e); + } + return retStr; + } + + private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { + String wordTag = null; + if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) + return origWordForm; + if (isStopElement()) + return origWordForm; + wordForm = removeSpecialSymbols(wordForm); + wordForm = wordForm.toLowerCase(); + String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); + ArrayList<Lemma> lemmas = null; + if (withForms() || withLemmas()) { + LexHandler lexHandler = LexHandler.getInstance(); + lemmas = lexHandler.getLemmas(wordForm, "form", language, "none"); + } + wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); + return wordTag; + } + + /** + * + * @param origWordToken could contain nwd marks + * @param wordForm contains no nwd marks + * @param language + * @param origWordFormNormalized + * @param lemmas + * @return for each substring between nwd marks create a word tag + */ + private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { + if (origWordToken.isEmpty()) + return origWordToken; + if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) + return COMPLEX_ELEMENT_NWD_MARK; + String retWordTags = ""; + String origWordTokenTmp = origWordToken; + while (! origWordTokenTmp.isEmpty()) { + if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark + origWordTokenTmp = origWordTokenTmp.substring(1); + retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; + } else { + int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); + if (indexUpToNWD != -1) { // not end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; + origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); + } else { // end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); + retWordTags = retWordTags + origWordTokenFragmentWithTags; + origWordTokenTmp = ""; // finente + } + } + } + return retWordTags; + } + + private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { + if (origWordForm == null || origWordForm.isEmpty()) + return ""; + String langISOCode = Language.getInstance().getISO639Code(language); + String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; + if (origWordFormNormalized != null) + retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; + if (lemmas != null) { + String lemmasStr = ""; + String formsStr = ""; + Collections.sort(lemmas); + Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); + for (int i=0; i < lemmas.size(); i++) { + Lemma lemma = lemmas.get(i); + ArrayList<Form> lemmaForms = lemma.getFormsList(); + for (int j=0; j < lemmaForms.size(); j++) { + Form form = lemmaForms.get(j); + formsHashtable.put(form.getFormName(), form); + } + String lemmaName = lemma.getLemmaName(); + lemmasStr = lemmasStr + lemmaName + " "; + } + ArrayList<Form> forms = new ArrayList<Form>(); + forms.addAll(formsHashtable.values()); + Collections.sort(forms); + for (int i=0; i < forms.size(); i++) { + Form form = forms.get(i); + String formName = form.getFormName(); + formName = StringUtils.forXML(formName); + formsStr = formsStr + formName + " "; + } + if (formsStr.endsWith(" ")) + formsStr = formsStr.substring(0, formsStr.length() - 1); + if (lemmasStr.endsWith(" ")) + lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); + if (withForms()) + retStr = retStr + " forms=\"" + formsStr + "\""; + if (withLemmas()) + retStr = retStr + " lemmas=\"" + lemmasStr + "\""; + } + retStr = retStr + ">" + origWordForm + "</w>"; + return retStr; + } + + private String removeSpecialSymbols(String inputStr) { + String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); + return retStr; + } + + } +}