Mercurial > hg > mpdl-group
view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;

/**
 * SAX content handler that collects the parsed document into a tree of {@link Element}s and,
 * in {@link #endDocument()}, serializes that tree again while running the text of every
 * element through the {@code Tokenizer}.
 *
 * <p>With output format {@code "xml"} every token is wrapped into a {@code <w>} tag (optionally
 * enriched with forms, lemmas and a dictionary flag, and wrapped into {@code <hi>} for highlight
 * terms). With any other output format only the (transformed) text is emitted and the tokens
 * are collected in {@link #getResultTokens()}.
 *
 * <p>While the text of an element is tokenized, each child element is represented by a one
 * character placeholder mark; after tokenizing, the marks are replaced by the serialized
 * children again.
 */
public class XmlTokenizerContentHandler implements ContentHandler {
  /** Placeholder (U+2425) standing for a child element that delimits words. */
  private static final String COMPLEX_ELEMENT_MARK = String.valueOf('\u2425');
  /** Placeholder (U+2424) standing for a child element that does not delimit words (e.g. lb). */
  private static final String COMPLEX_ELEMENT_NWD_MARK = String.valueOf('\u2424');
  private static final int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
  private static final int ELEMENT_TYPE_CHARACTERS = 1;
  private static final int ELEMENT_TYPE_COMPLEX = 2;
  private static final String[] NO_STRINGS = {};

  private String docId;
  private String language;
  private String[] nwbElements = {}; // non word breaking elements, default: no nwb elements
  private String[] stopElements = {}; // default: no stop elements
  private String outputFormat = "xml"; // default: xml
  private String[] outputOptions = {};
  private boolean withForms = false;
  private boolean withLemmas = false;
  private String[] highlightTerms = {}; // highlight terms, default: no highlight terms
  private String[] normFunctions = {}; // default: no norm function
  private boolean useNormFunction = false;
  private boolean useRegFunction = false;
  private String xmlnsString = ""; // namespace declarations collected for the next start tag
  private StringBuilder result = new StringBuilder();
  private ArrayList<Token> resultTokens = new ArrayList<Token>();
  private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>();
  private Element rootElement;
  private Element currentElement;
  private int currentPosition = 0;
  private int currentPageNumber = 0;
  private int currentLineNumber = 0;
  private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>();
  private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>();
  private ArrayList<Element> elementQueue; // stack of the currently open elements

  /**
   * @param language default language of the document; used for every element that has no
   *                 own or inherited {@code lang}/{@code xml:lang} attribute
   */
  public XmlTokenizerContentHandler(String language) throws ApplicationException {
    this.language = language;
  }

  /** Sets the document identifier that is stored on every produced token. */
  public void setDocIdentifier(String docId) {
    this.docId = docId;
  }

  /**
   * Sets the names of the elements that do not break a word.
   *
   * @param nwbElements element names; {@code null} is treated as "none"
   */
  public void setNWBElements(String[] nwbElements) {
    this.nwbElements = nullToEmpty(nwbElements);
  }

  /**
   * Sets the names of the stop elements (their words are not wrapped into word tags).
   *
   * @param stopElements element names; {@code null} is treated as "none"
   */
  public void setStopElements(String[] stopElements) {
    this.stopElements = nullToEmpty(stopElements);
  }

  /**
   * Sets the terms that are wrapped into {@code <hi>} in xml output.
   *
   * @param highlightTerms terms, compared case insensitively; {@code null} is treated as "none"
   */
  public void setHighlightTerms(String[] highlightTerms) {
    this.highlightTerms = nullToEmpty(highlightTerms);
  }

  /**
   * Sets the norm functions. {@code "norm"} and {@code "reg"} select which token content is
   * compared against the highlight terms. Flags once switched on are not switched off again.
   *
   * @param normFunctions function names; {@code null} is treated as "none"
   */
  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = nullToEmpty(normFunctions);
    for (int i = 0; i < this.normFunctions.length; i++) {
      String function = this.normFunctions[i];
      if ("norm".equals(function)) {
        this.useNormFunction = true;
      } else if ("reg".equals(function)) {
        this.useRegFunction = true;
      }
    }
  }

  /** Sets the output format; {@code "xml"} produces tagged xml, everything else plain text. */
  public void setOutputFormat(String outputFormat) {
    this.outputFormat = outputFormat;
  }

  /**
   * Sets the output options. {@code "withForms"} and {@code "withLemmas"} switch on the
   * corresponding lexicon lookups. Flags once switched on are not switched off again.
   *
   * @param outputOptions option names; {@code null} is treated as "none"
   */
  public void setOutputOptions(String[] outputOptions) {
    this.outputOptions = nullToEmpty(outputOptions);
    for (int i = 0; i < this.outputOptions.length; i++) {
      String option = this.outputOptions[i];
      if ("withForms".equals(option)) {
        this.withForms = true;
      } else if ("withLemmas".equals(option)) {
        this.withLemmas = true;
      }
    }
  }

  /** @return everything written so far; filled by {@link #endDocument()} */
  public String getResultString() {
    return result.toString();
  }

  /** @return the tokens collected while building the result (the live list, not a copy) */
  public ArrayList<Token> getResultTokens() {
    return resultTokens;
  }

  /**
   * @return the elements with the given name that were serialized by {@link #endDocument()},
   *         or {@code null} if there are none
   */
  public ArrayList<Element> getElements(String elementName) {
    return elements.get(elementName);
  }

  /** @return the number of {@code pb} elements seen so far */
  public int getPageCount() {
    return currentPageNumber;
  }

  public void startDocument() throws SAXException {
  }

  /**
   * Serializes the collected element tree (including tokenizing) into the result string.
   *
   * @throws SAXException if no element was parsed or if building the result fails
   */
  public void endDocument() throws SAXException {
    if (rootElement == null) {
      throw new SAXException("No root element found: nothing to tokenize");
    }
    try {
      write(rootElement.buildString());
      write("\n");
    } catch (NullPointerException e) {
      // kept so that callers still only have to deal with SAXException
      throw new SAXException(e);
    } catch (ApplicationException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Stores the reported characters (with xml entities deresolved) as a character child of the
   * currently open element. Characters outside of any element are ignored.
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length <= 0 || currentElement == null) {
      return;
    }
    Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
    charElement.pageNumber = currentPageNumber;
    charElement.value = StringUtils.deresolveXmlEntities(new String(c, start, length));
    if (currentElement.composites == null) {
      currentElement.composites = new ArrayList<Element>();
    }
    currentElement.composites.add(charElement);
  }

  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  public void processingInstruction(String target, String data) throws SAXException {
  }

  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Remembers the namespace declaration; it is attached to the next element that is started.
   * A {@code null} or empty prefix declares the default namespace.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (prefix == null || prefix.equals("")) {
      xmlnsString += "xmlns" + "=\"" + uri + "\" ";
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  public void endPrefixMapping(String prefix) throws SAXException {
  }

  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Creates a new complex element, links it to the currently open element and computes its
   * page/line numbers, positions, xpath and attribute string.
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    if (elementQueue == null) {
      elementQueue = new ArrayList<Element>();
    }
    Element newElement = new Element(name); // element of type: complex
    if (currentElement != null) {
      if (currentElement.composites == null) {
        currentElement.composites = new ArrayList<Element>();
      }
      if (currentElement.lang != null) {
        newElement.lang = currentElement.lang; // language is inherited by the children
      }
      currentElement.composites.add(newElement);
      newElement.parent = currentElement;
    }
    currentElement = newElement;

    if ("pb".equals(localName)) {
      currentPageNumber++;
      setCurrentPagePosition(localName, 0); // a new page starts: all page positions start again
    }
    currentElement.pageNumber = currentPageNumber;
    if ("lb".equals(localName)) {
      currentLineNumber++;
    }
    currentElement.lineNumber = currentLineNumber;
    currentPosition++;
    currentElement.docPosition = currentPosition;
    currentElement.position = incrementCurrentPosition(localName);
    currentElement.elemPosition = getElementPosition(currentElement);

    String xpathStep = "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
    Element parent = currentElement.parent;
    currentElement.xpath = (parent == null) ? xpathStep : parent.xpath + xpathStep;
    currentElement.pagePosition = incrementCurrentPagePosition(localName);

    StringBuilder attrBuilder = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i = 0; i < attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = StringUtils.forXML(attrs.getValue(i));
      attrBuilder.append(" ").append(attrQName).append("=\"").append(attrValue).append("\"");
      // equalsIgnoreCase: null safe and independent of the default locale
      if ("xml:lang".equalsIgnoreCase(attrQName) || "lang".equalsIgnoreCase(attrQName)) {
        currentElement.lang = attrValue; // an own language overwrites the inherited one
      }
      if ("xml:id".equalsIgnoreCase(attrQName) || "id".equalsIgnoreCase(attrQName)) {
        currentElement.xmlId = attrValue;
      }
    }
    currentElement.attrString = attrBuilder.toString();

    if (! xmlnsString.equals("")) {
      currentElement.xmlnsString = xmlnsString;
    }
    xmlnsString = "";
    elementQueue.add(currentElement);
    if (rootElement == null) {
      rootElement = currentElement; // only the first element is the root element
    }
  }

  /** Closes the currently open element; its parent (if any) becomes the current element. */
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (elementQueue != null && ! elementQueue.isEmpty()) {
      elementQueue.remove(elementQueue.size() - 1);
    }
    if (elementQueue != null && ! elementQueue.isEmpty()) {
      currentElement = elementQueue.get(elementQueue.size() - 1);
    } else {
      currentElement = null;
    }
  }

  /** @return the given array, or an empty array if it is {@code null} */
  private static String[] nullToEmpty(String[] values) {
    return (values == null) ? NO_STRINGS : values;
  }

  /** @return true if the output format is "xml" */
  private boolean isXmlOutput() {
    return "xml".equals(outputFormat);
  }

  /** Increments the document wide counter of the given element name and returns the new value. */
  private int incrementCurrentPosition(String elemName) {
    Integer previous = currentPositions.get(elemName);
    int next = (previous == null) ? 1 : previous.intValue() + 1;
    currentPositions.put(elemName, Integer.valueOf(next));
    return next;
  }

  /**
   * @return 1 based position of the element within the complex siblings of the same name
   *         (the index that is used in the xpath of the element)
   */
  private int getElementPosition(Element elem) {
    Element parent = elem.parent;
    if (parent == null || parent.composites == null) {
      return 1;
    }
    int pos = 0;
    for (Element sibling : parent.composites) {
      if (sibling.isComplex() && sibling.name.equals(elem.name)) {
        pos++;
      }
      if (sibling == elem) {
        break;
      }
    }
    return pos;
  }

  /** Increments the page wide counter of the given element name and returns the new value. */
  private int incrementCurrentPagePosition(String elemName) {
    Integer previous = currentPagePositions.get(elemName);
    int next = (previous == null) ? 1 : previous.intValue() + 1;
    currentPagePositions.put(elemName, Integer.valueOf(next));
    return next;
  }

  /**
   * Sets the page position counters of ALL element names seen so far to the given value.
   *
   * @param elemName not evaluated
   * @param pos      the new counter value
   */
  private void setCurrentPagePosition(String elemName, int pos) {
    Integer newPagePosition = Integer.valueOf(pos);
    Enumeration<String> elemKeys = currentPagePositions.keys();
    while (elemKeys.hasMoreElements()) {
      // only existing keys are overwritten, so the table is not restructured while enumerating
      currentPagePositions.put(elemKeys.nextElement(), newPagePosition);
    }
  }

  /** @return true if the term equals (ignoring case) one of the highlight terms */
  private boolean isHighlightTerm(String term) {
    if (term == null) {
      return false;
    }
    String termLowerCase = term.toLowerCase();
    for (int i = 0; i < highlightTerms.length; i++) {
      if (highlightTerms[i].toLowerCase().equals(termLowerCase)) {
        return true;
      }
    }
    return false;
  }

  /** @return true if at least one of the terms equals (ignoring case) one of the highlight terms */
  private boolean isHighlightTerm(String[] terms) {
    if (terms == null) {
      return false;
    }
    for (int i = 0; i < terms.length; i++) {
      if (isHighlightTerm(terms[i])) {
        return true;
      }
    }
    return false;
  }

  private void write(String outStr) throws SAXException {
    result.append(outStr);
  }

  /**
   * Node of the collected document tree: either a complex element (an xml element with
   * attributes and children) or a character element (a piece of text in {@code value}).
   */
  public class Element implements Comparable<Element> {
    private int type;
    public String name;
    private String xmlnsString;
    private String attrString;
    private String value;
    public String lang; // value of attribute xml:lang or the inherited xml:lang value of the father node
    public String xmlId;
    public String xpath;
    public int pageNumber;
    public int lineNumber;
    public int docPosition; // absolute position in document
    public int position; // position within all elements with this name
    public int elemPosition; // position in element e.g. the 6 sentence in paragraph
    public int pagePosition; // position in page
    private ArrayList<Token> tokens = new ArrayList<Token>();
    private ArrayList<Element> composites;
    private Element parent;
    private boolean isStopElement = false;
    private boolean isWordDelimiterElement = true; // default: is word delimiter element

    private Element(String name) {
      this.type = ELEMENT_TYPE_COMPLEX;
      setName(name);
    }

    private Element(String name, int type) {
      this.type = type;
      setName(name);
    }

    /** Sets the name and derives from it whether this is a stop / a word delimiting element. */
    private void setName(String name) {
      this.name = name;
      for (int i = 0; i < stopElements.length; i++) {
        if (name.equals(stopElements[i])) {
          this.isStopElement = true;
          break;
        }
      }
      for (int i = 0; i < nwbElements.length; i++) {
        if (name.equals(nwbElements[i])) {
          this.isWordDelimiterElement = false;
          break;
        }
      }
    }

    /** Orders elements by {@link #position}. */
    public int compareTo(Element elem) {
      if (position < elem.position) {
        return -1;
      }
      return (position == elem.position) ? 0 : 1;
    }

    private boolean isComplex() {
      return type == ELEMENT_TYPE_COMPLEX;
    }

    /**
     * @return a new list with the tokens of the direct children followed by the own tokens
     *         (tokens of deeper descendants are not included)
     */
    public ArrayList<Token> getTokens() {
      ArrayList<Token> retTokens = new ArrayList<Token>();
      if (isComplex() && composites != null) {
        for (Element child : composites) {
          if (child.tokens != null) {
            retTokens.addAll(child.tokens);
          }
        }
      }
      if (tokens != null) {
        retTokens.addAll(tokens);
      }
      return retTokens;
    }

    /**
     * @param type one of "orig", "reg", "norm" or "morph"
     * @return the selected content of all tokens of {@link #getTokens()}, each followed by a blank
     */
    public String getTokensStr(String type) {
      return getTokensStr(type, getTokens());
    }

    private String getTokensStr(String type, ArrayList<Token> tokens) {
      StringBuilder tokenStr = new StringBuilder();
      for (Token token : tokens) {
        String content = null;
        if (type.equals("orig")) {
          content = token.getContentOrig();
        } else if (type.equals("reg")) {
          content = token.getContentReg();
        } else if (type.equals("norm")) {
          content = token.getContentNorm();
        } else if (type.equals("morph")) {
          content = token.getContentMorph();
        }
        if (content != null) {
          tokenStr.append(content).append(" ");
        }
      }
      return tokenStr.toString();
    }

    /** @return the start tag of this element including namespace declarations and attributes */
    private String startTag() {
      if (xmlnsString == null || xmlnsString.equals("")) {
        return "<" + name + attrString + ">";
      }
      return "<" + name + " " + xmlnsString + attrString + ">";
    }

    /**
     * @return this element and its descendants as xml, without any tokenizing; newlines, tabs
     *         and blanks of the text are reduced by {@code StringUtils.removeNlTabBlanks}
     */
    public String toXmlString() throws ApplicationException {
      StringBuilder retStrBuilder = new StringBuilder();
      if (! isComplex()) {
        retStrBuilder.append(value);
        return retStrBuilder.toString();
      }
      retStrBuilder.append(startTag());
      if (composites != null) {
        for (Element composite : composites) {
          if (composite.isComplex()) {
            retStrBuilder.append(composite.toXmlString());
          } else if (composite.value != null && ! composite.value.equals("")) {
            // newlines are no separators for words; many blanks/tabs are reduced to one blank
            retStrBuilder.append(StringUtils.removeNlTabBlanks(composite.value));
          }
        }
      }
      retStrBuilder.append("</" + name + ">");
      return retStrBuilder.toString();
    }

    /**
     * Serializes this element and its descendants with tokenized text and registers the
     * element in the name table of the enclosing handler.
     *
     * <p>The text of the direct children is concatenated, each child element being represented
     * by a placeholder mark, then tokenized as a whole, and finally the marks are replaced by
     * the serialized children again.
     */
    private String buildString() throws ApplicationException {
      StringBuilder retStrBuilder = new StringBuilder();
      String elemLanguage = (lang != null) ? lang : language; // own language or document default
      if (! isComplex()) {
        retStrBuilder.append(value);
        return retStrBuilder.toString();
      }
      boolean xmlOutput = isXmlOutput();
      if (xmlOutput) {
        retStrBuilder.append(startTag());
      }
      if (composites != null) {
        StringBuilder charsWithMarks = new StringBuilder();
        ArrayList<Element> complexElements = new ArrayList<Element>();
        for (Element composite : composites) {
          if (! composite.isComplex()) {
            if (composite.value != null && ! composite.value.equals("")) {
              // newlines are no separators for words; many blanks/tabs are reduced to one blank
              charsWithMarks.append(StringUtils.removeNlTabBlanks(composite.value));
            }
          } else {
            // mark the position of the child element (e.g. lb: not word delimiting, var: word delimiting)
            charsWithMarks.append(composite.isWordDelimiterElement ? COMPLEX_ELEMENT_MARK : COMPLEX_ELEMENT_NWD_MARK);
            complexElements.add(composite);
          }
        }
        String taggedText = insertWordTags(charsWithMarks, elemLanguage);
        // both kinds of marks are unified so that they can be replaced in document order
        taggedText = taggedText.replace(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK);
        int cursor = 0;
        for (Element complexElem : complexElements) {
          String complexElementStr = complexElem.buildString();
          int markIndex = taggedText.indexOf(COMPLEX_ELEMENT_MARK, cursor);
          if (markIndex < 0) {
            // the mark got lost while tokenizing: still emit the child, but do not consume text
            retStrBuilder.append(complexElementStr);
            continue;
          }
          retStrBuilder.append(taggedText, cursor, markIndex);
          retStrBuilder.append(complexElementStr);
          cursor = markIndex + COMPLEX_ELEMENT_MARK_SIZE;
        }
        retStrBuilder.append(taggedText, cursor, taggedText.length()); // text after the last child
      }
      if (xmlOutput) {
        retStrBuilder.append("</" + name + ">");
      }
      // put element into elements name hashtable
      ArrayList<Element> elems = elements.get(name);
      if (elems == null) {
        elems = new ArrayList<Element>();
        elements.put(name, elems);
      }
      elems.add(this);
      return retStrBuilder.toString();
    }

    /**
     * Tokenizes the given text and replaces every token by the result of
     * {@link #insertWordTags(Token, String, String)}. The text between the tokens is kept in
     * xml output and replaced by blanks (marks are kept) otherwise.
     *
     * @param charactersStrDeresolvedBuilder text with deresolved xml entities and placeholder marks
     * @param language language used for tokenizing
     */
    private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException {
      String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolvedBuilder.toString());
      boolean xmlOutput = isXmlOutput();
      Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr));
      tokenizer.setLanguage(language);
      String[] normFunction = {"norm"};
      tokenizer.setNormFunctions(normFunction);
      ArrayList<Token> tokens = tokenizer.getTokens();
      StringBuilder retStrBuilder = new StringBuilder();
      int endPos = 0;
      for (Token token : tokens) {
        int startPos = token.getStart();
        String beforeStr = StringUtils.deresolveXmlEntities(charactersStr.substring(endPos, startPos));
        endPos = token.getEnd();
        String origWordForm = charactersStr.substring(startPos, endPos);
        String wordTag = insertWordTags(token, language, origWordForm);
        retStrBuilder.append(xmlOutput ? beforeStr : toBlanks(beforeStr));
        retStrBuilder.append(wordTag);
      }
      String lastAfterStr = StringUtils.deresolveXmlEntities(charactersStr.substring(endPos));
      retStrBuilder.append(xmlOutput ? lastAfterStr : toBlanks(lastAfterStr));
      return retStrBuilder.toString();
    }

    /**
     * Fills the token with the data of this element and returns the output for the token.
     *
     * @param token the token delivered by the tokenizer
     * @param language language of the element; if {@code null} the word is returned unchanged
     * @param origWordForm the characters of the token in the element text (could contain nwd marks)
     * @return xml output: the word tag (stop elements: the unchanged word);
     *         other output: the word without special symbols or its lemmas (stop elements: blanks)
     */
    private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException {
      if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) {
        return origWordForm;
      }
      token.setDocId(docId);
      // NOTE(review): the own/inherited lang of the element is used here (could be null), not the language parameter - confirm
      token.setLanguage(lang);
      token.setPageNumber(pageNumber);
      token.setLineNumber(lineNumber);
      token.setElementPosition(position);
      token.setElementPagePosition(pagePosition);
      token.setElementName(name);
      token.setXmlId(xmlId);
      token.setXpath("xpath"); // TODO
      if (name != null && name.equals("reg")) {
        applyNormAttribute(token, language);
      }
      if (language == null) {
        token.setContentOrig(origWordForm); // TODO necessary ?
        tokens.add(token);
        resultTokens.add(token);
        return origWordForm;
      }
      boolean xmlOutput = isXmlOutput();
      if (isStopElement && xmlOutput) {
        return origWordForm;
      }
      if (isStopElement && "string".equals(outputFormat)) {
        return toBlanks(origWordForm);
      }

      String wordFormNorm = token.getContentNorm();
      String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm);
      ArrayList<Lemma> lemmas = null;
      Boolean hasDictionaryEntries = null;
      String lemmasStr = "";
      if (withForms || withLemmas) {
        LexHandler lexHandler = LexHandler.getInstance();
        lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false); // Performance: needs 15 % of the indexing time
        StringBuilder lemmasStrBuilder = new StringBuilder();
        if (lemmas != null) {
          for (Lemma lemma : lemmas) {
            lemmasStrBuilder.append(lemma.getLemmaName()).append(" ");
          }
        }
        lemmasStr = lemmasStrBuilder.toString().trim();
        token.setContentMorph(lemmasStr);
        ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY); // Performance: needs 15 % of the indexing time
        hasDictionaryEntries = Boolean.valueOf(lexEntries != null);
      }

      if (xmlOutput) {
        String wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDictionaryEntries); // Performance: needs 10 % of the indexing time
        String tokenWordForm = token.getContentOrig(); // word form is in contentOrig
        if (useRegFunction) {
          tokenWordForm = token.getContentReg();
        } else if (useNormFunction) {
          tokenWordForm = token.getContentNorm();
        } else if (withLemmas) {
          tokenWordForm = token.getContentMorph();
        }
        boolean highlight = false;
        if (highlightTerms.length > 0 && ! withLemmas) {
          highlight = isHighlightTerm(tokenWordForm);
        } else if (highlightTerms.length > 0 && lemmas != null) {
          highlight = isHighlightTerm(lemmasStr.split(" "));
        }
        return highlight ? "<hi>" + wordTag + "</hi>" : wordTag;
      }

      // outputFormat == string
      String wordTag;
      if (withLemmas && lemmas != null) {
        // rescue the nwd marks of the original word and put them in front of the lemmas
        wordTag = toBlanks(origWordFormDeresolved) + lemmasStr;
        token.setContentMorph(lemmasStr);
      } else {
        wordTag = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab
      }
      tokens.add(token);
      resultTokens.add(token);
      return wordTag;
    }

    /**
     * Takes the value of the {@code norm} attribute of this (reg) element as regularized content
     * of the token and its normalization as normalized content. Does nothing if the attribute
     * is missing. An empty attribute value is handled as empty regularization.
     */
    private void applyNormAttribute(Token token, String language) throws ApplicationException {
      if (attrString == null) {
        return;
      }
      String normAttrStart = "norm=\"";
      int normIndex = attrString.indexOf(normAttrStart);
      if (normIndex < 0) {
        return;
      }
      int valueBegin = normIndex + normAttrStart.length();
      // search from the first value character so that an empty value is found correctly
      int valueEnd = attrString.indexOf("\"", valueBegin);
      if (valueEnd < 0) {
        return;
      }
      String reg = attrString.substring(valueBegin, valueEnd);
      token.setContentReg(reg);
      String[] normFunction = {"norm"};
      Normalizer normalizer = new Normalizer(normFunction, language);
      token.setContentNorm(normalizer.normalize(reg));
    }

    /** @return the input without blank, newline, tab, minus and soft hyphen */
    private String removeSpecialSymbols(String inputStr) {
      return inputStr.replaceAll(" |\n|\t|-|\u00AD", "");
    }

    /**
     * @param origWordToken could contain nwd marks
     * @param token the token of the word
     * @param language language of the word
     * @param lemmas lemmas of the word or {@code null}
     * @param hasDictionaryEntries dictionary flag or {@code null} if it was not determined
     * @return xml output: one word tag around the whole word (a single nwd mark and an empty word
     *         are returned unchanged); other output: the empty string
     */
    private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
      if (origWordToken.isEmpty()) {
        return origWordToken;
      }
      if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) {
        return COMPLEX_ELEMENT_NWD_MARK;
      }
      if (! isXmlOutput()) {
        return "";
      }
      // the word is deliberately not split at nwd marks: one tag surrounds the whole word
      return getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries);
    }

    /**
     * Builds the {@code <w>} tag of a word with the attributes lang, form and, if available,
     * formRegularized, formNormalized, forms, lemmas and dictionary.
     * The given lemma list is sorted in place.
     *
     * @return the word tag, or the empty string if the word is {@code null} or empty
     */
    private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
      if (origWordForm == null || origWordForm.isEmpty()) {
        return "";
      }
      String wordForm = token.getContentOrig(); // word form is in contentOrig
      String regularizedWordForm = token.getContentReg();
      String normalizedWordForm = token.getContentNorm();
      String langISOCode = Language.getInstance().getISO639Code(language);
      StringBuilder retStrBuilder = new StringBuilder();
      // NOTE(review): only the form names below are escaped by forXML; word forms and lemma names are written as they are - confirm
      retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\"");
      if (regularizedWordForm != null) {
        retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\"");
      }
      if (normalizedWordForm != null) {
        retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\"");
      }
      if (lemmas != null) {
        Collections.sort(lemmas);
        StringBuilder lemmasStrBuilder = new StringBuilder();
        Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); // one form per form name
        for (Lemma lemma : lemmas) {
          for (Form form : lemma.getFormsList()) {
            formsHashtable.put(form.getFormName(), form);
          }
          lemmasStrBuilder.append(lemma.getLemmaName()).append(" ");
        }
        ArrayList<Form> forms = new ArrayList<Form>(formsHashtable.values());
        Collections.sort(forms);
        StringBuilder formsStrBuilder = new StringBuilder();
        for (Form form : forms) {
          formsStrBuilder.append(StringUtils.forXML(form.getFormName())).append(" ");
        }
        String formsStr = formsStrBuilder.toString();
        if (formsStr.endsWith(" ")) {
          formsStr = formsStr.substring(0, formsStr.length() - 1);
        }
        String lemmasStr = lemmasStrBuilder.toString();
        if (lemmasStr.endsWith(" ")) {
          lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1);
        }
        if (withForms) {
          retStrBuilder.append(" forms=\"" + formsStr + "\"");
        }
        if (withLemmas) {
          retStrBuilder.append(" lemmas=\"" + lemmasStr + "\"");
        }
      }
      if (hasDictionaryEntries != null) {
        retStrBuilder.append(" dictionary=\"" + (hasDictionaryEntries.booleanValue() ? "true" : "false") + "\"");
      }
      retStrBuilder.append(">");
      retStrBuilder.append(origWordForm); // could contain nwd marks (transformed back to elements in buildString)
      retStrBuilder.append("</w>");
      return retStrBuilder.toString();
    }

    /** @return the input with every character but the placeholder marks replaced by a blank */
    private String toBlanks(String inputStr) {
      char nwdMark = COMPLEX_ELEMENT_NWD_MARK.charAt(0);
      char mark = COMPLEX_ELEMENT_MARK.charAt(0);
      int size = inputStr.length();
      StringBuilder retStrBuilder = new StringBuilder(size);
      for (int j = 0; j < size; j++) {
        char c = inputStr.charAt(j);
        retStrBuilder.append((c == nwdMark || c == mark) ? c : ' ');
      }
      return retStrBuilder.toString();
    }
  }
}