Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | 7d6d969b10cf |
children |
line wrap: on
line diff
--- a/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java Wed Dec 14 13:57:09 2011 +0100 +++ b/software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java Tue Nov 27 12:35:19 2012 +0100 @@ -3,6 +3,7 @@ import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; +import java.util.Enumeration; import java.util.Hashtable; import org.xml.sax.*; @@ -21,27 +22,39 @@ private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length(); private static int ELEMENT_TYPE_CHARACTERS = 1; private static int ELEMENT_TYPE_COMPLEX = 2; - private String[] normalizeFunctions = {}; // default: without normalize functions - private String[] nwbElements = {}; // non word breaking elements, default: these elements + private String docId; + private String language; + private String[] nwbElements = {}; // non word breaking elements, default: no nwb elements private String[] stopElements = {}; // default: no stop elements + private String outputFormat = "xml"; // default: xml private String[] outputOptions = {}; + private boolean withForms = false; + private boolean withLemmas = false; + private String[] highlightTerms = {}; // highlight terms, default: no highlight terms + private String[] normFunctions = {}; // default: no norm function + private boolean useNormFunction = false; + private boolean useRegFunction = false; private String xmlnsString = ""; - private String language; - private String outputXmlFragment = ""; + private StringBuilder result = new StringBuilder(); + private ArrayList<Token> resultTokens = new ArrayList<Token>(); + private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>(); private Element rootElement; private Element currentElement; + private int currentPosition = 0; + private int currentPageNumber = 0; + private int currentLineNumber = 0; + private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>(); + private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>(); private ArrayList<Element> elementQueue; - public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException { - if (normalizeFunctions == null) { - String[] emptyFunctions = {}; - this.normalizeFunctions = emptyFunctions; - } else { - this.normalizeFunctions = normalizeFunctions; - } + public XmlTokenizerContentHandler(String language) throws ApplicationException { this.language = language; } + public void setDocIdentifier(String docId) { + this.docId = docId; + } + public void setNWBElements(String[] nwbElements) { this.nwbElements = nwbElements; } @@ -50,24 +63,66 @@ this.stopElements = stopElements; } - public void setOutputOptions(String[] outputOptions) { - this.outputOptions = outputOptions; + public void setHighlightTerms(String[] highlightTerms) { + this.highlightTerms = highlightTerms; + } + + public void setNormFunctions(String[] normFunctions) { + this.normFunctions = normFunctions; + if (this.normFunctions != null) { + for (int i=0; i< this.normFunctions.length; i++) { + String function = normFunctions[i]; + if (function.equals("norm")) + this.useNormFunction = true; + else if (function.equals("reg")) + this.useRegFunction = true; + } + } + } + + public void setOutputFormat(String outputFormat) { + this.outputFormat = outputFormat; } - public String getXmlFragment() { - return outputXmlFragment; + public void setOutputOptions(String[] outputOptions) { + this.outputOptions = outputOptions; + for (int i=0; i< this.outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withForms")) + this.withForms = true; + else if (function.equals("withLemmas")) + this.withLemmas = true; + } + } + + public String getResultString() { + return result.toString(); + } + + public ArrayList<Token> getResultTokens() { + return resultTokens; } + public ArrayList<Element> getElements(String elementName) { + return elements.get(elementName); + } + + public int getPageCount() { + return currentPageNumber; + } + public void startDocument() throws SAXException { } public void endDocument() throws SAXException { try { - String rootElemToStr = rootElement.toXmlString(); + String rootElemToStr = rootElement.buildString(); write(rootElemToStr); write("\n"); } catch (NullPointerException e) { throw new SAXException(e); + } catch (ApplicationException e) { + throw new SAXException(e); } } @@ -78,6 +133,7 @@ if (charactersStr != null && ! charactersStr.equals("")) { if (currentElement != null) { Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS); + charElement.pageNumber = currentPageNumber; charElement.value = StringUtils.deresolveXmlEntities(charactersStr); if (currentElement.composites == null) currentElement.composites = new ArrayList<Element>(); @@ -96,9 +152,10 @@ } public void startPrefixMapping(String prefix, String uri) throws SAXException { - xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; if (prefix != null && prefix.equals("")) - xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" "; + xmlnsString += "xmlns" + "=\"" + uri + "\" "; + else + xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" "; } public void endPrefixMapping(String prefix) throws SAXException { @@ -117,8 +174,32 @@ if (currentElement.lang != null) newElement.lang = currentElement.lang; // language is inherited to childs currentElement.composites.add(newElement); + newElement.parent = currentElement; } currentElement = newElement; + if (localName != null && localName.equals("pb")) { + currentPageNumber++; + setCurrentPagePosition(localName, 0); + } + currentElement.pageNumber = currentPageNumber; + if (localName != null && localName.equals("lb")) { + currentLineNumber++; + } + currentElement.lineNumber = currentLineNumber; + currentPosition++; + currentElement.docPosition = currentPosition; + int newElemPosition = incrementCurrentPosition(localName); + currentElement.position = newElemPosition; + + currentElement.elemPosition = getElementPosition(currentElement); + Element parent = currentElement.parent; + if (parent == null) { + currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; + } else { + currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]"; + } + int newElemPagePosition = incrementCurrentPagePosition(localName); + currentElement.pagePosition = newElemPagePosition; int attrSize = attrs.getLength(); String attrString = ""; for (int i=0; i<attrSize; i++) { @@ -126,8 +207,12 @@ String attrValue = attrs.getValue(i); attrValue = StringUtils.forXML(attrValue); attrString = attrString + " " + attrQName + "=\"" + attrValue + "\""; - if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) + if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) { currentElement.lang = attrValue; // if xml:lang is set, it is set to the new element and overwrites values inherited by the father + } + if (attrQName != null && (attrQName.toLowerCase().equals("xml:id") || attrQName.toLowerCase().equals("id"))) { + currentElement.xmlId = attrValue; + } } currentElement.attrString = attrString; if (! xmlnsString.equals("")) { @@ -153,112 +238,255 @@ } } - private boolean withForms() { + private int incrementCurrentPosition(String elemName) { + Integer currentElemPos = currentPositions.get(elemName); + if (currentElemPos == null) { + currentElemPos = new Integer(0); + } + currentElemPos++; + currentPositions.put(elemName, currentElemPos); + return currentElemPos.intValue(); + } + + private int getElementPosition(Element elem) { + int pos = 0; + Element parent = elem.parent; + if (parent == null) { + pos = 1; + } else { + pos = 0; + ArrayList<Element> composites = parent.composites; + if (composites != null) { + for (int i=0; i<composites.size(); i++) { + Element e = composites.get(i); + if (e.isComplex() && e.name.equals(elem.name)) { + pos++; + } + if (e == elem) + break; + } + } else { + pos = 1; + } + } + return pos; + } + + private int incrementCurrentPagePosition(String elemName) { + Integer currentElemPagePos = currentPagePositions.get(elemName); + if (currentElemPagePos == null) { + currentElemPagePos = new Integer(0); + } + currentElemPagePos++; + currentPagePositions.put(elemName, currentElemPagePos); + return currentElemPagePos.intValue(); + } + + private void setCurrentPagePosition(String elemName, int pos) { + Integer newPagePosition = new Integer(pos); + Enumeration<String> elemKeys = currentPagePositions.keys(); + while (elemKeys.hasMoreElements()) { + String elemKey = elemKeys.nextElement(); + currentPagePositions.put(elemKey, newPagePosition); + } + } + + private boolean isHighlightTerm(String term) { + if (term == null) + return false; boolean result = false; - for (int i=0; i< outputOptions.length; i++) { - String function = outputOptions[i]; - if (function.equals("withForms")) + for (int i=0; i< highlightTerms.length; i++) { + String t = highlightTerms[i].toLowerCase(); + String termLowerCase = term.toLowerCase(); + if (t.equals(termLowerCase)) return true; } return result; } - private boolean withLemmas() { + private boolean isHighlightTerm(String[] terms) { + if (terms == null) + return false; boolean result = false; - for (int i=0; i< outputOptions.length; i++) { - String function = outputOptions[i]; - if (function.equals("withLemmas")) - return true; + for (int i=0; i< highlightTerms.length; i++) { + String t = highlightTerms[i].toLowerCase(); + for (int j=0; j<terms.length; j++) { + String termLowerCase = terms[j].toLowerCase(); + if (t.equals(termLowerCase)) + return true; + } } return result; } private void write(String outStr) throws SAXException { - outputXmlFragment += outStr; + result.append(outStr); } - private class Element { + public class Element implements Comparable<Element> { private int type; - private String name; + public String name; private String xmlnsString; private String attrString; private String value; - private String lang; // normally value of attribute xml:lang or the inherited xml:lang value of the father node + public String lang; // value of attribute xml:lang or the inherited xml:lang value of the father node + public String xmlId; + public String xpath; + public int pageNumber; + public int lineNumber; + public int docPosition; // absolute position in document + public int position; // position within all elements with this name + public int elemPosition; // position in element e.g. the 6 sentence in paragraph + public int pagePosition; // position in page + private ArrayList<Token> tokens = new ArrayList<Token>(); private ArrayList<Element> composites; + private Element parent; + private boolean isStopElement = false; + private boolean isWordDelimiterElement = true; // default: is word delimiter element private Element(String name) { this.type = ELEMENT_TYPE_COMPLEX; - this.name = name; + setName(name); } private Element(String name, int type) { this.type = type; - this.name = name; + setName(name); } - private boolean isComplex() { + private void setName(String name) { + this.name = name; + for (int i=0; i<stopElements.length; i++) { + String stopElementName = stopElements[i]; + if (name.equals(stopElementName)) { + this.isStopElement = true; + break; + } + } + for (int i=0; i<nwbElements.length; i++) { + String nwbElementName = nwbElements[i]; + if (name.equals(nwbElementName)) { + this.isWordDelimiterElement = false; + break; + } + } + } + + public int compareTo(Element elem) { + return (new Integer(position)).compareTo(new Integer(elem.position)); + } + + private boolean isComplex() { boolean isComplex = false; if (type == ELEMENT_TYPE_COMPLEX) isComplex = true; return isComplex; } - private boolean isWordDelimiterElement() { - boolean isWordDelimiterElement = true; - for (int i=0; i<nwbElements.length; i++) { - String nwbElementName = nwbElements[i]; - if (name.equals(nwbElementName)) { - isWordDelimiterElement = false; - break; + public ArrayList<Token> getTokens() { + ArrayList<Token> retTokens = new ArrayList<Token>(); + if (isComplex()) { + if (composites != null) { + for (int i=0; i<composites.size(); i++) { + Element elem = composites.get(i); + if (elem.tokens != null) + retTokens.addAll(elem.tokens); + } } - } - return isWordDelimiterElement; + } + if (tokens != null) + retTokens.addAll(tokens); + return retTokens; + } + + public String getTokensStr(String type) { + ArrayList<Token> elementTokens = getTokens(); + String tokenStr = getTokensStr(type, elementTokens); + return tokenStr; } - private boolean isStopElement() { - boolean isStopElement = false; - for (int i=0; i<stopElements.length; i++) { - String stopElementName = stopElements[i]; - if (name.equals(stopElementName)) { - isStopElement = true; - break; - } + private String getTokensStr(String type, ArrayList<Token> tokens) { + StringBuilder tokenStr = new StringBuilder(); + for (int j=0; j<tokens.size(); j++) { + Token token = tokens.get(j); + String content = null; + if (type.equals("orig")) + content = token.getContentOrig(); + else if (type.equals("reg")) + content = token.getContentReg(); + else if (type.equals("norm")) + content = token.getContentNorm(); + else if (type.equals("morph")) + content = token.getContentMorph(); + if (content != null) + tokenStr.append(content + " "); } - return isStopElement; + return tokenStr.toString(); } - private String toXmlString() throws SAXException { - String retString = ""; + public String toXmlString() throws ApplicationException { + StringBuilder retStrBuilder = new StringBuilder(); + if (! isComplex()) { + retStrBuilder.append(value); + } else { + String xmlNsString = this.xmlnsString; + if (xmlNsString == null || xmlNsString.equals("")) { + retStrBuilder.append("<" + name + attrString + ">"); + } else { + retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); + } + if (composites != null) { + for (int i=0; i<composites.size(); i++) { + Element composite = composites.get(i); + if (! composite.isComplex()) { + if (composite.value != null && ! composite.value.equals("")) { + String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank + retStrBuilder.append(compositeValueStr); + } + } else { + retStrBuilder.append(composite.toXmlString()); + } + } + } + retStrBuilder.append("</" + name + ">"); + } + return retStrBuilder.toString(); + } + + private String buildString() throws ApplicationException { + StringBuilder retStrBuilder = new StringBuilder(); String elemLanguage = language; // default value for the document/page if (lang != null) elemLanguage = lang; // value of the element if available // write this element if (! isComplex()) { - retString += value; + retStrBuilder.append(value); } else { - String xmlNsString = this.xmlnsString; - if (xmlNsString == null || xmlNsString.equals("")) { - retString = retString + "<" + name + attrString + ">"; - } else { - retString = retString + "<" + name + " " + xmlNsString + attrString + ">"; + if (outputFormat != null && outputFormat.equals("xml")) { + String xmlNsString = this.xmlnsString; + if (xmlNsString == null || xmlNsString.equals("")) { + retStrBuilder.append("<" + name + attrString + ">"); + } else { + retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">"); + } + } else { // outputFormat == string + // nothing } if (composites != null) { - String compositesCharsWithMarks = ""; + StringBuilder compositesCharsWithMarks = new StringBuilder(); ArrayList<Element> complexElements = new ArrayList<Element>(); for (int i=0; i<composites.size(); i++) { Element composite = composites.get(i); if (! composite.isComplex()) { if (composite.value != null && ! composite.value.equals("")) { - String compositeValueStr = composite.value; - compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words. - compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank - compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr; + String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value); // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank + compositesCharsWithMarks.append(compositeValueStr); } } else { - if (! composite.isWordDelimiterElement()) { - compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK; // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) + if (! composite.isWordDelimiterElement) { + compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK); // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>) } else { - compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK; // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) + compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK); // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>) } complexElements.add(composite); } @@ -270,120 +498,235 @@ for (int i=0; i<complexElements.size(); i++) { int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK); Element complexElem = complexElements.get(i); - String complexElementStr = complexElem.toXmlString(); + String complexElementStr = complexElem.buildString(); String firstPiece = ""; if (indexComplexElemCompositesCharsWithMarks > 0) { firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks); compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks); } - retString = retString + firstPiece + complexElementStr; + retStrBuilder.append(firstPiece + complexElementStr); compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE); } - retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added } else { - retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added + retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added } } - retString = retString + "</" + name + ">"; + if (outputFormat != null && outputFormat.equals("xml")) { + retStrBuilder.append("</" + name + ">"); + } else { // outputFormat == string + // nothing + } + // put element into elements name hashtable + ArrayList<Element> elems = elements.get(name); + if (elems == null) { + elems = new ArrayList<Element>(); + elements.put(name, elems); + } + elems.add(this); } - return retString; + return retStrBuilder.toString(); } - private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException { + private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException { + String charactersStrDeresolved = charactersStrDeresolvedBuilder.toString(); String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved); - String retStr = ""; - try { - Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); - tokenizer.setLanguage(language); - tokenizer.setNormFunctions(normalizeFunctions); - ArrayList<Token> tokens = tokenizer.getTokens(); - int endPos = 0; - for (int i=0; i < tokens.size(); i++) { - Token token = tokens.get(i); - String wordForm = token.getContent(); - int startPos = token.getStart(); - String beforeStr = charactersStr.substring(endPos, startPos); - endPos = token.getEnd(); - String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); - String origWordForm = charactersStr.substring(startPos, endPos); - String wordTag = insertWordTags(wordForm, language, origWordForm); - retStr = retStr + beforeStrDeresolved + wordTag; + StringBuilder retStrBuilder = new StringBuilder(); + Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr)); + tokenizer.setLanguage(language); + String[] normFunction = {"norm"}; + tokenizer.setNormFunctions(normFunction); + ArrayList<Token> tokens = tokenizer.getTokens(); + int endPos = 0; + for (int i=0; i < tokens.size(); i++) { + Token token = tokens.get(i); + int startPos = token.getStart(); + String beforeStr = charactersStr.substring(endPos, startPos); + endPos = token.getEnd(); + String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr); + String origWordForm = charactersStr.substring(startPos, endPos); + String wordTag = insertWordTags(token, language, origWordForm); + if (outputFormat != null && outputFormat.equals("xml")) { + retStrBuilder.append(beforeStrDeresolved + wordTag); + } else { // outputFormat == string + String beforeStrDeresolvedToBlanks = toBlanks(beforeStrDeresolved); + retStrBuilder.append(beforeStrDeresolvedToBlanks + wordTag); + } + } + String lastAfterStr = charactersStr.substring(endPos); + String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); + if (outputFormat != null && outputFormat.equals("xml")) { + retStrBuilder.append(lastAfterStrDeresolved); + } else { // outputFormat == string + String lastAfterStrDeresolvedToBlanks = toBlanks(lastAfterStrDeresolved); + retStrBuilder.append(lastAfterStrDeresolvedToBlanks); + } + return retStrBuilder.toString(); + } + + private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException { + if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) { + return origWordForm; + } + String wordTag = null; + token.setDocId(docId); + token.setLanguage(lang); + token.setPageNumber(pageNumber); + token.setLineNumber(lineNumber); + token.setElementPosition(position); + token.setElementPagePosition(pagePosition); + token.setElementName(name); + token.setXmlId(xmlId); + token.setXpath("xpath"); // TODO + if (name != null && name.equals("reg")) { + if (attrString != null && attrString.contains("norm=\"")) { + int regIndexBegin = attrString.indexOf("norm=\""); + int regIndexEnd = attrString.indexOf("\"", regIndexBegin + 7); + String reg = attrString.substring(regIndexBegin + 6, regIndexEnd); + token.setContentReg(reg); + String[] normFunction = {"norm"}; + Normalizer normalizer = new Normalizer(normFunction, language); + String normStr = normalizer.normalize(reg); + token.setContentNorm(normStr); } - String lastAfterStr = charactersStr.substring(endPos); - String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr); - retStr = retStr + lastAfterStrDeresolved; - } catch (ApplicationException e) { - throw new SAXException(e); + } + if (language == null) { + token.setContentOrig(origWordForm); // TODO necessary ? + tokens.add(token); + resultTokens.add(token); + return origWordForm; + } + if (isStopElement && outputFormat != null && outputFormat.equals("xml")) + return origWordForm; + if (isStopElement && outputFormat != null && outputFormat.equals("string")) + return toBlanks(origWordForm); + String wordFormNorm = token.getContentNorm(); + String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); + ArrayList<Lemma> lemmas = null; + Boolean hasDctionaryEntries = null; + String lemmasStr = ""; + if (withForms || withLemmas) { + LexHandler lexHandler = LexHandler.getInstance(); + lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false); // Performance: needs 15 % of the indexing time + if (lemmas != null) { + for (int i=0; i < lemmas.size(); i++) { + Lemma lemma = lemmas.get(i); + String lemmaName = lemma.getLemmaName(); + lemmasStr = lemmasStr + lemmaName + " "; + } + } + lemmasStr = lemmasStr.trim(); + token.setContentMorph(lemmasStr); + hasDctionaryEntries = false; + ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY); // Performance: needs 15 % of the indexing time + if (lexEntries != null) + hasDctionaryEntries = true; } + if (outputFormat != null && outputFormat.equals("xml")) { + wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDctionaryEntries); // Performance: needs 10 % of the indexing time + String tokenWordForm = token.getContentOrig(); // word form is in contentOrig + if (useRegFunction) + tokenWordForm = token.getContentReg(); + else if (useNormFunction) + tokenWordForm = token.getContentNorm(); + else if (withLemmas) + tokenWordForm = token.getContentMorph(); + boolean isHighlightTerm = false; + if (highlightTerms.length > 0 && ! withLemmas) { + isHighlightTerm = isHighlightTerm(tokenWordForm); + } else { + if (highlightTerms.length > 0 && lemmas != null) { + String[] lemmasArray = lemmasStr.split(" "); + isHighlightTerm = isHighlightTerm(lemmasArray); + } + } + if (isHighlightTerm) { + wordTag = "<hi>" + wordTag + "</hi>"; + } + } else { // outputFormat == string + String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab + if (withLemmas) { + if (lemmas != null) { + String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved); // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr + wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr; + token.setContentMorph(lemmasStr); + } else { + wordTag = inWordFormWithoutSpecialSymbols; + } + } else { + wordTag = inWordFormWithoutSpecialSymbols; + } + tokens.add(token); + resultTokens.add(token); + } + return wordTag; + } + + private String removeSpecialSymbols(String inputStr) { + String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen return retStr; } - private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException { - String wordTag = null; - if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) - return origWordForm; - if (isStopElement()) - return origWordForm; - wordForm = removeSpecialSymbols(wordForm); - wordForm = wordForm.toLowerCase(); - String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm); - ArrayList<Lemma> lemmas = null; - if (withForms() || withLemmas()) { - LexHandler lexHandler = LexHandler.getInstance(); - lemmas = lexHandler.getLemmas(wordForm, "form", language, Normalizer.NONE); - } - wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas); - return wordTag; - } - /** * * @param origWordToken could contain nwd marks - * @param wordForm contains no nwd marks + * @param token * @param language - * @param origWordFormNormalized * @param lemmas * @return for each substring between nwd marks create a word tag */ - private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { + private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { if (origWordToken.isEmpty()) return origWordToken; if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK)) return COMPLEX_ELEMENT_NWD_MARK; String retWordTags = ""; String origWordTokenTmp = origWordToken; - while (! origWordTokenTmp.isEmpty()) { - if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark - origWordTokenTmp = origWordTokenTmp.substring(1); - retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; - } else { - int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); - if (indexUpToNWD != -1) { // not end of string reached - String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); - String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); - retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; - origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); - } else { // end of string reached - String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); - String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas); - retWordTags = retWordTags + origWordTokenFragmentWithTags; - origWordTokenTmp = ""; // finente - } - } + if (outputFormat != null && outputFormat.equals("xml")) { + retWordTags = getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries); + /* + while (! origWordTokenTmp.isEmpty()) { + if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) { // single nwd mark + origWordTokenTmp = origWordTokenTmp.substring(1); + retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK; + } else { + int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK); + if (indexUpToNWD != -1) { // not end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); + retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK; + origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1); + } else { // end of string reached + String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length()); + String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries); + retWordTags = retWordTags + origWordTokenFragmentWithTags; + origWordTokenTmp = ""; // finente + } + } + } + */ + } else { + // nothing } return retWordTags; } - private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) { + private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) { if (origWordForm == null || origWordForm.isEmpty()) return ""; + String wordForm = token.getContentOrig(); // word form is in contentOrig + String regularizedWordForm = token.getContentReg(); + String normalizedWordForm = token.getContentNorm(); String langISOCode = Language.getInstance().getISO639Code(language); - String retStr = "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\""; - if (origWordFormNormalized != null) - retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\""; + StringBuilder retStrBuilder = new StringBuilder(); + retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\""); + if (regularizedWordForm != null) + retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\""); + if (normalizedWordForm != null) + retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\""); if (lemmas != null) { String lemmasStr = ""; - String formsStr = ""; + StringBuilder formsStrBuilder = new StringBuilder(); Collections.sort(lemmas); Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>(); for (int i=0; i < lemmas.size(); i++) { @@ -403,24 +746,40 @@ Form form = forms.get(i); String formName = form.getFormName(); formName = StringUtils.forXML(formName); - formsStr = formsStr + formName + " "; + formsStrBuilder.append(formName + " "); } + String formsStr = formsStrBuilder.toString(); if (formsStr.endsWith(" ")) formsStr = formsStr.substring(0, formsStr.length() - 1); if (lemmasStr.endsWith(" ")) lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1); - if (withForms()) - retStr = retStr + " forms=\"" + formsStr + "\""; - if (withLemmas()) - retStr = retStr + " lemmas=\"" + lemmasStr + "\""; + if (withForms) + retStrBuilder.append(" forms=\"" + formsStr + "\""); + if (withLemmas) + retStrBuilder.append(" lemmas=\"" + lemmasStr + "\""); } - retStr = retStr + ">" + origWordForm + "</w>"; - return retStr; + if (hasDictionaryEntries != null && hasDictionaryEntries) { + retStrBuilder.append(" dictionary=\"" + "true" + "\""); + } else if (hasDictionaryEntries != null && ! hasDictionaryEntries) { + retStrBuilder.append(" dictionary=\"" + "false" + "\""); + } + retStrBuilder.append(">"); + retStrBuilder.append(origWordForm); // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString) + retStrBuilder.append("</w>"); + return retStrBuilder.toString(); } - private String removeSpecialSymbols(String inputStr) { - String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", ""); - return retStr; + private String toBlanks(String inputStr) { + int size = inputStr.length(); + StringBuilder retStrBuilder = new StringBuilder(); + for (int j=0; j < size; j++) { + char c = inputStr.charAt(j); + if (c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0)) + retStrBuilder.append(c); + else + retStrBuilder.append(" "); + } + return retStrBuilder.toString(); } }