mpdl-group: software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java comparison

comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba

diverse Korrekturen

author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 27 Nov 2012 12:35:19 +0100
parents	7d6d969b10cf
children

comparison

equal deleted inserted replaced

-:6a45a982c333
+:e845310098ba
 package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Enumeration;
 import java.util.Hashtable;
 import org.xml.sax.*;
 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
 private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString();  // word delimiting element
 private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString();  // not word delimiting element
 private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
 private static int ELEMENT_TYPE_CHARACTERS = 1;
 private static int ELEMENT_TYPE_COMPLEX = 2;
-private String[] normalizeFunctions = {};  // default: without normalize functions
+private String docId;
-private String[] nwbElements = {};  // non word breaking elements, default: these elements
+private String language;
+private String[] nwbElements = {};  // non word breaking elements, default: no nwb elements
 private String[] stopElements = {};  // default: no stop elements
+private String outputFormat = "xml";  // default: xml
 private String[] outputOptions = {};
+private boolean withForms = false;
+private boolean withLemmas = false;
+private String[] highlightTerms = {};  // highlight terms, default: no highlight terms
+private String[] normFunctions = {};  // default: no norm function
+private boolean useNormFunction = false;
+private boolean useRegFunction = false;
 private String xmlnsString = "";
-private String language;
+private StringBuilder result = new StringBuilder();
-private String outputXmlFragment = "";
+private ArrayList<Token> resultTokens = new ArrayList<Token>();
+private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>();
 private Element rootElement;
 private Element currentElement;
+private int currentPosition = 0;
+private int currentPageNumber = 0;
+private int currentLineNumber = 0;
+private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>();
+private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>();
 private ArrayList<Element> elementQueue;
-public XmlTokenizerContentHandler(String[] normalizeFunctions, String language) throws ApplicationException {
+public XmlTokenizerContentHandler(String language) throws ApplicationException {
-if (normalizeFunctions == null) {
-String[] emptyFunctions = {};
-this.normalizeFunctions = emptyFunctions;
-} else {
-this.normalizeFunctions = normalizeFunctions;
-}
 this.language = language;
+}
+public void setDocIdentifier(String docId) {  // stores the document identifier; it is later copied onto each token via token.setDocId(docId)
+this.docId = docId;
 }
 public void setNWBElements(String[] nwbElements) {  // sets the names of the non word breaking elements; Element.setName compares element names against this array
 this.nwbElements = nwbElements;  // NOTE(review): no null check here, and Element.setName reads nwbElements.length unconditionally
 }
 public void setStopElements(String[] stopElements) {  // sets the names of the stop elements; Element.setName compares element names against this array
 this.stopElements = stopElements;  // NOTE(review): no null check here, and Element.setName reads stopElements.length unconditionally
 }
+public void setHighlightTerms(String[] highlightTerms) {  // sets the terms that isHighlightTerm compares against (case-insensitively)
+this.highlightTerms = highlightTerms;  // NOTE(review): no null check here, and isHighlightTerm reads highlightTerms.length unconditionally
+}
+public void setNormFunctions(String[] normFunctions) {  // stores the norm function names and derives the useNormFunction / useRegFunction flags from them
+this.normFunctions = normFunctions;
+if (this.normFunctions != null) {  // a null array is stored as-is and leaves both flags untouched
+for (int i=0; i< this.normFunctions.length; i++) {
+String function = normFunctions[i];
+if (function.equals("norm"))  // NOTE(review): a null array entry would throw a NullPointerException here
+this.useNormFunction = true;
+else if (function.equals("reg"))
+this.useRegFunction = true;
+}  // NOTE(review): the flags are only ever set to true in this method, a later call never resets them to false
+}
+}
+public void setOutputFormat(String outputFormat) {  // sets the output format; the handler compares it against "xml" and "string", the field default is "xml"
+this.outputFormat = outputFormat;
+}
 public void setOutputOptions(String[] outputOptions) {
 this.outputOptions = outputOptions;
-}
+for (int i=0; i< this.outputOptions.length; i++) {
+String function = outputOptions[i];
-public String getXmlFragment() {
+if (function.equals("withForms"))
-return outputXmlFragment;
+this.withForms = true;
+else if (function.equals("withLemmas"))
+this.withLemmas = true;
+}
+}
+public String getResultString() {  // returns everything appended to the result builder so far (filled by write)
+return result.toString();
+}
+public ArrayList<Token> getResultTokens() {  // returns the internal token list itself (no copy is made)
+return resultTokens;
 }
+public ArrayList<Element> getElements(String elementName) {  // returns the elements registered under this name in the elements hashtable (filled in Element.buildString)
+return elements.get(elementName);  // Hashtable.get yields null when no element with this name was registered
+}
+public int getPageCount() {  // returns the page counter, which is incremented once for every "pb" start element
+return currentPageNumber;
+}
 public void startDocument() throws SAXException {
 }
 public void endDocument() throws SAXException {
 try {
-String rootElemToStr = rootElement.toXmlString();
+String rootElemToStr = rootElement.buildString();
 write(rootElemToStr);
 write("\n");
 } catch (NullPointerException e) {
+throw new SAXException(e);
+} catch (ApplicationException e) {
 throw new SAXException(e);
 }
 }
 public void characters(char[] c, int start, int length) throws SAXException {
 System.arraycopy(c, start, cCopy, 0, length);
 String charactersStr = String.valueOf(cCopy);
 if (charactersStr != null && ! charactersStr.equals("")) {
 if (currentElement != null) {
 Element charElement = new Element("characters", ELEMENT_TYPE_CHARACTERS);
+charElement.pageNumber = currentPageNumber;
 charElement.value = StringUtils.deresolveXmlEntities(charactersStr);
 if (currentElement.composites == null)
 currentElement.composites = new ArrayList<Element>();
 currentElement.composites.add(charElement);
 }
 public void setDocumentLocator(Locator locator) {
 }
 public void startPrefixMapping(String prefix, String uri) throws SAXException {
-xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
 if (prefix != null && prefix.equals(""))
-xmlnsString = "xmlns" + prefix + "=\"" + uri + "\" ";
+xmlnsString += "xmlns" + "=\"" + uri + "\" ";
+else
+xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
 }
 public void endPrefixMapping(String prefix) throws SAXException {
 }
 if (currentElement.composites == null)
 currentElement.composites = new ArrayList<Element>();
 if (currentElement.lang != null)
 newElement.lang = currentElement.lang;  // language is inherited to childs
 currentElement.composites.add(newElement);
+newElement.parent = currentElement;
 }
 currentElement = newElement;
+if (localName != null && localName.equals("pb")) {
+currentPageNumber++;
+setCurrentPagePosition(localName, 0);
+}
+currentElement.pageNumber = currentPageNumber;
+if (localName != null && localName.equals("lb")) {
+currentLineNumber++;
+}
+currentElement.lineNumber = currentLineNumber;
+currentPosition++;
+currentElement.docPosition = currentPosition;
+int newElemPosition = incrementCurrentPosition(localName);
+currentElement.position = newElemPosition;
+currentElement.elemPosition = getElementPosition(currentElement);
+Element parent = currentElement.parent;
+if (parent == null) {
+currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
+} else {
+currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
+}
+int newElemPagePosition = incrementCurrentPagePosition(localName);
+currentElement.pagePosition = newElemPagePosition;
 int attrSize = attrs.getLength();
 String attrString = "";
 for (int i=0; i<attrSize; i++) {
 String attrQName = attrs.getQName(i);
 String attrValue = attrs.getValue(i);
 attrValue = StringUtils.forXML(attrValue);
 attrString = attrString + " " + attrQName + "=\"" + attrValue + "\"";
-if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang")))
+if (attrQName != null && (attrQName.toLowerCase().equals("xml:lang") || attrQName.toLowerCase().equals("lang"))) {
 currentElement.lang = attrValue;  // if xml:lang is set, it is set to the new element and overwrites values inherited by the father
+}
+if (attrQName != null && (attrQName.toLowerCase().equals("xml:id") || attrQName.toLowerCase().equals("id"))) {
+currentElement.xmlId = attrValue;
+}
 }
 currentElement.attrString = attrString;
 if (! xmlnsString.equals("")) {
 currentElement.xmlnsString = xmlnsString;
 }
 } else {
 currentElement = null;
 }
 }
-private boolean withForms() {
+private int incrementCurrentPosition(String elemName) {  // increments the per-name counter in currentPositions and returns the new value (1 on the first call for a name)
+Integer currentElemPos = currentPositions.get(elemName);
+if (currentElemPos == null) {  // name not counted yet: start at 0 so the increment below yields 1
+currentElemPos = new Integer(0);
+}
+currentElemPos++;  // unboxes, increments and re-boxes into a new Integer
+currentPositions.put(elemName, currentElemPos);
+return currentElemPos.intValue();
+}
+private int getElementPosition(Element elem) {  // returns the 1-based index of elem among the same-named complex children of its parent
+int pos = 0;
+Element parent = elem.parent;
+if (parent == null) {  // no parent: the position is 1
+pos = 1;
+} else {
+pos = 0;
+ArrayList<Element> composites = parent.composites;
+if (composites != null) {
+for (int i=0; i<composites.size(); i++) {
+Element e = composites.get(i);
+if (e.isComplex() && e.name.equals(elem.name)) {  // count same-named complex siblings up to and including elem
+pos++;
+}
+if (e == elem)  // identity comparison: stop once elem itself has been visited
+break;
+}  // if elem is not contained in the list, all same-named complex children have been counted
+} else {  // parent without a composites list: fall back to 1
+pos = 1;
+}
+}
+return pos;
+}
+private int incrementCurrentPagePosition(String elemName) {  // increments the per-name counter in currentPagePositions and returns the new value
+Integer currentElemPagePos = currentPagePositions.get(elemName);
+if (currentElemPagePos == null) {  // name not counted yet: start at 0 so the increment below yields 1
+currentElemPagePos = new Integer(0);
+}
+currentElemPagePos++;  // unboxes, increments and re-boxes into a new Integer
+currentPagePositions.put(elemName, currentElemPagePos);
+return currentElemPagePos.intValue();
+}
+private void setCurrentPagePosition(String elemName, int pos) {  // sets the counter of every name already stored in currentPagePositions to pos
+Integer newPagePosition = new Integer(pos);
+Enumeration<String> elemKeys = currentPagePositions.keys();
+while (elemKeys.hasMoreElements()) {
+String elemKey = elemKeys.nextElement();
+currentPagePositions.put(elemKey, newPagePosition);  // only replaces values of existing keys, no key is added or removed
+}
+}  // NOTE(review): the elemName parameter is not used in this method
+private boolean isHighlightTerm(String term) {
+if (term == null)
+return false;
 boolean result = false;
-for (int i=0; i< outputOptions.length; i++) {
+for (int i=0; i< highlightTerms.length; i++) {
-String function = outputOptions[i];
+String t = highlightTerms[i].toLowerCase();
-if (function.equals("withForms"))
+String termLowerCase = term.toLowerCase();
+if (t.equals(termLowerCase))
 return true;
 }
 return result;
 }
-private boolean withLemmas() {
+private boolean isHighlightTerm(String[] terms) {
+if (terms == null)
+return false;
 boolean result = false;
-for (int i=0; i< outputOptions.length; i++) {
+for (int i=0; i< highlightTerms.length; i++) {
-String function = outputOptions[i];
+String t = highlightTerms[i].toLowerCase();
-if (function.equals("withLemmas"))
+for (int j=0; j<terms.length; j++) {
-return true;
+String termLowerCase = terms[j].toLowerCase();
+if (t.equals(termLowerCase))
+return true;
+}
 }
 return result;
 }
 private void write(String outStr) throws SAXException {
-outputXmlFragment += outStr;
+result.append(outStr);
 }
-private class Element {
+public class Element implements Comparable<Element> {
 private int type;
-private String name;
+public String name;
 private String xmlnsString;
 private String attrString;
 private String value;
-private String lang;  // normally value of attribute xml:lang or the inherited xml:lang value of the father node
+public String lang;  // value of attribute xml:lang or the inherited xml:lang value of the father node
+public String xmlId;
+public String xpath;
+public int pageNumber;
+public int lineNumber;
+public int docPosition;  // absolute position in document
+public int position;  // position within all elements with this name
+public int elemPosition;  // position in element e.g. the 6 sentence in paragraph
+public int pagePosition; // position in page
+private ArrayList<Token> tokens = new ArrayList<Token>();
 private ArrayList<Element> composites;
+private Element parent;
+private boolean isStopElement = false;
+private boolean isWordDelimiterElement = true;  // default: is word delimiter element
 private Element(String name) {
 this.type = ELEMENT_TYPE_COMPLEX;
-this.name = name;
+setName(name);
 }
 private Element(String name, int type) {
 this.type = type;
+setName(name);
+}
+private void setName(String name) {
 this.name = name;
-}
+for (int i=0; i<stopElements.length; i++) {
+String stopElementName = stopElements[i];
-private boolean isComplex() {
+if (name.equals(stopElementName)) {
+this.isStopElement = true;
+break;
+}
+}
+for (int i=0; i<nwbElements.length; i++) {
+String nwbElementName = nwbElements[i];
+if (name.equals(nwbElementName)) {
+this.isWordDelimiterElement = false;
+break;
+}
+}
+}
+public int compareTo(Element elem) {  // orders elements by their position field only
+return (new Integer(position)).compareTo(new Integer(elem.position));
+}
+private boolean isComplex() {  // true if this element has type ELEMENT_TYPE_COMPLEX, false otherwise (e.g. for character elements)
 boolean isComplex = false;
 if (type == ELEMENT_TYPE_COMPLEX)
 isComplex = true;
 return isComplex;
 }
-private boolean isWordDelimiterElement() {
+public ArrayList<Token> getTokens() {
-boolean isWordDelimiterElement = true;
+ArrayList<Token> retTokens = new ArrayList<Token>();
-for (int i=0; i<nwbElements.length; i++) {
+if (isComplex()) {
-String nwbElementName = nwbElements[i];
+if (composites != null) {
-if (name.equals(nwbElementName)) {
+for (int i=0; i<composites.size(); i++) {
-isWordDelimiterElement = false;
+Element elem = composites.get(i);
-break;
+if (elem.tokens != null)
-}
+retTokens.addAll(elem.tokens);
 }
-return isWordDelimiterElement;
+}
 }
+if (tokens != null)
-private boolean isStopElement() {
+retTokens.addAll(tokens);
-boolean isStopElement = false;
+return retTokens;
-for (int i=0; i<stopElements.length; i++) {
+}
-String stopElementName = stopElements[i];
-if (name.equals(stopElementName)) {
+public String getTokensStr(String type) {
-isStopElement = true;
+ArrayList<Token> elementTokens = getTokens();
-break;
+String tokenStr = getTokensStr(type, elementTokens);
-}
+return tokenStr;
 }
-return isStopElement;
-}
+private String getTokensStr(String type, ArrayList<Token> tokens) {
+StringBuilder tokenStr = new StringBuilder();
-private String toXmlString() throws SAXException {
+for (int j=0; j<tokens.size(); j++) {
-String retString = "";
+Token token = tokens.get(j);
+String content = null;
+if (type.equals("orig"))
+content = token.getContentOrig();
+else if (type.equals("reg"))
+content = token.getContentReg();
+else if (type.equals("norm"))
+content = token.getContentNorm();
+else if (type.equals("morph"))
+content = token.getContentMorph();
+if (content != null)
+tokenStr.append(content + " ");
+}
+return tokenStr.toString();
+}
+public String toXmlString() throws ApplicationException {  // serializes this element and, recursively, its composites into an XML string
+StringBuilder retStrBuilder = new StringBuilder();
+if (! isComplex()) {  // non complex element: the result is just its value
+retStrBuilder.append(value);
+} else {
+String xmlNsString = this.xmlnsString;
+if (xmlNsString == null || xmlNsString.equals("")) {  // start tag without namespace declarations
+retStrBuilder.append("<" + name + attrString + ">");
+} else {  // start tag with the namespace declarations placed before the attributes
+retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">");
+}
+if (composites != null) {
+for (int i=0; i<composites.size(); i++) {
+Element composite = composites.get(i);
+if (! composite.isComplex()) {  // non complex child: append its cleaned value, null or empty values are skipped
+if (composite.value != null && ! composite.value.equals("")) {
+String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value);  // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank
+retStrBuilder.append(compositeValueStr);
+}
+} else {  // complex child: serialize it recursively
+retStrBuilder.append(composite.toXmlString());
+}
+}
+}
+retStrBuilder.append("</" + name + ">");  // the end tag is always written, also when there are no composites
+}
+return retStrBuilder.toString();
+}
+private String buildString() throws ApplicationException {
+StringBuilder retStrBuilder = new StringBuilder();
 String elemLanguage = language;  // default value for the document/page
 if (lang != null)
 elemLanguage = lang;  // value of the element if available
 // write this element
 if (! isComplex()) {
-retString += value;
+retStrBuilder.append(value);
 } else {
-String xmlNsString = this.xmlnsString;
+if (outputFormat != null && outputFormat.equals("xml")) {
-if (xmlNsString == null || xmlNsString.equals("")) {
+String xmlNsString = this.xmlnsString;
-retString = retString + "<" + name + attrString + ">";
+if (xmlNsString == null || xmlNsString.equals("")) {
-} else {
+retStrBuilder.append("<" + name + attrString + ">");
-retString = retString + "<" + name + " " + xmlNsString + attrString + ">";
+} else {
+retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">");
+}
+} else {  // outputFormat == string
+// nothing
 }
 if (composites != null) {
-String compositesCharsWithMarks = "";
+StringBuilder compositesCharsWithMarks = new StringBuilder();
 ArrayList<Element> complexElements = new ArrayList<Element>();
 for (int i=0; i<composites.size(); i++) {
 Element composite = composites.get(i);
 if (! composite.isComplex()) {
 if (composite.value != null && ! composite.value.equals("")) {
-String compositeValueStr = composite.value;
+String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value);  // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank
-compositeValueStr = compositeValueStr.replaceAll("\n", ""); // remove all newlines, they are no separators for words.
+compositesCharsWithMarks.append(compositeValueStr);
-compositeValueStr = compositeValueStr.replaceAll("[ \t]+", " "); // if there are many Blanks/Tabs make them to one Blank
-compositesCharsWithMarks = compositesCharsWithMarks + compositeValueStr;
 }
 } else {
-if (! composite.isWordDelimiterElement()) {
+if (! composite.isWordDelimiterElement) {
-compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_NWD_MARK;  // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
+compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK);  // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
 } else {
-compositesCharsWithMarks = compositesCharsWithMarks + COMPLEX_ELEMENT_MARK;  // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
+compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK);  // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
 }
 complexElements.add(composite);
 }
 }
 // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK);  // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK);  // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values
 if (complexElements.size() > 0) {
 for (int i=0; i<complexElements.size(); i++) {
 int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK);
 Element complexElem = complexElements.get(i);
-String complexElementStr = complexElem.toXmlString();
+String complexElementStr = complexElem.buildString();
 String firstPiece = "";
 if (indexComplexElemCompositesCharsWithMarks > 0) {
 firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks);
 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks);
 }
-retString = retString + firstPiece + complexElementStr;
+retStrBuilder.append(firstPiece + complexElementStr);
 compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE);
 }
-retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added
+retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
 } else {
-retString = retString + compositesCharsWithMarksWithWordTags; // last one must also be added
+retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
 }
 }
-retString = retString + "</" + name + ">";
+if (outputFormat != null && outputFormat.equals("xml")) {
+retStrBuilder.append("</" + name + ">");
+} else {  // outputFormat == string
+// nothing
+}
+// put element into elements name hashtable
+ArrayList<Element> elems = elements.get(name);
+if (elems == null) {
+elems = new ArrayList<Element>();
+elements.put(name, elems);
+}
+elems.add(this);
 }
-return retString;
+return retStrBuilder.toString();
 }
-private String insertWordTags(String charactersStrDeresolved, String language) throws SAXException {
+private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException {
+String charactersStrDeresolved = charactersStrDeresolvedBuilder.toString();
 String charactersStr = StringUtils.resolveXmlEntities(charactersStrDeresolved);
-String retStr = "";
+StringBuilder retStrBuilder = new StringBuilder();
-try {
+Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr));
-Tokenizer tokenizer = new Tokenizer(new StringReader(charactersStr));
+tokenizer.setLanguage(language);
-tokenizer.setLanguage(language);
+String[] normFunction = {"norm"};
-tokenizer.setNormFunctions(normalizeFunctions);
+tokenizer.setNormFunctions(normFunction);
 ArrayList<Token> tokens = tokenizer.getTokens();
 int endPos = 0;
 for (int i=0; i < tokens.size(); i++) {
 Token token = tokens.get(i);
-String wordForm = token.getContent();
+int startPos = token.getStart();
-int startPos = token.getStart();
+String beforeStr = charactersStr.substring(endPos, startPos);
-String beforeStr = charactersStr.substring(endPos, startPos);
+endPos = token.getEnd();
-endPos = token.getEnd();
+String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr);
-String beforeStrDeresolved = StringUtils.deresolveXmlEntities(beforeStr);
+String origWordForm = charactersStr.substring(startPos, endPos);
-String origWordForm = charactersStr.substring(startPos, endPos);
+String wordTag = insertWordTags(token, language, origWordForm);
-String wordTag = insertWordTags(wordForm, language, origWordForm);
+if (outputFormat != null && outputFormat.equals("xml")) {
-retStr = retStr + beforeStrDeresolved + wordTag;
+retStrBuilder.append(beforeStrDeresolved + wordTag);
-}
+} else {  // outputFormat == string
-String lastAfterStr = charactersStr.substring(endPos);
+String beforeStrDeresolvedToBlanks = toBlanks(beforeStrDeresolved);
-String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr);
+retStrBuilder.append(beforeStrDeresolvedToBlanks + wordTag);
-retStr = retStr + lastAfterStrDeresolved;
+}
-} catch (ApplicationException e) {
+}
-throw new SAXException(e);
+String lastAfterStr = charactersStr.substring(endPos);
-}
+String lastAfterStrDeresolved = StringUtils.deresolveXmlEntities(lastAfterStr);
-return retStr;
+if (outputFormat != null && outputFormat.equals("xml")) {
-}
+retStrBuilder.append(lastAfterStrDeresolved);
+} else {  // outputFormat == string
-private String insertWordTags(String wordForm, String language, String origWordForm) throws ApplicationException {
+String lastAfterStrDeresolvedToBlanks = toBlanks(lastAfterStrDeresolved);
+retStrBuilder.append(lastAfterStrDeresolvedToBlanks);
+}
+return retStrBuilder.toString();
+}
+private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException {
+if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) {
+return origWordForm;
+}
 String wordTag = null;
-if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK))
+token.setDocId(docId);
+token.setLanguage(lang);
+token.setPageNumber(pageNumber);
+token.setLineNumber(lineNumber);
+token.setElementPosition(position);
+token.setElementPagePosition(pagePosition);
+token.setElementName(name);
+token.setXmlId(xmlId);
+token.setXpath("xpath");  // TODO
+if (name != null && name.equals("reg")) {
+if (attrString != null && attrString.contains("norm=\"")) {
+int regIndexBegin = attrString.indexOf("norm=\"");
+int regIndexEnd = attrString.indexOf("\"", regIndexBegin + 7);
+String reg = attrString.substring(regIndexBegin + 6, regIndexEnd);
+token.setContentReg(reg);
+String[] normFunction = {"norm"};
+Normalizer normalizer = new Normalizer(normFunction, language);
+String normStr = normalizer.normalize(reg);
+token.setContentNorm(normStr);
+}
+}
+if (language == null) {
+token.setContentOrig(origWordForm);  // TODO necessary ?
+tokens.add(token);
+resultTokens.add(token);
 return origWordForm;
-if (isStopElement())
+}
+if (isStopElement && outputFormat != null && outputFormat.equals("xml"))
 return origWordForm;
-wordForm = removeSpecialSymbols(wordForm);
+if (isStopElement && outputFormat != null && outputFormat.equals("string"))
-wordForm = wordForm.toLowerCase();
+return toBlanks(origWordForm);
+String wordFormNorm = token.getContentNorm();
 String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm);
 ArrayList<Lemma> lemmas = null;
-if (withForms() || withLemmas()) {
+Boolean hasDctionaryEntries = null;
+String lemmasStr = "";
+if (withForms || withLemmas) {
 LexHandler lexHandler = LexHandler.getInstance();
-lemmas = lexHandler.getLemmas(wordForm, "form", language, Normalizer.NONE);
+lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false);  // Performance: needs 15 % of the indexing time
-}
+if (lemmas != null) {
-wordTag = insertWordTags(origWordFormDeresolved, wordForm, language, null, lemmas);
+for (int i=0; i < lemmas.size(); i++) {
+Lemma lemma = lemmas.get(i);
+String lemmaName = lemma.getLemmaName();
+lemmasStr = lemmasStr + lemmaName + " ";
+}
+}
+lemmasStr = lemmasStr.trim();
+token.setContentMorph(lemmasStr);
+hasDctionaryEntries = false;
+ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY);  // Performance: needs 15 % of the indexing time
+if (lexEntries != null)
+hasDctionaryEntries = true;
+}
+if (outputFormat != null && outputFormat.equals("xml")) {
+wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDctionaryEntries);  // Performance: needs 10 % of the indexing time
+String tokenWordForm = token.getContentOrig();  // word form is in contentOrig
+if (useRegFunction)
+tokenWordForm = token.getContentReg();
+else if (useNormFunction)
+tokenWordForm = token.getContentNorm();
+else if (withLemmas)
+tokenWordForm = token.getContentMorph();
+boolean isHighlightTerm = false;
+if (highlightTerms.length > 0 && ! withLemmas) {
+isHighlightTerm = isHighlightTerm(tokenWordForm);
+} else {
+if (highlightTerms.length > 0 && lemmas != null) {
+String[] lemmasArray = lemmasStr.split(" ");
+isHighlightTerm = isHighlightTerm(lemmasArray);
+}
+}
+if (isHighlightTerm) {
+wordTag = "<hi>" + wordTag + "</hi>";
+}
+} else {  // outputFormat == string
+String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab
+if (withLemmas) {
+if (lemmas != null) {
+String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved);  // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr
+wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr;
+token.setContentMorph(lemmasStr);
+} else {
+wordTag = inWordFormWithoutSpecialSymbols;
+}
+} else {
+wordTag = inWordFormWithoutSpecialSymbols;
+}
+tokens.add(token);
+resultTokens.add(token);
+}
 return wordTag;
 }
+private String removeSpecialSymbols(String inputStr) {  // returns inputStr with every character matched by the regex below removed
+String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen
+return retStr;
+}
 /**
 *
 * @param origWordToken  could contain nwd marks
-* @param wordForm  contains no nwd marks
+* @param token
 * @param language
-* @param origWordFormNormalized
 * @param lemmas
 * @return for each substring between nwd marks create a word tag
 */
-private String insertWordTags(String origWordToken, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) {
+private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
 if (origWordToken.isEmpty())
 return origWordToken;
 if (origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK))
 return COMPLEX_ELEMENT_NWD_MARK;
 String retWordTags = "";
 String origWordTokenTmp = origWordToken;
-while (! origWordTokenTmp.isEmpty()) {
+if (outputFormat != null && outputFormat.equals("xml")) {
-if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) {  // single nwd mark
+retWordTags = getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries);
-origWordTokenTmp = origWordTokenTmp.substring(1);
+/*
-retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK;
+while (! origWordTokenTmp.isEmpty()) {
-} else {
+if (origWordTokenTmp.startsWith(COMPLEX_ELEMENT_NWD_MARK)) {  // single nwd mark
-int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK);
+origWordTokenTmp = origWordTokenTmp.substring(1);
-if (indexUpToNWD != -1) { // not end of string reached
+retWordTags = retWordTags + COMPLEX_ELEMENT_NWD_MARK;
-String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD);
+} else {
-String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas);
+int indexUpToNWD = origWordTokenTmp.indexOf(COMPLEX_ELEMENT_NWD_MARK);
-retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK;
+if (indexUpToNWD != -1) { // not end of string reached
-origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1);
+String origWordTokenFragment = origWordTokenTmp.substring(0, indexUpToNWD);
-} else {  // end of string reached
+String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries);
-String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length());
+retWordTags = retWordTags + origWordTokenFragmentWithTags + COMPLEX_ELEMENT_NWD_MARK;
-String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, wordForm, language, origWordFormNormalized, lemmas);
+origWordTokenTmp = origWordTokenTmp.substring(indexUpToNWD + 1);
-retWordTags = retWordTags + origWordTokenFragmentWithTags;
+} else {  // end of string reached
-origWordTokenTmp = "";  // finente
+String origWordTokenFragment = origWordTokenTmp.substring(0, origWordTokenTmp.length());
-}
+String origWordTokenFragmentWithTags = getWordTag(origWordTokenFragment, token, language, lemmas, hasDictionaryEntries);
-}
+retWordTags = retWordTags + origWordTokenFragmentWithTags;
+origWordTokenTmp = "";  // finente
+}
+}
+}
+*/
+} else {
+// nothing
 }
 return retWordTags;
 }
-private String getWordTag(String origWordForm, String wordForm, String language, String origWordFormNormalized, ArrayList<Lemma> lemmas) {
+private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
 if (origWordForm == null || origWordForm.isEmpty())
 return "";
+String wordForm = token.getContentOrig(); // word form is in contentOrig
+String regularizedWordForm = token.getContentReg();
+String normalizedWordForm = token.getContentNorm();
 String langISOCode = Language.getInstance().getISO639Code(language);
-String retStr =  "<w form=\"" + wordForm + "\"" + " lang=\"" + langISOCode + "\"";
+StringBuilder retStrBuilder = new StringBuilder();
-if (origWordFormNormalized != null)
+retStrBuilder.append("<w" + " lang=\"" + langISOCode + "\"" + " form=\"" + wordForm + "\"");
-retStr = retStr + " formNormalized=\"" + origWordFormNormalized + "\"";
+if (regularizedWordForm != null)
+retStrBuilder.append(" formRegularized=\"" + regularizedWordForm + "\"");
+if (normalizedWordForm != null)
+retStrBuilder.append(" formNormalized=\"" + normalizedWordForm + "\"");
 if (lemmas != null) {
 String lemmasStr = "";
-String formsStr = "";
+StringBuilder formsStrBuilder = new StringBuilder();
 Collections.sort(lemmas);
 Hashtable<String, Form> formsHashtable = new Hashtable<String, Form>();
 for (int i=0; i < lemmas.size(); i++) {
 Lemma lemma = lemmas.get(i);
 ArrayList<Form> lemmaForms = lemma.getFormsList();
 Collections.sort(forms);
 for (int i=0; i < forms.size(); i++) {
 Form form = forms.get(i);
 String formName = form.getFormName();
 formName = StringUtils.forXML(formName);
-formsStr = formsStr + formName + " ";
+formsStrBuilder.append(formName + " ");
 }
+String formsStr = formsStrBuilder.toString();
 if (formsStr.endsWith(" "))
 formsStr = formsStr.substring(0, formsStr.length() - 1);
 if (lemmasStr.endsWith(" "))
 lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1);
-if (withForms())
+if (withForms)
-retStr = retStr + " forms=\"" + formsStr + "\"";
+retStrBuilder.append(" forms=\"" + formsStr + "\"");
-if (withLemmas())
+if (withLemmas)
-retStr = retStr + " lemmas=\"" + lemmasStr + "\"";
+retStrBuilder.append(" lemmas=\"" + lemmasStr + "\"");
 }
-retStr = retStr + ">" + origWordForm + "</w>";
+if (hasDictionaryEntries != null && hasDictionaryEntries) {
-return retStr;
+retStrBuilder.append(" dictionary=\"" + "true" + "\"");
-}
+} else if (hasDictionaryEntries != null && ! hasDictionaryEntries) {
+retStrBuilder.append(" dictionary=\"" + "false" + "\"");
-private String removeSpecialSymbols(String inputStr) {
+}
-String retStr = inputStr.replaceAll(" |\n|\t|-|\u2424|\u2425", "");
+retStrBuilder.append(">");
-return retStr;
+retStrBuilder.append(origWordForm);  // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString)
+retStrBuilder.append("</w>");
+return retStrBuilder.toString();
+}
+private String toBlanks(String inputStr) {  // returns a string of the same length in which every char except the two element marks is replaced by a blank
+int size = inputStr.length();
+StringBuilder retStrBuilder = new StringBuilder();
+for (int j=0; j < size; j++) {
+char c = inputStr.charAt(j);
+if (c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0))  // element marks are kept at their position
+retStrBuilder.append(c);
+else  // any other char becomes one blank
+retStrBuilder.append(" ");
+}
+return retStrBuilder.toString();
 }
 }
 }

Mercurial > hg > mpdl-group

comparison software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba