view software/mpdl-services/mpiwg-mpdl-lt/src/de/mpg/mpiwg/berlin/mpdl/lt/text/tokenize/XmlTokenizerContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents 7d6d969b10cf
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;

public class XmlTokenizerContentHandler implements ContentHandler {
  // Placeholder characters: buildString() puts one of them into an element's text at the position of each child
  // element, tokenizes the text as a whole and afterwards replaces the placeholders by the serialized children.
  private static String COMPLEX_ELEMENT_MARK = new Character('\u2425').toString();  // word delimiting element
  private static String COMPLEX_ELEMENT_NWD_MARK = new Character('\u2424').toString();  // not word delimiting element
  private static int COMPLEX_ELEMENT_MARK_SIZE = COMPLEX_ELEMENT_MARK.length();
  // Element kinds: text node (created in characters()) and XML element (created in startElement())
  private static int ELEMENT_TYPE_CHARACTERS = 1;
  private static int ELEMENT_TYPE_COMPLEX = 2;
  // document identifier that is set on every token
  private String docId;
  // default language, used for elements that have neither an own nor an inherited lang attribute
  private String language;
  private String[] nwbElements = {};  // non word breaking elements, default: no nwb elements
  private String[] stopElements = {};  // default: no stop elements
  private String outputFormat = "xml";  // default: xml
  private String[] outputOptions = {};
  // flags derived from outputOptions ("withForms" / "withLemmas")
  private boolean withForms = false; 
  private boolean withLemmas = false; 
  private String[] highlightTerms = {};  // highlight terms, default: no highlight terms
  private String[] normFunctions = {};  // default: no norm function
  // flags derived from normFunctions ("norm" / "reg")
  private boolean useNormFunction = false;
  private boolean useRegFunction = false;
  // namespace declarations collected by startPrefixMapping() until the next startElement() takes them over
  private String xmlnsString = "";
  // serialized output, written in endDocument()
  private StringBuilder result = new StringBuilder();
  // tokens collected while the output is built
  private ArrayList<Token> resultTokens = new ArrayList<Token>();
  // elements by element name, filled in Element.buildString()
  private Hashtable<String, ArrayList<Element>> elements = new Hashtable<String, ArrayList<Element>>();
  // first element seen by startElement()
  private Element rootElement;
  // innermost element that is currently open (null outside of the root element)
  private Element currentElement;
  // running number of all elements seen so far
  private int currentPosition = 0;
  // number of "pb" elements seen so far
  private int currentPageNumber = 0;
  // number of "lb" elements seen so far
  private int currentLineNumber = 0;
  // counter per element name over the whole document
  private Hashtable<String, Integer> currentPositions = new Hashtable<String, Integer>();
  // counter per element name, set back to 0 at each "pb" element
  private Hashtable<String, Integer> currentPagePositions = new Hashtable<String, Integer>();
  // stack of the open elements, the last entry is the current element
  private ArrayList<Element> elementQueue;
  
  /**
   * Creates a handler for one document.
   *
   * @param language default language, used for all elements without an own or inherited lang attribute
   * @throws ApplicationException declared for callers; not thrown by this constructor body
   */
  public XmlTokenizerContentHandler(String language) throws ApplicationException {
    this.language = language;
  }

  /**
   * Sets the document identifier that is stored in every token.
   *
   * @param docId document identifier
   */
  public void setDocIdentifier(String docId) {
    this.docId = docId;
  }

  /**
   * Sets the names of the elements that do not break a word (e.g. a line break inside a word).
   * The names are evaluated when an element is created, so they have to be set before parsing.
   *
   * @param nwbElements element names
   */
  public void setNWBElements(String[] nwbElements) {
    this.nwbElements = nwbElements;
  }

  /**
   * Sets the names of the stop elements; words directly inside a stop element get no word tag.
   * The names are evaluated when an element is created, so they have to be set before parsing.
   *
   * @param stopElements element names
   */
  public void setStopElements(String[] stopElements) {
    this.stopElements = stopElements;
  }

  /**
   * Sets the terms whose word tags are wrapped into a "hi" element in the xml output.
   *
   * @param highlightTerms terms to highlight (compared case insensitively)
   */
  public void setHighlightTerms(String[] highlightTerms) {
    this.highlightTerms = highlightTerms;
  }

  /**
   * Sets the normalization functions and derives the flags useNormFunction ("norm") and useRegFunction ("reg").
   * A flag that was switched on by an earlier call stays switched on.
   *
   * @param normFunctions function names; null is accepted and leaves the flags untouched
   */
  public void setNormFunctions(String[] normFunctions) {
    this.normFunctions = normFunctions;
    if (normFunctions == null)
      return;
    for (String function : normFunctions) {
      if (function.equals("norm")) {
        this.useNormFunction = true;
      } else if (function.equals("reg")) {
        this.useRegFunction = true;
      }
    }
  }

  /**
   * Sets the output format. "xml" (the default) produces the document with word tags, every other value is
   * handled as the plain string format.
   *
   * @param outputFormat "xml" or "string"
   */
  public void setOutputFormat(String outputFormat) {
    this.outputFormat = outputFormat;
  }

  /**
   * Sets the output options and derives the flags withForms ("withForms") and withLemmas ("withLemmas").
   * A flag that was switched on by an earlier call stays switched on.
   *
   * @param outputOptions option names; null is handled like an empty array (as setNormFunctions tolerates null)
   */
  public void setOutputOptions(String[] outputOptions) {
    if (outputOptions == null) {
      // formerly a NullPointerException was thrown here; keep the field non null as in its default state
      this.outputOptions = new String[0];
      return;
    }
    this.outputOptions = outputOptions;
    for (int i=0; i< outputOptions.length; i++) {
      String function = outputOptions[i];
      if (function == null)
        continue;  // ignore empty slots instead of failing
      if (function.equals("withForms"))
        this.withForms = true;
      else if (function.equals("withLemmas"))
        this.withLemmas = true;
    }
  }

  /**
   * @return the output that was written in endDocument(); empty before the document end is reached
   */
  public String getResultString() {
    return result.toString();  
  }

  /**
   * @return the internal list of the tokens that were collected while the output was built (not a copy)
   */
  public ArrayList<Token> getResultTokens() {
    return resultTokens;  
  }
  
  /**
   * Delivers the elements with the given name. The underlying table is filled while the output is built
   * (Element.buildString(), started by endDocument()).
   *
   * @param elementName element name as given to startElement()
   * @return the elements with this name or null if there is none
   */
  public ArrayList<Element> getElements(String elementName) {
    return elements.get(elementName);  
  }

  /**
   * @return the number of "pb" elements that were seen so far
   */
  public int getPageCount() {
    return currentPageNumber;  
  }

  /** SAX callback: nothing to do at the document start. */
  public void startDocument() throws SAXException {
  }

  /**
   * SAX callback: builds the output of the whole element tree (starting at the root element) and writes it,
   * followed by a newline, into the result.
   *
   * @throws SAXException if no root element was seen or if building the output fails
   */
  public void endDocument() throws SAXException {
    if (rootElement == null) {
      // explicit check instead of running into a NullPointerException without any message
      throw new SAXException("XmlTokenizerContentHandler: document contains no root element");
    }
    try {
      String rootElemToStr = rootElement.buildString();
      write(rootElemToStr);
      write("\n");
    } catch (NullPointerException e) {
      // still wrapped as before, so that callers only have to handle SAXException
      throw new SAXException(e);
    } catch (ApplicationException e) {
      throw new SAXException(e);
    }
  }
  
  /**
   * SAX callback: appends the text as a "characters" child to the current element.
   * Text outside of any element and empty text is ignored.
   *
   * @param c buffer of the parser
   * @param start index of the first character in the buffer
   * @param length number of characters
   */
  public void characters(char[] c, int start, int length) throws SAXException {
    String text = new String(c, start, length);
    if (text.isEmpty() || currentElement == null)
      return;
    Element textNode = new Element("characters", ELEMENT_TYPE_CHARACTERS);
    textNode.pageNumber = currentPageNumber;
    textNode.value = StringUtils.deresolveXmlEntities(text);
    ArrayList<Element> siblings = currentElement.composites;
    if (siblings == null) {
      siblings = new ArrayList<Element>();
      currentElement.composites = siblings;
    }
    siblings.add(textNode);
  }

  /** SAX callback: ignorable whitespace is dropped. */
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  /** SAX callback: processing instructions are dropped, they do not appear in the output. */
  public void processingInstruction(String target, String data) throws SAXException {
  }

  /** SAX callback: the locator is not used. */
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * SAX callback: collects the namespace declaration as attribute text; the next startElement() call
   * attaches the collected declarations to its element.
   *
   * @param prefix namespace prefix, the empty string denotes the default namespace
   * @param uri namespace uri
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    String declarationName = "".equals(prefix) ? "xmlns" : "xmlns:" + prefix;
    xmlnsString = xmlnsString + declarationName + "=\"" + uri + "\" ";
  }
  
  /** SAX callback: nothing to do, the declarations were already handled in startPrefixMapping(). */
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  /** SAX callback: skipped entities are dropped. */
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * SAX callback: creates a new element, hangs it into the element tree below the current element and
   * makes it the current element. Besides that the position values of the element are calculated
   * (page number, line number, document position, position by name, position in parent, position in page,
   * xpath) and its attributes are serialized.
   *
   * @param uri namespace uri (not used)
   * @param localName local name; used for the detection of "pb" and "lb" and as key of the position counters
   * @param name qualified name; becomes the name of the element
   * @param attrs attributes of the element
   */
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    if (elementQueue == null)
      elementQueue = new ArrayList<Element>();
    Element newElement = new Element(name); // element of type: complex
    if (currentElement != null) {
      if (currentElement.composites == null)
        currentElement.composites = new ArrayList<Element>();
      if (currentElement.lang != null)
        newElement.lang = currentElement.lang;  // language is inherited to childs
      currentElement.composites.add(newElement);
      newElement.parent = currentElement;
    }
    currentElement = newElement;
    if (localName != null && localName.equals("pb")) {
      currentPageNumber++;
      setCurrentPagePosition(localName, 0);  // a new page begins: all page counters start again
    }
    currentElement.pageNumber = currentPageNumber;
    if (localName != null && localName.equals("lb")) {
      currentLineNumber++;
    }
    currentElement.lineNumber = currentLineNumber;
    currentPosition++;
    currentElement.docPosition = currentPosition;
    int newElemPosition = incrementCurrentPosition(localName);
    currentElement.position = newElemPosition;

    // needs the element to be already contained in the composites of its parent (see above)
    currentElement.elemPosition = getElementPosition(currentElement);
    Element parent = currentElement.parent;
    if (parent == null) {
      currentElement.xpath = "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
    } else {
      currentElement.xpath = parent.xpath + "/" + currentElement.name + "[" + currentElement.elemPosition + "]";
    }
    int newElemPagePosition = incrementCurrentPagePosition(localName);
    currentElement.pagePosition = newElemPagePosition;
    int attrSize = attrs.getLength();
    StringBuilder attrStrBuilder = new StringBuilder();
    for (int i=0; i<attrSize; i++) {
      String attrQName = attrs.getQName(i);
      String attrValue = attrs.getValue(i);
      attrValue = StringUtils.forXML(attrValue);
      attrStrBuilder.append(" ").append(attrQName).append("=\"").append(attrValue).append("\"");
      // equalsIgnoreCase does not depend on the default locale (toLowerCase() maps "ID" to a dotless i in a Turkish locale)
      if ("xml:lang".equalsIgnoreCase(attrQName) || "lang".equalsIgnoreCase(attrQName)) {
        currentElement.lang = attrValue;  // if xml:lang is set, it is set to the new element and overwrites values inherited by the father
      }
      if ("xml:id".equalsIgnoreCase(attrQName) || "id".equalsIgnoreCase(attrQName)) {
        currentElement.xmlId = attrValue;
      }
    }
    currentElement.attrString = attrStrBuilder.toString();
    if (! xmlnsString.equals("")) {
      currentElement.xmlnsString = xmlnsString;
    }
    xmlnsString = "";  // the collected namespace declarations belong to this element only
    elementQueue.add(currentElement);
    // only the first element is the root element
    if(rootElement == null)
      rootElement = currentElement;
  }

  /**
   * SAX callback: closes the current element. The element is taken from the stack of the open elements
   * and its parent (or null if no element is open any more) becomes the current element.
   */
  public void endElement(String uri, String localName, String name) throws SAXException {
    if (elementQueue != null && ! elementQueue.isEmpty())
      elementQueue.remove(elementQueue.size() - 1);
    boolean hasOpenElement = elementQueue != null && ! elementQueue.isEmpty();
    currentElement = hasOpenElement ? elementQueue.get(elementQueue.size() - 1) : null;
  }

  /**
   * Increments the document wide counter of the given element name.
   *
   * @param elemName element name (key of the counter)
   * @return the new counter value; 1 for the first element with this name
   */
  private int incrementCurrentPosition(String elemName) {
    Integer previous = currentPositions.get(elemName);
    int next = (previous == null) ? 1 : previous.intValue() + 1;
    currentPositions.put(elemName, Integer.valueOf(next));
    return next;
  }

  /**
   * Calculates the position of the element within the children of its parent which have the same name
   * (e.g. the 6th sentence in a paragraph). Text children are not counted.
   *
   * @param elem element, which has to be contained in the composites of its parent already
   * @return 1 based position; 1 if the element has no parent
   */
  private int getElementPosition(Element elem) {
    Element parent = elem.parent;
    if (parent == null || parent.composites == null)
      return 1;
    int sameNameCount = 0;
    for (Element sibling : parent.composites) {
      if (sibling.isComplex() && sibling.name.equals(elem.name))
        sameNameCount++;
      if (sibling == elem)
        break;  // siblings behind the element do not count
    }
    return sameNameCount;
  }
  
  /**
   * Increments the page wide counter of the given element name.
   *
   * @param elemName element name (key of the counter)
   * @return the new counter value; 1 for the first element with this name on the page
   */
  private int incrementCurrentPagePosition(String elemName) {
    Integer previous = currentPagePositions.get(elemName);
    int next = (previous == null) ? 1 : previous.intValue() + 1;
    currentPagePositions.put(elemName, Integer.valueOf(next));
    return next;
  }
  
  /**
   * Sets the page counters of all element names seen so far to the given value.
   *
   * @param elemName not evaluated: the counters of all names are set
   * @param pos new counter value
   */
  private void setCurrentPagePosition(String elemName, int pos) {
    Integer resetValue = Integer.valueOf(pos);
    // only values of existing keys are replaced, so the enumeration of the keys stays valid
    for (Enumeration<String> names = currentPagePositions.keys(); names.hasMoreElements(); ) {
      currentPagePositions.put(names.nextElement(), resetValue);
    }
  }
  
  /**
   * Tests whether the term is one of the highlight terms (case insensitive).
   *
   * @param term term to test, may be null
   * @return true if the term equals a highlight term; false for null
   */
  private boolean isHighlightTerm(String term) {
    if (term == null)
      return false;
    for (String highlightTerm : highlightTerms) {
      String highlightLowerCase = highlightTerm.toLowerCase();
      if (highlightLowerCase.equals(term.toLowerCase()))
        return true;
    }
    return false;
  }

  /**
   * Tests whether at least one of the terms is one of the highlight terms (case insensitive).
   *
   * @param terms terms to test, may be null
   * @return true if one of the terms equals a highlight term; false for null
   */
  private boolean isHighlightTerm(String[] terms) {
    if (terms == null)
      return false;
    for (String highlightTerm : highlightTerms) {
      String highlightLowerCase = highlightTerm.toLowerCase();
      for (String term : terms) {
        if (highlightLowerCase.equals(term.toLowerCase()))
          return true;
      }
    }
    return false;
  }

  /**
   * Appends the string to the result (see getResultString()).
   *
   * @param outStr string to append
   */
  private void write(String outStr) throws SAXException {
    result.append(outStr);
  }
  
  public class Element implements Comparable<Element> {
    // ELEMENT_TYPE_COMPLEX (xml element) or ELEMENT_TYPE_CHARACTERS (text node)
    private int type;
    // name given to startElement(); "characters" for text nodes
    public String name;
    // namespace declarations of this element as attribute text (null if there are none)
    private String xmlnsString;
    // attributes of this element as text, each with a leading blank
    private String attrString;
    // text of a text node
    private String value;
    public String lang;  // value of attribute xml:lang or the inherited xml:lang value of the father node
    // value of the attribute xml:id or id
    public String xmlId;
    // path from the root element: /name[elemPosition]/name[elemPosition]...
    public String xpath;
    // number of "pb" elements seen up to this element
    public int pageNumber;
    // number of "lb" elements seen up to this element
    public int lineNumber;
    public int docPosition;  // absolute position in document
    public int position;  // position within all elements with this name
    public int elemPosition;  // position in element e.g. the 6 sentence in paragraph
    public int pagePosition; // position in page
    // tokens of the words directly contained in this element
    private ArrayList<Token> tokens = new ArrayList<Token>();
    // children (text nodes and elements) in document order; null as long as there is no child
    private ArrayList<Element> composites;
    private Element parent;
    // set in setName() by the lists stopElements and nwbElements of the handler
    private boolean isStopElement = false;
    private boolean isWordDelimiterElement = true;  // default: is word delimiter element
    
    /**
     * Creates an xml element (type complex).
     *
     * @param name element name
     */
    private Element(String name) {
      this.type = ELEMENT_TYPE_COMPLEX;
      setName(name);
    }

    /**
     * Creates an element of the given type.
     *
     * @param name element name
     * @param type ELEMENT_TYPE_COMPLEX or ELEMENT_TYPE_CHARACTERS
     */
    private Element(String name, int type) {
      this.type = type;
      setName(name);
    }

    /**
     * Sets the name and derives from it whether this element is a stop element and whether it delimits words
     * (names in the handler's stopElements resp. nwbElements list).
     *
     * @param name element name
     */
    private void setName(String name) {
      this.name = name;
      for (String stopElementName : stopElements) {
        if (name.equals(stopElementName)) {
          this.isStopElement = true;
          break;
        }
      }
      for (String nwbElementName : nwbElements) {
        if (name.equals(nwbElementName)) {
          this.isWordDelimiterElement = false;
          break;
        }
      }
    }
    
    /**
     * Orders elements by their position within all elements with the same name.
     *
     * @return -1, 0 or 1
     */
    public int compareTo(Element elem) {
      if (position < elem.position)
        return -1;
      return (position == elem.position) ? 0 : 1;
    }

   private boolean isComplex() {
      // complex = xml element, not complex = text node
      return type == ELEMENT_TYPE_COMPLEX;
    }
    
    /**
     * Delivers the tokens of the direct children followed by the tokens of this element itself.
     * Tokens of deeper descendants are not contained.
     *
     * @return new list, never null
     */
    public ArrayList<Token> getTokens() {
      ArrayList<Token> collected = new ArrayList<Token>();
      if (isComplex() && composites != null) {
        for (Element child : composites) {
          if (child.tokens != null)
            collected.addAll(child.tokens);
        }
      }
      if (tokens != null)
        collected.addAll(tokens);
      return collected;
    }
    
    /**
     * Delivers the contents of the tokens of this element (see getTokens()) as one string.
     *
     * @param type "orig", "reg", "norm" or "morph"
     * @return the token contents, each followed by a blank
     */
    public String getTokensStr(String type) {
      return getTokensStr(type, getTokens());
    }
    
    /**
     * Concatenates the contents of the tokens.
     *
     * @param type selects the content: "orig", "reg", "norm" or "morph"; other values deliver nothing
     * @param tokens tokens to concatenate
     * @return the contents that are not null, each followed by a blank
     */
    private String getTokensStr(String type, ArrayList<Token> tokens) {
      StringBuilder joined = new StringBuilder();
      for (Token token : tokens) {
        String content;
        if (type.equals("orig")) {
          content = token.getContentOrig();
        } else if (type.equals("reg")) {
          content = token.getContentReg();
        } else if (type.equals("norm")) {
          content = token.getContentNorm();
        } else if (type.equals("morph")) {
          content = token.getContentMorph();
        } else {
          content = null;
        }
        if (content != null)
          joined.append(content).append(" ");
      }
      return joined.toString();
    }

    /**
     * Serializes this element and all its descendants as xml without any word tags.
     * In text children newlines are removed and sequences of blanks/tabs are reduced
     * (StringUtils.removeNlTabBlanks).
     *
     * @return the xml string; for a text node its value
     */
    public String toXmlString() throws ApplicationException {
      if (! isComplex())
        return String.valueOf(value);
      StringBuilder xml = new StringBuilder();
      xml.append("<").append(name);
      if (xmlnsString != null && ! xmlnsString.equals(""))
        xml.append(" ").append(xmlnsString);
      xml.append(attrString).append(">");
      if (composites != null) {
        for (Element child : composites) {
          if (child.isComplex()) {
            xml.append(child.toXmlString());
          } else if (child.value != null && ! child.value.equals("")) {
            // newlines are no separators for words; many blanks/tabs become one blank
            xml.append(StringUtils.removeNlTabBlanks(child.value));
          }
        }
      }
      xml.append("</").append(name).append(">");
      return xml.toString();
    }
    
    /**
     * Builds the output of this element and (recursively) of all its descendants.
     * The texts of the direct text children are joined to one string in which each child element is
     * represented by a mark character. This string is tokenized (insertWordTags), so that a word can span
     * over "not word delimiting" child elements. Afterwards the marks are replaced one after the other by the
     * output of the child elements. In xml format the start and end tag of the element are written around it.
     * As a side effect the element is registered in the elements table of the handler.
     *
     * @return the output of this element; for a text node its value
     */
    private String buildString() throws ApplicationException {
      StringBuilder retStrBuilder = new StringBuilder();
      String elemLanguage = language;  // default value for the document/page
      if (lang != null)
        elemLanguage = lang;  // value of the element if available 
      // write this element
      if (! isComplex()) {
        retStrBuilder.append(value);
      } else {
        if (outputFormat != null && outputFormat.equals("xml")) {
          String xmlNsString = this.xmlnsString;
          if (xmlNsString == null || xmlNsString.equals("")) {
            retStrBuilder.append("<" + name + attrString + ">");
          } else { 
            retStrBuilder.append("<" + name + " " + xmlNsString + attrString + ">");
          }
        } else {  // outputFormat == string
          // nothing
        }
        if (composites != null) {
          // text of the direct children; each child element is represented by one mark character
          StringBuilder compositesCharsWithMarks = new StringBuilder();
          // the child elements in the order of their marks
          ArrayList<Element> complexElements = new ArrayList<Element>();
          for (int i=0; i<composites.size(); i++) {
            Element composite = composites.get(i);
            if (! composite.isComplex()) {
              if (composite.value != null && ! composite.value.equals("")) {
                String compositeValueStr = StringUtils.removeNlTabBlanks(composite.value);  // remove all newlines, they are no separators for words. And: if there are many Blanks/Tabs make them to one Blank
                compositesCharsWithMarks.append(compositeValueStr);
              }
            } else {
              if (! composite.isWordDelimiterElement) {
                compositesCharsWithMarks.append(COMPLEX_ELEMENT_NWD_MARK);  // add a special mark symbol at the position of the "not word delimiter element" (e.g. <lb>)
              } else {
                compositesCharsWithMarks.append(COMPLEX_ELEMENT_MARK);  // add a special mark symbol at the position of the "word delimiter element" (e.g. <var>)
              }
              complexElements.add(composite);
            }
          }
          // compositesCharsWithMarks = compositesCharsWithMarks.replaceAll(COMPLEX_ELEMENT_NWD_MARK + " +", COMPLEX_ELEMENT_NWD_MARK);  // remove Blanks after the non word breaking mark (e.g. "praebi<lb/> ta" is changed to "praebi<lb/>ta")
          String compositesCharsWithMarksWithWordTags = insertWordTags(compositesCharsWithMarks, elemLanguage);
          compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.replaceAll(COMPLEX_ELEMENT_NWD_MARK, COMPLEX_ELEMENT_MARK);  // mark symbols are unified to COMPLEX_ELEMENT_MARK to replace them by the element values
          if (complexElements.size() > 0) {
            // the i-th mark belongs to the i-th child element: cut the string at the first remaining mark,
            // write the piece before it and the output of the child, then go on behind the mark
            for (int i=0; i<complexElements.size(); i++) {
              int indexComplexElemCompositesCharsWithMarks = compositesCharsWithMarksWithWordTags.indexOf(COMPLEX_ELEMENT_MARK);
              Element complexElem = complexElements.get(i);
              String complexElementStr = complexElem.buildString();
              String firstPiece = "";
              if (indexComplexElemCompositesCharsWithMarks > 0) {
                firstPiece = compositesCharsWithMarksWithWordTags.substring(0, indexComplexElemCompositesCharsWithMarks);
                compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(indexComplexElemCompositesCharsWithMarks);
              }
              retStrBuilder.append(firstPiece + complexElementStr);
              // NOTE(review): relies on the mark being still present in the string (index >= 0); if a mark got lost
              // during tokenizing, the first character of the rest would be dropped here
              compositesCharsWithMarksWithWordTags = compositesCharsWithMarksWithWordTags.substring(COMPLEX_ELEMENT_MARK_SIZE);
            }
            retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
          } else {
            retStrBuilder.append(compositesCharsWithMarksWithWordTags); // last one must also be added
          }
        }
        if (outputFormat != null && outputFormat.equals("xml")) {
          retStrBuilder.append("</" + name + ">");
        } else {  // outputFormat == string
          // nothing
        }
        // put element into elements name hashtable
        ArrayList<Element> elems = elements.get(name);
        if (elems == null) {
          elems = new ArrayList<Element>();
          elements.put(name, elems);
        }
        elems.add(this);
      } 
      return retStrBuilder.toString();
    }
    
    /**
     * Tokenizes the text of this element and replaces each word by its word tag (see
     * insertWordTags(Token, String, String)). The text between the words is kept in xml format; in string
     * format it is replaced by blanks (marks of child elements are kept, see toBlanks).
     *
     * @param charactersStrDeresolvedBuilder text of the element (xml entities deresolved) with marks for child elements
     * @param language language for the tokenizer and the word tags
     * @return the text with word tags
     */
    private String insertWordTags(StringBuilder charactersStrDeresolvedBuilder, String language) throws ApplicationException {
      String resolvedText = StringUtils.resolveXmlEntities(charactersStrDeresolvedBuilder.toString());
      StringBuilder out = new StringBuilder();
      Tokenizer tokenizer = new Tokenizer(new StringReader(resolvedText));
      tokenizer.setLanguage(language);
      tokenizer.setNormFunctions(new String[] {"norm"});
      ArrayList<Token> wordTokens = tokenizer.getTokens();
      boolean xmlOutput = outputFormat != null && outputFormat.equals("xml");
      int consumedUpTo = 0;  // end position of the previous word
      for (Token token : wordTokens) {
        int wordStart = token.getStart();
        String gap = resolvedText.substring(consumedUpTo, wordStart);
        consumedUpTo = token.getEnd();
        String gapDeresolved = StringUtils.deresolveXmlEntities(gap);
        String origWordForm = resolvedText.substring(wordStart, consumedUpTo);
        String wordTag = insertWordTags(token, language, origWordForm);
        String gapOut = xmlOutput ? gapDeresolved : toBlanks(gapDeresolved);
        out.append(gapOut).append(wordTag);
      }
      // text behind the last word
      String tailDeresolved = StringUtils.deresolveXmlEntities(resolvedText.substring(consumedUpTo));
      out.append(xmlOutput ? tailDeresolved : toBlanks(tailDeresolved));
      return out.toString();
    }

    /**
     * Builds the output of one word. The token is filled with the context of this element (document id,
     * page/line number, positions, element name, xml id). Inside a "reg" element the value of the norm
     * attribute is set as regularized form of the token and its normalization as normalized form.
     * With the options withForms/withLemmas the lemmas and dictionary entries of the word are fetched.
     * <ul>
     * <li>xml format: the word is wrapped into a word tag (and into "hi" if it is a highlight term)</li>
     * <li>string format: the word without special symbols resp. its lemmas; the token is added to the
     * tokens of this element and to the result tokens</li>
     * <li>no language: the word is returned unchanged and the token is registered</li>
     * <li>stop element: the word is returned unchanged (xml) resp. as blanks (string)</li>
     * </ul>
     *
     * @param token token of the word
     * @param language language of the word, may be null
     * @param origWordForm word as it is contained in the text, could contain nwd marks
     * @return output for the word
     */
    private String insertWordTags(Token token, String language, String origWordForm) throws ApplicationException {
      if (origWordForm != null && origWordForm.equals(COMPLEX_ELEMENT_NWD_MARK)) {
        return origWordForm;
      }
      String wordTag = null;
      token.setDocId(docId);
      token.setLanguage(lang);
      token.setPageNumber(pageNumber);
      token.setLineNumber(lineNumber);
      token.setElementPosition(position);
      token.setElementPagePosition(pagePosition);
      token.setElementName(name);
      token.setXmlId(xmlId);
      token.setXpath("xpath");  // TODO NOTE(review): sets the literal "xpath", presumably the xpath of the element is meant - confirm
      if (name != null && name.equals("reg")) {
        String normAttrStart = "norm=\"";
        int normAttrIndex = (attrString == null) ? -1 : attrString.indexOf(normAttrStart);
        if (normAttrIndex >= 0) {
          int regIndexBegin = normAttrIndex + normAttrStart.length();  // first character of the attribute value
          // search the closing quote from the begin of the value: starting one character later (as before) skips
          // the closing quote of an empty value (norm="") and fails or runs into the next attribute
          int regIndexEnd = attrString.indexOf("\"", regIndexBegin);
          if (regIndexEnd >= 0) {
            String reg = attrString.substring(regIndexBegin, regIndexEnd);
            token.setContentReg(reg);
            String[] normFunction = {"norm"};
            Normalizer normalizer = new Normalizer(normFunction, language);
            String normStr = normalizer.normalize(reg);
            token.setContentNorm(normStr);
          }
        }
      }
      if (language == null) {
        token.setContentOrig(origWordForm);  // TODO necessary ?
        tokens.add(token);
        resultTokens.add(token);
        return origWordForm;
      }
      if (isStopElement && outputFormat != null && outputFormat.equals("xml"))
        return origWordForm;
      if (isStopElement && outputFormat != null && outputFormat.equals("string"))
        return toBlanks(origWordForm);
      String wordFormNorm = token.getContentNorm();
      String origWordFormDeresolved = StringUtils.deresolveXmlEntities(origWordForm);
      ArrayList<Lemma> lemmas = null;
      Boolean hasDictionaryEntries = null;  // null: dictionary was not asked
      String lemmasStr = "";
      if (withForms || withLemmas) {
        LexHandler lexHandler = LexHandler.getInstance();
        lemmas = lexHandler.getLemmas(wordFormNorm, "form", language, Normalizer.DICTIONARY, false);  // Performance: needs 15 % of the indexing time
        if (lemmas != null) {
          StringBuilder lemmasStrBuilder = new StringBuilder();
          for (int i=0; i < lemmas.size(); i++) {
            Lemma lemma = lemmas.get(i);
            String lemmaName = lemma.getLemmaName();
            lemmasStrBuilder.append(lemmaName).append(" ");
          }
          lemmasStr = lemmasStrBuilder.toString();
        }
        lemmasStr = lemmasStr.trim();
        token.setContentMorph(lemmasStr);
        hasDictionaryEntries = false;
        ArrayList<String> lexEntries = lexHandler.getLexEntryKeys(wordFormNorm, language, Normalizer.DICTIONARY);  // Performance: needs 15 % of the indexing time
        if (lexEntries != null)
          hasDictionaryEntries = true;
      }
      if (outputFormat != null && outputFormat.equals("xml")) {
        wordTag = insertWordTags(origWordFormDeresolved, token, language, lemmas, hasDictionaryEntries);  // Performance: needs 10 % of the indexing time
        String tokenWordForm = token.getContentOrig();  // word form is in contentOrig
        if (useRegFunction)
          tokenWordForm = token.getContentReg();
        else if (useNormFunction)
          tokenWordForm = token.getContentNorm();
        else if (withLemmas)
          tokenWordForm = token.getContentMorph();
        boolean isHighlightTerm = false;
        if (highlightTerms.length > 0 && ! withLemmas) {
          isHighlightTerm = isHighlightTerm(tokenWordForm);
        } else {
          // with lemmas: the word is highlighted if one of its lemmas is a highlight term
          if (highlightTerms.length > 0 && lemmas != null) {
            String[] lemmasArray = lemmasStr.split(" ");
            isHighlightTerm = isHighlightTerm(lemmasArray);
          }
        }
        if (isHighlightTerm) {
          wordTag = "<hi>" + wordTag + "</hi>";
        }
      } else {  // outputFormat == string
        String inWordFormWithoutSpecialSymbols = removeSpecialSymbols(origWordForm); // without hyphen, blanks, newline, tab
        if (withLemmas) {
          if (lemmas != null) {
            String blanksAndNWBMarksOfOrigWord = toBlanks(origWordFormDeresolved);  // to rescue the NWB marks of the origWord and put it to the beginning of the lemmasStr
            wordTag = blanksAndNWBMarksOfOrigWord + lemmasStr;
            token.setContentMorph(lemmasStr);
          } else {
            wordTag = inWordFormWithoutSpecialSymbols;
          }
        } else {
          wordTag = inWordFormWithoutSpecialSymbols;
        }
        tokens.add(token);
        resultTokens.add(token);
      }
      return wordTag;
    }
    
    /**
     * Removes the characters that do not belong to the word itself.
     *
     * @param inputStr word form
     * @return the word form without blanks, newlines, tabs, minus signs and soft hyphens
     */
    private String removeSpecialSymbols(String inputStr) {
      String retStr = inputStr.replaceAll(" |\n|\t|-|\u00AD", ""); // blank, newline, tab, minus, soft hyphen
      return retStr;
    }

    /**
     * Builds the word tag of one word for the xml format.
     * 
     * @param origWordToken  could contain nwd marks
     * @param token   
     * @param language
     * @param lemmas
     * @param hasDictionaryEntries
     * @return the unchanged input if it is empty or a single nwd mark; in xml format one word tag around the
     *         whole word (nwd marks inside the word stay within the tag); the empty string in every other format
     */
    private String insertWordTags(String origWordToken, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
      if (origWordToken.isEmpty() || origWordToken.equals(COMPLEX_ELEMENT_NWD_MARK))
        return origWordToken;
      boolean xmlOutput = outputFormat != null && outputFormat.equals("xml");
      if (! xmlOutput)
        return "";
      return getWordTag(origWordToken, token, language, lemmas, hasDictionaryEntries);
    }
    
    /**
     * Builds the word tag: the word wrapped into a "w" element with the attributes lang (ISO 639 code), form,
     * formRegularized and formNormalized (if available), forms and lemmas (if lemmas are given and the
     * corresponding option is set) and dictionary (if the dictionary was asked).
     * The given lemma list is sorted by this method.
     *
     * @param origWordForm content of the tag, could contain nwd marks
     * @param token token of the word, delivers the form attributes
     * @param language language of the word
     * @param lemmas lemmas of the word or null
     * @param hasDictionaryEntries null if the dictionary was not asked
     * @return the word tag; the empty string if origWordForm is null or empty
     */
    private String getWordTag(String origWordForm, Token token, String language, ArrayList<Lemma> lemmas, Boolean hasDictionaryEntries) {
      if (origWordForm == null || origWordForm.isEmpty())
        return "";
      String formOrig = token.getContentOrig(); // word form is in contentOrig
      String formRegularized = token.getContentReg();
      String formNormalized = token.getContentNorm();
      String isoCode = Language.getInstance().getISO639Code(language);
      StringBuilder tag = new StringBuilder();
      tag.append("<w").append(" lang=\"").append(isoCode).append("\"");
      tag.append(" form=\"").append(formOrig).append("\"");
      if (formRegularized != null)
        tag.append(" formRegularized=\"").append(formRegularized).append("\"");
      if (formNormalized != null)
        tag.append(" formNormalized=\"").append(formNormalized).append("\"");
      if (lemmas != null) {
        StringBuilder lemmaNames = new StringBuilder();
        StringBuilder formNames = new StringBuilder();
        Collections.sort(lemmas);
        // forms of all lemmas, each form name only once
        Hashtable<String, Form> uniqueForms = new Hashtable<String, Form>();
        for (Lemma lemma : lemmas) {
          for (Form lemmaForm : lemma.getFormsList())
            uniqueForms.put(lemmaForm.getFormName(), lemmaForm);
          lemmaNames.append(lemma.getLemmaName()).append(" ");
        }
        ArrayList<Form> sortedForms = new ArrayList<Form>(uniqueForms.values());
        Collections.sort(sortedForms);
        for (Form sortedForm : sortedForms)
          formNames.append(StringUtils.forXML(sortedForm.getFormName())).append(" ");
        // both lists are blank separated: cut the blank behind the last entry
        String formsStr = formNames.toString();
        if (formsStr.endsWith(" "))
          formsStr = formsStr.substring(0, formsStr.length() - 1);
        String lemmasStr = lemmaNames.toString();
        if (lemmasStr.endsWith(" "))
          lemmasStr = lemmasStr.substring(0, lemmasStr.length() - 1);
        if (withForms)
          tag.append(" forms=\"").append(formsStr).append("\"");
        if (withLemmas)
          tag.append(" lemmas=\"").append(lemmasStr).append("\"");
      }
      if (hasDictionaryEntries != null) {
        String dictionaryValue = hasDictionaryEntries.booleanValue() ? "true" : "false";
        tag.append(" dictionary=\"").append(dictionaryValue).append("\"");
      }
      tag.append(">");
      tag.append(origWordForm);  // origWordForm could contain nwd marks (these are transformed back to elements later in method buildString)
      tag.append("</w>");
      return tag.toString();
    }
    
    /**
     * Replaces each character by a blank, except the marks of child elements: they are kept, so that
     * buildString can still replace them by the child elements.
     *
     * @param inputStr input string
     * @return string of the same length consisting of blanks and marks
     */
    private String toBlanks(String inputStr) {
      StringBuilder blanked = new StringBuilder();
      for (char c : inputStr.toCharArray()) {
        boolean isMark = c == COMPLEX_ELEMENT_NWD_MARK.charAt(0) || c == COMPLEX_ELEMENT_MARK.charAt(0);
        if (isMark) {
          blanked.append(c);
        } else {
          blanked.append(" ");
        }
      }
      return blanked.toString();
    }
    
  }
}