view software/mpdl-services/mpiwg-mpdl-cms/src/de/mpg/mpiwg/berlin/mpdl/cms/transform/HighlightContentHandler.java @ 23:e845310098ba

diverse Korrekturen
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 27 Nov 2012 12:35:19 +0100
parents
children
line wrap: on
line source

package de.mpg.mpiwg.berlin.mpdl.cms.transform;

import java.util.ArrayList;

import org.xml.sax.*;

import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;

/**
 * SAX {@link ContentHandler} that re-serializes the incoming XML event stream into a
 * string buffer and wraps selected parts of it in {@code hi} elements.
 *
 * <p>Two kinds of highlighting are supported and may be combined:
 * <ul>
 *   <li><b>element highlighting</b>: the n-th occurrence (1-based) of the element with the
 *       configured local name is wrapped in {@code <hi type="elem">};</li>
 *   <li><b>hit highlighting</b>: every {@code w} element whose form attribute matches one of
 *       the query terms is wrapped in {@code <hi type="hit">}. If an element name is
 *       configured as well, hits are only marked inside the highlighted element.</li>
 * </ul>
 *
 * <p>The serialized result is available through {@link #getResult()} after parsing.
 * Ignorable whitespace, processing instructions and skipped entities are not copied to
 * the output.
 */
public class HighlightContentHandler implements ContentHandler {
  // namespace declarations collected by startPrefixMapping for the next start tag
  private String xmlnsString = "";
  // local name of the element to highlight (null: no element highlighting)
  private String highlightElemName;
  // 1-based position of the element occurrence to highlight
  private int highlightElemPos = 1;
  // number of highlightElemName occurrences counted so far
  private int currentHighlightElemPos = 0;
  private boolean highlightElemMode = false;
  // nesting depth of open tags inside the highlighted element (1 = the element itself)
  private int highlightElemModeOpenTags = 0;
  private String highlightQueryType = "orig";  // orig, reg, norm or morph
  private String highlightQuery;  // complex Lucene query
  private String highlightQueryForms;  // highlight terms separated by a blank (null: no terms)
  private boolean highlightHitMode = false;
  // nesting depth of open tags inside the highlighted hit (1 = the w element itself)
  private int highlightHitModeOpenTags = 0;
  // in a page fragment: if a page break element is surrounded by an element (e.g. "s")
  // then this element should not increment the currentHighlightElemPos
  private boolean firstPageBreakReachedMode = false;
  private boolean firstPageBreakReached = true;
  private StringBuilder result = new StringBuilder();

  /**
   * Creates a handler that only re-serializes the input without any highlighting.
   */
  public HighlightContentHandler() throws ApplicationException {
  }

  /**
   * Creates a handler that highlights one element occurrence.
   *
   * @param highlightElemName local name of the element to highlight
   * @param highlightElemPos 1-based position of the occurrence to highlight
   */
  public HighlightContentHandler(String highlightElemName, int highlightElemPos) throws ApplicationException {
    this.highlightElemName = highlightElemName;
    this.highlightElemPos = highlightElemPos;
  }

  /**
   * Creates a handler that highlights one element occurrence and/or query hits.
   *
   * @param highlightElemName local name of the element to highlight, may be null
   * @param highlightElemPos 1-based position of the occurrence to highlight
   * @param highlightQueryType orig, reg, norm or morph; selects the {@code w} attribute
   *        that is compared against the query terms
   * @param highlightQuery Lucene query whose terms are highlighted, may be null
   * @param language language passed to the index handler when fetching the query terms
   * @throws ApplicationException declared for the index handler calls
   */
  public HighlightContentHandler(String highlightElemName, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
    this.highlightElemName = highlightElemName;
    this.highlightElemPos = highlightElemPos;
    this.highlightQueryType = highlightQueryType;
    this.highlightQuery = highlightQuery;
    if (highlightQuery != null) {
      IndexHandler indexHandler = IndexHandler.getInstance();
      // all query terms in query (also morphological terms)
      ArrayList<String> queryTerms = indexHandler.fetchTerms(highlightQuery, language);
      highlightQueryForms = toString(queryTerms);
    }
  }

  /**
   * Switches the "first page break" mode. When enabled, occurrences of the highlight
   * element are not counted until the first {@code pb} element has been seen.
   *
   * @param firstPageBreakReachedMode true to ignore elements before the first page break
   */
  public void setFirstPageBreakReachedMode(boolean firstPageBreakReachedMode) {
    this.firstPageBreakReachedMode = firstPageBreakReachedMode;
    if (firstPageBreakReachedMode) {
      // is first set to false and later if a page break is found (by startElement) it is set to true
      this.firstPageBreakReached = false;
    }
  }

  /**
   * @return the buffer holding the serialized (and highlighted) XML; this is the
   *         handler's internal buffer, not a copy
   */
  public StringBuilder getResult() {
    return result;
  }

  @Override
  public void startDocument() throws SAXException {
  }

  @Override
  public void endDocument() throws SAXException {
  }

  /**
   * Copies character data to the output after passing it through
   * {@code StringUtils.deresolveXmlEntities}.
   */
  @Override
  public void characters(char[] c, int start, int length) throws SAXException {
    if (length > 0) {
      String charactersStr = new String(c, start, length);
      write(StringUtils.deresolveXmlEntities(charactersStr));
    }
  }

  @Override
  public void ignorableWhitespace(char[] c, int start, int length) throws SAXException {
  }

  @Override
  public void processingInstruction(String target, String data) throws SAXException {
  }

  @Override
  public void setDocumentLocator(Locator locator) {
  }

  /**
   * Collects a namespace declaration that is written into the next start tag.
   * The default namespace (empty prefix) is emitted as {@code xmlns="..."}; all
   * declarations reported for the same element are kept, independent of their order.
   */
  @Override
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // SAX reports the default namespace with an empty prefix; a null prefix is treated the same way
    if (prefix == null || prefix.isEmpty()) {
      xmlnsString += "xmlns" + "=\"" + uri + "\" ";
    } else {
      xmlnsString += "xmlns:" + prefix + "=\"" + uri + "\" ";
    }
  }

  @Override
  public void endPrefixMapping(String prefix) throws SAXException {
  }

  @Override
  public void skippedEntity(String name) throws SAXException {
  }

  /**
   * Writes the start tag and, if this element starts a highlighted region, the opening
   * {@code hi} tag in front of it.
   */
  @Override
  public void startElement(String uri, String localName, String name, Attributes attrs) throws SAXException {
    String attrString = serializeAttributes(attrs);
    xmlnsString = xmlnsString.trim();
    if ("pb".equals(localName)) {
      firstPageBreakReached = true;
    }
    // start highlight element at position
    if (highlightElemName != null && highlightElemName.equals(localName) && firstPageBreakReached) {
      currentHighlightElemPos++;
      if (currentHighlightElemPos == highlightElemPos && highlightElemModeOpenTags == 0) {
        highlightElemMode = true;
        write("<hi type=\"elem\">");
      }
    }
    if (highlightElemMode) {
      highlightElemModeOpenTags++;
    }
    // start highlight query
    if (highlightQuery != null && "w".equals(localName)) {
      // with an element name configured, hits are only marked inside the highlighted element
      boolean insideAllowedRegion = highlightElemName == null || highlightElemMode;
      if (insideAllowedRegion && highlightHitModeOpenTags == 0 && matchesQueryForms(attrs)) {
        highlightHitMode = true;
        write("<hi type=\"hit\">");
      }
    }
    if (highlightHitMode) {
      highlightHitModeOpenTags++;
    }
    write("<" + name);
    if (! xmlnsString.isEmpty()) {
      write(" " + xmlnsString);
    }
    if (! attrString.isEmpty()) {
      write(" " + attrString);
    }
    write(">");
    // the collected declarations belong to this start tag only
    xmlnsString = "";
  }

  /**
   * Writes the end tag and closes a highlighted region when the element that opened it
   * is closed.
   */
  @Override
  public void endElement(String uri, String localName, String name) throws SAXException {
    write("</" + name + ">");
    // end highlight element at position
    if (highlightElemMode) {
      if (highlightElemModeOpenTags == 1) {
        highlightElemMode = false;
        write("</hi>");
      }
      highlightElemModeOpenTags--;
    }
    // end highlight query
    if (highlightHitMode) {
      if (highlightHitModeOpenTags == 1) {
        highlightHitMode = false;
        write("</hi>");
      }
      highlightHitModeOpenTags--;
    }
  }

  /**
   * Serializes all attributes as {@code qName="value"} pairs separated by a blank;
   * values are escaped with {@code StringUtils.forXML}.
   *
   * @return the serialized attributes without leading or trailing blanks, empty if none
   */
  private String serializeAttributes(Attributes attrs) {
    StringBuilder attrBuilder = new StringBuilder();
    int attrSize = attrs.getLength();
    for (int i=0; i<attrSize; i++) {
      String attrValue = StringUtils.forXML(attrs.getValue(i));
      attrBuilder.append(" ").append(attrs.getQName(i)).append("=\"").append(attrValue).append("\"");
    }
    return attrBuilder.toString().trim();
  }

  /**
   * Tests whether the form attribute selected by the query type matches one of the
   * query terms. A term ending with "*" matches as a prefix, any other term must be equal.
   *
   * @return true if a term matches; false if there are no terms or no attribute value
   */
  private boolean matchesQueryForms(Attributes attrs) {
    if (highlightQueryForms == null) {
      return false;  // the query delivered no terms
    }
    // unknown (or null) query types fall back to the original form
    String attrQName = "form";
    if ("reg".equals(highlightQueryType)) {
      attrQName = "formRegularized";
    } else if ("norm".equals(highlightQueryType)) {
      attrQName = "formNormalized";
    } else if ("morph".equals(highlightQueryType)) {
      attrQName = "lemmas";
    }
    String attrValue = getAttrValue(attrs, attrQName);
    if ("reg".equals(highlightQueryType) && attrValue == null) {
      attrValue = getAttrValue(attrs, "form"); // if no regularized form exist it takes the form
    }
    if (attrValue == null) {
      return false;
    }
    String[] forms = highlightQueryForms.split(" ");
    for (int i=0; i<forms.length; i++) {
      String form = forms[i];
      if (form.endsWith("*")) {  // TODO support middle wildcard queries: bla*bla bla?bla
        if (attrValue.startsWith(form.replace("*", ""))) {
          return true;
        }
      } else if (attrValue.equals(form)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Joins the query forms with single blanks.
   *
   * @return the joined forms, or null if the list is null or empty
   */
  private String toString(ArrayList<String> queryForms) {
    if (queryForms == null || queryForms.isEmpty()) {
      return null;
    }
    StringBuilder queryFormsBuilder = new StringBuilder();
    for (int i=0; i<queryForms.size(); i++) {
      if (i > 0) {
        queryFormsBuilder.append(" ");
      }
      queryFormsBuilder.append(queryForms.get(i));
    }
    return queryFormsBuilder.toString();
  }

  /** Appends the given string to the result buffer. */
  private void write(String outStr) throws SAXException {
    result.append(outStr);
  }

  /**
   * Looks up an attribute value by its qualified name.
   *
   * @return the value of the first attribute with that qualified name, or null if absent
   */
  private String getAttrValue(Attributes attrs, String attrQName) {
    int attrSize = attrs.getLength();
    for (int i=0; i<attrSize; i++) {
      if (attrQName.equals(attrs.getQName(i))) {
        return attrs.getValue(i);
      }
    }
    return null;
  }
}