/*
 * view software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java @ 25:e9fe3186670c default tip
 *
 * commit message: "letzter Stand eingecheckt" (German for "latest state checked in")
 * author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
 * date Tue, 21 May 2013 10:19:32 +0200
 * parents
 * children
 * line wrap: on
 * line source
 */

package de.mpg.mpiwg.berlin.mpdl.servlets.cms;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.lucene.document.Fieldable;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.cms.document.Document;
import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

public class QueryDocument extends HttpServlet {
  private static final long serialVersionUID = 1L;
  private PageTransformer pageTransformer = null;
  
  public QueryDocument() {
    super();
  }

  public void init(ServletConfig config) throws ServletException  {
    super.init(config);
    ServletContext context = getServletContext();
    pageTransformer = (PageTransformer) context.getAttribute("pageTransformer");
  }

  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    doGet(request, response);
  }  

  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    request.setCharacterEncoding("utf-8");
    response.setCharacterEncoding("utf-8");
    String docId = request.getParameter("docId");
    String query = request.getParameter("query");
    String[] normFunctions = {"none"};
    if (query.contains("tokenReg"))  // TODO ordentlich behandeln
      normFunctions[0] = "reg";
    else if (query.contains("tokenNorm"))  // TODO ordentlich behandeln
      normFunctions[0] = "norm";
    String[] outputOptions = {};
    if (query.contains("tokenMorph")) {  // TODO ordentlich behandeln
      outputOptions = new String[1];
      outputOptions[0] = "withLemmas";
    }
    String pageStr = request.getParameter("page");
    if (pageStr == null)
      pageStr = "1";
    int page = Integer.parseInt(pageStr);
    String pageSizeStr = request.getParameter("pageSize");
    if (pageSizeStr == null)
      pageSizeStr = "10";
    int pageSize = Integer.parseInt(pageSizeStr);
    int from = (page * pageSize) - pageSize;  // e.g. 0
    int to = page * pageSize - 1;  // e.g. 9
    String outputFormat = request.getParameter("outputFormat");
    if (outputFormat == null)
      outputFormat = "xml";
    try {
      IndexHandler indexHandler = IndexHandler.getInstance();
      Hits hits = indexHandler.queryDocument(docId, query, from, to);
      MetadataRecord docMetadataRecord = indexHandler.getDocMetadata(docId);
      if (outputFormat.equals("xml"))
        response.setContentType("text/xml");
      else if (outputFormat.equals("html"))
        response.setContentType("text/html");
      else 
        response.setContentType("text/xml");
      PrintWriter out = response.getWriter();
      String resultStr = "";
      if (outputFormat.equals("xml"))
        resultStr = createXmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits);
      else if (outputFormat.equals("html"))
        resultStr = createHtmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits, request);
      out.print(resultStr);
      out.close();
    } catch (ApplicationException e) {
      throw new ServletException(e);
    }
  }

  private String createXmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits) throws ApplicationException {
    String docId = docMetadataRecord.getDocId();
    ArrayList<Document> docs = null;
    if (hits != null)
      docs = hits.getHits();
    int hitsSize = -1;
    int docsSize = -1;
    if (hits != null)
      hitsSize = hits.getSize();
    if (docs != null)
      docsSize = docs.size();
    StringBuilder xmlStrBuilder = new StringBuilder();
    xmlStrBuilder.append("<document>");
    xmlStrBuilder.append("<id>" + docId + "</id>");
    xmlStrBuilder.append("<query>");
    xmlStrBuilder.append("<queryText>" + query + "</queryText>");
    xmlStrBuilder.append("<resultPage>" + page + "</resultPage>");
    xmlStrBuilder.append("<resultPageSize>" + pageSize + "</resultPageSize>");
    xmlStrBuilder.append("</query>");
    xmlStrBuilder.append("<hitsSize>" + hitsSize + "</hitsSize>");
    xmlStrBuilder.append("<hits>");
    for (int i=0; i<docsSize; i++) {
      Document doc = docs.get(i);
      int num = (page - 1) * pageSize + i + 1;
      xmlStrBuilder.append("<hit>");
      xmlStrBuilder.append("<num>" + num + "</num>");
      String pageNumber = null;
      Fieldable fPageNumber = doc.getFieldable("pageNumber");
      if (fPageNumber != null) {
        pageNumber = fPageNumber.stringValue();
        xmlStrBuilder.append("<pageNumber>" + pageNumber + "</pageNumber>");
      }
      String elementPagePosition = null;
      Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition");
      if (fElementPagePosition != null) {
        elementPagePosition = fElementPagePosition.stringValue();
        xmlStrBuilder.append("<pagePosition>" + elementPagePosition + "</pagePosition>");
      }
      String lineNumber = null;
      Fieldable fLineNumber = doc.getFieldable("lineNumber");
      if (fLineNumber != null) {
        lineNumber = fLineNumber.stringValue();
        xmlStrBuilder.append("<lineNumber>" + lineNumber + "</lineNumber>");
      }
      String elementPosition = null;
      Fieldable fElementPosition = doc.getFieldable("elementAbsolutePosition");
      if (fElementPosition != null) {
        elementPosition = fElementPosition.stringValue();
        xmlStrBuilder.append("<absolutePosition>" + elementPosition + "</absolutePosition>");
      }
      String xpath = null;
      Fieldable fXPath = doc.getFieldable("xpath");
      if (fXPath != null) {
        xpath = fXPath.stringValue();
        xmlStrBuilder.append("<xpath>" + xpath + "</xpath>");
      }
      String xmlId = null;
      Fieldable fXmlId = doc.getFieldable("xmlId");
      if (fXmlId != null) {
        xmlId = fXmlId.stringValue();
        xmlStrBuilder.append("<xmlId>" + xmlId + "</xmlId>");
      }
      String language = null;
      Fieldable fLanguage = doc.getFieldable("language");
      if (fLanguage != null) {
        language = fLanguage.stringValue();
        xmlStrBuilder.append("<language>" + language + "</language>");
      }
      String xmlContentTokenized = null;
      Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized");
      if (fXmlContentTokenized != null) {
        String highlightQueryType = "orig";
        if (withLemmas(outputOptions)) {
          highlightQueryType = "morph";
        } else if (normFunctions != null) { 
          String normFunction = normFunctions[0];
          highlightQueryType = normFunction;
          if (normFunction.equals("none")) {
            highlightQueryType = "orig";
          }
        }
        xmlContentTokenized = fXmlContentTokenized.stringValue();
        String xmlPre = "<content xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">";
        String xmlPost = "</content>";
        String xmlInputStr = xmlPre + xmlContentTokenized + xmlPost;
        String docLanguage = docMetadataRecord.getLanguage();
        String highlightedXmlStr = highlight(xmlInputStr, highlightQueryType, query, docLanguage);
        if (highlightedXmlStr == null)
          highlightedXmlStr = "<content>" + xmlContentTokenized + "</content>";
        xmlStrBuilder.append(highlightedXmlStr);
      }
      xmlStrBuilder.append("</hit>");
    }
    xmlStrBuilder.append("</hits>");
    xmlStrBuilder.append("</document>");
    return xmlStrBuilder.toString();   
  }
  
  private String createHtmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits, HttpServletRequest request) throws ApplicationException {
    String docId = docMetadataRecord.getDocId();
    ArrayList<Document> docs = null;
    if (hits != null)
      docs = hits.getHits();
    int hitsSize = -1;
    int docsSize = -1;
    if (hits != null)
      hitsSize = hits.getSize();
    if (docs != null)
      docsSize = docs.size();
    String highlightQueryType = "orig";
    String normalizationStr = "";
    String highlightQueryTypeStr = "";
    if (withLemmas(outputOptions)) {
      highlightQueryTypeStr = "&amp;highlightQueryType=norm";
      highlightQueryType = "norm";
    } else if (normFunctions != null) { 
      String normFunction = normFunctions[0];
      normalizationStr = "&amp;normalization=" + normFunction;
      highlightQueryType = normFunction;
      if (normFunction.equals("none")) {
        normalizationStr = "&amp;normalization=" + "orig";
        highlightQueryType = "orig";
      }
    }
    StringBuilder xmlStrBuilder = new StringBuilder();
    xmlStrBuilder.append("<html>");
    xmlStrBuilder.append("<head>");
    xmlStrBuilder.append("<title>Document: \"" + docId + " " + query + "\"</title>");
    String baseUrl = getBaseUrl(request);
    String cssUrl = baseUrl + "/css/page.css";
    String cssShowWordFileName = "pageOrig.css";
    if (highlightQueryType.equals("reg"))
      cssShowWordFileName = "pageReg.css"; 
    else if (highlightQueryType.equals("norm"))
      cssShowWordFileName = "pageNorm.css";
    String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName;
    xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + showWordCssUrl + "\"/>");
    xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + cssUrl + "\"/>");
    xmlStrBuilder.append("</head>");
    xmlStrBuilder.append("<body>");
    xmlStrBuilder.append("<span class=\"about\">[<span class=\"it\">This is a MPIWG CMS technology service</span>] <a href=\"/mpiwg-mpdl-cms-web/index.html\"><img src=\"/mpiwg-mpdl-cms-web/images/info.png\" valign=\"bottom\" width=\"15\" height=\"15\" border=\"0\" alt=\"MPIWG CMS service\"/></a></span>");
    xmlStrBuilder.append("<span class=\"query\">Query: " + query + "</span>");
    xmlStrBuilder.append("<span class=\"result\">");
    xmlStrBuilder.append("<span class=\"resultPage\">" + page + "</span>");
    xmlStrBuilder.append("<span class=\"resultPageSize\">" + pageSize + "</span>");
    xmlStrBuilder.append("<span class=\"hitsSize\">" + hitsSize + "</span>");
    xmlStrBuilder.append("</span>");
    xmlStrBuilder.append("<table>");
    for (int i=0; i<docsSize; i++) {
      xmlStrBuilder.append("<tr class=\"hit\">");
      Document doc = docs.get(i);
      int num = (page - 1) * pageSize + i + 1;
      xmlStrBuilder.append("<td class=\"hitNum\">" + num + ". " + "</td>");
      xmlStrBuilder.append("<td class=\"hitLink\">");
      String posStr = "";
      String pageNumber = "";
      Fieldable fPageNumber = doc.getFieldable("pageNumber");
      if (fPageNumber != null) {
        pageNumber = fPageNumber.stringValue();
        posStr = posStr + "Page " + pageNumber + ", ";
      }
      String elementName = null;
      String presElementName = "";
      Fieldable fElementName = doc.getFieldable("elementName");
      if (fElementName != null) {
        elementName = fElementName.stringValue();
        presElementName = getPresentationName(elementName);
      }
      String elementPagePosition = "";
      Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition");
      if (fElementPagePosition != null) {
        elementPagePosition = fElementPagePosition.stringValue();
        posStr = posStr + presElementName + " " + elementPagePosition + ":";
      }
      String language = docMetadataRecord.getLanguage();
      String getPageLink = baseUrl + "/query/GetPage?docId=" + docId + "&amp;page=" + pageNumber + normalizationStr + "&amp;highlightElem=" + elementName + "&amp;highlightElemPos=" + elementPagePosition + highlightQueryTypeStr + "&amp;highlightQuery=" + query + "&amp;language=" + language;
      xmlStrBuilder.append("<a href=\"" + getPageLink + "\">" + posStr + "</a>");
      xmlStrBuilder.append("</td>");
      String xmlContentTokenized = null;
      Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized");
      if (fXmlContentTokenized != null) {
        xmlContentTokenized = fXmlContentTokenized.stringValue();
        String highlightedXmlStr = highlight(xmlContentTokenized, highlightQueryType, query, language);  
        String highlightHtmlStr = pageTransformer.transform(highlightedXmlStr, docMetadataRecord, -1, "html");  // TODO performance: do not highlight each single node but highlight them all in one step
        xmlStrBuilder.append("<td class=\"hitContent\">");
        xmlStrBuilder.append(highlightHtmlStr);
        xmlStrBuilder.append("</td>");
      }
      xmlStrBuilder.append("</tr>");
    }
    xmlStrBuilder.append("</table>");
    xmlStrBuilder.append("</body>");
    xmlStrBuilder.append("</html>");
    return xmlStrBuilder.toString();   
  }
  
  private String highlight(String xmlStr, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
    String result = null;
    try {
      HighlightContentHandler highlightContentHandler = new HighlightContentHandler(null, -1, highlightQueryType, highlightQuery, language);
      highlightContentHandler.setFirstPageBreakReachedMode(true);
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(highlightContentHandler);
      StringReader stringReader = new StringReader(xmlStr);
      InputSource inputSource = new InputSource(stringReader);
      xmlParser.parse(inputSource);
      result = highlightContentHandler.getResult().toString();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return result;
  }

  private String getPresentationName(String elemName) {
    String retStr = null;
    if (elemName != null) {
      if (elemName.equals("s")) {
        retStr = "Sentence";
      } else {
        // first char to uppercase
        char[] stringArray = elemName.toCharArray();
        stringArray[0] = Character.toUpperCase(stringArray[0]);
        retStr = new String(stringArray);
      }
    }
    return retStr;
  }
  
  private String getBaseUrl(HttpServletRequest request) {
    return getServerUrl(request) + request.getContextPath();
  }

  private String getServerUrl(HttpServletRequest request) {
    if ( ( request.getServerPort() == 80 ) || ( request.getServerPort() == 443 ) )
      return request.getScheme() + "://" + request.getServerName();
    else
      return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort();
  }

  private boolean withLemmas(String[] outputOptions) {
    boolean result = false;
    for (int i=0; i< outputOptions.length; i++) {
      String function = outputOptions[i];
      if (function.equals("withLemmas"))
        return true;
    }
    return result;
  }

}