diff software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java	Tue May 21 10:19:32 2013 +0200
@@ -0,0 +1,350 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.cms;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.lucene.document.Fieldable;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.cms.document.Document;
+import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits;
+import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
+import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+
+/**
+ * Servlet that runs a query inside one indexed document and renders the hits
+ * either as XML (default) or as an HTML page.
+ *
+ * Request parameters read by this servlet:
+ * <ul>
+ *   <li>docId - id of the document to search (required)</li>
+ *   <li>query - query string (required)</li>
+ *   <li>page - 1-based result page, defaults to 1</li>
+ *   <li>pageSize - number of hits per page, defaults to 10</li>
+ *   <li>outputFormat - "html" for HTML; any other value (or none) yields XML</li>
+ * </ul>
+ */
+public class QueryDocument extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+  private static final int DEFAULT_PAGE = 1;
+  private static final int DEFAULT_PAGE_SIZE = 10;
+  // wrapper that declares the namespace prefixes before a hit's XML fragment is parsed for highlighting
+  private static final String XML_CONTENT_PRE = "<content xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">";
+  private static final String XML_CONTENT_POST = "</content>";
+  private PageTransformer pageTransformer = null;
+
+  public QueryDocument() {
+    super();
+  }
+
+  /**
+   * Initializes the servlet and fetches the transformer stored in the servlet
+   * context under the attribute name "pageTransformer".
+   */
+  @Override
+  public void init(ServletConfig config) throws ServletException  {
+    super.init(config);
+    ServletContext context = getServletContext();
+    pageTransformer = (PageTransformer) context.getAttribute("pageTransformer");
+  }
+
+  /** POST requests are handled exactly like GET requests. */
+  @Override
+  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    doGet(request, response);
+  }
+
+  /**
+   * Validates the request parameters, queries the index and writes the result.
+   * Missing or malformed parameters are answered with HTTP 400 instead of
+   * failing with an unchecked exception.
+   *
+   * @throws ServletException if the index lookup or the rendering fails with an ApplicationException
+   * @throws IOException if the response cannot be written
+   */
+  @Override
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding("utf-8");
+    response.setCharacterEncoding("utf-8");
+    String docId = request.getParameter("docId");
+    String query = request.getParameter("query");
+    if (docId == null || query == null) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters \"docId\" and \"query\" are required");
+      return;
+    }
+    String[] normFunctions = {"none"};
+    if (query.contains("tokenReg")) {  // TODO handle properly
+      normFunctions[0] = "reg";
+    } else if (query.contains("tokenNorm")) {  // TODO handle properly
+      normFunctions[0] = "norm";
+    }
+    String[] outputOptions = {};
+    if (query.contains("tokenMorph")) {  // TODO handle properly
+      outputOptions = new String[] {"withLemmas"};
+    }
+    int page;
+    int pageSize;
+    try {
+      page = parseIntParameter(request, "page", DEFAULT_PAGE);
+      pageSize = parseIntParameter(request, "pageSize", DEFAULT_PAGE_SIZE);
+    } catch (NumberFormatException e) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters \"page\" and \"pageSize\" must be integers");
+      return;
+    }
+    // reject values that would give a negative range or overflow page * pageSize
+    if (page < 1 || pageSize < 1 || page > Integer.MAX_VALUE / pageSize) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters \"page\" and \"pageSize\" are out of range");
+      return;
+    }
+    int from = (page * pageSize) - pageSize;  // e.g. 0
+    int to = page * pageSize - 1;  // e.g. 9
+    // everything except "html" is rendered as XML, matching the content type that is sent
+    boolean htmlOutput = "html".equals(request.getParameter("outputFormat"));
+    try {
+      IndexHandler indexHandler = IndexHandler.getInstance();
+      Hits hits = indexHandler.queryDocument(docId, query, from, to);
+      MetadataRecord docMetadataRecord = indexHandler.getDocMetadata(docId);
+      if (docMetadataRecord == null) {
+        // the renderers dereference the record; presumably null means the document is not indexed - TODO confirm against IndexHandler
+        response.sendError(HttpServletResponse.SC_NOT_FOUND, "No metadata found for the given docId");
+        return;
+      }
+      String resultStr;
+      if (htmlOutput) {
+        response.setContentType("text/html");
+        resultStr = createHtmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits, request);
+      } else {
+        response.setContentType("text/xml");
+        resultStr = createXmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits);
+      }
+      PrintWriter out = response.getWriter();
+      out.print(resultStr);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /**
+   * Reads an integer request parameter.
+   *
+   * @return the parsed value, or defaultValue when the parameter is absent
+   * @throws NumberFormatException if the parameter is present but not an integer
+   */
+  private static int parseIntParameter(HttpServletRequest request, String name, int defaultValue) {
+    String value = request.getParameter(name);
+    if (value == null) {
+      return defaultValue;
+    }
+    return Integer.parseInt(value.trim());
+  }
+
+  /**
+   * Renders the hits as a "document" XML element. Text values are escaped;
+   * the tokenized XML content of each hit is inserted as markup after
+   * highlighting.
+   */
+  private String createXmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits) throws ApplicationException {
+    String docId = docMetadataRecord.getDocId();
+    ArrayList<Document> docs = null;
+    int hitsSize = -1;  // -1 is reported when there is no Hits object
+    if (hits != null) {
+      docs = hits.getHits();
+      hitsSize = hits.getSize();
+    }
+    int docsSize = (docs == null) ? 0 : docs.size();
+    // the highlight type depends only on the request, so it is determined once for all hits
+    String highlightQueryType = "orig";
+    String normFunction = firstNormFunction(normFunctions);
+    if (withLemmas(outputOptions)) {
+      highlightQueryType = "morph";
+    } else if (normFunction != null && !normFunction.equals("none")) {
+      highlightQueryType = normFunction;
+    }
+    String docLanguage = docMetadataRecord.getLanguage();
+    int firstNum = (page - 1) * pageSize + 1;
+    StringBuilder xmlStrBuilder = new StringBuilder();
+    xmlStrBuilder.append("<document>");
+    xmlStrBuilder.append("<id>").append(escapeXmlText(docId)).append("</id>");
+    xmlStrBuilder.append("<query>");
+    xmlStrBuilder.append("<queryText>").append(escapeXmlText(query)).append("</queryText>");
+    xmlStrBuilder.append("<resultPage>").append(page).append("</resultPage>");
+    xmlStrBuilder.append("<resultPageSize>").append(pageSize).append("</resultPageSize>");
+    xmlStrBuilder.append("</query>");
+    xmlStrBuilder.append("<hitsSize>").append(hitsSize).append("</hitsSize>");
+    xmlStrBuilder.append("<hits>");
+    for (int i = 0; i < docsSize; i++) {
+      Document doc = docs.get(i);
+      xmlStrBuilder.append("<hit>");
+      xmlStrBuilder.append("<num>").append(firstNum + i).append("</num>");
+      appendFieldElement(xmlStrBuilder, doc, "pageNumber", "pageNumber");
+      appendFieldElement(xmlStrBuilder, doc, "elementPagePosition", "pagePosition");
+      appendFieldElement(xmlStrBuilder, doc, "lineNumber", "lineNumber");
+      appendFieldElement(xmlStrBuilder, doc, "elementAbsolutePosition", "absolutePosition");
+      appendFieldElement(xmlStrBuilder, doc, "xpath", "xpath");
+      appendFieldElement(xmlStrBuilder, doc, "xmlId", "xmlId");
+      appendFieldElement(xmlStrBuilder, doc, "language", "language");
+      Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized");
+      if (fXmlContentTokenized != null) {
+        String xmlContentTokenized = fXmlContentTokenized.stringValue();
+        String xmlInputStr = XML_CONTENT_PRE + xmlContentTokenized + XML_CONTENT_POST;
+        String highlightedXmlStr = highlight(xmlInputStr, highlightQueryType, query, docLanguage);
+        if (highlightedXmlStr == null) {
+          highlightedXmlStr = "<content>" + xmlContentTokenized + "</content>";
+        }
+        xmlStrBuilder.append(highlightedXmlStr);
+      }
+      xmlStrBuilder.append("</hit>");
+    }
+    xmlStrBuilder.append("</hits>");
+    xmlStrBuilder.append("</document>");
+    return xmlStrBuilder.toString();
+  }
+
+  /**
+   * Appends an element named elementName holding the escaped string value of
+   * the given index field; appends nothing when the hit has no such field.
+   */
+  private static void appendFieldElement(StringBuilder builder, Document doc, String fieldName, String elementName) {
+    Fieldable field = doc.getFieldable(fieldName);
+    if (field == null) {
+      return;
+    }
+    builder.append("<").append(elementName).append(">");
+    builder.append(escapeXmlText(field.stringValue()));
+    builder.append("</").append(elementName).append(">");
+  }
+
+  /**
+   * Renders the hits as an HTML page: one table row per hit with a link to
+   * the GetPage servlet and the highlighted, transformed hit content.
+   * Request-derived text is escaped and link parameters are URL-encoded.
+   */
+  private String createHtmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits, HttpServletRequest request) throws ApplicationException {
+    String docId = docMetadataRecord.getDocId();
+    ArrayList<Document> docs = null;
+    int hitsSize = -1;  // -1 is reported when there is no Hits object
+    if (hits != null) {
+      docs = hits.getHits();
+      hitsSize = hits.getSize();
+    }
+    int docsSize = (docs == null) ? 0 : docs.size();
+    String highlightQueryType = "orig";
+    String normalizationStr = "";
+    String highlightQueryTypeStr = "";
+    String normFunction = firstNormFunction(normFunctions);
+    if (withLemmas(outputOptions)) {
+      highlightQueryTypeStr = "&amp;highlightQueryType=norm";
+      highlightQueryType = "norm";
+    } else if (normFunction != null) {
+      if (!normFunction.equals("none")) {
+        highlightQueryType = normFunction;
+      }
+      normalizationStr = "&amp;normalization=" + urlEncode(highlightQueryType);
+    }
+    String baseUrl = getBaseUrl(request);
+    String cssUrl = baseUrl + "/css/page.css";
+    String cssShowWordFileName = "pageOrig.css";
+    if (highlightQueryType.equals("reg")) {
+      cssShowWordFileName = "pageReg.css";
+    } else if (highlightQueryType.equals("norm")) {
+      cssShowWordFileName = "pageNorm.css";
+    }
+    String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName;
+    String language = docMetadataRecord.getLanguage();
+    int firstNum = (page - 1) * pageSize + 1;
+    StringBuilder xmlStrBuilder = new StringBuilder();
+    xmlStrBuilder.append("<html>");
+    xmlStrBuilder.append("<head>");
+    xmlStrBuilder.append("<title>Document: \"" + escapeXmlText(docId) + " " + escapeXmlText(query) + "\"</title>");
+    xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeXmlAttribute(showWordCssUrl) + "\"/>");
+    xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeXmlAttribute(cssUrl) + "\"/>");
+    xmlStrBuilder.append("</head>");
+    xmlStrBuilder.append("<body>");
+    xmlStrBuilder.append("<span class=\"about\">[<span class=\"it\">This is a MPIWG CMS technology service</span>] <a href=\"/mpiwg-mpdl-cms-web/index.html\"><img src=\"/mpiwg-mpdl-cms-web/images/info.png\" valign=\"bottom\" width=\"15\" height=\"15\" border=\"0\" alt=\"MPIWG CMS service\"/></a></span>");
+    xmlStrBuilder.append("<span class=\"query\">Query: " + escapeXmlText(query) + "</span>");
+    xmlStrBuilder.append("<span class=\"result\">");
+    xmlStrBuilder.append("<span class=\"resultPage\">" + page + "</span>");
+    xmlStrBuilder.append("<span class=\"resultPageSize\">" + pageSize + "</span>");
+    xmlStrBuilder.append("<span class=\"hitsSize\">" + hitsSize + "</span>");
+    xmlStrBuilder.append("</span>");
+    xmlStrBuilder.append("<table>");
+    for (int i = 0; i < docsSize; i++) {
+      Document doc = docs.get(i);
+      xmlStrBuilder.append("<tr class=\"hit\">");
+      xmlStrBuilder.append("<td class=\"hitNum\">" + (firstNum + i) + ". " + "</td>");
+      xmlStrBuilder.append("<td class=\"hitLink\">");
+      String posStr = "";
+      String pageNumber = "";
+      Fieldable fPageNumber = doc.getFieldable("pageNumber");
+      if (fPageNumber != null) {
+        pageNumber = fPageNumber.stringValue();
+        posStr = posStr + "Page " + pageNumber + ", ";
+      }
+      String elementName = null;
+      String presElementName = "";
+      Fieldable fElementName = doc.getFieldable("elementName");
+      if (fElementName != null) {
+        elementName = fElementName.stringValue();
+        presElementName = getPresentationName(elementName);
+      }
+      String elementPagePosition = "";
+      Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition");
+      if (fElementPagePosition != null) {
+        elementPagePosition = fElementPagePosition.stringValue();
+        posStr = posStr + presElementName + " " + elementPagePosition + ":";
+      }
+      // parameter values are URL-encoded; the separators stay "&amp;" because the link is written into an attribute
+      StringBuilder getPageLink = new StringBuilder();
+      getPageLink.append(escapeXmlAttribute(baseUrl));
+      getPageLink.append("/query/GetPage?docId=").append(urlEncode(docId));
+      getPageLink.append("&amp;page=").append(urlEncode(pageNumber));
+      getPageLink.append(normalizationStr);
+      getPageLink.append("&amp;highlightElem=").append(urlEncode(elementName));
+      getPageLink.append("&amp;highlightElemPos=").append(urlEncode(elementPagePosition));
+      getPageLink.append(highlightQueryTypeStr);
+      getPageLink.append("&amp;highlightQuery=").append(urlEncode(query));
+      getPageLink.append("&amp;language=").append(urlEncode(language));
+      xmlStrBuilder.append("<a href=\"" + getPageLink + "\">" + escapeXmlText(posStr) + "</a>");
+      xmlStrBuilder.append("</td>");
+      Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized");
+      if (fXmlContentTokenized != null) {
+        String xmlContentTokenized = fXmlContentTokenized.stringValue();
+        String highlightedXmlStr = highlight(xmlContentTokenized, highlightQueryType, query, language);
+        String highlightHtmlStr = pageTransformer.transform(highlightedXmlStr, docMetadataRecord, -1, "html");  // TODO performance: do not highlight each single node but highlight them all in one step
+        xmlStrBuilder.append("<td class=\"hitContent\">");
+        xmlStrBuilder.append(highlightHtmlStr);
+        xmlStrBuilder.append("</td>");
+      }
+      xmlStrBuilder.append("</tr>");
+    }
+    xmlStrBuilder.append("</table>");
+    xmlStrBuilder.append("</body>");
+    xmlStrBuilder.append("</html>");
+    return xmlStrBuilder.toString();
+  }
+
+  /**
+   * Parses xmlStr with a SAX parser whose content handler is a
+   * HighlightContentHandler and returns that handler's result as a string.
+   *
+   * @throws ApplicationException wrapping any SAXException or IOException raised while parsing
+   */
+  private String highlight(String xmlStr, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
+    String result = null;
+    try {
+      HighlightContentHandler highlightContentHandler = new HighlightContentHandler(null, -1, highlightQueryType, highlightQuery, language);
+      highlightContentHandler.setFirstPageBreakReachedMode(true);
+      // NOTE(review): SAXParser is the JDK-internal Xerces class (com.sun.org.apache.xerces.internal);
+      // consider obtaining the XMLReader through the public JAXP/SAX factories instead
+      XMLReader xmlParser = new SAXParser();
+      xmlParser.setContentHandler(highlightContentHandler);
+      InputSource inputSource = new InputSource(new StringReader(xmlStr));
+      xmlParser.parse(inputSource);
+      result = highlightContentHandler.getResult().toString();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+    return result;
+  }
+
+  /**
+   * Maps an element name to its display name: "s" becomes "Sentence", any
+   * other name gets its first character upper-cased.
+   *
+   * @return the display name; null for a null name, the empty string for an empty name
+   */
+  private String getPresentationName(String elemName) {
+    if (elemName == null) {
+      return null;
+    }
+    if (elemName.equals("s")) {
+      return "Sentence";
+    }
+    if (elemName.length() == 0) {
+      // nothing to capitalize; indexing the first character would fail
+      return elemName;
+    }
+    return Character.toUpperCase(elemName.charAt(0)) + elemName.substring(1);
+  }
+
+  /** Returns the server URL followed by the context path of the request. */
+  private String getBaseUrl(HttpServletRequest request) {
+    return getServerUrl(request) + request.getContextPath();
+  }
+
+  /**
+   * Returns scheme://serverName, followed by ":port" unless the server port
+   * is 80 or 443.
+   */
+  private String getServerUrl(HttpServletRequest request) {
+    int port = request.getServerPort();
+    String serverUrl = request.getScheme() + "://" + request.getServerName();
+    if (port != 80 && port != 443) {
+      serverUrl = serverUrl + ":" + port;
+    }
+    return serverUrl;
+  }
+
+  /** Returns true if outputOptions contains "withLemmas"; a null array counts as empty. */
+  private boolean withLemmas(String[] outputOptions) {
+    if (outputOptions == null) {
+      return false;
+    }
+    for (int i = 0; i < outputOptions.length; i++) {
+      if ("withLemmas".equals(outputOptions[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /** Returns the first entry of normFunctions, or null if the array is null or empty. */
+  private static String firstNormFunction(String[] normFunctions) {
+    if (normFunctions == null || normFunctions.length == 0) {
+      return null;
+    }
+    return normFunctions[0];
+  }
+
+  /**
+   * Escapes the characters that are markup in XML/HTML text content
+   * (&amp;, &lt;, &gt;). A null text yields the empty string.
+   */
+  private static String escapeXmlText(String text) {
+    if (text == null) {
+      return "";
+    }
+    StringBuilder escaped = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&':
+          escaped.append("&amp;");
+          break;
+        case '<':
+          escaped.append("&lt;");
+          break;
+        case '>':
+          escaped.append("&gt;");
+          break;
+        default:
+          escaped.append(c);
+      }
+    }
+    return escaped.toString();
+  }
+
+  /** Like escapeXmlText, and additionally escapes the double quote for use inside a double-quoted attribute value. */
+  private static String escapeXmlAttribute(String value) {
+    return escapeXmlText(value).replace("\"", "&quot;");
+  }
+
+  /**
+   * URL-encodes a query parameter value as UTF-8. A null value is encoded as
+   * the literal "null", which is what the former plain string concatenation
+   * put into the link, so the parameters the link target receives are unchanged.
+   */
+  private static String urlEncode(String value) {
+    try {
+      return java.net.URLEncoder.encode(String.valueOf(value), "UTF-8");
+    } catch (java.io.UnsupportedEncodingException e) {
+      // UTF-8 is one of the charsets every Java platform implementation must support
+      throw new IllegalStateException(e);
+    }
+  }
+
+}