Mercurial > hg > mpdl-group

diff software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/GetPage.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author: Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date: Tue, 21 May 2013 10:19:32 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/GetPage.java	Tue May 21 10:19:32 2013 +0200
@@ -0,0 +1,301 @@
+package de.mpg.mpiwg.berlin.mpdl.servlets.cms;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringReader;
+
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.commons.io.FileUtils;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import com.sun.org.apache.xerces.internal.parsers.SAXParser;
+
+import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
+import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
+import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
+import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
+import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
+
+/**
+ * Servlet that delivers one page of a document. Depending on the outputFormat
+ * parameter the page is returned as an HTML document ("html"), as an HTML rendering
+ * of the page XML ("xmlDisplay") or, for any other value, as the stored page XML.
+ *
+ * Request parameters: docId (required), page (default 1), normalization ("orig",
+ * "reg" or "norm", default "norm"), mode (default "untokenized"), outputFormat
+ * (default "html"), cssUrl (default: base URL + "/css/page.css") and the optional
+ * highlighting parameters highlightQuery, highlightQueryType (default "form"),
+ * highlightElem and highlightElemPos.
+ */
+public class GetPage extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+  private static final String ENCODING = "utf-8";
+  private PageTransformer pageTransformer;
+
+  public GetPage() {
+    super();
+  }
+
+  /**
+   * Fetches the PageTransformer that is stored in the servlet context under the
+   * attribute name "pageTransformer".
+   */
+  @Override
+  public void init(ServletConfig config) throws ServletException {
+    super.init(config);
+    ServletContext context = getServletContext();
+    pageTransformer = (PageTransformer) context.getAttribute("pageTransformer");
+  }
+
+  /**
+   * Writes the requested page to the response.
+   *
+   * Answers 400 (Bad Request) when docId is missing or when page or highlightElemPos
+   * is not an integer. When the page file does not exist a plain text message is
+   * written; the status code is left untouched in that case.
+   *
+   * @throws ServletException wraps an ApplicationException raised while the page is prepared
+   * @throws IOException if a page file cannot be read or the response cannot be written
+   */
+  @Override
+  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    request.setCharacterEncoding(ENCODING);
+    response.setCharacterEncoding(ENCODING);
+    String docId = request.getParameter("docId");
+    if (docId == null) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing parameter: docId");
+      return;
+    }
+    int page = 1;
+    int highlightElemPos = -1;
+    try {
+      String pageStr = request.getParameter("page");
+      if (pageStr != null)
+        page = Integer.parseInt(pageStr);
+      String highlightElemPosStr = request.getParameter("highlightElemPos");
+      if (highlightElemPosStr != null)
+        highlightElemPos = Integer.parseInt(highlightElemPosStr);
+    } catch (NumberFormatException e) {
+      // a malformed number is a client error, not a server failure
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters page and highlightElemPos must be integers");
+      return;
+    }
+    String highlightQuery = request.getParameter("highlightQuery");
+    String highlightQueryType = request.getParameter("highlightQueryType");
+    if (highlightQueryType == null)
+      highlightQueryType = "form";
+    String highlightElem = request.getParameter("highlightElem");
+    String mode = request.getParameter("mode");
+    if (mode == null)
+      mode = "untokenized";
+    String outputFormat = request.getParameter("outputFormat");
+    if (outputFormat == null)
+      outputFormat = "html";
+    String baseUrl = getBaseUrl(request);
+    String cssUrl = request.getParameter("cssUrl");
+    if (cssUrl == null)
+      cssUrl = baseUrl + "/css/page.css";
+    boolean htmlOutput = outputFormat.equals("html") || outputFormat.equals("xmlDisplay");
+    if (outputFormat.equals("xml"))
+      response.setContentType("text/xml");
+    else if (htmlOutput)
+      response.setContentType("text/html");
+    // unknown normalizations fall back to "norm"; xml display always shows the original text
+    String normalization = request.getParameter("normalization");
+    if (normalization == null || ! (normalization.equals("orig") || normalization.equals("reg") || normalization.equals("norm")))
+      normalization = "norm";
+    if (outputFormat.equals("xmlDisplay"))
+      normalization = "orig";
+    PrintWriter out = response.getWriter();
+    try {
+      IndexHandler indexHandler = IndexHandler.getInstance();
+      MetadataRecord mdRecord = indexHandler.getDocMetadata(docId);
+      DocumentHandler docHandler = new DocumentHandler();
+      String docPageDir = docHandler.getDocDir(docId) + "/" + "pages";
+      File pageFile = new File(docPageDir + "/page-" + page + "-morph.xml");
+      if (page == 1 && ! (new File(docPageDir)).exists()) {
+        // when no page breaks are in the document then the whole document is the first page
+        pageFile = new File(docHandler.getDocFullFileName(docId));
+      }
+      if (! pageFile.exists()) {
+        out.print("There is no page: " + page + " in document");
+        out.close();
+        return;
+      }
+      String result;
+      if (htmlOutput) {
+        File pageHtmlFile = new File(docPageDir + "/page-" + page + ".html");
+        String schemaName = mdRecord.getSchemaName();
+        // docId and cssUrl come straight from the request, so they are escaped before they end up in markup
+        String title = escapeHtml(docId + ", Page: " + page);
+        String xmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>";
+        String showWordCssUrl = baseUrl + "/css/" + selectWordCssFileName(outputFormat, normalization, mode);
+        String mainCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeHtml(cssUrl) + "\"/>";
+        String showWordCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeHtml(showWordCssUrl) + "\"/>";
+        String head = "<head>" + "<title>" + title + "</title>" + showWordCssLink + mainCssLink + "</head>";
+        boolean highlightRequested = highlightElem != null || highlightQuery != null;
+        String pageHtmlStr;
+        if (pageHtmlFile.exists() && outputFormat.equals("html") && ! highlightRequested) {
+          // the stored html page can be delivered as it is
+          pageHtmlStr = FileUtils.readFileToString(pageHtmlFile, ENCODING);
+        } else {
+          // the morph fragment is only needed on this path, so it is only read here
+          String fragmentMorphStr = FileUtils.readFileToString(pageFile, ENCODING);
+          if (! pageHtmlFile.exists())  // TODO remove as soon as all documents have been re-indexed
+            fragmentMorphStr = enrichWordsOrigRegNorm(fragmentMorphStr);
+          if (highlightRequested) {
+            String hiQueryType = highlightQueryType.equals("morph") ? "morph" : normalization;
+            String language = mdRecord.getLanguage();
+            fragmentMorphStr = highlight(fragmentMorphStr, highlightElem, highlightElemPos, hiQueryType, highlightQuery, language);
+          }
+          pageHtmlStr = pageTransformer.transform(fragmentMorphStr, mdRecord, page, outputFormat);
+        }
+        String namespace = "";
+        if (schemaName != null && schemaName.equals("echo")) {
+          namespace = "xmlns:echo=\"http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/\" xmlns:de=\"http://www.mpiwg-berlin.mpg.de/ns/de/1.0/\" " +
+                  "xmlns:dcterms=\"http://purl.org/dc/terms\" " + "xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" " +
+                  "xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
+        }
+        result = xmlHeader + "<html " + namespace + ">" + head + "<body>" + pageHtmlStr + "</body>" + "</html>";
+      } else {
+        File pageFileOrig = new File(docPageDir + "/page-" + page + ".xml");
+        result = pageFileOrig.exists() ? FileUtils.readFileToString(pageFileOrig, ENCODING) : "";
+      }
+      out.print(result);
+      out.close();
+    } catch (ApplicationException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  /** Handles POST requests exactly like GET requests. */
+  @Override
+  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
+    doGet(request, response);
+  }
+
+  /**
+   * Picks the stylesheet that decides which word form is displayed. Combinations that
+   * are not listed (for example "norm" with "tokenized") use "pageNormDict.css".
+   */
+  private static String selectWordCssFileName(String outputFormat, String normalization, String mode) {
+    if (outputFormat.equals("xmlDisplay"))
+      return "pageOrig.css"; // xml display always shows the original text
+    boolean untokenized = mode.equals("untokenized");
+    boolean tokenized = mode.equals("tokenized");
+    if (normalization.equals("orig") && untokenized)
+      return "pageOrig.css";
+    if (normalization.equals("orig") && tokenized)
+      return "pageOrigDict.css";
+    if (normalization.equals("reg") && untokenized)
+      return "pageReg.css";
+    if (normalization.equals("reg") && tokenized)
+      return "pageRegDict.css";
+    if (normalization.equals("norm") && untokenized)
+      return "pageNorm.css";
+    return "pageNormDict.css";
+  }
+
+  /**
+   * Replaces the characters &amp;, &lt;, &gt;, &quot; and ' by character references so
+   * that the text can be placed in element content or in a quoted attribute value.
+   */
+  private static String escapeHtml(String text) {
+    StringBuilder escaped = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&':
+          escaped.append("&amp;");
+          break;
+        case '<':
+          escaped.append("&lt;");
+          break;
+        case '>':
+          escaped.append("&gt;");
+          break;
+        case '"':
+          escaped.append("&quot;");
+          break;
+        case '\'':
+          escaped.append("&#39;");
+          break;
+        default:
+          escaped.append(c);
+      }
+    }
+    return escaped.toString();
+  }
+
+  /** Returns scheme, host and (if needed) port followed by the context path. */
+  private String getBaseUrl(HttpServletRequest request) {
+    return getServerUrl(request) + request.getContextPath();
+  }
+
+  /**
+   * Builds "scheme://host[:port]". The port is left out only when it is the default
+   * port of the request's scheme (80 for http, 443 for https).
+   */
+  private String getServerUrl(HttpServletRequest request) {
+    String scheme = request.getScheme();
+    int port = request.getServerPort();
+    boolean defaultPort = ("http".equalsIgnoreCase(scheme) && port == 80) || ("https".equalsIgnoreCase(scheme) && port == 443);
+    String serverUrl = scheme + "://" + request.getServerName();
+    if (! defaultPort)
+      serverUrl = serverUrl + ":" + port;
+    return serverUrl;
+  }
+
+  /**
+   * Runs the page XML through a WordContentHandler and returns the handler's result.
+   *
+   * @throws ApplicationException if the XML cannot be parsed or read
+   */
+  private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
+    try {
+      WordContentHandler wordContentHandler = new WordContentHandler();
+      // NOTE(review): SAXParser is a JDK-internal Xerces class; consider a JAXP factory instead
+      XMLReader xmlParser = new SAXParser();
+      xmlParser.setContentHandler(wordContentHandler);
+      xmlParser.parse(new InputSource(new StringReader(xmlStr)));
+      return wordContentHandler.getResult();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+  /**
+   * Runs the page XML through a HighlightContentHandler configured with the given
+   * highlighting parameters and returns the handler's result as a string.
+   *
+   * @throws ApplicationException if the XML cannot be parsed or read
+   */
+  private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
+    try {
+      HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
+      highlightContentHandler.setFirstPageBreakReachedMode(true);
+      XMLReader xmlParser = new SAXParser();
+      xmlParser.setContentHandler(highlightContentHandler);
+      xmlParser.parse(new InputSource(new StringReader(xmlStr)));
+      return highlightContentHandler.getResult().toString();
+    } catch (SAXException e) {
+      throw new ApplicationException(e);
+    } catch (IOException e) {
+      throw new ApplicationException(e);
+    }
+  }
+
+}
author	Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date	Tue, 21 May 2013 10:19:32 +0200
parents
children