Mercurial > hg > mpdl-group
diff software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/GetPage.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line diff
// Added file (diff against /dev/null):
// software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/GetPage.java
package de.mpg.mpiwg.berlin.mpdl.servlets.cms;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import com.sun.org.apache.xerces.internal.parsers.SAXParser;

import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;

/**
 * Servlet that delivers a single page of a document.
 *
 * <p>Request parameters:
 * <ul>
 *   <li>{@code docId} - id of the document (required)</li>
 *   <li>{@code page} - page number, defaults to 1</li>
 *   <li>{@code outputFormat} - {@code html} (default) or {@code xmlDisplay} produce an HTML
 *       document; any other value returns the stored {@code page-N.xml} file (empty response
 *       if that file does not exist)</li>
 *   <li>{@code normalization} - {@code orig}, {@code reg} or {@code norm}; anything else falls
 *       back to {@code norm}; always {@code orig} for {@code xmlDisplay}</li>
 *   <li>{@code mode} - defaults to {@code untokenized}</li>
 *   <li>{@code highlightQuery}, {@code highlightQueryType} (default {@code form}),
 *       {@code highlightElem}, {@code highlightElemPos} (default -1) - optional highlighting</li>
 *   <li>{@code cssUrl} - main stylesheet, defaults to {@code <baseUrl>/css/page.css}</li>
 * </ul>
 *
 * <p>NOTE(review): {@code docId} is handed unchanged to {@link DocumentHandler} to build file
 * system paths; presumably that class validates the id - confirm.
 */
public class GetPage extends HttpServlet {
  private static final long serialVersionUID = 1L;
  private PageTransformer pageTransformer;

  public GetPage() {
    super();
  }

  /**
   * Fetches the shared {@link PageTransformer} from the servlet context attribute
   * {@code "pageTransformer"}.
   */
  @Override
  public void init(ServletConfig config) throws ServletException {
    super.init(config);
    ServletContext context = getServletContext();
    pageTransformer = (PageTransformer) context.getAttribute("pageTransformer");
  }

  /**
   * Renders the requested page; see the class comment for the supported parameters.
   *
   * <p>Responds with 400 (Bad Request) when {@code docId} is missing or when {@code page} or
   * {@code highlightElemPos} is not an integer.
   *
   * @throws ServletException wrapping any {@link ApplicationException} raised while
   *         loading or transforming the page
   * @throws IOException if reading page files or writing the response fails
   */
  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    request.setCharacterEncoding("utf-8");
    response.setCharacterEncoding("utf-8");

    String docId = request.getParameter("docId");
    if (docId == null) {
      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameter docId is required");
      return;
    }
    int page;
    int highlightElemPos;
    try {
      page = parseIntParam(request, "page", 1);
      highlightElemPos = parseIntParam(request, "highlightElemPos", -1);
    } catch (NumberFormatException e) {
      // malformed client input is a client error, not a server failure
      response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Parameters page and highlightElemPos must be integers");
      return;
    }

    String highlightQuery = request.getParameter("highlightQuery");
    String highlightQueryType = request.getParameter("highlightQueryType");
    if (highlightQueryType == null) {
      highlightQueryType = "form";
    }
    String highlightElem = request.getParameter("highlightElem");
    String mode = request.getParameter("mode");
    if (mode == null) {
      mode = "untokenized";
    }
    String outputFormat = request.getParameter("outputFormat");
    if (outputFormat == null) {
      outputFormat = "html";
    }
    boolean htmlOutput = outputFormat.equals("html") || outputFormat.equals("xmlDisplay");
    String baseUrl = getBaseUrl(request);
    String cssUrl = request.getParameter("cssUrl");
    if (cssUrl == null) {
      cssUrl = baseUrl + "/css/page.css";
    }
    if (outputFormat.equals("xml")) {
      response.setContentType("text/xml");
    } else if (htmlOutput) {
      response.setContentType("text/html");
    }
    // normalization: unknown or missing values fall back to "norm"
    String normalization = request.getParameter("normalization");
    if (normalization == null || ! (normalization.equals("orig") || normalization.equals("reg") || normalization.equals("norm"))) {
      normalization = "norm";
    }
    if (outputFormat.equals("xmlDisplay")) {
      normalization = "orig";
    }

    PrintWriter out = response.getWriter();
    try {
      IndexHandler indexHandler = IndexHandler.getInstance();
      MetadataRecord mdRecord = indexHandler.getDocMetadata(docId);
      DocumentHandler docHandler = new DocumentHandler();
      String docDir = docHandler.getDocDir(docId);
      String docPageDir = docDir + "/" + "pages";
      File pageFile = new File(docPageDir + "/page-" + page + "-morph.xml");
      if (page == 1 && ! (new File(docPageDir)).exists()) {
        // when no page breaks are in the document then the whole document is the first page
        pageFile = new File(docHandler.getDocFullFileName(docId));
      }
      if (! pageFile.exists()) {
        out.print("There is no page: " + page + " in document");
        out.close();
        return;
      }
      File pageHtmlFile = new File(docPageDir + "/page-" + page + ".html");
      String fragmentMorphStr = FileUtils.readFileToString(pageFile, "utf-8");
      if (! pageHtmlFile.exists()) { // TODO remove once all documents have been re-indexed
        fragmentMorphStr = enrichWordsOrigRegNorm(fragmentMorphStr);
      }

      String result;
      if (htmlOutput) {
        String schemaName = mdRecord.getSchemaName();
        // docId and cssUrl come from the client: escape them before embedding them in markup
        String title = escapeXml(docId) + ", Page: " + page;
        String xmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>";
        String showWordCssUrl = baseUrl + "/css/" + selectWordCssFileName(outputFormat, normalization, mode);
        String mainCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeXml(cssUrl) + "\"/>";
        String showWordCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + escapeXml(showWordCssUrl) + "\"/>";
        String head = "<head>" + "<title>" + title + "</title>" + showWordCssLink + mainCssLink + "</head>";

        boolean highlightRequested = highlightElem != null || highlightQuery != null;
        String pageHtmlStr;
        if (pageHtmlFile.exists() && outputFormat.equals("html") && ! highlightRequested) {
          // pre-rendered page can be served as is
          pageHtmlStr = FileUtils.readFileToString(pageHtmlFile, "utf-8");
        } else {
          if (highlightRequested) {
            String hiQueryType = highlightQueryType.equals("morph") ? "morph" : normalization;
            String language = mdRecord.getLanguage();
            fragmentMorphStr = highlight(fragmentMorphStr, highlightElem, highlightElemPos, hiQueryType, highlightQuery, language);
          }
          pageHtmlStr = pageTransformer.transform(fragmentMorphStr, mdRecord, page, outputFormat);
        }

        String namespace = "";
        if (schemaName != null && schemaName.equals("echo")) {
          namespace = "xmlns:echo=\"http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/\" xmlns:de=\"http://www.mpiwg-berlin.mpg.de/ns/de/1.0/\" "
              + "xmlns:dcterms=\"http://purl.org/dc/terms\" "
              + "xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" "
              + "xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
        }
        result = xmlHeader + "<html " + namespace + ">" + head + "<body>" + pageHtmlStr + "</body>" + "</html>";
      } else {
        File pageFileOrig = new File(docPageDir + "/page-" + page + ".xml");
        if (pageFileOrig.exists()) {
          result = FileUtils.readFileToString(pageFileOrig, "utf-8");
        } else {
          result = "";
        }
      }
      out.print(result);
      out.close();
    } catch (ApplicationException e) {
      throw new ServletException(e);
    }
  }

  /** Handles POST exactly like GET. */
  @Override
  protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    doGet(request, response);
  }

  /** Returns the server URL followed by the context path of this web application. */
  private String getBaseUrl(HttpServletRequest request) {
    return getServerUrl(request) + request.getContextPath();
  }

  /**
   * Returns {@code scheme://serverName}, with {@code :port} appended unless the
   * server port is 80 or 443.
   */
  private String getServerUrl(HttpServletRequest request) {
    String serverUrl = request.getScheme() + "://" + request.getServerName();
    int port = request.getServerPort();
    if (port == 80 || port == 443) {
      return serverUrl;
    }
    return serverUrl + ":" + port;
  }

  /**
   * Reads an optional integer request parameter.
   *
   * @return the parsed value, or {@code defaultValue} when the parameter is absent
   * @throws NumberFormatException if the parameter is present but not an integer
   */
  private static int parseIntParam(HttpServletRequest request, String name, int defaultValue) {
    String value = request.getParameter(name);
    if (value == null) {
      return defaultValue;
    }
    return Integer.parseInt(value);
  }

  /** Picks the word-display stylesheet for the given output format, normalization and mode. */
  private static String selectWordCssFileName(String outputFormat, String normalization, String mode) {
    if (outputFormat.equals("xmlDisplay")) {
      return "pageOrig.css"; // xml display shows always the original text
    }
    boolean untokenized = mode.equals("untokenized");
    boolean tokenized = mode.equals("tokenized");
    if (normalization.equals("orig")) {
      if (untokenized) {
        return "pageOrig.css";
      }
      if (tokenized) {
        return "pageOrigDict.css";
      }
    } else if (normalization.equals("reg")) {
      if (untokenized) {
        return "pageReg.css";
      }
      if (tokenized) {
        return "pageRegDict.css";
      }
    } else if (normalization.equals("norm") && untokenized) {
      return "pageNorm.css";
    }
    return "pageNormDict.css";
  }

  /** Escapes the five XML special characters so that text is safe in element and attribute content. */
  private static String escapeXml(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        case '\'':
          escaped.append("&#39;");
          break;
        default:
          escaped.append(c);
      }
    }
    return escaped.toString();
  }

  /**
   * Runs a SAX parse of {@code xmlStr}, feeding all events to {@code handler}.
   *
   * <p>NOTE(review): {@code SAXParser} here is a JDK-internal Xerces class; consider the JAXP
   * {@code SAXParserFactory} and disabling DTDs / external entities if page files can ever
   * contain untrusted content.
   *
   * @throws ApplicationException wrapping any {@link SAXException} or {@link IOException}
   */
  private void parseXml(String xmlStr, org.xml.sax.ContentHandler handler) throws ApplicationException {
    try {
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(handler);
      xmlParser.parse(new InputSource(new StringReader(xmlStr)));
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Passes the page XML through a {@link WordContentHandler} and returns the handler's result.
   *
   * @throws ApplicationException if the XML cannot be parsed
   */
  private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
    WordContentHandler wordContentHandler = new WordContentHandler();
    parseXml(xmlStr, wordContentHandler);
    return wordContentHandler.getResult();
  }

  /**
   * Passes the page XML through a {@link HighlightContentHandler} configured with the given
   * element / query settings and returns the handler's result.
   *
   * @throws ApplicationException if the XML cannot be parsed
   */
  private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
    HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
    highlightContentHandler.setFirstPageBreakReachedMode(true);
    parseXml(xmlStr, highlightContentHandler);
    return highlightContentHandler.getResult().toString();
  }

}