comparison software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/GetPage.java @ 25:e9fe3186670c default tip

letzter Stand eingecheckt
author Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de>
date Tue, 21 May 2013 10:19:32 +0200
parents
children
comparison
equal deleted inserted replaced
23:e845310098ba 25:e9fe3186670c
1 package de.mpg.mpiwg.berlin.mpdl.servlets.cms;
2
3 import java.io.File;
4 import java.io.IOException;
5 import java.io.PrintWriter;
6 import java.io.StringReader;
7
8 import javax.servlet.ServletConfig;
9 import javax.servlet.ServletContext;
10 import javax.servlet.ServletException;
11 import javax.servlet.http.HttpServlet;
12 import javax.servlet.http.HttpServletRequest;
13 import javax.servlet.http.HttpServletResponse;
14
15 import org.apache.commons.io.FileUtils;
16 import org.xml.sax.InputSource;
17 import org.xml.sax.SAXException;
18 import org.xml.sax.XMLReader;
19
20 import com.sun.org.apache.xerces.internal.parsers.SAXParser;
21
22 import de.mpg.mpiwg.berlin.mpdl.cms.document.DocumentHandler;
23 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord;
24 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler;
25 import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler;
26 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer;
27 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
28 import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
29
30 public class GetPage extends HttpServlet {
31 private static final long serialVersionUID = 1L;
32 private PageTransformer pageTransformer;
33
34 public GetPage() {
35 super();
36 }
37
38 public void init(ServletConfig config) throws ServletException {
39 super.init(config);
40 ServletContext context = getServletContext();
41 pageTransformer = (PageTransformer) context.getAttribute("pageTransformer");
42 }
43
44 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
45 String result = "";
46 request.setCharacterEncoding("utf-8");
47 response.setCharacterEncoding("utf-8");
48 String docId = request.getParameter("docId");
49 String pageStr = request.getParameter("page");
50 String normalization = request.getParameter("normalization");
51 String highlightQuery = request.getParameter("highlightQuery");
52 String highlightQueryType = request.getParameter("highlightQueryType");
53 if (highlightQueryType == null)
54 highlightQueryType = "form";
55 String highlightElem = request.getParameter("highlightElem");
56 String highlightElemPosStr = request.getParameter("highlightElemPos");
57 int highlightElemPos = -1;
58 if (highlightElemPosStr != null)
59 highlightElemPos = Integer.parseInt(highlightElemPosStr);
60 String mode = request.getParameter("mode");
61 if (mode == null)
62 mode = "untokenized";
63 String outputFormat = request.getParameter("outputFormat");
64 if (outputFormat == null)
65 outputFormat = "html";
66 String cssUrl = request.getParameter("cssUrl");
67 String baseUrl = getBaseUrl(request);
68 if (cssUrl == null) {
69 cssUrl = baseUrl + "/css/page.css";
70 }
71 int page = 1;
72 if (pageStr != null)
73 page = Integer.parseInt(pageStr);
74 if (outputFormat.equals("xml"))
75 response.setContentType("text/xml");
76 else if (outputFormat.equals("html") || outputFormat.equals("xmlDisplay"))
77 response.setContentType("text/html");
78 // normalization
79 if (normalization == null || ! (normalization.equals("orig") || normalization.equals("reg") || normalization.equals("norm")))
80 normalization = "norm";
81 if (outputFormat.equals("xmlDisplay"))
82 normalization = "orig";
83 PrintWriter out = response.getWriter();
84 try {
85 IndexHandler indexHandler = IndexHandler.getInstance();
86 MetadataRecord mdRecord = indexHandler.getDocMetadata(docId);
87 DocumentHandler docHandler = new DocumentHandler();
88 String docDir = docHandler.getDocDir(docId);
89 String docPageDir = docDir + "/" + "pages";
90 String pageFileName = docPageDir + "/page-" + page + "-morph.xml";
91 File pageFile = new File(pageFileName);
92 if (page == 1 && ! (new File(docPageDir)).exists()) {
93 String docFileName = docHandler.getDocFullFileName(docId);
94 pageFile = new File(docFileName); // when no page breaks are in the document then the whole document is the first page
95 }
96 if (! pageFile.exists()) {
97 out.print("There is no page: " + page + " in document");
98 out.close();
99 return;
100 }
101 String pageHtmlFileName = docPageDir + "/page-" + page + ".html";
102 File pageHtmlFile = new File(pageHtmlFileName);
103 String fragmentMorphStr = FileUtils.readFileToString(pageFile, "utf-8");
104 if (! pageHtmlFile.exists()) // TODO rausnehmen sobald alle Dokumente neu indexiert wurden
105 fragmentMorphStr = enrichWordsOrigRegNorm(fragmentMorphStr);
106 if (outputFormat.equals("html") || outputFormat.equals("xmlDisplay")) {
107 String schemaName = mdRecord.getSchemaName();
108 String title = docId + ", Page: " + page;
109 String xmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>";
110 String cssShowWordFileName = "pageNormDict.css";
111 if (outputFormat.equals("xmlDisplay"))
112 cssShowWordFileName = "pageOrig.css"; // xml display shows always the original text
113 else if (normalization.equals("orig") && mode.equals("untokenized"))
114 cssShowWordFileName = "pageOrig.css";
115 else if (normalization.equals("orig") && mode.equals("tokenized"))
116 cssShowWordFileName = "pageOrigDict.css";
117 else if (normalization.equals("reg") && mode.equals("untokenized"))
118 cssShowWordFileName = "pageReg.css";
119 else if (normalization.equals("reg") && mode.equals("tokenized"))
120 cssShowWordFileName = "pageRegDict.css";
121 else if (normalization.equals("norm") && mode.equals("untokenized"))
122 cssShowWordFileName = "pageNorm.css";
123 String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName;
124 String mainCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + cssUrl + "\"/>";
125 String showWordCssLink = "<link rel=\"stylesheet\" type=\"text/css\" href=\"" + showWordCssUrl + "\"/>";
126 String head = "<head>" + "<title>" + title + "</title>" + showWordCssLink + mainCssLink + "</head>";
127 String namespace = "";
128 String pageHtmlStr = null;
129 if (pageHtmlFile.exists() && outputFormat.equals("html") && (highlightElem == null && highlightQuery == null)) {
130 pageHtmlStr = FileUtils.readFileToString(pageHtmlFile, "utf-8");
131 } else {
132 if (highlightElem != null || highlightQuery != null) {
133 String hiQueryType = "orig";
134 if (highlightQueryType.equals("morph"))
135 hiQueryType = "morph";
136 else
137 hiQueryType = normalization;
138 String language = mdRecord.getLanguage();
139 fragmentMorphStr = highlight(fragmentMorphStr, highlightElem, highlightElemPos, hiQueryType, highlightQuery, language);
140 }
141 pageHtmlStr = pageTransformer.transform(fragmentMorphStr, mdRecord, page, outputFormat);
142 }
143 if (schemaName != null && schemaName.equals("echo")) {
144 namespace = "xmlns:echo=\"http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/\" xmlns:de=\"http://www.mpiwg-berlin.mpg.de/ns/de/1.0/\" " +
145 "xmlns:dcterms=\"http://purl.org/dc/terms\" " + "xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" " +
146 "xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
147 }
148 result = xmlHeader + "<html " + namespace + ">" + head + "<body>" + pageHtmlStr + "</body>" + "</html>";
149 } else {
150 String pageFileNameOrig = docPageDir + "/page-" + page + ".xml";
151 File pageFileOrig = new File(pageFileNameOrig);
152 if (pageFileOrig.exists())
153 result = FileUtils.readFileToString(pageFileOrig, "utf-8");
154 else
155 result = "";
156 }
157 out.print(result);
158 out.close();
159 } catch (ApplicationException e) {
160 throw new ServletException(e);
161 }
162 }
163
164 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
165 doGet(request, response);
166 }
167
168 private String getBaseUrl(HttpServletRequest request) {
169 return getServerUrl(request) + request.getContextPath();
170 }
171
172 private String getServerUrl(HttpServletRequest request) {
173 if ( ( request.getServerPort() == 80 ) || ( request.getServerPort() == 443 ) )
174 return request.getScheme() + "://" + request.getServerName();
175 else
176 return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort();
177 }
178
179 private String enrichWordsOrigRegNorm(String xmlStr) throws ApplicationException {
180 try {
181 WordContentHandler wordContentHandler = new WordContentHandler();
182 XMLReader xmlParser = new SAXParser();
183 xmlParser.setContentHandler(wordContentHandler);
184 StringReader strReader = new StringReader(xmlStr);
185 InputSource inputSource = new InputSource(strReader);
186 xmlParser.parse(inputSource);
187 String result = wordContentHandler.getResult();
188 return result;
189 } catch (SAXException e) {
190 throw new ApplicationException(e);
191 } catch (IOException e) {
192 throw new ApplicationException(e);
193 }
194 }
195
196 private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
197 String result = null;
198 try {
199 HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
200 highlightContentHandler.setFirstPageBreakReachedMode(true);
201 XMLReader xmlParser = new SAXParser();
202 xmlParser.setContentHandler(highlightContentHandler);
203 StringReader stringReader = new StringReader(xmlStr);
204 InputSource inputSource = new InputSource(stringReader);
205 xmlParser.parse(inputSource);
206 result = highlightContentHandler.getResult().toString();
207 } catch (SAXException e) {
208 throw new ApplicationException(e);
209 } catch (IOException e) {
210 throw new ApplicationException(e);
211 }
212 return result;
213 }
214
215 }