Mercurial > hg > mpdl-group
comparison software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
23:e845310098ba | 25:e9fe3186670c |
---|---|
1 package de.mpg.mpiwg.berlin.mpdl.servlets.cms; | |
2 | |
3 import java.io.IOException; | |
4 import java.io.PrintWriter; | |
5 import java.io.StringReader; | |
6 import java.util.ArrayList; | |
7 | |
8 import javax.servlet.ServletConfig; | |
9 import javax.servlet.ServletContext; | |
10 import javax.servlet.ServletException; | |
11 import javax.servlet.http.HttpServlet; | |
12 import javax.servlet.http.HttpServletRequest; | |
13 import javax.servlet.http.HttpServletResponse; | |
14 | |
15 import org.apache.lucene.document.Fieldable; | |
16 import org.xml.sax.InputSource; | |
17 import org.xml.sax.SAXException; | |
18 import org.xml.sax.XMLReader; | |
19 | |
20 import com.sun.org.apache.xerces.internal.parsers.SAXParser; | |
21 | |
22 import de.mpg.mpiwg.berlin.mpdl.cms.document.Document; | |
23 import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; | |
24 import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; | |
25 import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; | |
26 import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; | |
27 import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; | |
28 import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; | |
29 | |
30 public class QueryDocument extends HttpServlet { | |
31 private static final long serialVersionUID = 1L; | |
32 private PageTransformer pageTransformer = null; | |
33 | |
34 public QueryDocument() { | |
35 super(); | |
36 } | |
37 | |
38 public void init(ServletConfig config) throws ServletException { | |
39 super.init(config); | |
40 ServletContext context = getServletContext(); | |
41 pageTransformer = (PageTransformer) context.getAttribute("pageTransformer"); | |
42 } | |
43 | |
44 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { | |
45 doGet(request, response); | |
46 } | |
47 | |
48 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { | |
49 request.setCharacterEncoding("utf-8"); | |
50 response.setCharacterEncoding("utf-8"); | |
51 String docId = request.getParameter("docId"); | |
52 String query = request.getParameter("query"); | |
53 String[] normFunctions = {"none"}; | |
54 if (query.contains("tokenReg")) // TODO ordentlich behandeln | |
55 normFunctions[0] = "reg"; | |
56 else if (query.contains("tokenNorm")) // TODO ordentlich behandeln | |
57 normFunctions[0] = "norm"; | |
58 String[] outputOptions = {}; | |
59 if (query.contains("tokenMorph")) { // TODO ordentlich behandeln | |
60 outputOptions = new String[1]; | |
61 outputOptions[0] = "withLemmas"; | |
62 } | |
63 String pageStr = request.getParameter("page"); | |
64 if (pageStr == null) | |
65 pageStr = "1"; | |
66 int page = Integer.parseInt(pageStr); | |
67 String pageSizeStr = request.getParameter("pageSize"); | |
68 if (pageSizeStr == null) | |
69 pageSizeStr = "10"; | |
70 int pageSize = Integer.parseInt(pageSizeStr); | |
71 int from = (page * pageSize) - pageSize; // e.g. 0 | |
72 int to = page * pageSize - 1; // e.g. 9 | |
73 String outputFormat = request.getParameter("outputFormat"); | |
74 if (outputFormat == null) | |
75 outputFormat = "xml"; | |
76 try { | |
77 IndexHandler indexHandler = IndexHandler.getInstance(); | |
78 Hits hits = indexHandler.queryDocument(docId, query, from, to); | |
79 MetadataRecord docMetadataRecord = indexHandler.getDocMetadata(docId); | |
80 if (outputFormat.equals("xml")) | |
81 response.setContentType("text/xml"); | |
82 else if (outputFormat.equals("html")) | |
83 response.setContentType("text/html"); | |
84 else | |
85 response.setContentType("text/xml"); | |
86 PrintWriter out = response.getWriter(); | |
87 String resultStr = ""; | |
88 if (outputFormat.equals("xml")) | |
89 resultStr = createXmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits); | |
90 else if (outputFormat.equals("html")) | |
91 resultStr = createHtmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits, request); | |
92 out.print(resultStr); | |
93 out.close(); | |
94 } catch (ApplicationException e) { | |
95 throw new ServletException(e); | |
96 } | |
97 } | |
98 | |
99 private String createXmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits) throws ApplicationException { | |
100 String docId = docMetadataRecord.getDocId(); | |
101 ArrayList<Document> docs = null; | |
102 if (hits != null) | |
103 docs = hits.getHits(); | |
104 int hitsSize = -1; | |
105 int docsSize = -1; | |
106 if (hits != null) | |
107 hitsSize = hits.getSize(); | |
108 if (docs != null) | |
109 docsSize = docs.size(); | |
110 StringBuilder xmlStrBuilder = new StringBuilder(); | |
111 xmlStrBuilder.append("<document>"); | |
112 xmlStrBuilder.append("<id>" + docId + "</id>"); | |
113 xmlStrBuilder.append("<query>"); | |
114 xmlStrBuilder.append("<queryText>" + query + "</queryText>"); | |
115 xmlStrBuilder.append("<resultPage>" + page + "</resultPage>"); | |
116 xmlStrBuilder.append("<resultPageSize>" + pageSize + "</resultPageSize>"); | |
117 xmlStrBuilder.append("</query>"); | |
118 xmlStrBuilder.append("<hitsSize>" + hitsSize + "</hitsSize>"); | |
119 xmlStrBuilder.append("<hits>"); | |
120 for (int i=0; i<docsSize; i++) { | |
121 Document doc = docs.get(i); | |
122 int num = (page - 1) * pageSize + i + 1; | |
123 xmlStrBuilder.append("<hit>"); | |
124 xmlStrBuilder.append("<num>" + num + "</num>"); | |
125 String pageNumber = null; | |
126 Fieldable fPageNumber = doc.getFieldable("pageNumber"); | |
127 if (fPageNumber != null) { | |
128 pageNumber = fPageNumber.stringValue(); | |
129 xmlStrBuilder.append("<pageNumber>" + pageNumber + "</pageNumber>"); | |
130 } | |
131 String elementPagePosition = null; | |
132 Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); | |
133 if (fElementPagePosition != null) { | |
134 elementPagePosition = fElementPagePosition.stringValue(); | |
135 xmlStrBuilder.append("<pagePosition>" + elementPagePosition + "</pagePosition>"); | |
136 } | |
137 String lineNumber = null; | |
138 Fieldable fLineNumber = doc.getFieldable("lineNumber"); | |
139 if (fLineNumber != null) { | |
140 lineNumber = fLineNumber.stringValue(); | |
141 xmlStrBuilder.append("<lineNumber>" + lineNumber + "</lineNumber>"); | |
142 } | |
143 String elementPosition = null; | |
144 Fieldable fElementPosition = doc.getFieldable("elementAbsolutePosition"); | |
145 if (fElementPosition != null) { | |
146 elementPosition = fElementPosition.stringValue(); | |
147 xmlStrBuilder.append("<absolutePosition>" + elementPosition + "</absolutePosition>"); | |
148 } | |
149 String xpath = null; | |
150 Fieldable fXPath = doc.getFieldable("xpath"); | |
151 if (fXPath != null) { | |
152 xpath = fXPath.stringValue(); | |
153 xmlStrBuilder.append("<xpath>" + xpath + "</xpath>"); | |
154 } | |
155 String xmlId = null; | |
156 Fieldable fXmlId = doc.getFieldable("xmlId"); | |
157 if (fXmlId != null) { | |
158 xmlId = fXmlId.stringValue(); | |
159 xmlStrBuilder.append("<xmlId>" + xmlId + "</xmlId>"); | |
160 } | |
161 String language = null; | |
162 Fieldable fLanguage = doc.getFieldable("language"); | |
163 if (fLanguage != null) { | |
164 language = fLanguage.stringValue(); | |
165 xmlStrBuilder.append("<language>" + language + "</language>"); | |
166 } | |
167 String xmlContentTokenized = null; | |
168 Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); | |
169 if (fXmlContentTokenized != null) { | |
170 String highlightQueryType = "orig"; | |
171 if (withLemmas(outputOptions)) { | |
172 highlightQueryType = "morph"; | |
173 } else if (normFunctions != null) { | |
174 String normFunction = normFunctions[0]; | |
175 highlightQueryType = normFunction; | |
176 if (normFunction.equals("none")) { | |
177 highlightQueryType = "orig"; | |
178 } | |
179 } | |
180 xmlContentTokenized = fXmlContentTokenized.stringValue(); | |
181 String xmlPre = "<content xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">"; | |
182 String xmlPost = "</content>"; | |
183 String xmlInputStr = xmlPre + xmlContentTokenized + xmlPost; | |
184 String docLanguage = docMetadataRecord.getLanguage(); | |
185 String highlightedXmlStr = highlight(xmlInputStr, highlightQueryType, query, docLanguage); | |
186 if (highlightedXmlStr == null) | |
187 highlightedXmlStr = "<content>" + xmlContentTokenized + "</content>"; | |
188 xmlStrBuilder.append(highlightedXmlStr); | |
189 } | |
190 xmlStrBuilder.append("</hit>"); | |
191 } | |
192 xmlStrBuilder.append("</hits>"); | |
193 xmlStrBuilder.append("</document>"); | |
194 return xmlStrBuilder.toString(); | |
195 } | |
196 | |
197 private String createHtmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits, HttpServletRequest request) throws ApplicationException { | |
198 String docId = docMetadataRecord.getDocId(); | |
199 ArrayList<Document> docs = null; | |
200 if (hits != null) | |
201 docs = hits.getHits(); | |
202 int hitsSize = -1; | |
203 int docsSize = -1; | |
204 if (hits != null) | |
205 hitsSize = hits.getSize(); | |
206 if (docs != null) | |
207 docsSize = docs.size(); | |
208 String highlightQueryType = "orig"; | |
209 String normalizationStr = ""; | |
210 String highlightQueryTypeStr = ""; | |
211 if (withLemmas(outputOptions)) { | |
212 highlightQueryTypeStr = "&highlightQueryType=norm"; | |
213 highlightQueryType = "norm"; | |
214 } else if (normFunctions != null) { | |
215 String normFunction = normFunctions[0]; | |
216 normalizationStr = "&normalization=" + normFunction; | |
217 highlightQueryType = normFunction; | |
218 if (normFunction.equals("none")) { | |
219 normalizationStr = "&normalization=" + "orig"; | |
220 highlightQueryType = "orig"; | |
221 } | |
222 } | |
223 StringBuilder xmlStrBuilder = new StringBuilder(); | |
224 xmlStrBuilder.append("<html>"); | |
225 xmlStrBuilder.append("<head>"); | |
226 xmlStrBuilder.append("<title>Document: \"" + docId + " " + query + "\"</title>"); | |
227 String baseUrl = getBaseUrl(request); | |
228 String cssUrl = baseUrl + "/css/page.css"; | |
229 String cssShowWordFileName = "pageOrig.css"; | |
230 if (highlightQueryType.equals("reg")) | |
231 cssShowWordFileName = "pageReg.css"; | |
232 else if (highlightQueryType.equals("norm")) | |
233 cssShowWordFileName = "pageNorm.css"; | |
234 String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName; | |
235 xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + showWordCssUrl + "\"/>"); | |
236 xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + cssUrl + "\"/>"); | |
237 xmlStrBuilder.append("</head>"); | |
238 xmlStrBuilder.append("<body>"); | |
239 xmlStrBuilder.append("<span class=\"about\">[<span class=\"it\">This is a MPIWG CMS technology service</span>] <a href=\"/mpiwg-mpdl-cms-web/index.html\"><img src=\"/mpiwg-mpdl-cms-web/images/info.png\" valign=\"bottom\" width=\"15\" height=\"15\" border=\"0\" alt=\"MPIWG CMS service\"/></a></span>"); | |
240 xmlStrBuilder.append("<span class=\"query\">Query: " + query + "</span>"); | |
241 xmlStrBuilder.append("<span class=\"result\">"); | |
242 xmlStrBuilder.append("<span class=\"resultPage\">" + page + "</span>"); | |
243 xmlStrBuilder.append("<span class=\"resultPageSize\">" + pageSize + "</span>"); | |
244 xmlStrBuilder.append("<span class=\"hitsSize\">" + hitsSize + "</span>"); | |
245 xmlStrBuilder.append("</span>"); | |
246 xmlStrBuilder.append("<table>"); | |
247 for (int i=0; i<docsSize; i++) { | |
248 xmlStrBuilder.append("<tr class=\"hit\">"); | |
249 Document doc = docs.get(i); | |
250 int num = (page - 1) * pageSize + i + 1; | |
251 xmlStrBuilder.append("<td class=\"hitNum\">" + num + ". " + "</td>"); | |
252 xmlStrBuilder.append("<td class=\"hitLink\">"); | |
253 String posStr = ""; | |
254 String pageNumber = ""; | |
255 Fieldable fPageNumber = doc.getFieldable("pageNumber"); | |
256 if (fPageNumber != null) { | |
257 pageNumber = fPageNumber.stringValue(); | |
258 posStr = posStr + "Page " + pageNumber + ", "; | |
259 } | |
260 String elementName = null; | |
261 String presElementName = ""; | |
262 Fieldable fElementName = doc.getFieldable("elementName"); | |
263 if (fElementName != null) { | |
264 elementName = fElementName.stringValue(); | |
265 presElementName = getPresentationName(elementName); | |
266 } | |
267 String elementPagePosition = ""; | |
268 Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); | |
269 if (fElementPagePosition != null) { | |
270 elementPagePosition = fElementPagePosition.stringValue(); | |
271 posStr = posStr + presElementName + " " + elementPagePosition + ":"; | |
272 } | |
273 String language = docMetadataRecord.getLanguage(); | |
274 String getPageLink = baseUrl + "/query/GetPage?docId=" + docId + "&page=" + pageNumber + normalizationStr + "&highlightElem=" + elementName + "&highlightElemPos=" + elementPagePosition + highlightQueryTypeStr + "&highlightQuery=" + query + "&language=" + language; | |
275 xmlStrBuilder.append("<a href=\"" + getPageLink + "\">" + posStr + "</a>"); | |
276 xmlStrBuilder.append("</td>"); | |
277 String xmlContentTokenized = null; | |
278 Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); | |
279 if (fXmlContentTokenized != null) { | |
280 xmlContentTokenized = fXmlContentTokenized.stringValue(); | |
281 String highlightedXmlStr = highlight(xmlContentTokenized, highlightQueryType, query, language); | |
282 String highlightHtmlStr = pageTransformer.transform(highlightedXmlStr, docMetadataRecord, -1, "html"); // TODO performance: do not highlight each single node but highlight them all in one step | |
283 xmlStrBuilder.append("<td class=\"hitContent\">"); | |
284 xmlStrBuilder.append(highlightHtmlStr); | |
285 xmlStrBuilder.append("</td>"); | |
286 } | |
287 xmlStrBuilder.append("</tr>"); | |
288 } | |
289 xmlStrBuilder.append("</table>"); | |
290 xmlStrBuilder.append("</body>"); | |
291 xmlStrBuilder.append("</html>"); | |
292 return xmlStrBuilder.toString(); | |
293 } | |
294 | |
295 private String highlight(String xmlStr, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { | |
296 String result = null; | |
297 try { | |
298 HighlightContentHandler highlightContentHandler = new HighlightContentHandler(null, -1, highlightQueryType, highlightQuery, language); | |
299 highlightContentHandler.setFirstPageBreakReachedMode(true); | |
300 XMLReader xmlParser = new SAXParser(); | |
301 xmlParser.setContentHandler(highlightContentHandler); | |
302 StringReader stringReader = new StringReader(xmlStr); | |
303 InputSource inputSource = new InputSource(stringReader); | |
304 xmlParser.parse(inputSource); | |
305 result = highlightContentHandler.getResult().toString(); | |
306 } catch (SAXException e) { | |
307 throw new ApplicationException(e); | |
308 } catch (IOException e) { | |
309 throw new ApplicationException(e); | |
310 } | |
311 return result; | |
312 } | |
313 | |
314 private String getPresentationName(String elemName) { | |
315 String retStr = null; | |
316 if (elemName != null) { | |
317 if (elemName.equals("s")) { | |
318 retStr = "Sentence"; | |
319 } else { | |
320 // first char to uppercase | |
321 char[] stringArray = elemName.toCharArray(); | |
322 stringArray[0] = Character.toUpperCase(stringArray[0]); | |
323 retStr = new String(stringArray); | |
324 } | |
325 } | |
326 return retStr; | |
327 } | |
328 | |
329 private String getBaseUrl(HttpServletRequest request) { | |
330 return getServerUrl(request) + request.getContextPath(); | |
331 } | |
332 | |
333 private String getServerUrl(HttpServletRequest request) { | |
334 if ( ( request.getServerPort() == 80 ) || ( request.getServerPort() == 443 ) ) | |
335 return request.getScheme() + "://" + request.getServerName(); | |
336 else | |
337 return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort(); | |
338 } | |
339 | |
340 private boolean withLemmas(String[] outputOptions) { | |
341 boolean result = false; | |
342 for (int i=0; i< outputOptions.length; i++) { | |
343 String function = outputOptions[i]; | |
344 if (function.equals("withLemmas")) | |
345 return true; | |
346 } | |
347 return result; | |
348 } | |
349 | |
350 } |