Mercurial > hg > mpdl-group
view software/mpdl-services-new/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java @ 25:e9fe3186670c default tip
letzter Stand eingecheckt
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 21 May 2013 10:19:32 +0200 |
parents | |
children |
line wrap: on
line source
package de.mpg.mpiwg.berlin.mpdl.servlets.cms; import java.io.IOException; import java.io.PrintWriter; import java.io.StringReader; import java.util.ArrayList; import javax.servlet.ServletConfig; import javax.servlet.ServletContext; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.lucene.document.Fieldable; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import com.sun.org.apache.xerces.internal.parsers.SAXParser; import de.mpg.mpiwg.berlin.mpdl.cms.document.Document; import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; public class QueryDocument extends HttpServlet { private static final long serialVersionUID = 1L; private PageTransformer pageTransformer = null; public QueryDocument() { super(); } public void init(ServletConfig config) throws ServletException { super.init(config); ServletContext context = getServletContext(); pageTransformer = (PageTransformer) context.getAttribute("pageTransformer"); } protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { doGet(request, response); } protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { request.setCharacterEncoding("utf-8"); response.setCharacterEncoding("utf-8"); String docId = request.getParameter("docId"); String query = request.getParameter("query"); String[] normFunctions = {"none"}; if (query.contains("tokenReg")) // TODO ordentlich behandeln normFunctions[0] = "reg"; else if (query.contains("tokenNorm")) // TODO ordentlich behandeln normFunctions[0] = "norm"; String[] outputOptions = {}; if (query.contains("tokenMorph")) { // TODO ordentlich behandeln outputOptions = new String[1]; outputOptions[0] = "withLemmas"; } String pageStr = request.getParameter("page"); if (pageStr == null) pageStr = "1"; int page = Integer.parseInt(pageStr); String pageSizeStr = request.getParameter("pageSize"); if (pageSizeStr == null) pageSizeStr = "10"; int pageSize = Integer.parseInt(pageSizeStr); int from = (page * pageSize) - pageSize; // e.g. 0 int to = page * pageSize - 1; // e.g. 9 String outputFormat = request.getParameter("outputFormat"); if (outputFormat == null) outputFormat = "xml"; try { IndexHandler indexHandler = IndexHandler.getInstance(); Hits hits = indexHandler.queryDocument(docId, query, from, to); MetadataRecord docMetadataRecord = indexHandler.getDocMetadata(docId); if (outputFormat.equals("xml")) response.setContentType("text/xml"); else if (outputFormat.equals("html")) response.setContentType("text/html"); else response.setContentType("text/xml"); PrintWriter out = response.getWriter(); String resultStr = ""; if (outputFormat.equals("xml")) resultStr = createXmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits); else if (outputFormat.equals("html")) resultStr = createHtmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits, request); out.print(resultStr); out.close(); } catch (ApplicationException e) { throw new ServletException(e); } } private String createXmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits) throws ApplicationException { String docId = docMetadataRecord.getDocId(); ArrayList<Document> docs = null; if (hits != null) docs = hits.getHits(); int hitsSize = -1; int docsSize = -1; if (hits != null) hitsSize = hits.getSize(); if (docs != null) docsSize = docs.size(); StringBuilder xmlStrBuilder = new StringBuilder(); xmlStrBuilder.append("<document>"); xmlStrBuilder.append("<id>" + docId + "</id>"); xmlStrBuilder.append("<query>"); xmlStrBuilder.append("<queryText>" + query + "</queryText>"); xmlStrBuilder.append("<resultPage>" + page + "</resultPage>"); xmlStrBuilder.append("<resultPageSize>" + pageSize + "</resultPageSize>"); xmlStrBuilder.append("</query>"); xmlStrBuilder.append("<hitsSize>" + hitsSize + "</hitsSize>"); xmlStrBuilder.append("<hits>"); for (int i=0; i<docsSize; i++) { Document doc = docs.get(i); int num = (page - 1) * pageSize + i + 1; xmlStrBuilder.append("<hit>"); xmlStrBuilder.append("<num>" + num + "</num>"); String pageNumber = null; Fieldable fPageNumber = doc.getFieldable("pageNumber"); if (fPageNumber != null) { pageNumber = fPageNumber.stringValue(); xmlStrBuilder.append("<pageNumber>" + pageNumber + "</pageNumber>"); } String elementPagePosition = null; Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); if (fElementPagePosition != null) { elementPagePosition = fElementPagePosition.stringValue(); xmlStrBuilder.append("<pagePosition>" + elementPagePosition + "</pagePosition>"); } String lineNumber = null; Fieldable fLineNumber = doc.getFieldable("lineNumber"); if (fLineNumber != null) { lineNumber = fLineNumber.stringValue(); xmlStrBuilder.append("<lineNumber>" + lineNumber + "</lineNumber>"); } String elementPosition = null; Fieldable fElementPosition = doc.getFieldable("elementAbsolutePosition"); if (fElementPosition != null) { elementPosition = fElementPosition.stringValue(); xmlStrBuilder.append("<absolutePosition>" + elementPosition + "</absolutePosition>"); } String xpath = null; Fieldable fXPath = doc.getFieldable("xpath"); if (fXPath != null) { xpath = fXPath.stringValue(); xmlStrBuilder.append("<xpath>" + xpath + "</xpath>"); } String xmlId = null; Fieldable fXmlId = doc.getFieldable("xmlId"); if (fXmlId != null) { xmlId = fXmlId.stringValue(); xmlStrBuilder.append("<xmlId>" + xmlId + "</xmlId>"); } String language = null; Fieldable fLanguage = doc.getFieldable("language"); if (fLanguage != null) { language = fLanguage.stringValue(); xmlStrBuilder.append("<language>" + language + "</language>"); } String xmlContentTokenized = null; Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); if (fXmlContentTokenized != null) { String highlightQueryType = "orig"; if (withLemmas(outputOptions)) { highlightQueryType = "morph"; } else if (normFunctions != null) { String normFunction = normFunctions[0]; highlightQueryType = normFunction; if (normFunction.equals("none")) { highlightQueryType = "orig"; } } xmlContentTokenized = fXmlContentTokenized.stringValue(); String xmlPre = "<content xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">"; String xmlPost = "</content>"; String xmlInputStr = xmlPre + xmlContentTokenized + xmlPost; String docLanguage = docMetadataRecord.getLanguage(); String highlightedXmlStr = highlight(xmlInputStr, highlightQueryType, query, docLanguage); if (highlightedXmlStr == null) highlightedXmlStr = "<content>" + xmlContentTokenized + "</content>"; xmlStrBuilder.append(highlightedXmlStr); } xmlStrBuilder.append("</hit>"); } xmlStrBuilder.append("</hits>"); xmlStrBuilder.append("</document>"); return xmlStrBuilder.toString(); } private String createHtmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits, HttpServletRequest request) throws ApplicationException { String docId = docMetadataRecord.getDocId(); ArrayList<Document> docs = null; if (hits != null) docs = hits.getHits(); int hitsSize = -1; int docsSize = -1; if (hits != null) hitsSize = hits.getSize(); if (docs != null) docsSize = docs.size(); String highlightQueryType = "orig"; String normalizationStr = ""; String highlightQueryTypeStr = ""; if (withLemmas(outputOptions)) { highlightQueryTypeStr = "&highlightQueryType=norm"; highlightQueryType = "norm"; } else if (normFunctions != null) { String normFunction = normFunctions[0]; normalizationStr = "&normalization=" + normFunction; highlightQueryType = normFunction; if (normFunction.equals("none")) { normalizationStr = "&normalization=" + "orig"; highlightQueryType = "orig"; } } StringBuilder xmlStrBuilder = new StringBuilder(); xmlStrBuilder.append("<html>"); xmlStrBuilder.append("<head>"); xmlStrBuilder.append("<title>Document: \"" + docId + " " + query + "\"</title>"); String baseUrl = getBaseUrl(request); String cssUrl = baseUrl + "/css/page.css"; String cssShowWordFileName = "pageOrig.css"; if (highlightQueryType.equals("reg")) cssShowWordFileName = "pageReg.css"; else if (highlightQueryType.equals("norm")) cssShowWordFileName = "pageNorm.css"; String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName; xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + showWordCssUrl + "\"/>"); xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + cssUrl + "\"/>"); xmlStrBuilder.append("</head>"); xmlStrBuilder.append("<body>"); xmlStrBuilder.append("<span class=\"about\">[<span class=\"it\">This is a MPIWG CMS technology service</span>] <a href=\"/mpiwg-mpdl-cms-web/index.html\"><img src=\"/mpiwg-mpdl-cms-web/images/info.png\" valign=\"bottom\" width=\"15\" height=\"15\" border=\"0\" alt=\"MPIWG CMS service\"/></a></span>"); xmlStrBuilder.append("<span class=\"query\">Query: " + query + "</span>"); xmlStrBuilder.append("<span class=\"result\">"); xmlStrBuilder.append("<span class=\"resultPage\">" + page + "</span>"); xmlStrBuilder.append("<span class=\"resultPageSize\">" + pageSize + "</span>"); xmlStrBuilder.append("<span class=\"hitsSize\">" + hitsSize + "</span>"); xmlStrBuilder.append("</span>"); xmlStrBuilder.append("<table>"); for (int i=0; i<docsSize; i++) { xmlStrBuilder.append("<tr class=\"hit\">"); Document doc = docs.get(i); int num = (page - 1) * pageSize + i + 1; xmlStrBuilder.append("<td class=\"hitNum\">" + num + ". " + "</td>"); xmlStrBuilder.append("<td class=\"hitLink\">"); String posStr = ""; String pageNumber = ""; Fieldable fPageNumber = doc.getFieldable("pageNumber"); if (fPageNumber != null) { pageNumber = fPageNumber.stringValue(); posStr = posStr + "Page " + pageNumber + ", "; } String elementName = null; String presElementName = ""; Fieldable fElementName = doc.getFieldable("elementName"); if (fElementName != null) { elementName = fElementName.stringValue(); presElementName = getPresentationName(elementName); } String elementPagePosition = ""; Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); if (fElementPagePosition != null) { elementPagePosition = fElementPagePosition.stringValue(); posStr = posStr + presElementName + " " + elementPagePosition + ":"; } String language = docMetadataRecord.getLanguage(); String getPageLink = baseUrl + "/query/GetPage?docId=" + docId + "&page=" + pageNumber + normalizationStr + "&highlightElem=" + elementName + "&highlightElemPos=" + elementPagePosition + highlightQueryTypeStr + "&highlightQuery=" + query + "&language=" + language; xmlStrBuilder.append("<a href=\"" + getPageLink + "\">" + posStr + "</a>"); xmlStrBuilder.append("</td>"); String xmlContentTokenized = null; Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); if (fXmlContentTokenized != null) { xmlContentTokenized = fXmlContentTokenized.stringValue(); String highlightedXmlStr = highlight(xmlContentTokenized, highlightQueryType, query, language); String highlightHtmlStr = pageTransformer.transform(highlightedXmlStr, docMetadataRecord, -1, "html"); // TODO performance: do not highlight each single node but highlight them all in one step xmlStrBuilder.append("<td class=\"hitContent\">"); xmlStrBuilder.append(highlightHtmlStr); xmlStrBuilder.append("</td>"); } xmlStrBuilder.append("</tr>"); } xmlStrBuilder.append("</table>"); xmlStrBuilder.append("</body>"); xmlStrBuilder.append("</html>"); return xmlStrBuilder.toString(); } private String highlight(String xmlStr, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { String result = null; try { HighlightContentHandler highlightContentHandler = new HighlightContentHandler(null, -1, highlightQueryType, highlightQuery, language); highlightContentHandler.setFirstPageBreakReachedMode(true); XMLReader xmlParser = new SAXParser(); xmlParser.setContentHandler(highlightContentHandler); StringReader stringReader = new StringReader(xmlStr); InputSource inputSource = new InputSource(stringReader); xmlParser.parse(inputSource); result = highlightContentHandler.getResult().toString(); } catch (SAXException e) { throw new ApplicationException(e); } catch (IOException e) { throw new ApplicationException(e); } return result; } private String getPresentationName(String elemName) { String retStr = null; if (elemName != null) { if (elemName.equals("s")) { retStr = "Sentence"; } else { // first char to uppercase char[] stringArray = elemName.toCharArray(); stringArray[0] = Character.toUpperCase(stringArray[0]); retStr = new String(stringArray); } } return retStr; } private String getBaseUrl(HttpServletRequest request) { return getServerUrl(request) + request.getContextPath(); } private String getServerUrl(HttpServletRequest request) { if ( ( request.getServerPort() == 80 ) || ( request.getServerPort() == 443 ) ) return request.getScheme() + "://" + request.getServerName(); else return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort(); } private boolean withLemmas(String[] outputOptions) { boolean result = false; for (int i=0; i< outputOptions.length; i++) { String function = outputOptions[i]; if (function.equals("withLemmas")) return true; } return result; } }