Mercurial > hg > mpdl-group
diff software/mpdl-services/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java @ 23:e845310098ba
diverse Korrekturen
author | Josef Willenborg <jwillenborg@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 27 Nov 2012 12:35:19 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/software/mpdl-services/mpiwg-mpdl-cms-web/src/de/mpg/mpiwg/berlin/mpdl/servlets/cms/QueryDocument.java Tue Nov 27 12:35:19 2012 +0100 @@ -0,0 +1,350 @@ +package de.mpg.mpiwg.berlin.mpdl.servlets.cms; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringReader; +import java.util.ArrayList; + +import javax.servlet.ServletConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.lucene.document.Fieldable; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import com.sun.org.apache.xerces.internal.parsers.SAXParser; + +import de.mpg.mpiwg.berlin.mpdl.cms.document.Document; +import de.mpg.mpiwg.berlin.mpdl.cms.document.Hits; +import de.mpg.mpiwg.berlin.mpdl.cms.document.MetadataRecord; +import de.mpg.mpiwg.berlin.mpdl.cms.lucene.IndexHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.HighlightContentHandler; +import de.mpg.mpiwg.berlin.mpdl.cms.transform.PageTransformer; +import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; + +public class QueryDocument extends HttpServlet { + private static final long serialVersionUID = 1L; + private PageTransformer pageTransformer = null; + + public QueryDocument() { + super(); + } + + public void init(ServletConfig config) throws ServletException { + super.init(config); + ServletContext context = getServletContext(); + pageTransformer = (PageTransformer) context.getAttribute("pageTransformer"); + } + + protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + doGet(request, response); + } + + protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { + request.setCharacterEncoding("utf-8"); + response.setCharacterEncoding("utf-8"); + String docId = request.getParameter("docId"); + String query = request.getParameter("query"); + String[] normFunctions = {"none"}; + if (query.contains("tokenReg")) // TODO ordentlich behandeln + normFunctions[0] = "reg"; + else if (query.contains("tokenNorm")) // TODO ordentlich behandeln + normFunctions[0] = "norm"; + String[] outputOptions = {}; + if (query.contains("tokenMorph")) { // TODO ordentlich behandeln + outputOptions = new String[1]; + outputOptions[0] = "withLemmas"; + } + String pageStr = request.getParameter("page"); + if (pageStr == null) + pageStr = "1"; + int page = Integer.parseInt(pageStr); + String pageSizeStr = request.getParameter("pageSize"); + if (pageSizeStr == null) + pageSizeStr = "10"; + int pageSize = Integer.parseInt(pageSizeStr); + int from = (page * pageSize) - pageSize; // e.g. 0 + int to = page * pageSize - 1; // e.g. 9 + String outputFormat = request.getParameter("outputFormat"); + if (outputFormat == null) + outputFormat = "xml"; + try { + IndexHandler indexHandler = IndexHandler.getInstance(); + Hits hits = indexHandler.queryDocument(docId, query, from, to); + MetadataRecord docMetadataRecord = indexHandler.getDocMetadata(docId); + if (outputFormat.equals("xml")) + response.setContentType("text/xml"); + else if (outputFormat.equals("html")) + response.setContentType("text/html"); + else + response.setContentType("text/xml"); + PrintWriter out = response.getWriter(); + String resultStr = ""; + if (outputFormat.equals("xml")) + resultStr = createXmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits); + else if (outputFormat.equals("html")) + resultStr = createHtmlString(docMetadataRecord, query, page, pageSize, normFunctions, outputOptions, hits, request); + out.print(resultStr); + out.close(); + } catch (ApplicationException e) { + throw new ServletException(e); + } + } + + private String createXmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits) throws ApplicationException { + String docId = docMetadataRecord.getDocId(); + ArrayList<Document> docs = null; + if (hits != null) + docs = hits.getHits(); + int hitsSize = -1; + int docsSize = -1; + if (hits != null) + hitsSize = hits.getSize(); + if (docs != null) + docsSize = docs.size(); + StringBuilder xmlStrBuilder = new StringBuilder(); + xmlStrBuilder.append("<document>"); + xmlStrBuilder.append("<id>" + docId + "</id>"); + xmlStrBuilder.append("<query>"); + xmlStrBuilder.append("<queryText>" + query + "</queryText>"); + xmlStrBuilder.append("<resultPage>" + page + "</resultPage>"); + xmlStrBuilder.append("<resultPageSize>" + pageSize + "</resultPageSize>"); + xmlStrBuilder.append("</query>"); + xmlStrBuilder.append("<hitsSize>" + hitsSize + "</hitsSize>"); + xmlStrBuilder.append("<hits>"); + for (int i=0; i<docsSize; i++) { + Document doc = docs.get(i); + int num = (page - 1) * pageSize + i + 1; + xmlStrBuilder.append("<hit>"); + xmlStrBuilder.append("<num>" + num + "</num>"); + String pageNumber = null; + Fieldable fPageNumber = doc.getFieldable("pageNumber"); + if (fPageNumber != null) { + pageNumber = fPageNumber.stringValue(); + xmlStrBuilder.append("<pageNumber>" + pageNumber + "</pageNumber>"); + } + String elementPagePosition = null; + Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); + if (fElementPagePosition != null) { + elementPagePosition = fElementPagePosition.stringValue(); + xmlStrBuilder.append("<pagePosition>" + elementPagePosition + "</pagePosition>"); + } + String lineNumber = null; + Fieldable fLineNumber = doc.getFieldable("lineNumber"); + if (fLineNumber != null) { + lineNumber = fLineNumber.stringValue(); + xmlStrBuilder.append("<lineNumber>" + lineNumber + "</lineNumber>"); + } + String elementPosition = null; + Fieldable fElementPosition = doc.getFieldable("elementAbsolutePosition"); + if (fElementPosition != null) { + elementPosition = fElementPosition.stringValue(); + xmlStrBuilder.append("<absolutePosition>" + elementPosition + "</absolutePosition>"); + } + String xpath = null; + Fieldable fXPath = doc.getFieldable("xpath"); + if (fXPath != null) { + xpath = fXPath.stringValue(); + xmlStrBuilder.append("<xpath>" + xpath + "</xpath>"); + } + String xmlId = null; + Fieldable fXmlId = doc.getFieldable("xmlId"); + if (fXmlId != null) { + xmlId = fXmlId.stringValue(); + xmlStrBuilder.append("<xmlId>" + xmlId + "</xmlId>"); + } + String language = null; + Fieldable fLanguage = doc.getFieldable("language"); + if (fLanguage != null) { + language = fLanguage.stringValue(); + xmlStrBuilder.append("<language>" + language + "</language>"); + } + String xmlContentTokenized = null; + Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); + if (fXmlContentTokenized != null) { + String highlightQueryType = "orig"; + if (withLemmas(outputOptions)) { + highlightQueryType = "morph"; + } else if (normFunctions != null) { + String normFunction = normFunctions[0]; + highlightQueryType = normFunction; + if (normFunction.equals("none")) { + highlightQueryType = "orig"; + } + } + xmlContentTokenized = fXmlContentTokenized.stringValue(); + String xmlPre = "<content xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">"; + String xmlPost = "</content>"; + String xmlInputStr = xmlPre + xmlContentTokenized + xmlPost; + String docLanguage = docMetadataRecord.getLanguage(); + String highlightedXmlStr = highlight(xmlInputStr, highlightQueryType, query, docLanguage); + if (highlightedXmlStr == null) + highlightedXmlStr = "<content>" + xmlContentTokenized + "</content>"; + xmlStrBuilder.append(highlightedXmlStr); + } + xmlStrBuilder.append("</hit>"); + } + xmlStrBuilder.append("</hits>"); + xmlStrBuilder.append("</document>"); + return xmlStrBuilder.toString(); + } + + private String createHtmlString(MetadataRecord docMetadataRecord, String query, int page, int pageSize, String[] normFunctions, String[] outputOptions, Hits hits, HttpServletRequest request) throws ApplicationException { + String docId = docMetadataRecord.getDocId(); + ArrayList<Document> docs = null; + if (hits != null) + docs = hits.getHits(); + int hitsSize = -1; + int docsSize = -1; + if (hits != null) + hitsSize = hits.getSize(); + if (docs != null) + docsSize = docs.size(); + String highlightQueryType = "orig"; + String normalizationStr = ""; + String highlightQueryTypeStr = ""; + if (withLemmas(outputOptions)) { + highlightQueryTypeStr = "&highlightQueryType=norm"; + highlightQueryType = "norm"; + } else if (normFunctions != null) { + String normFunction = normFunctions[0]; + normalizationStr = "&normalization=" + normFunction; + highlightQueryType = normFunction; + if (normFunction.equals("none")) { + normalizationStr = "&normalization=" + "orig"; + highlightQueryType = "orig"; + } + } + StringBuilder xmlStrBuilder = new StringBuilder(); + xmlStrBuilder.append("<html>"); + xmlStrBuilder.append("<head>"); + xmlStrBuilder.append("<title>Document: \"" + docId + " " + query + "\"</title>"); + String baseUrl = getBaseUrl(request); + String cssUrl = baseUrl + "/css/page.css"; + String cssShowWordFileName = "pageOrig.css"; + if (highlightQueryType.equals("reg")) + cssShowWordFileName = "pageReg.css"; + else if (highlightQueryType.equals("norm")) + cssShowWordFileName = "pageNorm.css"; + String showWordCssUrl = baseUrl + "/css/" + cssShowWordFileName; + xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + showWordCssUrl + "\"/>"); + xmlStrBuilder.append("<link rel=\"stylesheet\" type=\"text/css\" href=\"" + cssUrl + "\"/>"); + xmlStrBuilder.append("</head>"); + xmlStrBuilder.append("<body>"); + xmlStrBuilder.append("<span class=\"about\">[<span class=\"it\">This is a MPIWG CMS technology service</span>] <a href=\"/mpiwg-mpdl-cms-web/index.html\"><img src=\"/mpiwg-mpdl-cms-web/images/info.png\" valign=\"bottom\" width=\"15\" height=\"15\" border=\"0\" alt=\"MPIWG CMS service\"/></a></span>"); + xmlStrBuilder.append("<span class=\"query\">Query: " + query + "</span>"); + xmlStrBuilder.append("<span class=\"result\">"); + xmlStrBuilder.append("<span class=\"resultPage\">" + page + "</span>"); + xmlStrBuilder.append("<span class=\"resultPageSize\">" + pageSize + "</span>"); + xmlStrBuilder.append("<span class=\"hitsSize\">" + hitsSize + "</span>"); + xmlStrBuilder.append("</span>"); + xmlStrBuilder.append("<table>"); + for (int i=0; i<docsSize; i++) { + xmlStrBuilder.append("<tr class=\"hit\">"); + Document doc = docs.get(i); + int num = (page - 1) * pageSize + i + 1; + xmlStrBuilder.append("<td class=\"hitNum\">" + num + ". " + "</td>"); + xmlStrBuilder.append("<td class=\"hitLink\">"); + String posStr = ""; + String pageNumber = ""; + Fieldable fPageNumber = doc.getFieldable("pageNumber"); + if (fPageNumber != null) { + pageNumber = fPageNumber.stringValue(); + posStr = posStr + "Page " + pageNumber + ", "; + } + String elementName = null; + String presElementName = ""; + Fieldable fElementName = doc.getFieldable("elementName"); + if (fElementName != null) { + elementName = fElementName.stringValue(); + presElementName = getPresentationName(elementName); + } + String elementPagePosition = ""; + Fieldable fElementPagePosition = doc.getFieldable("elementPagePosition"); + if (fElementPagePosition != null) { + elementPagePosition = fElementPagePosition.stringValue(); + posStr = posStr + presElementName + " " + elementPagePosition + ":"; + } + String language = docMetadataRecord.getLanguage(); + String getPageLink = baseUrl + "/query/GetPage?docId=" + docId + "&page=" + pageNumber + normalizationStr + "&highlightElem=" + elementName + "&highlightElemPos=" + elementPagePosition + highlightQueryTypeStr + "&highlightQuery=" + query + "&language=" + language; + xmlStrBuilder.append("<a href=\"" + getPageLink + "\">" + posStr + "</a>"); + xmlStrBuilder.append("</td>"); + String xmlContentTokenized = null; + Fieldable fXmlContentTokenized = doc.getFieldable("xmlContentTokenized"); + if (fXmlContentTokenized != null) { + xmlContentTokenized = fXmlContentTokenized.stringValue(); + String highlightedXmlStr = highlight(xmlContentTokenized, highlightQueryType, query, language); + String highlightHtmlStr = pageTransformer.transform(highlightedXmlStr, docMetadataRecord, -1, "html"); // TODO performance: do not highlight each single node but highlight them all in one step + xmlStrBuilder.append("<td class=\"hitContent\">"); + xmlStrBuilder.append(highlightHtmlStr); + xmlStrBuilder.append("</td>"); + } + xmlStrBuilder.append("</tr>"); + } + xmlStrBuilder.append("</table>"); + xmlStrBuilder.append("</body>"); + xmlStrBuilder.append("</html>"); + return xmlStrBuilder.toString(); + } + + private String highlight(String xmlStr, String highlightQueryType, String highlightQuery, String language) throws ApplicationException { + String result = null; + try { + HighlightContentHandler highlightContentHandler = new HighlightContentHandler(null, -1, highlightQueryType, highlightQuery, language); + highlightContentHandler.setFirstPageBreakReachedMode(true); + XMLReader xmlParser = new SAXParser(); + xmlParser.setContentHandler(highlightContentHandler); + StringReader stringReader = new StringReader(xmlStr); + InputSource inputSource = new InputSource(stringReader); + xmlParser.parse(inputSource); + result = highlightContentHandler.getResult().toString(); + } catch (SAXException e) { + throw new ApplicationException(e); + } catch (IOException e) { + throw new ApplicationException(e); + } + return result; + } + + private String getPresentationName(String elemName) { + String retStr = null; + if (elemName != null) { + if (elemName.equals("s")) { + retStr = "Sentence"; + } else { + // first char to uppercase + char[] stringArray = elemName.toCharArray(); + stringArray[0] = Character.toUpperCase(stringArray[0]); + retStr = new String(stringArray); + } + } + return retStr; + } + + private String getBaseUrl(HttpServletRequest request) { + return getServerUrl(request) + request.getContextPath(); + } + + private String getServerUrl(HttpServletRequest request) { + if ( ( request.getServerPort() == 80 ) || ( request.getServerPort() == 443 ) ) + return request.getScheme() + "://" + request.getServerName(); + else + return request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort(); + } + + private boolean withLemmas(String[] outputOptions) { + boolean result = false; + for (int i=0; i< outputOptions.length; i++) { + String function = outputOptions[i]; + if (function.equals("withLemmas")) + return true; + } + return result; + } + +}