Mercurial > hg > fulltextSearchServer
view src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java @ 0:db87c1b7eb6d
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:18:46 +0100 |
parents | |
children | 83e9a828e794 |
line wrap: on
line source
package de.mpiwg.dwinter.fulltextSearchServer; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; import java.io.Writer; import java.net.URLDecoder; import java.nio.channels.ReadableByteChannel; import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.Version; import org.restlet.data.Form; import org.restlet.data.MediaType; import org.restlet.data.Parameter; import org.restlet.data.Status; import org.restlet.representation.Representation; import org.restlet.representation.StringRepresentation; import org.restlet.resource.Get; import org.restlet.resource.Options; import org.restlet.resource.ServerResource; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines; import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager; import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools; import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools; public class SearchLines extends ServerResource { /** * Erlaubt cross scripting bei Aufruf aus Javascript * * @param entity */ private Logger logger = Logger.getRootLogger(); private String cleanedPath; @Options public void doOptions(Representation entity) { Form responseHeaders = (Form) getResponse().getAttributes().get( "org.restlet.http.headers"); if (responseHeaders == null) { responseHeaders = new Form(); getResponse().getAttributes().put("org.restlet.http.headers", responseHeaders); } responseHeaders.add("Access-Control-Allow-Origin", "*"); responseHeaders.add("Access-Control-Allow-Methods", "POST,OPTIONS,GET"); responseHeaders.add("Access-Control-Allow-Headers", "Content-Type"); responseHeaders.add("Access-Control-Allow-Credentials", "false"); responseHeaders.add("Access-Control-Max-Age", "60"); } // @Get("xml") public Representation getXML() throws IOException, ParseException { return new StringRepresentation(getHits(), MediaType.TEXT_XML); } @Get("html") public Representation getHTML() throws TransformerFactoryConfigurationError, IOException, ParseException, XPathExpressionException { // response header fuer cross-site.scripting Form responseHeaders = (Form) getResponse().getAttributes().get( "org.restlet.http.headers"); if (responseHeaders == null) { responseHeaders = new Form(); getResponse().getAttributes().put("org.restlet.http.headers", responseHeaders); } responseHeaders.add("Access-Control-Allow-Origin", "*"); // String txt = // XMLTools.transformToHTML(getHits(),"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); String txt = getHits(); if (getStatus().isError()) return new StringRepresentation(txt, MediaType.TEXT_HTML); String ret = ""; // ret+="<pageFileName>"+pageFileName+"</pageFileName>"; //ArrayList<Double[]> points = new ArrayList<Double[]>(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); dbf.setValidating(false); DocumentBuilder db; try { db = dbf.newDocumentBuilder(); } catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } // db.setEntityResolver(new MyResolver()); Document doc; try { // stream = new StringInputStream(xml,"utf-8"); ByteArrayInputStream stream = new ByteArrayInputStream( txt.getBytes("utf-8")); doc = db.parse(stream); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } XPath xpath = XPathFactory.newInstance().newXPath(); // XPath Query for showing all nodes value XPathExpression expr = xpath.compile("//page"); XPathExpression line = xpath.compile("line"); XPathExpression name = xpath.compile("name"); Object result = expr.evaluate(doc, XPathConstants.NODESET); NodeList pages = (NodeList) result; for (int i = 0; i < pages.getLength(); i++) { NodeList names = (NodeList) name.evaluate(pages.item(i), XPathConstants.NODESET); String pathName = ""; if (names.getLength() == 1) { Node obj = names.item(0); pathName = obj.getTextContent(); } NodeList lines = (NodeList) line.evaluate(pages.item(i), XPathConstants.NODESET); ArrayList<Double[]> points = new ArrayList<Double[]>(); for (int l = 0; l < lines.getLength(); l++) { Double[] point = DigilibTools.calculatePoint(lines.item(l) .getTextContent()); points.add(point); } // Pattern linePattern = // Pattern.compile("<line>(.*?)</line>",Pattern.MULTILINE); // Matcher m = linePattern.matcher(txt); // while(m.find()){ // Double[] point = DigilibTools.calculatePoint(m.group(1)); // points.add(point); // } String textId = (String) getRequest().getAttributes().get("textId"); String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, DigilibTools.generateImagePath(textId, pathName), DigilibTools.generateMarksFromPoints(points)); ret += String.format( "<div class=\"hitsOnPage\"><a href=\"%s\">%s</a></div>", url, pathName); } ret += ""; // return ret; return new StringRepresentation(ret, MediaType.TEXT_HTML); } protected String getHits() throws IOException, ParseException { String textId = (String) getRequest().getAttributes().get("textId"); String queryString = (String) getRequest().getAttributes().get( "queryString"); String lang = (String) getRequest().getAttributes().get("lang"); Properties defaultProperties = ConfigurationManager.getConfig(); File lineDir = new File(defaultProperties.getProperty("lineIndex")); File docIndex = new File(defaultProperties.getProperty("docIndex")); Boolean parse=true; // im regelfall spll der Querystring noch geparsed werden Form form = getRequest().getResourceRef().getQueryAsForm(); // moeglicher parameter "parse" if "false" dann kein parsing des query strings for (Parameter parameter : form) { String name =parameter.getName(); if (name.equals("parse")){ String parserQuestion = parameter.getValue(); if (parserQuestion.equals("false")) parse=false; } } FulltextSearchDocsLines searcher = new FulltextSearchDocsLines( docIndex, lineDir); LanguageSearcher ls = searcher.languageSearchers .getSearcherByLanguage(lang); if (ls == null) { setStatus(Status.CLIENT_ERROR_NOT_FOUND); return "<error>Language Not Found</error>"; } Analyzer analyzer = searcher.languageSearchers .getSearcherByLanguage(lang).analyzer; QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", analyzer); queryString = URLDecoder.decode(queryString, "utf-8"); logger.debug(queryString); Query query; if (parse){ query = parser.parse(queryString); } else { String[] splitted = queryString.split(":"); String qs; if (splitted.length>1) qs = splitted[1]; else qs = splitted[0]; Term term = new Term("contents",qs); query = new TermQuery(term); } textId = textId.replace(":", "/"); // esetze pfad trenner TODO statt // pfadtrenner ersetzen besser // urlencode auch in den anderen // klassen OCRDoc result = searcher.searchInLinesDoc(textId, query, lang); cleanedPath = result.document.get("cleanedPath") + "</cleanedPath>"; String ret = "<xml>"; ret += "<docId>" + textId + "</docId>"; ret += "<cleanedPath>" + result.document.get("cleanedPath") + "</cleanedPath>"; if (result.linesInPage != null) { for (String page : result.linesInPage.keySet()) { ret += "<page><name>" + page + "</name>"; for (OCRLine line : result.linesInPage.get(page)) { ret += "<line>" + line.toString() + "</line>"; } ret += "</page>"; } } ret += "</xml>"; return ret; } }