comparison src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java @ 0:db87c1b7eb6d

initial
author dwinter
date Wed, 03 Nov 2010 12:18:46 +0100
parents
children 83e9a828e794
comparison
equal deleted inserted replaced
-1:000000000000 0:db87c1b7eb6d
1 package de.mpiwg.dwinter.fulltextSearchServer;
2
3 import java.io.ByteArrayInputStream;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.InputStream;
7 import java.io.OutputStream;
8 import java.io.Reader;
9 import java.io.Writer;
10 import java.net.URLDecoder;
11 import java.nio.channels.ReadableByteChannel;
12 import java.nio.channels.WritableByteChannel;
13 import java.util.ArrayList;
14 import java.util.Properties;
15 import java.util.regex.Matcher;
16 import java.util.regex.Pattern;
17
18 import javax.xml.parsers.DocumentBuilder;
19 import javax.xml.parsers.DocumentBuilderFactory;
20 import javax.xml.parsers.ParserConfigurationException;
21 import javax.xml.transform.TransformerFactoryConfigurationError;
22 import javax.xml.xpath.XPath;
23 import javax.xml.xpath.XPathConstants;
24 import javax.xml.xpath.XPathExpression;
25 import javax.xml.xpath.XPathExpressionException;
26 import javax.xml.xpath.XPathFactory;
27
28 import org.apache.log4j.Logger;
29 import org.apache.lucene.analysis.Analyzer;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.queryParser.ParseException;
32 import org.apache.lucene.queryParser.QueryParser;
33 import org.apache.lucene.search.Query;
34 import org.apache.lucene.search.TermQuery;
35 import org.apache.lucene.util.Version;
36 import org.restlet.data.Form;
37 import org.restlet.data.MediaType;
38 import org.restlet.data.Parameter;
39 import org.restlet.data.Status;
40 import org.restlet.representation.Representation;
41 import org.restlet.representation.StringRepresentation;
42 import org.restlet.resource.Get;
43 import org.restlet.resource.Options;
44 import org.restlet.resource.ServerResource;
45 import org.w3c.dom.Document;
46 import org.w3c.dom.Node;
47 import org.w3c.dom.NodeList;
48 import org.xml.sax.SAXException;
49
50 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
51 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
52 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
53 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
54 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
55 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
56 import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
57 import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools;
58
59 public class SearchLines extends ServerResource {
60
61 /**
62 * Erlaubt cross scripting bei Aufruf aus Javascript
63 *
64 * @param entity
65 */
66
67 private Logger logger = Logger.getRootLogger();
68 private String cleanedPath;
69
70 @Options
71 public void doOptions(Representation entity) {
72 Form responseHeaders = (Form) getResponse().getAttributes().get(
73 "org.restlet.http.headers");
74 if (responseHeaders == null) {
75 responseHeaders = new Form();
76 getResponse().getAttributes().put("org.restlet.http.headers",
77 responseHeaders);
78 }
79 responseHeaders.add("Access-Control-Allow-Origin", "*");
80 responseHeaders.add("Access-Control-Allow-Methods", "POST,OPTIONS,GET");
81 responseHeaders.add("Access-Control-Allow-Headers", "Content-Type");
82 responseHeaders.add("Access-Control-Allow-Credentials", "false");
83 responseHeaders.add("Access-Control-Max-Age", "60");
84 }
85
86 // @Get("xml")
87 public Representation getXML() throws IOException, ParseException {
88
89 return new StringRepresentation(getHits(), MediaType.TEXT_XML);
90
91 }
92
93 @Get("html")
94 public Representation getHTML()
95 throws TransformerFactoryConfigurationError, IOException,
96 ParseException, XPathExpressionException {
97 // response header fuer cross-site.scripting
98 Form responseHeaders = (Form) getResponse().getAttributes().get(
99 "org.restlet.http.headers");
100 if (responseHeaders == null) {
101 responseHeaders = new Form();
102 getResponse().getAttributes().put("org.restlet.http.headers",
103 responseHeaders);
104 }
105 responseHeaders.add("Access-Control-Allow-Origin", "*");
106
107 // String txt =
108 // XMLTools.transformToHTML(getHits(),"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
109 String txt = getHits();
110 if (getStatus().isError())
111 return new StringRepresentation(txt, MediaType.TEXT_HTML);
112 String ret = "";
113 // ret+="<pageFileName>"+pageFileName+"</pageFileName>";
114 //ArrayList<Double[]> points = new ArrayList<Double[]>();
115
116 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
117 dbf.setNamespaceAware(true);
118 dbf.setValidating(false);
119 DocumentBuilder db;
120 try {
121 db = dbf.newDocumentBuilder();
122 } catch (ParserConfigurationException e) {
123 // TODO Auto-generated catch block
124 e.printStackTrace();
125 return null;
126 }
127 // db.setEntityResolver(new MyResolver());
128
129 Document doc;
130 try {
131 // stream = new StringInputStream(xml,"utf-8");
132
133 ByteArrayInputStream stream = new ByteArrayInputStream(
134 txt.getBytes("utf-8"));
135 doc = db.parse(stream);
136
137 } catch (SAXException e) {
138 // TODO Auto-generated catch block
139 e.printStackTrace();
140 return null;
141 } catch (IOException e) {
142 // TODO Auto-generated catch block
143 e.printStackTrace();
144 return null;
145 }
146
147 XPath xpath = XPathFactory.newInstance().newXPath();
148 // XPath Query for showing all nodes value
149 XPathExpression expr = xpath.compile("//page");
150 XPathExpression line = xpath.compile("line");
151 XPathExpression name = xpath.compile("name");
152
153 Object result = expr.evaluate(doc, XPathConstants.NODESET);
154 NodeList pages = (NodeList) result;
155 for (int i = 0; i < pages.getLength(); i++) {
156 NodeList names = (NodeList) name.evaluate(pages.item(i),
157 XPathConstants.NODESET);
158 String pathName = "";
159
160 if (names.getLength() == 1) {
161 Node obj = names.item(0);
162 pathName = obj.getTextContent();
163 }
164 NodeList lines = (NodeList) line.evaluate(pages.item(i),
165 XPathConstants.NODESET);
166 ArrayList<Double[]> points = new ArrayList<Double[]>();
167 for (int l = 0; l < lines.getLength(); l++) {
168 Double[] point = DigilibTools.calculatePoint(lines.item(l)
169 .getTextContent());
170 points.add(point);
171 }
172
173 // Pattern linePattern =
174 // Pattern.compile("<line>(.*?)</line>",Pattern.MULTILINE);
175 // Matcher m = linePattern.matcher(txt);
176
177 // while(m.find()){
178 // Double[] point = DigilibTools.calculatePoint(m.group(1));
179 // points.add(point);
180 // }
181
182 String textId = (String) getRequest().getAttributes().get("textId");
183
184 String url = String.format(DigilibTools.DIGIVIEWBASICSTRING,
185 DigilibTools.generateImagePath(textId, pathName),
186 DigilibTools.generateMarksFromPoints(points));
187
188 ret += String.format(
189 "<div class=\"hitsOnPage\"><a href=\"%s\">%s</a></div>",
190 url, pathName);
191 }
192 ret += "";
193
194 // return ret;
195
196 return new StringRepresentation(ret, MediaType.TEXT_HTML);
197 }
198
199 protected String getHits() throws IOException, ParseException {
200 String textId = (String) getRequest().getAttributes().get("textId");
201 String queryString = (String) getRequest().getAttributes().get(
202 "queryString");
203 String lang = (String) getRequest().getAttributes().get("lang");
204
205 Properties defaultProperties = ConfigurationManager.getConfig();
206
207 File lineDir = new File(defaultProperties.getProperty("lineIndex"));
208 File docIndex = new File(defaultProperties.getProperty("docIndex"));
209
210
211 Boolean parse=true; // im regelfall spll der Querystring noch geparsed werden
212
213 Form form = getRequest().getResourceRef().getQueryAsForm(); // moeglicher parameter "parse" if "false" dann kein parsing des query strings
214 for (Parameter parameter : form) {
215 String name =parameter.getName();
216 if (name.equals("parse")){
217 String parserQuestion = parameter.getValue();
218 if (parserQuestion.equals("false"))
219 parse=false;
220 }
221 }
222 FulltextSearchDocsLines searcher = new FulltextSearchDocsLines(
223 docIndex, lineDir);
224
225 LanguageSearcher ls = searcher.languageSearchers
226 .getSearcherByLanguage(lang);
227 if (ls == null) {
228 setStatus(Status.CLIENT_ERROR_NOT_FOUND);
229 return "<error>Language Not Found</error>";
230 }
231 Analyzer analyzer = searcher.languageSearchers
232 .getSearcherByLanguage(lang).analyzer;
233
234 QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
235 analyzer);
236 queryString = URLDecoder.decode(queryString, "utf-8");
237 logger.debug(queryString);
238 Query query;
239 if (parse){
240 query = parser.parse(queryString);
241 } else {
242 String[] splitted = queryString.split(":");
243 String qs;
244 if (splitted.length>1)
245 qs = splitted[1];
246 else
247 qs = splitted[0];
248
249 Term term = new Term("contents",qs);
250 query = new TermQuery(term);
251 }
252
253 textId = textId.replace(":", "/"); // esetze pfad trenner TODO statt
254 // pfadtrenner ersetzen besser
255 // urlencode auch in den anderen
256 // klassen
257
258 OCRDoc result = searcher.searchInLinesDoc(textId, query, lang);
259
260 cleanedPath = result.document.get("cleanedPath") + "</cleanedPath>";
261
262 String ret = "<xml>";
263 ret += "<docId>" + textId + "</docId>";
264 ret += "<cleanedPath>" + result.document.get("cleanedPath")
265 + "</cleanedPath>";
266
267 if (result.linesInPage != null) {
268
269 for (String page : result.linesInPage.keySet()) {
270 ret += "<page><name>" + page + "</name>";
271 for (OCRLine line : result.linesInPage.get(page)) {
272 ret += "<line>" + line.toString() + "</line>";
273 }
274 ret += "</page>";
275 }
276 }
277 ret += "</xml>";
278 return ret;
279 }
280 }