Mercurial > hg > fulltextSearchServer
comparison src/de/mpiwg/dwinter/fulltextSearchServer/SearchLines.java @ 0:db87c1b7eb6d
initial
author | dwinter |
---|---|
date | Wed, 03 Nov 2010 12:18:46 +0100 |
parents | |
children | 83e9a828e794 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:db87c1b7eb6d |
---|---|
1 package de.mpiwg.dwinter.fulltextSearchServer; | |
2 | |
3 import java.io.ByteArrayInputStream; | |
4 import java.io.File; | |
5 import java.io.IOException; | |
6 import java.io.InputStream; | |
7 import java.io.OutputStream; | |
8 import java.io.Reader; | |
9 import java.io.Writer; | |
10 import java.net.URLDecoder; | |
11 import java.nio.channels.ReadableByteChannel; | |
12 import java.nio.channels.WritableByteChannel; | |
13 import java.util.ArrayList; | |
14 import java.util.Properties; | |
15 import java.util.regex.Matcher; | |
16 import java.util.regex.Pattern; | |
17 | |
18 import javax.xml.parsers.DocumentBuilder; | |
19 import javax.xml.parsers.DocumentBuilderFactory; | |
20 import javax.xml.parsers.ParserConfigurationException; | |
21 import javax.xml.transform.TransformerFactoryConfigurationError; | |
22 import javax.xml.xpath.XPath; | |
23 import javax.xml.xpath.XPathConstants; | |
24 import javax.xml.xpath.XPathExpression; | |
25 import javax.xml.xpath.XPathExpressionException; | |
26 import javax.xml.xpath.XPathFactory; | |
27 | |
28 import org.apache.log4j.Logger; | |
29 import org.apache.lucene.analysis.Analyzer; | |
30 import org.apache.lucene.index.Term; | |
31 import org.apache.lucene.queryParser.ParseException; | |
32 import org.apache.lucene.queryParser.QueryParser; | |
33 import org.apache.lucene.search.Query; | |
34 import org.apache.lucene.search.TermQuery; | |
35 import org.apache.lucene.util.Version; | |
36 import org.restlet.data.Form; | |
37 import org.restlet.data.MediaType; | |
38 import org.restlet.data.Parameter; | |
39 import org.restlet.data.Status; | |
40 import org.restlet.representation.Representation; | |
41 import org.restlet.representation.StringRepresentation; | |
42 import org.restlet.resource.Get; | |
43 import org.restlet.resource.Options; | |
44 import org.restlet.resource.ServerResource; | |
45 import org.w3c.dom.Document; | |
46 import org.w3c.dom.Node; | |
47 import org.w3c.dom.NodeList; | |
48 import org.xml.sax.SAXException; | |
49 | |
50 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines; | |
51 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; | |
52 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; | |
53 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher; | |
54 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; | |
55 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager; | |
56 import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools; | |
57 import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools; | |
58 | |
59 public class SearchLines extends ServerResource { | |
60 | |
61 /** | |
62 * Erlaubt cross scripting bei Aufruf aus Javascript | |
63 * | |
64 * @param entity | |
65 */ | |
66 | |
67 private Logger logger = Logger.getRootLogger(); | |
68 private String cleanedPath; | |
69 | |
70 @Options | |
71 public void doOptions(Representation entity) { | |
72 Form responseHeaders = (Form) getResponse().getAttributes().get( | |
73 "org.restlet.http.headers"); | |
74 if (responseHeaders == null) { | |
75 responseHeaders = new Form(); | |
76 getResponse().getAttributes().put("org.restlet.http.headers", | |
77 responseHeaders); | |
78 } | |
79 responseHeaders.add("Access-Control-Allow-Origin", "*"); | |
80 responseHeaders.add("Access-Control-Allow-Methods", "POST,OPTIONS,GET"); | |
81 responseHeaders.add("Access-Control-Allow-Headers", "Content-Type"); | |
82 responseHeaders.add("Access-Control-Allow-Credentials", "false"); | |
83 responseHeaders.add("Access-Control-Max-Age", "60"); | |
84 } | |
85 | |
86 // @Get("xml") | |
87 public Representation getXML() throws IOException, ParseException { | |
88 | |
89 return new StringRepresentation(getHits(), MediaType.TEXT_XML); | |
90 | |
91 } | |
92 | |
93 @Get("html") | |
94 public Representation getHTML() | |
95 throws TransformerFactoryConfigurationError, IOException, | |
96 ParseException, XPathExpressionException { | |
97 // response header fuer cross-site.scripting | |
98 Form responseHeaders = (Form) getResponse().getAttributes().get( | |
99 "org.restlet.http.headers"); | |
100 if (responseHeaders == null) { | |
101 responseHeaders = new Form(); | |
102 getResponse().getAttributes().put("org.restlet.http.headers", | |
103 responseHeaders); | |
104 } | |
105 responseHeaders.add("Access-Control-Allow-Origin", "*"); | |
106 | |
107 // String txt = | |
108 // XMLTools.transformToHTML(getHits(),"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl"); | |
109 String txt = getHits(); | |
110 if (getStatus().isError()) | |
111 return new StringRepresentation(txt, MediaType.TEXT_HTML); | |
112 String ret = ""; | |
113 // ret+="<pageFileName>"+pageFileName+"</pageFileName>"; | |
114 //ArrayList<Double[]> points = new ArrayList<Double[]>(); | |
115 | |
116 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); | |
117 dbf.setNamespaceAware(true); | |
118 dbf.setValidating(false); | |
119 DocumentBuilder db; | |
120 try { | |
121 db = dbf.newDocumentBuilder(); | |
122 } catch (ParserConfigurationException e) { | |
123 // TODO Auto-generated catch block | |
124 e.printStackTrace(); | |
125 return null; | |
126 } | |
127 // db.setEntityResolver(new MyResolver()); | |
128 | |
129 Document doc; | |
130 try { | |
131 // stream = new StringInputStream(xml,"utf-8"); | |
132 | |
133 ByteArrayInputStream stream = new ByteArrayInputStream( | |
134 txt.getBytes("utf-8")); | |
135 doc = db.parse(stream); | |
136 | |
137 } catch (SAXException e) { | |
138 // TODO Auto-generated catch block | |
139 e.printStackTrace(); | |
140 return null; | |
141 } catch (IOException e) { | |
142 // TODO Auto-generated catch block | |
143 e.printStackTrace(); | |
144 return null; | |
145 } | |
146 | |
147 XPath xpath = XPathFactory.newInstance().newXPath(); | |
148 // XPath Query for showing all nodes value | |
149 XPathExpression expr = xpath.compile("//page"); | |
150 XPathExpression line = xpath.compile("line"); | |
151 XPathExpression name = xpath.compile("name"); | |
152 | |
153 Object result = expr.evaluate(doc, XPathConstants.NODESET); | |
154 NodeList pages = (NodeList) result; | |
155 for (int i = 0; i < pages.getLength(); i++) { | |
156 NodeList names = (NodeList) name.evaluate(pages.item(i), | |
157 XPathConstants.NODESET); | |
158 String pathName = ""; | |
159 | |
160 if (names.getLength() == 1) { | |
161 Node obj = names.item(0); | |
162 pathName = obj.getTextContent(); | |
163 } | |
164 NodeList lines = (NodeList) line.evaluate(pages.item(i), | |
165 XPathConstants.NODESET); | |
166 ArrayList<Double[]> points = new ArrayList<Double[]>(); | |
167 for (int l = 0; l < lines.getLength(); l++) { | |
168 Double[] point = DigilibTools.calculatePoint(lines.item(l) | |
169 .getTextContent()); | |
170 points.add(point); | |
171 } | |
172 | |
173 // Pattern linePattern = | |
174 // Pattern.compile("<line>(.*?)</line>",Pattern.MULTILINE); | |
175 // Matcher m = linePattern.matcher(txt); | |
176 | |
177 // while(m.find()){ | |
178 // Double[] point = DigilibTools.calculatePoint(m.group(1)); | |
179 // points.add(point); | |
180 // } | |
181 | |
182 String textId = (String) getRequest().getAttributes().get("textId"); | |
183 | |
184 String url = String.format(DigilibTools.DIGIVIEWBASICSTRING, | |
185 DigilibTools.generateImagePath(textId, pathName), | |
186 DigilibTools.generateMarksFromPoints(points)); | |
187 | |
188 ret += String.format( | |
189 "<div class=\"hitsOnPage\"><a href=\"%s\">%s</a></div>", | |
190 url, pathName); | |
191 } | |
192 ret += ""; | |
193 | |
194 // return ret; | |
195 | |
196 return new StringRepresentation(ret, MediaType.TEXT_HTML); | |
197 } | |
198 | |
199 protected String getHits() throws IOException, ParseException { | |
200 String textId = (String) getRequest().getAttributes().get("textId"); | |
201 String queryString = (String) getRequest().getAttributes().get( | |
202 "queryString"); | |
203 String lang = (String) getRequest().getAttributes().get("lang"); | |
204 | |
205 Properties defaultProperties = ConfigurationManager.getConfig(); | |
206 | |
207 File lineDir = new File(defaultProperties.getProperty("lineIndex")); | |
208 File docIndex = new File(defaultProperties.getProperty("docIndex")); | |
209 | |
210 | |
211 Boolean parse=true; // im regelfall spll der Querystring noch geparsed werden | |
212 | |
213 Form form = getRequest().getResourceRef().getQueryAsForm(); // moeglicher parameter "parse" if "false" dann kein parsing des query strings | |
214 for (Parameter parameter : form) { | |
215 String name =parameter.getName(); | |
216 if (name.equals("parse")){ | |
217 String parserQuestion = parameter.getValue(); | |
218 if (parserQuestion.equals("false")) | |
219 parse=false; | |
220 } | |
221 } | |
222 FulltextSearchDocsLines searcher = new FulltextSearchDocsLines( | |
223 docIndex, lineDir); | |
224 | |
225 LanguageSearcher ls = searcher.languageSearchers | |
226 .getSearcherByLanguage(lang); | |
227 if (ls == null) { | |
228 setStatus(Status.CLIENT_ERROR_NOT_FOUND); | |
229 return "<error>Language Not Found</error>"; | |
230 } | |
231 Analyzer analyzer = searcher.languageSearchers | |
232 .getSearcherByLanguage(lang).analyzer; | |
233 | |
234 QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", | |
235 analyzer); | |
236 queryString = URLDecoder.decode(queryString, "utf-8"); | |
237 logger.debug(queryString); | |
238 Query query; | |
239 if (parse){ | |
240 query = parser.parse(queryString); | |
241 } else { | |
242 String[] splitted = queryString.split(":"); | |
243 String qs; | |
244 if (splitted.length>1) | |
245 qs = splitted[1]; | |
246 else | |
247 qs = splitted[0]; | |
248 | |
249 Term term = new Term("contents",qs); | |
250 query = new TermQuery(term); | |
251 } | |
252 | |
253 textId = textId.replace(":", "/"); // esetze pfad trenner TODO statt | |
254 // pfadtrenner ersetzen besser | |
255 // urlencode auch in den anderen | |
256 // klassen | |
257 | |
258 OCRDoc result = searcher.searchInLinesDoc(textId, query, lang); | |
259 | |
260 cleanedPath = result.document.get("cleanedPath") + "</cleanedPath>"; | |
261 | |
262 String ret = "<xml>"; | |
263 ret += "<docId>" + textId + "</docId>"; | |
264 ret += "<cleanedPath>" + result.document.get("cleanedPath") | |
265 + "</cleanedPath>"; | |
266 | |
267 if (result.linesInPage != null) { | |
268 | |
269 for (String page : result.linesInPage.keySet()) { | |
270 ret += "<page><name>" + page + "</name>"; | |
271 for (OCRLine line : result.linesInPage.get(page)) { | |
272 ret += "<line>" + line.toString() + "</line>"; | |
273 } | |
274 ret += "</page>"; | |
275 } | |
276 } | |
277 ret += "</xml>"; | |
278 return ret; | |
279 } | |
280 } |