0
|
1 package de.mpiwg.dwinter.fulltextSearchServer;
|
|
2
|
|
3 import java.io.ByteArrayInputStream;
|
|
4 import java.io.File;
|
|
5 import java.io.IOException;
|
|
6 import java.io.InputStream;
|
|
7 import java.io.OutputStream;
|
|
8 import java.io.Reader;
|
|
9 import java.io.Writer;
|
|
10 import java.net.URLDecoder;
|
|
11 import java.nio.channels.ReadableByteChannel;
|
|
12 import java.nio.channels.WritableByteChannel;
|
|
13 import java.util.ArrayList;
|
|
14 import java.util.Properties;
|
|
15 import java.util.regex.Matcher;
|
|
16 import java.util.regex.Pattern;
|
|
17
|
|
18 import javax.xml.parsers.DocumentBuilder;
|
|
19 import javax.xml.parsers.DocumentBuilderFactory;
|
|
20 import javax.xml.parsers.ParserConfigurationException;
|
|
21 import javax.xml.transform.TransformerFactoryConfigurationError;
|
|
22 import javax.xml.xpath.XPath;
|
|
23 import javax.xml.xpath.XPathConstants;
|
|
24 import javax.xml.xpath.XPathExpression;
|
|
25 import javax.xml.xpath.XPathExpressionException;
|
|
26 import javax.xml.xpath.XPathFactory;
|
|
27
|
|
28 import org.apache.log4j.Logger;
|
|
29 import org.apache.lucene.analysis.Analyzer;
|
|
30 import org.apache.lucene.index.Term;
|
|
31 import org.apache.lucene.queryParser.ParseException;
|
|
32 import org.apache.lucene.queryParser.QueryParser;
|
|
33 import org.apache.lucene.search.Query;
|
|
34 import org.apache.lucene.search.TermQuery;
|
|
35 import org.apache.lucene.util.Version;
|
|
36 import org.restlet.data.Form;
|
|
37 import org.restlet.data.MediaType;
|
|
38 import org.restlet.data.Parameter;
|
|
39 import org.restlet.data.Status;
|
|
40 import org.restlet.representation.Representation;
|
|
41 import org.restlet.representation.StringRepresentation;
|
|
42 import org.restlet.resource.Get;
|
|
43 import org.restlet.resource.Options;
|
|
44 import org.restlet.resource.ServerResource;
|
|
45 import org.w3c.dom.Document;
|
|
46 import org.w3c.dom.Node;
|
|
47 import org.w3c.dom.NodeList;
|
|
48 import org.xml.sax.SAXException;
|
|
49
|
|
50 import de.mpiwg.dwinter.fulltext.search.FulltextSearchDocsLines;
|
|
51 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
|
|
52 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
|
|
53 import de.mpiwg.dwinter.fulltext.searcher.LanguageSearcher;
|
|
54 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
|
|
55 import de.mpiwg.dwinter.fulltextSearchServer.Utils.ConfigurationManager;
|
|
56 import de.mpiwg.dwinter.fulltextSearchServer.Utils.DigilibTools;
|
|
57 import de.mpiwg.dwinter.fulltextSearchServer.Utils.XMLTools;
|
|
58
|
|
59 public class SearchLines extends ServerResource {
|
|
60
|
|
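/*
 * Allows cross-site (cross-origin) scripting when this resource is called
 * from JavaScript. This comment describes doOptions(Representation) below,
 * not the fields that directly follow it.
 *
 * entity: the request entity handed to doOptions; that method does not read it.
 */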
|
|
66
|
|
67 private Logger logger = Logger.getRootLogger();
|
|
68 private String cleanedPath;
|
|
69
|
|
70 @Options
|
|
71 public void doOptions(Representation entity) {
|
|
72 Form responseHeaders = (Form) getResponse().getAttributes().get(
|
|
73 "org.restlet.http.headers");
|
|
74 if (responseHeaders == null) {
|
|
75 responseHeaders = new Form();
|
|
76 getResponse().getAttributes().put("org.restlet.http.headers",
|
|
77 responseHeaders);
|
|
78 }
|
|
79 responseHeaders.add("Access-Control-Allow-Origin", "*");
|
|
80 responseHeaders.add("Access-Control-Allow-Methods", "POST,OPTIONS,GET");
|
|
81 responseHeaders.add("Access-Control-Allow-Headers", "Content-Type");
|
|
82 responseHeaders.add("Access-Control-Allow-Credentials", "false");
|
|
83 responseHeaders.add("Access-Control-Max-Age", "60");
|
|
84 }
|
|
85
|
|
86 // @Get("xml")
|
|
87 public Representation getXML() throws IOException, ParseException {
|
|
88
|
|
89 return new StringRepresentation(getHits(), MediaType.TEXT_XML);
|
|
90
|
|
91 }
|
|
92
|
|
93 @Get("html")
|
|
94 public Representation getHTML()
|
|
95 throws TransformerFactoryConfigurationError, IOException,
|
|
96 ParseException, XPathExpressionException {
|
|
97 // response header fuer cross-site.scripting
|
|
98 Form responseHeaders = (Form) getResponse().getAttributes().get(
|
|
99 "org.restlet.http.headers");
|
|
100 if (responseHeaders == null) {
|
|
101 responseHeaders = new Form();
|
|
102 getResponse().getAttributes().put("org.restlet.http.headers",
|
|
103 responseHeaders);
|
|
104 }
|
|
105 responseHeaders.add("Access-Control-Allow-Origin", "*");
|
|
106
|
|
107 // String txt =
|
|
108 // XMLTools.transformToHTML(getHits(),"/de/mpiwg/dwinter/fulltextSearchServer/xsl/processRestPathOfDocumentToHTML.xsl");
|
|
109 String txt = getHits();
|
|
110 if (getStatus().isError())
|
|
111 return new StringRepresentation(txt, MediaType.TEXT_HTML);
|
|
112 String ret = "";
|
|
113 // ret+="<pageFileName>"+pageFileName+"</pageFileName>";
|
|
114 //ArrayList<Double[]> points = new ArrayList<Double[]>();
|
|
115
|
|
116 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
|
117 dbf.setNamespaceAware(true);
|
|
118 dbf.setValidating(false);
|
|
119 DocumentBuilder db;
|
|
120 try {
|
|
121 db = dbf.newDocumentBuilder();
|
|
122 } catch (ParserConfigurationException e) {
|
|
123 // TODO Auto-generated catch block
|
|
124 e.printStackTrace();
|
|
125 return null;
|
|
126 }
|
|
127 // db.setEntityResolver(new MyResolver());
|
|
128
|
|
129 Document doc;
|
|
130 try {
|
|
131 // stream = new StringInputStream(xml,"utf-8");
|
|
132
|
|
133 ByteArrayInputStream stream = new ByteArrayInputStream(
|
|
134 txt.getBytes("utf-8"));
|
|
135 doc = db.parse(stream);
|
|
136
|
|
137 } catch (SAXException e) {
|
|
138 // TODO Auto-generated catch block
|
|
139 e.printStackTrace();
|
|
140 return null;
|
|
141 } catch (IOException e) {
|
|
142 // TODO Auto-generated catch block
|
|
143 e.printStackTrace();
|
|
144 return null;
|
|
145 }
|
|
146
|
|
147 XPath xpath = XPathFactory.newInstance().newXPath();
|
|
148 // XPath Query for showing all nodes value
|
|
149 XPathExpression expr = xpath.compile("//page");
|
|
150 XPathExpression line = xpath.compile("line");
|
|
151 XPathExpression name = xpath.compile("name");
|
|
152
|
|
153 Object result = expr.evaluate(doc, XPathConstants.NODESET);
|
|
154 NodeList pages = (NodeList) result;
|
|
155 for (int i = 0; i < pages.getLength(); i++) {
|
|
156 NodeList names = (NodeList) name.evaluate(pages.item(i),
|
|
157 XPathConstants.NODESET);
|
|
158 String pathName = "";
|
|
159
|
|
160 if (names.getLength() == 1) {
|
|
161 Node obj = names.item(0);
|
|
162 pathName = obj.getTextContent();
|
|
163 }
|
|
164 NodeList lines = (NodeList) line.evaluate(pages.item(i),
|
|
165 XPathConstants.NODESET);
|
|
166 ArrayList<Double[]> points = new ArrayList<Double[]>();
|
|
167 for (int l = 0; l < lines.getLength(); l++) {
|
|
168 Double[] point = DigilibTools.calculatePoint(lines.item(l)
|
|
169 .getTextContent());
|
|
170 points.add(point);
|
|
171 }
|
|
172
|
|
173 // Pattern linePattern =
|
|
174 // Pattern.compile("<line>(.*?)</line>",Pattern.MULTILINE);
|
|
175 // Matcher m = linePattern.matcher(txt);
|
|
176
|
|
177 // while(m.find()){
|
|
178 // Double[] point = DigilibTools.calculatePoint(m.group(1));
|
|
179 // points.add(point);
|
|
180 // }
|
|
181
|
|
182 String textId = (String) getRequest().getAttributes().get("textId");
|
|
183
|
|
184 String url = String.format(DigilibTools.DIGIVIEWBASICSTRING,
|
|
185 DigilibTools.generateImagePath(textId, pathName),
|
|
186 DigilibTools.generateMarksFromPoints(points));
|
|
187
|
|
188 ret += String.format(
|
|
189 "<div class=\"hitsOnPage\"><a href=\"%s\">%s</a></div>",
|
|
190 url, pathName);
|
|
191 }
|
|
192 ret += "";
|
|
193
|
|
194 // return ret;
|
|
195
|
|
196 return new StringRepresentation(ret, MediaType.TEXT_HTML);
|
|
197 }
|
|
198
|
|
199 protected String getHits() throws IOException, ParseException {
|
|
200 String textId = (String) getRequest().getAttributes().get("textId");
|
|
201 String queryString = (String) getRequest().getAttributes().get(
|
|
202 "queryString");
|
|
203 String lang = (String) getRequest().getAttributes().get("lang");
|
|
204
|
|
205 Properties defaultProperties = ConfigurationManager.getConfig();
|
|
206
|
|
207 File lineDir = new File(defaultProperties.getProperty("lineIndex"));
|
|
208 File docIndex = new File(defaultProperties.getProperty("docIndex"));
|
|
209
|
|
210
|
|
211 Boolean parse=true; // im regelfall spll der Querystring noch geparsed werden
|
|
212
|
|
213 Form form = getRequest().getResourceRef().getQueryAsForm(); // moeglicher parameter "parse" if "false" dann kein parsing des query strings
|
|
214 for (Parameter parameter : form) {
|
|
215 String name =parameter.getName();
|
|
216 if (name.equals("parse")){
|
|
217 String parserQuestion = parameter.getValue();
|
|
218 if (parserQuestion.equals("false"))
|
|
219 parse=false;
|
|
220 }
|
|
221 }
|
|
222 FulltextSearchDocsLines searcher = new FulltextSearchDocsLines(
|
|
223 docIndex, lineDir);
|
|
224
|
|
225 LanguageSearcher ls = searcher.languageSearchers
|
|
226 .getSearcherByLanguage(lang);
|
|
227 if (ls == null) {
|
|
228 setStatus(Status.CLIENT_ERROR_NOT_FOUND);
|
|
229 return "<error>Language Not Found</error>";
|
|
230 }
|
|
231 Analyzer analyzer = searcher.languageSearchers
|
|
232 .getSearcherByLanguage(lang).analyzer;
|
|
233
|
|
234 QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
|
|
235 analyzer);
|
|
236 queryString = URLDecoder.decode(queryString, "utf-8");
|
|
237 logger.debug(queryString);
|
|
238 Query query;
|
|
239 if (parse){
|
|
240 query = parser.parse(queryString);
|
|
241 } else {
|
|
242 String[] splitted = queryString.split(":");
|
|
243 String qs;
|
|
244 if (splitted.length>1)
|
|
245 qs = splitted[1];
|
|
246 else
|
|
247 qs = splitted[0];
|
|
248
|
|
249 Term term = new Term("contents",qs);
|
|
250 query = new TermQuery(term);
|
|
251 }
|
|
252
|
|
253 textId = textId.replace(":", "/"); // esetze pfad trenner TODO statt
|
|
254 // pfadtrenner ersetzen besser
|
|
255 // urlencode auch in den anderen
|
|
256 // klassen
|
|
257
|
|
258 OCRDoc result = searcher.searchInLinesDoc(textId, query, lang);
|
|
259
|
|
260 cleanedPath = result.document.get("cleanedPath") + "</cleanedPath>";
|
|
261
|
|
262 String ret = "<xml>";
|
|
263 ret += "<docId>" + textId + "</docId>";
|
|
264 ret += "<cleanedPath>" + result.document.get("cleanedPath")
|
|
265 + "</cleanedPath>";
|
|
266
|
|
267 if (result.linesInPage != null) {
|
|
268
|
|
269 for (String page : result.linesInPage.keySet()) {
|
|
270 ret += "<page><name>" + page + "</name>";
|
|
271 for (OCRLine line : result.linesInPage.get(page)) {
|
|
272 ret += "<line>" + line.toString() + "</line>";
|
|
273 }
|
|
274 ret += "</page>";
|
|
275 }
|
|
276 }
|
|
277 ret += "</xml>";
|
|
278 return ret;
|
|
279 }
|
|
280 }
|