Mercurial > hg > fulltextSearch
comparison src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip
Version mit integrierter Suche über XML-Volltexte
author | dwinter |
---|---|
date | Wed, 26 Jan 2011 14:41:09 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:5c9c31510f0c | 2:2b29b0b6db16 |
---|---|
1 /** | |
2 * | |
3 */ | |
4 package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter; | |
5 | |
6 import java.io.File; | |
7 import java.io.FileNotFoundException; | |
8 import java.io.FileOutputStream; | |
9 import java.io.FileWriter; | |
10 import java.io.IOException; | |
11 import java.io.OutputStreamWriter; | |
12 import java.io.UnsupportedEncodingException; | |
13 import java.util.ArrayList; | |
14 import java.util.Arrays; | |
15 import java.util.HashSet; | |
16 import java.util.List; | |
17 import java.util.Set; | |
18 | |
19 import org.apache.log4j.Logger; | |
20 import org.apache.lucene.index.Term; | |
21 import org.apache.lucene.search.Query; | |
22 import org.apache.lucene.search.ScoreDoc; | |
23 import org.apache.lucene.search.TermQuery; | |
24 import org.jdom.Document; | |
25 import org.jdom.Element; | |
26 import org.jdom.JDOMException; | |
27 import org.jdom.input.SAXBuilder; | |
28 import org.jdom.output.Format; | |
29 import org.jdom.output.XMLOutputter; | |
30 import org.jdom.xpath.XPath; | |
31 | |
32 import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; | |
33 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; | |
34 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; | |
35 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; | |
36 import de.mpiwg.dwinter.lucencetools.documents.FileDocument; | |
37 | |
38 /** | |
39 * @author dwinter | |
40 * | |
41 */ | |
42 public class XMLSearchServerAdapter implements IFulltextSearchDocsLines { | |
43 | |
44 protected static Logger logger = Logger.getRootLogger(); | |
45 | |
46 public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?"; | |
47 //public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?"; | |
48 public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?"; | |
49 public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/"; | |
50 | |
51 | |
52 //http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas | |
53 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas | |
54 /* | |
55 * (non-Javadoc) | |
56 * | |
57 * @see | |
58 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir | |
59 * (org.apache.lucene.search.Query, java.lang.String, java.lang.String) | |
60 */ | |
61 | |
62 @Override | |
63 public void searchInLinesToDir(Query query, String calledLanguage, String ticket) | |
64 throws IOException { | |
65 String languageFolderName; | |
66 String language; | |
67 //check format of the language string could be lang:xml or just lang | |
68 String[] langsplitted = calledLanguage.split(":"); | |
69 if(langsplitted.length>1){ | |
70 if(langsplitted[1].equals("XML")){ | |
71 language=langsplitted[0]; | |
72 languageFolderName=calledLanguage; | |
73 } else { | |
74 language=calledLanguage; | |
75 languageFolderName=calledLanguage+":XML"; | |
76 } | |
77 | |
78 } else { | |
79 language=calledLanguage; | |
80 languageFolderName=calledLanguage+":XML"; | |
81 } | |
82 | |
83 | |
84 | |
85 TicketWriter tw = new TicketWriter(ticket, query, languageFolderName); | |
86 | |
87 File languageFile = new File(tw.ticketFile.getAbsolutePath() | |
88 + tw.PATHSEPARATOR + languageFolderName); | |
89 if (!languageFile.exists()) { | |
90 logger.debug("Create Languagefolder:" | |
91 + languageFile.getCanonicalPath()); | |
92 if (!languageFile.mkdirs()) | |
93 throw new IOException(); | |
94 } | |
95 | |
96 // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50 | |
97 SAXBuilder parser = new SAXBuilder(); | |
98 | |
99 String queryString = XMLServerSearchBase | |
100 + "docbase=archimedes&docbase=echo&queryType=fulltextMorph"; | |
101 queryString += "&language=" + language; | |
102 | |
103 Set<Term> terms = new HashSet<Term>(); | |
104 query.extractTerms(terms); | |
105 String morphQuery = ""; | |
106 for (Term t : terms) { | |
107 if (t.field().equals("contents")) | |
108 morphQuery = t.text(); | |
109 } | |
110 queryString += "&ftMorphQuery=" + morphQuery; | |
111 queryString += "&pn=1&output=xml&pageSize=500"; | |
112 Document doc; | |
113 try { | |
114 doc = parser.build(queryString); | |
115 } catch (JDOMException e) { | |
116 // TODO Auto-generated catch block | |
117 e.printStackTrace(); | |
118 return; | |
119 } | |
120 | |
121 int counter = writeResults(tw, languageFolderName, ticket, doc); | |
122 writeResultInfo(tw, doc, counter, languageFolderName); | |
123 | |
124 tw.commitTicket(); | |
125 | |
126 List<Element> docElements; | |
127 try { | |
128 XPath docsXP = XPath.newInstance("//document"); | |
129 docElements = docsXP.selectNodes(doc); | |
130 } catch (JDOMException e1) { | |
131 // TODO Auto-generated catch block | |
132 e1.printStackTrace(); | |
133 return; | |
134 } | |
135 | |
136 for (Element e : docElements) { | |
137 Element textIdElement; | |
138 try { | |
139 textIdElement = (Element) XPath.selectSingleNode(e, "uri"); | |
140 } catch (JDOMException e1) { | |
141 // TODO Auto-generated catch block | |
142 continue; | |
143 } | |
144 String textId = textIdElement.getTextTrim(); | |
145 | |
146 File docFile = new File(languageFile.getAbsolutePath() | |
147 + TicketWriter.PATHSEPARATOR | |
148 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); | |
149 if (!docFile.exists()) { | |
150 logger.debug("Create Docfolder:" + docFile.getCanonicalPath()); | |
151 if (!docFile.mkdirs()) | |
152 throw new IOException(); | |
153 } | |
154 | |
155 // TODO: jetzt fuer jede seite ein file, zur Zeit jeweils nur ein | |
156 // File pro Document! | |
157 // for (String page:ocrDoc.linesInPage.keySet()){ | |
158 | |
159 File pageFile = new File(docFile.getAbsolutePath() | |
160 + TicketWriter.PATHSEPARATOR | |
161 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); | |
162 FileWriter pageFileWriter = new FileWriter(pageFile); | |
163 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas | |
164 http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas | |
165 // for (OCRLine line: ocrDoc.linesInPage.get(page)){ | |
166 // pageFileWriter.write("allLines"+"\n"); | |
167 // } | |
168 queryString = XMLDocSearchBase + "document=" + textId; | |
169 //queryString += "&queryType=fulltext&query=" + morphQuery; | |
170 queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; | |
171 | |
172 try { | |
173 doc = parser.build(queryString); | |
174 } catch (JDOMException e2) { | |
175 // TODO Auto-generated catch block | |
176 e2.printStackTrace(); | |
177 return; | |
178 } | |
179 | |
180 XMLOutputter op = new XMLOutputter(Format.getCompactFormat()); | |
181 op.output(doc, pageFileWriter); | |
182 pageFileWriter.close(); | |
183 } | |
184 tw.closeTicket(languageFolderName); | |
185 } | |
186 | |
187 private void writeResultInfo(TicketWriter tw, Document doc, int counter, | |
188 String languageFolderName) throws IOException { | |
189 //String languageFolderName = language + "_XML"; | |
190 | |
191 File languageFile = new File(tw.ticketFile.getAbsolutePath() | |
192 + TicketWriter.PATHSEPARATOR + languageFolderName); | |
193 File resultFile = new File(languageFile.getAbsolutePath() | |
194 + TicketWriter.PATHSEPARATOR + "resultInfo"); | |
195 FileOutputStream fs = new FileOutputStream(resultFile); | |
196 OutputStreamWriter rw = new OutputStreamWriter(fs, "utf-8"); | |
197 String ret = "<resultInfo>"; | |
198 // int hits = docs.totalHits; | |
199 ret += "<lang>" + languageFolderName+"</lang>"; | |
200 ret += "<hits>" + counter + "</hits>"; | |
201 ret += "<totalHits>" + counter + "</totalHits>";// TODO: gibt es in | |
202 // diesem fall einen | |
203 // unterschied zwischen | |
204 // hits und totalhits? | |
205 | |
206 ret += "</resultInfo>"; | |
207 rw.write(ret); | |
208 rw.close(); | |
209 } | |
210 | |
211 private int writeResults(TicketWriter tw, String languageFolderName, String ticket, | |
212 Document doc) throws IOException { | |
213 OutputStreamWriter rw = null; | |
214 File languageFile; | |
215 //String languageFolderName = language + "_XML"; | |
216 | |
217 try { | |
218 languageFile = new File(tw.ticketFile.getAbsolutePath() | |
219 + tw.PATHSEPARATOR + languageFolderName); | |
220 File resultFile = new File(languageFile.getAbsolutePath() | |
221 + tw.PATHSEPARATOR + "result"); | |
222 FileOutputStream fs = new FileOutputStream(resultFile); | |
223 rw = new OutputStreamWriter(fs, "utf-8"); | |
224 } catch (FileNotFoundException e) { | |
225 // TODO Auto-generated catch block | |
226 e.printStackTrace(); | |
227 return -1; | |
228 } catch (UnsupportedEncodingException e) { | |
229 // TODO Auto-generated catch block | |
230 e.printStackTrace(); | |
231 return -1; | |
232 } | |
233 | |
234 Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus | |
235 // irgendwelche | |
236 // gruenden gibt es | |
237 // ein Dokument | |
238 // mehrfach in den | |
239 // Fundstellen | |
240 | |
241 // FileWriter rw = new FileWriter(resultFile); | |
242 | |
243 if (!languageFile.exists()) { | |
244 logger.debug("Create Languagefolder:" | |
245 + languageFile.getCanonicalPath()); | |
246 if (!languageFile.mkdirs()) | |
247 throw new IOException(); | |
248 } | |
249 Integer counter = 0; | |
250 @SuppressWarnings("unchecked") | |
251 List<Element> elements; | |
252 try { | |
253 XPath xpathDoc = XPath.newInstance("//document"); | |
254 elements = xpathDoc.selectNodes(doc); | |
255 } catch (JDOMException e1) { | |
256 // TODO Auto-generated catch block | |
257 e1.printStackTrace(); | |
258 return -1; | |
259 } | |
260 | |
261 for (Element e : elements) { | |
262 try { | |
263 XPath xpathUri = XPath.newInstance("uri"); | |
264 Element uri = (Element) xpathUri.selectSingleNode(e); | |
265 | |
266 String id = uri.getTextTrim(); | |
267 if (!idsAlreadyDone.contains(id)) { // stelle sicher das alle | |
268 // treffer nur einmal in die | |
269 // date geschrieben werden. | |
270 idsAlreadyDone.add(id); | |
271 | |
272 String textId = id; | |
273 String md = ""; | |
274 Element mdEl = (Element) XPath | |
275 .selectSingleNode(e, "author"); | |
276 md += "<dc:creator>" + formatXML(mdEl.getTextTrim()) | |
277 + "</dc:creator>"; | |
278 | |
279 mdEl = (Element) XPath.selectSingleNode(e, "title"); | |
280 md += "<dc:title>" + formatXML(mdEl.getTextTrim()) | |
281 + "</dc:title>"; | |
282 | |
283 mdEl = (Element) XPath.selectSingleNode(e, "place"); | |
284 md += "<dc:place>" + formatXML(mdEl.getTextTrim()) | |
285 + "</dc:place>"; | |
286 | |
287 mdEl = (Element) XPath.selectSingleNode(e, "date"); | |
288 md += "<dc:date>" + formatXML(mdEl.getTextTrim()) | |
289 + "</dc:date>"; | |
290 | |
291 String ret = "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">"; | |
292 ret += "<cleanedPath>" + textId + "</cleanedPath>"; | |
293 ret += "<textId>" + textId.replace("/", ":") + "</textId>"; | |
294 ret += "<textIdCleaned>" + textId.replace("/", "_") | |
295 + "</textIdCleaned>"; | |
296 ret += "<md>" + md + "</md>"; | |
297 ret += "</result>"; | |
298 | |
299 rw.write(ret); | |
300 counter++; | |
301 } | |
302 } catch (JDOMException e1) { | |
303 // TODO Auto-generated catch block | |
304 e1.printStackTrace(); | |
305 return -1; | |
306 } | |
307 } | |
308 rw.close(); | |
309 return counter; | |
310 } | |
311 | |
312 private String formatXML(String string) { | |
313 String retStr = string.replace("&", "&"); | |
314 retStr = retStr.replace("<", "<"); | |
315 retStr = retStr.replace(">", ">"); | |
316 return retStr; | |
317 } | |
318 | |
319 /* | |
320 * (non-Javadoc) | |
321 * | |
322 * @see | |
323 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc | |
324 * (java.lang.String, org.apache.lucene.search.Query, java.lang.String) | |
325 */ | |
326 @Override | |
327 public OCRDoc searchInLinesDoc(String textId, Query query, String language) | |
328 throws IOException { | |
329 // TODO Auto-generated method stub | |
330 return null; | |
331 } | |
332 | |
333 /* | |
334 * (non-Javadoc) | |
335 * | |
336 * @see | |
337 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines | |
338 * (org.apache.lucene.search.Query, java.lang.String) | |
339 */ | |
340 @Override | |
341 public List<OCRDoc> searchInLines(Query query, String language) | |
342 throws IOException { | |
343 // TODO Auto-generated method stub | |
344 return null; | |
345 } | |
346 | |
347 public static void main(String[] args) { | |
348 Term t = new Term("contents", "quantitas"); | |
349 Query q = new TermQuery(t); | |
350 XMLSearchServerAdapter sa = new XMLSearchServerAdapter(); | |
351 try { | |
352 sa.searchInLinesToDir(q, "la", "121"); | |
353 } catch (IOException e) { | |
354 // TODO Auto-generated catch block | |
355 e.printStackTrace(); | |
356 } | |
357 } | |
358 | |
359 public static List<String> getSupportedLanguages() { | |
360 String langs[] = new String[] { "la:XML", "it:XML" }; | |
361 return Arrays.asList(langs); | |
362 | |
363 } | |
364 } |