comparison src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip

Version mit integrierter Suche über XML-Volltexte
author dwinter
date Wed, 26 Jan 2011 14:41:09 +0100
parents
children
comparison
equal deleted inserted replaced
1:5c9c31510f0c 2:2b29b0b6db16
1 /**
2 *
3 */
4 package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter;
5
6 import java.io.File;
7 import java.io.FileNotFoundException;
8 import java.io.FileOutputStream;
9 import java.io.FileWriter;
10 import java.io.IOException;
11 import java.io.OutputStreamWriter;
12 import java.io.UnsupportedEncodingException;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.HashSet;
16 import java.util.List;
17 import java.util.Set;
18
19 import org.apache.log4j.Logger;
20 import org.apache.lucene.index.Term;
21 import org.apache.lucene.search.Query;
22 import org.apache.lucene.search.ScoreDoc;
23 import org.apache.lucene.search.TermQuery;
24 import org.jdom.Document;
25 import org.jdom.Element;
26 import org.jdom.JDOMException;
27 import org.jdom.input.SAXBuilder;
28 import org.jdom.output.Format;
29 import org.jdom.output.XMLOutputter;
30 import org.jdom.xpath.XPath;
31
32 import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines;
33 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc;
34 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine;
35 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter;
36 import de.mpiwg.dwinter.lucencetools.documents.FileDocument;
37
38 /**
39 * @author dwinter
40 *
41 */
42 public class XMLSearchServerAdapter implements IFulltextSearchDocsLines {
43
44 protected static Logger logger = Logger.getRootLogger();
45
46 public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?";
47 //public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?";
48 public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?";
49 public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/";
50
51
52 //http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas
53 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas
54 /*
55 * (non-Javadoc)
56 *
57 * @see
58 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir
59 * (org.apache.lucene.search.Query, java.lang.String, java.lang.String)
60 */
61
62 @Override
63 public void searchInLinesToDir(Query query, String calledLanguage, String ticket)
64 throws IOException {
65 String languageFolderName;
66 String language;
67 //check format of the language string could be lang:xml or just lang
68 String[] langsplitted = calledLanguage.split(":");
69 if(langsplitted.length>1){
70 if(langsplitted[1].equals("XML")){
71 language=langsplitted[0];
72 languageFolderName=calledLanguage;
73 } else {
74 language=calledLanguage;
75 languageFolderName=calledLanguage+":XML";
76 }
77
78 } else {
79 language=calledLanguage;
80 languageFolderName=calledLanguage+":XML";
81 }
82
83
84
85 TicketWriter tw = new TicketWriter(ticket, query, languageFolderName);
86
87 File languageFile = new File(tw.ticketFile.getAbsolutePath()
88 + tw.PATHSEPARATOR + languageFolderName);
89 if (!languageFile.exists()) {
90 logger.debug("Create Languagefolder:"
91 + languageFile.getCanonicalPath());
92 if (!languageFile.mkdirs())
93 throw new IOException();
94 }
95
96 // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50
97 SAXBuilder parser = new SAXBuilder();
98
99 String queryString = XMLServerSearchBase
100 + "docbase=archimedes&docbase=echo&queryType=fulltextMorph";
101 queryString += "&language=" + language;
102
103 Set<Term> terms = new HashSet<Term>();
104 query.extractTerms(terms);
105 String morphQuery = "";
106 for (Term t : terms) {
107 if (t.field().equals("contents"))
108 morphQuery = t.text();
109 }
110 queryString += "&ftMorphQuery=" + morphQuery;
111 queryString += "&pn=1&output=xml&pageSize=500";
112 Document doc;
113 try {
114 doc = parser.build(queryString);
115 } catch (JDOMException e) {
116 // TODO Auto-generated catch block
117 e.printStackTrace();
118 return;
119 }
120
121 int counter = writeResults(tw, languageFolderName, ticket, doc);
122 writeResultInfo(tw, doc, counter, languageFolderName);
123
124 tw.commitTicket();
125
126 List<Element> docElements;
127 try {
128 XPath docsXP = XPath.newInstance("//document");
129 docElements = docsXP.selectNodes(doc);
130 } catch (JDOMException e1) {
131 // TODO Auto-generated catch block
132 e1.printStackTrace();
133 return;
134 }
135
136 for (Element e : docElements) {
137 Element textIdElement;
138 try {
139 textIdElement = (Element) XPath.selectSingleNode(e, "uri");
140 } catch (JDOMException e1) {
141 // TODO Auto-generated catch block
142 continue;
143 }
144 String textId = textIdElement.getTextTrim();
145
146 File docFile = new File(languageFile.getAbsolutePath()
147 + TicketWriter.PATHSEPARATOR
148 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':'));
149 if (!docFile.exists()) {
150 logger.debug("Create Docfolder:" + docFile.getCanonicalPath());
151 if (!docFile.mkdirs())
152 throw new IOException();
153 }
154
155 // TODO: jetzt fuer jede seite ein file, zur Zeit jeweils nur ein
156 // File pro Document!
157 // for (String page:ocrDoc.linesInPage.keySet()){
158
159 File pageFile = new File(docFile.getAbsolutePath()
160 + TicketWriter.PATHSEPARATOR
161 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':'));
162 FileWriter pageFileWriter = new FileWriter(pageFile);
163 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas
164 http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas
165 // for (OCRLine line: ocrDoc.linesInPage.get(page)){
166 // pageFileWriter.write("allLines"+"\n");
167 // }
168 queryString = XMLDocSearchBase + "document=" + textId;
169 //queryString += "&queryType=fulltext&query=" + morphQuery;
170 queryString += "&mode=text&query-type=fulltext&query=" + morphQuery;
171
172 try {
173 doc = parser.build(queryString);
174 } catch (JDOMException e2) {
175 // TODO Auto-generated catch block
176 e2.printStackTrace();
177 return;
178 }
179
180 XMLOutputter op = new XMLOutputter(Format.getCompactFormat());
181 op.output(doc, pageFileWriter);
182 pageFileWriter.close();
183 }
184 tw.closeTicket(languageFolderName);
185 }
186
187 private void writeResultInfo(TicketWriter tw, Document doc, int counter,
188 String languageFolderName) throws IOException {
189 //String languageFolderName = language + "_XML";
190
191 File languageFile = new File(tw.ticketFile.getAbsolutePath()
192 + TicketWriter.PATHSEPARATOR + languageFolderName);
193 File resultFile = new File(languageFile.getAbsolutePath()
194 + TicketWriter.PATHSEPARATOR + "resultInfo");
195 FileOutputStream fs = new FileOutputStream(resultFile);
196 OutputStreamWriter rw = new OutputStreamWriter(fs, "utf-8");
197 String ret = "<resultInfo>";
198 // int hits = docs.totalHits;
199 ret += "<lang>" + languageFolderName+"</lang>";
200 ret += "<hits>" + counter + "</hits>";
201 ret += "<totalHits>" + counter + "</totalHits>";// TODO: gibt es in
202 // diesem fall einen
203 // unterschied zwischen
204 // hits und totalhits?
205
206 ret += "</resultInfo>";
207 rw.write(ret);
208 rw.close();
209 }
210
211 private int writeResults(TicketWriter tw, String languageFolderName, String ticket,
212 Document doc) throws IOException {
213 OutputStreamWriter rw = null;
214 File languageFile;
215 //String languageFolderName = language + "_XML";
216
217 try {
218 languageFile = new File(tw.ticketFile.getAbsolutePath()
219 + tw.PATHSEPARATOR + languageFolderName);
220 File resultFile = new File(languageFile.getAbsolutePath()
221 + tw.PATHSEPARATOR + "result");
222 FileOutputStream fs = new FileOutputStream(resultFile);
223 rw = new OutputStreamWriter(fs, "utf-8");
224 } catch (FileNotFoundException e) {
225 // TODO Auto-generated catch block
226 e.printStackTrace();
227 return -1;
228 } catch (UnsupportedEncodingException e) {
229 // TODO Auto-generated catch block
230 e.printStackTrace();
231 return -1;
232 }
233
234 Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus
235 // irgendwelche
236 // gruenden gibt es
237 // ein Dokument
238 // mehrfach in den
239 // Fundstellen
240
241 // FileWriter rw = new FileWriter(resultFile);
242
243 if (!languageFile.exists()) {
244 logger.debug("Create Languagefolder:"
245 + languageFile.getCanonicalPath());
246 if (!languageFile.mkdirs())
247 throw new IOException();
248 }
249 Integer counter = 0;
250 @SuppressWarnings("unchecked")
251 List<Element> elements;
252 try {
253 XPath xpathDoc = XPath.newInstance("//document");
254 elements = xpathDoc.selectNodes(doc);
255 } catch (JDOMException e1) {
256 // TODO Auto-generated catch block
257 e1.printStackTrace();
258 return -1;
259 }
260
261 for (Element e : elements) {
262 try {
263 XPath xpathUri = XPath.newInstance("uri");
264 Element uri = (Element) xpathUri.selectSingleNode(e);
265
266 String id = uri.getTextTrim();
267 if (!idsAlreadyDone.contains(id)) { // stelle sicher das alle
268 // treffer nur einmal in die
269 // date geschrieben werden.
270 idsAlreadyDone.add(id);
271
272 String textId = id;
273 String md = "";
274 Element mdEl = (Element) XPath
275 .selectSingleNode(e, "author");
276 md += "<dc:creator>" + formatXML(mdEl.getTextTrim())
277 + "</dc:creator>";
278
279 mdEl = (Element) XPath.selectSingleNode(e, "title");
280 md += "<dc:title>" + formatXML(mdEl.getTextTrim())
281 + "</dc:title>";
282
283 mdEl = (Element) XPath.selectSingleNode(e, "place");
284 md += "<dc:place>" + formatXML(mdEl.getTextTrim())
285 + "</dc:place>";
286
287 mdEl = (Element) XPath.selectSingleNode(e, "date");
288 md += "<dc:date>" + formatXML(mdEl.getTextTrim())
289 + "</dc:date>";
290
291 String ret = "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">";
292 ret += "<cleanedPath>" + textId + "</cleanedPath>";
293 ret += "<textId>" + textId.replace("/", ":") + "</textId>";
294 ret += "<textIdCleaned>" + textId.replace("/", "_")
295 + "</textIdCleaned>";
296 ret += "<md>" + md + "</md>";
297 ret += "</result>";
298
299 rw.write(ret);
300 counter++;
301 }
302 } catch (JDOMException e1) {
303 // TODO Auto-generated catch block
304 e1.printStackTrace();
305 return -1;
306 }
307 }
308 rw.close();
309 return counter;
310 }
311
312 private String formatXML(String string) {
313 String retStr = string.replace("&", "&amp;");
314 retStr = retStr.replace("<", "&lt;");
315 retStr = retStr.replace(">", "&gt;");
316 return retStr;
317 }
318
319 /*
320 * (non-Javadoc)
321 *
322 * @see
323 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc
324 * (java.lang.String, org.apache.lucene.search.Query, java.lang.String)
325 */
326 @Override
327 public OCRDoc searchInLinesDoc(String textId, Query query, String language)
328 throws IOException {
329 // TODO Auto-generated method stub
330 return null;
331 }
332
333 /*
334 * (non-Javadoc)
335 *
336 * @see
337 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines
338 * (org.apache.lucene.search.Query, java.lang.String)
339 */
340 @Override
341 public List<OCRDoc> searchInLines(Query query, String language)
342 throws IOException {
343 // TODO Auto-generated method stub
344 return null;
345 }
346
347 public static void main(String[] args) {
348 Term t = new Term("contents", "quantitas");
349 Query q = new TermQuery(t);
350 XMLSearchServerAdapter sa = new XMLSearchServerAdapter();
351 try {
352 sa.searchInLinesToDir(q, "la", "121");
353 } catch (IOException e) {
354 // TODO Auto-generated catch block
355 e.printStackTrace();
356 }
357 }
358
359 public static List<String> getSupportedLanguages() {
360 String langs[] = new String[] { "la:XML", "it:XML" };
361 return Arrays.asList(langs);
362
363 }
364 }