Mercurial > hg > fulltextSearch
comparison src/de/mpiwg/dwinter/fulltext/search/xmlsearchadapter/XMLSearchServerAdapter.java @ 2:2b29b0b6db16 default tip
Version mit integrierter Suche über XML-Volltexte
| author | dwinter |
|---|---|
| date | Wed, 26 Jan 2011 14:41:09 +0100 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:5c9c31510f0c | 2:2b29b0b6db16 |
|---|---|
| 1 /** | |
| 2 * | |
| 3 */ | |
| 4 package de.mpiwg.dwinter.fulltext.search.xmlsearchadapter; | |
| 5 | |
| 6 import java.io.File; | |
| 7 import java.io.FileNotFoundException; | |
| 8 import java.io.FileOutputStream; | |
| 9 import java.io.FileWriter; | |
| 10 import java.io.IOException; | |
| 11 import java.io.OutputStreamWriter; | |
| 12 import java.io.UnsupportedEncodingException; | |
| 13 import java.util.ArrayList; | |
| 14 import java.util.Arrays; | |
| 15 import java.util.HashSet; | |
| 16 import java.util.List; | |
| 17 import java.util.Set; | |
| 18 | |
| 19 import org.apache.log4j.Logger; | |
| 20 import org.apache.lucene.index.Term; | |
| 21 import org.apache.lucene.search.Query; | |
| 22 import org.apache.lucene.search.ScoreDoc; | |
| 23 import org.apache.lucene.search.TermQuery; | |
| 24 import org.jdom.Document; | |
| 25 import org.jdom.Element; | |
| 26 import org.jdom.JDOMException; | |
| 27 import org.jdom.input.SAXBuilder; | |
| 28 import org.jdom.output.Format; | |
| 29 import org.jdom.output.XMLOutputter; | |
| 30 import org.jdom.xpath.XPath; | |
| 31 | |
| 32 import de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines; | |
| 33 import de.mpiwg.dwinter.fulltext.search.utils.OCRDoc; | |
| 34 import de.mpiwg.dwinter.fulltext.search.utils.OCRLine; | |
| 35 import de.mpiwg.dwinter.fulltext.ticket.TicketWriter; | |
| 36 import de.mpiwg.dwinter.lucencetools.documents.FileDocument; | |
| 37 | |
| 38 /** | |
| 39 * @author dwinter | |
| 40 * | |
| 41 */ | |
| 42 public class XMLSearchServerAdapter implements IFulltextSearchDocsLines { | |
| 43 | |
| 44 protected static Logger logger = Logger.getRootLogger(); | |
| 45 | |
| 46 public static String XMLServerSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/queryResult.xql?"; | |
| 47 //public static String XMLDocSearchBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/doc-query.xql?"; | |
| 48 public static String XMLDocSearchBase = "http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?"; | |
| 49 public static String XMLServerBase = "http://mpdl-test.mpiwg-berlin.mpg.de:30030/mpdl/interface/"; | |
| 50 | |
| 51 | |
| 52 //http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas | |
| 53 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas | |
| 54 /* | |
| 55 * (non-Javadoc) | |
| 56 * | |
| 57 * @see | |
| 58 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesToDir | |
| 59 * (org.apache.lucene.search.Query, java.lang.String, java.lang.String) | |
| 60 */ | |
| 61 | |
| 62 @Override | |
| 63 public void searchInLinesToDir(Query query, String calledLanguage, String ticket) | |
| 64 throws IOException { | |
| 65 String languageFolderName; | |
| 66 String language; | |
| 67 //check format of the language string could be lang:xml or just lang | |
| 68 String[] langsplitted = calledLanguage.split(":"); | |
| 69 if(langsplitted.length>1){ | |
| 70 if(langsplitted[1].equals("XML")){ | |
| 71 language=langsplitted[0]; | |
| 72 languageFolderName=calledLanguage; | |
| 73 } else { | |
| 74 language=calledLanguage; | |
| 75 languageFolderName=calledLanguage+":XML"; | |
| 76 } | |
| 77 | |
| 78 } else { | |
| 79 language=calledLanguage; | |
| 80 languageFolderName=calledLanguage+":XML"; | |
| 81 } | |
| 82 | |
| 83 | |
| 84 | |
| 85 TicketWriter tw = new TicketWriter(ticket, query, languageFolderName); | |
| 86 | |
| 87 File languageFile = new File(tw.ticketFile.getAbsolutePath() | |
| 88 + tw.PATHSEPARATOR + languageFolderName); | |
| 89 if (!languageFile.exists()) { | |
| 90 logger.debug("Create Languagefolder:" | |
| 91 + languageFile.getCanonicalPath()); | |
| 92 if (!languageFile.mkdirs()) | |
| 93 throw new IOException(); | |
| 94 } | |
| 95 | |
| 96 // docbase=archimedes&docbase=echo&queryType=fulltextMorph&language=la&ftMorphQuery=quantitas&pn=1&output=xml&pageSize=50 | |
| 97 SAXBuilder parser = new SAXBuilder(); | |
| 98 | |
| 99 String queryString = XMLServerSearchBase | |
| 100 + "docbase=archimedes&docbase=echo&queryType=fulltextMorph"; | |
| 101 queryString += "&language=" + language; | |
| 102 | |
| 103 Set<Term> terms = new HashSet<Term>(); | |
| 104 query.extractTerms(terms); | |
| 105 String morphQuery = ""; | |
| 106 for (Term t : terms) { | |
| 107 if (t.field().equals("contents")) | |
| 108 morphQuery = t.text(); | |
| 109 } | |
| 110 queryString += "&ftMorphQuery=" + morphQuery; | |
| 111 queryString += "&pn=1&output=xml&pageSize=500"; | |
| 112 Document doc; | |
| 113 try { | |
| 114 doc = parser.build(queryString); | |
| 115 } catch (JDOMException e) { | |
| 116 // TODO Auto-generated catch block | |
| 117 e.printStackTrace(); | |
| 118 return; | |
| 119 } | |
| 120 | |
| 121 int counter = writeResults(tw, languageFolderName, ticket, doc); | |
| 122 writeResultInfo(tw, doc, counter, languageFolderName); | |
| 123 | |
| 124 tw.commitTicket(); | |
| 125 | |
| 126 List<Element> docElements; | |
| 127 try { | |
| 128 XPath docsXP = XPath.newInstance("//document"); | |
| 129 docElements = docsXP.selectNodes(doc); | |
| 130 } catch (JDOMException e1) { | |
| 131 // TODO Auto-generated catch block | |
| 132 e1.printStackTrace(); | |
| 133 return; | |
| 134 } | |
| 135 | |
| 136 for (Element e : docElements) { | |
| 137 Element textIdElement; | |
| 138 try { | |
| 139 textIdElement = (Element) XPath.selectSingleNode(e, "uri"); | |
| 140 } catch (JDOMException e1) { | |
| 141 // TODO Auto-generated catch block | |
| 142 continue; | |
| 143 } | |
| 144 String textId = textIdElement.getTextTrim(); | |
| 145 | |
| 146 File docFile = new File(languageFile.getAbsolutePath() | |
| 147 + TicketWriter.PATHSEPARATOR | |
| 148 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); | |
| 149 if (!docFile.exists()) { | |
| 150 logger.debug("Create Docfolder:" + docFile.getCanonicalPath()); | |
| 151 if (!docFile.mkdirs()) | |
| 152 throw new IOException(); | |
| 153 } | |
| 154 | |
| 155 // TODO: jetzt fuer jede seite ein file, zur Zeit jeweils nur ein | |
| 156 // File pro Document! | |
| 157 // for (String page:ocrDoc.linesInPage.keySet()){ | |
| 158 | |
| 159 File pageFile = new File(docFile.getAbsolutePath() | |
| 160 + TicketWriter.PATHSEPARATOR | |
| 161 + textId.replace(TicketWriter.PATHSEPARATORCHAR, ':')); | |
| 162 FileWriter pageFileWriter = new FileWriter(pageFile); | |
| 163 // http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/doc-query.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&queryType=fulltext&query=quantitas | |
| 164 http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Bernoulli_1738_AZ870BWE.xml&mode=text&query-type=fulltext&query=quantitas | |
| 165 // for (OCRLine line: ocrDoc.linesInPage.get(page)){ | |
| 166 // pageFileWriter.write("allLines"+"\n"); | |
| 167 // } | |
| 168 queryString = XMLDocSearchBase + "document=" + textId; | |
| 169 //queryString += "&queryType=fulltext&query=" + morphQuery; | |
| 170 queryString += "&mode=text&query-type=fulltext&query=" + morphQuery; | |
| 171 | |
| 172 try { | |
| 173 doc = parser.build(queryString); | |
| 174 } catch (JDOMException e2) { | |
| 175 // TODO Auto-generated catch block | |
| 176 e2.printStackTrace(); | |
| 177 return; | |
| 178 } | |
| 179 | |
| 180 XMLOutputter op = new XMLOutputter(Format.getCompactFormat()); | |
| 181 op.output(doc, pageFileWriter); | |
| 182 pageFileWriter.close(); | |
| 183 } | |
| 184 tw.closeTicket(languageFolderName); | |
| 185 } | |
| 186 | |
| 187 private void writeResultInfo(TicketWriter tw, Document doc, int counter, | |
| 188 String languageFolderName) throws IOException { | |
| 189 //String languageFolderName = language + "_XML"; | |
| 190 | |
| 191 File languageFile = new File(tw.ticketFile.getAbsolutePath() | |
| 192 + TicketWriter.PATHSEPARATOR + languageFolderName); | |
| 193 File resultFile = new File(languageFile.getAbsolutePath() | |
| 194 + TicketWriter.PATHSEPARATOR + "resultInfo"); | |
| 195 FileOutputStream fs = new FileOutputStream(resultFile); | |
| 196 OutputStreamWriter rw = new OutputStreamWriter(fs, "utf-8"); | |
| 197 String ret = "<resultInfo>"; | |
| 198 // int hits = docs.totalHits; | |
| 199 ret += "<lang>" + languageFolderName+"</lang>"; | |
| 200 ret += "<hits>" + counter + "</hits>"; | |
| 201 ret += "<totalHits>" + counter + "</totalHits>";// TODO: gibt es in | |
| 202 // diesem fall einen | |
| 203 // unterschied zwischen | |
| 204 // hits und totalhits? | |
| 205 | |
| 206 ret += "</resultInfo>"; | |
| 207 rw.write(ret); | |
| 208 rw.close(); | |
| 209 } | |
| 210 | |
| 211 private int writeResults(TicketWriter tw, String languageFolderName, String ticket, | |
| 212 Document doc) throws IOException { | |
| 213 OutputStreamWriter rw = null; | |
| 214 File languageFile; | |
| 215 //String languageFolderName = language + "_XML"; | |
| 216 | |
| 217 try { | |
| 218 languageFile = new File(tw.ticketFile.getAbsolutePath() | |
| 219 + tw.PATHSEPARATOR + languageFolderName); | |
| 220 File resultFile = new File(languageFile.getAbsolutePath() | |
| 221 + tw.PATHSEPARATOR + "result"); | |
| 222 FileOutputStream fs = new FileOutputStream(resultFile); | |
| 223 rw = new OutputStreamWriter(fs, "utf-8"); | |
| 224 } catch (FileNotFoundException e) { | |
| 225 // TODO Auto-generated catch block | |
| 226 e.printStackTrace(); | |
| 227 return -1; | |
| 228 } catch (UnsupportedEncodingException e) { | |
| 229 // TODO Auto-generated catch block | |
| 230 e.printStackTrace(); | |
| 231 return -1; | |
| 232 } | |
| 233 | |
| 234 Set<String> idsAlreadyDone = new HashSet<String>(); // TODO: aus | |
| 235 // irgendwelche | |
| 236 // gruenden gibt es | |
| 237 // ein Dokument | |
| 238 // mehrfach in den | |
| 239 // Fundstellen | |
| 240 | |
| 241 // FileWriter rw = new FileWriter(resultFile); | |
| 242 | |
| 243 if (!languageFile.exists()) { | |
| 244 logger.debug("Create Languagefolder:" | |
| 245 + languageFile.getCanonicalPath()); | |
| 246 if (!languageFile.mkdirs()) | |
| 247 throw new IOException(); | |
| 248 } | |
| 249 Integer counter = 0; | |
| 250 @SuppressWarnings("unchecked") | |
| 251 List<Element> elements; | |
| 252 try { | |
| 253 XPath xpathDoc = XPath.newInstance("//document"); | |
| 254 elements = xpathDoc.selectNodes(doc); | |
| 255 } catch (JDOMException e1) { | |
| 256 // TODO Auto-generated catch block | |
| 257 e1.printStackTrace(); | |
| 258 return -1; | |
| 259 } | |
| 260 | |
| 261 for (Element e : elements) { | |
| 262 try { | |
| 263 XPath xpathUri = XPath.newInstance("uri"); | |
| 264 Element uri = (Element) xpathUri.selectSingleNode(e); | |
| 265 | |
| 266 String id = uri.getTextTrim(); | |
| 267 if (!idsAlreadyDone.contains(id)) { // stelle sicher das alle | |
| 268 // treffer nur einmal in die | |
| 269 // date geschrieben werden. | |
| 270 idsAlreadyDone.add(id); | |
| 271 | |
| 272 String textId = id; | |
| 273 String md = ""; | |
| 274 Element mdEl = (Element) XPath | |
| 275 .selectSingleNode(e, "author"); | |
| 276 md += "<dc:creator>" + formatXML(mdEl.getTextTrim()) | |
| 277 + "</dc:creator>"; | |
| 278 | |
| 279 mdEl = (Element) XPath.selectSingleNode(e, "title"); | |
| 280 md += "<dc:title>" + formatXML(mdEl.getTextTrim()) | |
| 281 + "</dc:title>"; | |
| 282 | |
| 283 mdEl = (Element) XPath.selectSingleNode(e, "place"); | |
| 284 md += "<dc:place>" + formatXML(mdEl.getTextTrim()) | |
| 285 + "</dc:place>"; | |
| 286 | |
| 287 mdEl = (Element) XPath.selectSingleNode(e, "date"); | |
| 288 md += "<dc:date>" + formatXML(mdEl.getTextTrim()) | |
| 289 + "</dc:date>"; | |
| 290 | |
| 291 String ret = "<result xmlns:dc=\"http://dublincore.org/documents/dcmi-namespace/\">"; | |
| 292 ret += "<cleanedPath>" + textId + "</cleanedPath>"; | |
| 293 ret += "<textId>" + textId.replace("/", ":") + "</textId>"; | |
| 294 ret += "<textIdCleaned>" + textId.replace("/", "_") | |
| 295 + "</textIdCleaned>"; | |
| 296 ret += "<md>" + md + "</md>"; | |
| 297 ret += "</result>"; | |
| 298 | |
| 299 rw.write(ret); | |
| 300 counter++; | |
| 301 } | |
| 302 } catch (JDOMException e1) { | |
| 303 // TODO Auto-generated catch block | |
| 304 e1.printStackTrace(); | |
| 305 return -1; | |
| 306 } | |
| 307 } | |
| 308 rw.close(); | |
| 309 return counter; | |
| 310 } | |
| 311 | |
| 312 private String formatXML(String string) { | |
| 313 String retStr = string.replace("&", "&"); | |
| 314 retStr = retStr.replace("<", "<"); | |
| 315 retStr = retStr.replace(">", ">"); | |
| 316 return retStr; | |
| 317 } | |
| 318 | |
| 319 /* | |
| 320 * (non-Javadoc) | |
| 321 * | |
| 322 * @see | |
| 323 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLinesDoc | |
| 324 * (java.lang.String, org.apache.lucene.search.Query, java.lang.String) | |
| 325 */ | |
| 326 @Override | |
| 327 public OCRDoc searchInLinesDoc(String textId, Query query, String language) | |
| 328 throws IOException { | |
| 329 // TODO Auto-generated method stub | |
| 330 return null; | |
| 331 } | |
| 332 | |
| 333 /* | |
| 334 * (non-Javadoc) | |
| 335 * | |
| 336 * @see | |
| 337 * de.mpiwg.dwinter.fulltext.search.IFulltextSearchDocsLines#searchInLines | |
| 338 * (org.apache.lucene.search.Query, java.lang.String) | |
| 339 */ | |
| 340 @Override | |
| 341 public List<OCRDoc> searchInLines(Query query, String language) | |
| 342 throws IOException { | |
| 343 // TODO Auto-generated method stub | |
| 344 return null; | |
| 345 } | |
| 346 | |
| 347 public static void main(String[] args) { | |
| 348 Term t = new Term("contents", "quantitas"); | |
| 349 Query q = new TermQuery(t); | |
| 350 XMLSearchServerAdapter sa = new XMLSearchServerAdapter(); | |
| 351 try { | |
| 352 sa.searchInLinesToDir(q, "la", "121"); | |
| 353 } catch (IOException e) { | |
| 354 // TODO Auto-generated catch block | |
| 355 e.printStackTrace(); | |
| 356 } | |
| 357 } | |
| 358 | |
| 359 public static List<String> getSupportedLanguages() { | |
| 360 String langs[] = new String[] { "la:XML", "it:XML" }; | |
| 361 return Arrays.asList(langs); | |
| 362 | |
| 363 } | |
| 364 } |
