Mercurial > hg > documentViewer
comparison MpiwgXmlTextServer.py @ 576:b2c7e272e075
new w-tag solution with etree. search works now.
author | casties |
---|---|
date | Wed, 17 Oct 2012 16:36:13 +0200 |
parents | f0e5e9c6737f |
children | 9251719154a3 |
comparison
equal
deleted
inserted
replaced
575:f0e5e9c6737f | 576:b2c7e272e075 |
---|---|
6 import re | 6 import re |
7 import logging | 7 import logging |
8 import urllib | 8 import urllib |
9 import urlparse | 9 import urlparse |
10 import base64 | 10 import base64 |
11 | |
12 from datetime import datetime | |
11 | 13 |
12 from SrvTxtUtils import getInt, getText, getHttpData | 14 from SrvTxtUtils import getInt, getText, getHttpData |
13 | 15 |
14 def serialize(node): | 16 def serialize(node): |
15 """returns a string containing an XML snippet of node""" | 17 """returns a string containing an XML snippet of node""" |
184 | 186 |
185 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): | 187 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): |
186 """returns single page from fulltext""" | 188 """returns single page from fulltext""" |
187 | 189 |
188 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) | 190 logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) |
191 startTime = datetime.now() | |
189 # check for cached text -- but ideally this shouldn't be called twice | 192 # check for cached text -- but ideally this shouldn't be called twice |
190 if pageinfo.has_key('textPage'): | 193 if pageinfo.has_key('textPage'): |
191 logging.debug("getTextPage: using cached text") | 194 logging.debug("getTextPage: using cached text") |
192 return pageinfo['textPage'] | 195 return pageinfo['textPage'] |
193 | 196 |
206 | 209 |
207 normMode = pageinfo.get('characterNormalization', 'reg') | 210 normMode = pageinfo.get('characterNormalization', 'reg') |
208 # TODO: change values in form | 211 # TODO: change values in form |
209 if normMode == 'regPlusNorm': | 212 if normMode == 'regPlusNorm': |
210 normMode = 'norm' | 213 normMode = 'norm' |
211 | 214 |
215 # TODO: this should not be necessary when the backend is fixed | |
216 textParams['normalization'] = normMode | |
217 | |
212 if not mode: | 218 if not mode: |
213 # default is dict | 219 # default is dict |
214 mode = 'text' | 220 mode = 'text' |
215 | 221 |
216 modes = mode.split(',') | 222 modes = mode.split(',') |
238 modes.remove('pundit') | 244 modes.remove('pundit') |
239 | 245 |
240 # other modes don't combine | 246 # other modes don't combine |
241 if 'dict' in modes: | 247 if 'dict' in modes: |
242 textmode = 'dict' | 248 textmode = 'dict' |
243 textParams['mode'] = 'tokenized' | |
244 textParams['outputFormat'] = 'html' | 249 textParams['outputFormat'] = 'html' |
245 elif 'xml' in modes: | 250 elif 'xml' in modes: |
246 textmode = 'xml' | 251 textmode = 'xml' |
247 textParams['mode'] = 'untokenized' | |
248 textParams['outputFormat'] = 'xmlDisplay' | 252 textParams['outputFormat'] = 'xmlDisplay' |
249 textParams['normMode'] = 'orig' | 253 normMode = 'orig' |
250 elif 'gis' in modes: | 254 elif 'gis' in modes: |
251 #FIXME! | 255 #FIXME! |
252 textmode = 'gis' | 256 textmode = 'gis' |
253 else: | 257 else: |
254 # text is default mode | 258 # text is default mode |
255 textmode = 'plain' | 259 textmode = 'plain' |
256 textParams['mode'] = 'untokenized' | |
257 textParams['outputFormat'] = 'html' | 260 textParams['outputFormat'] = 'html' |
258 | 261 |
259 try: | 262 try: |
260 # fetch the page | 263 # fetch the page |
261 pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) | 264 pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) |
270 pagediv = dom.find(".//div[@class='text']") | 273 pagediv = dom.find(".//div[@class='text']") |
271 logging.debug("pagediv: %s"%repr(pagediv)) | 274 logging.debug("pagediv: %s"%repr(pagediv)) |
272 if pagediv is not None: | 275 if pagediv is not None: |
273 # add textmode and normMode classes | 276 # add textmode and normMode classes |
274 pagediv.set('class', 'text %s %s'%(textmode, normMode)) | 277 pagediv.set('class', 'text %s %s'%(textmode, normMode)) |
275 #self._processWTags(textmode, normMode, pagediv) | 278 self._processWTags(textmode, normMode, pagediv) |
276 #self._processPbTag(pagediv, pageinfo) | 279 #self._processPbTag(pagediv, pageinfo) |
277 self._processFigures(pagediv, docinfo) | 280 self._processFigures(pagediv, docinfo) |
278 #self._fixEmptyDivs(pagediv) | 281 #self._fixEmptyDivs(pagediv) |
279 # get full url assuming documentViewer is parent | 282 # get full url assuming documentViewer is parent |
280 selfurl = self.getLink() | 283 selfurl = self.getLink() |
285 if href: | 288 if href: |
286 # is link with href | 289 # is link with href |
287 linkurl = urlparse.urlparse(href) | 290 linkurl = urlparse.urlparse(href) |
288 if linkurl.path.endswith('GetDictionaryEntries'): | 291 if linkurl.path.endswith('GetDictionaryEntries'): |
289 #TODO: replace wordInfo page | 292 #TODO: replace wordInfo page |
290 # is dictionary link - change href (keeping parameters) | |
291 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) | |
292 # add target to open new page | 293 # add target to open new page |
293 l.set('target', '_blank') | 294 l.set('target', '_blank') |
294 | |
295 elif href.startswith('#note-'): | |
296 # note link FIXME! | |
297 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) | |
298 | 295 |
299 if punditMode: | 296 if punditMode: |
300 self._addPunditAttributes(pagediv, pageinfo, docinfo) | 297 self._addPunditAttributes(pagediv, pageinfo, docinfo) |
301 | 298 |
302 return serialize(pagediv) | 299 s = serialize(pagediv) |
300 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) | |
301 return s | |
303 | 302 |
304 # xml mode | 303 # xml mode |
305 elif textmode == "xml": | 304 elif textmode == "xml": |
306 # the text is in body | 305 # the text is in body |
307 pagediv = dom.find(".//body") | 306 pagediv = dom.find(".//body") |
343 return None | 342 return None |
344 | 343 |
345 def _processWTags(self, textMode, normMode, pagediv): | 344 def _processWTags(self, textMode, normMode, pagediv): |
346 """selects the necessary information from w-spans and removes the rest from pagediv""" | 345 """selects the necessary information from w-spans and removes the rest from pagediv""" |
347 logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode))) | 346 logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode))) |
347 startTime = datetime.now() | |
348 wtags = pagediv.findall(".//span[@class='w']") | 348 wtags = pagediv.findall(".//span[@class='w']") |
349 for wtag in wtags: | 349 for wtag in wtags: |
350 text = None | |
351 attr = None | |
352 if textMode == 'dict': | 350 if textMode == 'dict': |
353 # take a-tag and matching child | 351 # delete non-a-tags |
354 attr = wtag.find('a').items() | 352 wtag.remove(wtag.find("span[@class='nodictionary orig']")) |
355 text = wtag.find("a/span[@class='%s']"%normMode).text | 353 wtag.remove(wtag.find("span[@class='nodictionary reg']")) |
354 wtag.remove(wtag.find("span[@class='nodictionary norm']")) | |
355 # delete non-matching children of a-tag and suppress remaining tag name | |
356 atag = wtag.find("a[@class='dictionary']") | |
357 if normMode == 'orig': | |
358 atag.remove(atag.find("span[@class='reg']")) | |
359 atag.remove(atag.find("span[@class='norm']")) | |
360 atag.find("span[@class='orig']").tag = None | |
361 elif normMode == 'reg': | |
362 atag.remove(atag.find("span[@class='orig']")) | |
363 atag.remove(atag.find("span[@class='norm']")) | |
364 atag.find("span[@class='reg']").tag = None | |
365 elif normMode == 'norm': | |
366 atag.remove(atag.find("span[@class='orig']")) | |
367 atag.remove(atag.find("span[@class='reg']")) | |
368 atag.find("span[@class='norm']").tag = None | |
369 | |
356 else: | 370 else: |
357 # take matching child | 371 # delete a-tag |
358 text = wtag.find("span[@class='nodictionary %s']"%normMode).text | 372 wtag.remove(wtag.find("a[@class='dictionary']")) |
373 # delete non-matching children and suppress remaining tag name | |
374 if normMode == 'orig': | |
375 wtag.remove(wtag.find("span[@class='nodictionary reg']")) | |
376 wtag.remove(wtag.find("span[@class='nodictionary norm']")) | |
377 wtag.find("span[@class='nodictionary orig']").tag = None | |
378 elif normMode == 'reg': | |
379 wtag.remove(wtag.find("span[@class='nodictionary orig']")) | |
380 wtag.remove(wtag.find("span[@class='nodictionary norm']")) | |
381 wtag.find("span[@class='nodictionary reg']").tag = None | |
382 elif normMode == 'norm': | |
383 wtag.remove(wtag.find("span[@class='nodictionary orig']")) | |
384 wtag.remove(wtag.find("span[@class='nodictionary reg']")) | |
385 wtag.find("span[@class='nodictionary norm']").tag = None | |
359 | 386 |
360 if text: | 387 # suppress w-tag name |
361 # replace wtag by new content | 388 wtag.tag = None |
362 logging.debug("new w-tag attr=%s text=%s"%(attr,text)) | 389 |
363 wtag.clear() | 390 logging.debug("processWTags in %s"%(datetime.now()-startTime)) |
364 | |
365 if attr: | |
366 # make dictionary link | |
367 wtag.tag = 'a' | |
368 wtag.attrib.update(dict(attr)) | |
369 | |
370 # text content | |
371 wtag.text = text | |
372 | |
373 return pagediv | 391 return pagediv |
374 | 392 |
375 def _processPbTag(self, pagediv, pageinfo): | 393 def _processPbTag(self, pagediv, pageinfo): |
376 """extracts information from pb-tag and removes it from pagediv""" | 394 """extracts information from pb-tag and removes it from pagediv""" |
377 pbdiv = pagediv.find(".//span[@class='pb']") | 395 pbdiv = pagediv.find(".//span[@class='pb']") |
407 | 425 |
408 return pagediv | 426 return pagediv |
409 | 427 |
410 def _processFigures(self, pagediv, docinfo): | 428 def _processFigures(self, pagediv, docinfo): |
411 """processes figure-tags""" | 429 """processes figure-tags""" |
412 divs = pagediv.findall(".//span[@class='figure']") | 430 # unfortunately etree can not select class.startswith('figure') |
431 divs = pagediv.findall(".//span[@class]") | |
413 scalerUrl = docinfo['digilibScalerUrl'] | 432 scalerUrl = docinfo['digilibScalerUrl'] |
414 viewerUrl = docinfo['digilibViewerUrl'] | 433 viewerUrl = docinfo['digilibViewerUrl'] |
415 for d in divs: | 434 for d in divs: |
435 if not d.get('class').startswith('figure'): | |
436 continue | |
437 | |
416 try: | 438 try: |
417 a = d.find('a') | 439 a = d.find('a') |
418 img = a.find('img') | 440 img = a.find('img') |
419 imgsrc = img.get('src') | 441 imgsrc = img.get('src') |
420 imgurl = urlparse.urlparse(imgsrc) | 442 imgurl = urlparse.urlparse(imgsrc) |
482 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) | 504 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) |
483 results = [] | 505 results = [] |
484 try: | 506 try: |
485 dom = ET.fromstring(pagexml) | 507 dom = ET.fromstring(pagexml) |
486 # page content is currently in multiple <td align=left> | 508 # page content is currently in multiple <td align=left> |
487 alldivs = dom.findall(".//td[@align='left']") | 509 alldivs = dom.findall(".//tr[@class='hit']") |
488 for div in alldivs: | 510 for div in alldivs: |
511 # change tr to div | |
512 div.tag = 'div' | |
513 # change td to span | |
514 for d in div.findall('td'): | |
515 d.tag = 'span' | |
516 | |
489 # TODO: can we put etree in the session? | 517 # TODO: can we put etree in the session? |
490 results.append(div) | 518 results.append(div) |
491 | 519 |
492 except Exception, e: | 520 except Exception, e: |
493 logging.error("GetSearchResults: Error parsing search result: %s"%e) | 521 logging.error("GetSearchResults: Error parsing search result: %s"%e) |
514 size = pageinfo.get('resultPageSize', 10) | 542 size = pageinfo.get('resultPageSize', 10) |
515 | 543 |
516 if start is None: | 544 if start is None: |
517 start = (pn - 1) * size | 545 start = (pn - 1) * size |
518 | 546 |
519 fullresult = ET.fromstring(resultxml) | 547 #fullresult = ET.fromstring(resultxml) |
520 | 548 #fullresult = resultxml |
521 if fullresult is not None: | 549 #logging.debug("resultxml=%s"%repr(resultxml)) |
550 | |
551 if resultxml is not None: | |
522 # paginate | 552 # paginate |
523 first = start-1 | 553 first = start-1 |
524 len = size | 554 last = first+size |
525 del fullresult[:first] | 555 tocdivs = resultxml[first:last] |
526 del fullresult[len:] | 556 #del fullresult[:first] |
527 tocdivs = fullresult | 557 #del fullresult[len:] |
528 | 558 #tocdivs = fullresult |
529 # check all a-tags | 559 |
530 links = tocdivs.findall(".//a") | 560 toc = ET.Element('div', attrib={'class':'queryResultPage'}) |
531 for l in links: | 561 for div in tocdivs: |
532 href = l.get('href') | 562 # check all a-tags |
533 if href: | 563 links = div.findall(".//a") |
534 # assume all links go to pages | 564 for l in links: |
535 linkUrl = urlparse.urlparse(href) | 565 href = l.get('href') |
536 linkParams = urlparse.parse_qs(linkUrl.query) | 566 if href: |
537 # take some parameters | 567 # assume all links go to pages |
538 params = {'pn': linkParams['pn'], | 568 linkUrl = urlparse.urlparse(href) |
539 'highlightQuery': linkParams.get('highlightQuery',''), | 569 linkParams = urlparse.parse_qs(linkUrl.query) |
540 'highlightElement': linkParams.get('highlightElement',''), | 570 # take some parameters (make sure it works even if the link was already parsed) |
541 'highlightElementPos': linkParams.get('highlightElementPos','') | 571 params = {'pn': linkParams.get('page',linkParams.get('pn', None)), |
542 } | 572 'highlightQuery': linkParams.get('highlightQuery',None), |
543 url = self.getLink(params=params) | 573 'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)), |
544 l.set('href', url) | 574 'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None)) |
575 } | |
576 if not params['pn']: | |
577 logging.warn("getResultsPage: link has no page: %s"%href) | |
578 | |
579 url = self.getLink(params=params) | |
580 l.set('href', url) | |
545 | 581 |
546 return serialize(tocdivs) | 582 toc.append(div) |
583 | |
584 return serialize(toc) | |
547 | 585 |
548 return "ERROR: no results!" | 586 return "ERROR: no results!" |
549 | 587 |
550 | 588 |
551 def getToc(self, mode='text', docinfo=None): | 589 def getToc(self, mode='text', docinfo=None): |