comparison MpiwgXmlTextServer.py @ 583:ca0274423382

follow changes in html format of new text-backend.
author casties
date Mon, 12 Nov 2012 18:12:33 +0100
parents fc861a6cef17
children 6000c7e24d8a
comparison
equal deleted inserted replaced
582:bf0f514b6f92 583:ca0274423382
293 l.set('target', '_blank') 293 l.set('target', '_blank')
294 294
295 if punditMode: 295 if punditMode:
296 self._addPunditAttributes(pagediv, pageinfo, docinfo) 296 self._addPunditAttributes(pagediv, pageinfo, docinfo)
297 297
298 # TODO: move empty page text
299 ep = dom.find(".//div[@class='emptyPage']")
300 if ep is not None:
301 pagediv.append(ep)
302
303 s = serialize(pagediv) 298 s = serialize(pagediv)
304 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) 299 logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
305 return s 300 return s
306 301
307 # xml mode 302 # xml mode
460 a.set('target', '_blank') 455 a.set('target', '_blank')
461 456
462 except: 457 except:
463 logging.warn("processFigures: strange figure!") 458 logging.warn("processFigures: strange figure!")
464 459
460
def _cleanSearchResult(self, pagediv):
    """Reduce figures, line breaks and page breaks in search result html to short text markers.

    Works in place on the element tree below *pagediv* and returns
    *pagediv* itself:

    - span.figure keeps only the text of its first nested
      span.figureNumText (no text at all if there is none),
    - br.lb is turned into a span showing "//",
    - span.pb has its content replaced by "///".
    """
    def resetSpan(elem, cls, text):
        # Element.clear() wipes children, attributes, text *and* tail,
        # so the tail (the text following the element) is carried over.
        keptTail = elem.tail
        elem.clear()
        elem.set('class', cls)
        elem.text = text
        elem.tail = keptTail

    # figures collapse to their figure number text
    for figure in pagediv.findall(".//span[@class='figure']"):
        resetSpan(figure, 'figure',
                  figure.findtext(".//span[@class='figureNumText']"))

    # line breaks are shown as "//" (class attribute and tail stay untouched)
    for lineBreak in pagediv.findall(".//br[@class='lb']"):
        lineBreak.text = '//'
        lineBreak.tag = 'span'

    # page breaks are shown as "///"
    for pageBreak in pagediv.findall(".//span[@class='pb']"):
        resetSpan(pageBreak, 'pb', '///')

    return pagediv
486
def _cleanSearchResult2(self, pagediv):
    """Simplify figure and page break spans in search result html, matching by class prefix.

    Works in place on the element tree below *pagediv* and returns
    *pagediv* itself:

    - a span whose class starts with 'figure' gets class 'figure' and
      keeps only the text of its first nested span.figureNumText
      (no text at all if there is none),
    - a span whose class starts with 'pb' gets class 'pb' and the
      text "//".

    The text following a rewritten span (its tail) is preserved.
    """
    # etree's XPath subset can not select class.startswith(...), so every
    # span that has a class attribute is fetched and filtered here.
    for span in pagediv.findall(".//span[@class]"):
        cls = span.get('class')
        if cls.startswith('figure'):
            # replace figure-tag with figureNumText
            newClass = 'figure'
            newText = span.findtext(".//span[@class='figureNumText']")
        elif cls.startswith('pb'):
            # replace pb-tag with "//"
            newClass = 'pb'
            newText = '//'
        else:
            continue

        # Element.clear() resets the tail together with children, attributes
        # and text; without restoring it the text behind the span is lost.
        tail = span.tail
        span.clear()
        span.set('class', newClass)
        span.text = newText
        span.tail = tail

    return pagediv
507
508
465 509
466 def _fixEmptyDivs(self, pagediv): 510 def _fixEmptyDivs(self, pagediv):
467 """fixes empty div-tags by inserting a space""" 511 """fixes empty div-tags by inserting a space"""
468 divs = pagediv.findall('.//div') 512 divs = pagediv.findall('.//div')
469 for d in divs: 513 for d in divs:
474 return pagediv 518 return pagediv
475 519
476 520
477 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): 521 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
478 """loads list of search results and stores XML in docinfo""" 522 """loads list of search results and stores XML in docinfo"""
479 523 normMode = pageinfo.get('characterNormalization', 'reg')
480 logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) 524 logging.debug("getSearchResults mode=%s query=%s norm=%s"%(mode, query, normMode))
481 if mode == "none": 525 if mode == "none":
482 return docinfo 526 return docinfo
483 527
484 #TODO: put mode into query 528 #TODO: put mode into query
485 529
486 cachedQuery = docinfo.get('cachedQuery', None) 530 cachedQuery = docinfo.get('cachedQuery', None)
487 if cachedQuery is not None: 531 if cachedQuery is not None:
488 # cached search result 532 # cached search result
489 if cachedQuery == '%s_%s'%(mode,query): 533 if cachedQuery == '%s_%s_%s'%(mode,query,normMode):
490 # same query 534 # same query
491 return docinfo 535 return docinfo
492 536
493 else: 537 else:
494 # different query 538 # different query
495 del docinfo['resultSize'] 539 del docinfo['resultSize']
496 del docinfo['results'] 540 del docinfo['results']
497 541
498 # cache query 542 # cache query
499 docinfo['cachedQuery'] = '%s_%s'%(mode,query) 543 docinfo['cachedQuery'] = '%s_%s_%s'%(mode,query,normMode)
500 544
501 # fetch full results 545 # fetch full results
502 docpath = docinfo['textURLPath'] 546 docpath = docinfo['textURLPath']
503 params = {'docId': docpath, 547 params = {'docId': docpath,
504 'query': query, 548 'query': query,
507 'outputFormat': 'html'} 551 'outputFormat': 'html'}
508 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) 552 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
509 results = [] 553 results = []
510 try: 554 try:
511 dom = ET.fromstring(pagexml) 555 dom = ET.fromstring(pagexml)
556 # clean html output
557 self._processWTags('plain', normMode, dom)
558 self._cleanSearchResult(dom)
512 # page content is currently in multiple <td align=left> 559 # page content is currently in multiple <td align=left>
513 alldivs = dom.findall(".//tr[@class='hit']") 560 alldivs = dom.findall(".//tr[@class='hit']")
514 for div in alldivs: 561 for div in alldivs:
515 # change tr to div 562 # change tr to div
516 div.tag = 'div' 563 div.tag = 'div'
530 577
531 return docinfo 578 return docinfo
532 579
533 580
534 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): 581 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
535 """returns single page from the table of contents""" 582 """returns single page from the list of search results"""
536 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) 583 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
537 # get (cached) result 584 # get (cached) result
538 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) 585 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
539 586
540 resultxml = docinfo.get('results', None) 587 resultxml = docinfo.get('results', None)