Mercurial > hg > documentViewer
comparison MpiwgXmlTextServer.py @ 583:ca0274423382
follow changes in html format of new text-backend.
author | casties |
---|---|
date | Mon, 12 Nov 2012 18:12:33 +0100 |
parents | fc861a6cef17 |
children | 6000c7e24d8a |
comparison
equal
deleted
inserted
replaced
582:bf0f514b6f92 | 583:ca0274423382 |
---|---|
293 l.set('target', '_blank') | 293 l.set('target', '_blank') |
294 | 294 |
295 if punditMode: | 295 if punditMode: |
296 self._addPunditAttributes(pagediv, pageinfo, docinfo) | 296 self._addPunditAttributes(pagediv, pageinfo, docinfo) |
297 | 297 |
298 # TODO: move empty page text | |
299 ep = dom.find(".//div[@class='emptyPage']") | |
300 if ep is not None: | |
301 pagediv.append(ep) | |
302 | |
303 s = serialize(pagediv) | 298 s = serialize(pagediv) |
304 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) | 299 logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) |
305 return s | 300 return s |
306 | 301 |
307 # xml mode | 302 # xml mode |
460 a.set('target', '_blank') | 455 a.set('target', '_blank') |
461 | 456 |
462 except: | 457 except: |
463 logging.warn("processFigures: strange figure!") | 458 logging.warn("processFigures: strange figure!") |
464 | 459 |
460 | |
461 def _cleanSearchResult(self, pagediv): | |
462 """fixes search result html (change pbs and figures)""" | |
463 # replace figure-tag with figureNumText | |
464 for fig in pagediv.findall(".//span[@class='figure']"): | |
465 txt = fig.findtext(".//span[@class='figureNumText']") | |
466 tail = fig.tail | |
467 fig.clear() | |
468 fig.set('class', 'figure') | |
469 fig.text = txt | |
470 fig.tail = tail | |
471 | |
472 # replace lb-tag with "//" | |
473 for lb in pagediv.findall(".//br[@class='lb']"): | |
474 lb.tag = 'span' | |
475 lb.text = '//' | |
476 | |
477 # replace pb-tag with "///" | |
478 for pb in pagediv.findall(".//span[@class='pb']"): | |
479 tail = pb.tail | |
480 pb.clear() | |
481 pb.set('class', 'pb') | |
482 pb.text = '///' | |
483 pb.tail = tail | |
484 | |
485 return pagediv | |
486 | |
487 def _cleanSearchResult2(self, pagediv): | |
488 """fixes search result html (change pbs and figures)""" | |
489 # unfortunately etree can not select class.startswith('figure') | |
490 divs = pagediv.findall(".//span[@class]") | |
491 for d in divs: | |
492 cls = d.get('class') | |
493 if cls.startswith('figure'): | |
494 # replace figure-tag with figureNumText | |
495 txt = d.findtext(".//span[@class='figureNumText']") | |
496 d.clear() | |
497 d.set('class', 'figure') | |
498 d.text = txt | |
499 | |
500 elif cls.startswith('pb'): | |
501 # replace pb-tag with "//" | |
502 d.clear() | |
503 d.set('class', 'pb') | |
504 d.text = '//' | |
505 | |
506 return pagediv | |
507 | |
508 | |
465 | 509 |
466 def _fixEmptyDivs(self, pagediv): | 510 def _fixEmptyDivs(self, pagediv): |
467 """fixes empty div-tags by inserting a space""" | 511 """fixes empty div-tags by inserting a space""" |
468 divs = pagediv.findall('.//div') | 512 divs = pagediv.findall('.//div') |
469 for d in divs: | 513 for d in divs: |
474 return pagediv | 518 return pagediv |
475 | 519 |
476 | 520 |
477 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): | 521 def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): |
478 """loads list of search results and stores XML in docinfo""" | 522 """loads list of search results and stores XML in docinfo""" |
479 | 523 normMode = pageinfo.get('characterNormalization', 'reg') |
480 logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) | 524 logging.debug("getSearchResults mode=%s query=%s norm=%s"%(mode, query, normMode)) |
481 if mode == "none": | 525 if mode == "none": |
482 return docinfo | 526 return docinfo |
483 | 527 |
484 #TODO: put mode into query | 528 #TODO: put mode into query |
485 | 529 |
486 cachedQuery = docinfo.get('cachedQuery', None) | 530 cachedQuery = docinfo.get('cachedQuery', None) |
487 if cachedQuery is not None: | 531 if cachedQuery is not None: |
488 # cached search result | 532 # cached search result |
489 if cachedQuery == '%s_%s'%(mode,query): | 533 if cachedQuery == '%s_%s_%s'%(mode,query,normMode): |
490 # same query | 534 # same query |
491 return docinfo | 535 return docinfo |
492 | 536 |
493 else: | 537 else: |
494 # different query | 538 # different query |
495 del docinfo['resultSize'] | 539 del docinfo['resultSize'] |
496 del docinfo['results'] | 540 del docinfo['results'] |
497 | 541 |
498 # cache query | 542 # cache query |
499 docinfo['cachedQuery'] = '%s_%s'%(mode,query) | 543 docinfo['cachedQuery'] = '%s_%s_%s'%(mode,query,normMode) |
500 | 544 |
501 # fetch full results | 545 # fetch full results |
502 docpath = docinfo['textURLPath'] | 546 docpath = docinfo['textURLPath'] |
503 params = {'docId': docpath, | 547 params = {'docId': docpath, |
504 'query': query, | 548 'query': query, |
507 'outputFormat': 'html'} | 551 'outputFormat': 'html'} |
508 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) | 552 pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) |
509 results = [] | 553 results = [] |
510 try: | 554 try: |
511 dom = ET.fromstring(pagexml) | 555 dom = ET.fromstring(pagexml) |
556 # clean html output | |
557 self._processWTags('plain', normMode, dom) | |
558 self._cleanSearchResult(dom) | |
512 # page content is currently in multiple <td align=left> | 559 # page content is currently in multiple <td align=left> |
513 alldivs = dom.findall(".//tr[@class='hit']") | 560 alldivs = dom.findall(".//tr[@class='hit']") |
514 for div in alldivs: | 561 for div in alldivs: |
515 # change tr to div | 562 # change tr to div |
516 div.tag = 'div' | 563 div.tag = 'div' |
530 | 577 |
531 return docinfo | 578 return docinfo |
532 | 579 |
533 | 580 |
534 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): | 581 def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): |
535 """returns single page from the table of contents""" | 582 """returns single page from the list of search results""" |
536 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) | 583 logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) |
537 # get (cached) result | 584 # get (cached) result |
538 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) | 585 self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) |
539 | 586 |
540 resultxml = docinfo.get('results', None) | 587 resultxml = docinfo.get('results', None) |