comparison MpiwgXmlTextServer.py @ 576:b2c7e272e075

new w-tag solution with etree. search works now.
author:   casties
date:     Wed, 17 Oct 2012 16:36:13 +0200
parents:  f0e5e9c6737f
children: 9251719154a3

@@ -6,10 +6,12 @@
 import re
 import logging
 import urllib
 import urlparse
 import base64
+
+from datetime import datetime
 
 from SrvTxtUtils import getInt, getText, getHttpData
 
 def serialize(node):
     """returns a string containing an XML snippet of node"""
@@ -184,10 +186,11 @@
 
     def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
         """returns single page from fulltext"""
 
         logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
+        startTime = datetime.now()
         # check for cached text -- but ideally this shouldn't be called twice
         if pageinfo.has_key('textPage'):
             logging.debug("getTextPage: using cached text")
             return pageinfo['textPage']
 
@@ -206,11 +209,14 @@
 
         normMode = pageinfo.get('characterNormalization', 'reg')
         # TODO: change values in form
         if normMode == 'regPlusNorm':
             normMode = 'norm'
 
+        # TODO: this should not be necessary when the backend is fixed
+        textParams['normalization'] = normMode
+
         if not mode:
             # default is dict
             mode = 'text'
 
         modes = mode.split(',')
@@ -238,24 +244,21 @@
             modes.remove('pundit')
 
         # other modes don't combine
         if 'dict' in modes:
             textmode = 'dict'
-            textParams['mode'] = 'tokenized'
             textParams['outputFormat'] = 'html'
         elif 'xml' in modes:
             textmode = 'xml'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'xmlDisplay'
-            textParams['normMode'] = 'orig'
+            normMode = 'orig'
         elif 'gis' in modes:
             #FIXME!
             textmode = 'gis'
         else:
             # text is default mode
             textmode = 'plain'
-            textParams['mode'] = 'untokenized'
             textParams['outputFormat'] = 'html'
 
         try:
             # fetch the page
             pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
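
Note on the change above: the tokenizer switch (textParams['mode']) is gone from all three branches, so the GetPage request now carries only the output format plus the normalization that is set explicitly further up. A minimal sketch of the resulting request encoding (Python 2); the 'document' and 'pn' keys are hypothetical stand-ins for parameters set elsewhere in the class:

import urllib

textParams = {'document': '/mpiwg/example.xml',  # hypothetical
              'pn': 1,                           # hypothetical
              'normalization': 'reg',
              'outputFormat': 'html'}
print urllib.urlencode(textParams)
# -> a query string like 'normalization=reg&outputFormat=html&pn=1&...'
#    (Python 2 dicts are unordered, so parameter order may vary)
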
@@ -270,11 +273,11 @@
             pagediv = dom.find(".//div[@class='text']")
             logging.debug("pagediv: %s"%repr(pagediv))
             if pagediv is not None:
                 # add textmode and normMode classes
                 pagediv.set('class', 'text %s %s'%(textmode, normMode))
-                #self._processWTags(textmode, normMode, pagediv)
+                self._processWTags(textmode, normMode, pagediv)
                 #self._processPbTag(pagediv, pageinfo)
                 self._processFigures(pagediv, docinfo)
                 #self._fixEmptyDivs(pagediv)
                 # get full url assuming documentViewer is parent
                 selfurl = self.getLink()
@@ -285,23 +288,19 @@
                     if href:
                         # is link with href
                         linkurl = urlparse.urlparse(href)
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
-                            # is dictionary link - change href (keeping parameters)
-                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                             # add target to open new page
                             l.set('target', '_blank')
-
-                        elif href.startswith('#note-'):
-                            # note link FIXME!
-                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
 
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)
 
-                return serialize(pagediv)
+                s = serialize(pagediv)
+                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
+                return s
 
         # xml mode
         elif textmode == "xml":
             # the text is in body
             pagediv = dom.find(".//body")
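
The startTime/debug pair added to getTextPage is a plain wall-clock bracket: subtracting two datetime values yields a timedelta, which %s formats directly. A minimal sketch of the pattern:

from datetime import datetime
import logging

logging.basicConfig(level=logging.DEBUG)

startTime = datetime.now()
# ... fetch and transform the page here ...
logging.debug("getTextPage done in %s" % (datetime.now() - startTime))
# logs e.g. "getTextPage done in 0:00:00.000042"
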
@@ -343,31 +342,50 @@
         return None
 
     def _processWTags(self, textMode, normMode, pagediv):
         """selects the necessary information from w-spans and removes the rest from pagediv"""
         logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
+        startTime = datetime.now()
         wtags = pagediv.findall(".//span[@class='w']")
         for wtag in wtags:
-            text = None
-            attr = None
             if textMode == 'dict':
-                # take a-tag and matching child
-                attr = wtag.find('a').items()
-                text = wtag.find("a/span[@class='%s']"%normMode).text
+                # delete non-a-tags
+                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                # delete non-matching children of a-tag and suppress remaining tag name
+                atag = wtag.find("a[@class='dictionary']")
+                if normMode == 'orig':
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='orig']").tag = None
+                elif normMode == 'reg':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='norm']"))
+                    atag.find("span[@class='reg']").tag = None
+                elif normMode == 'norm':
+                    atag.remove(atag.find("span[@class='orig']"))
+                    atag.remove(atag.find("span[@class='reg']"))
+                    atag.find("span[@class='norm']").tag = None
+
             else:
-                # take matching child
-                text = wtag.find("span[@class='nodictionary %s']"%normMode).text
+                # delete a-tag
+                wtag.remove(wtag.find("a[@class='dictionary']"))
+                # delete non-matching children and suppress remaining tag name
+                if normMode == 'orig':
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary orig']").tag = None
+                elif normMode == 'reg':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
+                    wtag.find("span[@class='nodictionary reg']").tag = None
+                elif normMode == 'norm':
+                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
+                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
+                    wtag.find("span[@class='nodictionary norm']").tag = None
 
-            if text:
-                # replace wtag by new content
-                logging.debug("new w-tag attr=%s text=%s"%(attr,text))
-                wtag.clear()
-
-                if attr:
-                    # make dictionary link
-                    wtag.tag = 'a'
-                    wtag.attrib.update(dict(attr))
-
-                # text content
-                wtag.text = text
-
+            # suppress w-tag name
+            wtag.tag = None
+
+        logging.debug("processWTags in %s"%(datetime.now()-startTime))
         return pagediv
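
The w-span markup being pruned here is inferred from the selectors: each <span class='w'> is assumed to contain an <a class='dictionary'> with orig/reg/norm children plus three nodictionary spans. Setting .tag = None uses a long-standing (if little-documented) behavior of xml.etree's serializer: an element with a None tag is written as just its text and children, which unwraps the chosen form in place. A minimal, self-contained sketch for textMode='dict' and normMode='reg'; the sample markup and URL are assumptions, not backend output:

import xml.etree.ElementTree as ET

wtag = ET.fromstring(
    "<span class='w'>"
    "<a class='dictionary' href='http://example.org/dict?w=foo'>"
    "<span class='orig'>foo</span>"
    "<span class='reg'>Foo</span>"
    "<span class='norm'>FOO</span></a>"
    "<span class='nodictionary orig'>foo</span>"
    "<span class='nodictionary reg'>Foo</span>"
    "<span class='nodictionary norm'>FOO</span>"
    "</span>")

# delete the nodictionary alternatives
for cls in ('nodictionary orig', 'nodictionary reg', 'nodictionary norm'):
    wtag.remove(wtag.find("span[@class='%s']" % cls))
# keep only the 'reg' child of the dictionary link and unwrap it
atag = wtag.find("a[@class='dictionary']")
atag.remove(atag.find("span[@class='orig']"))
atag.remove(atag.find("span[@class='norm']"))
atag.find("span[@class='reg']").tag = None
# unwrap the w-span itself
wtag.tag = None

print ET.tostring(wtag)
# -> <a class="dictionary" href="http://example.org/dict?w=foo">Foo</a>
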
@@ -374,4 +392,4 @@
 
     def _processPbTag(self, pagediv, pageinfo):
         """extracts information from pb-tag and removes it from pagediv"""
         pbdiv = pagediv.find(".//span[@class='pb']")
@@ -407,14 +425,18 @@
 
         return pagediv
 
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
-        divs = pagediv.findall(".//span[@class='figure']")
+        # unfortunately etree can not select class.startswith('figure')
+        divs = pagediv.findall(".//span[@class]")
         scalerUrl = docinfo['digilibScalerUrl']
         viewerUrl = docinfo['digilibViewerUrl']
         for d in divs:
+            if not d.get('class').startswith('figure'):
+                continue
+
             try:
                 a = d.find('a')
                 img = a.find('img')
                 imgsrc = img.get('src')
                 imgurl = urlparse.urlparse(imgsrc)
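
As the new comment says, ElementTree's abbreviated XPath has no starts-with() predicate, so _processFigures now selects every span that carries a class attribute and filters by prefix in Python. A minimal sketch of that pattern (sample markup assumed):

import xml.etree.ElementTree as ET

page = ET.fromstring(
    "<div>"
    "<span class='figure f1'><a href='#'><img src='i.jpg'/></a></span>"
    "<span class='caption'>not a figure</span>"
    "</div>")
figures = [s for s in page.findall(".//span[@class]")
           if s.get('class').startswith('figure')]
print len(figures)
# -> 1
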
@@ -482,12 +504,18 @@
             pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
         results = []
         try:
             dom = ET.fromstring(pagexml)
             # page content is currently in multiple <td align=left>
-            alldivs = dom.findall(".//td[@align='left']")
+            alldivs = dom.findall(".//tr[@class='hit']")
             for div in alldivs:
+                # change tr to div
+                div.tag = 'div'
+                # change td to span
+                for d in div.findall('td'):
+                    d.tag = 'span'
+
                 # TODO: can we put etree in the session?
                 results.append(div)
 
         except Exception, e:
             logging.error("GetSearchResults: Error parsing search result: %s"%e)
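
Retagging the hit rows in place (tr to div, td to span) converts the backend's table markup without copying subtrees; attributes, text, and children stay attached to the renamed element. A minimal sketch with assumed hit markup:

import xml.etree.ElementTree as ET

row = ET.fromstring("<tr class='hit'><td>1</td><td>some snippet</td></tr>")
row.tag = 'div'                 # change tr to div
for d in row.findall('td'):
    d.tag = 'span'              # change td to span
print ET.tostring(row)
# -> <div class="hit"><span>1</span><span>some snippet</span></div>
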
@@ -514,38 +542,48 @@
         size = pageinfo.get('resultPageSize', 10)
 
         if start is None:
             start = (pn - 1) * size
 
-        fullresult = ET.fromstring(resultxml)
-
-        if fullresult is not None:
+        #fullresult = ET.fromstring(resultxml)
+        #fullresult = resultxml
+        #logging.debug("resultxml=%s"%repr(resultxml))
+
+        if resultxml is not None:
             # paginate
             first = start-1
-            len = size
-            del fullresult[:first]
-            del fullresult[len:]
-            tocdivs = fullresult
-
-            # check all a-tags
-            links = tocdivs.findall(".//a")
-            for l in links:
-                href = l.get('href')
-                if href:
-                    # assume all links go to pages
-                    linkUrl = urlparse.urlparse(href)
-                    linkParams = urlparse.parse_qs(linkUrl.query)
-                    # take some parameters
-                    params = {'pn': linkParams['pn'],
-                              'highlightQuery': linkParams.get('highlightQuery',''),
-                              'highlightElement': linkParams.get('highlightElement',''),
-                              'highlightElementPos': linkParams.get('highlightElementPos','')
-                              }
-                    url = self.getLink(params=params)
-                    l.set('href', url)
+            last = first+size
+            tocdivs = resultxml[first:last]
+            #del fullresult[:first]
+            #del fullresult[len:]
+            #tocdivs = fullresult
+
+            toc = ET.Element('div', attrib={'class':'queryResultPage'})
+            for div in tocdivs:
+                # check all a-tags
+                links = div.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        # assume all links go to pages
+                        linkUrl = urlparse.urlparse(href)
+                        linkParams = urlparse.parse_qs(linkUrl.query)
+                        # take some parameters (make sure it works even if the link was already parsed)
+                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
+                                  'highlightQuery': linkParams.get('highlightQuery',None),
+                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
+                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
+                                  }
+                        if not params['pn']:
+                            logging.warn("getResultsPage: link has no page: %s"%href)
+
+                        url = self.getLink(params=params)
+                        l.set('href', url)
 
-            return serialize(tocdivs)
+                toc.append(div)
+
+            return serialize(toc)
 
         return "ERROR: no results!"
 
 
     def getToc(self, mode='text', docinfo=None):
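
The new pagination slices a window out of the cached hit list and appends it to a fresh container, where the commented-out del approach mutated fullresult in place (and shadowed the len builtin); slicing leaves the cached results intact for the next page. A minimal sketch, assuming a 0-based start offset and a plain list of hit elements:

import xml.etree.ElementTree as ET

hits = [ET.Element('div', {'class': 'hit', 'n': str(i)}) for i in range(10)]
pn, size = 2, 3
start = (pn - 1) * size             # 0-based offset of the first hit shown
toc = ET.Element('div', {'class': 'queryResultPage'})
for div in hits[start:start + size]:
    toc.append(div)
print [d.get('n') for d in toc]
# -> ['3', '4', '5']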