comparison MpiwgXmlTextServer.py @ 566:4a31608f8b0e

more new MpiwgXmlTextServer.
author casties
date Wed, 10 Oct 2012 18:09:49 +0200
parents 1b483194901c
children 8b1e20bf300d
comparison
equal deleted inserted replaced
565:1b483194901c 566:4a31608f8b0e
304 304
305 # the text is in div@class=text 305 # the text is in div@class=text
306 pagediv = body.find(".//div[@class='text']") 306 pagediv = body.find(".//div[@class='text']")
307 logging.debug("pagediv: %s"%repr(pagediv)) 307 logging.debug("pagediv: %s"%repr(pagediv))
308 308
309 # plain text mode 309 # plain text or text-with-links mode
310 if textmode == "text": 310 if textmode == "text" or textmode == "dict":
311 if pagediv is not None: 311 if pagediv is not None:
312 # handle pb-tag 312 self._processPbTag(pagediv, pageinfo)
313 self._extractPbTag(pagediv, pageinfo) 313 self._processFigures(pagediv, docinfo)
314 #self._fixEmptyDivs(pagediv)
314 # get full url assuming documentViewer is parent 315 # get full url assuming documentViewer is parent
315 selfurl = self.getLink() 316 selfurl = self.getLink()
316 if punditMode:
317 self._addPunditAttributes(pagediv, pageinfo, docinfo)
318
319 # fix empty div tags
320 self._fixEmptyDivs(pagediv)
321 # check all a-tags 317 # check all a-tags
322 links = pagediv.findall('.//a') 318 links = pagediv.findall('.//a')
323 for l in links:
324 href = l.get('href')
325 # handle notes FIXME!
326 if href and href.startswith('#note-'):
327 href = href.replace('#note-',"%s#note-"%selfurl)
328 l.set('href', href)
329
330 return serialize(pagediv)
331
332 # text-with-links mode
333 elif textmode == "dict":
334 if pagediv is not None:
335 # handle pb-div
336 self._extractPbTag(pagediv, pageinfo)
337 viewerurl = docinfo['viewerUrl']
338 selfurl = self.getLink()
339 if punditMode:
340 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)
341
342 # fix empty div tags
343 self._fixEmptyDivs(pagediv)
344 # check all a-tags
345 links = pagediv.findall(".//a")
346 for l in links: 319 for l in links:
347 href = l.get('href') 320 href = l.get('href')
348 if href: 321 if href:
349 # is link with href 322 # is link with href
350 linkurl = urlparse.urlparse(href) 323 linkurl = urlparse.urlparse(href)
351 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
352 if linkurl.path.endswith('GetDictionaryEntries'): 324 if linkurl.path.endswith('GetDictionaryEntries'):
353 #TODO: replace wordInfo page 325 #TODO: replace wordInfo page
354 # is dictionary link - change href (keeping parameters) 326 # is dictionary link - change href (keeping parameters)
355 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) 327 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
356 # add target to open new page 328 # add target to open new page
357 l.set('target', '_blank') 329 l.set('target', '_blank')
358 330
359 if href.startswith('#note-'): 331 elif href.startswith('#note-'):
360 # note link 332 # note link FIXME!
361 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) 333 l.set('href', href.replace('#note-',"%s#note-"%selfurl))
362 334
335 if punditMode:
336 self._addPunditAttributes(pagediv, pageinfo, docinfo)
337
363 return serialize(pagediv) 338 return serialize(pagediv)
364 339
365 # xml mode 340 # xml mode
366 elif textmode == "xml": 341 elif textmode == "xml":
367 if pagediv is not None: 342 if pagediv is not None:
370 # pureXml mode WTF? 345 # pureXml mode WTF?
371 elif textmode == "pureXml": 346 elif textmode == "pureXml":
372 if pagediv is not None: 347 if pagediv is not None:
373 return serialize(pagediv) 348 return serialize(pagediv)
374 349
375 # gis mode 350 # gis mode FIXME!
376 elif textmode == "gis": 351 elif textmode == "gis":
377 if pagediv is not None: 352 if pagediv is not None:
378 # fix empty div tags 353 # fix empty div tags
379 self._fixEmptyDivs(pagediv) 354 self._fixEmptyDivs(pagediv)
380 # check all a-tags 355 # check all a-tags
391 366
392 return serialize(pagediv) 367 return serialize(pagediv)
393 368
394 return None 369 return None
395 370
396 def _extractPbTag(self, pagediv, pageinfo): 371 def _processPbTag(self, pagediv, pageinfo):
397 """extracts information from pb-tag and removes it from pagediv""" 372 """extracts information from pb-tag and removes it from pagediv"""
398 pbdiv = pagediv.find(".//span[@class='pb']") 373 pbdiv = pagediv.find(".//span[@class='pb']")
399 if pbdiv is None: 374 if pbdiv is None:
400 logging.warning("getTextPage: no pb-span!") 375 logging.warning("getTextPage: no pb-span!")
401 return pagediv 376 return pagediv
418 # check all div-tags 393 # check all div-tags
419 divs = pagediv.findall(".//div") 394 divs = pagediv.findall(".//div")
420 for d in divs: 395 for d in divs:
421 id = d.get('id') 396 id = d.get('id')
422 if id: 397 if id:
398 # TODO: check path (cf RFC2396)
423 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) 399 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
424 cls = d.get('class','') 400 cls = d.get('class','')
425 cls += ' pundit-content' 401 cls += ' pundit-content'
426 d.set('class', cls.strip()) 402 d.set('class', cls.strip())
427 403
428 return pagediv 404 return pagediv
429 405
def _processFigures(self, pagediv, docinfo):
    """Point figure thumbnails and their links at the configured digilib URLs.

    Looks at every ``span`` with ``class="figure"`` below *pagediv*. When the
    span contains ``a/img`` and the ``src`` of that image has an ``fn`` query
    parameter, the element tree is modified in place:

    * ``img@src`` becomes ``<digilibScalerUrl>?fn=<fn>&dw=200&dh=200``
    * ``a@href`` becomes ``<digilibViewerUrl>?fn=<fn>``
    * ``a@target`` becomes ``_blank``

    Figures without an ``a``, ``img`` or ``src`` (or with an unparsable
    ``src``) are reported with a warning and skipped. Figures whose ``src``
    has no ``fn`` parameter are left untouched without a warning.

    pagediv -- element whose descendants are searched for figure spans
    docinfo -- mapping providing 'digilibScalerUrl' and 'digilibViewerUrl';
               a missing key raises KeyError
    """
    figures = pagediv.findall(".//span[@class='figure']")
    scalerUrl = docinfo['digilibScalerUrl']
    viewerUrl = docinfo['digilibViewerUrl']
    for figure in figures:
        # explicit structure checks instead of a catch-all except, so that
        # unrelated errors are no longer silently swallowed
        a = figure.find('a')
        img = a.find('img') if a is not None else None
        imgsrc = img.get('src') if img is not None else None
        if imgsrc is None:
            logging.warning("processFigures: strange figure!")
            continue

        try:
            imgparams = urlparse.parse_qs(urlparse.urlparse(imgsrc).query)
        except ValueError:
            # the URL parser rejects some malformed URLs with ValueError
            logging.warning("processFigures: strange figure!")
            continue

        fn = imgparams.get('fn')
        if fn is None:
            # not a digilib-style image reference; leave it alone
            continue

        # parse_qs puts parameters in lists
        fn = fn[0]
        # TODO: check valid path
        # NOTE(review): parse_qs percent-decodes fn and it is inserted below
        # without re-encoding -- confirm that fn never needs escaping
        # fix img@src
        img.set('src', '%s?fn=%s&dw=200&dh=200' % (scalerUrl, fn))
        # fix a@href
        a.set('href', '%s?fn=%s' % (viewerUrl, fn))
        a.set('target', '_blank')
434
435
430 def _fixEmptyDivs(self, pagediv): 436 def _fixEmptyDivs(self, pagediv):
431 """fixes empty div-tags by inserting a space""" 437 """fixes empty div-tags by inserting a space"""
432 divs = pagediv.findall('.//div') 438 divs = pagediv.findall('.//div')
433 for d in divs: 439 for d in divs:
434 if len(d) == 0 and not d.text: 440 if len(d) == 0 and not d.text: