Mercurial > hg > documentViewer
comparison MpiwgXmlTextServer.py @ 566:4a31608f8b0e
more new MpiwgXmlTextServer.
author | casties |
---|---|
date | Wed, 10 Oct 2012 18:09:49 +0200 |
parents | 1b483194901c |
children | 8b1e20bf300d |
comparison
equal
deleted
inserted
replaced
565:1b483194901c | 566:4a31608f8b0e |
---|---|
304 | 304 |
305 # the text is in div@class=text | 305 # the text is in div@class=text |
306 pagediv = body.find(".//div[@class='text']") | 306 pagediv = body.find(".//div[@class='text']") |
307 logging.debug("pagediv: %s"%repr(pagediv)) | 307 logging.debug("pagediv: %s"%repr(pagediv)) |
308 | 308 |
309 # plain text mode | 309 # plain text or text-with-links mode |
310 if textmode == "text": | 310 if textmode == "text" or textmode == "dict": |
311 if pagediv is not None: | 311 if pagediv is not None: |
312 # handle pb-tag | 312 self._processPbTag(pagediv, pageinfo) |
313 self._extractPbTag(pagediv, pageinfo) | 313 self._processFigures(pagediv, docinfo) |
314 #self._fixEmptyDivs(pagediv) | |
314 # get full url assuming documentViewer is parent | 315 # get full url assuming documentViewer is parent |
315 selfurl = self.getLink() | 316 selfurl = self.getLink() |
316 if punditMode: | |
317 self._addPunditAttributes(pagediv, pageinfo, docinfo) | |
318 | |
319 # fix empty div tags | |
320 self._fixEmptyDivs(pagediv) | |
321 # check all a-tags | 317 # check all a-tags |
322 links = pagediv.findall('.//a') | 318 links = pagediv.findall('.//a') |
323 for l in links: | |
324 href = l.get('href') | |
325 # handle notes FIXME! | |
326 if href and href.startswith('#note-'): | |
327 href = href.replace('#note-',"%s#note-"%selfurl) | |
328 l.set('href', href) | |
329 | |
330 return serialize(pagediv) | |
331 | |
332 # text-with-links mode | |
333 elif textmode == "dict": | |
334 if pagediv is not None: | |
335 # handle pb-div | |
336 self._extractPbTag(pagediv, pageinfo) | |
337 viewerurl = docinfo['viewerUrl'] | |
338 selfurl = self.getLink() | |
339 if punditMode: | |
340 pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) | |
341 | |
342 # fix empty div tags | |
343 self._fixEmptyDivs(pagediv) | |
344 # check all a-tags | |
345 links = pagediv.findall(".//a") | |
346 for l in links: | 319 for l in links: |
347 href = l.get('href') | 320 href = l.get('href') |
348 if href: | 321 if href: |
349 # is link with href | 322 # is link with href |
350 linkurl = urlparse.urlparse(href) | 323 linkurl = urlparse.urlparse(href) |
351 #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) | |
352 if linkurl.path.endswith('GetDictionaryEntries'): | 324 if linkurl.path.endswith('GetDictionaryEntries'): |
353 #TODO: replace wordInfo page | 325 #TODO: replace wordInfo page |
354 # is dictionary link - change href (keeping parameters) | 326 # is dictionary link - change href (keeping parameters) |
355 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) | 327 #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) |
356 # add target to open new page | 328 # add target to open new page |
357 l.set('target', '_blank') | 329 l.set('target', '_blank') |
358 | 330 |
359 if href.startswith('#note-'): | 331 elif href.startswith('#note-'): |
360 # note link | 332 # note link FIXME! |
361 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) | 333 l.set('href', href.replace('#note-',"%s#note-"%selfurl)) |
362 | 334 |
335 if punditMode: | |
336 self._addPunditAttributes(pagediv, pageinfo, docinfo) | |
337 | |
363 return serialize(pagediv) | 338 return serialize(pagediv) |
364 | 339 |
365 # xml mode | 340 # xml mode |
366 elif textmode == "xml": | 341 elif textmode == "xml": |
367 if pagediv is not None: | 342 if pagediv is not None: |
370 # pureXml mode WTF? | 345 # pureXml mode WTF? |
371 elif textmode == "pureXml": | 346 elif textmode == "pureXml": |
372 if pagediv is not None: | 347 if pagediv is not None: |
373 return serialize(pagediv) | 348 return serialize(pagediv) |
374 | 349 |
375 # gis mode | 350 # gis mode FIXME! |
376 elif textmode == "gis": | 351 elif textmode == "gis": |
377 if pagediv is not None: | 352 if pagediv is not None: |
378 # fix empty div tags | 353 # fix empty div tags |
379 self._fixEmptyDivs(pagediv) | 354 self._fixEmptyDivs(pagediv) |
380 # check all a-tags | 355 # check all a-tags |
391 | 366 |
392 return serialize(pagediv) | 367 return serialize(pagediv) |
393 | 368 |
394 return None | 369 return None |
395 | 370 |
396 def _extractPbTag(self, pagediv, pageinfo): | 371 def _processPbTag(self, pagediv, pageinfo): |
397 """extracts information from pb-tag and removes it from pagediv""" | 372 """extracts information from pb-tag and removes it from pagediv""" |
398 pbdiv = pagediv.find(".//span[@class='pb']") | 373 pbdiv = pagediv.find(".//span[@class='pb']") |
399 if pbdiv is None: | 374 if pbdiv is None: |
400 logging.warning("getTextPage: no pb-span!") | 375 logging.warning("getTextPage: no pb-span!") |
401 return pagediv | 376 return pagediv |
418 # check all div-tags | 393 # check all div-tags |
419 divs = pagediv.findall(".//div") | 394 divs = pagediv.findall(".//div") |
420 for d in divs: | 395 for d in divs: |
421 id = d.get('id') | 396 id = d.get('id') |
422 if id: | 397 if id: |
398 # TODO: check path (cf RFC2396) | |
423 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) | 399 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) |
424 cls = d.get('class','') | 400 cls = d.get('class','') |
425 cls += ' pundit-content' | 401 cls += ' pundit-content' |
426 d.set('class', cls.strip()) | 402 d.set('class', cls.strip()) |
427 | 403 |
428 return pagediv | 404 return pagediv |
429 | 405 |
406 def _processFigures(self, pagediv, docinfo): | |
407 """processes figure-tags""" | |
408 divs = pagediv.findall(".//span[@class='figure']") | |
409 scalerUrl = docinfo['digilibScalerUrl'] | |
410 viewerUrl = docinfo['digilibViewerUrl'] | |
411 for d in divs: | |
412 try: | |
413 a = d.find('a') | |
414 img = a.find('img') | |
415 imgsrc = img.get('src') | |
416 imgurl = urlparse.urlparse(imgsrc) | |
417 imgq = imgurl.query | |
418 imgparams = urlparse.parse_qs(imgq) | |
419 fn = imgparams.get('fn', None) | |
420 if fn is not None: | |
421 # parse_qs puts parameters in lists | |
422 fn = fn[0] | |
423 # TODO: check valid path | |
424 # fix img@src | |
425 newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn) | |
426 img.set('src', newsrc) | |
427 # fix a@href | |
428 newlink = '%s?fn=%s'%(viewerUrl,fn) | |
429 a.set('href', newlink) | |
430 a.set('target', '_blank') | |
431 | |
432 except: | |
433 logging.warn("processFigures: strange figure!") | |
434 | |
435 | |
430 def _fixEmptyDivs(self, pagediv): | 436 def _fixEmptyDivs(self, pagediv): |
431 """fixes empty div-tags by inserting a space""" | 437 """fixes empty div-tags by inserting a space""" |
432 divs = pagediv.findall('.//div') | 438 divs = pagediv.findall('.//div') |
433 for d in divs: | 439 for d in divs: |
434 if len(d) == 0 and not d.text: | 440 if len(d) == 0 and not d.text: |