Mercurial > hg > documentViewer
diff MpiwgXmlTextServer.py @ 566:4a31608f8b0e
more new MpiwgXmlTextServer.
author | casties |
---|---|
date | Wed, 10 Oct 2012 18:09:49 +0200 |
parents | 1b483194901c |
children | 8b1e20bf300d |
line wrap: on
line diff
```diff
--- a/MpiwgXmlTextServer.py	Tue Oct 09 19:01:18 2012 +0200
+++ b/MpiwgXmlTextServer.py	Wed Oct 10 18:09:49 2012 +0200
@@ -306,49 +306,21 @@
         pagediv = body.find(".//div[@class='text']")
         logging.debug("pagediv: %s"%repr(pagediv))
 
-        # plain text mode
-        if textmode == "text":
+        # plain text or text-with-links mode
+        if textmode == "text" or textmode == "dict":
             if pagediv is not None:
-                # handle pb-tag
-                self._extractPbTag(pagediv, pageinfo)
+                self._processPbTag(pagediv, pageinfo)
+                self._processFigures(pagediv, docinfo)
+                #self._fixEmptyDivs(pagediv)
                 # get full url assuming documentViewer is parent
                 selfurl = self.getLink()
-                if punditMode:
-                    self._addPunditAttributes(pagediv, pageinfo, docinfo)
-
-                # fix empty div tags
-                self._fixEmptyDivs(pagediv)
                 # check all a-tags
                 links = pagediv.findall('.//a')
                 for l in links:
                     href = l.get('href')
-                    # handle notes FIXME!
-                    if href and href.startswith('#note-'):
-                        href = href.replace('#note-',"%s#note-"%selfurl)
-                        l.set('href', href)
-
-                return serialize(pagediv)
-
-        # text-with-links mode
-        elif textmode == "dict":
-            if pagediv is not None:
-                # handle pb-div
-                self._extractPbTag(pagediv, pageinfo)
-                viewerurl = docinfo['viewerUrl']
-                selfurl = self.getLink()
-                if punditMode:
-                    pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)
-
-                # fix empty div tags
-                self._fixEmptyDivs(pagediv)
-                # check all a-tags
-                links = pagediv.findall(".//a")
-                for l in links:
-                    href = l.get('href')
                     if href:
                         # is link with href
                         linkurl = urlparse.urlparse(href)
-                        #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
                             # is dictionary link - change href (keeping parameters)
@@ -356,10 +328,13 @@
                             # add target to open new page
                             l.set('target', '_blank')
 
-                        if href.startswith('#note-'):
-                            # note link
+                        elif href.startswith('#note-'):
+                            # note link FIXME!
                             l.set('href', href.replace('#note-',"%s#note-"%selfurl))
-
+
+                if punditMode:
+                    self._addPunditAttributes(pagediv, pageinfo, docinfo)
+
                 return serialize(pagediv)
 
         # xml mode
@@ -372,7 +347,7 @@
             if pagediv is not None:
                 return serialize(pagediv)
 
-        # gis mode
+        # gis mode FIXME!
         elif textmode == "gis":
             if pagediv is not None:
                 # fix empty div tags
@@ -393,7 +368,7 @@
 
         return None
 
-    def _extractPbTag(self, pagediv, pageinfo):
+    def _processPbTag(self, pagediv, pageinfo):
         """extracts information from pb-tag and removes it from pagediv"""
         pbdiv = pagediv.find(".//span[@class='pb']")
         if pbdiv is None:
@@ -420,6 +395,7 @@
         for d in divs:
             id = d.get('id')
             if id:
+                # TODO: check path (cf RFC2396)
                 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
                 cls = d.get('class','')
                 cls += ' pundit-content'
@@ -427,6 +403,36 @@
 
         return pagediv
 
+    def _processFigures(self, pagediv, docinfo):
+        """processes figure-tags"""
+        divs = pagediv.findall(".//span[@class='figure']")
+        scalerUrl = docinfo['digilibScalerUrl']
+        viewerUrl = docinfo['digilibViewerUrl']
+        for d in divs:
+            try:
+                a = d.find('a')
+                img = a.find('img')
+                imgsrc = img.get('src')
+                imgurl = urlparse.urlparse(imgsrc)
+                imgq = imgurl.query
+                imgparams = urlparse.parse_qs(imgq)
+                fn = imgparams.get('fn', None)
+                if fn is not None:
+                    # parse_qs puts parameters in lists
+                    fn = fn[0]
+                    # TODO: check valid path
+                    # fix img@src
+                    newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn)
+                    img.set('src', newsrc)
+                    # fix a@href
+                    newlink = '%s?fn=%s'%(viewerUrl,fn)
+                    a.set('href', newlink)
+                    a.set('target', '_blank')
+
+            except:
+                logging.warn("processFigures: strange figure!")
+
+
     def _fixEmptyDivs(self, pagediv):
         """fixes empty div-tags by inserting a space"""
         divs = pagediv.findall('.//div')
```