# HG changeset patch # User casties # Date 1349885389 -7200 # Node ID 4a31608f8b0e102ca49a189a185dec984fbc54e4 # Parent 1b483194901c39a53db671a31c9f5ab83325da9b more new MpiwgXmlTextServer. diff -r 1b483194901c -r 4a31608f8b0e MpiwgXmlTextServer.py --- a/MpiwgXmlTextServer.py Tue Oct 09 19:01:18 2012 +0200 +++ b/MpiwgXmlTextServer.py Wed Oct 10 18:09:49 2012 +0200 @@ -306,49 +306,21 @@ pagediv = body.find(".//div[@class='text']") logging.debug("pagediv: %s"%repr(pagediv)) - # plain text mode - if textmode == "text": + # plain text or text-with-links mode + if textmode == "text" or textmode == "dict": if pagediv is not None: - # handle pb-tag - self._extractPbTag(pagediv, pageinfo) + self._processPbTag(pagediv, pageinfo) + self._processFigures(pagediv, docinfo) + #self._fixEmptyDivs(pagediv) # get full url assuming documentViewer is parent selfurl = self.getLink() - if punditMode: - self._addPunditAttributes(pagediv, pageinfo, docinfo) - - # fix empty div tags - self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall('.//a') for l in links: href = l.get('href') - # handle notes FIXME! - if href and href.startswith('#note-'): - href = href.replace('#note-',"%s#note-"%selfurl) - l.set('href', href) - - return serialize(pagediv) - - # text-with-links mode - elif textmode == "dict": - if pagediv is not None: - # handle pb-div - self._extractPbTag(pagediv, pageinfo) - viewerurl = docinfo['viewerUrl'] - selfurl = self.getLink() - if punditMode: - pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) - - # fix empty div tags - self._fixEmptyDivs(pagediv) - # check all a-tags - links = pagediv.findall(".//a") - for l in links: - href = l.get('href') if href: # is link with href linkurl = urlparse.urlparse(href) - #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) if linkurl.path.endswith('GetDictionaryEntries'): #TODO: replace wordInfo page # is dictionary link - change href (keeping parameters) @@ -356,10 +328,13 @@ # add target to open new page l.set('target', '_blank') - if href.startswith('#note-'): - # note link + elif href.startswith('#note-'): + # note link FIXME! l.set('href', href.replace('#note-',"%s#note-"%selfurl)) - + + if punditMode: + self._addPunditAttributes(pagediv, pageinfo, docinfo) + return serialize(pagediv) # xml mode @@ -372,7 +347,7 @@ if pagediv is not None: return serialize(pagediv) - # gis mode + # gis mode FIXME! elif textmode == "gis": if pagediv is not None: # fix empty div tags @@ -393,7 +368,7 @@ return None - def _extractPbTag(self, pagediv, pageinfo): + def _processPbTag(self, pagediv, pageinfo): """extracts information from pb-tag and removes it from pagediv""" pbdiv = pagediv.find(".//span[@class='pb']") if pbdiv is None: @@ -420,6 +395,7 @@ for d in divs: id = d.get('id') if id: + # TODO: check path (cf RFC2396) d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id)) cls = d.get('class','') cls += ' pundit-content' @@ -427,6 +403,36 @@ return pagediv + def _processFigures(self, pagediv, docinfo): + """processes figure-tags""" + divs = pagediv.findall(".//span[@class='figure']") + scalerUrl = docinfo['digilibScalerUrl'] + viewerUrl = docinfo['digilibViewerUrl'] + for d in divs: + try: + a = d.find('a') + img = a.find('img') + imgsrc = img.get('src') + imgurl = urlparse.urlparse(imgsrc) + imgq = imgurl.query + imgparams = urlparse.parse_qs(imgq) + fn = imgparams.get('fn', None) + if fn is not None: + # parse_qs puts parameters in lists + fn = fn[0] + # TODO: check valid path + # fix img@src + newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn) + img.set('src', newsrc) + # fix a@href + newlink = '%s?fn=%s'%(viewerUrl,fn) + a.set('href', newlink) + a.set('target', '_blank') + + except: + logging.warn("processFigures: strange figure!") + + def _fixEmptyDivs(self, pagediv): """fixes empty div-tags by inserting a space""" divs = pagediv.findall('.//div') diff -r 1b483194901c -r 4a31608f8b0e documentViewer.py --- a/documentViewer.py Tue Oct 09 19:01:18 2012 +0200 +++ b/documentViewer.py Wed Oct 10 18:09:49 2012 +0200 @@ -199,6 +199,8 @@ if digilibBaseUrl is not None: self.digilibBaseUrl = digilibBaseUrl + self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler' + self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html' # proxy text server methods to fulltextclient @@ -360,7 +362,7 @@ url = docinfo.get('imageURL', None) if url is None: - url = "%s/servlet/Scaler?"%self.digilibBaseUrl + url = self.digilibScalerUrl if fn is None and docinfo is not None: fn = docinfo.get('imagePath','') @@ -543,6 +545,8 @@ # add self url docinfo['viewerUrl'] = self.getDocumentViewerURL() docinfo['digilibBaseUrl'] = self.digilibBaseUrl + docinfo['digilibScalerUrl'] = self.digilibScalerUrl + docinfo['digilibViewerUrl'] = self.digilibViewerUrl # get index.meta DOM docUrl = None metaDom = None @@ -562,7 +566,7 @@ elif mode=="filepath": # url points to image file, index.meta optional - docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + url + docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, url) docinfo['numPages'] = 1 # asssume index.meta is two path segments up docUrl = getParentPath(url, 2) @@ -636,7 +640,7 @@ # number of images from digilib if docinfo.get('imagePath', None): imgpath = docinfo['imagePath'].replace('/mpiwg/online/', '', 1) - docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imgpath + docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, imgpath) docinfo = self.getDocinfoFromDigilib(docinfo, imgpath) else: # imagePath still missing? try "./pageimg" @@ -645,7 +649,7 @@ if docinfo.get('numPages', 0) > 0: # there are pages docinfo['imagePath'] = imgPath - docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + docinfo['imagePath'] + docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, docinfo['imagePath']) # check numPages if docinfo.get('numPages', 0) == 0: @@ -1044,6 +1048,8 @@ """init document viewer""" self.title=title self.digilibBaseUrl = digilibBaseUrl + self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler' + self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html' self.thumbrows = thumbrows self.thumbcols = thumbcols self.authgroups = [s.strip().lower() for s in authgroups.split(',')]