changeset 566:4a31608f8b0e

more new MpiwgXmlTextServer.
author casties
date Wed, 10 Oct 2012 18:09:49 +0200
parents 1b483194901c
children 8b1e20bf300d
files MpiwgXmlTextServer.py documentViewer.py
diffstat 2 files changed, 54 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py	Tue Oct 09 19:01:18 2012 +0200
+++ b/MpiwgXmlTextServer.py	Wed Oct 10 18:09:49 2012 +0200
@@ -306,49 +306,21 @@
         pagediv = body.find(".//div[@class='text']")
         logging.debug("pagediv: %s"%repr(pagediv))
         
-        # plain text mode
-        if textmode == "text":
+        # plain text or text-with-links mode
+        if textmode == "text" or textmode == "dict":
             if pagediv is not None:
-                # handle pb-tag
-                self._extractPbTag(pagediv, pageinfo)
+                self._processPbTag(pagediv, pageinfo)
+                self._processFigures(pagediv, docinfo)
+                #self._fixEmptyDivs(pagediv)
                 # get full url assuming documentViewer is parent
                 selfurl = self.getLink()
-                if punditMode:
-                    self._addPunditAttributes(pagediv, pageinfo, docinfo)
-                    
-                # fix empty div tags
-                self._fixEmptyDivs(pagediv)
                 # check all a-tags
                 links = pagediv.findall('.//a')
                 for l in links:
                     href = l.get('href')
-                    # handle notes FIXME!
-                    if href and href.startswith('#note-'):
-                        href = href.replace('#note-',"%s#note-"%selfurl)
-                        l.set('href', href)
-                        
-                return serialize(pagediv)
-            
-        # text-with-links mode
-        elif textmode == "dict":
-            if pagediv is not None:
-                # handle pb-div
-                self._extractPbTag(pagediv, pageinfo)
-                viewerurl = docinfo['viewerUrl']
-                selfurl = self.getLink()
-                if punditMode:
-                    pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)
-                    
-                # fix empty div tags
-                self._fixEmptyDivs(pagediv)   
-                # check all a-tags
-                links = pagediv.findall(".//a")
-                for l in links:
-                    href = l.get('href')
                     if href:
                         # is link with href
                         linkurl = urlparse.urlparse(href)
-                        #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
                         if linkurl.path.endswith('GetDictionaryEntries'):
                             #TODO: replace wordInfo page
                             # is dictionary link - change href (keeping parameters)
@@ -356,10 +328,13 @@
                             # add target to open new page
                             l.set('target', '_blank')
                                                           
-                        if href.startswith('#note-'):
-                            # note link
+                        elif href.startswith('#note-'):
+                            # note link FIXME!
                             l.set('href', href.replace('#note-',"%s#note-"%selfurl))
-                              
+                        
+                if punditMode:
+                    self._addPunditAttributes(pagediv, pageinfo, docinfo)
+                    
                 return serialize(pagediv)
             
         # xml mode
@@ -372,7 +347,7 @@
             if pagediv is not None:
                 return serialize(pagediv)
                   
-        # gis mode
+        # gis mode FIXME!
         elif textmode == "gis":
             if pagediv is not None:
                 # fix empty div tags
@@ -393,7 +368,7 @@
                     
         return None
 
-    def _extractPbTag(self, pagediv, pageinfo):
+    def _processPbTag(self, pagediv, pageinfo):
         """extracts information from pb-tag and removes it from pagediv"""
         pbdiv = pagediv.find(".//span[@class='pb']")
         if pbdiv is None:
@@ -420,6 +395,7 @@
         for d in divs:
             id = d.get('id')
             if id:
+                # TODO: check path (cf RFC2396)
                 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
                 cls = d.get('class','')
                 cls += ' pundit-content'
@@ -427,6 +403,36 @@
 
         return pagediv
 
+    def _processFigures(self, pagediv, docinfo):
+        """processes figure-tags"""
+        divs = pagediv.findall(".//span[@class='figure']")
+        scalerUrl = docinfo['digilibScalerUrl']
+        viewerUrl = docinfo['digilibViewerUrl']
+        for d in divs:
+            try:
+                a = d.find('a')
+                img = a.find('img')
+                imgsrc = img.get('src')
+                imgurl = urlparse.urlparse(imgsrc)
+                imgq = imgurl.query
+                imgparams = urlparse.parse_qs(imgq)
+                fn = imgparams.get('fn', None)
+                if fn is not None:
+                    # parse_qs puts parameters in lists
+                    fn = fn[0]
+                    # TODO: check valid path
+                    # fix img@src
+                    newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn)
+                    img.set('src', newsrc)
+                    # fix a@href
+                    newlink = '%s?fn=%s'%(viewerUrl,fn)
+                    a.set('href', newlink)
+                    a.set('target', '_blank')
+                    
+            except:
+                logging.warn("processFigures: strange figure!")
+                
+    
     def _fixEmptyDivs(self, pagediv):
         """fixes empty div-tags by inserting a space"""
         divs = pagediv.findall('.//div')
--- a/documentViewer.py	Tue Oct 09 19:01:18 2012 +0200
+++ b/documentViewer.py	Wed Oct 10 18:09:49 2012 +0200
@@ -199,6 +199,8 @@
             
         if digilibBaseUrl is not None:
             self.digilibBaseUrl = digilibBaseUrl
+            self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
+            self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'
             
         
     # proxy text server methods to fulltextclient
@@ -360,7 +362,7 @@
             url = docinfo.get('imageURL', None)
             
         if url is None:
-            url = "%s/servlet/Scaler?"%self.digilibBaseUrl
+            url = self.digilibScalerUrl
             if fn is None and docinfo is not None:
                 fn = docinfo.get('imagePath','')
             
@@ -543,6 +545,8 @@
         # add self url
         docinfo['viewerUrl'] = self.getDocumentViewerURL()
         docinfo['digilibBaseUrl'] = self.digilibBaseUrl
+        docinfo['digilibScalerUrl'] = self.digilibScalerUrl
+        docinfo['digilibViewerUrl'] = self.digilibViewerUrl
         # get index.meta DOM
         docUrl = None
         metaDom = None
@@ -562,7 +566,7 @@
 
         elif mode=="filepath":
             # url points to image file, index.meta optional
-            docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + url
+            docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, url)
             docinfo['numPages'] = 1
             # asssume index.meta is two path segments up
             docUrl = getParentPath(url, 2)
@@ -636,7 +640,7 @@
             # number of images from digilib
             if docinfo.get('imagePath', None):
                 imgpath = docinfo['imagePath'].replace('/mpiwg/online/', '', 1)
-                docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + imgpath
+                docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, imgpath)
                 docinfo = self.getDocinfoFromDigilib(docinfo, imgpath)
             else:
                 # imagePath still missing? try "./pageimg"
@@ -645,7 +649,7 @@
                 if docinfo.get('numPages', 0) > 0:
                     # there are pages
                     docinfo['imagePath'] = imgPath
-                    docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + docinfo['imagePath']
+                    docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, docinfo['imagePath'])
 
         # check numPages
         if docinfo.get('numPages', 0) == 0:
@@ -1044,6 +1048,8 @@
         """init document viewer"""
         self.title=title
         self.digilibBaseUrl = digilibBaseUrl
+        self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
+        self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'
         self.thumbrows = thumbrows
         self.thumbcols = thumbcols
         self.authgroups = [s.strip().lower() for s in authgroups.split(',')]