changeset 516:7d7b639d7be7

add methods to use doc-info.xql. read list of page numbers from doc-info.xql. add original page numbers to thumbs.
author casties
date Mon, 05 Mar 2012 18:04:49 +0100
parents 0afba3afd538
children aaacdf551f6f
files MpdlXmlTextServer.py documentViewer.py version.txt zpt/toc_thumbs.zpt
diffstat 4 files changed, 99 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/MpdlXmlTextServer.py	Tue Feb 28 21:22:52 2012 +0100
+++ b/MpdlXmlTextServer.py	Mon Mar 05 18:04:49 2012 +0100
@@ -71,6 +71,75 @@
         return places
     
           
+    def getTextInfo(self, docinfo=None):
+        """reads document info, including page concordance, from text server.
+
+        Queries doc-info.xql for docinfo['textURLPath'] and stores numTextPages
+        (only if > 0), numFigureEntries, numTocEntries, numPlaces and pageNumbers
+        in docinfo. Returns docinfo; it is returned without these entries if
+        textURLPath or the document-tag is missing."""
+        logging.debug("getTextInfo")
+        if docinfo is None:
+            # called without docinfo: fall through to the "no textURLPath" error
+            docinfo = {}
+        docpath = docinfo.get('textURLPath', None)
+        if docpath is None:
+            logging.error("getTextInfo: no textURLPath!")
+            return docinfo
+
+        # we need to set a result set size
+        pagesize = 10000
+        pn = 1
+        # fetch docinfo
+        pagexml = self.getServerData("doc-info.xql","document=%s&pageSize=%s&pn=%s"%(docpath,pagesize,pn))
+        dom = ET.fromstring(pagexml)
+        # all info in tag <document>
+        doc = dom.find("document")
+        if doc is None:
+            logging.error("getTextInfo: unable to find document-tag!")
+            return docinfo
+
+        # go through all child elements
+        for tag in doc:
+            name = tag.tag
+            if name == 'countPages':
+                # only a positive count is stored as numTextPages
+                np = getInt(tag.text)
+                if np > 0:
+                    docinfo['numTextPages'] = np
+
+            elif name == 'countFigureEntries':
+                docinfo['numFigureEntries'] = getInt(tag.text)
+
+            elif name == 'countTocEntries':
+                # WTF: s1 = int(s)/30+1
+                docinfo['numTocEntries'] = getInt(tag.text)
+
+            elif name == 'countPlaces':
+                docinfo['numPlaces'] = getInt(tag.text)
+
+            elif name == 'pageNumbers':
+                # contains tags with page numbers
+                # <pn><n>4</n><no>4</no><non/></pn>
+                # n=scan number, no=original page no, non=normalized original page no
+                # pageNumbers is a dict indexed by scan number
+                pages = {}
+                for pageTag in tag:
+                    # every entry gets all three keys so readers can index 'no'/'non'
+                    page = {'n': 0, 'no': None, 'non': None}
+                    for p in pageTag:
+                        if p.tag == 'n':
+                            page['n'] = getInt(p.text)
+                        elif p.tag in ('no', 'non'):
+                            page[p.tag] = p.text
+                    if page['n'] > 0:
+                        pages[page['n']] = page
+                docinfo['pageNumbers'] = pages
+                logging.debug("got pageNumbers=%s"%repr(pages))
+
+        return docinfo
+        
+          
     def processPageInfo(self, dom, docinfo, pageinfo):
         """processes page info divs from dom and stores in docinfo and pageinfo"""
         # assume first second level div is pageMeta
@@ -333,9 +402,8 @@
     def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
         """returns single page from the table of contents"""
         logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
-        # check for cached result
-        if not 'resultXML' in docinfo:
-            self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
+        # get (cached) result
+        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
             
         resultxml = docinfo.get('resultXML', None)
         if not resultxml:
--- a/documentViewer.py	Tue Feb 28 21:22:52 2012 +0100
+++ b/documentViewer.py	Mon Mar 05 18:04:49 2012 +0100
@@ -183,6 +183,10 @@
         """returns one page of the search results"""
         return self.template.fulltextclient.getResultsPage(**args)
 
+    def getTextInfo(self, **args):
+        """returns the result of fulltextclient.getTextInfo (document info from the text server); keyword args are passed through unchanged"""
+        return self.template.fulltextclient.getTextInfo(**args)
+
     def getToc(self, **args):
         """loads table of contents and stores XML in docinfo"""
         return self.template.fulltextclient.getToc(**args)
@@ -479,6 +483,9 @@
             texttool = self.metadataService.getTexttoolData(dom=metaDom)
             if texttool:
                 docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
+                # document info from full text
+                if docinfo.get('textURLPath', None):
+                    docinfo = self.getTextInfo(docinfo=docinfo)
             
             # bib info
             bib = self.metadataService.getBibData(dom=metaDom)
@@ -509,7 +516,7 @@
 
         # image path
         if mode != 'texttool':
-            # override image path from texttool with url
+            # override image path from texttool with url TODO: how about mode=auto?
             docinfo['imagePath'] = url.replace('/mpiwg/online/', '', 1)
 
         # number of images from digilib
@@ -517,12 +524,19 @@
             docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + docinfo['imagePath']
             docinfo = self.getDocinfoFromDigilib(docinfo, docinfo['imagePath'])
 
+        # check numPages
+        if docinfo.get('numPages', 0) == 0:
+            if docinfo.get('numTextPages', 0) > 0:
+                # replace with numTextPages (text-only?)
+                docinfo['numPages'] = docinfo['numTextPages']
+
         logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
         #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
         # store in session
         self.REQUEST.SESSION['docinfo'] = docinfo
         return docinfo
 
+
     def getDocinfoFromResource(self, docinfo, resource):
         """reads contents of resource element into docinfo"""
         docName = resource.get('name', None)
@@ -698,6 +712,7 @@
         pageinfo['viewLayer'] = viewLayer
         pageinfo['tocMode'] = tocMode
 
+        # TODO: unify current and pn!
         current = getInt(current)
         pageinfo['current'] = current
         pageinfo['pn'] = current
@@ -715,6 +730,7 @@
         np = int(docinfo.get('numPages', 0))
         if np == 0:
             # numPages unknown - maybe we can get it from text page
+            logging.warn("getPageInfo: numPages=0 trying getTextPage!")
             if docinfo.get('textURLPath', None):
                 # cache text page as well
                 pageinfo['textPage'] = self.getTextPage(mode=viewLayer, pn=current, docinfo=docinfo, pageinfo=pageinfo)
@@ -732,8 +748,14 @@
         pageZero = (cols == 2 and (pageFlowLtr != oddScanLeft))
         pageinfo['pageZero'] = pageZero
         pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=1, maxIdx=np)
-                
+        # more page parameters
         pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
+        if docinfo.get('pageNumbers'):
+            # get original page numbers
+            pageNumber = docinfo['pageNumbers'].get(current, None)
+            if pageNumber is not None:
+                pageinfo['pageNumberOrig'] = pageNumber['no']
+                pageinfo['pageNumberOrigNorm'] = pageNumber['non']
         
         # cache search results
         pageinfo['resultPageSize'] = getInt(self.REQUEST.get('resultPageSize', 10))
--- a/version.txt	Tue Feb 28 21:22:52 2012 +0100
+++ b/version.txt	Mon Mar 05 18:04:49 2012 +0100
@@ -1,1 +1,1 @@
-DocumentViewer 2.0b
\ No newline at end of file
+DocumentViewer 2.1a
\ No newline at end of file
--- a/zpt/toc_thumbs.zpt	Tue Feb 28 21:22:52 2012 +0100
+++ b/zpt/toc_thumbs.zpt	Mon Mar 05 18:04:49 2012 +0100
@@ -11,6 +11,7 @@
                 grpsize pageinfo/groupsize;
                 numgroups pageinfo/numgroups;
                 pageBatch pageinfo/pageBatch; pageZero pageinfo/pageZero;
+                pageNumbers docinfo/pageNumbers | nothing;
                 left python:test(flowLtr,pageBatch['prevStart'],pageBatch['nextStart']);
                 right python:test(flowLtr,pageBatch['nextStart'],pageBatch['prevStart']);">
     <ul class="toctype">
@@ -59,7 +60,8 @@
               <img
                 tal:attributes="src python:test(docinfo['imageURL'],here.getScalerUrl(pn=idx,dw=100,dh=100,docinfo=docinfo),'images/pic');
                                 alt idx" /><br/>
-              <span tal:content="idx" />
+              <span title="Scan number" tal:content="idx"/>
+              <span tal:condition="python:pageNumbers and pageNumbers.get(idx) and pageNumbers[idx].get('no')" title="Original page number" tal:content="python:' (%s)'%(pageNumbers[idx]['no'])"/>
             </a>
           </td>
         </tr>