Context Navigation

← Previous Changeset
Next Changeset →

Changeset 516:7d7b639d7be7 in documentViewer

Timestamp:

Mar 5, 2012, 5:04:49 PM (13 years ago)

Author:

casties

Branch:

default

Message:

add methods to use doc-info.xql.
read list of page numbers from doc-info.xql.
add original page numbers to thumbs.

Files:

: 4 edited

MpdlXmlTextServer.py (modified) (2 diffs)
documentViewer.py (modified) (8 diffs)
version.txt (modified) (1 diff)
zpt/toc_thumbs.zpt (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

MpdlXmlTextServer.py

-                      r513
+                      r516
         return places
+    def getTextInfo(self, docinfo=None):
+        """reads document info, including page concordance, from text server"""
+        logging.debug("getDocInfo")
+        docpath = docinfo.get('textURLPath', None)
+        if docpath is None:
+            logging.error("getTextInfo: no textURLPath!")
+            return docinfo
+        # we need to set a result set size
+        pagesize = 10000
+        pn = 1
+        # fetch docinfo
+        pagexml = self.getServerData("doc-info.xql","document=%s&pageSize=%s&pn=%s"%(docpath,pagesize,pn))
+        dom = ET.fromstring(pagexml)
+        # all info in tag <document>
+        doc = dom.find("document")
+        if doc is None:
+            logging.error("getTextInfo: unable to find document-tag!")
+        else:
+            # go through all child elements
+            for tag in doc:
+                name = tag.tag
+                # numTextPages
+                if name == 'countPages':
+                    np = getInt(tag.text)
+                    if np > 0:
+                        docinfo['numTextPages'] = np
+                # numFigureEntries
+                elif name == 'countFigureEntries':
+                    docinfo['numFigureEntries'] = getInt(tag.text)
+                # numTocEntries
+                elif name == 'countTocEntries':
+                    # WTF: s1 = int(s)/30+1
+                    docinfo['numTocEntries'] = getInt(tag.text)
+                # numPlaces
+                elif name == 'countPlaces':
+                    docinfo['numPlaces'] = getInt(tag.text)
+                # pageNumbers
+                elif name == 'pageNumbers':
+                    # contains tags with page numbers
+                    # <pn><n>4</n><no>4</no><non/></pn>
+                    # n=scan number, no=original page no, non=normalized original page no
+                    # pageNumbers is a dict indexed by scan number
+                    pages = {}
+                    for pn in tag:
+                        page = {}
+                        n = 0
+                        for p in pn:
+                            if p.tag == 'n':
+                                n = getInt(p.text)
+                                page['n'] = n
+                            elif p.tag == 'no':
+                                page['no'] = p.text
+                            elif p.tag == 'non':
+                                page['non'] = p.text
+                        if n > 0:
+                            pages[n] = page
+                    docinfo['pageNumbers'] = pages
+                    logging.debug("got pageNumbers=%s"%repr(pages))
+        return docinfo
     def processPageInfo(self, dom, docinfo, pageinfo):
 …
         """returns single page from the table of contents"""
         logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
+        # check for cached result
+        if not 'resultXML' in docinfo:
+            self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
+        # get (cached) result
+        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
         resultxml = docinfo.get('resultXML', None)

documentViewer.py

-                      r514
+                      r516
         return self.template.fulltextclient.getResultsPage(**args)
+    def getTextInfo(self, **args):
+        """returns document info from the text server"""
+        return self.template.fulltextclient.getTextInfo(**args)
     def getToc(self, **args):
         """loads table of contents and stores XML in docinfo"""
 …
             if texttool:
                 docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
+                # document info from full text
+                if docinfo.get('textURLPath', None):
+                    docinfo = self.getTextInfo(docinfo=docinfo)
             # bib info
 …
         # image path
         if mode != 'texttool':
             # override image path from texttool with url
+            # override image path from texttool with url TODO: how about mode=auto?
             docinfo['imagePath'] = url.replace('/mpiwg/online/', '', 1)
 …
             docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + docinfo['imagePath']
             docinfo = self.getDocinfoFromDigilib(docinfo, docinfo['imagePath'])
+        # check numPages
+        if docinfo.get('numPages', 0) == 0:
+            if docinfo.get('numTextPages', 0) > 0:
+                # replace with numTextPages (text-only?)
+                docinfo['numPages'] = docinfo['numTextPages']
         logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
 …
         self.REQUEST.SESSION['docinfo'] = docinfo
         return docinfo
     def getDocinfoFromResource(self, docinfo, resource):
 …
         pageinfo['tocMode'] = tocMode
+        # TODO: unify current and pn!
         current = getInt(current)
         pageinfo['current'] = current
 …
         if np == 0:
             # numPages unknown - maybe we can get it from text page
+            logging.warn("getPageInfo: numPages=0 trying getTextPage!")
             if docinfo.get('textURLPath', None):
                 # cache text page as well
 …
         pageinfo['pageZero'] = pageZero
         pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=1, maxIdx=np)
+        # more page parameters
         pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
+        if docinfo.get('pageNumbers'):
+            # get original page numbers
+            pageNumber = docinfo['pageNumbers'].get(current, None)
+            if pageNumber is not None:
+                pageinfo['pageNumberOrig'] = pageNumber['no']
+                pageinfo['pageNumberOrigNorm'] = pageNumber['non']
         # cache search results

version.txt

r514	r516
1		DocumentViewer 2.0b
	1	DocumentViewer 2.1a

zpt/toc_thumbs.zpt

-                      r489
+                      r516
                 numgroups pageinfo/numgroups;
                 pageBatch pageinfo/pageBatch; pageZero pageinfo/pageZero;
+                pageNumbers docinfo/pageNumbers | nothing;
                 left python:test(flowLtr,pageBatch['prevStart'],pageBatch['nextStart']);
                 right python:test(flowLtr,pageBatch['nextStart'],pageBatch['prevStart']);">
 …
                 tal:attributes="src python:test(docinfo['imageURL'],here.getScalerUrl(pn=idx,dw=100,dh=100,docinfo=docinfo),'images/pic');
                                 alt idx" /><br/>
+              <span tal:content="idx" />
+              <span title="Scan number" tal:content="idx"/>
+              <span tal:condition="python:pageNumbers and pageNumbers[idx]['no']" title="Original page number" tal:content="python:' (%s)'%(pageNumbers[idx]['no'])"/>
             </a>
           </td>

Note: See TracChangeset for help on using the changeset viewer.