Mercurial > hg > documentViewer
changeset 516:7d7b639d7be7
add methods to use doc-info.xql.
read list of page numbers from doc-info.xql.
add original page numbers to thumbs.
author | casties |
---|---|
date | Mon, 05 Mar 2012 18:04:49 +0100 |
parents | 0afba3afd538 |
children | aaacdf551f6f |
files | MpdlXmlTextServer.py documentViewer.py version.txt zpt/toc_thumbs.zpt |
diffstat | 4 files changed, 99 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/MpdlXmlTextServer.py Tue Feb 28 21:22:52 2012 +0100
+++ b/MpdlXmlTextServer.py Mon Mar 05 18:04:49 2012 +0100
@@ -71,6 +71,75 @@
 
         return places
 
+    def getTextInfo(self, docinfo=None):
+        """reads document info, including page concordance, from text server"""
+        logging.debug("getDocInfo")
+        docpath = docinfo.get('textURLPath', None)
+        if docpath is None:
+            logging.error("getTextInfo: no textURLPath!")
+            return docinfo
+
+        # we need to set a result set size
+        pagesize = 10000
+        pn = 1
+        # fetch docinfo
+        pagexml = self.getServerData("doc-info.xql","document=%s&pageSize=%s&pn=%s"%(docpath,pagesize,pn))
+        dom = ET.fromstring(pagexml)
+        # all info in tag <document>
+        doc = dom.find("document")
+        if doc is None:
+            logging.error("getTextInfo: unable to find document-tag!")
+        else:
+            # go through all child elements
+            for tag in doc:
+                name = tag.tag
+                # numTextPages
+                if name == 'countPages':
+                    np = getInt(tag.text)
+                    if np > 0:
+                        docinfo['numTextPages'] = np
+
+                # numFigureEntries
+                elif name == 'countFigureEntries':
+                    docinfo['numFigureEntries'] = getInt(tag.text)
+
+                # numTocEntries
+                elif name == 'countTocEntries':
+                    # WTF: s1 = int(s)/30+1
+                    docinfo['numTocEntries'] = getInt(tag.text)
+
+                # numPlaces
+                elif name == 'countPlaces':
+                    docinfo['numPlaces'] = getInt(tag.text)
+
+                # pageNumbers
+                elif name == 'pageNumbers':
+                    # contains tags with page numbers
+                    # <pn><n>4</n><no>4</no><non/></pn>
+                    # n=scan number, no=original page no, non=normalized original page no
+                    # pageNumbers is a dict indexed by scan number
+                    pages = {}
+                    for pn in tag:
+                        page = {}
+                        n = 0
+                        for p in pn:
+                            if p.tag == 'n':
+                                n = getInt(p.text)
+                                page['n'] = n
+                            elif p.tag == 'no':
+                                page['no'] = p.text
+                            elif p.tag == 'non':
+                                page['non'] = p.text
+
+                        if n > 0:
+                            pages[n] = page
+
+                    docinfo['pageNumbers'] = pages
+                    logging.debug("got pageNumbers=%s"%repr(pages))
+
+        return docinfo
+
+
     def processPageInfo(self, dom, docinfo, pageinfo):
         """processes page info divs from dom and stores in docinfo and pageinfo"""
         # assume first second level div is pageMeta
@@ -333,9 +402,8 @@
     def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
         """returns single page from the table of contents"""
         logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
-        # check for cached result
-        if not 'resultXML' in docinfo:
-            self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
+        # get (cached) result
+        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
 
         resultxml = docinfo.get('resultXML', None)
         if not resultxml:
--- a/documentViewer.py Tue Feb 28 21:22:52 2012 +0100
+++ b/documentViewer.py Mon Mar 05 18:04:49 2012 +0100
@@ -183,6 +183,10 @@
         """returns one page of the search results"""
         return self.template.fulltextclient.getResultsPage(**args)
 
+    def getTextInfo(self, **args):
+        """returns document info from the text server"""
+        return self.template.fulltextclient.getTextInfo(**args)
+
     def getToc(self, **args):
         """loads table of contents and stores XML in docinfo"""
         return self.template.fulltextclient.getToc(**args)
@@ -479,6 +483,9 @@
             texttool = self.metadataService.getTexttoolData(dom=metaDom)
             if texttool:
                 docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
+                # document info from full text
+                if docinfo.get('textURLPath', None):
+                    docinfo = self.getTextInfo(docinfo=docinfo)
 
             # bib info
             bib = self.metadataService.getBibData(dom=metaDom)
@@ -509,7 +516,7 @@
 
         # image path
         if mode != 'texttool':
-            # override image path from texttool with url
+            # override image path from texttool with url TODO: how about mode=auto?
             docinfo['imagePath'] = url.replace('/mpiwg/online/', '', 1)
 
         # number of images from digilib
@@ -517,12 +524,19 @@
             docinfo['imageURL'] = self.digilibBaseUrl + "/servlet/Scaler?fn=" + docinfo['imagePath']
             docinfo = self.getDocinfoFromDigilib(docinfo, docinfo['imagePath'])
 
+        # check numPages
+        if docinfo.get('numPages', 0) == 0:
+            if docinfo.get('numTextPages', 0) > 0:
+                # replace with numTextPages (text-only?)
+                docinfo['numPages'] = docinfo['numTextPages']
+
         logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
         #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
         # store in session
         self.REQUEST.SESSION['docinfo'] = docinfo
         return docinfo
 
+
     def getDocinfoFromResource(self, docinfo, resource):
         """reads contents of resource element into docinfo"""
         docName = resource.get('name', None)
@@ -698,6 +712,7 @@
         pageinfo['viewLayer'] = viewLayer
         pageinfo['tocMode'] = tocMode
 
+        # TODO: unify current and pn!
         current = getInt(current)
         pageinfo['current'] = current
         pageinfo['pn'] = current
@@ -715,6 +730,7 @@
         np = int(docinfo.get('numPages', 0))
         if np == 0:
             # numPages unknown - maybe we can get it from text page
+            logging.warn("getPageInfo: numPages=0 trying getTextPage!")
             if docinfo.get('textURLPath', None):
                 # cache text page as well
                 pageinfo['textPage'] = self.getTextPage(mode=viewLayer, pn=current, docinfo=docinfo, pageinfo=pageinfo)
@@ -732,8 +748,14 @@
         pageZero = (cols == 2 and (pageFlowLtr != oddScanLeft))
         pageinfo['pageZero'] = pageZero
         pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=1, maxIdx=np)
-
+        # more page parameters
         pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
+        if docinfo.get('pageNumbers'):
+            # get original page numbers
+            pageNumber = docinfo['pageNumbers'].get(current, None)
+            if pageNumber is not None:
+                pageinfo['pageNumberOrig'] = pageNumber['no']
+                pageinfo['pageNumberOrigNorm'] = pageNumber['non']
 
         # cache search results
         pageinfo['resultPageSize'] = getInt(self.REQUEST.get('resultPageSize', 10))
--- a/version.txt Tue Feb 28 21:22:52 2012 +0100
+++ b/version.txt Mon Mar 05 18:04:49 2012 +0100
@@ -1,1 +1,1 @@
-DocumentViewer 2.0b
\ No newline at end of file
+DocumentViewer 2.1a
\ No newline at end of file
--- a/zpt/toc_thumbs.zpt Tue Feb 28 21:22:52 2012 +0100
+++ b/zpt/toc_thumbs.zpt Mon Mar 05 18:04:49 2012 +0100
@@ -11,6 +11,7 @@
     grpsize pageinfo/groupsize;
     numgroups pageinfo/numgroups;
     pageBatch pageinfo/pageBatch; pageZero pageinfo/pageZero;
+    pageNumbers docinfo/pageNumbers | nothing;
     left python:test(flowLtr,pageBatch['prevStart'],pageBatch['nextStart']);
     right python:test(flowLtr,pageBatch['nextStart'],pageBatch['prevStart']);">
   <ul class="toctype">
@@ -59,7 +60,8 @@
             <img
               tal:attributes="src python:test(docinfo['imageURL'],here.getScalerUrl(pn=idx,dw=100,dh=100,docinfo=docinfo),'images/pic');
               alt idx" /><br/>
-            <span tal:content="idx" />
+            <span title="Scan number" tal:content="idx"/>
+            <span tal:condition="python:pageNumbers and pageNumbers[idx]['no']" title="Original page number" tal:content="python:' (%s)'%(pageNumbers[idx]['no'])"/>
           </a>
         </td>
       </tr>