Mercurial > hg > documentViewer
changeset 568:694935574177
more new MpiwgXmlTextServer.
author | casties |
---|---|
date | Thu, 11 Oct 2012 18:27:14 +0200 |
parents | 8b1e20bf300d |
children | be21250420be |
files | MpiwgXmlTextServer.py css/docuviewer.css documentViewer.py |
diffstat | 3 files changed, 70 insertions(+), 91 deletions(-) [+] |
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py Thu Oct 11 10:21:49 2012 +0200 +++ b/MpiwgXmlTextServer.py Thu Oct 11 18:27:14 2012 +0200 @@ -93,7 +93,7 @@ logging.debug("getTextInfo mode=%s"%mode) field = '' - if mode in ['pages', 'toc', 'figures']: + if mode in ['pages', 'toc', 'figures', 'handwritten']: # translate mode to field param field = '&field=%s'%mode else: @@ -125,9 +125,12 @@ else: if mode is None: # get general info from system-tag - cp = doc.find('system/countPages') - if cp is not None: - docinfo['numTextPages'] = getInt(cp.text) + sys = doc.find('system') + if sys is not None: + docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) + docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) + docinfo['numHandwritten'] = getInt(getText(sys.find('countHandwritten'))) + docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) else: # result is in list-tag @@ -145,9 +148,9 @@ page = {} pn = getInt(i.get('n')) page['pn'] = pn - no = getInt(i.get('o')) + no = i.get('o') page['no'] = no - non = getInt(i.get('o-norm')) + non = i.get('o-norm') page['non'] = non if pn > 0: @@ -157,25 +160,21 @@ logging.debug("got pageNumbers=%s"%repr(pages)) # toc - elif name == 'toc': + elif lt == 'toc' or lt == 'figures' or lt == 'handwritten': # contains tags with table of contents/figures - # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> + # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item> tocs = [] - for te in tag: - toc = {} - for t in te: - if t.tag == 'page': - toc['pn'] = getInt(t.text) - elif t.tag == 'level': - toc['level'] = t.text - elif t.tag == 'content': - toc['content'] = t.text - elif t.tag == 'level-string': - toc['level-string'] = t.text - elif t.tag == 'real-level': - toc['real-level'] = t.text - - tocs.append(toc) + for te in l: + if te.tag == 'item': + toc = {} + toc['level-string'] = te.get('n') + toc['level'] = te.get('lv') + toc['content'] = te.text.strip() + ref = te.find('ref') + toc['pn'] = getInt(ref.text) + toc['no'] = ref.get('o') + toc['non'] = ref.get('o-norm') + tocs.append(toc) # save as full_toc/full_figures docinfo['full_%s'%mode] = tocs @@ -183,34 +182,6 @@ return docinfo - def processPageInfo(self, dom, docinfo, pageinfo): - """processes page info divs from dom and stores in docinfo and pageinfo""" - # assume first second level div is pageMeta - alldivs = dom.find("div") - - if alldivs is None or alldivs.get('class', '') != 'pageMeta': - logging.error("processPageInfo: pageMeta div not found!") - return - - for div in alldivs: - dc = div.get('class') - - # pageNumberOrig - if dc == 'pageNumberOrig': - pageinfo['pageNumberOrig'] = div.text - - # pageNumberOrigNorm - elif dc == 'pageNumberOrigNorm': - pageinfo['pageNumberOrigNorm'] = div.text - - # pageHeaderTitle - elif dc == 'pageHeaderTitle': - pageinfo['pageHeaderTitle'] = div.text - - #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo)) - return - - def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): """returns single page from fulltext""" @@ -451,6 +422,8 @@ if mode == "none": return docinfo + #TODO: put mode into query + cachedQuery = docinfo.get('cachedQuery', None) if cachedQuery is not None: # cached search result @@ -461,39 +434,34 @@ else: # different query del docinfo['resultSize'] - del docinfo['resultXML'] + del docinfo['results'] # cache query docinfo['cachedQuery'] = '%s_%s'%(mode,query) # fetch full results docpath = docinfo['textURLPath'] - params = {'document': docpath, - 'mode': 'text', - 'queryType': mode, + params = {'docId': docpath, 'query': query, - 'queryResultPageSize': 1000, - 'queryResultPN': 1, - 'characterNormalization': pageinfo.get('characterNormalization', 'reg')} - pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) - #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) - dom = ET.fromstring(pagexml) - # page content is in <div class="queryResultPage"> - pagediv = None - # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] - alldivs = dom.findall("div") - for div in alldivs: - dc = div.get('class') - # page content div - if dc == 'queryResultPage': - pagediv = div + 'pageSize': 1000, + 'page': 1, + 'outputFormat': 'html'} + pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params)) + results = [] + try: + dom = ET.fromstring(pagexml) + # page content is currently in multiple <td align=left> + alldivs = dom.findall(".//td[@align='left']") + for div in alldivs: + # TODO: can we put etree in the session? + results.append(div) + + except Exception, e: + logging.error("GetSearchResults: Error parsing search result: %s"%e) - elif dc == 'queryResultHits': - docinfo['resultSize'] = getInt(div.text) - - if pagediv is not None: - # store XML in docinfo - docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8') + # store results in docinfo + docinfo['resultSize'] = len(results) + docinfo['results'] = results return docinfo @@ -504,9 +472,9 @@ # get (cached) result self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) - resultxml = docinfo.get('resultXML', None) + resultxml = docinfo.get('results', None) if not resultxml: - logging.error("getResultPage: unable to find resultXML") + logging.error("getResultPage: unable to find results") return "Error: no result!" if size is None: @@ -561,6 +529,7 @@ return docinfo.get('full_%s'%queryType, []) + def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None): """returns single page from the table of contents""" logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size))) @@ -583,8 +552,17 @@ for toc in tocs: pageurl = self.getLink('pn', toc['pn']) tp += '<div class="tocline">' - tp += '<div class="toc name">[%s %s]</div>'%(toc['level-string'], toc['content']) - tp += '<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']) + content = toc['content'] + if content: + tp += '<div class="toc name">[%s] %s</div>'%(toc['level-string'], toc['content']) + else: + tp += '<div class="toc name">[Figure %s]</div>'%(toc['level-string']) + + if toc.get('no', None): + tp += '<div class="toc page"><a href="%s">Page: %s (%s)</a></div>'%(pageurl, toc['pn'], toc['no']) + else: + tp += '<div class="toc page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']) + tp += '</div>\n' tp += '</div>\n'
--- a/css/docuviewer.css Thu Oct 11 10:21:49 2012 +0200 +++ b/css/docuviewer.css Thu Oct 11 18:27:14 2012 +0200 @@ -146,15 +146,16 @@ background-color: white; } -div.tocbody.text .toc, -div.tocbody.figures .toc, -div.tocbody.concordance .toc { +div.tocbody.text .toc.name, +div.tocbody.figures .toc.name, +div.tocbody.concordance .toc.name { float:left; clear:right; + margin-right: 1em; } -div.tocbody.text .toc.float.right, -div.tocbody.figures .toc.float.right, -div.tocbody.concordance .toc.float.right { +div.tocbody.text .toc.page, +div.tocbody.figures .toc.page, +div.tocbody.concordance .toc.page { float:right; } @@ -272,11 +273,9 @@ div.col.main div.content.text span.pb span.o { display: none; } +/* running head */ div.col.main div.content.text span.pb span.rhead { - display: block; -} -/* running head */ -div.col.main div.content.text div.pageHeaderTitle { + display: block; text-align: center; margin-bottom: 1em; }
--- a/documentViewer.py Thu Oct 11 10:21:49 2012 +0200 +++ b/documentViewer.py Thu Oct 11 18:27:14 2012 +0200 @@ -588,9 +588,11 @@ texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True) if texttool: docinfo = self.getDocinfoFromTexttool(docinfo, texttool) - # document info (including toc) from full text + # document info from full text server if docinfo.get('textURLPath', None): docinfo = self.getTextInfo(mode=None, docinfo=docinfo) + # include list of pages TODO: do we need this always? + docinfo = self.getTextInfo(mode='pages', docinfo=docinfo) # bib info bib = self.metadataService.getBibData(dom=metaDom)