Mercurial > hg > documentViewer
changeset 565:1b483194901c
more new MpiwgXmlTextServer.
author | casties |
---|---|
date | Tue, 09 Oct 2012 19:01:18 +0200 |
parents | 31f562fa7214 |
children | 4a31608f8b0e |
files | MpiwgXmlTextServer.py css/docuviewer.css documentViewer.py |
diffstat | 3 files changed, 158 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py Mon Oct 08 20:36:00 2012 +0200 +++ b/MpiwgXmlTextServer.py Tue Oct 09 19:01:18 2012 +0200 @@ -88,11 +88,17 @@ return places - def getTextInfo(self, mode='', docinfo=None): + def getTextInfo(self, mode=None, docinfo=None): """reads document info, including page concordance, from text server""" logging.debug("getTextInfo mode=%s"%mode) - if mode not in ['toc', 'figures', '']: - mode = '' + + field = '' + if mode in ['pages', 'toc', 'figures']: + # translate mode to field param + field = '&field=%s'%mode + else: + mode = None + # check cached info if mode: # cached toc-request? @@ -100,7 +106,7 @@ return docinfo else: - # no toc-request + # cached but no toc-request? if 'numTextPages' in docinfo: return docinfo @@ -110,63 +116,69 @@ return docinfo # fetch docinfo - pagexml = self.getServerData("query/GetDocInfo","docId=%s&field=%s"%(docpath,mode)) + pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field)) dom = ET.fromstring(pagexml) - # all info in tag <document> - doc = dom.find("doc") + # all info in tag <doc> + doc = dom if doc is None: logging.error("getTextInfo: unable to find document-tag!") else: - # result is in list-tag - l = doc.find('list') - if l is not None: - lt = l.get('type') - # pageNumbers - if lt == 'pages': - # contains tags with page numbers - # <item n="14" o="2" o-norm="2" file="0014"/> - # n=scan number, o=original page no, on=normalized original page no - # pageNumbers is a dict indexed by scan number - pages = {} - for i in l: - page = {} - pn = getInt(i.get('n')) - page['pn'] = pn - no = getInt(i.get('o')) - page['no'] = no - non = getInt(i.get('o-norm')) - page['non'] = non - - if pn > 0: - pages[pn] = page + if mode is None: + # get general info from system-tag + cp = doc.find('system/countPages') + if cp is not None: + docinfo['numTextPages'] = getInt(cp.text) + + else: + # result is in list-tag + l = doc.find('list') + if l is not None: + lt = l.get('type') + # pageNumbers + if lt == 'pages': + # contains tags with page numbers + # <item n="14" o="2" o-norm="2" file="0014"/> + # n=scan number, o=original page no, on=normalized original page no + # pageNumbers is a dict indexed by scan number + pages = {} + for i in l: + page = {} + pn = getInt(i.get('n')) + page['pn'] = pn + no = getInt(i.get('o')) + page['no'] = no + non = getInt(i.get('o-norm')) + page['non'] = non + + if pn > 0: + pages[pn] = page + + docinfo['pageNumbers'] = pages + logging.debug("got pageNumbers=%s"%repr(pages)) + + # toc + elif name == 'toc': + # contains tags with table of contents/figures + # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> + tocs = [] + for te in tag: + toc = {} + for t in te: + if t.tag == 'page': + toc['pn'] = getInt(t.text) + elif t.tag == 'level': + toc['level'] = t.text + elif t.tag == 'content': + toc['content'] = t.text + elif t.tag == 'level-string': + toc['level-string'] = t.text + elif t.tag == 'real-level': + toc['real-level'] = t.text + + tocs.append(toc) - docinfo['numTextPages'] = len(pages) - docinfo['pageNumbers'] = pages - logging.debug("got pageNumbers=%s"%repr(pages)) - - # toc - elif name == 'toc': - # contains tags with table of contents/figures - # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry> - tocs = [] - for te in tag: - toc = {} - for t in te: - if t.tag == 'page': - toc['pn'] = getInt(t.text) - elif t.tag == 'level': - toc['level'] = t.text - elif t.tag == 'content': - toc['content'] = t.text - elif t.tag == 'level-string': - toc['level-string'] = t.text - elif t.tag == 'real-level': - toc['real-level'] = t.text - - tocs.append(toc) - - # save as full_toc/full_figures - docinfo['full_%s'%mode] = tocs + # save as full_toc/full_figures + docinfo['full_%s'%mode] = tocs return docinfo @@ -220,8 +232,14 @@ selfurl = docinfo['viewerUrl'] textParams = {'docId': docpath, 'page': pn} + if 'characterNormalization' in pageinfo: - textParams['normalization'] = pageinfo['characterNormalization'] + cn = pageinfo['characterNormalization'] + # TODO: change values in form + if cn == 'regPlusNorm': + cn = 'norm' + + textParams['normalization'] = cn if not mode: # default is dict @@ -272,68 +290,61 @@ # fetch the page pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) - dom = ET.fromstring(pagexml) - # extract additional info - #self.processPageInfo(dom, docinfo, pageinfo) - # page content is in <div class="pageContent"> + try: + dom = ET.fromstring(pagexml) + except Exception, e: + logging.error("Error parsing page: %s"%e) + return None + pagediv = None body = dom.find('.//body') if body is None: logging.error("getTextPage: no body!") return None - # currently there's lots of divs... - textspan = body.find('span/span') - divs = textspan.findall('div') - logging.debug("textdivs: %s"%repr(divs)) - pagediv = divs[0] - logging.debug("pagediv: %s"%serialize(pagediv)) + # the text is in div@class=text + pagediv = body.find(".//div[@class='text']") + logging.debug("pagediv: %s"%repr(pagediv)) # plain text mode if textmode == "text": - # get full url assuming documentViewer is parent - selfurl = self.getLink() if pagediv is not None: + # handle pb-tag + self._extractPbTag(pagediv, pageinfo) + # get full url assuming documentViewer is parent + selfurl = self.getLink() if punditMode: - pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) + self._addPunditAttributes(pagediv, pageinfo, docinfo) # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall('.//a') for l in links: href = l.get('href') + # handle notes FIXME! if href and href.startswith('#note-'): href = href.replace('#note-',"%s#note-"%selfurl) l.set('href', href) - + return serialize(pagediv) # text-with-links mode elif textmode == "dict": if pagediv is not None: + # handle pb-div + self._extractPbTag(pagediv, pageinfo) viewerurl = docinfo['viewerUrl'] selfurl = self.getLink() if punditMode: pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall(".//a") for l in links: href = l.get('href') - if href: # is link with href linkurl = urlparse.urlparse(href) @@ -365,12 +376,7 @@ elif textmode == "gis": if pagediv is not None: # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall(".//a") # add our URL as backlink @@ -386,8 +392,25 @@ return serialize(pagediv) return None + + def _extractPbTag(self, pagediv, pageinfo): + """extracts information from pb-tag and removes it from pagediv""" + pbdiv = pagediv.find(".//span[@class='pb']") + if pbdiv is None: + logging.warning("getTextPage: no pb-span!") + return pagediv + + # extract running head + rh = pbdiv.find(".//span[@class='rhead']") + if rh is not None: + pageinfo['pageHeaderTitle'] = getText(rh) + + # remove pb-div from parent + ppdiv = pagediv.find(".//span[@class='pb']/..") + ppdiv.remove(pbdiv) + return pagediv - def addPunditAttributes(self, pagediv, pageinfo, docinfo): + def _addPunditAttributes(self, pagediv, pageinfo, docinfo): """add about attributes for pundit annotation tool""" textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) pn = pageinfo.get('pn', '1') @@ -404,6 +427,17 @@ return pagediv + def _fixEmptyDivs(self, pagediv): + """fixes empty div-tags by inserting a space""" + divs = pagediv.findall('.//div') + for d in divs: + if len(d) == 0 and not d.text: + # make empty divs non-empty + d.text = ' ' + + return pagediv + + def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): """loads list of search results and stores XML in docinfo"""
--- a/css/docuviewer.css Mon Oct 08 20:36:00 2012 +0200 +++ b/css/docuviewer.css Tue Oct 09 19:01:18 2012 +0200 @@ -256,6 +256,38 @@ font-family: Verdana,Arial,sans-serif; font-size: 12px; } +div.col.main div.content.text .bf { + font-weight: bold; +} +div.col.main div.content.text .head { + margin-top: 0.5em; + margin-bottom: 0.25em; +} +/* running head */ +div.col.main div.content.text div.pageHeaderTitle { + text-align: center; + margin-bottom: 1em; +} +/* figures */ +div.col.main div.content.text span.figure { + display: block; + width: 200px; + margin-top: 0.5em; + margin-bottom: 0.5em; + padding: 5px; + border: 1px dashed silver; + /* float: right; */ + /* text-align: center; */ +} +div.col.main div.content.text span.figure>a, +div.col.main div.content.text span.figure span.figureNumber, +div.col.main div.content.text span.figure span.caption, +div.col.main div.content.text span.figure span.description { + display:block; +} +div.col.main div.content.text span.figure span.figureNum { + display: none; +} /* * search results */
--- a/documentViewer.py Mon Oct 08 20:36:00 2012 +0200 +++ b/documentViewer.py Tue Oct 09 19:01:18 2012 +0200 @@ -586,7 +586,7 @@ docinfo = self.getDocinfoFromTexttool(docinfo, texttool) # document info (including toc) from full text if docinfo.get('textURLPath', None): - docinfo = self.getTextInfo(mode='pages', docinfo=docinfo) + docinfo = self.getTextInfo(mode=None, docinfo=docinfo) # bib info bib = self.metadataService.getBibData(dom=metaDom)