# HG changeset patch # User casties # Date 1349802078 -7200 # Node ID 1b483194901c39a53db671a31c9f5ab83325da9b # Parent 31f562fa72142266ca906c733e2572b068b60ade more new MpiwgXmlTextServer. diff -r 31f562fa7214 -r 1b483194901c MpiwgXmlTextServer.py --- a/MpiwgXmlTextServer.py Mon Oct 08 20:36:00 2012 +0200 +++ b/MpiwgXmlTextServer.py Tue Oct 09 19:01:18 2012 +0200 @@ -88,11 +88,17 @@ return places - def getTextInfo(self, mode='', docinfo=None): + def getTextInfo(self, mode=None, docinfo=None): """reads document info, including page concordance, from text server""" logging.debug("getTextInfo mode=%s"%mode) - if mode not in ['toc', 'figures', '']: - mode = '' + + field = '' + if mode in ['pages', 'toc', 'figures']: + # translate mode to field param + field = '&field=%s'%mode + else: + mode = None + # check cached info if mode: # cached toc-request? @@ -100,7 +106,7 @@ return docinfo else: - # no toc-request + # cached but no toc-request? if 'numTextPages' in docinfo: return docinfo @@ -110,63 +116,69 @@ return docinfo # fetch docinfo - pagexml = self.getServerData("query/GetDocInfo","docId=%s&field=%s"%(docpath,mode)) + pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field)) dom = ET.fromstring(pagexml) - # all info in tag - doc = dom.find("doc") + # all info in tag + doc = dom if doc is None: logging.error("getTextInfo: unable to find document-tag!") else: - # result is in list-tag - l = doc.find('list') - if l is not None: - lt = l.get('type') - # pageNumbers - if lt == 'pages': - # contains tags with page numbers - # - # n=scan number, o=original page no, on=normalized original page no - # pageNumbers is a dict indexed by scan number - pages = {} - for i in l: - page = {} - pn = getInt(i.get('n')) - page['pn'] = pn - no = getInt(i.get('o')) - page['no'] = no - non = getInt(i.get('o-norm')) - page['non'] = non - - if pn > 0: - pages[pn] = page + if mode is None: + # get general info from system-tag + cp = doc.find('system/countPages') + if cp is not None: + docinfo['numTextPages'] = getInt(cp.text) + + else: + # result is in list-tag + l = doc.find('list') + if l is not None: + lt = l.get('type') + # pageNumbers + if lt == 'pages': + # contains tags with page numbers + # + # n=scan number, o=original page no, on=normalized original page no + # pageNumbers is a dict indexed by scan number + pages = {} + for i in l: + page = {} + pn = getInt(i.get('n')) + page['pn'] = pn + no = getInt(i.get('o')) + page['no'] = no + non = getInt(i.get('o-norm')) + page['non'] = non + + if pn > 0: + pages[pn] = page + + docinfo['pageNumbers'] = pages + logging.debug("got pageNumbers=%s"%repr(pages)) + + # toc + elif name == 'toc': + # contains tags with table of contents/figures + # 133Chapter I1.1 + tocs = [] + for te in tag: + toc = {} + for t in te: + if t.tag == 'page': + toc['pn'] = getInt(t.text) + elif t.tag == 'level': + toc['level'] = t.text + elif t.tag == 'content': + toc['content'] = t.text + elif t.tag == 'level-string': + toc['level-string'] = t.text + elif t.tag == 'real-level': + toc['real-level'] = t.text + + tocs.append(toc) - docinfo['numTextPages'] = len(pages) - docinfo['pageNumbers'] = pages - logging.debug("got pageNumbers=%s"%repr(pages)) - - # toc - elif name == 'toc': - # contains tags with table of contents/figures - # 133Chapter I1.1 - tocs = [] - for te in tag: - toc = {} - for t in te: - if t.tag == 'page': - toc['pn'] = getInt(t.text) - elif t.tag == 'level': - toc['level'] = t.text - elif t.tag == 'content': - toc['content'] = t.text - elif t.tag == 'level-string': - toc['level-string'] = t.text - elif t.tag == 'real-level': - toc['real-level'] = t.text - - tocs.append(toc) - - # save as full_toc/full_figures - docinfo['full_%s'%mode] = tocs + # save as full_toc/full_figures + docinfo['full_%s'%mode] = tocs return docinfo @@ -220,8 +232,14 @@ selfurl = docinfo['viewerUrl'] textParams = {'docId': docpath, 'page': pn} + if 'characterNormalization' in pageinfo: - textParams['normalization'] = pageinfo['characterNormalization'] + cn = pageinfo['characterNormalization'] + # TODO: change values in form + if cn == 'regPlusNorm': + cn = 'norm' + + textParams['normalization'] = cn if not mode: # default is dict @@ -272,68 +290,61 @@ # fetch the page pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams)) - dom = ET.fromstring(pagexml) - # extract additional info - #self.processPageInfo(dom, docinfo, pageinfo) - # page content is in
+ try: + dom = ET.fromstring(pagexml) + except Exception, e: + logging.error("Error parsing page: %s"%e) + return None + pagediv = None body = dom.find('.//body') if body is None: logging.error("getTextPage: no body!") return None - # currently there's lots of divs... - textspan = body.find('span/span') - divs = textspan.findall('div') - logging.debug("textdivs: %s"%repr(divs)) - pagediv = divs[0] - logging.debug("pagediv: %s"%serialize(pagediv)) + # the text is in div@class=text + pagediv = body.find(".//div[@class='text']") + logging.debug("pagediv: %s"%repr(pagediv)) # plain text mode if textmode == "text": - # get full url assuming documentViewer is parent - selfurl = self.getLink() if pagediv is not None: + # handle pb-tag + self._extractPbTag(pagediv, pageinfo) + # get full url assuming documentViewer is parent + selfurl = self.getLink() if punditMode: - pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) + self._addPunditAttributes(pagediv, pageinfo, docinfo) # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall('.//a') for l in links: href = l.get('href') + # handle notes FIXME! if href and href.startswith('#note-'): href = href.replace('#note-',"%s#note-"%selfurl) l.set('href', href) - + return serialize(pagediv) # text-with-links mode elif textmode == "dict": if pagediv is not None: + # handle pb-div + self._extractPbTag(pagediv, pageinfo) viewerurl = docinfo['viewerUrl'] selfurl = self.getLink() if punditMode: pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo) # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall(".//a") for l in links: href = l.get('href') - if href: # is link with href linkurl = urlparse.urlparse(href) @@ -365,12 +376,7 @@ elif textmode == "gis": if pagediv is not None: # fix empty div tags - divs = pagediv.findall('.//div') - for d in divs: - if len(d) == 0 and not d.text: - # make empty divs non-empty - d.text = ' ' - + self._fixEmptyDivs(pagediv) # check all a-tags links = pagediv.findall(".//a") # add our URL as backlink @@ -386,8 +392,25 @@ return serialize(pagediv) return None + + def _extractPbTag(self, pagediv, pageinfo): + """extracts information from pb-tag and removes it from pagediv""" + pbdiv = pagediv.find(".//span[@class='pb']") + if pbdiv is None: + logging.warning("getTextPage: no pb-span!") + return pagediv + + # extract running head + rh = pbdiv.find(".//span[@class='rhead']") + if rh is not None: + pageinfo['pageHeaderTitle'] = getText(rh) + + # remove pb-div from parent + ppdiv = pagediv.find(".//span[@class='pb']/..") + ppdiv.remove(pbdiv) + return pagediv - def addPunditAttributes(self, pagediv, pageinfo, docinfo): + def _addPunditAttributes(self, pagediv, pageinfo, docinfo): """add about attributes for pundit annotation tool""" textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) pn = pageinfo.get('pn', '1') @@ -404,6 +427,17 @@ return pagediv + def _fixEmptyDivs(self, pagediv): + """fixes empty div-tags by inserting a space""" + divs = pagediv.findall('.//div') + for d in divs: + if len(d) == 0 and not d.text: + # make empty divs non-empty + d.text = ' ' + + return pagediv + + def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): """loads list of search results and stores XML in docinfo""" diff -r 31f562fa7214 -r 1b483194901c css/docuviewer.css --- a/css/docuviewer.css Mon Oct 08 20:36:00 2012 +0200 +++ b/css/docuviewer.css Tue Oct 09 19:01:18 2012 +0200 @@ -256,6 +256,38 @@ font-family: Verdana,Arial,sans-serif; font-size: 12px; } +div.col.main div.content.text .bf { + font-weight: bold; +} +div.col.main div.content.text .head { + margin-top: 0.5em; + margin-bottom: 0.25em; +} +/* running head */ +div.col.main div.content.text div.pageHeaderTitle { + text-align: center; + margin-bottom: 1em; +} +/* figures */ +div.col.main div.content.text span.figure { + display: block; + width: 200px; + margin-top: 0.5em; + margin-bottom: 0.5em; + padding: 5px; + border: 1px dashed silver; + /* float: right; */ + /* text-align: center; */ +} +div.col.main div.content.text span.figure>a, +div.col.main div.content.text span.figure span.figureNumber, +div.col.main div.content.text span.figure span.caption, +div.col.main div.content.text span.figure span.description { + display:block; +} +div.col.main div.content.text span.figure span.figureNum { + display: none; +} /* * search results */ diff -r 31f562fa7214 -r 1b483194901c documentViewer.py --- a/documentViewer.py Mon Oct 08 20:36:00 2012 +0200 +++ b/documentViewer.py Tue Oct 09 19:01:18 2012 +0200 @@ -586,7 +586,7 @@ docinfo = self.getDocinfoFromTexttool(docinfo, texttool) # document info (including toc) from full text if docinfo.get('textURLPath', None): - docinfo = self.getTextInfo(mode='pages', docinfo=docinfo) + docinfo = self.getTextInfo(mode=None, docinfo=docinfo) # bib info bib = self.metadataService.getBibData(dom=metaDom)