Mercurial > hg > documentViewer
changeset 610:0488cd12355b
gis mode works again.
author | casties |
---|---|
date | Mon, 21 Jan 2013 19:58:21 +0100 |
parents | 7962e6891d99 |
children | e18ef3786753 |
files | MpiwgXmlTextServer.py documentViewer.py |
diffstat | 2 files changed, 96 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py Tue Jan 15 18:15:32 2013 +0100 +++ b/MpiwgXmlTextServer.py Mon Jan 21 19:58:21 2013 +0100 @@ -13,6 +13,16 @@ from SrvTxtUtils import getInt, getText, getHttpData +# mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo +textinfoFieldMap = { + 'countPages' : 'numTextPages', + 'countFigures' : 'numFigureEntries', + 'countNotesHandwritten' : 'numHandwritten', + 'countNotes' : 'numNotes', + 'countPlaces' : 'numPlaces', + 'countTocEntries' : 'numTocEntries' + } + def serialize(node): """returns a string containing an XML snippet of node""" s = ET.tostring(node, 'UTF-8') @@ -72,7 +82,18 @@ def getPlacesOnPage(self, docinfo=None, pn=None): """Returns list of GIS places of page pn""" - #FIXME! + logging.debug("getPlacesOnPage(pn=%s"%pn) + if not 'places' in docinfo: + self.getTextInfo('places', docinfo) + + allplaces = docinfo.get('places', None) + if len(allplaces) == 0: + return [] + + # search for places on this page TODO: is there a better way? + places = [p for p in allplaces if p['pn'] == pn] + return places + """OLD: docpath = docinfo.get('textURLPath',None) if not docpath: return None @@ -87,7 +108,7 @@ place = {'id': id, 'name': name} places.append(place) - return places + return places""" def getTextInfo(self, mode=None, docinfo=None): @@ -95,7 +116,7 @@ logging.debug("getTextInfo mode=%s"%mode) field = '' - if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten']: + if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']: # translate mode to field param if mode == 'handwritten': field = '&field=notesHandwritten' @@ -132,19 +153,25 @@ # get general info from system-tag sys = doc.find('system') if sys is not None: - docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) - docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) - docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten'))) - docinfo['numNotes'] = getInt(getText(sys.find('countNotes'))) - docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces'))) - docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) + for (k,v) in textinfoFieldMap.items(): + # copy into docinfo (even if empty) + docinfo[v] = getInt(getText(sys.find(k))) else: # result is in list-tag l = doc.find('list') if l is not None: + # look for general info + for (k,v) in textinfoFieldMap.items(): + # copy into docinfo (only if not empty) + s = doc.find(k) + if s is not None: + docinfo[v] = getInt(getText(s)) + lt = l.get('type') + # # pageNumbers + # if lt == 'pages': # contains tags with page numbers # <item n="14" o="2" o-norm="2" file="0014"/> @@ -164,8 +191,10 @@ pages[pn] = page docinfo['pageNumbers'] = pages - + + # # toc + # elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']: # contains tags with table of contents/figures # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item> @@ -185,6 +214,26 @@ # save as full_toc/full_figures docinfo['full_%s'%mode] = tocs + # + # places + # + # + # toc + # + elif lt in ['places']: + # contains tags with place-ids + # <item id="N40004F-01"><ref>4</ref></item> + places = [] + for p in l: + if p.tag == 'item': + place = {} + place['id'] = p.get('id') + ref = p.find('ref') + place['pn'] = getInt(ref.text) + places.append(place) + + docinfo['places'] = places + return docinfo @@ -224,6 +273,10 @@ if len(modes) > 1: logging.debug("getTextPage: more than one mode=%s"%mode) + # mode defaults + gisMode = False + punditMode = False + # search mode if 'search' in modes: # add highlighting @@ -237,7 +290,6 @@ modes.remove('search') # pundit mode - punditMode = False if 'pundit' in modes: punditMode = True # ignore mode in the following @@ -252,8 +304,10 @@ textParams['outputFormat'] = 'xmlDisplay' normMode = 'orig' elif 'gis' in modes: - #FIXME! - textmode = 'gis' + gisMode = True + # gis mode uses plain text + textmode = 'plain' + textParams['outputFormat'] = 'html' else: # text is default mode textmode = 'plain' @@ -268,7 +322,7 @@ return None # plain text or text-with-links mode - if textmode == "plain" or textmode == "dict": + if textmode == 'plain' or textmode == 'dict': # the text is in div@class=text pagediv = dom.find(".//div[@class='text']") logging.debug("pagediv: %s"%repr(pagediv)) @@ -296,6 +350,9 @@ if punditMode: self._addPunditAttributes(pagediv, pageinfo, docinfo) + if gisMode: + self._addGisTags(pagediv, pageinfo, docinfo) + s = serialize(pagediv) logging.debug("getTextPage done in %s"%(datetime.now()-startTime)) return s @@ -308,36 +365,6 @@ if pagediv is not None: return serialize(pagediv) - # pureXml mode WTF? - elif textmode == "pureXml": - # the text is in body - pagediv = dom.find(".//body") - logging.debug("pagediv: %s"%repr(pagediv)) - if pagediv is not None: - return serialize(pagediv) - - # gis mode FIXME! - elif textmode == "gis": - # the text is in div@class=text - pagediv = dom.find(".//div[@class='text']") - logging.debug("pagediv: %s"%repr(pagediv)) - if pagediv is not None: - # fix empty div tags - self._fixEmptyDivs(pagediv) - # check all a-tags - links = pagediv.findall(".//a") - # add our URL as backlink - selfurl = self.getLink() - doc = base64.b64encode(selfurl) - for l in links: - href = l.get('href') - if href: - if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): - l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) - l.set('target', '_blank') - - return serialize(pagediv) - logging.error("getTextPage: error in text mode %s or in text!"%(textmode)) return None @@ -408,10 +435,9 @@ return pagediv def _addPunditAttributes(self, pagediv, pageinfo, docinfo): - """add about attributes for pundit annotation tool""" + """add about-attributes to divs for pundit annotation tool""" textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???')) pn = pageinfo.get('pn', '1') - # TODO: use pn as well? # check all div-tags divs = pagediv.findall(".//div") for d in divs: @@ -425,6 +451,28 @@ return pagediv + def _addGisTags(self, pagediv, pageinfo, docinfo): + """add links for gis places""" + # use last part of documentPath as db-id + docpath = docinfo.get('documentPath', '') + textid = docpath.split('/')[-1] + # add our URL as backlink + selfurl = self.getLink() + doc = base64.b64encode(selfurl) + # check all span@class=place + spans = pagediv.findall(".//span[@class='place']") + for s in spans: + id = s.get('id') + if id: + # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis + s.tag = 'a' + # TODO: make links configurable + url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc) + s.set('href', url) + s.set('target', '_blank') + + return pagediv + def _processFigures(self, pagediv, docinfo): """processes figure-tags""" # unfortunately etree can not select class.startswith('figure') @@ -713,4 +761,4 @@ if RESPONSE is not None: RESPONSE.redirect('manage_main') - \ No newline at end of file +
--- a/documentViewer.py Tue Jan 15 18:15:32 2013 +0100 +++ b/documentViewer.py Mon Jan 21 19:58:21 2013 +0100 @@ -205,7 +205,7 @@ return self.template.fulltextclient.getRepositoryType(**args) def getTextDownloadUrl(self, **args): - """get list of gis places on one page""" + """get URL to download the full text""" return self.template.fulltextclient.getTextDownloadUrl(**args) def getPlacesOnPage(self, **args):