documentViewer: MpiwgXmlTextServer.py comparison

comparison MpiwgXmlTextServer.py @ 610:0488cd12355b

gis mode works again.

author	casties
date	Mon, 21 Jan 2013 19:58:21 +0100
parents	7962e6891d99
children	c57d80a649ea

comparison

equal deleted inserted replaced

-:7962e6891d99
+:0488cd12355b
 import base64
 from datetime import datetime
 from SrvTxtUtils import getInt, getText, getHttpData
+# mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo
+textinfoFieldMap = {
+'countPages' : 'numTextPages',
+'countFigures' : 'numFigureEntries',
+'countNotesHandwritten' : 'numHandwritten',
+'countNotes' : 'numNotes',
+'countPlaces' : 'numPlaces',
+'countTocEntries' : 'numTocEntries'
+}
 def serialize(node):
 """returns a string containing an XML snippet of node"""
 s = ET.tostring(node, 'UTF-8')
 # snip off XML declaration
 return url
 def getPlacesOnPage(self, docinfo=None, pn=None):
 """Returns list of GIS places of page pn"""
-#FIXME!
+logging.debug("getPlacesOnPage(pn=%s"%pn)
+if not 'places' in docinfo:
+self.getTextInfo('places', docinfo)
+allplaces = docinfo.get('places', None)
+if len(allplaces) == 0:
+return []
+# search for places on this page TODO: is there a better way?
+places = [p for p in allplaces if p['pn'] == pn]
+return places
+"""OLD:
 docpath = docinfo.get('textURLPath',None)
 if not docpath:
 return None
 places=[]
 id = l.get("id")
 name = l.text
 place = {'id': id, 'name': name}
 places.append(place)
-return places
+return places"""
 def getTextInfo(self, mode=None, docinfo=None):
 """reads document info, including page concordance, from text server"""
 logging.debug("getTextInfo mode=%s"%mode)
 field = ''
-if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten']:
+if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
 # translate mode to field param
 if mode == 'handwritten':
 field = '&field=notesHandwritten'
 else:
 field = '&field=%s'%mode
 else:
 if mode is None:
 # get general info from system-tag
 sys = doc.find('system')
 if sys is not None:
-docinfo['numTextPages'] = getInt(getText(sys.find('countPages')))
+for (k,v) in textinfoFieldMap.items():
-docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures')))
+# copy into docinfo (even if empty)
-docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten')))
+docinfo[v] = getInt(getText(sys.find(k)))
-docinfo['numNotes'] = getInt(getText(sys.find('countNotes')))
-docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces')))
-docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries')))
 else:
 # result is in list-tag
 l = doc.find('list')
 if l is not None:
+# look for general info
+for (k,v) in textinfoFieldMap.items():
+# copy into docinfo (only if not empty)
+s = doc.find(k)
+if s is not None:
+docinfo[v] = getInt(getText(s))
 lt = l.get('type')
+#
 # pageNumbers
+#
 if lt == 'pages':
 # contains tags with page numbers
 # <item n="14" o="2" o-norm="2" file="0014"/>
 # n=scan number, o=original page no, o-norm=normalized original page no
 # pageNumbers is a dict indexed by scan number
 if pn > 0:
 pages[pn] = page
 docinfo['pageNumbers'] = pages
+#
 # toc
+#
 elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
 # contains tags with table of contents/figures
 # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
 tocs = []
 for te in l:
 tocs.append(toc)
 # save as full_toc/full_figures
 docinfo['full_%s'%mode] = tocs
+#
+# places
+#
+elif lt in ['places']:
+# contains tags with place-ids
+# <item id="N40004F-01"><ref>4</ref></item>
+places = []
+for p in l:
+if p.tag == 'item':
+place = {}
+place['id'] = p.get('id')
+ref = p.find('ref')
+place['pn'] = getInt(ref.text)
+places.append(place)
+docinfo['places'] = places
 return docinfo
 def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
 """returns single page from fulltext"""
 modes = mode.split(',')
 # check for multiple layers
 if len(modes) > 1:
 logging.debug("getTextPage: more than one mode=%s"%mode)
+# mode defaults
+gisMode = False
+punditMode = False
 # search mode
 if 'search' in modes:
 # add highlighting
 highlightQuery = pageinfo.get('highlightQuery', None)
 if highlightQuery:
 # ignore mode in the following
 modes.remove('search')
 # pundit mode
-punditMode = False
 if 'pundit' in modes:
 punditMode = True
 # ignore mode in the following
 modes.remove('pundit')
 elif 'xml' in modes:
 textmode = 'xml'
 textParams['outputFormat'] = 'xmlDisplay'
 normMode = 'orig'
 elif 'gis' in modes:
-#FIXME!
+gisMode = True
-textmode = 'gis'
+# gis mode uses plain text
+textmode = 'plain'
+textParams['outputFormat'] = 'html'
 else:
 # text is default mode
 textmode = 'plain'
 textParams['outputFormat'] = 'html'
 except Exception, e:
 logging.error("Error reading page: %s"%e)
 return None
 # plain text or text-with-links mode
-if textmode == "plain" or textmode == "dict":
+if textmode == 'plain' or textmode == 'dict':
 # the text is in div@class=text
 pagediv = dom.find(".//div[@class='text']")
 logging.debug("pagediv: %s"%repr(pagediv))
 if pagediv is not None:
 # add textmode and normMode classes
 l.set('target', '_blank')
 if punditMode:
 self._addPunditAttributes(pagediv, pageinfo, docinfo)
+if gisMode:
+self._addGisTags(pagediv, pageinfo, docinfo)
 s = serialize(pagediv)
 logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
 return s
 # xml mode
 pagediv = dom.find(".//body")
 logging.debug("pagediv: %s"%repr(pagediv))
 if pagediv is not None:
 return serialize(pagediv)
-# pureXml mode WTF?
-elif textmode == "pureXml":
-# the text is in body
-pagediv = dom.find(".//body")
-logging.debug("pagediv: %s"%repr(pagediv))
-if pagediv is not None:
-return serialize(pagediv)
-# gis mode FIXME!
-elif textmode == "gis":
-# the text is in div@class=text
-pagediv = dom.find(".//div[@class='text']")
-logging.debug("pagediv: %s"%repr(pagediv))
-if pagediv is not None:
-# fix empty div tags
-self._fixEmptyDivs(pagediv)
-# check all a-tags
-links = pagediv.findall(".//a")
-# add our URL as backlink
-selfurl = self.getLink()
-doc = base64.b64encode(selfurl)
-for l in links:
-href = l.get('href')
-if href:
-if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
-l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
-l.set('target', '_blank')
-return serialize(pagediv)
 logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
 return None
 def _processWTags(self, textMode, normMode, pagediv):
 """selects the necessary information from w-spans and removes the rest from pagediv"""
 ppdiv = pagediv.find(".//span[@class='pb']/..")
 ppdiv.remove(pbdiv)
 return pagediv
 def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
-"""add about attributes for pundit annotation tool"""
+"""add about-attributes to divs for pundit annotation tool"""
 textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
 pn = pageinfo.get('pn', '1')
-#  TODO: use pn as well?
 # check all div-tags
 divs = pagediv.findall(".//div")
 for d in divs:
 id = d.get('id')
 if id:
 # TODO: check path (cf RFC2396)
 d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
 cls = d.get('class','')
 cls += ' pundit-content'
 d.set('class', cls.strip())
+return pagediv
+def _addGisTags(self, pagediv, pageinfo, docinfo):
+"""add links for gis places"""
+# use last part of documentPath as db-id
+docpath = docinfo.get('documentPath', '')
+textid = docpath.split('/')[-1]
+# add our URL as backlink
+selfurl = self.getLink()
+doc = base64.b64encode(selfurl)
+# check all span@class=place
+spans = pagediv.findall(".//span[@class='place']")
+for s in spans:
+id = s.get('id')
+if id:
+# make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis
+s.tag = 'a'
+# TODO: make links configurable
+url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc)
+s.set('href', url)
+s.set('target', '_blank')
 return pagediv
 def _processFigures(self, pagediv, docinfo):
 """processes figure-tags"""

Mercurial > hg > documentViewer

comparison MpiwgXmlTextServer.py @ 610:0488cd12355b