Mercurial > hg > documentViewer

--- a/MpiwgXmlTextServer.py	Tue Jan 15 18:15:32 2013 +0100
+++ b/MpiwgXmlTextServer.py	Mon Jan 21 19:58:21 2013 +0100
@@ -13,6 +13,16 @@

 from SrvTxtUtils import getInt, getText, getHttpData

+# maps tag names in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to the documentViewer docinfo keys their integer values are stored under
+textinfoFieldMap = {
+                    'countPages' : 'numTextPages',
+                    'countFigures' : 'numFigureEntries',
+                    'countNotesHandwritten' : 'numHandwritten',
+                    'countNotes' : 'numNotes',
+                    'countPlaces' : 'numPlaces',
+                    'countTocEntries' : 'numTocEntries'
+                    }
+
 def serialize(node):
     """returns a string containing an XML snippet of node"""
     s = ET.tostring(node, 'UTF-8')
@@ -72,7 +82,18 @@

     def getPlacesOnPage(self, docinfo=None, pn=None):
         """Returns list of GIS places of page pn"""
-        #FIXME!
+        logging.debug("getPlacesOnPage(pn=%s"%pn)
+        if not 'places' in docinfo:
+            self.getTextInfo('places', docinfo)
+
+        allplaces = docinfo.get('places', None)
+        if len(allplaces) == 0:
+            return []
+
+        # search for places on this page TODO: is there a better way?
+        places = [p for p in allplaces if p['pn'] == pn]
+        return places
+        """OLD:
         docpath = docinfo.get('textURLPath',None)
         if not docpath:
             return None
@@ -87,7 +108,7 @@
             place = {'id': id, 'name': name}
             places.append(place)

-        return places
+        return places"""


     def getTextInfo(self, mode=None, docinfo=None):
@@ -95,7 +116,7 @@
         logging.debug("getTextInfo mode=%s"%mode)

         field = ''
-        if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten']:
+        if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
             # translate mode to field param
             if mode == 'handwritten':
                 field = '&field=notesHandwritten'
@@ -132,19 +153,25 @@
                 # get general info from system-tag
                 sys = doc.find('system')
                 if sys is not None:
-                    docinfo['numTextPages'] = getInt(getText(sys.find('countPages')))
-                    docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures')))
-                    docinfo['numHandwritten'] = getInt(getText(sys.find('countNotesHandwritten')))
-                    docinfo['numNotes'] = getInt(getText(sys.find('countNotes')))
-                    docinfo['numPlaces'] = getInt(getText(sys.find('countPlaces')))
-                    docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries')))
+                    for (k,v) in textinfoFieldMap.items():
+                        # copy into docinfo (even if empty)
+                        docinfo[v] = getInt(getText(sys.find(k)))

             else:
                 # result is in list-tag
                 l = doc.find('list')
                 if l is not None:
+                    # look for general info
+                    for (k,v) in textinfoFieldMap.items():
+                        # copy into docinfo (only if not empty)
+                        s = doc.find(k)
+                        if s is not None:
+                            docinfo[v] = getInt(getText(s))
+
                     lt = l.get('type')
+                    #
                     # pageNumbers
+                    #
                     if lt == 'pages':
                         # contains tags with page numbers
                         # <item n="14" o="2" o-norm="2" file="0014"/>
@@ -164,8 +191,10 @@
                                 pages[pn] = page

                         docinfo['pageNumbers'] = pages
-
+
+                    #
                     # toc
+                    #
                     elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
                         # contains tags with table of contents/figures
                         # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
@@ -185,6 +214,26 @@
                         # save as full_toc/full_figures
                         docinfo['full_%s'%mode] = tocs

+                    #
+                    # places
+                    #
+                    # list of place ids, each with the number of the
+                    # page it occurs on
+                    #
+                    elif lt in ['places']:
+                        # contains tags with place-ids
+                        # <item id="N40004F-01"><ref>4</ref></item>
+                        places = []
+                        for p in l:
+                            if p.tag == 'item':
+                                place = {}
+                                place['id'] = p.get('id')
+                                ref = p.find('ref')
+                                place['pn'] = getInt(ref.text)
+                                places.append(place)
+
+                        docinfo['places'] = places
+
         return docinfo


@@ -224,6 +273,10 @@
         if len(modes) > 1:
             logging.debug("getTextPage: more than one mode=%s"%mode)

+        # mode defaults
+        gisMode = False
+        punditMode = False
+
         # search mode
         if 'search' in modes:
             # add highlighting
@@ -237,7 +290,6 @@
             modes.remove('search')

         # pundit mode
-        punditMode = False
         if 'pundit' in modes:
             punditMode = True
             # ignore mode in the following
@@ -252,8 +304,10 @@
             textParams['outputFormat'] = 'xmlDisplay'
             normMode = 'orig'
         elif 'gis' in modes:
-            #FIXME!
-            textmode = 'gis'
+            gisMode = True
+            # gis mode uses plain text
+            textmode = 'plain'
+            textParams['outputFormat'] = 'html'
         else:
             # text is default mode
             textmode = 'plain'
@@ -268,7 +322,7 @@
             return None

         # plain text or text-with-links mode
-        if textmode == "plain" or textmode == "dict":
+        if textmode == 'plain' or textmode == 'dict':
             # the text is in div@class=text
             pagediv = dom.find(".//div[@class='text']")
             logging.debug("pagediv: %s"%repr(pagediv))
@@ -296,6 +350,9 @@
                 if punditMode:
                     self._addPunditAttributes(pagediv, pageinfo, docinfo)

+                if gisMode:
+                    self._addGisTags(pagediv, pageinfo, docinfo)
+
                 s = serialize(pagediv)
                 logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
                 return s
@@ -308,36 +365,6 @@
             if pagediv is not None:
                 return serialize(pagediv)

-        # pureXml mode WTF?
-        elif textmode == "pureXml":
-            # the text is in body
-            pagediv = dom.find(".//body")
-            logging.debug("pagediv: %s"%repr(pagediv))
-            if pagediv is not None:
-                return serialize(pagediv)
-
-        # gis mode FIXME!
-        elif textmode == "gis":
-            # the text is in div@class=text
-            pagediv = dom.find(".//div[@class='text']")
-            logging.debug("pagediv: %s"%repr(pagediv))
-            if pagediv is not None:
-                # fix empty div tags
-                self._fixEmptyDivs(pagediv)
-                # check all a-tags
-                links = pagediv.findall(".//a")
-                # add our URL as backlink
-                selfurl = self.getLink()
-                doc = base64.b64encode(selfurl)
-                for l in links:
-                    href = l.get('href')
-                    if href:
-                        if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
-                            l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
-                            l.set('target', '_blank')
-
-                return serialize(pagediv)
-
         logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
         return None

@@ -408,10 +435,9 @@
         return pagediv

     def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
-        """add about attributes for pundit annotation tool"""
+        """add about-attributes to divs for pundit annotation tool"""
         textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
         pn = pageinfo.get('pn', '1')
-        #  TODO: use pn as well?
         # check all div-tags
         divs = pagediv.findall(".//div")
         for d in divs:
@@ -425,6 +451,28 @@

         return pagediv

+    def _addGisTags(self, pagediv, pageinfo, docinfo):
+        """add links for gis places"""
+        # the last segment of documentPath is used as the db-id in the link path (pageinfo is not used here)
+        docpath = docinfo.get('documentPath', '')
+        textid = docpath.split('/')[-1]
+        # our own URL, base64-encoded, is passed as the "doc" backlink parameter
+        selfurl = self.getLink()
+        doc = base64.b64encode(selfurl)
+        # turn every span@class=place that has an id into a link (spans without id stay untouched)
+        spans = pagediv.findall(".//span[@class='place']")
+        for s in spans:
+            id = s.get('id')
+            if id:
+                # make links like http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/songy_tiang_zh_1637?id=N400061-02&doc=aHR...&format=gis
+                s.tag = 'a'
+                # TODO: make links configurable (the base URL is hard-coded; the values are inserted without URL-quoting)
+                url = "http://mappit.mpiwg-berlin.mpg.de/db/RESTdb/db/mpdl/%s?id=%s&doc=%s&format=gis"%(textid,id,doc)
+                s.set('href', url)
+                s.set('target', '_blank')
+
+        return pagediv
+
     def _processFigures(self, pagediv, docinfo):
         """processes figure-tags"""
         # unfortunately etree can not select class.startswith('figure')
@@ -713,4 +761,4 @@
     if RESPONSE is not None:
         RESPONSE.redirect('manage_main')

-        
\ No newline at end of file
+
--- a/documentViewer.py	Tue Jan 15 18:15:32 2013 +0100
+++ b/documentViewer.py	Mon Jan 21 19:58:21 2013 +0100
@@ -205,7 +205,7 @@
         return self.template.fulltextclient.getRepositoryType(**args)

     def getTextDownloadUrl(self, **args):
-        """get list of gis places on one page"""
+        """get URL to download the full text"""
         return self.template.fulltextclient.getTextDownloadUrl(**args)

     def getPlacesOnPage(self, **args):