changeset 568:694935574177

more new MpiwgXmlTextServer.
author casties
date Thu, 11 Oct 2012 18:27:14 +0200
parents 8b1e20bf300d
children be21250420be
files MpiwgXmlTextServer.py css/docuviewer.css documentViewer.py
diffstat 3 files changed, 70 insertions(+), 91 deletions(-) [+]
line wrap: on
line diff
--- a/MpiwgXmlTextServer.py	Thu Oct 11 10:21:49 2012 +0200
+++ b/MpiwgXmlTextServer.py	Thu Oct 11 18:27:14 2012 +0200
@@ -93,7 +93,7 @@
         logging.debug("getTextInfo mode=%s"%mode)
         
         field = ''
-        if mode in ['pages', 'toc', 'figures']:
+        if mode in ['pages', 'toc', 'figures', 'handwritten']:
             # translate mode to field param
             field = '&field=%s'%mode
         else:
@@ -125,9 +125,12 @@
         else:
             if mode is None:
                 # get general info from system-tag
-                cp = doc.find('system/countPages')
-                if cp is not None:
-                    docinfo['numTextPages'] = getInt(cp.text) 
+                sys = doc.find('system')
+                if sys is not None:
+                    docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) 
+                    docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) 
+                    docinfo['numHandwritten'] = getInt(getText(sys.find('countHandwritten'))) 
+                    docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) 
                     
             else:
                 # result is in list-tag
@@ -145,9 +148,9 @@
                             page = {}
                             pn = getInt(i.get('n'))
                             page['pn'] = pn
-                            no = getInt(i.get('o'))
+                            no = i.get('o')
                             page['no'] = no
-                            non = getInt(i.get('o-norm'))
+                            non = i.get('o-norm')
                             page['non'] = non
                                     
                             if pn > 0:
@@ -157,25 +160,21 @@
                         logging.debug("got pageNumbers=%s"%repr(pages))
                                     
                     # toc
-                    elif name == 'toc':
+                    elif lt == 'toc' or lt == 'figures' or lt == 'handwritten':
                         # contains tags with table of contents/figures
-                        # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
+                        # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
                         tocs = []
-                        for te in tag:
-                            toc = {}
-                            for t in te:
-                                if t.tag == 'page':
-                                    toc['pn'] = getInt(t.text)
-                                elif t.tag == 'level':
-                                    toc['level'] = t.text
-                                elif t.tag == 'content':
-                                    toc['content'] = t.text
-                                elif t.tag == 'level-string':
-                                    toc['level-string'] = t.text
-                                elif t.tag == 'real-level':
-                                    toc['real-level'] = t.text
-                                    
-                            tocs.append(toc)
+                        for te in l:
+                            if te.tag == 'item':
+                                toc = {}
+                                toc['level-string'] = te.get('n')
+                                toc['level'] = te.get('lv')
+                                toc['content'] = te.text.strip()
+                                ref = te.find('ref')
+                                toc['pn'] = getInt(ref.text)
+                                toc['no'] = ref.get('o')
+                                toc['non'] = ref.get('o-norm')
+                                tocs.append(toc)
                         
                         # save as full_toc/full_figures
                         docinfo['full_%s'%mode] = tocs
@@ -183,34 +182,6 @@
         return docinfo
         
           
-    def processPageInfo(self, dom, docinfo, pageinfo):
-        """processes page info divs from dom and stores in docinfo and pageinfo"""
-        # assume first second level div is pageMeta
-        alldivs = dom.find("div")
-        
-        if alldivs is None or alldivs.get('class', '') != 'pageMeta':
-            logging.error("processPageInfo: pageMeta div not found!")
-            return
-        
-        for div in alldivs:
-            dc = div.get('class')
-            
-            # pageNumberOrig  
-            if dc == 'pageNumberOrig':
-                pageinfo['pageNumberOrig'] = div.text
-                
-            # pageNumberOrigNorm
-            elif dc == 'pageNumberOrigNorm':
-                pageinfo['pageNumberOrigNorm'] = div.text
-                
-            # pageHeaderTitle
-            elif dc == 'pageHeaderTitle':
-                pageinfo['pageHeaderTitle'] = div.text
-                        
-        #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
-        return
-         
-           
     def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
         """returns single page from fulltext"""
         
@@ -451,6 +422,8 @@
         if mode == "none":
             return docinfo
               
+        #TODO: put mode into query
+        
         cachedQuery = docinfo.get('cachedQuery', None)
         if cachedQuery is not None:
             # cached search result
@@ -461,39 +434,34 @@
             else:
                 # different query
                 del docinfo['resultSize']
-                del docinfo['resultXML']
+                del docinfo['results']
         
         # cache query
         docinfo['cachedQuery'] = '%s_%s'%(mode,query)
         
         # fetch full results
         docpath = docinfo['textURLPath']
-        params = {'document': docpath,
-                  'mode': 'text',
-                  'queryType': mode,
+        params = {'docId': docpath,
                   'query': query,
-                  'queryResultPageSize': 1000,
-                  'queryResultPN': 1,
-                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
-        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
-        #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
-        dom = ET.fromstring(pagexml)
-        # page content is in <div class="queryResultPage">
-        pagediv = None
-        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
-        alldivs = dom.findall("div")
-        for div in alldivs:
-            dc = div.get('class')
-            # page content div
-            if dc == 'queryResultPage':
-                pagediv = div
+                  'pageSize': 1000,
+                  'page': 1,
+                  'outputFormat': 'html'}
+        pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
+        results = []
+        try:
+            dom = ET.fromstring(pagexml)
+            # page content is currently in multiple <td align=left>
+            alldivs = dom.findall(".//td[@align='left']")
+            for div in alldivs:
+                # TODO: can we put etree in the session?
+                results.append(div)
+        
+        except Exception, e:
+            logging.error("GetSearchResults: Error parsing search result: %s"%e)
                 
-            elif dc == 'queryResultHits':
-                docinfo['resultSize'] = getInt(div.text)
-
-        if pagediv is not None:
-            # store XML in docinfo
-            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')
+        # store results in docinfo
+        docinfo['resultSize'] = len(results)
+        docinfo['results'] = results
 
         return docinfo
     
@@ -504,9 +472,9 @@
         # get (cached) result
         self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
             
-        resultxml = docinfo.get('resultXML', None)
+        resultxml = docinfo.get('results', None)
         if not resultxml:
-            logging.error("getResultPage: unable to find resultXML")
+            logging.error("getResultPage: unable to find results")
             return "Error: no result!"
         
         if size is None:
@@ -561,6 +529,7 @@
             
         return docinfo.get('full_%s'%queryType, [])
 
+
     def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
         """returns single page from the table of contents"""
         logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
@@ -583,8 +552,17 @@
         for toc in tocs:
             pageurl = self.getLink('pn', toc['pn'])
             tp += '<div class="tocline">'
-            tp += '<div class="toc name">[%s %s]</div>'%(toc['level-string'], toc['content'])
-            tp += '<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn'])
+            content = toc['content']
+            if content:
+                tp += '<div class="toc name">[%s] %s</div>'%(toc['level-string'], toc['content'])
+            else:
+                tp += '<div class="toc name">[Figure %s]</div>'%(toc['level-string'])
+            
+            if toc.get('no', None):
+                tp += '<div class="toc page"><a href="%s">Page: %s (%s)</a></div>'%(pageurl, toc['pn'], toc['no'])
+            else:
+                tp += '<div class="toc page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn'])
+                
             tp += '</div>\n'
             
         tp += '</div>\n'
--- a/css/docuviewer.css	Thu Oct 11 10:21:49 2012 +0200
+++ b/css/docuviewer.css	Thu Oct 11 18:27:14 2012 +0200
@@ -146,15 +146,16 @@
     background-color: white;
 }
 
-div.tocbody.text .toc, 
-div.tocbody.figures .toc,
-div.tocbody.concordance .toc {
+div.tocbody.text .toc.name, 
+div.tocbody.figures .toc.name,
+div.tocbody.concordance .toc.name {
     float:left;
     clear:right; 
+    margin-right: 1em;
 }
-div.tocbody.text .toc.float.right, 
-div.tocbody.figures .toc.float.right,
-div.tocbody.concordance .toc.float.right {
+div.tocbody.text .toc.page, 
+div.tocbody.figures .toc.page,
+div.tocbody.concordance .toc.page {
     float:right;
 }
 
@@ -272,11 +273,9 @@
 div.col.main div.content.text span.pb span.o {
 	display: none;
 }
+/* running head */
 div.col.main div.content.text span.pb span.rhead {
-	display: block;
-}
-/* running head */
-div.col.main div.content.text div.pageHeaderTitle {
+    display: block;
     text-align: center;
     margin-bottom: 1em;
 }
--- a/documentViewer.py	Thu Oct 11 10:21:49 2012 +0200
+++ b/documentViewer.py	Thu Oct 11 18:27:14 2012 +0200
@@ -588,9 +588,11 @@
             texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True)
             if texttool:
                 docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
-                # document info (including toc) from full text
+                # document info from full text server
                 if docinfo.get('textURLPath', None):
                     docinfo = self.getTextInfo(mode=None, docinfo=docinfo)
+                    # include list of pages TODO: do we need this always? 
+                    docinfo = self.getTextInfo(mode='pages', docinfo=docinfo)
             
             # bib info
             bib = self.metadataService.getBibData(dom=metaDom)