Context Navigation

← Previous Changeset
Next Changeset →

Changeset 458:48b135b089c8 in documentViewer

Timestamp:

Jul 19, 2011, 6:46:35 PM (14 years ago)

Author:

casties

Branch:

elementtree

Message:

more renovation

Files:

1 added
2 edited

MpdlXmlTextServer.py (modified) (5 diffs)
SrvTxtUtils.py (added)
documentViewer.py (modified) (10 diffs)

Legend:

Unmodified
Added
Removed

MpdlXmlTextServer.py

-                      r456
+                      r458
 import logging
 import urllib
+import documentViewer
+#from documentViewer import getTextFromNode, serializeNode
+def intOr0(s, default=0):
+    """convert s to int or return default"""
+    try:
+        return int(s)
+    except:
+        return default
+def getText(node):
+    """get the cdata content of a node"""
+    if node is None:
+        return ""
+    # ET:
+    text = node.text or ""
+    for e in node:
+        text += gettext(e)
+        if e.tail:
+            text += e.tail
+    return text
+from SrvTxtUtils import getInt, getText, getHttpData
 def serialize(node):
 …
     def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
         """constructor"""
         self.id=id
 …
     def getHttpData(self, url, data=None):
         """returns result from url+data HTTP request"""
         return documentViewer.getHttpData(url,data,timeout=self.timeout)
+        return getHttpData(url,data,timeout=self.timeout)
     def getServerData(self, method, data=None):
         """returns result from text server for method+data"""
         url = self.serverUrl+method
         return documentViewer.getHttpData(url,data,timeout=self.timeout)
+        return getHttpData(url,data,timeout=self.timeout)
     # WTF: what does this really do? can it be integrated in getPage?
 …
             # pageNumberOrigNorm
             elif dc == 'countFigureEntries':
                 docinfo['countFigureEntries'] = intOr0(div.text)
+                docinfo['countFigureEntries'] = getInt(div.text)
             # pageNumberOrigNorm
             elif dc == 'countTocEntries':
                 # WTF: s1 = int(s)/30+1
                 docinfo['countTocEntries'] = intOr0(div.text)
+                docinfo['countTocEntries'] = getInt(div.text)
             # numTextPages
             elif dc == 'countPages':
                 np = intOr0(div.text)
+                np = getInt(div.text)
                 if np > 0:
                     docinfo['numTextPages'] = np
 …
             elif dc == 'queryResultHits':
                 docinfo['tocSize_%s'%mode] = intOr0(div.text)
+                docinfo['tocSize_%s'%mode] = getInt(div.text)
         if pagediv:
-#            # split xml in chunks
-#            tocs = []
-#            tocdivs = pagediv.findall('div')
-#            for p in zip(tocdivs[::2], tocdivs[1::2]):
-#                toc = serialize(p[0])
-#                toc += serialize(p[1])
-#                tocs.append(toc)
-#                logging.debug("pair: %s"%(toc))
             # store XML in docinfo
             docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')

documentViewer.py

-                      r457
+                      r458
 from OFS.Folder import Folder
 from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
 …
 from AccessControl import getSecurityManager
 from Globals import package_home
-from Products.zogiLib.zogiLib import browserCheck
 #from Ft.Xml import EMPTY_NAMESPACE, Parse
 …
 import sys
 import urllib
-import urllib2
 import logging
 import math
 …
 import string
+from SrvTxtUtils import getInt, getText, getHttpData
 def logger(txt,method,txt2):
     """logging"""
 …
-def getInt(number, default=0):
-    """returns always an int (0 in case of problems)"""
-    try:
-        return int(number)
-    except:
-        return int(default)
-def getText(node):
-    """get the cdata content of a node"""
-    if node is None:
-        return ""
-    # ET:
-    text = node.text or ""
-    for e in node:
-        text += gettext(e)
-        if e.tail:
-            text += e.tail
-    # 4Suite:
-    #nodelist=node.childNodes
-    #text = ""
-    #for n in nodelist:
-    #    if n.nodeType == node.TEXT_NODE:
-    #       text = text + n.data
-    return text
-getTextFromNode = getText
 def serializeNode(node, encoding="utf-8"):
     """returns a string containing node as XML"""
 …
     return bt
 def getParentDir(path):
     """returns pathname shortened by one"""
     return '/'.join(path.split('/')[0:-1])
+def getHttpData(url, data=None, num_tries=3, timeout=10):
+    """returns result from url+data HTTP request"""
+    # we do GET (by appending data to url)
+    if isinstance(data, str) or isinstance(data, unicode):
+        # if data is string then append
+        url = "%s?%s"%(url,data)
+    elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple):
+        # urlencode
+        url = "%s?%s"%(url,urllib.urlencode(data))
+    response = None
+    errmsg = None
+    for cnt in range(num_tries):
+        try:
+            logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
+            if sys.version_info < (2, 6):
+                # set timeout on socket -- ugly :-(
+                import socket
+                socket.setdefaulttimeout(float(timeout))
+                response = urllib2.urlopen(url)
+            else:
+                response = urllib2.urlopen(url,timeout=float(timeout))
+            # check result?
+            break
+        except urllib2.HTTPError, e:
+            logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
+            errmsg = str(e)
+            # stop trying
+            break
+        except urllib2.URLError, e:
+            logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
+            errmsg = str(e)
+            # stop trying
+            #break
+    if response is not None:
+        data = response.read()
+        response.close()
+        return data
+    raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))
+    #return None
+def getBibdataFromDom(dom):
+    """returns dict with all elements from bib-tag"""
+    bibinfo = {}
+    bib = dom.find(".//meta/bib")
+    if bib is not None:
+        # put type in @type
+        type = bib.get('type')
+        bibinfo['@type'] = type
+        # put all subelements in dict
+        for e in bib:
+            bibinfo[e.tag] = getText(e)
+    return bibinfo
 ##
 …
         '''
         logging.debug("HHHHHHHHHHHHHH:load the rss")
         logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))
+        logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))
         if not hasattr(self, 'template'):
 …
         logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))
         # put in all raw bib fields as dict "bib"
+        bib = dom.find(".//bib")
+        #bib = dom.xpath("//bib/*")
+        if bib is not None:
+            bibinfo = {}
+            for e in bib:
+                bibinfo[e.tag] = getText(e)
+            docinfo['bib'] = bibinfo
+        bib = getBibdataFromDom(dom)
+        docinfo['bib'] = bib
         # extract some fields (author, title, year) according to their mapping
         metaData=self.metadata.main.meta.bib
         bibtype=bib.get("type")
+        bibtype=bib.get("@type")
         #bibtype=dom.xpath("//bib/@type")
         if not bibtype:
             bibtype="generic"
         bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC)
+        bibtype=bibtype.replace("-"," ") # wrong types in index meta "-" instead of " " (not wrong! ROC)
         docinfo['bib_type'] = bibtype
         bibmap=metaData.generateMappingForType(bibtype)
 …
         logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype))
         # if there is no mapping bibmap is empty (mapping sometimes has empty fields)
-        logging.debug("bibmap: %s"%repr(bibmap))
         if len(bibmap) > 0 and bibmap.get('author',None) or bibmap.get('title',None):
             try:
                 docinfo['author']=getText(bib.find(bibmap['author'][0]))
+                docinfo['author']=bib.get(bibmap['author'][0])
             except: pass
             try:
                 docinfo['title']=getText(bib.find(bibmap['title'][0]))
+                docinfo['title']=bib.get(bibmap['title'][0])
             except: pass
             try:
                 docinfo['year']=getText(bib.find(bibmap['year'][0]))
+                docinfo['year']=bib.get(bibmap['year'][0])
             except: pass
 …
             docinfo['textURLPath'] = None
         logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
         #logging.debug("documentViewer (getdocinfo) docinfo: %s"%)
+        logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
+        #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
         self.REQUEST.SESSION['docinfo'] = docinfo
         return docinfo

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 458:48b135b089c8 in documentViewer

Legend:

MpdlXmlTextServer.py

documentViewer.py

Download in other formats: