Changeset 458:48b135b089c8 in documentViewer


Timestamp: Jul 19, 2011, 6:46:35 PM (13 years ago)
Author: casties
Branch: elementtree
Message: more renovation

Files: 1 added, 2 edited

  • MpdlXmlTextServer.py

    r456 → r458

     import logging
     import urllib
    -import documentViewer
    -#from documentViewer import getTextFromNode, serializeNode
    -
    -def intOr0(s, default=0):
    -    """convert s to int or return default"""
    -    try:
    -        return int(s)
    -    except:
    -        return default
    -
    -def getText(node):
    -    """get the cdata content of a node"""
    -    if node is None:
    -        return ""
    -    # ET:
    -    text = node.text or ""
    -    for e in node:
    -        text += gettext(e)
    -        if e.tail:
    -            text += e.tail
    -
    -    return text
    +
    +from SrvTxtUtils import getInt, getText, getHttpData

     def serialize(node):
    …

         def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
    -
             """constructor"""
             self.id=id
    …
         def getHttpData(self, url, data=None):
             """returns result from url+data HTTP request"""
    -        return documentViewer.getHttpData(url,data,timeout=self.timeout)
    +        return getHttpData(url,data,timeout=self.timeout)

         def getServerData(self, method, data=None):
             """returns result from text server for method+data"""
             url = self.serverUrl+method
    -        return documentViewer.getHttpData(url,data,timeout=self.timeout)
    +        return getHttpData(url,data,timeout=self.timeout)

         # WTF: what does this really do? can it be integrated in getPage?
    …
                 # pageNumberOrigNorm
                 elif dc == 'countFigureEntries':
    -                docinfo['countFigureEntries'] = intOr0(div.text)
    +                docinfo['countFigureEntries'] = getInt(div.text)

                 # pageNumberOrigNorm
                 elif dc == 'countTocEntries':
                     # WTF: s1 = int(s)/30+1
    -                docinfo['countTocEntries'] = intOr0(div.text)
    +                docinfo['countTocEntries'] = getInt(div.text)

                 # numTextPages
                 elif dc == 'countPages':
    -                np = intOr0(div.text)
    +                np = getInt(div.text)
                     if np > 0:
                         docinfo['numTextPages'] = np
    …

                 elif dc == 'queryResultHits':
    -                docinfo['tocSize_%s'%mode] = intOr0(div.text)
    +                docinfo['tocSize_%s'%mode] = getInt(div.text)

             if pagediv:
    -#            # split xml in chunks
    -#            tocs = []
    -#            tocdivs = pagediv.findall('div')
    -#            for p in zip(tocdivs[::2], tocdivs[1::2]):
    -#                toc = serialize(p[0])
    -#                toc += serialize(p[1])
    -#                tocs.append(toc)
    -#                logging.debug("pair: %s"%(toc))
                 # store XML in docinfo
                 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')
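
The changeset's "1 added" file does not appear in this view. Judging by the new import line (from SrvTxtUtils import getInt, getText, getHttpData), it is presumably a shared SrvTxtUtils module that collects the helpers removed from MpdlXmlTextServer.py above and from documentViewer.py below, so that MpdlXmlTextServer no longer needs to import documentViewer for them. The following is only a sketch reconstructed from those removed helpers, not the actual content of the added file; anything beyond what the diffs show is an assumption.

    # SrvTxtUtils.py -- hypothetical reconstruction from the removed helpers (sketch only)
    """Shared utility functions for the document viewer and text server."""

    import urllib
    import urllib2
    import logging

    def getInt(number, default=0):
        """returns always an int (0 in case of problems)"""
        try:
            return int(number)
        except:
            return int(default)

    def getText(node):
        """get the text content of an ElementTree node, including child elements"""
        if node is None:
            return ""
        text = node.text or ""
        for e in node:
            text += getText(e)   # recurse into child elements
            if e.tail:
                text += e.tail
        return text

    def getHttpData(url, data=None, num_tries=3, timeout=10):
        """returns result from url+data HTTP request"""
        # GET request: append the (urlencoded) data to the URL
        if isinstance(data, (str, unicode)):
            url = "%s?%s" % (url, data)
        elif isinstance(data, (dict, list, tuple)):
            url = "%s?%s" % (url, urllib.urlencode(data))

        errmsg = None
        for cnt in range(num_tries):
            try:
                logging.debug("getHttpData(#%s %ss) url=%s" % (cnt + 1, timeout, url))
                # requires Python >= 2.6 for the timeout parameter
                response = urllib2.urlopen(url, timeout=float(timeout))
                data = response.read()
                response.close()
                return data
            except urllib2.HTTPError, e:
                logging.error("getHttpData: HTTP error(%s): %s" % (e.code, e))
                errmsg = str(e)
                break            # definite HTTP error -- stop trying
            except urllib2.URLError, e:
                logging.error("getHttpData: URLLIB error(%s): %s" % (e.reason, e))
                errmsg = str(e)  # network problem -- retry

        raise IOError("ERROR fetching HTTP data from %s: %s" % (url, errmsg))
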
  • documentViewer.py

    r457 → r458

    -
     from OFS.Folder import Folder
     from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
    …
     from AccessControl import getSecurityManager
     from Globals import package_home
    -from Products.zogiLib.zogiLib import browserCheck

     #from Ft.Xml import EMPTY_NAMESPACE, Parse
    …
     import sys
     import urllib
    -import urllib2
     import logging
     import math
    …
     import string

    +from SrvTxtUtils import getInt, getText, getHttpData
    +
     def logger(txt,method,txt2):
         """logging"""
    …


    -def getInt(number, default=0):
    -    """returns always an int (0 in case of problems)"""
    -    try:
    -        return int(number)
    -    except:
    -        return int(default)
    -
    -def getText(node):
    -    """get the cdata content of a node"""
    -    if node is None:
    -        return ""
    -    # ET:
    -    text = node.text or ""
    -    for e in node:
    -        text += gettext(e)
    -        if e.tail:
    -            text += e.tail
    -
    -    # 4Suite:
    -    #nodelist=node.childNodes
    -    #text = ""
    -    #for n in nodelist:
    -    #    if n.nodeType == node.TEXT_NODE:
    -    #       text = text + n.data
    -
    -    return text
    -
    -getTextFromNode = getText
    -
     def serializeNode(node, encoding="utf-8"):
         """returns a string containing node as XML"""
    …
         return bt

    -
     def getParentDir(path):
         """returns pathname shortened by one"""
         return '/'.join(path.split('/')[0:-1])

    -
    -def getHttpData(url, data=None, num_tries=3, timeout=10):
    -    """returns result from url+data HTTP request"""
    -    # we do GET (by appending data to url)
    -    if isinstance(data, str) or isinstance(data, unicode):
    -        # if data is string then append
    -        url = "%s?%s"%(url,data)
    -    elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple):
    -        # urlencode
    -        url = "%s?%s"%(url,urllib.urlencode(data))
    -
    -    response = None
    -    errmsg = None
    -    for cnt in range(num_tries):
    -        try:
    -            logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
    -            if sys.version_info < (2, 6):
    -                # set timeout on socket -- ugly :-(
    -                import socket
    -                socket.setdefaulttimeout(float(timeout))
    -                response = urllib2.urlopen(url)
    -            else:
    -                response = urllib2.urlopen(url,timeout=float(timeout))
    -            # check result?
    -            break
    -        except urllib2.HTTPError, e:
    -            logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
    -            errmsg = str(e)
    -            # stop trying
    -            break
    -        except urllib2.URLError, e:
    -            logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
    -            errmsg = str(e)
    -            # stop trying
    -            #break
    -
    -    if response is not None:
    -        data = response.read()
    -        response.close()
    -        return data
    -
    -    raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg))
    -    #return None
    +def getBibdataFromDom(dom):
    +    """returns dict with all elements from bib-tag"""
    +    bibinfo = {}
    +    bib = dom.find(".//meta/bib")
    +    if bib is not None:
    +        # put type in @type
    +        type = bib.get('type')
    +        bibinfo['@type'] = type
    +        # put all subelements in dict
    +        for e in bib:
    +            bibinfo[e.tag] = getText(e)
    +
    +    return bibinfo

     ##
    …
             '''
             logging.debug("HHHHHHHHHHHHHH:load the rss")
    -        logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))
    +        logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))

             if not hasattr(self, 'template'):
    …
             logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))
             # put in all raw bib fields as dict "bib"
    -        bib = dom.find(".//bib")
    -        #bib = dom.xpath("//bib/*")
    -        if bib is not None:
    -            bibinfo = {}
    -            for e in bib:
    -                bibinfo[e.tag] = getText(e)
    -
    -            docinfo['bib'] = bibinfo
    +        bib = getBibdataFromDom(dom)
    +        docinfo['bib'] = bib

             # extract some fields (author, title, year) according to their mapping
             metaData=self.metadata.main.meta.bib
    -        bibtype=bib.get("type")
    +        bibtype=bib.get("@type")
             #bibtype=dom.xpath("//bib/@type")
             if not bibtype:
                 bibtype="generic"

    -        bibtype=bibtype.replace("-"," ") # wrong typesiin index meta "-" instead of " " (not wrong! ROC)
    +        bibtype=bibtype.replace("-"," ") # wrong types in index meta "-" instead of " " (not wrong! ROC)
             docinfo['bib_type'] = bibtype
             bibmap=metaData.generateMappingForType(bibtype)
    …
             logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype))
             # if there is no mapping bibmap is empty (mapping sometimes has empty fields)
    -        logging.debug("bibmap: %s"%repr(bibmap))
             if len(bibmap) > 0 and bibmap.get('author',None) or bibmap.get('title',None):
                 try:
    -                docinfo['author']=getText(bib.find(bibmap['author'][0]))
    +                docinfo['author']=bib.get(bibmap['author'][0])
                 except: pass
                 try:
    -                docinfo['title']=getText(bib.find(bibmap['title'][0]))
    +                docinfo['title']=bib.get(bibmap['title'][0])
                 except: pass
                 try:
    -                docinfo['year']=getText(bib.find(bibmap['year'][0]))
    +                docinfo['year']=bib.get(bibmap['year'][0])
                 except: pass

    …
                 docinfo['textURLPath'] = None

    -        logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
    -        #logging.debug("documentViewer (getdocinfo) docinfo: %s"%)
    +        logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
    +        #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
             self.REQUEST.SESSION['docinfo'] = docinfo
             return docinfo
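
Because getbibinfofromindexmeta now works on the plain dict returned by the new getBibdataFromDom() rather than on the <bib> element itself, the field lookups above change from getText(bib.find(...)) to bib.get(...), and the bib type moves to the '@type' key. A minimal, self-contained sketch of that flow; the index.meta fragment and the bibmap are made up for illustration, and getBibdataFromDom() is condensed from the version added above.

    import xml.etree.ElementTree as ET

    def getText(node):
        """text content of an ElementTree node, including children (cf. the SrvTxtUtils sketch above)"""
        if node is None:
            return ""
        text = node.text or ""
        for e in node:
            text += getText(e)
            if e.tail:
                text += e.tail
        return text

    def getBibdataFromDom(dom):
        """returns dict with all elements from bib-tag (condensed from documentViewer.py above)"""
        bibinfo = {}
        bib = dom.find(".//meta/bib")
        if bib is not None:
            bibinfo['@type'] = bib.get('type')
            for e in bib:
                bibinfo[e.tag] = getText(e)
        return bibinfo

    # made-up index.meta fragment -- element names are illustrative only
    dom = ET.fromstring("""<resource><meta>
        <bib type="book">
          <author>Galilei, Galileo</author>
          <title>Discorsi e dimostrazioni matematiche</title>
          <year>1638</year>
        </bib>
    </meta></resource>""")

    bib = getBibdataFromDom(dom)
    bibtype = bib.get("@type")                         # 'book'

    # hypothetical mapping, shaped the way generateMappingForType() output is used above
    bibmap = {'author': ['author'], 'title': ['title'], 'year': ['year']}

    docinfo = {}
    docinfo['author'] = bib.get(bibmap['author'][0])   # 'Galilei, Galileo'
    docinfo['title'] = bib.get(bibmap['title'][0])     # 'Discorsi e dimostrazioni matematiche'
    docinfo['year'] = bib.get(bibmap['year'][0])       # '1638'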