Changeset 458:48b135b089c8 in documentViewer
- Timestamp: Jul 19, 2011, 6:46:35 PM (14 years ago)
- Branch: elementtree
- Files: 1 added, 2 edited
Legend:
- Unmodified
- Added
- Removed
-
MpdlXmlTextServer.py
r456 r458 13 13 import logging 14 14 import urllib 15 import documentViewer 16 #from documentViewer import getTextFromNode, serializeNode 17 18 def intOr0(s, default=0): 19 """convert s to int or return default""" 20 try: 21 return int(s) 22 except: 23 return default 24 25 def getText(node): 26 """get the cdata content of a node""" 27 if node is None: 28 return "" 29 # ET: 30 text = node.text or "" 31 for e in node: 32 text += gettext(e) 33 if e.tail: 34 text += e.tail 35 36 return text 15 16 from SrvTxtUtils import getInt, getText, getHttpData 37 17 38 18 def serialize(node): … … 91 71 92 72 def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): 93 94 73 """constructor""" 95 74 self.id=id … … 103 82 def getHttpData(self, url, data=None): 104 83 """returns result from url+data HTTP request""" 105 return documentViewer.getHttpData(url,data,timeout=self.timeout)84 return getHttpData(url,data,timeout=self.timeout) 106 85 107 86 def getServerData(self, method, data=None): 108 87 """returns result from text server for method+data""" 109 88 url = self.serverUrl+method 110 return documentViewer.getHttpData(url,data,timeout=self.timeout)89 return getHttpData(url,data,timeout=self.timeout) 111 90 112 91 # WTF: what does this really do? can it be integrated in getPage? 
… … 269 248 # pageNumberOrigNorm 270 249 elif dc == 'countFigureEntries': 271 docinfo['countFigureEntries'] = intOr0(div.text)250 docinfo['countFigureEntries'] = getInt(div.text) 272 251 273 252 # pageNumberOrigNorm 274 253 elif dc == 'countTocEntries': 275 254 # WTF: s1 = int(s)/30+1 276 docinfo['countTocEntries'] = intOr0(div.text)255 docinfo['countTocEntries'] = getInt(div.text) 277 256 278 257 # numTextPages 279 258 elif dc == 'countPages': 280 np = intOr0(div.text)259 np = getInt(div.text) 281 260 if np > 0: 282 261 docinfo['numTextPages'] = np … … 505 484 506 485 elif dc == 'queryResultHits': 507 docinfo['tocSize_%s'%mode] = intOr0(div.text)486 docinfo['tocSize_%s'%mode] = getInt(div.text) 508 487 509 488 if pagediv: 510 # # split xml in chunks511 # tocs = []512 # tocdivs = pagediv.findall('div')513 # for p in zip(tocdivs[::2], tocdivs[1::2]):514 # toc = serialize(p[0])515 # toc += serialize(p[1])516 # tocs.append(toc)517 # logging.debug("pair: %s"%(toc))518 489 # store XML in docinfo 519 490 docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') -
documentViewer.py
r457 r458 1 2 1 from OFS.Folder import Folder 3 2 from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate … … 6 5 from AccessControl import getSecurityManager 7 6 from Globals import package_home 8 from Products.zogiLib.zogiLib import browserCheck9 7 10 8 #from Ft.Xml import EMPTY_NAMESPACE, Parse … … 16 14 import sys 17 15 import urllib 18 import urllib219 16 import logging 20 17 import math … … 23 20 import string 24 21 22 from SrvTxtUtils import getInt, getText, getHttpData 23 25 24 def logger(txt,method,txt2): 26 25 """logging""" … … 28 27 29 28 30 def getInt(number, default=0):31 """returns always an int (0 in case of problems)"""32 try:33 return int(number)34 except:35 return int(default)36 37 def getText(node):38 """get the cdata content of a node"""39 if node is None:40 return ""41 # ET:42 text = node.text or ""43 for e in node:44 text += gettext(e)45 if e.tail:46 text += e.tail47 48 # 4Suite:49 #nodelist=node.childNodes50 #text = ""51 #for n in nodelist:52 # if n.nodeType == node.TEXT_NODE:53 # text = text + n.data54 55 return text56 57 getTextFromNode = getText58 59 29 def serializeNode(node, encoding="utf-8"): 60 30 """returns a string containing node as XML""" … … 129 99 return bt 130 100 131 132 101 def getParentDir(path): 133 102 """returns pathname shortened by one""" 134 103 return '/'.join(path.split('/')[0:-1]) 135 104 136 137 def getHttpData(url, data=None, num_tries=3, timeout=10): 138 """returns result from url+data HTTP request""" 139 # we do GET (by appending data to url) 140 if isinstance(data, str) or isinstance(data, unicode): 141 # if data is string then append 142 url = "%s?%s"%(url,data) 143 elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple): 144 # urlencode 145 url = "%s?%s"%(url,urllib.urlencode(data)) 146 147 response = None 148 errmsg = None 149 for cnt in range(num_tries): 150 try: 151 logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url)) 152 if sys.version_info < (2, 6): 153 # 
set timeout on socket -- ugly :-( 154 import socket 155 socket.setdefaulttimeout(float(timeout)) 156 response = urllib2.urlopen(url) 157 else: 158 response = urllib2.urlopen(url,timeout=float(timeout)) 159 # check result? 160 break 161 except urllib2.HTTPError, e: 162 logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e)) 163 errmsg = str(e) 164 # stop trying 165 break 166 except urllib2.URLError, e: 167 logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e)) 168 errmsg = str(e) 169 # stop trying 170 #break 171 172 if response is not None: 173 data = response.read() 174 response.close() 175 return data 176 177 raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg)) 178 #return None 105 def getBibdataFromDom(dom): 106 """returns dict with all elements from bib-tag""" 107 bibinfo = {} 108 bib = dom.find(".//meta/bib") 109 if bib is not None: 110 # put type in @type 111 type = bib.get('type') 112 bibinfo['@type'] = type 113 # put all subelements in dict 114 for e in bib: 115 bibinfo[e.tag] = getText(e) 116 117 return bibinfo 179 118 180 119 ## … … 305 244 ''' 306 245 logging.debug("HHHHHHHHHHHHHH:load the rss") 307 logg er("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))246 logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) 308 247 309 248 if not hasattr(self, 'template'): … … 635 574 logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path)) 636 575 # put in all raw bib fields as dict "bib" 637 bib = dom.find(".//bib") 638 #bib = dom.xpath("//bib/*") 639 if bib is not None: 640 bibinfo = {} 641 for e in bib: 642 bibinfo[e.tag] = getText(e) 643 644 docinfo['bib'] = bibinfo 576 bib = getBibdataFromDom(dom) 577 docinfo['bib'] = bib 645 578 646 579 # extract some fields (author, title, year) according to their mapping 647 580 metaData=self.metadata.main.meta.bib 648 bibtype=bib.get(" type")581 bibtype=bib.get("@type") 649 582 
#bibtype=dom.xpath("//bib/@type") 650 583 if not bibtype: 651 584 bibtype="generic" 652 585 653 bibtype=bibtype.replace("-"," ") # wrong types iin index meta "-" instead of " " (not wrong! ROC)586 bibtype=bibtype.replace("-"," ") # wrong types in index meta "-" instead of " " (not wrong! ROC) 654 587 docinfo['bib_type'] = bibtype 655 588 bibmap=metaData.generateMappingForType(bibtype) … … 657 590 logging.debug("documentViewer (getbibinfofromindexmeta) bibtype:"+repr(bibtype)) 658 591 # if there is no mapping bibmap is empty (mapping sometimes has empty fields) 659 logging.debug("bibmap: %s"%repr(bibmap))660 592 if len(bibmap) > 0 and bibmap.get('author',None) or bibmap.get('title',None): 661 593 try: 662 docinfo['author']= getText(bib.find(bibmap['author'][0]))594 docinfo['author']=bib.get(bibmap['author'][0]) 663 595 except: pass 664 596 try: 665 docinfo['title']= getText(bib.find(bibmap['title'][0]))597 docinfo['title']=bib.get(bibmap['title'][0]) 666 598 except: pass 667 599 try: 668 docinfo['year']= getText(bib.find(bibmap['year'][0]))600 docinfo['year']=bib.get(bibmap['year'][0]) 669 601 except: pass 670 602 … … 897 829 docinfo['textURLPath'] = None 898 830 899 logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)900 #logging.debug("documentViewer (getdocinfo) docinfo: %s"% )831 logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys()) 832 #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo) 901 833 self.REQUEST.SESSION['docinfo'] = docinfo 902 834 return docinfo
Note: See TracChangeset for help on using the changeset viewer.