comparison documentViewer.py @ 166:ffb5c62bd459

characterNormalization
author abukhman
date Tue, 24 Aug 2010 14:34:32 +0200
parents 820a2a4b23c3
children 7e2b97941a66
comparison
equal deleted inserted replaced
165:820a2a4b23c3 166:ffb5c62bd459
66 66
67 response = None 67 response = None
68 errmsg = None 68 errmsg = None
69 for cnt in range(num_tries): 69 for cnt in range(num_tries):
70 try: 70 try:
71 logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url)) 71 #logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url))
72 if sys.version_info < (2, 6): 72 if sys.version_info < (2, 6):
73 # set timeout on socket -- ugly :-( 73 # set timeout on socket -- ugly :-(
74 import socket 74 import socket
75 socket.setdefaulttimeout(float(timeout)) 75 socket.setdefaulttimeout(float(timeout))
76 response = urllib2.urlopen(url) 76 response = urllib2.urlopen(url)
77 else: 77 else:
78 response = urllib2.urlopen(url,timeout=float(timeout)) 78 response = urllib2.urlopen(url,timeout=float(timeout))
79 # check result? 79 # check result?
80 break 80 break
81 except urllib2.HTTPError, e: 81 except urllib2.HTTPError, e:
82 logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e)) 82 #logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e))
83 errmsg = str(e) 83 errmsg = str(e)
84 # stop trying 84 # stop trying
85 break 85 break
86 except urllib2.URLError, e: 86 except urllib2.URLError, e:
87 logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e)) 87 #logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e))
88 errmsg = str(e) 88 errmsg = str(e)
89 # stop trying 89 # stop trying
90 #break 90 #break
91 91
92 if response is not None: 92 if response is not None:
201 @param mode: defines how to access the document behind url 201 @param mode: defines how to access the document behind url
202 @param url: url which contains display information 202 @param url: url which contains display information
203 @param viewMode: if images display images, if text display text, default is images (text,images or auto) 203 @param viewMode: if images display images, if text display text, default is images (text,images or auto)
204 204
205 ''' 205 '''
206 logging.debug("HHHHHHHHHHHHHH:load the rss") 206 #logging.debug("HHHHHHHHHHHHHH:load the rss")
207 logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) 207 #logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))
208 208
209 if not hasattr(self, 'template'): 209 if not hasattr(self, 'template'):
210 # create template folder if it doesn't exist 210 # create template folder if it doesn't exist
211 self.manage_addFolder('template') 211 self.manage_addFolder('template')
212 212
235 @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none) 235 @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none)
236 @param characterNormalization type of text display (reg, norm, none) 236 @param characterNormalization type of text display (reg, norm, none)
237 @param querySearch: type of different search modes (fulltext, fulltextMorph, xpath, xquery, ftIndex, ftIndexMorph, fulltextMorphLemma) 237 @param querySearch: type of different search modes (fulltext, fulltextMorph, xpath, xquery, ftIndex, ftIndexMorph, fulltextMorphLemma)
238 ''' 238 '''
239 239
240 logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) 240 #logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn))
241 241
242 if not hasattr(self, 'template'): 242 if not hasattr(self, 'template'):
243 # this won't work 243 # this won't work
244 logging.error("template folder missing!") 244 logging.error("template folder missing!")
245 return "ERROR: template folder missing!" 245 return "ERROR: template folder missing!"
321 del params[param] 321 del params[param]
322 else: 322 else:
323 params[param] = str(val) 323 params[param] = str(val)
324 324
325 # quote values and assemble into query string 325 # quote values and assemble into query string
326 logging.debug("XYXXXXX: %s"%repr(params.items())) 326 #logging.debug("XYXXXXX: %s"%repr(params.items()))
327 ps = "&amp;".join(["%s=%s"%(k,urllib.quote(v)) for (k, v) in params.items()]) 327 ps = "&amp;".join(["%s=%s"%(k,urllib.quote(v)) for (k, v) in params.items()])
328 url=self.REQUEST['URL1']+"?"+ps 328 url=self.REQUEST['URL1']+"?"+ps
329 return url 329 return url
330 330
331 def getInfo_xml(self,url,mode): 331 def getInfo_xml(self,url,mode):
340 340
341 341
342 def isAccessible(self, docinfo): 342 def isAccessible(self, docinfo):
343 """returns if access to the resource is granted""" 343 """returns if access to the resource is granted"""
344 access = docinfo.get('accessType', None) 344 access = docinfo.get('accessType', None)
345 logging.debug("documentViewer (accessOK) access type %s"%access) 345 #logging.debug("documentViewer (accessOK) access type %s"%access)
346 if access is not None and access == 'free': 346 if access is not None and access == 'free':
347 logging.debug("documentViewer (accessOK) access is free") 347 #logging.debug("documentViewer (accessOK) access is free")
348 return True 348 return True
349 elif access is None or access in self.authgroups: 349 elif access is None or access in self.authgroups:
350 # only local access -- only logged in users 350 # only local access -- only logged in users
351 user = getSecurityManager().getUser() 351 user = getSecurityManager().getUser()
352 logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr())) 352 #logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr()))
353 if user is not None: 353 if user is not None:
354 #print "user: ", user 354 #print "user: ", user
355 return (user.getUserName() != "Anonymous User") 355 return (user.getUserName() != "Anonymous User")
356 else: 356 else:
357 return False 357 return False
358 358
359 logging.error("documentViewer (accessOK) unknown access type %s"%access) 359 #logging.error("documentViewer (accessOK) unknown access type %s"%access)
360 return False 360 return False
361 361
362 362
363 def getDirinfoFromDigilib(self,path,docinfo=None,cut=0): 363 def getDirinfoFromDigilib(self,path,docinfo=None,cut=0):
364 """gibt param von dlInfo aus""" 364 """gibt param von dlInfo aus"""
369 369
370 path=getParentDir(path) 370 path=getParentDir(path)
371 371
372 infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path 372 infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path
373 373
374 logging.debug("documentViewer (getparamfromdigilib) dirInfo from %s"%(infoUrl)) 374 #logging.debug("documentViewer (getparamfromdigilib) dirInfo from %s"%(infoUrl))
375 375
376 txt = getHttpData(infoUrl) 376 txt = getHttpData(infoUrl)
377 if txt is None: 377 if txt is None:
378 raise IOError("Unable to get dir-info from %s"%(infoUrl)) 378 raise IOError("Unable to get dir-info from %s"%(infoUrl))
379 379
380 dom = Parse(txt) 380 dom = Parse(txt)
381 sizes=dom.xpath("//dir/size") 381 sizes=dom.xpath("//dir/size")
382 logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes) 382 #logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes)
383 383
384 if sizes: 384 if sizes:
385 docinfo['numPages'] = int(getTextFromNode(sizes[0])) 385 docinfo['numPages'] = int(getTextFromNode(sizes[0]))
386 else: 386 else:
387 docinfo['numPages'] = 0 387 docinfo['numPages'] = 0
403 server=self.digilibBaseUrl+"/servlet/Texter?fn=" 403 server=self.digilibBaseUrl+"/servlet/Texter?fn="
404 metaUrl=server+url.replace("/mpiwg/online","") 404 metaUrl=server+url.replace("/mpiwg/online","")
405 if not metaUrl.endswith("index.meta"): 405 if not metaUrl.endswith("index.meta"):
406 metaUrl += "/index.meta" 406 metaUrl += "/index.meta"
407 407
408 logging.debug("(getIndexMeta): METAURL: %s"%metaUrl) 408 #logging.debug("(getIndexMeta): METAURL: %s"%metaUrl)
409 txt=getHttpData(metaUrl) 409 txt=getHttpData(metaUrl)
410 if txt is None: 410 if txt is None:
411 raise IOError("Unable to read index meta from %s"%(url)) 411 raise IOError("Unable to read index meta from %s"%(url))
412 412
413 dom = Parse(txt) 413 dom = Parse(txt)
433 return dom 433 return dom
434 434
435 435
436 def getAuthinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): 436 def getAuthinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):
437 """gets authorization info from the index.meta file at path or given by dom""" 437 """gets authorization info from the index.meta file at path or given by dom"""
438 logging.debug("documentViewer (getauthinfofromindexmeta) path: %s"%(path)) 438 #logging.debug("documentViewer (getauthinfofromindexmeta) path: %s"%(path))
439 439
440 access = None 440 access = None
441 441
442 if docinfo is None: 442 if docinfo is None:
443 docinfo = {} 443 docinfo = {}
457 return docinfo 457 return docinfo
458 458
459 459
460 def getBibinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): 460 def getBibinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0):
461 """gets bibliographical info from the index.meta file at path or given by dom""" 461 """gets bibliographical info from the index.meta file at path or given by dom"""
462 logging.debug("documentViewer (getbibinfofromindexmeta) path: %s"%(path)) 462 #logging.debug("documentViewer (getbibinfofromindexmeta) path: %s"%(path))
463 463
464 if docinfo is None: 464 if docinfo is None:
465 docinfo = {} 465 docinfo = {}
466 466
467 if dom is None: 467 if dom is None:
468 for x in range(cut): 468 for x in range(cut):
469 path=getParentDir(path) 469 path=getParentDir(path)
470 dom = self.getIndexMeta(path) 470 dom = self.getIndexMeta(path)
471 471
472 logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path)) 472 #logging.debug("documentViewer (getbibinfofromindexmeta cutted) path: %s"%(path))
473 # put in all raw bib fields as dict "bib" 473 # put in all raw bib fields as dict "bib"
474 bib = dom.xpath("//bib/*") 474 bib = dom.xpath("//bib/*")
475 if bib and len(bib)>0: 475 if bib and len(bib)>0:
476 bibinfo = {} 476 bibinfo = {}
477 for e in bib: 477 for e in bib:
498 docinfo['title']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['title'][0])[0]) 498 docinfo['title']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['title'][0])[0])
499 except: pass 499 except: pass
500 try: 500 try:
501 docinfo['year']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['year'][0])[0]) 501 docinfo['year']=getTextFromNode(dom.xpath("//bib/%s"%bibmap['year'][0])[0])
502 except: pass 502 except: pass
503 logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype) 503 #logging.debug("documentViewer (getbibinfofromindexmeta) using mapping for %s"%bibtype)
504 try: 504 try:
505 docinfo['lang']=getTextFromNode(dom.xpath("//bib/lang")[0]) 505 docinfo['lang']=getTextFromNode(dom.xpath("//bib/lang")[0])
506 except: 506 except:
507 docinfo['lang']='' 507 docinfo['lang']=''
508 508
509 return docinfo 509 return docinfo
510 510
511 511
512 def getDocinfoFromTextTool(self, url, dom=None, docinfo=None): 512 def getDocinfoFromTextTool(self, url, dom=None, docinfo=None):
513 """parse texttool tag in index meta""" 513 """parse texttool tag in index meta"""
514 logging.debug("documentViewer (getdocinfofromtexttool) url: %s" % (url)) 514 #logging.debug("documentViewer (getdocinfofromtexttool) url: %s" % (url))
515 if docinfo is None: 515 if docinfo is None:
516 docinfo = {} 516 docinfo = {}
517 if docinfo.get('lang', None) is None: 517 if docinfo.get('lang', None) is None:
518 docinfo['lang'] = '' # default keine Sprache gesetzt 518 docinfo['lang'] = '' # default keine Sprache gesetzt
519 if dom is None: 519 if dom is None:
536 archivePath = '/' + archivePath 536 archivePath = '/' + archivePath
537 if archiveName and (not archivePath.endswith(archiveName)): 537 if archiveName and (not archivePath.endswith(archiveName)):
538 archivePath += "/" + archiveName 538 archivePath += "/" + archiveName
539 else: 539 else:
540 # try to get archive-path from url 540 # try to get archive-path from url
541 logging.warning("documentViewer (getdocinfofromtexttool) resource/archive-path missing in: %s" % (url)) 541 #logging.warning("documentViewer (getdocinfofromtexttool) resource/archive-path missing in: %s" % (url))
542 if (not url.startswith('http')): 542 if (not url.startswith('http')):
543 archivePath = url.replace('index.meta', '') 543 archivePath = url.replace('index.meta', '')
544 544
545 if archivePath is None: 545 if archivePath is None:
546 # we balk without archive-path 546 # we balk without archive-path
630 pass 630 pass
631 return docinfo 631 return docinfo
632 632
633 def getDocinfoFromImagePath(self,path,docinfo=None,cut=0): 633 def getDocinfoFromImagePath(self,path,docinfo=None,cut=0):
634 """path ist the path to the images it assumes that the index.meta file is one level higher.""" 634 """path ist the path to the images it assumes that the index.meta file is one level higher."""
635 logging.debug("documentViewer (getdocinfofromimagepath) path: %s"%(path)) 635 #logging.debug("documentViewer (getdocinfofromimagepath) path: %s"%(path))
636 if docinfo is None: 636 if docinfo is None:
637 docinfo = {} 637 docinfo = {}
638 path=path.replace("/mpiwg/online","") 638 path=path.replace("/mpiwg/online","")
639 docinfo['imagePath'] = path 639 docinfo['imagePath'] = path
640 docinfo=self.getDirinfoFromDigilib(path,docinfo=docinfo,cut=cut) 640 docinfo=self.getDirinfoFromDigilib(path,docinfo=docinfo,cut=cut)
641 641
642 pathorig=path 642 pathorig=path
643 for x in range(cut): 643 for x in range(cut):
644 path=getParentDir(path) 644 path=getParentDir(path)
645 logging.debug("documentViewer (getdocinfofromimagepath) PATH:"+path) 645 #logging.debug("documentViewer (getdocinfofromimagepath) PATH:"+path)
646 imageUrl=self.digilibBaseUrl+"/servlet/Scaler?fn="+path 646 imageUrl=self.digilibBaseUrl+"/servlet/Scaler?fn="+path
647 docinfo['imageURL'] = imageUrl 647 docinfo['imageURL'] = imageUrl
648 648
649 #path ist the path to the images it assumes that the index.meta file is one level higher. 649 #path ist the path to the images it assumes that the index.meta file is one level higher.
650 docinfo = self.getBibinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1) 650 docinfo = self.getBibinfoFromIndexMeta(pathorig,docinfo=docinfo,cut=cut+1)
652 return docinfo 652 return docinfo
653 653
654 654
655 def getDocinfo(self, mode, url): 655 def getDocinfo(self, mode, url):
656 """returns docinfo depending on mode""" 656 """returns docinfo depending on mode"""
657 logging.debug("documentViewer (getdocinfo) mode: %s, url: %s"%(mode,url)) 657 #logging.debug("documentViewer (getdocinfo) mode: %s, url: %s"%(mode,url))
658 # look for cached docinfo in session 658 # look for cached docinfo in session
659 if self.REQUEST.SESSION.has_key('docinfo'): 659 if self.REQUEST.SESSION.has_key('docinfo'):
660 docinfo = self.REQUEST.SESSION['docinfo'] 660 docinfo = self.REQUEST.SESSION['docinfo']
661 # check if its still current 661 # check if its still current
662 if docinfo is not None and docinfo.get('mode') == mode and docinfo.get('url') == url: 662 if docinfo is not None and docinfo.get('mode') == mode and docinfo.get('url') == url:
663 logging.debug("documentViewer (getdocinfo) docinfo in session: %s"%docinfo) 663 #logging.debug("documentViewer (getdocinfo) docinfo in session: %s"%docinfo)
664 return docinfo 664 return docinfo
665 # new docinfo 665 # new docinfo
666 docinfo = {'mode': mode, 'url': url} 666 docinfo = {'mode': mode, 'url': url}
667 if mode=="texttool": #index.meta with texttool information 667 if mode=="texttool": #index.meta with texttool information
668 docinfo = self.getDocinfoFromTextTool(url, docinfo=docinfo) 668 docinfo = self.getDocinfoFromTextTool(url, docinfo=docinfo)
669 elif mode=="imagepath": 669 elif mode=="imagepath":
670 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo) 670 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo)
671 elif mode=="filepath": 671 elif mode=="filepath":
672 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo,cut=1) 672 docinfo = self.getDocinfoFromImagePath(url, docinfo=docinfo,cut=1)
673 else: 673 else:
674 logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode) 674 #logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode)
675 raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','filepath'."%(mode)) 675 raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','filepath'."%(mode))
676 676
677 logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo) 677 #logging.debug("documentViewer (getdocinfo) docinfo: %s"%docinfo)
678 self.REQUEST.SESSION['docinfo'] = docinfo 678 self.REQUEST.SESSION['docinfo'] = docinfo
679 return docinfo 679 return docinfo
680 680
681 def getPageinfo(self, current, start=None, rows=None, cols=None, docinfo=None, viewMode=None, tocMode=None): 681 def getPageinfo(self, current, start=None, rows=None, cols=None, docinfo=None, viewMode=None, tocMode=None):
682 """returns pageinfo with the given parameters""" 682 """returns pageinfo with the given parameters"""
765 "Add a Page Template with optional file content." 765 "Add a Page Template with optional file content."
766 766
767 self._setObject(id, DocumentViewerTemplate(id)) 767 self._setObject(id, DocumentViewerTemplate(id))
768 ob = getattr(self, id) 768 ob = getattr(self, id)
769 txt=file(os.path.join(package_home(globals()),'zpt/viewer_main.zpt'),'r').read() 769 txt=file(os.path.join(package_home(globals()),'zpt/viewer_main.zpt'),'r').read()
770 logging.info("txt %s:"%txt) 770 #logging.info("txt %s:"%txt)
771 ob.pt_edit(txt,"text/html") 771 ob.pt_edit(txt,"text/html")
772 if title: 772 if title:
773 ob.pt_setTitle(title) 773 ob.pt_setTitle(title)
774 try: 774 try:
775 u = self.DestinationURL() 775 u = self.DestinationURL()