--- documentViewer/documentViewer.py 2005/12/18 12:35:02 1.1 +++ documentViewer/documentViewer.py 2010/08/24 13:27:07 1.97 @@ -1,23 +1,38 @@ -genericDigilib="http://nausikaa2.rz-berlin.mpg.de/digitallibrary/" - from OFS.Folder import Folder from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate -from Products.PageTemplates.PageTemplateFile import PageTemplateFile +from Products.PageTemplates.PageTemplateFile import PageTemplateFile from AccessControl import ClassSecurityInfo +from AccessControl import getSecurityManager from Globals import package_home -from Ft.Xml.Domlette import NonvalidatingReader -from Ft.Xml.Domlette import PrettyPrint, Print -from Ft.Xml import EMPTY_NAMESPACE - -import Ft.Xml.XPath - +from Ft.Xml import EMPTY_NAMESPACE, Parse +import Ft.Xml.Domlette import os.path -import cgi +import sys import urllib +import urllib2 +import logging +import math +import urlparse +import cStringIO + +def logger(txt,method,txt2): + """logging""" + logging.info(txt+ txt2) + + +def getInt(number, default=0): + """returns always an int (0 in case of problems)""" + try: + return int(number) + except: + return int(default) def getTextFromNode(nodename): + """get the cdata content of a node""" + if nodename is None: + return "" nodelist=nodename.childNodes rc = "" for node in nodelist: @@ -25,269 +40,746 @@ def getTextFromNode(nodename): rc = rc + node.data return rc -import socket - -def urlopen(url): - """urlopen mit timeout""" - socket.setdefaulttimeout(2) - ret=urllib.urlopen(url) - socket.setdefaulttimeout(5) - return ret - -def getParamFromDigilib(path,param): - """gibt param von dlInfo aus""" - imageUrl=genericDigilib+"/dlInfo-xml.jsp?fn="+path - - try: - dom = NonvalidatingReader.parseUri(imageUrl) - except: - return None - - - params=dom.xpath("//document-parameters/parameter[@name='%s']/@value"%param) - - if params: - return params[0].value - -def parseUrlTextTool(url): - """parse index meta""" - - try: - dom = NonvalidatingReader.parseUri(url) - except: - zLOG.LOG("documentViewer (parseUrlTexttool)", zLOG.INFO,"%s (%s)"%sys.exc_info()[0:2]) - return (None,None,None) - - archivePaths=dom.xpath("//resource/archive-path") - - if archivePaths and (len(archivePaths)>0): - archivePath=getTextFromNode(archivePaths[0]) - else: - archivePath=None - - - images=dom.xpath("//texttool/image") - - if images and (len(images)>0): - image=getTextFromNode(images[0]) - else: - image=None - - if image and archivePath: - image=os.path.join(archivePath,image) - image=image.replace("/mpiwg/online",'') - pt=getParamFromDigilib(image,'pt') +def serializeNode(node, encoding='utf-8'): + """returns a string containing node as XML""" + buf = cStringIO.StringIO() + Ft.Xml.Domlette.Print(node, stream=buf, encoding=encoding) + s = buf.getvalue() + buf.close() + return s + + +def getParentDir(path): + """returns pathname shortened by one""" + return '/'.join(path.split('/')[0:-1]) + + +def getHttpData(url, data=None, num_tries=3, timeout=10): + """returns result from url+data HTTP request""" + # we do GET (by appending data to url) + if isinstance(data, str) or isinstance(data, unicode): + # if data is string then append + url = "%s?%s"%(url,data) + elif isinstance(data, dict) or isinstance(data, list) or isinstance(data, tuple): + # urlencode + url = "%s?%s"%(url,urllib.urlencode(data)) + + response = None + errmsg = None + for cnt in range(num_tries): + try: + logging.debug("getHttpData(#%s %ss) url=%s"%(cnt+1,timeout,url)) + if sys.version_info < (2, 6): + # set timeout on socket -- ugly :-( + import socket + socket.setdefaulttimeout(float(timeout)) + response = urllib2.urlopen(url) + else: + response = urllib2.urlopen(url,timeout=float(timeout)) + # check result? + break + except urllib2.HTTPError, e: + logging.error("getHttpData: HTTP error(%s): %s"%(e.code,e)) + errmsg = str(e) + # stop trying + break + except urllib2.URLError, e: + logging.error("getHttpData: URLLIB error(%s): %s"%(e.reason,e)) + errmsg = str(e) + # stop trying + #break + + if response is not None: + data = response.read() + response.close() + return data + + raise IOError("ERROR fetching HTTP data from %s: %s"%(url,errmsg)) + #return None - else: - image=None - - viewerUrls=dom.xpath("//texttool/digiliburlprefix") - - if viewerUrls and (len(viewerUrls)>0): - viewerUrl=getTextFromNode(viewerUrls[0]) - else: - viewerUrl=None - - - textUrls=dom.xpath("//texttool/text") - - if textUrls and (len(textUrls)>0): - textUrl=getTextFromNode(textUrls[0]) - else: - textUrl=None - return viewerUrl,(image,pt),textUrl -class documentViewer(ZopePageTemplate): +## +## documentViewer class +## +class documentViewer(Folder): """document viewer""" - meta_type="Document viewer" security=ClassSecurityInfo() - manage_options=ZopePageTemplate.manage_options+( + manage_options=Folder.manage_options+( {'label':'main config','action':'changeDocumentViewerForm'}, ) - _default_content_fn = os.path.join(package_home(globals()),'zpt','documentViewer_template.zpt') + # templates and forms + viewer_main = PageTemplateFile('zpt/viewer_main', globals()) + toc_thumbs = PageTemplateFile('zpt/toc_thumbs', globals()) + toc_text = PageTemplateFile('zpt/toc_text', globals()) + toc_figures = PageTemplateFile('zpt/toc_figures', globals()) + page_main_images = PageTemplateFile('zpt/page_main_images', globals()) + page_main_text = PageTemplateFile('zpt/page_main_text', globals()) + page_main_text_dict = PageTemplateFile('zpt/page_main_text_dict', globals()) + page_main_gis =PageTemplateFile ('zpt/page_main_gis', globals()) + page_main_xml = PageTemplateFile('zpt/page_main_xml', globals()) + head_main = PageTemplateFile('zpt/head_main', globals()) + docuviewer_css = PageTemplateFile('css/docuviewer.css', globals()) + info_xml = PageTemplateFile('zpt/info_xml', globals()) - def __init__(self,id,imageViewerUrl,title=""): - """init document viewer""" - self.id=id - self.title=title - self.imageViewerUrl=imageViewerUrl - + + thumbs_main_rss = PageTemplateFile('zpt/thumbs_main_rss', globals()) security.declareProtected('View management screens','changeDocumentViewerForm') - def changeDocumentViewerForm(self): - """change it""" - pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','changeDocumentViewer.zpt')).__of__(self) - return pt() - - - def changeDocumentViewer(self,imageViewerUrl,title="",RESPONSE=None): + changeDocumentViewerForm = PageTemplateFile('zpt/changeDocumentViewer', globals()) + + + def __init__(self,id,imageScalerUrl=None,textServerName=None,title="",digilibBaseUrl=None,thumbcols=2,thumbrows=5,authgroups="mpiwg"): """init document viewer""" + self.id=id self.title=title - self.imageViewerUrl=imageViewerUrl - - if RESPONSE is not None: - RESPONSE.redirect('manage_main') - + self.thumbcols = thumbcols + self.thumbrows = thumbrows + # authgroups is list of authorized groups (delimited by ,) + self.authgroups = [s.strip().lower() for s in authgroups.split(',')] + # create template folder so we can always use template.something + + templateFolder = Folder('template') + #self['template'] = templateFolder # Zope-2.12 style + self._setObject('template',templateFolder) # old style + try: + import MpdlXmlTextServer + textServer = MpdlXmlTextServer.MpdlXmlTextServer(id='fulltextclient',serverName=textServerName) + #templateFolder['fulltextclient'] = xmlRpcClient + templateFolder._setObject('fulltextclient',textServer) + except Exception, e: + logging.error("Unable to create MpdlXmlTextServer for fulltextclient: "+str(e)) + try: + from Products.zogiLib.zogiLib import zogiLib + zogilib = zogiLib(id="zogilib", title="zogilib for docuviewer", dlServerURL=imageScalerUrl, layout="book") + #templateFolder['zogilib'] = zogilib + templateFolder._setObject('zogilib',zogilib) + except Exception, e: + logging.error("Unable to create zogiLib for zogilib: "+str(e)) + + + # proxy text server methods to fulltextclient + def getTextPage(self, **args): + """get page""" + return self.template.fulltextclient.getTextPage(**args) + + def getQuery(self, **args): + """get query""" + return self.template.fulltextclient.getQuery(**args) + + def getSearch(self, **args): + """get search""" + return self.template.fulltextclient.getSearch(**args) + + def getNumPages(self, docinfo): + """get numpages""" + return self.template.fulltextclient.getNumPages(docinfo) + + def getTranslate(self, **args): + """get translate""" + return self.template.fulltextclient.getTranslate(**args) + + def getLemma(self, **args): + """get lemma""" + return self.template.fulltextclient.getLemma(**args) + + def getToc(self, **args): + """get toc""" + return self.template.fulltextclient.getToc(**args) + + def getTocPage(self, **args): + """get tocpage""" + return self.template.fulltextclient.getTocPage(**args) + - def imageLink(self,nr): - """link hinter den images""" - paramsTmp=cgi.parse_qs(self.REQUEST['QUERY_STRING']) - params={} - for x in paramsTmp.iteritems(): - params[x[0]]=x[1][0] - - params['pn']=nr - newUrl=self.REQUEST['URL']+"?"+urllib.urlencode(params) - return newUrl + security.declareProtected('View','thumbs_rss') + def thumbs_rss(self,mode,url,viewMode="auto",start=None,pn=1): + ''' + view it + @param mode: defines how to access the document behind url + @param url: url which contains display information + @param viewMode: if images display images, if text display text, default is images (text,images or auto) + ''' + logging.debug("HHHHHHHHHHHHHH:load the rss") + logger("documentViewer (index)", logging.INFO, "mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) - def thumbruler(self,cols,rows,start,maximum): - """ruler for thumbs""" - ret="" - paramsTmp=cgi.parse_qs(self.REQUEST['QUERY_STRING']) - params={} - for x in paramsTmp.iteritems(): - - if not x[0]=="start": - params[x[0]]=x[1][0] - - newUrlSelect=self.REQUEST['URL']+"?"+urllib.urlencode(params) - if start>0: - newStart=max(start-cols*rows,0) - params['start']=newStart - newUrl=self.REQUEST['URL']+"?"+urllib.urlencode(params) - ret+="""prev"""%newUrl - - - ret+="""" + viewMode="images" + + return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode) + + security.declareProtected('View','index_html') + def index_html(self,url,mode="texttool",viewMode="auto",tocMode="thumbs",start=None,pn=1,mk=None, query=None, querySearch=None, characterNormalization=""): + ''' + view it + @param mode: defines how to access the document behind url + @param url: url which contains display information + @param viewMode: if images display images, if text display text, default is auto (text,images or auto) + @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none) + @param characterNormalization type of text display (reg, norm, none) + @param querySearch: type of different search modes (fulltext, fulltextMorph, xpath, xquery, ftIndex, ftIndexMorph, fulltextMorphLemma) + ''' - if startnext"""%newUrl + logging.debug("documentViewer (index) mode: %s url:%s start:%s pn:%s"%(mode,url,start,pn)) - return ret + if not hasattr(self, 'template'): + # this won't work + logging.error("template folder missing!") + return "ERROR: template folder missing!" + + if not getattr(self, 'digilibBaseUrl', None): + self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary" + + docinfo = self.getDocinfo(mode=mode,url=url) - def textToolThumb(self,url,start=0): - """understands the texttool format - @param url: url to index.meta with texttool tag - """ - (viewerUrl,imagepath,textpath)=parseUrlTextTool(url) + if tocMode != "thumbs": + # get table of contents + docinfo = self.getToc(mode=tocMode, docinfo=docinfo) + + if viewMode=="auto": # automodus gewaehlt + if docinfo.has_key('textURL') or docinfo.has_key('textURLPath'): #texturl gesetzt und textViewer konfiguriert + viewMode="text_dict" + else: + viewMode="images" + + pageinfo = self.getPageinfo(start=start,current=pn,docinfo=docinfo,viewMode=viewMode,tocMode=tocMode) - imageUrl=genericDigilib+"/servlet/Scaler?fn=%s"%imagepath[0] + pt = getattr(self.template, 'viewer_main') + return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode,mk=self.generateMarks(mk)) + + def generateMarks(self,mk): + ret="" + if mk is None: + return "" + if not isinstance(mk, list): + mk=[mk] + for m in mk: + ret+="mk=%s"%m + return ret + + + def findDigilibUrl(self): + """try to get the digilib URL from zogilib""" + url = self.template.zogilib.getDLBaseUrl() + return url + + def getDocumentViewerURL(self): + """returns the URL of this instance""" + return self.absolute_url() + + def getStyle(self, idx, selected, style=""): + """returns a string with the given style and append 'sel' if path == selected.""" + #logger("documentViewer (getstyle)", logging.INFO, "idx: %s selected: %s style: %s"%(idx,selected,style)) + if idx == selected: + return style + 'sel' + else: + return style + + def getLink(self,param=None,val=None): + """link to documentviewer with parameter param set to val""" + params=self.REQUEST.form.copy() + if param is not None: + if val is None: + if params.has_key(param): + del params[param] + else: + params[param] = str(val) + + if params.get("mode", None) == "filepath": #wenn beim erst Aufruf filepath gesetzt wurde aendere das nun zu imagepath + params["mode"] = "imagepath" + params["url"] = getParentDir(params["url"]) + + # quote values and assemble into query string + #ps = "&".join(["%s=%s"%(k,urllib.quote(v)) for (k, v) in params.items()]) + ps = urllib.urlencode(params) + url=self.REQUEST['URL1']+"?"+ps + return url + + def getLinkAmp(self,param=None,val=None): + """link to documentviewer with parameter param set to val""" + params=self.REQUEST.form.copy() + if param is not None: + if val is None: + if params.has_key(param): + del params[param] + else: + params[param] = str(val) + + # quote values and assemble into query string + logging.debug("XYXXXXX: %s"%repr(params.items())) + ps = "&".join(["%s=%s"%(k,urllib.quote(v)) for (k, v) in params.items()]) + url=self.REQUEST['URL1']+"?"+ps + return url + + def getInfo_xml(self,url,mode): + """returns info about the document as XML""" + + if not self.digilibBaseUrl: + self.digilibBaseUrl = self.findDigilibUrl() or "http://nausikaa.mpiwg-berlin.mpg.de/digitallibrary" + + docinfo = self.getDocinfo(mode=mode,url=url) + pt = getattr(self.template, 'info_xml') + return pt(docinfo=docinfo) + + + def isAccessible(self, docinfo): + """returns if access to the resource is granted""" + access = docinfo.get('accessType', None) + logging.debug("documentViewer (accessOK) access type %s"%access) + if access is not None and access == 'free': + logging.debug("documentViewer (accessOK) access is free") + return True + elif access is None or access in self.authgroups: + # only local access -- only logged in users + user = getSecurityManager().getUser() + logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr())) + if user is not None: + #print "user: ", user + return (user.getUserName() != "Anonymous User") + else: + return False - pt=PageTemplateFile(os.path.join(package_home(globals()),'zpt','thumbs.zpt')).__of__(self) - return pt(imageUrl=imageUrl,pt=imagepath[1],start=start) + logging.error("documentViewer (accessOK) unknown access type %s"%access) + return False - def text(self,mode,url,pn): - """give text""" - if mode=="texttool": #index.meta with texttool information - (viewerUrl,imagepath,textpath)=parseUrlTextTool(url) - - print textpath - try: - dom = NonvalidatingReader.parseUri(textpath) - except: - return None + + def getDirinfoFromDigilib(self,path,docinfo=None,cut=0): + """gibt param von dlInfo aus""" + if docinfo is None: + docinfo = {} + + for x in range(cut): + + path=getParentDir(path) + + infoUrl=self.digilibBaseUrl+"/dirInfo-xml.jsp?mo=dir&fn="+path - list=[] - nodes=dom.xpath("//pb") - - node=nodes[int(pn)-1] - - p=node - - while p.tagName!="p": - p=p.parentNode - - - endNode=nodes[int(pn)] - - - e=endNode - - while e.tagName!="p": - e=e.parentNode + logging.debug("documentViewer (getparamfromdigilib) dirInfo from %s"%(infoUrl)) + txt = getHttpData(infoUrl) + if txt is None: + raise IOError("Unable to get dir-info from %s"%(infoUrl)) + + dom = Parse(txt) + sizes=dom.xpath("//dir/size") + logging.debug("documentViewer (getparamfromdigilib) dirInfo:size"%sizes) + + if sizes: + docinfo['numPages'] = int(getTextFromNode(sizes[0])) + else: + docinfo['numPages'] = 0 + + # TODO: produce and keep list of image names and numbers + + return docinfo + + + def getIndexMeta(self, url): + """returns dom of index.meta document at url""" + dom = None + metaUrl = None + if url.startswith("http://"): + # real URL + metaUrl = url + else: + # online path + server=self.digilibBaseUrl+"/servlet/Texter?fn=" + metaUrl=server+url.replace("/mpiwg/online","") + if not metaUrl.endswith("index.meta"): + metaUrl += "/index.meta" + + logging.debug("(getIndexMeta): METAURL: %s"%metaUrl) + txt=getHttpData(metaUrl) + if txt is None: + raise IOError("Unable to read index meta from %s"%(url)) + + dom = Parse(txt) + return dom + + def getPresentationInfoXML(self, url): + """returns dom of info.xml document at url""" + dom = None + metaUrl = None + if url.startswith("http://"): + # real URL + metaUrl = url + else: + # online path + server=self.digilibBaseUrl+"/servlet/Texter?fn=" + metaUrl=server+url.replace("/mpiwg/online","") + + txt=getHttpData(metaUrl) + if txt is None: + raise IOError("Unable to read infoXMLfrom %s"%(url)) + + dom = Parse(txt) + return dom + + + def getAuthinfoFromIndexMeta(self,path,docinfo=None,dom=None,cut=0): + """gets authorization info from the index.meta file at path or given by dom""" + logging.debug("documentViewer (getauthinfofromindexmeta) path: %s"%(path)) - next=node.parentNode - - #sammle s - while next and (next!=endNode.parentNode): - list.append(next) - next=next.nextSibling - list.append(endNode.parentNode) - - if p==e:# beide im selben paragraphen + access = None - else: - next=p - while next!=e: - print next,e - list.append(next) - next=next.nextSibling + if docinfo is None: + docinfo = {} - for x in list: - PrettyPrint(x) - - return list + if dom is None: + for x in range(cut): + path=getParentDir(path) + dom = self.getIndexMeta(path) + + acctype = dom.xpath("//access-conditions/access/@type") + if acctype and (len(acctype)>0): + access=acctype[0].value + if access in ['group', 'institution']: + access = getTextFromNode(dom.xpath("//access-conditions/access/name")[0]).lower() + + docinfo['accessType'] = access + return docinfo - def image(self,mode,url,pn): - """give image out""" - if mode=="texttool": #index.meta with texttool information - (viewerUrl,imagepath,textpath)=parseUrlTextTool(url) - url=viewerUrl+"pn=%s&fn=%s"%(pn,imagepath[0]) - ret="""