from OFS.Folder import Folder from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate from Products.PageTemplates.PageTemplateFile import PageTemplateFile from App.ImageFile import ImageFile from AccessControl import ClassSecurityInfo from AccessControl import getSecurityManager import xml.etree.ElementTree as ET import os import urllib import logging import math import urlparse import json from Products.MetaDataProvider import MetaDataFolder from SrvTxtUtils import getInt, utf8ify, getText, getHttpData, refreshingImageFileIndexHtml, sslifyUrl INDEXMETA_NS="http://md.mpiwg-berlin.mpg.de/ns/indexMeta#" def removeINDEXMETA_NS(root): #entfernt den namespace von indexmeta aus dem dom #TODO evertyhing should be changed that it can deal with NS for elem in root.getiterator(): print ("ETAG") print(elem.tag) if not hasattr(elem.tag, 'find'): continue # (1) i = elem.tag.find('{%s}'%INDEXMETA_NS) if i >= 0: elem.tag = elem.tag[i+len(('{%s}'%INDEXMETA_NS)):] print(elem.tag) def getMDText(node): """returns the @text content from the MetaDataProvider metadata node""" if isinstance(node, dict): return node.get('@text', None) if isinstance(node,list): #more than one text file if there is an attribute don't choose it for nodeInList in node: attr = nodeInList.get("@attr",None) if attr is None: return node.get('@text',None) return None return node def getParentPath(path, cnt=1): """returns pathname shortened by cnt""" # make sure path doesn't end with / path = path.rstrip('/') # split by /, shorten, and reassemble return '/'.join(path.split('/')[0:-cnt]) def getPnForPf(docinfo, pf, default=0): """returns image number for image file name or default""" if 'imgFileNames' in docinfo: pn = docinfo['imgFileNames'].get(pf, None) if pn is None: # try to cut extension xi = pf.rfind('.') if xi > 0: pf = pf[:xi] # try again, else return 0 pn = docinfo['imgFileNames'].get(pf, default) else: # no extension pn = default return pn return default def getPfForPn(docinfo, pn, default=None): """returns image file name for image number or default""" if 'imgFileIndexes' in docinfo: pn = docinfo['imgFileIndexes'].get(pn, default) return pn return default ## ## documentViewer class ## class documentViewer(Folder): """document viewer""" meta_type="Document viewer" security=ClassSecurityInfo() manage_options=Folder.manage_options+( {'label':'Configuration','action':'changeDocumentViewerForm'}, ) metadataService = None """MetaDataFolder instance""" # # templates and forms # # viewMode templates viewer_text = PageTemplateFile('zpt/viewer/viewer_text', globals()) viewer_hocr = PageTemplateFile('zpt/viewer/viewer_hocr', globals()) viewer_xml = PageTemplateFile('zpt/viewer/viewer_xml', globals()) viewer_image = PageTemplateFile('zpt/viewer/viewer_image', globals()) viewer_index = PageTemplateFile('zpt/viewer/viewer_index', globals()) viewer_thumbs = PageTemplateFile('zpt/viewer/viewer_thumbs', globals()) viewer_indexonly = PageTemplateFile('zpt/viewer/viewer_indexonly', globals()) viewer_text_image = PageTemplateFile('zpt/viewer/viewer_text_image', globals()) # available layer types (annotator not default) builtinLayers = {'text': ['dict','search','gis'], 'xml': None, 'image': None, 'index': ['extended'],'text_image': ['dict'],} availableLayers = builtinLayers; # layer templates layer_text_dict = PageTemplateFile('zpt/viewer/layer_text_dict', globals()) layer_text_image_dict = PageTemplateFile('zpt/viewer/layer_text_image_dict', globals()) layer_text_search = PageTemplateFile('zpt/viewer/layer_text_search', globals()) layer_text_annotator = PageTemplateFile('zpt/viewer/layer_text_annotator', globals()) layer_text_gis = PageTemplateFile('zpt/viewer/layer_text_gis', globals()) layer_text_pundit = PageTemplateFile('zpt/viewer/layer_text_pundit', globals()) layer_image_annotator = PageTemplateFile('zpt/viewer/layer_image_annotator', globals()) layer_image_search = PageTemplateFile('zpt/viewer/layer_image_search', globals()) layer_index_extended = PageTemplateFile('zpt/viewer/layer_index_extended', globals()) # toc templates toc_thumbs = PageTemplateFile('zpt/viewer/toc_thumbs', globals()) toc_text = PageTemplateFile('zpt/viewer/toc_text', globals()) toc_figures = PageTemplateFile('zpt/viewer/toc_figures', globals()) toc_concordance = PageTemplateFile('zpt/viewer/toc_concordance', globals()) toc_notes = PageTemplateFile('zpt/viewer/toc_notes', globals()) toc_handwritten = PageTemplateFile('zpt/viewer/toc_handwritten', globals()) toc_none = PageTemplateFile('zpt/viewer/toc_none', globals()) # other templates common_template = PageTemplateFile('zpt/viewer/common_template', globals()) info_xml = PageTemplateFile('zpt/viewer/info_xml', globals()) docuviewer_css = ImageFile('css/docuviewer.css',globals()) # make docuviewer_css refreshable for development docuviewer_css.index_html = refreshingImageFileIndexHtml docuviewer_ie_css = ImageFile('css/docuviewer_ie.css',globals()) # make docuviewer_ie_css refreshable for development #docuviewer_ie_css.index_html = refreshingImageFileIndexHtml jquery_js = ImageFile('js/jquery.js',globals()) def __init__(self,id,imageScalerUrl=None,textServerName=None,title="",digilibBaseUrl=None,thumbcols=2,thumbrows=5,authgroups="mpiwg"): """init document viewer""" self.id=id self.title=title self.thumbcols = thumbcols self.thumbrows = thumbrows # authgroups is list of authorized groups (delimited by ,) self.authgroups = [s.strip().lower() for s in authgroups.split(',')] # create template folder so we can always use template.something templateFolder = Folder('template') self['template'] = templateFolder # Zope-2.12 style #self._setObject('template',templateFolder) # old style try: import MpdlXmlTextServer textServer = MpdlXmlTextServer.MpdlXmlTextServer(id='fulltextclient',serverName=textServerName) templateFolder['fulltextclient'] = textServer #templateFolder._setObject('fulltextclient',textServer) except Exception, e: logging.error("Unable to create MpdlXmlTextServer for fulltextclient: "+str(e)) try: from Products.zogiLib.zogiLib import zogiLib zogilib = zogiLib(id="zogilib", title="zogilib for docuviewer", dlServerURL=imageScalerUrl, layout="book") templateFolder['zogilib'] = zogilib #templateFolder._setObject('zogilib',zogilib) except Exception, e: logging.error("Unable to create zogiLib for 'zogilib': "+str(e)) try: # assume MetaDataFolder instance is called metadata self.metadataService = getattr(self, 'metadata') except Exception, e: logging.error("Unable to find MetaDataFolder 'metadata': "+str(e)) if digilibBaseUrl is not None: self.digilibBaseUrl = digilibBaseUrl self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler' self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html' # proxy text server methods to fulltextclient def getTextPage(self, **args): """returns full text content of page""" return self.template.fulltextclient.getTextPage(**args) def getSearchResults(self, **args): """loads list of search results and stores XML in docinfo""" return self.template.fulltextclient.getSearchResults(**args) def getResultsPage(self, **args): """returns one page of the search results""" return self.template.fulltextclient.getResultsPage(**args) def getTextInfo(self, **args): """returns document info from the text server""" return self.template.fulltextclient.getTextInfo(**args) def getToc(self, **args): """loads table of contents and stores XML in docinfo""" return self.template.fulltextclient.getToc(**args) def getTocPage(self, **args): """returns one page of the table of contents""" return self.template.fulltextclient.getTocPage(**args) def getRepositoryType(self, **args): """get repository type""" return self.template.fulltextclient.getRepositoryType(**args) def getTextDownloadUrl(self, **args): """get URL to download the full text""" return self.template.fulltextclient.getTextDownloadUrl(**args) def getPlacesOnPage(self, **args): """get list of gis places on one page""" return self.template.fulltextclient.getPlacesOnPage(**args) # Thumb list for CoolIris Plugin thumbs_main_rss = PageTemplateFile('zpt/thumbs_main_rss', globals()) security.declareProtected('View','thumbs_rss') def thumbs_rss(self,mode,url,viewMode="auto",start=None,pn=1): ''' view it @param mode: defines how to access the document behind url @param url: url which contains display information @param viewMode: image: display images, text: display text, default is auto (try text, else image) ''' if not hasattr(self, 'template'): # this won't work logging.error("template folder missing!") return "ERROR: template folder missing!" if not self.digilibBaseUrl: self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary" docinfo = self.getDocinfo(mode=mode,url=url) #pageinfo = self.getPageinfo(start=start,current=pn,docinfo=docinfo) pageinfo = self.getPageinfo(start=start,pn=pn, docinfo=docinfo) ''' ZDES ''' pt = getattr(self.template, 'thumbs_main_rss') if viewMode=="auto": # automodus gewaehlt if docinfo.has_key("textURL") or docinfo.get('textURLPath',None): #texturl gesetzt und textViewer konfiguriert viewMode="text" else: viewMode="image" return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode) security.declareProtected('View','index_html') def index_html(self, url, mode="texttool", viewMode="auto", viewLayer=None, tocMode=None, start=None, pn=None, pf=None): """ show page @param url: url which contains display information @param mode: defines how to access the document behind url @param viewMode: 'image': display images, 'text': display text, 'xml': display xml, default is 'auto', 'hocr' : hocr format @param viewLayer: sub-type of viewMode, e.g. layer 'dict' for viewMode='text' @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none) """ logging.debug("documentViewer(index_html) mode=%s url=%s viewMode=%s viewLayer=%s start=%s pn=%s pf=%s"%(mode,url,viewMode,viewLayer,start,pn,pf)) if not hasattr(self, 'template'): # this won't work logging.error("template folder missing!") return "ERROR: template folder missing!" if not getattr(self, 'digilibBaseUrl', None): self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary" # mode=filepath should not have toc-thumbs if tocMode is None: if mode == "filepath": tocMode = "none" else: tocMode = "thumbs" # docinfo: information about document (cached) docinfo = self.getDocinfo(mode=mode,url=url,tocMode=tocMode) # userinfo: user settings (cached) userinfo = self.getUserinfo() # auto viewMode: text if there is a text else images if viewMode=="auto": if docinfo.get('textURLPath', None): # docinfo.get('textURL', None) not implemented yet viewMode = "text" if viewLayer is None and 'viewLayer' not in userinfo: # use layer dict as default viewLayer = "dict" else: viewMode = "image" elif viewMode == "text_dict": # legacy fix viewMode = "text" viewLayer = "dict" elif viewMode == 'images': # legacy fix viewMode = 'image' self.REQUEST['viewMode'] = 'image' # safe viewLayer in userinfo userinfo['viewLayer'] = viewLayer # pageinfo: information about page (not cached) pageinfo = self.getPageinfo(start=start, pn=pn, pf=pf, docinfo=docinfo, userinfo=userinfo, viewMode=viewMode, viewLayer=viewLayer, tocMode=tocMode) # get template /template/viewer_$viewMode pt = getattr(self.template, 'viewer_%s'%viewMode, None) if pt is None: logging.error("No template for viewMode=%s!"%viewMode) # TODO: error page? return "No template for viewMode=%s!"%viewMode # and execute with parameters return pt(docinfo=docinfo, pageinfo=pageinfo) def getAvailableLayers(self): """returns dict with list of available layers per viewMode""" return self.availableLayers def findDigilibUrl(self): """try to get the digilib URL from zogilib""" url = self.template.zogilib.getDLBaseUrl() return url def getScalerUrl(self, fn=None, pn=None, dw=100, dh=100, docinfo=None): """returns URL to digilib Scaler with params""" url = None if docinfo is not None: url = docinfo.get('imageURL', None) if url is None: url = self.digilibScalerUrl if fn is None and docinfo is not None: fn = docinfo.get('imagePath','') url += "fn=%s"%fn if pn: url += "&pn=%s"%pn url += "&dw=%s&dh=%s"%(dw,dh) return sslifyUrl(url, self, force=True) def getDocumentViewerURL(self): """returns the URL of this instance""" return self.absolute_url() def getStyle(self, idx, selected, style=""): """returns a string with the given style and append 'sel' if idx == selected.""" #logger("documentViewer (getstyle)", logging.INFO, "idx: %s selected: %s style: %s"%(idx,selected,style)) if idx == selected: return style + 'sel' else: return style def getParams(self, param=None, val=None, params=None, duplicates=None): """returns dict with URL parameters. Takes URL parameters and additionally param=val or dict params. Deletes key if value is None.""" # copy existing request params newParams=self.REQUEST.form.copy() # change single param if param is not None: if val is None: if newParams.has_key(param): del newParams[param] else: newParams[param] = str(val) # change more params if params is not None: for (k, v) in params.items(): if v is None: # val=None removes param if newParams.has_key(k): del newParams[k] else: newParams[k] = v if duplicates: # eliminate lists (coming from duplicate keys) for (k,v) in newParams.items(): if isinstance(v, list): if duplicates == 'comma': # make comma-separated list of non-empty entries newParams[k] = ','.join([t for t in v if t]) elif duplicates == 'first': # take first non-empty entry newParams[k] = [t for t in v if t][0] return newParams def getLink(self, param=None, val=None, params=None, baseUrl=None, paramSep='&', duplicates='comma'): """returns URL to documentviewer with parameter param set to val or from dict params""" urlParams = self.getParams(param=param, val=val, params=params, duplicates=duplicates) # quote values and assemble into query string (not escaping '/') ps = paramSep.join(["%s=%s"%(k, urllib.quote_plus(utf8ify(v), '/')) for (k, v) in urlParams.items()]) if baseUrl is None: baseUrl = self.getDocumentViewerURL() url = "%s?%s"%(baseUrl, ps) return url def getLinkAmp(self, param=None, val=None, params=None, baseUrl=None, duplicates='comma'): """link to documentviewer with parameter param set to val""" return self.getLink(param=param, val=val, params=params, baseUrl=baseUrl, paramSep='&', duplicates=duplicates) def setAvailableLayers(self, newLayerString=None): """sets availableLayers to newLayerString or tries to autodetect available layers. assumes layer templates have the form layer_{m}_{l} for layer l in mode m. newLayerString is parsed as JSON.""" if newLayerString is not None: try: layers = json.loads(newLayerString) if 'text' in layers and 'image' in layers: self.availableLayers = layers return except: pass logging.error("invalid layers=%s! autodetecting..."%repr(newLayerString)) # start with builtin layers self.availableLayers = self.builtinLayers.copy() # add layers from templates for t in self.template: if t.startswith('layer_'): try: (x, m, l) = t.split('_', 3) if m not in self.availableLayers: # mode m doesn't exist -> new list self.availableLayers[m] = [l] else: # m exists -> append if l not in self.availableLayers[m]: self.availableLayers[m].append() except: pass def getAvailableLayersJson(self): """returns available layers as JSON string.""" return json.dumps(self.availableLayers) def getInfo_xml(self,url,mode): """returns info about the document as XML""" if not self.digilibBaseUrl: self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary" docinfo = self.getDocinfo(mode=mode,url=url) pt = getattr(self.template, 'info_xml') return pt(docinfo=docinfo) def getAuthenticatedUser(self, anon=None): """returns the authenticated user object or None. (ignores Zopes anonymous user)""" user = getSecurityManager().getUser() if user is not None and user.getUserName() != "Anonymous User": return user else: return anon def isAccessible(self, docinfo): """returns if access to the resource is granted""" access = docinfo.get('accessType', None) logging.debug("documentViewer (accessOK) access type %s"%access) if access == 'free': logging.debug("documentViewer (accessOK) access is free") return True elif access is None or access in self.authgroups: # only local access -- only logged in users user = self.getAuthenticatedUser() logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr())) return (user is not None) logging.error("documentViewer (accessOK) unknown access type %s"%access) return False def getUserinfo(self): """returns userinfo object""" logging.debug("getUserinfo") userinfo = {} # look for cached userinfo in session if self.REQUEST.SESSION.has_key('userinfo'): userinfo = self.REQUEST.SESSION['userinfo'] # check if its still current? else: # store in session self.REQUEST.SESSION['userinfo'] = userinfo return userinfo def getDocinfoJSON(self, mode, url, tocMode=None): """returns docinfo depending on mode""" import json dc = self.getDocinfo( mode, url, tocMode) return json.dumps(dc) def getDocinfo(self, mode, url, tocMode=None): """returns docinfo depending on mode""" logging.debug("getDocinfo: mode=%s, url=%s"%(mode,url)) # look for cached docinfo in session if self.REQUEST.SESSION.has_key('docinfo'): docinfo = self.REQUEST.SESSION['docinfo'] # check if its still current if docinfo is not None and docinfo.get('mode', None) == mode and docinfo.get('url', None) == url: logging.debug("getDocinfo: docinfo in session. keys=%s"%docinfo.keys()) return docinfo # new docinfo docinfo = {'mode': mode, 'url': url} # add self url docinfo['viewerUrl'] = self.getDocumentViewerURL() docinfo['digilibBaseUrl'] = sslifyUrl(self.digilibBaseUrl, self, force=True) docinfo['digilibScalerUrl'] = sslifyUrl(self.digilibScalerUrl, self, force=True) docinfo['digilibViewerUrl'] = sslifyUrl(self.digilibViewerUrl, self, force=True) # get index.meta DOM docUrl = None metaDom = None if mode=="texttool": # url points to document dir or index.meta metaDom = self.metadataService.getDomFromPathOrUrl(url) removeINDEXMETA_NS(metaDom) if metaDom is None: raise IOError("Unable to find index.meta for mode=texttool!") docUrl = url.replace('/index.meta', '') if url.startswith('/mpiwg/online/'): docUrl = url.replace('/mpiwg/online/', '', 1) elif mode=="textpath": #url points to an textfile #index.meta optional #assume index.meta in parent dir docUrl = getParentPath(url) docinfo['viewmode'] = "text" try: metaDom = self.metadataService.getDomFromPathOrUrl(docUrl) removeINDEXMETA_NS(metaDom) except: metaDom = None #metaDom = self.metadataService.getDomFromPathOrUrl(docUrl) #docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1) docinfo['textURLPath'] = url.replace('/mpiwg/online', '', 1) docinfo['textURL'] = url if docinfo.get("creator", None) is None: docinfo['creator'] = "" if docinfo.get("title", None) is None: docinfo['title'] = "" if docinfo.get("documentPath", None) is None: docinfo['documentPath'] = url.replace('/mpiwg/online', '', 1) docinfo['documentPath'] = url.replace('/pages', '', 1) docinfo['numPages'] = 1 elif mode=="imagepath": # url points to folder with images, index.meta optional # asssume index.meta in parent dir docUrl = getParentPath(url) metaDom = self.metadataService.getDomFromPathOrUrl(docUrl) docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1) elif mode=="hocr": # url points to folder with images, index.meta optional # asssume index.meta in parent dir docUrl = getParentPath(url) metaDom = self.metadataService.getDomFromPathOrUrl(docUrl) docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1) docinfo['textURLPath'] = url.replace('/mpiwg/online', '', 1) if docinfo.get("creator", None) is None: docinfo['creator'] = "" if docinfo.get("title", None) is None: docinfo['title'] = "" if docinfo.get("documentPath", None) is None: docinfo['documentPath'] = url.replace('/mpiwg/online', '', 1) docinfo['documentPath'] = url.replace('/pages', '', 1) elif mode=="filepath": # url points to image file, index.meta optional docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, url) docinfo['numPages'] = 1 # asssume index.meta is two path segments up docUrl = getParentPath(url, 2) metaDom = self.metadataService.getDomFromPathOrUrl(docUrl) else: logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode) raise ValueError("Unknown mode %s! Has to be one of 'texttool','imagepath','filepath'."%(mode)) docinfo['documentUrl'] = docUrl # process index.meta contents if metaDom is not None and (metaDom.tag == 'resource' or metaDom.tag == "{%s}resource"%INDEXMETA_NS): # document directory name and path resource = self.metadataService.getResourceData(dom=metaDom, recursive=1) if resource: docinfo = self.getDocinfoFromResource(docinfo, resource) # texttool info texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True) if texttool: docinfo = self.getDocinfoFromTexttool(docinfo, texttool) # document info from full text server if docinfo.get('textURLPath', None): docinfo = self.getTextInfo(mode=None, docinfo=docinfo) # include list of pages TODO: do we need this always? docinfo = self.getTextInfo(mode='pages', docinfo=docinfo) # bib info bib = self.metadataService.getBibData(dom=metaDom) if bib: # save extended version as 'bibx' TODO: ugly bibx = self.metadataService.getBibData(dom=metaDom, all=True, recursive=1) if len(bibx) == 1: # unwrap list if possible bibx = bibx[0] docinfo['bibx'] = bibx docinfo = self.getDocinfoFromBib(docinfo, bib, bibx) else: # no bib - try info.xml docinfo = self.getDocinfoFromPresentationInfoXml(docinfo) # auth info access = self.metadataService.getAccessData(dom=metaDom) if access: docinfo = self.getDocinfoFromAccess(docinfo, access) # attribution info attribution = self.metadataService.getAttributionData(dom=metaDom) if attribution: logging.debug("getDocinfo: attribution=%s"%repr(attribution)) docinfo['attribution'] = attribution # copyright info copyright = self.metadataService.getCopyrightData(dom=metaDom) if copyright: logging.debug("getDocinfo: copyright=%s"%repr(copyright)) docinfo['copyright'] = copyright # DRI (permanent ID) dri = self.metadataService.getDRI(dom=metaDom, type='mpiwg') if dri: docinfo['DRI'] = dri # (presentation) context ctx = self.metadataService.getContextData(dom=metaDom, all=True) if ctx: logging.debug("getcontext: ctx=%s"%repr(ctx)) docinfo['presentationContext'] = ctx # image path if mode != 'texttool': # override image path from texttool with url parameter TODO: how about mode=auto? docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1) # check numPages if docinfo.get('numPages', 0) == 0: # number of images from digilib if docinfo.get('imagePath', None): imgpath = docinfo['imagePath'].replace('/mpiwg/online', '', 1) logging.debug("imgpath=%s"%imgpath) docinfo['imageURL'] = sslifyUrl("%s?fn=%s"%(self.digilibScalerUrl, imgpath), self, force=True) docinfo = self.getDocinfoFromDigilib(docinfo, imgpath) else: # imagePath still missing? try "./pageimg" imgPath = os.path.join(docUrl, 'pageimg') docinfo = self.getDocinfoFromDigilib(docinfo, imgPath) if docinfo.get('numPages', 0) > 0: # there are pages docinfo['imagePath'] = imgPath docinfo['imageURL'] = sslifyUrl("%s?fn=%s"%(self.digilibScalerUrl, docinfo['imagePath']), self, force=True) # check numPages if docinfo.get('numPages', 0) == 0: if docinfo.get('numTextPages', 0) > 0: # replace with numTextPages (text-only?) docinfo['numPages'] = docinfo['numTextPages'] # min and max page no docinfo['minPageNo'] = docinfo.get('minPageNo', 1) docinfo['maxPageNo'] = docinfo.get('maxPageNo', docinfo['numPages']) # part-of information partOfPath = docinfo.get('partOfPath', None) if partOfPath is not None: partOfDom = self.metadataService.getDomFromPathOrUrl(partOfPath) if partOfDom is not None: docinfo['partOfLabel'] = self.metadataService.getBibFormattedLabel(dom=partOfDom) docinfo['partOfUrl'] = "%s?url=%s"%(self.getDocumentViewerURL(), partOfPath) logging.debug("partOfLabel=%s partOfUrl=%s"%(docinfo['partOfLabel'],docinfo['partOfUrl'])) # normalize path if 'imagePath' in docinfo and not docinfo['imagePath'].startswith('/'): docinfo['imagePath'] = '/' + docinfo['imagePath'] logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys()) # store in session self.REQUEST.SESSION['docinfo'] = docinfo return docinfo def getDocinfoFromResource(self, docinfo, resource): """reads contents of resource element into docinfo""" logging.debug("getDocinfoFromResource: resource=%s"%(repr(resource))) docName = getMDText(resource.get('name', None)) docinfo['documentName'] = docName docPath = getMDText(resource.get('archive-path', None)) if docPath: # clean up document path if docPath[0] != '/': docPath = '/' + docPath if docName and (not docPath.endswith(docName)): docPath += "/" + docName else: # use docUrl as docPath docUrl = docinfo['documentURL'] if not docUrl.startswith('http:'): docPath = docUrl if docPath: # fix URLs starting with /mpiwg/online docPath = docPath.replace('/mpiwg/online', '', 1) docinfo['documentPath'] = docPath # is this part-of? partOf = resource.get('is-part-of', None) if partOf is not None: partOf = getMDText(partOf.get('archive-path', None)) if partOf is not None: docinfo['partOfPath'] = partOf.strip() return docinfo def getDocinfoFromTexttool(self, docinfo, texttool): """reads contents of texttool element into docinfo""" logging.debug("texttool=%s"%repr(texttool)) # unpack list if necessary if isinstance(texttool, list): texttool = texttool[0] # image dir imageDir = getMDText(texttool.get('image', None)) docPath = getMDText(docinfo.get('documentPath', None)) if imageDir: if imageDir.startswith('/'): # absolute path imageDir = imageDir.replace('/mpiwg/online', '', 1) docinfo['imagePath'] = imageDir elif docPath: # relative path imageDir = os.path.join(docPath, imageDir) imageDir = imageDir.replace('/mpiwg/online', '', 1) docinfo['imagePath'] = imageDir # start and end page (for subdocuments of other documents) imgStartNo = getMDText(texttool.get('image-start-no', None)) minPageNo = getInt(imgStartNo, 1) docinfo['minPageNo'] = minPageNo imgEndNo = getMDText(texttool.get('image-end-no', None)) if imgEndNo: docinfo['maxPageNo'] = getInt(imgEndNo) # old style text URL textUrl = getMDText(texttool.get('text', None)) if textUrl and docPath: if urlparse.urlparse(textUrl)[0] == "": #keine url textUrl = os.path.join(docPath, textUrl) docinfo['textURL'] = textUrl # new style text-url-path (can be more than one with "repository" attribute) textUrlNode = texttool.get('text-url-path', None) if not isinstance(textUrlNode, list): textUrlNode = [textUrlNode] for tun in textUrlNode: textUrl = getMDText(tun) if textUrl: textUrlAtts = tun.get('@attr') if (textUrlAtts and 'repository' in textUrlAtts): textRepo = textUrlAtts['repository'] # use matching repository if self.getRepositoryType() == textRepo: docinfo['textURLPath'] = textUrl docinfo['textURLRepository'] = textRepo break else: # no repo attribute - use always docinfo['textURLPath'] = textUrl # page flow docinfo['pageFlow'] = getMDText(texttool.get('page-flow', 'ltr')) # odd pages are left docinfo['oddPage'] = getMDText(texttool.get('odd-scan-position', 'left')) # number of title page (default 1) docinfo['titlePage'] = getMDText(texttool.get('title-scan-no', minPageNo)) # old presentation stuff presentation = getMDText(texttool.get('presentation', None)) if presentation and docPath: if presentation.startswith('http:'): docinfo['presentationUrl'] = presentation else: docinfo['presentationUrl'] = os.path.join(docPath, presentation) # make sure we have at least fake DC data if 'creator' not in docinfo: docinfo['creator'] = '[no author found]' if 'title' not in docinfo: docinfo['title'] = '[no title found]' if 'date' not in docinfo: docinfo['date'] = '[no date found]' return docinfo def getDocinfoFromBib(self, docinfo, bib, bibx=None): """reads contents of bib element into docinfo""" logging.debug("getDocinfoFromBib bib=%s"%repr(bib)) # put all raw bib fields in dict "bib" docinfo['bib'] = bib bibtype = bib.get('@type', None) docinfo['bibType'] = bibtype # also store DC metadata for convenience dc = self.metadataService.getDCMappedData(bib) docinfo['creator'] = dc.get('creator','') docinfo['title'] = dc.get('title','') docinfo['date'] = dc.get('date','') return docinfo def getDocinfoFromAccess(self, docinfo, acc): """reads contents of access element into docinfo""" #TODO: also read resource type logging.debug("getDocinfoFromAccess acc=%s"%repr(acc)) try: acctype = acc['@attr']['type'] if acctype: access=acctype if access in ['group', 'institution']: access = acc['name'].lower() docinfo['accessType'] = access except: pass return docinfo def getDocinfoFromDigilib(self, docinfo, path): infoUrl=self.digilibBaseUrl+"/api/dirInfo-xml.jsp?fn="+path # fetch data txt = getHttpData(infoUrl) if not txt: logging.error("Unable to get dir-info from %s"%(infoUrl)) return docinfo dom = ET.fromstring(txt) dir = dom # save size size = dir.findtext('size') logging.debug("getDocinfoFromDigilib: size=%s"%size) if size: docinfo['numPages'] = int(size) else: docinfo['numPages'] = 0 return docinfo # save list of image names and numbers imgNames = {} imgIndexes = {} for f in dir: fn = f.findtext('name') pn = getInt(f.findtext('index')) imgNames[fn] = pn imgIndexes[pn] = fn docinfo['imgFileNames'] = imgNames docinfo['imgFileIndexes'] = imgIndexes return docinfo def getDocinfoFromPresentationInfoXml(self,docinfo): """gets DC-like bibliographical information from the presentation entry in texttools""" url = docinfo.get('presentationUrl', None) if not url: logging.error("getDocinfoFromPresentation: no URL!") return docinfo dom = None metaUrl = None if url.startswith("http://"): # real URL metaUrl = url else: # online path server=self.digilibBaseUrl+"/servlet/Texter?fn=" metaUrl=server+url txt=getHttpData(metaUrl) if txt is None: logging.error("Unable to read info.xml from %s"%(url)) return docinfo dom = ET.fromstring(txt) docinfo['creator']=getText(dom.find(".//author")) docinfo['title']=getText(dom.find(".//title")) docinfo['date']=getText(dom.find(".//date")) return docinfo def getPageinfo(self, pn=None, pf=None, start=None, rows=None, cols=None, docinfo=None, userinfo=None, viewMode=None, viewLayer=None, tocMode=None): """returns pageinfo with the given parameters""" logging.debug("getPageInfo(pn=%s, pf=%s, start=%s, rows=%s, cols=%s, viewMode=%s, viewLayer=%s, tocMode=%s)"%(pn,pf,start,rows,cols,viewMode,viewLayer,tocMode)) pageinfo = {} pageinfo['viewMode'] = viewMode # split viewLayer if necessary if isinstance(viewLayer,basestring): viewLayer = viewLayer.split(',') if isinstance(viewLayer, list): logging.debug("getPageinfo: viewLayer is list:%s"%viewLayer) # save (unique) list in viewLayers seen = set() viewLayers = [l for l in viewLayer if l and l not in seen and not seen.add(l)] pageinfo['viewLayers'] = viewLayers # stringify viewLayer viewLayer = ','.join(viewLayers) else: #create list pageinfo['viewLayers'] = [viewLayer] pageinfo['viewLayer'] = viewLayer pageinfo['tocMode'] = tocMode minPageNo = docinfo.get('minPageNo', 1) # pf takes precedence over pn if pf: pageinfo['pf'] = pf pn = getPnForPf(docinfo, pf) # replace pf in request params (used for creating new URLs) self.REQUEST.form.pop('pf', None) self.REQUEST.form['pn'] = pn else: pn = getInt(pn, minPageNo) pf = getPfForPn(docinfo, pn) pageinfo['pf'] = pf pageinfo['pn'] = pn rows = int(rows or self.thumbrows) pageinfo['rows'] = rows cols = int(cols or self.thumbcols) pageinfo['cols'] = cols grpsize = cols * rows pageinfo['groupsize'] = grpsize # if start is empty use one around pn grouppn = math.ceil(float(pn)/float(grpsize))*grpsize-(grpsize-1) # but not smaller than minPageNo start = getInt(start, max(grouppn, minPageNo)) pageinfo['start'] = start # get number of pages numPages = int(docinfo.get('numPages', 0)) if numPages == 0: # try numTextPages numPages = docinfo.get('numTextPages', 0) if numPages != 0: docinfo['numPages'] = numPages maxPageNo = docinfo.get('maxPageNo', numPages) logging.debug("minPageNo=%s maxPageNo=%s start=%s numPages=%s"%(minPageNo,maxPageNo,start,numPages)) np = maxPageNo # cache table of contents pageinfo['tocPageSize'] = getInt(self.REQUEST.get('tocPageSize', 30)) pageinfo['numgroups'] = int(np / grpsize) if np % grpsize > 0: pageinfo['numgroups'] += 1 pageFlowLtr = docinfo.get('pageFlow', 'ltr') != 'rtl' oddScanLeft = docinfo.get('oddPage', 'left') != 'right' # add zeroth page for two columns pageZero = (cols == 2 and (pageFlowLtr != oddScanLeft)) pageinfo['pageZero'] = pageZero pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=minPageNo, maxIdx=np) # more page parameters #pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg') #becuase it is buggy this currently disabled and set to orig. pageinfo['characterNormalization'] = 'orig' if docinfo.get('pageNumbers'): # get original page numbers pageNumber = docinfo['pageNumbers'].get(pn, None) if pageNumber is not None: pageinfo['pageNumberOrig'] = pageNumber['no'] pageinfo['pageNumberOrigNorm'] = pageNumber['non'] # cache search results query = self.REQUEST.get('query',None) pageinfo['query'] = query if query and viewMode == 'text': pageinfo['resultPageSize'] = getInt(self.REQUEST.get('resultPageSize', 10)) queryType = self.REQUEST.get('queryType', 'fulltextMorph') pageinfo['queryType'] = queryType pageinfo['resultStart'] = getInt(self.REQUEST.get('resultStart', '1')) self.getSearchResults(mode=queryType, query=query, pageinfo=pageinfo, docinfo=docinfo) # highlighting highlightQuery = self.REQUEST.get('highlightQuery', None) if highlightQuery: pageinfo['highlightQuery'] = highlightQuery pageinfo['highlightElement'] = self.REQUEST.get('highlightElement', '') pageinfo['highlightElementPos'] = self.REQUEST.get('highlightElementPos', '') return pageinfo def getPageBatch(self, start=1, rows=10, cols=2, pageFlowLtr=True, pageZero=False, minIdx=1, maxIdx=0): """Return dict with array of page information for one screenfull of thumbnails. :param start: index of current page :param rows: number of rows in one batch :param cols: number of columns in one batch :param pageFlowLtr: do indexes increase from left to right :param pageZero: is there a zeroth non-visible page :param minIdx: minimum index to use :param maxIdx: maximum index to use :returns: dict with first: first page index last: last page index batches: list of all possible batches(dict: 'start': index, 'end': index) pages: list for current batch of rows(list of cols(list of pages(dict: 'idx': index))) nextStart: first index of next batch prevStart: first index of previous batch """ logging.debug("getPageBatch start=%s minIdx=%s maxIdx=%s"%(start,minIdx,maxIdx)) batch = {} grpsize = rows * cols if maxIdx == 0: maxIdx = start + grpsize np = maxIdx - minIdx + 1 if pageZero: # correct number of pages for batching np += 1 nb = int(math.ceil(np / float(grpsize))) # list of all batch start and end points batches = [] if pageZero: ofs = minIdx - 1 else: ofs = minIdx for i in range(nb): s = i * grpsize + ofs e = min((i + 1) * grpsize + ofs - 1, maxIdx) batches.append({'start':s, 'end':e}) batch['batches'] = batches # list of pages for current screen pages = [] if pageZero and start == minIdx: # correct beginning idx = minIdx - 1 else: idx = start for r in range(rows): row = [] for c in range(cols): if idx < minIdx or idx > maxIdx: page = {'idx':None} else: page = {'idx':idx} idx += 1 if pageFlowLtr: row.append(page) else: row.insert(0, page) pages.append(row) if start > minIdx: batch['prevStart'] = max(start - grpsize, minIdx) else: batch['prevStart'] = None if start + grpsize <= maxIdx: if pageZero and start == minIdx: # correct nextStart for pageZero batch['nextStart'] = grpsize else: batch['nextStart'] = start + grpsize else: batch['nextStart'] = None batch['pages'] = pages batch['first'] = minIdx batch['last'] = maxIdx logging.debug("batch: %s"%repr(batch)) return batch def getBatch(self, start=1, size=10, end=0, data=None, fullData=True): """returns dict with information for one screenfull of data.""" batch = {} if end == 0: end = start + size nb = int(math.ceil(end / float(size))) # list of all batch start and end points batches = [] for i in range(nb): s = i * size + 1 e = min((i + 1) * size, end) batches.append({'start':s, 'end':e}) batch['batches'] = batches # list of elements in this batch this = [] j = 0 for i in range(start, min(start+size, end+1)): if data: if fullData: d = data.get(i, None) else: d = data.get(j, None) j += 1 else: d = i+1 this.append(d) batch['this'] = this if start > 1: batch['prevStart'] = max(start - size, 1) else: batch['prevStart'] = None if start + size < end: batch['nextStart'] = start + size else: batch['nextStart'] = None batch['first'] = start batch['last'] = end return batch def getAnnotatorGroupsForUser(self, user, annotationServerUrl="http://tuxserve03.mpiwg-berlin.mpg.de/AnnotationManager"): """returns list of groups {name:*, id:*} on the annotation server for the user""" groups = [] # add matching http(s) from our URL annotationServerUrl = sslifyUrl(annotationServerUrl, self) groupsUrl = "%s/annotator/groups?user=%s"%(annotationServerUrl,user) data = getHttpData(url=groupsUrl, noExceptions=True) if data: res = json.loads(data) rows = res.get('rows', None) if rows is None: return groups for r in rows: groups.append({'id': r.get('id', None), 'name': r.get('name', None), 'uri': r.get('uri', None)}) return groups def sslifyUrl(self, url, **args): """returns URL with http or https""" return sslifyUrl(url, **args) security.declareProtected('View management screens','changeDocumentViewerForm') changeDocumentViewerForm = PageTemplateFile('zpt/changeDocumentViewer', globals()) def changeDocumentViewer(self,title="",digilibBaseUrl=None,thumbrows=2,thumbcols=5,authgroups='mpiwg',availableLayers=None,RESPONSE=None): """init document viewer""" self.title=title self.digilibBaseUrl = digilibBaseUrl self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler' self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html' self.thumbrows = thumbrows self.thumbcols = thumbcols self.authgroups = [s.strip().lower() for s in authgroups.split(',')] try: # assume MetaDataFolder instance is called metadata self.metadataService = getattr(self, 'metadata') except Exception, e: logging.error("Unable to find MetaDataFolder 'metadata': "+str(e)) self.setAvailableLayers(availableLayers) if RESPONSE is not None: RESPONSE.redirect('manage_main') def manage_AddDocumentViewerForm(self): """add the viewer form""" pt=PageTemplateFile('zpt/addDocumentViewer', globals()).__of__(self) return pt() def manage_AddDocumentViewer(self,id,imageScalerUrl="",textServerName="",title="",RESPONSE=None): """add the viewer""" newObj=documentViewer(id,imageScalerUrl=imageScalerUrl,title=title,textServerName=textServerName) self._setObject(id,newObj) if RESPONSE is not None: RESPONSE.redirect('manage_main')