view documentViewer.py @ 635:8d460ddb45b7 default tip

update digilib dirInfo-xml path.
author casties
date Fri, 06 May 2016 18:59:35 +0200
parents 618b600c805a
children
line wrap: on
line source

from OFS.Folder import Folder
from Products.PageTemplates.ZopePageTemplate import ZopePageTemplate
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
from App.ImageFile import ImageFile
from AccessControl import ClassSecurityInfo
from AccessControl import getSecurityManager

import xml.etree.ElementTree as ET

import os
import urllib
import logging
import math
import urlparse 
import json

from Products.MetaDataProvider import MetaDataFolder

from SrvTxtUtils import getInt, utf8ify, getText, getHttpData, refreshingImageFileIndexHtml, sslifyUrl
    

INDEXMETA_NS="http://md.mpiwg-berlin.mpg.de/ns/indexMeta#"

def removeINDEXMETA_NS(root):
    """Strips the index.meta namespace from all element tags below root (in place).

    Tags of the form '{INDEXMETA_NS}name' are rewritten to 'name'. Tags in
    other namespaces and nodes whose tag is not a string (it has no 'find'
    method) are left untouched.

    @param root: root element of the DOM; None is accepted and ignored
    @return: None, the DOM is modified in place
    """
    # TODO: everything should be changed so that it can deal with namespaces
    if root is None:
        # callers may pass the result of a failed lookup
        return

    nsPrefix = '{%s}' % INDEXMETA_NS
    # prefer Element.iter(); getiterator() is only kept for old ElementTree versions
    walk = getattr(root, 'iter', None) or root.getiterator
    for elem in walk():
        tag = elem.tag
        if not hasattr(tag, 'find'):
            continue

        i = tag.find(nsPrefix)
        if i >= 0:
            elem.tag = tag[i + len(nsPrefix):]

def getMDText(node):
    """returns the @text content from the MetaDataProvider metadata node.

    @param node: dict with an '@text' entry, a list of such dicts, or any other value
    @return: for a dict its '@text' value (or None); for a list the '@text'
        value of the first entry that has no '@attr' (None if every entry has
        one); any other value is returned unchanged
    """
    if isinstance(node, dict):
        return node.get('@text', None)

    if isinstance(node, list):
        # more than one text entry: don't choose entries that carry an attribute
        for nodeInList in node:
            if nodeInList.get("@attr", None) is None:
                # read the text from the list entry, not from the list itself
                return nodeInList.get('@text', None)

        return None

    return node

def getParentPath(path, cnt=1):
    """returns path with its last cnt segments removed.

    Trailing slashes of path are ignored; the result never ends with '/'.
    """
    segments = path.rstrip('/').split('/')
    # drop the last cnt segments and glue the rest together again
    parentSegments = segments[:-cnt]
    return '/'.join(parentSegments)

def getPnForPf(docinfo, pf, default=0):
    """returns image number for image file name or default.

    Looks up pf in docinfo['imgFileNames']; when that fails the lookup is
    repeated with the file extension cut off. default is returned when
    docinfo has no 'imgFileNames' or neither lookup succeeds.
    """
    if 'imgFileNames' not in docinfo:
        return default

    nameIndex = docinfo['imgFileNames']
    pageNo = nameIndex.get(pf, None)
    if pageNo is not None:
        return pageNo

    # no direct hit: try again without extension
    dotPos = pf.rfind('.')
    if dotPos <= 0:
        # no extension to cut
        return default

    return nameIndex.get(pf[:dotPos], default)

def getPfForPn(docinfo, pn, default=None):
    """returns image file name for image number or default.

    The lookup uses docinfo['imgFileIndexes']; default is returned when that
    entry is missing or does not contain pn.
    """
    if 'imgFileIndexes' not in docinfo:
        return default

    fileIndex = docinfo['imgFileIndexes']
    return fileIndex.get(pn, default)


##
## documentViewer class
##
class documentViewer(Folder):
    """document viewer

    Folder subclass; the class attributes below declare the page templates
    and static files (CSS, JavaScript) used by the viewer.
    """
    meta_type="Document viewer"
    
    security=ClassSecurityInfo()
    # Folder management tabs plus an additional 'Configuration' tab
    manage_options=Folder.manage_options+(
        {'label':'Configuration','action':'changeDocumentViewerForm'},
        )
    
    # class-level default; __init__ tries to replace it with the object named 'metadata'
    metadataService = None
    """MetaDataFolder instance"""
    

    #
    # templates and forms
    #
    # viewMode templates (index_html looks them up as viewer_{viewMode})
    viewer_text = PageTemplateFile('zpt/viewer/viewer_text', globals())
    viewer_hocr = PageTemplateFile('zpt/viewer/viewer_hocr', globals())
    viewer_xml = PageTemplateFile('zpt/viewer/viewer_xml', globals())
    viewer_image = PageTemplateFile('zpt/viewer/viewer_image', globals())
    viewer_index = PageTemplateFile('zpt/viewer/viewer_index', globals())
    viewer_thumbs = PageTemplateFile('zpt/viewer/viewer_thumbs', globals())
    viewer_indexonly = PageTemplateFile('zpt/viewer/viewer_indexonly', globals())
    viewer_text_image = PageTemplateFile('zpt/viewer/viewer_text_image', globals())
    # available layer types (annotator not default)
    # maps viewMode to a list of layer names; None presumably means "no layers" -- TODO confirm
    builtinLayers = {'text': ['dict','search','gis'],
                     'xml': None, 'image': None, 'index': ['extended'],'text_image': ['dict'],}
    # NOTE: this is the same dict object as builtinLayers, not a copy
    availableLayers = builtinLayers;
    # layer templates (named layer_{viewMode}_{layer})
    layer_text_dict = PageTemplateFile('zpt/viewer/layer_text_dict', globals())
    layer_text_image_dict = PageTemplateFile('zpt/viewer/layer_text_image_dict', globals())
    layer_text_search = PageTemplateFile('zpt/viewer/layer_text_search', globals())
    layer_text_annotator = PageTemplateFile('zpt/viewer/layer_text_annotator', globals())
    layer_text_gis = PageTemplateFile('zpt/viewer/layer_text_gis', globals())
    layer_text_pundit = PageTemplateFile('zpt/viewer/layer_text_pundit', globals())
    layer_image_annotator = PageTemplateFile('zpt/viewer/layer_image_annotator', globals())
    layer_image_search = PageTemplateFile('zpt/viewer/layer_image_search', globals())
    layer_index_extended = PageTemplateFile('zpt/viewer/layer_index_extended', globals())
    # toc templates (one per table-of-contents type)
    toc_thumbs = PageTemplateFile('zpt/viewer/toc_thumbs', globals())
    toc_text = PageTemplateFile('zpt/viewer/toc_text', globals())
    toc_figures = PageTemplateFile('zpt/viewer/toc_figures', globals())
    toc_concordance = PageTemplateFile('zpt/viewer/toc_concordance', globals())
    toc_notes = PageTemplateFile('zpt/viewer/toc_notes', globals())
    toc_handwritten = PageTemplateFile('zpt/viewer/toc_handwritten', globals())
    toc_none = PageTemplateFile('zpt/viewer/toc_none', globals())
    # other templates
    common_template = PageTemplateFile('zpt/viewer/common_template', globals())
    info_xml = PageTemplateFile('zpt/viewer/info_xml', globals())
    docuviewer_css = ImageFile('css/docuviewer.css',globals())
    # make docuviewer_css refreshable for development
    docuviewer_css.index_html = refreshingImageFileIndexHtml
    docuviewer_ie_css = ImageFile('css/docuviewer_ie.css',globals())
    # make docuviewer_ie_css refreshable for development (currently disabled)
    #docuviewer_ie_css.index_html = refreshingImageFileIndexHtml
    jquery_js = ImageFile('js/jquery.js',globals())
    
    def __init__(self,id,imageScalerUrl=None,textServerName=None,title="",digilibBaseUrl=None,thumbcols=2,thumbrows=5,authgroups="mpiwg"):
        """init document viewer.

        Creates the sub-folder 'template' and tries to put a MpdlXmlTextServer
        ('fulltextclient') and a zogiLib ('zogilib') into it. Problems while
        creating these helpers or while looking up 'metadata' are logged and
        otherwise ignored.

        @param id: id of the viewer object
        @param imageScalerUrl: passed to zogiLib as dlServerURL
        @param textServerName: passed to MpdlXmlTextServer as serverName
        @param title: title of the viewer object
        @param digilibBaseUrl: base URL of digilib; the digilib URL attributes
            are only set when this is not None
        @param thumbcols: stored as self.thumbcols
        @param thumbrows: stored as self.thumbrows
        @param authgroups: comma-separated list of authorized groups
        """
        self.id = id
        self.title = title
        self.thumbcols = thumbcols
        self.thumbrows = thumbrows
        # authgroups is list of authorized groups (delimited by ,)
        groupNames = []
        for name in authgroups.split(','):
            groupNames.append(name.strip().lower())

        self.authgroups = groupNames

        # create template folder so we can always use template.something
        tplFolder = Folder('template')
        self['template'] = tplFolder # Zope-2.12 style

        # full text server client
        try:
            import MpdlXmlTextServer
            fulltextClient = MpdlXmlTextServer.MpdlXmlTextServer(id='fulltextclient',serverName=textServerName)
            tplFolder['fulltextclient'] = fulltextClient
        except Exception as err:
            logging.error("Unable to create MpdlXmlTextServer for fulltextclient: "+str(err))

        # zogilib for digilib access
        try:
            from Products.zogiLib.zogiLib import zogiLib
            tplFolder['zogilib'] = zogiLib(id="zogilib", title="zogilib for docuviewer", dlServerURL=imageScalerUrl, layout="book")
        except Exception as err:
            logging.error("Unable to create zogiLib for 'zogilib': "+str(err))

        # metadata service
        try:
            # assume MetaDataFolder instance is called metadata
            self.metadataService = self.metadata
        except Exception as err:
            logging.error("Unable to find MetaDataFolder 'metadata': "+str(err))

        if digilibBaseUrl is None:
            return

        self.digilibBaseUrl = digilibBaseUrl
        self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
        self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'

    # proxy text server methods to fulltextclient
    def getTextPage(self, **args):
        """returns full text content of page (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getTextPage(**args)

    def getSearchResults(self, **args):
        """loads list of search results and stores XML in docinfo (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getSearchResults(**args)

    def getResultsPage(self, **args):
        """returns one page of the search results (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getResultsPage(**args)

    def getTextInfo(self, **args):
        """returns document info from the text server (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getTextInfo(**args)

    def getToc(self, **args):
        """loads table of contents and stores XML in docinfo (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getToc(**args)

    def getTocPage(self, **args):
        """returns one page of the table of contents (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getTocPage(**args)

    def getRepositoryType(self, **args):
        """get repository type (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getRepositoryType(**args)

    def getTextDownloadUrl(self, **args):
        """get URL to download the full text (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getTextDownloadUrl(**args)
 
    def getPlacesOnPage(self, **args):
        """get list of gis places on one page (proxy for template.fulltextclient)"""
        textServer = self.template.fulltextclient
        return textServer.getPlacesOnPage(**args)
 
    # Thumb list for CoolIris Plugin
    thumbs_main_rss = PageTemplateFile('zpt/thumbs_main_rss', globals())
    security.declareProtected('View','thumbs_rss')
    def thumbs_rss(self,mode,url,viewMode="auto",start=None,pn=1):
        """
        returns the thumbnail list rendered by the template 'thumbs_main_rss'
        @param mode: defines how to access the document behind url
        @param url: url which contains display information
        @param viewMode: image: display images, text: display text, default is auto (try text, else image)
        @param start: passed on to getPageinfo
        @param pn: page number, passed on to getPageinfo
        @return: output of the template, or an error string if the template folder is missing
        """
        if not hasattr(self, 'template'):
            # this won't work
            logging.error("template folder missing!")
            return "ERROR: template folder missing!"

        # the attribute may not exist at all (__init__ only sets it when a
        # base URL was given), so use getattr like index_html does
        if not getattr(self, 'digilibBaseUrl', None):
            self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary"

        docinfo = self.getDocinfo(mode=mode,url=url)
        pageinfo = self.getPageinfo(start=start,pn=pn, docinfo=docinfo)
        pt = getattr(self.template, 'thumbs_main_rss')

        if viewMode=="auto":
            # auto mode: text if a text URL is set, else images
            if "textURL" in docinfo or docinfo.get('textURLPath',None):
                viewMode="text"
            else:
                viewMode="image"

        return pt(docinfo=docinfo,pageinfo=pageinfo,viewMode=viewMode)

  
    security.declareProtected('View','index_html')
    def index_html(self, url, mode="texttool", viewMode="auto", viewLayer=None, tocMode=None, start=None, pn=None, pf=None):
        """
        show page
        @param url: url which contains display information
        @param mode: defines how to access the document behind url
        @param viewMode: 'image': display images, 'text': display text, 'xml': display xml, default is 'auto', 'hocr' : hocr format
        @param viewLayer: sub-type of viewMode, e.g. layer 'dict' for viewMode='text'
        @param tocMode: type of 'table of contents' for navigation (thumbs, text, figures, none)
        @param start: passed on to getPageinfo
        @param pn: page number, passed on to getPageinfo
        @param pf: page file name, passed on to getPageinfo
        @return: output of the template viewer_{viewMode}, or an error string
        """
        logging.debug("documentViewer(index_html) mode=%s url=%s viewMode=%s viewLayer=%s start=%s pn=%s pf=%s"%(mode,url,viewMode,viewLayer,start,pn,pf))

        # nothing works without the template folder
        if not hasattr(self, 'template'):
            logging.error("template folder missing!")
            return "ERROR: template folder missing!"

        if not getattr(self, 'digilibBaseUrl', None):
            self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary"

        # default tocMode: mode=filepath should not have toc-thumbs
        if tocMode is None:
            tocMode = "none" if mode == "filepath" else "thumbs"

        # docinfo: information about document (cached)
        docinfo = self.getDocinfo(mode=mode,url=url,tocMode=tocMode)
        # userinfo: user settings (cached)
        userinfo = self.getUserinfo()

        if viewMode == "auto":
            # auto viewMode: text if there is a text else images
            # (docinfo.get('textURL', None) not implemented yet)
            if not docinfo.get('textURLPath', None):
                viewMode = "image"
            else:
                viewMode = "text"
                if viewLayer is None and 'viewLayer' not in userinfo:
                    # use layer dict as default
                    viewLayer = "dict"

        elif viewMode == "text_dict":
            # legacy fix
            viewMode, viewLayer = "text", "dict"

        elif viewMode == 'images':
            # legacy fix
            viewMode = 'image'
            self.REQUEST['viewMode'] = 'image'

        # save viewLayer in userinfo
        userinfo['viewLayer'] = viewLayer

        # pageinfo: information about page (not cached)
        pageinfo = self.getPageinfo(start=start, pn=pn, pf=pf, docinfo=docinfo, userinfo=userinfo, viewMode=viewMode, viewLayer=viewLayer, tocMode=tocMode)

        # get template /template/viewer_$viewMode
        templateName = 'viewer_%s'%viewMode
        pt = getattr(self.template, templateName, None)
        if pt is None:
            # TODO: error page?
            errMsg = "No template for viewMode=%s!"%viewMode
            logging.error(errMsg)
            return errMsg

        # and execute with parameters
        return pt(docinfo=docinfo, pageinfo=pageinfo)
  
    def getAvailableLayers(self):
        """returns the availableLayers dict (list of available layers per viewMode)"""
        layers = self.availableLayers
        return layers
    
    def findDigilibUrl(self):
        """returns the digilib base URL as reported by template.zogilib"""
        return self.template.zogilib.getDLBaseUrl()
    
    def getScalerUrl(self, fn=None, pn=None, dw=100, dh=100, docinfo=None):
        """returns URL to digilib Scaler with params.

        Uses docinfo['imageURL'] as base if present, otherwise builds the URL
        from self.digilibScalerUrl and fn (or docinfo['imagePath']).

        @param fn: digilib file name/path; only used if docinfo has no 'imageURL'
        @param pn: page number; only appended if true
        @param dw: destination width
        @param dh: destination height
        @param docinfo: docinfo dict (optional)
        @return: result of sslifyUrl for the assembled URL
        """
        url = None
        if docinfo is not None:
            url = docinfo.get('imageURL', None)

        if url is None:
            if fn is None and docinfo is not None:
                fn = docinfo.get('imagePath','')

            # the query string has to be separated from the Scaler URL by '?'
            url = "%s?fn=%s"%(self.digilibScalerUrl, fn)

        if pn:
            url += "&pn=%s"%pn

        url += "&dw=%s&dh=%s"%(dw,dh)
        return sslifyUrl(url, self, force=True)

    def getDocumentViewerURL(self):
        """returns the URL of this instance (its absolute_url())"""
        viewerUrl = self.absolute_url()
        return viewerUrl
    
    def getStyle(self, idx, selected, style=""):
        """returns style, with 'sel' appended if idx equals selected."""
        suffix = 'sel' if idx == selected else ''
        return style + suffix
    
    def getParams(self, param=None, val=None, params=None, duplicates=None):
        """returns dict with URL parameters.

        Takes a copy of the URL parameters of the current request and
        additionally sets param=val and/or the entries of the dict params.
        A value of None deletes the key.

        @param param: name of a single parameter to change (optional)
        @param val: new value for param (stored as str); None removes param
        @param params: dict of further parameters to change (optional)
        @param duplicates: how to treat list values (coming from duplicate keys):
            'comma' joins the non-empty entries with ',', 'first' takes the
            first non-empty entry (empty string if there is none); any other
            value leaves lists unchanged
        @return: new dict, the request form itself is not modified
        """
        # copy existing request params
        newParams = self.REQUEST.form.copy()
        # change single param
        if param is not None:
            if val is None:
                newParams.pop(param, None)
            else:
                newParams[param] = str(val)

        # change more params
        if params is not None:
            for (k, v) in params.items():
                if v is None:
                    # val=None removes param
                    newParams.pop(k, None)
                else:
                    newParams[k] = v

        if duplicates:
            # eliminate lists (coming from duplicate keys)
            for (k, v) in list(newParams.items()):
                if not isinstance(v, list):
                    continue

                nonEmpty = [t for t in v if t]
                if duplicates == 'comma':
                    # make comma-separated list of non-empty entries
                    newParams[k] = ','.join(nonEmpty)
                elif duplicates == 'first':
                    # take first non-empty entry; all entries may be empty
                    newParams[k] = nonEmpty[0] if nonEmpty else ''

        return newParams
    
    def getLink(self, param=None, val=None, params=None, baseUrl=None, paramSep='&', duplicates='comma'):
        """returns URL to documentviewer with parameter param set to val or from dict params.

        The parameters come from getParams(); the values are URL-quoted
        (without escaping '/') and joined with paramSep. baseUrl defaults to
        the URL of this viewer.
        """
        urlParams = self.getParams(param=param, val=val, params=params, duplicates=duplicates)
        # quote values and assemble into query string (not escaping '/')
        pairs = []
        for (k, v) in urlParams.items():
            pairs.append("%s=%s"%(k, urllib.quote_plus(utf8ify(v), '/')))

        query = paramSep.join(pairs)
        if baseUrl is None:
            baseUrl = self.getDocumentViewerURL()

        return "%s?%s"%(baseUrl, query)

    def getLinkAmp(self, param=None, val=None, params=None, baseUrl=None, duplicates='comma'):
        """link to documentviewer with parameter param set to val (getLink with paramSep='&')"""
        linkArgs = dict(param=param, val=val, params=params, baseUrl=baseUrl, paramSep='&', duplicates=duplicates)
        return self.getLink(**linkArgs)
    

    def setAvailableLayers(self, newLayerString=None):
        """sets availableLayers to newLayerString or tries to autodetect available layers.

        newLayerString is parsed as JSON and has to be a dict containing at
        least the keys 'text' and 'image'; otherwise an error is logged and
        the layers are autodetected.

        Autodetection starts with a copy of builtinLayers and adds the layers
        of all templates with names of the form layer_{m}_{l} for layer l in
        mode m. Names with more or fewer parts are ignored.

        @param newLayerString: JSON string or None (autodetect)
        """
        if newLayerString is not None:
            try:
                layers = json.loads(newLayerString)
            except (TypeError, ValueError):
                # not a string or not valid JSON
                layers = None

            if isinstance(layers, dict) and 'text' in layers and 'image' in layers:
                self.availableLayers = layers
                return

            logging.error("invalid layers=%s! autodetecting..."%repr(newLayerString))

        # start with builtin layers; copy the lists too so that the shared
        # class-level builtinLayers is never modified
        layers = {}
        for (mode, names) in self.builtinLayers.items():
            if names is None:
                layers[mode] = None
            else:
                layers[mode] = list(names)

        # add layers from templates
        for t in self.template:
            if not t.startswith('layer_'):
                continue

            parts = t.split('_')
            if len(parts) != 3:
                # not of the form layer_{m}_{l}
                continue

            (m, layer) = (parts[1], parts[2])
            known = layers.get(m, None)
            if known is None:
                # mode m doesn't exist or has no layers yet -> new list
                layers[m] = [layer]
            elif layer not in known:
                # m exists -> append
                known.append(layer)

        # assign once at the end so the attribute is replaced, not mutated
        self.availableLayers = layers

    def getAvailableLayersJson(self):
        """returns availableLayers serialized as JSON string."""
        layers = self.availableLayers
        return json.dumps(layers)
    
    
    def getInfo_xml(self,url,mode):
        """returns info about the document as XML.

        @param url: url which contains display information
        @param mode: defines how to access the document behind url
        @return: output of the template 'info_xml' called with the docinfo
        """
        # the attribute may not exist at all (__init__ only sets it when a
        # base URL was given), so use getattr like index_html does
        if not getattr(self, 'digilibBaseUrl', None):
            self.digilibBaseUrl = self.findDigilibUrl() or "http://digilib.mpiwg-berlin.mpg.de/digitallibrary"

        docinfo = self.getDocinfo(mode=mode,url=url)
        pt = getattr(self.template, 'info_xml')
        return pt(docinfo=docinfo)

    def getAuthenticatedUser(self, anon=None):
        """returns the authenticated user object, or anon if there is no user
        or the user name is 'Anonymous User' (Zope's anonymous user)."""
        user = getSecurityManager().getUser()
        if user is None or user.getUserName() == "Anonymous User":
            return anon

        return user

    def isAccessible(self, docinfo):
        """returns if access to the resource is granted.

        docinfo['accessType'] 'free' is always accessible. A missing access
        type or one contained in self.authgroups requires an authenticated
        user. Any other access type is logged as error and denied.
        """
        accessType = docinfo.get('accessType', None)
        logging.debug("documentViewer (accessOK) access type %s"%accessType)
        if accessType == 'free':
            logging.debug("documentViewer (accessOK) access is free")
            return True

        if accessType is not None and accessType not in self.authgroups:
            logging.error("documentViewer (accessOK) unknown access type %s"%accessType)
            return False

        # only local access -- only logged in users
        user = self.getAuthenticatedUser()
        logging.debug("documentViewer (accessOK) user=%s ip=%s"%(user,self.REQUEST.getClientAddr()))
        return (user is not None)

    def getUserinfo(self):
        """returns userinfo object.

        The userinfo dict is kept in the session under the key 'userinfo';
        a new empty dict is created and stored if there is none yet.
        """
        logging.debug("getUserinfo")
        session = self.REQUEST.SESSION
        if session.has_key('userinfo'):
            # cached userinfo (check if its still current?)
            return session['userinfo']

        # nothing cached: store a fresh one in session
        userinfo = {}
        session['userinfo'] = userinfo
        return userinfo

    def getDocinfoJSON(self, mode, url, tocMode=None):
        """returns the docinfo from getDocinfo(mode, url, tocMode) as JSON string"""
        docinfo = self.getDocinfo(mode, url, tocMode)
        return json.dumps(docinfo)
    
    
    def getDocinfo(self, mode, url, tocMode=None):
        """returns docinfo depending on mode.

        docinfo is a dict with information about the document. It is cached
        in the session under the key 'docinfo' and reused as long as mode and
        url stay the same.

        @param mode: how to interpret url: 'texttool' (document dir or
            index.meta), 'textpath' (text file), 'imagepath' or 'hocr' (folder
            with images) or 'filepath' (single image file)
        @param url: path or URL of the document
        @param tocMode: currently not evaluated in this method
        @return: docinfo dict
        @raise IOError: no index.meta could be found for mode=texttool
        @raise ValueError: unknown mode
        """
        logging.debug("getDocinfo: mode=%s, url=%s"%(mode,url))
        # look for cached docinfo in session
        if self.REQUEST.SESSION.has_key('docinfo'):
            docinfo = self.REQUEST.SESSION['docinfo']
            # check if its still current
            if docinfo is not None and docinfo.get('mode', None) == mode and docinfo.get('url', None) == url:
                logging.debug("getDocinfo: docinfo in session. keys=%s"%docinfo.keys())
                return docinfo

        # new docinfo
        docinfo = {'mode': mode, 'url': url}
        # add self url
        docinfo['viewerUrl'] = self.getDocumentViewerURL()
        docinfo['digilibBaseUrl'] = sslifyUrl(self.digilibBaseUrl, self, force=True)
        docinfo['digilibScalerUrl'] = sslifyUrl(self.digilibScalerUrl, self, force=True)
        docinfo['digilibViewerUrl'] = sslifyUrl(self.digilibViewerUrl, self, force=True)
        # get index.meta DOM
        docUrl = None
        metaDom = None
        if mode=="texttool":
            # url points to document dir or index.meta
            metaDom = self.metadataService.getDomFromPathOrUrl(url)
            # check before touching the DOM, otherwise a missing index.meta
            # shows up as AttributeError instead of IOError
            if metaDom is None:
                raise IOError("Unable to find index.meta for mode=texttool!")

            removeINDEXMETA_NS(metaDom)

            docUrl = url.replace('/index.meta', '')
            if url.startswith('/mpiwg/online/'):
                # strip the prefix from docUrl so that index.meta stays removed
                docUrl = docUrl.replace('/mpiwg/online/', '', 1)

        elif mode=="textpath":
            # url points to a text file
            # index.meta optional
            # assume index.meta in parent dir
            docUrl = getParentPath(url)
            docinfo['viewmode'] = "text"

            try:
                metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)
                if metaDom is not None:
                    removeINDEXMETA_NS(metaDom)

            except Exception:
                # index.meta is optional here
                logging.debug("getDocinfo: no usable index.meta for %s"%docUrl)
                metaDom = None

            docinfo['textURLPath'] = url.replace('/mpiwg/online', '', 1)
            docinfo['textURL'] = url
            if docinfo.get("creator", None) is None:
                docinfo['creator'] = ""

            if docinfo.get("title", None) is None:
                docinfo['title'] = ""

            if docinfo.get("documentPath", None) is None:
                # apply both replacements (the second one used to overwrite the first)
                docinfo['documentPath'] = url.replace('/mpiwg/online', '', 1).replace('/pages', '', 1)

            docinfo['numPages'] = 1

        elif mode=="imagepath":
            # url points to folder with images, index.meta optional
            # assume index.meta in parent dir
            docUrl = getParentPath(url)
            metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)
            docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)

        elif mode=="hocr":
            # url points to folder with images, index.meta optional
            # assume index.meta in parent dir
            docUrl = getParentPath(url)
            metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)
            docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)
            docinfo['textURLPath'] = url.replace('/mpiwg/online', '', 1)
            if docinfo.get("creator", None) is None:
                docinfo['creator'] = ""

            if docinfo.get("title", None) is None:
                docinfo['title'] = ""

            if docinfo.get("documentPath", None) is None:
                # apply both replacements (the second one used to overwrite the first)
                docinfo['documentPath'] = url.replace('/mpiwg/online', '', 1).replace('/pages', '', 1)

        elif mode=="filepath":
            # url points to image file, index.meta optional
            docinfo['imageURL'] = "%s?fn=%s"%(self.digilibScalerUrl, url)
            docinfo['numPages'] = 1
            # assume index.meta is two path segments up
            docUrl = getParentPath(url, 2)
            metaDom = self.metadataService.getDomFromPathOrUrl(docUrl)

        else:
            logging.error("documentViewer (getdocinfo) unknown mode: %s!"%mode)
            raise ValueError("Unknown mode %s! Has to be one of 'texttool','textpath','imagepath','hocr','filepath'."%(mode))

        docinfo['documentUrl'] = docUrl
        # process index.meta contents

        if metaDom is not None and (metaDom.tag == 'resource' or metaDom.tag == "{%s}resource"%INDEXMETA_NS):
            # document directory name and path
            resource = self.metadataService.getResourceData(dom=metaDom, recursive=1)
            if resource:
                docinfo = self.getDocinfoFromResource(docinfo, resource)

            # texttool info
            texttool = self.metadataService.getTexttoolData(dom=metaDom, recursive=1, all=True)
            if texttool:
                docinfo = self.getDocinfoFromTexttool(docinfo, texttool)
                # document info from full text server
                if docinfo.get('textURLPath', None):
                    docinfo = self.getTextInfo(mode=None, docinfo=docinfo)
                    # include list of pages TODO: do we need this always?
                    docinfo = self.getTextInfo(mode='pages', docinfo=docinfo)

            # bib info
            bib = self.metadataService.getBibData(dom=metaDom)
            if bib:
                # save extended version as 'bibx' TODO: ugly
                bibx = self.metadataService.getBibData(dom=metaDom, all=True, recursive=1)
                if len(bibx) == 1:
                    # unwrap list if possible
                    bibx = bibx[0]

                docinfo['bibx'] = bibx
                docinfo = self.getDocinfoFromBib(docinfo, bib, bibx)
            else:
                # no bib - try info.xml
                docinfo = self.getDocinfoFromPresentationInfoXml(docinfo)

            # auth info
            access = self.metadataService.getAccessData(dom=metaDom)
            if access:
                docinfo = self.getDocinfoFromAccess(docinfo, access)

            # attribution info
            attribution = self.metadataService.getAttributionData(dom=metaDom)
            if attribution:
                logging.debug("getDocinfo: attribution=%s"%repr(attribution))
                docinfo['attribution'] = attribution

            # copyright info
            copyrightInfo = self.metadataService.getCopyrightData(dom=metaDom)
            if copyrightInfo:
                logging.debug("getDocinfo: copyright=%s"%repr(copyrightInfo))
                docinfo['copyright'] = copyrightInfo

            # DRI (permanent ID)
            dri = self.metadataService.getDRI(dom=metaDom, type='mpiwg')
            if dri:
                docinfo['DRI'] = dri

            # (presentation) context
            ctx = self.metadataService.getContextData(dom=metaDom, all=True)
            if ctx:
                logging.debug("getcontext: ctx=%s"%repr(ctx))
                docinfo['presentationContext'] = ctx

        # image path
        if mode != 'texttool':
            # override image path from texttool with url parameter TODO: how about mode=auto?
            docinfo['imagePath'] = url.replace('/mpiwg/online', '', 1)

        # check numPages
        if docinfo.get('numPages', 0) == 0:
            # number of images from digilib
            if docinfo.get('imagePath', None):
                imgpath = docinfo['imagePath'].replace('/mpiwg/online', '', 1)
                logging.debug("imgpath=%s"%imgpath)
                docinfo['imageURL'] = sslifyUrl("%s?fn=%s"%(self.digilibScalerUrl, imgpath), self, force=True)
                docinfo = self.getDocinfoFromDigilib(docinfo, imgpath)
            else:
                # imagePath still missing? try "./pageimg"
                imgPath = os.path.join(docUrl, 'pageimg')
                docinfo = self.getDocinfoFromDigilib(docinfo, imgPath)
                if docinfo.get('numPages', 0) > 0:
                    # there are pages
                    docinfo['imagePath'] = imgPath
                    docinfo['imageURL'] = sslifyUrl("%s?fn=%s"%(self.digilibScalerUrl, docinfo['imagePath']), self, force=True)

        # check numPages
        if docinfo.get('numPages', 0) == 0:
            if docinfo.get('numTextPages', 0) > 0:
                # replace with numTextPages (text-only?)
                docinfo['numPages'] = docinfo['numTextPages']

        # min and max page no
        docinfo['minPageNo'] = docinfo.get('minPageNo', 1)
        # numPages may still be missing here, so don't index it directly
        docinfo['maxPageNo'] = docinfo.get('maxPageNo', docinfo.get('numPages', 0))

        # part-of information
        partOfPath = docinfo.get('partOfPath', None)
        if partOfPath is not None:
            partOfDom = self.metadataService.getDomFromPathOrUrl(partOfPath)
            if partOfDom is not None:
                docinfo['partOfLabel'] = self.metadataService.getBibFormattedLabel(dom=partOfDom)
                docinfo['partOfUrl'] = "%s?url=%s"%(self.getDocumentViewerURL(), partOfPath)
                logging.debug("partOfLabel=%s partOfUrl=%s"%(docinfo['partOfLabel'],docinfo['partOfUrl']))

        # normalize path
        if 'imagePath' in docinfo and not docinfo['imagePath'].startswith('/'):
            docinfo['imagePath'] = '/' + docinfo['imagePath']

        logging.debug("documentViewer (getdocinfo) docinfo: keys=%s"%docinfo.keys())
        # store in session
        self.REQUEST.SESSION['docinfo'] = docinfo
        return docinfo


    def getDocinfoFromResource(self, docinfo, resource):
        """reads contents of resource element into docinfo.

        Sets docinfo['documentName'] and docinfo['documentPath'] and, if the
        resource is part of another document, docinfo['partOfPath'].

        @param docinfo: docinfo dict (modified in place)
        @param resource: resource metadata as returned by the metadata service
        @return: the docinfo dict
        """
        logging.debug("getDocinfoFromResource: resource=%s"%(repr(resource)))
        docName = getMDText(resource.get('name', None))
        docinfo['documentName'] = docName
        docPath = getMDText(resource.get('archive-path', None))
        if docPath:
            # clean up document path
            if docPath[0] != '/':
                docPath = '/' + docPath

            if docName and (not docPath.endswith(docName)):
                docPath += "/" + docName

        else:
            # use docUrl as docPath; getDocinfo stores it under 'documentUrl'
            # ('documentURL' is only kept as fallback for old callers)
            docUrl = docinfo.get('documentUrl', None) or docinfo.get('documentURL', None)
            if docUrl and not docUrl.startswith(('http:', 'https:')):
                docPath = docUrl

        if docPath:
            # fix URLs starting with /mpiwg/online
            docPath = docPath.replace('/mpiwg/online', '', 1)

        docinfo['documentPath'] = docPath

        # is this part-of?
        partOf = resource.get('is-part-of', None)
        if partOf is not None:
            partOf = getMDText(partOf.get('archive-path', None))
            if partOf is not None:
                docinfo['partOfPath'] = partOf.strip()

        return docinfo

    def getDocinfoFromTexttool(self, docinfo, texttool):
        """reads contents of texttool element into docinfo

        Keys written (when the corresponding data is present): 'imagePath',
        'minPageNo', 'maxPageNo', 'textURL', 'textURLPath',
        'textURLRepository', 'pageFlow', 'oddPage', 'titlePage' and
        'presentationUrl'. Missing 'creator', 'title' and 'date' entries are
        filled with placeholder strings.

        :param docinfo: dict that is updated in place; 'documentPath' is read
        :param texttool: metadata node of the texttool element, or a list of
            such nodes (only the first one is used)
        :returns: the updated docinfo
        """
        logging.debug("texttool=%s"%repr(texttool))
        # unpack list if necessary
        if isinstance(texttool, list):
            texttool = texttool[0]

        # image dir
        imageDir = getMDText(texttool.get('image', None))
        docPath = getMDText(docinfo.get('documentPath', None))
        if imageDir:
            if imageDir.startswith('/'):
                # absolute path
                imageDir = imageDir.replace('/mpiwg/online', '', 1)
                docinfo['imagePath'] = imageDir

            elif docPath:
                # relative path
                imageDir = os.path.join(docPath, imageDir)
                imageDir = imageDir.replace('/mpiwg/online', '', 1)
                docinfo['imagePath'] = imageDir

        # start and end page (for subdocuments of other documents)
        imgStartNo = getMDText(texttool.get('image-start-no', None))
        minPageNo = getInt(imgStartNo, 1)
        docinfo['minPageNo'] = minPageNo

        imgEndNo = getMDText(texttool.get('image-end-no', None))
        if imgEndNo:
            docinfo['maxPageNo'] = getInt(imgEndNo)

        # old style text URL
        textUrl = getMDText(texttool.get('text', None))

        if textUrl and docPath:
            if urlparse.urlparse(textUrl)[0] == "":
                # no scheme, i.e. not a URL: path relative to the document
                textUrl = os.path.join(docPath, textUrl)

            docinfo['textURL'] = textUrl

        # new style text-url-path (can be more than one with "repository" attribute)
        textUrlNode = texttool.get('text-url-path', None)
        if not isinstance(textUrlNode, list):
            textUrlNode = [textUrlNode]

        for tun in textUrlNode:
            textUrl = getMDText(tun)
            if not textUrl:
                continue

            # getMDText passes non-dict nodes (e.g. plain strings) through
            # unchanged; such nodes cannot carry attributes
            if isinstance(tun, dict):
                textUrlAtts = tun.get('@attr')
            else:
                textUrlAtts = None

            if (textUrlAtts and 'repository' in textUrlAtts):
                textRepo = textUrlAtts['repository']
                # use matching repository
                if self.getRepositoryType() == textRepo:
                    docinfo['textURLPath'] = textUrl
                    docinfo['textURLRepository'] = textRepo
                    break

            else:
                # no repo attribute - use always
                docinfo['textURLPath'] = textUrl

        # page flow
        docinfo['pageFlow'] = getMDText(texttool.get('page-flow', 'ltr'))

        # odd pages are left
        docinfo['oddPage'] = getMDText(texttool.get('odd-scan-position', 'left'))

        # number of title page (default: first page, i.e. minPageNo)
        docinfo['titlePage'] = getMDText(texttool.get('title-scan-no', minPageNo))

        # old presentation stuff
        presentation = getMDText(texttool.get('presentation', None))
        if presentation and docPath:
            if presentation.startswith(('http:', 'https:')):
                # real URL (http or https): use as it is
                docinfo['presentationUrl'] = presentation
            else:
                docinfo['presentationUrl'] = os.path.join(docPath, presentation)

        # make sure we have at least fake DC data
        if 'creator' not in docinfo:
            docinfo['creator'] = '[no author found]'

        if 'title' not in docinfo:
            docinfo['title'] = '[no title found]'

        if 'date' not in docinfo:
            docinfo['date'] = '[no date found]'

        return docinfo

    def getDocinfoFromBib(self, docinfo, bib, bibx=None):
        """reads contents of bib element into docinfo

        Stores the raw bib node as docinfo['bib'], its '@type' as
        docinfo['bibType'] and the DC-mapped 'creator', 'title' and 'date'
        (empty string when missing). bibx is not used.
        """
        logging.debug("getDocinfoFromBib bib=%s"%repr(bib))
        # keep all raw bib fields together with the bib type
        docinfo['bib'] = bib
        docinfo['bibType'] = bib.get('@type', None)
        # copy the DC fields for convenience
        dcData = self.metadataService.getDCMappedData(bib)
        for field in ('creator', 'title', 'date'):
            docinfo[field] = dcData.get(field, '')

        return docinfo
            
    def getDocinfoFromAccess(self, docinfo, acc):
        """reads contents of access element into docinfo

        Sets docinfo['accessType'] to the access type found in
        acc['@attr']['type']. For the types 'group' and 'institution' the
        lower-cased acc['name'] is used instead. docinfo is left unchanged
        when the type is missing, empty or cannot be read.

        :param docinfo: dict that is updated in place
        :param acc: metadata node of the access element
        :returns: the updated docinfo
        """
        #TODO: also read resource type
        logging.debug("getDocinfoFromAccess acc=%s"%repr(acc))
        try:
            acctype = acc['@attr']['type']
            if acctype:
                access = acctype
                if access in ['group', 'institution']:
                    access = acc['name'].lower()

                docinfo['accessType'] = access

        except (KeyError, IndexError, TypeError, AttributeError):
            # best effort: missing or malformed access information is
            # ignored, other errors are no longer silently swallowed
            logging.debug("getDocinfoFromAccess: no usable access type")

        return docinfo

    def getDocinfoFromDigilib(self, docinfo, path):
        # Reads directory information for path from the digilib server
        # (dirInfo-xml under self.digilibBaseUrl) into docinfo.
        #
        # Sets docinfo['numPages'] from the 'size' element and, when size is
        # not empty, docinfo['imgFileNames'] (file name -> index) and
        # docinfo['imgFileIndexes'] (index -> file name).
        # Returns docinfo; it is returned unchanged when no data could be
        # fetched.
        #
        # NOTE(review): documented with comments instead of a docstring on
        # purpose -- ZPublisher only publishes objects that have a docstring,
        # so adding one could expose this method; confirm before changing.
        infoUrl=self.digilibBaseUrl+"/api/dirInfo-xml.jsp?fn="+path
        # fetch data
        txt = getHttpData(infoUrl)
        if not txt:
            logging.error("Unable to get dir-info from %s"%(infoUrl))
            return docinfo

        dirElem = ET.fromstring(txt)
        # save size
        size = dirElem.findtext('size')
        logging.debug("getDocinfoFromDigilib: size=%s"%size)
        if not size:
            docinfo['numPages'] = 0
            return docinfo

        docinfo['numPages'] = int(size)

        # save list of image names and numbers
        imgNames = {}
        imgIndexes = {}
        for entry in dirElem:
            fn = entry.findtext('name')
            if fn is None:
                # not a file entry (e.g. the 'size' element itself); skipping
                # it keeps a bogus None name out of both tables
                continue

            pn = getInt(entry.findtext('index'))
            imgNames[fn] = pn
            imgIndexes[pn] = fn

        docinfo['imgFileNames'] = imgNames
        docinfo['imgFileIndexes'] = imgIndexes
        return docinfo
            
            
    def getDocinfoFromPresentationInfoXml(self,docinfo):
        """gets DC-like bibliographical information from the presentation entry in texttools

        Reads the XML document named by docinfo['presentationUrl'] and copies
        the text of its first author, title and date elements into
        docinfo['creator'], docinfo['title'] and docinfo['date'].

        :param docinfo: dict that is updated in place
        :returns: the updated docinfo (unchanged when there is no URL or no
            data could be read)
        """
        url = docinfo.get('presentationUrl', None)
        if not url:
            logging.error("getDocinfoFromPresentation: no URL!")
            return docinfo

        if url.startswith(("http://", "https://")):
            # real URL (http or https)
            metaUrl = url
        else:
            # online path: fetch through the digilib Texter servlet
            metaUrl = self.digilibBaseUrl + "/servlet/Texter?fn=" + url

        txt = getHttpData(metaUrl)
        if not txt:
            # None as well as an empty response: nothing to parse
            logging.error("Unable to read info.xml from %s"%(url))
            return docinfo

        dom = ET.fromstring(txt)
        docinfo['creator']=getText(dom.find(".//author"))
        docinfo['title']=getText(dom.find(".//title"))
        docinfo['date']=getText(dom.find(".//date"))
        return docinfo
    

    def getPageinfo(self, pn=None, pf=None, start=None, rows=None, cols=None, docinfo=None, userinfo=None, viewMode=None, viewLayer=None, tocMode=None):
        """returns pageinfo with the given parameters

        Builds a new dict describing the current page and the thumbnail batch
        around it. Side effects visible in this method: when pf is given the
        request form entry 'pf' is replaced by 'pn'; docinfo['numPages'] may
        be set from 'numTextPages'; for queries in text mode
        getSearchResults() is called with the pageinfo under construction.

        :param pn: page number, used when pf is empty (default: docinfo's minPageNo)
        :param pf: page file name, takes precedence over pn
        :param start: first page of the thumbnail batch (default: start of
            the group of rows*cols pages that contains pn)
        :param rows: number of thumbnail rows (default: self.thumbrows)
        :param cols: number of thumbnail columns (default: self.thumbcols)
        :param docinfo: document info dict; it is read with .get(), so the
            default None does not work
        :param userinfo: not used in this method
        :param viewMode: stored as pageinfo['viewMode']
        :param viewLayer: comma-separated string or list of layer names
        :param tocMode: stored as pageinfo['tocMode']
        :returns: the pageinfo dict
        """
        logging.debug("getPageInfo(pn=%s, pf=%s, start=%s, rows=%s, cols=%s, viewMode=%s, viewLayer=%s, tocMode=%s)"%(pn,pf,start,rows,cols,viewMode,viewLayer,tocMode))
        pageinfo = {}
        pageinfo['viewMode'] = viewMode
        # split viewLayer if necessary
        if isinstance(viewLayer,basestring):
            viewLayer = viewLayer.split(',')
            
        if isinstance(viewLayer, list):
            logging.debug("getPageinfo: viewLayer is list:%s"%viewLayer)
            # save (unique) list in viewLayers
            # (keeps the first occurrence of every non-empty name, in order;
            # seen.add() returns None, so "not seen.add(l)" is always true)
            seen = set()
            viewLayers = [l for l in viewLayer if l and l not in seen and not seen.add(l)]
            pageinfo['viewLayers'] = viewLayers
            # stringify viewLayer
            viewLayer = ','.join(viewLayers)
        else:
            # neither string nor list (e.g. None): wrap the value in a list
            pageinfo['viewLayers'] = [viewLayer]
                        
        pageinfo['viewLayer'] = viewLayer
        pageinfo['tocMode'] = tocMode

        minPageNo = docinfo.get('minPageNo', 1)

        # pf takes precedence over pn
        if pf:
            pageinfo['pf'] = pf
            pn = getPnForPf(docinfo, pf)
            # replace pf in request params (used for creating new URLs)
            self.REQUEST.form.pop('pf', None)
            self.REQUEST.form['pn'] = pn
        else:
            # no pf: take pn (default: first page) and look up its file name
            pn = getInt(pn, minPageNo)
            pf = getPfForPn(docinfo, pn)
            pageinfo['pf'] = pf
            
        pageinfo['pn'] = pn
        # thumbnail grid size, falling back to the configured defaults
        rows = int(rows or self.thumbrows)
        pageinfo['rows'] = rows
        cols = int(cols or self.thumbcols)
        pageinfo['cols'] = cols
        grpsize = cols * rows
        pageinfo['groupsize'] = grpsize
        # if start is empty use one around pn
        # (grouppn is the first page of the group of grpsize pages that
        # contains pn, with groups counted from page 1)
        grouppn = math.ceil(float(pn)/float(grpsize))*grpsize-(grpsize-1)
        # but not smaller than minPageNo
        start = getInt(start, max(grouppn, minPageNo))
        pageinfo['start'] = start
        # get number of pages
        numPages = int(docinfo.get('numPages', 0))
        if numPages == 0:
            # try numTextPages
            numPages = docinfo.get('numTextPages', 0)
            if numPages != 0:
                docinfo['numPages'] = numPages

        maxPageNo = docinfo.get('maxPageNo', numPages)
        logging.debug("minPageNo=%s maxPageNo=%s start=%s numPages=%s"%(minPageNo,maxPageNo,start,numPages))
        np = maxPageNo

        # page size for the table of contents
        pageinfo['tocPageSize'] = getInt(self.REQUEST.get('tocPageSize', 30))
        # number of thumbnail groups needed for np pages (rounded up)
        pageinfo['numgroups'] = int(np / grpsize)
        if np % grpsize > 0:
            pageinfo['numgroups'] += 1

        # anything but 'rtl' counts as left-to-right,
        # anything but 'right' counts as odd pages on the left
        pageFlowLtr = docinfo.get('pageFlow', 'ltr') != 'rtl'
        oddScanLeft = docinfo.get('oddPage', 'left') != 'right'
        # add zeroth page for two columns
        pageZero = (cols == 2 and (pageFlowLtr != oddScanLeft))
        pageinfo['pageZero'] = pageZero
        pageinfo['pageBatch'] = self.getPageBatch(start=start, rows=rows, cols=cols, pageFlowLtr=pageFlowLtr, pageZero=pageZero, minIdx=minPageNo, maxIdx=np)
        # more page parameters
        #pageinfo['characterNormalization'] = self.REQUEST.get('characterNormalization','reg')
        # because it is buggy this is currently disabled and set to orig.
        pageinfo['characterNormalization'] = 'orig'
        if docinfo.get('pageNumbers'):
            # get original page numbers
            pageNumber = docinfo['pageNumbers'].get(pn, None)
            if pageNumber is not None:
                pageinfo['pageNumberOrig'] = pageNumber['no']
                pageinfo['pageNumberOrigNorm'] = pageNumber['non']
        
        # search parameters and results (only for queries in text mode)
        query = self.REQUEST.get('query',None)
        pageinfo['query'] = query
        if query and viewMode == 'text':
            pageinfo['resultPageSize'] = getInt(self.REQUEST.get('resultPageSize', 10))
            queryType = self.REQUEST.get('queryType', 'fulltextMorph')
            pageinfo['queryType'] = queryType
            pageinfo['resultStart'] = getInt(self.REQUEST.get('resultStart', '1'))
            self.getSearchResults(mode=queryType, query=query, pageinfo=pageinfo, docinfo=docinfo)
            
            # highlighting
            highlightQuery = self.REQUEST.get('highlightQuery', None)
            if highlightQuery:
                pageinfo['highlightQuery'] = highlightQuery
                pageinfo['highlightElement'] = self.REQUEST.get('highlightElement', '')
                pageinfo['highlightElementPos'] = self.REQUEST.get('highlightElementPos', '')
            
        return pageinfo


    def getPageBatch(self, start=1, rows=10, cols=2, pageFlowLtr=True, pageZero=False, minIdx=1, maxIdx=0):
        """Return dict with array of page information for one screenfull of thumbnails.

        :param start: index of current page
        :param rows: number of rows in one batch
        :param cols: number of columns in one batch
        :param pageFlowLtr: do indexes increase from left to right
        :param pageZero: is there a zeroth non-visible page
        :param minIdx: minimum index to use
        :param maxIdx: maximum index to use (0: use start + rows * cols)
        :returns: dict with
            first: first page index
            last: last page index
            batches: list of all possible batches(dict: 'start': index, 'end': index)
            pages: list for current batch of rows(list of cols(list of pages(dict: 'idx': index)))
            nextStart: first index of next batch (None if there is none)
            prevStart: first index of previous batch (None if there is none)
        """
        logging.debug("getPageBatch start=%s minIdx=%s maxIdx=%s"%(start,minIdx,maxIdx))
        batch = {}
        grpsize = rows * cols
        if maxIdx == 0:
            maxIdx = start + grpsize

        np = maxIdx - minIdx + 1
        if pageZero:
            # correct number of pages for batching
            np += 1

        nb = int(math.ceil(np / float(grpsize)))

        # with a zeroth page all batches start one index earlier
        if pageZero:
            ofs = minIdx - 1
        else:
            ofs = minIdx

        # list of all batch start and end points
        batches = []
        for i in range(nb):
            s = i * grpsize + ofs
            e = min((i + 1) * grpsize + ofs - 1, maxIdx)
            batches.append({'start':s, 'end':e})

        batch['batches'] = batches

        # the first screen begins with the (empty) zeroth page
        firstWithZero = pageZero and start == minIdx

        # list of pages for current screen
        pages = []
        if firstWithZero:
            # correct beginning
            idx = minIdx - 1
        else:
            idx = start

        for r in range(rows):
            row = []
            for c in range(cols):
                if idx < minIdx or idx > maxIdx:
                    # outside of the document: empty cell
                    page = {'idx':None}
                else:
                    page = {'idx':idx}

                idx += 1
                if pageFlowLtr:
                    row.append(page)
                else:
                    # right-to-left: fill the row from the right
                    row.insert(0, page)

            pages.append(row)

        if start > minIdx:
            batch['prevStart'] = max(start - grpsize, minIdx)
        else:
            batch['prevStart'] = None

        # The first screen with a zeroth page shows minIdx-1 .. minIdx+grpsize-2,
        # so the next batch starts one index earlier than usual. (Formerly
        # hard-coded to grpsize, which is only right for minIdx == 1, and
        # tested against start + grpsize, which hid the next batch when it
        # held exactly one page.)
        if firstWithZero:
            nextStart = minIdx - 1 + grpsize
        else:
            nextStart = start + grpsize

        if nextStart <= maxIdx:
            batch['nextStart'] = nextStart
        else:
            batch['nextStart'] = None

        batch['pages'] = pages
        batch['first'] = minIdx
        batch['last'] = maxIdx
        logging.debug("batch: %s"%repr(batch))
        return batch
        
        
    def getBatch(self, start=1, size=10, end=0, data=None, fullData=True):
        """returns dict with information for one screenfull of data.

        :param start: index of the first element of this batch
        :param size: number of elements in one batch
        :param end: index of the last element (0: use start + size)
        :param data: optional mapping the elements are taken from with
            .get(); if empty or None the value i+1 is used for index i
        :param fullData: if true data is keyed by the element index,
            otherwise by the position in this batch (counted from 0)
        :returns: dict with
            batches: list of all batches (dict: 'start': index, 'end': index)
            this: list of the elements in this batch
            prevStart: first index of previous batch (None if there is none)
            nextStart: first index of next batch (None if there is none)
            first: start
            last: end
        """
        batch = {}
        if end == 0:
            end = start + size

        nb = int(math.ceil(end / float(size)))
        # list of all batch start and end points
        batches = []
        for i in range(nb):
            s = i * size + 1
            e = min((i + 1) * size, end)
            batches.append({'start':s, 'end':e})

        batch['batches'] = batches
        # list of elements in this batch (end is inclusive)
        items = []
        pos = 0
        for i in range(start, min(start + size, end + 1)):
            if data:
                if fullData:
                    d = data.get(i, None)
                else:
                    d = data.get(pos, None)
                    pos += 1

            else:
                d = i+1

            items.append(d)

        batch['this'] = items
        if start > 1:
            batch['prevStart'] = max(start - size, 1)
        else:
            batch['prevStart'] = None

        # end is the index of the last element, so there is a next batch as
        # soon as its first index does not exceed end (the former "<" lost
        # the next batch when it held exactly one element)
        if start + size <= end:
            batch['nextStart'] = start + size
        else:
            batch['nextStart'] = None

        batch['first'] = start
        batch['last'] = end
        return batch
        

    def getAnnotatorGroupsForUser(self, user, annotationServerUrl="http://tuxserve03.mpiwg-berlin.mpg.de/AnnotationManager"):
        """returns list of groups {name:*, id:*} on the annotation server for the user

        Every group is a dict with the keys 'id', 'name' and 'uri' (None for
        values missing in the reply). The list is empty when the server
        delivers no data, no 'rows' or something that is not valid JSON.

        :param user: user name, put into the query string as it is
        :param annotationServerUrl: base URL of the annotation server
        :returns: list of group dicts
        """
        groups = []
        # add matching http(s) from our URL
        annotationServerUrl = sslifyUrl(annotationServerUrl, self)

        groupsUrl = "%s/annotator/groups?user=%s"%(annotationServerUrl,user)
        data = getHttpData(url=groupsUrl, noExceptions=True)
        if not data:
            return groups

        try:
            res = json.loads(data)
        except ValueError as e:
            # the request is best effort (noExceptions), so is the parsing
            logging.error("getAnnotatorGroupsForUser: invalid JSON from %s: %s"%(groupsUrl, e))
            return groups

        if not isinstance(res, dict):
            # unexpected kind of reply, there are no rows to read
            return groups

        rows = res.get('rows', None)
        if rows is None:
            return groups

        for r in rows:
            groups.append({'id': r.get('id', None), 'name': r.get('name', None), 'uri': r.get('uri', None)})

        return groups
    
    def sslifyUrl(self, url, **args):
        """returns URL with http or https

        Thin wrapper: passes url and all keyword arguments on to the
        module-level function sslifyUrl (imported from SrvTxtUtils).
        """
        # NOTE(review): getAnnotatorGroupsForUser passes self as second
        # argument to sslifyUrl, this wrapper does not -- confirm that
        # this is intended.
        return sslifyUrl(url, **args)

    # the change form requires the permission 'View management screens'
    security.declareProtected('View management screens','changeDocumentViewerForm')    
    # page template of the change form (zpt/changeDocumentViewer)
    changeDocumentViewerForm = PageTemplateFile('zpt/changeDocumentViewer', globals())
    
    def changeDocumentViewer(self,title="",digilibBaseUrl=None,thumbrows=2,thumbcols=5,authgroups='mpiwg',availableLayers=None,RESPONSE=None):
        """init document viewer

        Stores the given settings on the viewer object.

        :param title: title of the viewer
        :param digilibBaseUrl: base URL of the digilib server; the scaler and
            viewer URLs are derived from it by string concatenation, so the
            default None raises a TypeError
        :param thumbrows: default number of thumbnail rows
        :param thumbcols: default number of thumbnail columns
        :param authgroups: comma-separated group names (stored stripped and
            lower-cased as a list)
        :param availableLayers: passed on to setAvailableLayers()
        :param RESPONSE: if given, redirects to 'manage_main' afterwards
        """
        self.title=title
        self.digilibBaseUrl = digilibBaseUrl
        self.digilibScalerUrl = digilibBaseUrl + '/servlet/Scaler'
        self.digilibViewerUrl = digilibBaseUrl + '/jquery/digilib.html'
        self.thumbrows = thumbrows
        self.thumbcols = thumbcols
        self.authgroups = [s.strip().lower() for s in authgroups.split(',')]
        try:
            # assume MetaDataFolder instance is called metadata
            self.metadataService = getattr(self, 'metadata')
        except Exception as e:
            # best effort: without 'metadata' the service is left as it was
            logging.error("Unable to find MetaDataFolder 'metadata': "+str(e))

        self.setAvailableLayers(availableLayers)

        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
            
        
def manage_AddDocumentViewerForm(self):
    """add the viewer form

    Renders the page template zpt/addDocumentViewer in the context of self.
    """
    template = PageTemplateFile('zpt/addDocumentViewer', globals())
    return template.__of__(self)()
  
def manage_AddDocumentViewer(self,id,imageScalerUrl="",textServerName="",title="",RESPONSE=None):
    """add the viewer

    Creates a documentViewer with the given id, stores it in self and, if a
    RESPONSE is given, redirects to 'manage_main'.
    """
    viewer = documentViewer(id, title=title, imageScalerUrl=imageScalerUrl, textServerName=textServerName)
    self._setObject(id, viewer)
    if RESPONSE is None:
        return

    RESPONSE.redirect('manage_main')