# view MpdlXmlTextServer.py @ 613:c57d80a649ea
#
# CLOSED - # 281: List of thumbnails verschluckt Seite, wenn odd-scan-position gesetzt ist https://it-dev.mpiwg-berlin.mpg.de/tracs/mpdl-project-software/ticket/281
# author casties
# date Thu, 17 Oct 2013 16:25:39 +0200
# parents e1034c2ca255
# children
# line wrap: on
# line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from SrvTxtUtils import getInt, getText, getHttpData, serialize


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server

    Fetches document info, text pages, search results and tables of
    contents from the XQuery scripts below serverUrl and converts them
    for display.
    """
    # type name under which objects of this class are registered
    meta_type="MPDL-XML TextServer"

    # put a 'Config' entry in front of the management options inherited from SimpleItem
    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
       )+SimpleItem.manage_options
    
    # page template that is the target of the 'Config' action above
    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
        
    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40, repositoryType='production'):
        """sets up a new text server object

        A given serverName takes precedence over serverUrl: the interface
        URL is then built from the host name.
        """
        # a host name overrides the complete URL
        if serverName is not None:
            serverUrl = "http://%s/mpdl/interface/"%serverName

        self.id = id
        self.title = title
        self.serverUrl = serverUrl
        self.timeout = timeout
        self.repositoryType = repositoryType
        
    def getHttpData(self, url, data=None):
        """requests url (with optional data) using the configured timeout and returns the result"""
        # the name resolves to the helper imported from SrvTxtUtils, not to this method
        response = getHttpData(url, data, timeout=self.timeout)
        return response
    
    def getServerData(self, method, data=None):
        """requests method (the script name below serverUrl) with data and returns the result"""
        target = self.serverUrl + method
        response = getHttpData(target, data, timeout=self.timeout)
        return response


    def getRepositoryType(self):
        """returns the configured repository type (e.g. 'production') or None if none is set"""
        repoType = getattr(self, 'repositoryType', None)
        return repoType

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text

        type: file extension of the requested format, it replaces '.xml'
            in the document path.
        docinfo: document info dict, the path is read from 'textURLPath'.

        Returns None if there is no docinfo or no text path in it.
        """
        if not docinfo:
            # called without document info (the default): nothing to link to
            return None

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml','.'+type)
        # getDoc is addressed with the server URL minus its 'interface/' part
        baseUrl = self.serverUrl.replace('interface/','')
        return '%sgetDoc?doc=%s'%(baseUrl, docpath)


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn

        docinfo: document info dict, the path is read from 'textURLPath'.
        pn: page number that is sent to the server.

        Every place is a dict with the keys 'id' (id attribute of the place
        element) and 'name' (its text). Returns None if there is no docinfo
        or no text path in it.
        """
        if not docinfo:
            return None

        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        # urlencode the parameters (like getTextPage does) so that special
        # characters in the document path can not break the query string
        params = urllib.urlencode({'document': docpath,
                                   'xpath': '//place',
                                   'pn': pn})
        text = self.getServerData("xpath.xql", params)
        dom = ET.fromstring(text)
        return [{'id': place.get("id"), 'name': place.text}
                for place in dom.findall(".//resultPage/place")]
    
          
    def getTextInfo(self, mode='', docinfo=None):
        """reads document info, including page concordance, from text server

        mode: '' for the basic info, 'toc' or 'figures' to request the table
            of contents/figures as well. Any other value is treated as ''.
        docinfo: document info dict, it is updated in place and returned.

        Nothing is fetched if the requested info is already in docinfo
        ('full_<mode>' for toc requests, 'numTextPages' otherwise). Errors
        while fetching or parsing are logged and docinfo is returned as it is.
        """
        logging.debug("getTextInfo mode=%s"%mode)
        if mode not in ['toc', 'figures', '']:
            mode = ''

        # check cached info
        if mode:
            cacheKey = 'full_%s'%mode
        else:
            cacheKey = 'numTextPages'

        if cacheKey in docinfo:
            return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        # we need to set a result set size
        params = {'document': docpath,
                  'info': mode,
                  'pageSize': 10000,
                  'pn': 1}
        try:
            # fetch docinfo (parameters are urlencoded like in getTextPage)
            pagexml = self.getServerData("doc-info.xql", urllib.urlencode(params))
            dom = ET.fromstring(pagexml)
            # all info in tag <document>
            doc = dom.find("document")
        except Exception as e:
            logging.error("getTextInfo: Error reading doc info: %s"%e)
            return docinfo

        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
            return docinfo

        # plain counters: tag name -> key in docinfo
        counters = {'countFigureEntries': 'numFigureEntries',
                    'countTocEntries': 'numTocEntries',
                    'countPlaces': 'numPlaces'}

        # go through all child elements
        for tag in doc:
            name = tag.tag
            if name == 'countPages':
                # only a positive page count is stored
                np = getInt(tag.text)
                if np > 0:
                    docinfo['numTextPages'] = np

            elif name in counters:
                docinfo[counters[name]] = getInt(tag.text)

            elif name == 'pageNumbers':
                docinfo['pageNumbers'] = self._parsePageNumbers(tag)

            elif name == 'toc':
                # save as full_toc/full_figures
                docinfo['full_%s'%mode] = self._parseTocEntries(tag)

        return docinfo

    def _parsePageNumbers(self, tag):
        """returns dict of page dicts, indexed by scan number, from a pageNumbers tag"""
        # contains tags with page numbers
        # <pn><n>4</n><no>4</no><non/></pn>
        # n=scan number, no=original page no, non=normalized original page no
        pages = {}
        for entry in tag:
            page = {}
            n = 0
            for p in entry:
                if p.tag == 'n':
                    n = getInt(p.text)
                    page['pn'] = n
                elif p.tag in ('no', 'non'):
                    page[p.tag] = p.text

            # entries without a positive scan number are dropped
            if n > 0:
                pages[n] = page

        return pages

    def _parseTocEntries(self, tag):
        """returns list of entry dicts from a toc tag"""
        # contains tags with table of contents/figures
        # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
        tocs = []
        for entry in tag:
            toc = {}
            for t in entry:
                if t.tag == 'page':
                    toc['pn'] = getInt(t.text)
                elif t.tag in ('level', 'content', 'level-string', 'real-level'):
                    toc[t.tag] = t.text

            tocs.append(toc)

        return tocs
        
          
    def processPageInfo(self, dom, docinfo, pageinfo):
        """copies page metadata from the pageMeta div of dom into pageinfo

        The text of the child elements with the classes 'pageNumberOrig',
        'pageNumberOrigNorm' and 'pageHeaderTitle' is stored in pageinfo
        under the class name. docinfo is not used.
        """
        # the first div below the root has to be the pageMeta div
        metadiv = dom.find("div")
        if metadiv is None or metadiv.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return

        wanted = ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle')
        for child in metadiv:
            cls = child.get('class')
            if cls in wanted:
                pageinfo[cls] = child.text

        return
         
           
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        mode: comma separated list of modes. 'search' adds the highlighting
            parameters from pageinfo, 'pundit' adds annotation attributes (in
            text and dict mode). Of the remaining modes the first match of
            'dict', 'xml', 'gis' selects the kind of text, anything else
            means plain text.
        pn: page number that is sent to the server.
        docinfo: document info dict, the path is read from 'textURLPath'.
        pageinfo: page info dict. 'characterNormalization' and the highlight
            parameters are read from it, processPageInfo stores the page
            metadata in it and an existing 'textPage' entry is returned as
            cached text.

        Returns the serialized pageContent div or None if there is no text
        path, the request failed or the answer has no pageContent div.
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # default is text
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # pundit mode
        punditMode = 'pundit' in modes
        if punditMode:
            # ignore mode in the following
            modes.remove('pundit')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif 'xml' in modes:
            # xml mode always uses the original characters
            textmode = 'xml'
            textParams['characterNormalization'] = 'orig'
        elif 'gis' in modes:
            textmode = 'gis'
        else:
            # text is default mode
            textmode = 'text'

        textParams['mode'] = textmode

        try:
            # fetch the page
            pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception as e:
            logging.error("getTextPage: Error reading page: %s"%e)
            return None

        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)

        # page content is in <div class="pageContent">
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        pagediv = None
        for div in dom.findall('div'):
            if div.get('class') == 'pageContent':
                pagediv = div
                break

        if pagediv is None:
            # no content in any mode
            return None

        # xml mode: content is returned as it is
        if textmode == 'xml':
            return serialize(pagediv)

        # text and dict mode can carry pundit annotations
        if punditMode and textmode in ('text', 'textPollux'):
            pagediv = self.addPunditAttributes(pagediv, pageinfo, docinfo)

        self._fixEmptyDivs(pagediv)

        # get full url assuming documentViewer is parent
        selfurl = self.getLink()
        links = pagediv.findall('.//a')

        if textmode == 'gis':
            # add our URL as backlink
            doc = base64.b64encode(selfurl)
            for link in links:
                href = link.get('href')
                if href and href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                    link.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                    link.set('target', '_blank')

        else:
            for link in links:
                href = link.get('href')
                if not href:
                    continue

                if textmode == 'textPollux' and urlparse.urlparse(href).path.endswith('GetDictionaryEntries'):
                    # is dictionary link - open in new page
                    #TODO: replace wordInfo page (change href, keeping parameters)
                    link.set('target', '_blank')

                if href.startswith('#note-'):
                    # note link: make it absolute
                    link.set('href', href.replace('#note-',"%s#note-"%selfurl))

        return serialize(pagediv)

    def _fixEmptyDivs(self, pagediv):
        """gives every empty div below pagediv a blank as text"""
        # presumably so that the divs are not serialized as self-closing tags
        for d in pagediv.findall('.//div'):
            if len(d) == 0 and not d.text:
                # make empty divs non-empty
                d.text = ' '
    
    def addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """marks the divs below pagediv for the pundit annotation tool

        Every div with an id gets an 'about' attribute (built from the text
        id, the page number and the div id) and the additional class
        'pundit-content'. Returns pagediv.
        """
        # the DRI identifies the text, the document path is the fallback
        fallbackId = "fn=%s"%docinfo.get('documentPath', '???')
        textid = docinfo.get('DRI', fallbackId)
        pageno = pageinfo.get('pn', '1')
        #  TODO: use pn as well?
        # check all div-tags
        for div in pagediv.findall(".//div"):
            divid = div.get('id')
            if not divid:
                continue

            div.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pageno,divid))
            classes = div.get('class','') + ' pundit-content'
            div.set('class', classes.strip())

        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        mode: query type that is sent to the server, "none" means no search.
        query: the query.
        pageinfo: page info dict, 'characterNormalization' is read from it
            (default 'reg').
        docinfo: document info dict, it needs 'textURLPath' and gets the
            entries 'resultXML' (serialized queryResultPage div),
            'resultSize' (number from the queryResultHits div) and
            'cachedQuery'.

        Nothing is fetched if docinfo holds the results of the same mode and
        query. Returns docinfo. Errors while fetching or parsing are raised.
        """
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo

        cacheKey = '%s_%s'%(mode,query)
        if docinfo.get('cachedQuery', None) == cacheKey:
            # same query: use cached search result
            return docinfo

        # different query: drop the old results. pop() because the last
        # query may not have stored a result size or result XML at all
        for key in ('cachedQuery', 'resultSize', 'resultXML'):
            docinfo.pop(key, None)

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        dom = ET.fromstring(pagexml)

        # page content is in <div class="queryResultPage">
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        pagediv = None
        for div in dom.findall("div"):
            dc = div.get('class')
            if dc == 'queryResultPage':
                # page content div
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        # cache the query only now, so that a failed request is not
        # mistaken for a cached (empty) result by the next call
        docinfo['cachedQuery'] = cacheKey
        return docinfo
    

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the list of search results

        mode, query: passed to getSearchResults to get the (cached) results.
        pn: number of the result page, only used if start is not given
            (no pn means page 1).
        start: 1-based index of the first result on the page, default is the
            first result of page pn.
        size: number of results on a page, default is 'resultPageSize' from
            pageinfo or 10.

        Returns the serialized results with all page links replaced by links
        from getLink, or an error message if there are no results.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start is 1-based: page 1 starts with result 1
            start = ((pn or 1) - 1) * size + 1

        tocdivs = ET.fromstring(resultxml)

        # paginate: drop everything before and after the requested results.
        # a negative index would count from the end, so stop at 0
        first = max(start - 1, 0)
        del tocdivs[:first]
        del tocdivs[size:]

        # check all a-tags
        for link in tocdivs.findall(".//a"):
            href = link.get('href')
            if not href:
                continue

            linkParams = urlparse.parse_qs(urlparse.urlparse(href).query)
            if 'pn' not in linkParams:
                # not a link to a page: leave it alone
                continue

            # take some parameters (parse_qs returns lists of values)
            params = {'pn': linkParams['pn'],
                      'highlightQuery': linkParams.get('highlightQuery',''),
                      'highlightElement': linkParams.get('highlightElement',''),
                      'highlightElementPos': linkParams.get('highlightElementPos','')
                      }
            link.set('href', self.getLink(params=params))

        return serialize(tocdivs)


    def getToc(self, mode='text', docinfo=None):
        """returns the list of table of contents entries for mode

        Mode 'text' means the table of contents ('toc'), other modes are
        used as they are. The list is read from docinfo and fetched with
        getTextInfo if it is not there. Returns an empty list if the list
        is still missing afterwards.
        """
        logging.debug("getToc mode=%s"%mode)
        queryType = mode
        if mode == 'text':
            queryType = 'toc'

        tocKey = 'full_%s'%queryType
        if tocKey not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get(tocKey, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents as HTML

        mode: passed to getToc to get the full list of entries.
        pn: number of the toc page, only used if start is not given
            (no pn means page 1).
        start: 1-based index of the first entry on the page, default is the
            first entry of page pn.
        size: number of entries on a page, default is 'tocPageSize' from
            pageinfo or 30.

        Returns a div with one 'tocline' div per entry or an error message
        if there is no table of contents.
        """
        from xml.sax.saxutils import escape

        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if len(fulltoc) < 1:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"        

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            # start is 1-based: page 1 starts with entry 1
            start = ((pn or 1) - 1) * size + 1

        # paginate (a negative index would count from the end, so stop at 0)
        first = max(start - 1, 0)
        last = first + size

        lines = ['<div>']
        for toc in fulltoc[first:last]:
            pageurl = self.getLink('pn', toc['pn'])
            # the texts come from the text server as plain text (and may be
            # missing), so they have to be escaped for HTML
            level = escape(toc.get('level-string') or '')
            content = escape(toc.get('content') or '')
            lines.append('<div class="tocline">')
            lines.append('<div class="toc name">[%s %s]</div>'%(level, content))
            lines.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']))
            lines.append('</div>\n')

        lines.append('</div>\n')

        return ''.join(lines)
           
    
    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None):
        """stores the new settings and redirects to manage_main if there is a RESPONSE"""
        self.serverUrl = serverUrl
        self.timeout = timeout
        self.title = title
        # an empty repository type keeps the old one
        if repositoryType:
            self.repositoryType = repositoryType

        if RESPONSE is None:
            return

        RESPONSE.redirect('manage_main')
        
# management methods
def manage_addMpdlXmlTextServerForm(self):
    """renders the form for adding a MpdlXmlTextServer"""
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    # wrap the template in the context of self before rendering
    return template.__of__(self)()

def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """adds a new MpdlXmlTextServer

    id, title, serverUrl, timeout: settings of the new object.
    RESPONSE: if given it is redirected to manage_main.

    The object is stored under id in self.Destination().
    """
    # serverUrl and timeout have to go by keyword: the fourth positional
    # parameter of the constructor is serverName, which would replace the URL
    newObj = MpdlXmlTextServer(id, title=title, serverUrl=serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')