view MpdlXmlTextServer.py @ 517:aaacdf551f6f

remove global info from processPageInfo.
author casties
date Mon, 05 Mar 2012 19:11:59 +0100
parents 7d7b639d7be7
children 91051b36b9cc
line wrap: on
line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from SrvTxtUtils import getInt, getText, getHttpData

def serialize(node):
    """returns a string containing an XML snippet of node (without XML declaration)"""
    s = ET.tostring(node, 'UTF-8')
    # ET.tostring prepends an XML declaration when an encoding is given;
    # snip it off. ('?>' is two characters -- the old i+3 assumed a
    # trailing newline and ate one content byte when there was none.)
    if s.startswith(b'<?xml'):
        i = s.find(b'?>')
        if i >= 0:
            return s[i+2:].lstrip(b'\n')

    return s


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server.

    Fetches document text, page metadata, GIS places, search results and
    tables of contents from an MPDL text server via HTTP. Results are
    stored in the caller-supplied docinfo/pageinfo dicts, which also act
    as caches across calls.
    """
    meta_type="MPDL-XML TextServer"

    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
       )+SimpleItem.manage_options
    
    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
        
    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
        """constructor.

        serverName (host name only) takes precedence over serverUrl
        (full base URL) when given. timeout is the HTTP timeout in
        seconds for all requests to the text server.
        """
        self.id=id
        self.title=title
        self.timeout = timeout
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/"%serverName
        
    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request"""
        return getHttpData(url,data,timeout=self.timeout)
    
    def getServerData(self, method, data=None):
        """returns result from text server for method+data (method is the
        xql script name appended to serverUrl)"""
        url = self.serverUrl+method
        return getHttpData(url,data,timeout=self.timeout)


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn.

        Each place is a dict {'id':..., 'name':...}. Returns None if
        docinfo has no textURLPath.
        """
        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        places=[]
        text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        result = dom.findall(".//resultPage/place")
        for l in result:
            id = l.get("id")
            name = l.text
            place = {'id': id, 'name': name}
            places.append(place)

        return places
    
          
    def getTextInfo(self, mode='', docinfo=None):
        """reads document info, including page concordance, from text server.

        mode selects the info requested from doc-info.xql; results are
        stored in (and returned as) docinfo.
        """
        logging.debug("getDocInfo")
        #TODO: check cached info
        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo
                
        # we need to set a result set size
        pagesize = 10000
        pn = 1
        # fetch docinfo
        pagexml = self.getServerData("doc-info.xql","document=%s&info=%s&pageSize=%s&pn=%s"%(docpath,mode,pagesize,pn))
        dom = ET.fromstring(pagexml)
        # all info in tag <document>
        doc = dom.find("document")
        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
        else:
            # go through all child elements
            for tag in doc:
                name = tag.tag
                # numTextPages
                if name == 'countPages':
                    np = getInt(tag.text)                    
                    if np > 0:
                        docinfo['numTextPages'] = np
                   
                # numFigureEntries
                elif name == 'countFigureEntries':
                    docinfo['numFigureEntries'] = getInt(tag.text)
                    
                # numTocEntries
                elif name == 'countTocEntries':
                    # WTF: s1 = int(s)/30+1
                    docinfo['numTocEntries'] = getInt(tag.text)
                    
                # numPlaces
                elif name == 'countPlaces':
                    docinfo['numPlaces'] = getInt(tag.text)
                    
                # pageNumbers
                elif name == 'pageNumbers':
                    # contains tags with page numbers
                    # <pn><n>4</n><no>4</no><non/></pn>
                    # n=scan number, no=original page no, non=normalized original page no
                    # pageNumbers is a dict indexed by scan number
                    pages = {}
                    for pn in tag:
                        page = {}
                        n = 0
                        for p in pn:
                            if p.tag == 'n':
                                n = getInt(p.text)
                                page['n'] = n
                            elif p.tag == 'no':
                                page['no'] = p.text
                            elif p.tag == 'non':
                                page['non'] = p.text
                                
                        # only store entries with a valid scan number
                        if n > 0:
                            pages[n] = page
                        
                    docinfo['pageNumbers'] = pages
                    #logging.debug("got pageNumbers=%s"%repr(pages))
                                
                # toc
                elif name == 'toc':
                    # contains tags with table of contents
                    # TODO: implement
                    pass

        return docinfo
        
          
    def processPageInfo(self, dom, docinfo, pageinfo):
        """processes page info divs from dom and stores in docinfo and pageinfo"""
        # assume first second level div is pageMeta
        alldivs = dom.find("div")
        
        if alldivs is None or alldivs.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return
        
        for div in alldivs:
            dc = div.get('class')
            
            # pageNumberOrig  
            if dc == 'pageNumberOrig':
                pageinfo['pageNumberOrig'] = div.text
                
            # pageNumberOrigNorm
            elif dc == 'pageNumberOrigNorm':
                pageinfo['pageNumberOrigNorm'] = div.text
                
            # pageHeaderTitle
            elif dc == 'pageHeaderTitle':
                pageinfo['pageHeaderTitle'] = div.text
                        
        #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
        return
         
           
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext.

        mode is a comma-separated list of view modes ('text', 'dict',
        'xml', 'pureXml', 'gis', optionally combined with 'search').
        Returns the serialized pageContent div or None.
        """
        
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']
        
        docpath = docinfo['textURLPath']
        # just checking
        if pageinfo['current'] != pn:
            logging.warning("getTextPage: current!=pn!")
            
        # stuff for constructing full urls
        selfurl = docinfo['viewerUrl']
        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']
        
        if not mode:
            # default is dict
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)
            
        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')
                
            # ignore mode in the following
            modes.remove('search')
                            
        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif len(modes) == 0:
            # text is default mode
            textmode = 'text'
        else:
            # just take first mode
            textmode = modes[0]
        
        textParams['mode'] = textmode
        
        # fetch the page
        pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
        dom = ET.fromstring(pagexml)
        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)
        # page content is in <div class="pageContent">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'pageContent':
                pagediv = div
                break
        
        # plain text mode
        if textmode == "text":
            # get full url assuming documentViewer is parent
            selfurl = self.getLink()
            if pagediv is not None:
                # rewrite note links to point back at the viewer page
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href and href.startswith('#note-'):
                        href = href.replace('#note-',"%s#note-"%selfurl)
                        l.set('href', href)

                return serialize(pagediv)
            
        # text-with-links mode
        elif textmode == "textPollux":
            if pagediv is not None:
                viewerurl = docinfo['viewerUrl']
                selfurl = self.getLink()
                # check all a-tags
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # is dictionary link - change href (keeping parameters)
                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                            # add target to open new page
                            l.set('target', '_blank')
                                                          
                        # TODO: is this needed?
#                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
#                            selfurl = self.absolute_url()
#                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
#                            l.set('target', '_blank')
#                            l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
#                            l.set('ondblclick', 'popupWin.focus();')   
                    
                        if href.startswith('#note-'):
                            # note link
                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
                              
                return serialize(pagediv)
            
        # xml mode
        elif textmode == "xml":
            if pagediv is not None:
                return serialize(pagediv)
            
        # pureXml mode
        elif textmode == "pureXml":
            if pagediv is not None:
                return serialize(pagediv)
                  
        # gis mode
        elif textmode == "gis":
            if pagediv is not None:
                # check all a-tags
                links = pagediv.findall(".//a")
                # add our URL as backlink
                selfurl = self.getLink()
                doc = base64.b64encode(selfurl)
                for l in links:
                    href = l.get('href')
                    if href:
                        if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                            l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                            l.set('target', '_blank')
                            
                return serialize(pagediv)
                    
        return None
    

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo.

        Results for the same (mode, query) pair are cached in docinfo
        under 'cachedQuery'/'resultXML'/'resultSize'.
        """
        
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo
              
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == '%s_%s'%(mode,query):
                # same query
                return docinfo
            
            else:
                # different query - discard stale results
                # (use pop: a previous query may have produced no result page,
                # in which case these keys are absent and del would raise)
                docinfo.pop('resultSize', None)
                docinfo.pop('resultXML', None)
        
        # cache query
        docinfo['cachedQuery'] = '%s_%s'%(mode,query)
        
        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div
                
            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        return docinfo
    

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the search results.

        start is a 1-based result index; when None it is derived from
        the 1-based page number pn and the page size.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
            
        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"
        
        if size is None:
            size = pageinfo.get('resultPageSize', 10)
            
        if start is None:
            # start is 1-based (cf. getTocPage); without the +1 the old code
            # computed first=-1 for pn=1 and del[:first] dropped all results
            start = (pn - 1) * size + 1

        fullresult = ET.fromstring(resultxml)
        
        if fullresult is not None:
            # paginate: keep only entries [first, first+size)
            first = start-1
            del fullresult[:first]
            del fullresult[size:]
            tocdivs = fullresult
            
            # check all a-tags
            links = tocdivs.findall(".//a")
            for l in links:
                href = l.get('href')
                if href:
                    # assume all links go to pages
                    linkUrl = urlparse.urlparse(href)
                    linkParams = urlparse.parse_qs(linkUrl.query)
                    # take some parameters
                    params = {'pn': linkParams['pn'],
                              'highlightQuery': linkParams.get('highlightQuery',''),
                              'highlightElement': linkParams.get('highlightElement',''),
                              'highlightElementPos': linkParams.get('highlightElementPos','')
                              }
                    url = self.getLink(params=params)
                    l.set('href', url)
                        
            return serialize(tocdivs)
        
        return "ERROR: no results!"


    def getToc(self, mode="text", docinfo=None):
        """loads table of contents and stores XML in docinfo.

        The toc XML and its size are cached in docinfo under
        'tocXML_<mode>' and 'tocSize_<mode>'.
        """
        logging.debug("getToc mode=%s"%mode)
        if mode == "none":
            return docinfo
              
        if 'tocSize_%s'%mode in docinfo:
            # cached toc
            return docinfo
        
        docpath = docinfo['textURLPath']
        # we need to set a result set size
        pagesize = 1000
        pn = 1
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        # number of entries in toc
        tocSize = 0
        tocDiv = None
        # fetch full toc
        pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div
                
            elif dc == 'queryResultHits':
                docinfo['tocSize_%s'%mode] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')

        return docinfo
    
    def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents.

        start is a 1-based entry index; when None it is derived from
        the 1-based page number pn and the page size. Each toc entry
        occupies two divs in the result XML.
        """
        logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
            
        # check for cached TOC
        if 'tocXML_%s'%mode not in docinfo:
            self.getToc(mode=mode, docinfo=docinfo)
            
        tocxml = docinfo.get('tocXML_%s'%mode, None)
        if not tocxml:
            logging.error("getTocPage: unable to find tocXML")
            return "Error: no table of contents!"
        
        if size is None:
            size = pageinfo.get('tocPageSize', 30)
            
        if start is None:
            # start is 1-based; without the +1 the old code computed
            # first=-2 for pn=1 and del[:first] dropped all entries
            start = (pn - 1) * size + 1

        fulltoc = ET.fromstring(tocxml)
        
        if fulltoc is not None:
            # paginate: two divs per toc entry
            first = (start - 1) * 2
            del fulltoc[:first]
            del fulltoc[size * 2:]
            tocdivs = fulltoc
            
            # check all a-tags
            links = tocdivs.findall(".//a")
            for l in links:
                href = l.get('href')
                if href:
                    # take pn from href
                    m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
                    if m is not None:
                        # and create new url (assuming parent is documentViewer)
                        url = self.getLink('pn', m.group(1))
                        l.set('href', url)
                    else:
                        logging.warning("getTocPage: Problem with link=%s"%href)
                        
            # fix two-divs-per-row with containing div
            newtoc = ET.Element('div', {'class':'queryResultPage'})
            for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
                e = ET.Element('div',{'class':'tocline'})
                e.append(d1)
                e.append(d2)
                newtoc.append(e)
                
            return serialize(newtoc)
        
        return "ERROR: no table of contents!"
    
    
    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
        """change settings via the ZMI form"""
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
        
# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding"""
    # render the add-form template in the context of this object
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
    return template()

def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
#def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):    
    """add MpdlXmlTextServer instance"""
    # timeout must be passed as keyword: the constructor's fourth
    # positional parameter is serverName, so the old positional call
    # put timeout there and broke the server URL
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')