view MpdlXmlTextServer.py @ 513:67095296c95a

Merge from elementtree branch 92a6443a6f16ff25674d43814ec0d6c0a43a5e1a
author casties
date Tue, 28 Feb 2012 19:10:08 +0100
parents 91daab0c219b 551ca1641a5e
children 7d7b639d7be7
line wrap: on
line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from SrvTxtUtils import getInt, getText, getHttpData

def serialize(node):
    """Return a string containing an XML snippet of node.

    ET.tostring() with an explicit encoding prepends an XML declaration
    ("<?xml ... ?>", usually followed by a newline); snip it off so the
    fragment can be embedded in other markup.
    """
    s = ET.tostring(node, 'UTF-8')
    # tostring returns str under Python 2 but bytes under Python 3;
    # compare with matching types so both work.
    decl = '<?xml' if isinstance(s, str) else b'<?xml'
    if s.startswith(decl):
        i = s.find('?>' if isinstance(s, str) else b'?>')
        # skip past "?>" and any whitespace after it
        # (the old code skipped exactly one extra character, silently
        # eating the first content byte if no newline followed)
        return s[i+2:].lstrip()

    return s


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server.

    Fetches page text, table of contents, GIS places and search results
    from an MPDL "mpdl/interface" eXist backend via HTTP and rewrites the
    returned XHTML fragments (links, pagination) for the documentViewer
    frontend.

    NOTE(review): several methods call self.getLink(), which is not defined
    here — presumably acquired from the parent documentViewer; confirm.
    """
    meta_type="MPDL-XML TextServer"

    manage_options=(
        {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'},
       )+SimpleItem.manage_options
    
    # ZMI configuration form
    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())
        
    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
        """Constructor.

        @param id: Zope object id
        @param title: object title
        @param serverUrl: full base URL of the text server interface
        @param serverName: if given, overrides serverUrl with
            "http://<serverName>/mpdl/interface/"
        @param timeout: HTTP timeout in seconds
        """
        self.id=id
        self.title=title
        self.timeout = timeout
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/"%serverName
        
    def getHttpData(self, url, data=None):
        """Return the result of a HTTP request to url (+POST data)."""
        return getHttpData(url,data,timeout=self.timeout)
    
    def getServerData(self, method, data=None):
        """Return the text server's response for method (xql script) + data."""
        url = self.serverUrl+method
        return getHttpData(url,data,timeout=self.timeout)


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Return the list of GIS places on page pn.

        Each place is a dict {'id':..., 'name':...}.
        Returns None if docinfo has no 'textURLPath'.
        """
        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        places=[]
        text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        result = dom.findall(".//resultPage/place")
        for el in result:
            # 'pid' instead of 'id' to avoid shadowing the builtin
            pid = el.get("id")
            name = el.text
            places.append({'id': pid, 'name': name})

        return places
    
          
    def processPageInfo(self, dom, docinfo, pageinfo):
        """Process page info divs from dom and store them in docinfo and pageinfo.

        Expects the first second-level div of dom to have class "pageMeta";
        its child divs carry one value each, keyed by their class attribute.
        """
        # assume first second level div is pageMeta
        alldivs = dom.find("div")
        
        if alldivs is None or alldivs.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return
        
        for div in alldivs:
            dc = div.get('class')
            
            # pageNumberOrig  
            if dc == 'pageNumberOrig':
                pageinfo['pageNumberOrig'] = div.text
                
            # pageNumberOrigNorm
            elif dc == 'pageNumberOrigNorm':
                pageinfo['pageNumberOrigNorm'] = div.text
                
            # pageHeaderTitle
            elif dc == 'pageHeaderTitle':
                pageinfo['pageHeaderTitle'] = div.text
                
            # numFigureEntries
            elif dc == 'countFigureEntries':
                docinfo['numFigureEntries'] = getInt(div.text)
                
            # numTocEntries
            elif dc == 'countTocEntries':
                docinfo['numTocEntries'] = getInt(div.text)
                
            # numPlaces
            elif dc == 'countPlaces':
                docinfo['numPlaces'] = getInt(div.text)
                
            # numTextPages
            elif dc == 'countPages':
                np = getInt(div.text)                    
                if np > 0:
                    docinfo['numTextPages'] = np
                    if docinfo.get('numPages', 0) == 0:
                        # seems to be text-only - update page count
                        docinfo['numPages'] = np
                        # number of page groups, rounded up to cover a partial last group
                        pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
                        if np % pageinfo['groupsize'] > 0:
                            pageinfo['numgroups'] += 1
        
        return
         
           
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """Return a single page from the fulltext as a serialized XHTML fragment.

        @param mode: comma-separated list of view modes ("text", "dict",
            "xml", "pureXml", "gis"; "search" adds hit highlighting)
        @param pn: page number
        Returns None if the server response has no pageContent div.
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']
        
        docpath = docinfo['textURLPath']
        # just checking
        if pageinfo['current'] != pn:
            logging.warning("getTextPage: current!=pn!")
            
        # parameters for the page-fragment request
        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']
        
        if not mode:
            # default is dict
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)
            
        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')
                
            # ignore mode in the following
            modes.remove('search')
                            
        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif len(modes) == 0:
            # text is default mode
            textmode = 'text'
        else:
            # just take first mode
            textmode = modes[0]
        
        textParams['mode'] = textmode
        
        # fetch the page
        pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams))
        dom = ET.fromstring(pagexml)
        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)
        # page content is in <div class="pageContent">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'pageContent':
                pagediv = div
                break
        
        # plain text mode
        if textmode == "text":
            # get full url assuming documentViewer is parent
            selfurl = self.getLink()
            if pagediv is not None:
                # rewrite note anchors to absolute links on our own page
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href and href.startswith('#note-'):
                        href = href.replace('#note-',"%s#note-"%selfurl)
                        l.set('href', href)

                return serialize(pagediv)
            
        # text-with-links mode
        elif textmode == "textPollux":
            if pagediv is not None:
                selfurl = self.getLink()
                # check all a-tags
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # is dictionary link - open in new window/tab
                            l.set('target', '_blank')
                    
                        if href.startswith('#note-'):
                            # note link - make absolute on our own page
                            l.set('href', href.replace('#note-',"%s#note-"%selfurl))
                              
                return serialize(pagediv)
            
        # xml mode
        elif textmode == "xml":
            if pagediv is not None:
                return serialize(pagediv)
            
        # pureXml mode
        elif textmode == "pureXml":
            if pagediv is not None:
                return serialize(pagediv)
                  
        # gis mode
        elif textmode == "gis":
            if pagediv is not None:
                # check all a-tags
                links = pagediv.findall(".//a")
                # add our URL as backlink (mappit expects it base64-encoded)
                selfurl = self.getLink()
                doc = base64.b64encode(selfurl)
                for l in links:
                    href = l.get('href')
                    if href:
                        if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                            l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                            l.set('target', '_blank')
                            
                return serialize(pagediv)
                    
        return None
    

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """Load the full list of search results and store the XML in docinfo.

        Caches by (mode, query); a repeated identical query is a no-op.
        Stores 'resultXML' and 'resultSize' in docinfo.
        """
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo
              
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == '%s_%s'%(mode,query):
                # same query
                return docinfo
            
            else:
                # different query - drop stale results.
                # BUGFIX: use pop() - the keys may be missing if the previous
                # query returned no result page (bare del raised KeyError)
                docinfo.pop('resultSize', None)
                docinfo.pop('resultXML', None)
        
        # cache query
        docinfo['cachedQuery'] = '%s_%s'%(mode,query)
        
        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div
                
            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        return docinfo
    

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return a single page of the search results as serialized XHTML.

        @param pn: result page number (used when start is None)
        @param start: 1-based index of the first hit to show
        @param size: number of hits per page
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # check for cached result
        if not 'resultXML' in docinfo:
            self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
            
        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"
        
        if size is None:
            size = pageinfo.get('resultPageSize', 10)
            
        if start is None:
            # BUGFIX: start is 1-based (see first = start-1 below); the old
            # default (pn-1)*size was 0-based and made page 1 show only the
            # last hit (del fullresult[:-1])
            start = (pn - 1) * size + 1

        fullresult = ET.fromstring(resultxml)
        
        if fullresult is not None:
            # paginate: keep only hits [start .. start+size-1]
            # (avoid shadowing the builtin len)
            first = start-1
            del fullresult[:first]
            del fullresult[size:]
            tocdivs = fullresult
            
            # check all a-tags
            links = tocdivs.findall(".//a")
            for l in links:
                href = l.get('href')
                if href:
                    # assume all links go to pages
                    linkUrl = urlparse.urlparse(href)
                    linkParams = urlparse.parse_qs(linkUrl.query)
                    # take some parameters
                    params = {'pn': linkParams['pn'],
                              'highlightQuery': linkParams.get('highlightQuery',''),
                              'highlightElement': linkParams.get('highlightElement',''),
                              'highlightElementPos': linkParams.get('highlightElementPos','')
                              }
                    url = self.getLink(params=params)
                    l.set('href', url)
                        
            return serialize(tocdivs)
        
        return "ERROR: no results!"


    def getToc(self, mode="text", docinfo=None):
        """Load the full table of contents and store the XML in docinfo.

        Stores 'tocXML_<mode>' and 'tocSize_<mode>' in docinfo;
        cached per mode.
        """
        logging.debug("getToc mode=%s"%mode)
        if mode == "none":
            return docinfo
              
        if 'tocSize_%s'%mode in docinfo:
            # cached toc
            return docinfo
        
        docpath = docinfo['textURLPath']
        # we need to set a result set size
        pagesize = 1000
        pn = 1
        if mode == "text":
            queryType = "toc"
        else:
            queryType = mode
        # fetch full toc
        pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div
                
            elif dc == 'queryResultHits':
                docinfo['tocSize_%s'%mode] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8')

        return docinfo
    
    def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return a single page of the table of contents as serialized XHTML.

        @param pn: TOC page number (used when start is None)
        @param start: 1-based index of the first TOC entry to show
        @param size: number of TOC entries per page
        """
        logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn))
        # check for cached TOC
        if not 'tocXML_%s'%mode in docinfo:
            self.getToc(mode=mode, docinfo=docinfo)
            
        tocxml = docinfo.get('tocXML_%s'%mode, None)
        if not tocxml:
            logging.error("getTocPage: unable to find tocXML")
            return "Error: no table of contents!"
        
        if size is None:
            size = pageinfo.get('tocPageSize', 30)
            
        if start is None:
            # BUGFIX: start is 1-based (see first = (start-1)*2 below); the
            # old default (pn-1)*size was 0-based and made page 1 show only
            # the last entry (del fulltoc[:-2])
            start = (pn - 1) * size + 1

        fulltoc = ET.fromstring(tocxml)
        
        if fulltoc is not None:
            # paginate: each TOC entry is two divs
            # (avoid shadowing the builtin len)
            first = (start - 1) * 2
            count = size * 2
            del fulltoc[:first]
            del fulltoc[count:]
            tocdivs = fulltoc
            
            # check all a-tags
            links = tocdivs.findall(".//a")
            for l in links:
                href = l.get('href')
                if href:
                    # take pn from href
                    m = re.match(r'page-fragment\.xql.*pn=(\d+)', href)
                    if m is not None:
                        # and create new url (assuming parent is documentViewer)
                        url = self.getLink('pn', m.group(1))
                        l.set('href', url)
                    else:
                        logging.warning("getTocPage: Problem with link=%s"%href)
                        
            # fix two-divs-per-row with containing div
            newtoc = ET.Element('div', {'class':'queryResultPage'})
            for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]):
                e = ET.Element('div',{'class':'tocline'})
                e.append(d1)
                e.append(d2)
                newtoc.append(e)
                
            return serialize(newtoc)
        
        return "ERROR: no table of contents!"
    
    
    def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
        """ZMI action: change settings and redirect back to manage_main."""
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
        
# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Render the ZMI form for adding a MpdlXmlTextServer."""
    template = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals())
    # wrap in acquisition context and render
    return template.__of__(self)()

def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """ZMI factory: create a MpdlXmlTextServer and add it to the destination folder."""
    # BUGFIX: the constructor signature is (id, title, serverUrl, serverName,
    # timeout) - passing timeout positionally landed in serverName, which
    # overrode serverUrl with "http://40/mpdl/interface/"
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')