view MpiwgXmlTextServer.py @ 577:9251719154a3

toc with list of handwritten notes.
author casties
date Thu, 18 Oct 2012 17:53:09 +0200
parents b2c7e272e075
children fc861a6cef17
line wrap: on
line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from datetime import datetime

from SrvTxtUtils import getInt, getText, getHttpData

def serialize(node):
    """returns a string containing an XML snippet of node.

    The node is serialized with ElementTree using encoding 'UTF-8'. If the
    result starts with an XML declaration, the declaration and any
    whitespace following it are cut off, so the returned string starts
    with the element itself.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration
    # (bytes literals are plain str in Python 2, so behaviour there is unchanged)
    if s.startswith(b'<?xml'):
        end = s.find(b'?>')
        if end >= 0:
            # do not rely on exactly one newline following the declaration
            return s[end + 2:].lstrip()

    return s


class MpiwgXmlTextServer(SimpleItem):
    """TextServer implementation for MPIWG-XML server"""
    meta_type="MPIWG-XML TextServer"

    manage_options=(
        {'label':'Config','action':'manage_changeMpiwgXmlTextServerForm'},
       )+SimpleItem.manage_options
    
    manage_changeMpiwgXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpiwgXmlTextServer", globals())
        
    def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpiwg-mpdl-cms-web/", timeout=40, serverName=None, repositoryType='production'):
        """constructor"""
        self.id=id
        self.title=title
        self.timeout = timeout
        self.repositoryType = repositoryType
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpiwg-mpdl-cms-web/"%serverName
        
    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request.

        Delegates to the module-level getHttpData helper using this
        server's configured timeout.
        """
        response = getHttpData(url, data, timeout=self.timeout)
        return response
    
    def getServerData(self, method, data=None):
        """returns result from text server for method+data.

        method is appended to the configured server URL; the request is
        made with this server's configured timeout.
        """
        target = self.serverUrl + method
        response = getHttpData(target, data, timeout=self.timeout)
        return response


    def getRepositoryType(self):
        """returns the repository type, e.g. 'production'"""
        return getattr(self, 'repositoryType', None)

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text"""
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml','.'+type)
        url = '%sdoc/GetDocument?id=%s'%(self.serverUrl.replace('interface/',''), docpath)
        return url


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn.

        Each place is a dict with the keys 'id' and 'name', taken from the
        place elements in the server's answer. Returns None if docinfo has
        no textURLPath.
        """
        #FIXME!
        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        query = "document=%s&xpath=//place&pn=%s"%(docpath,pn)
        dom = ET.fromstring(self.getServerData("xpath.xql", query))
        return [{'id': elem.get("id"), 'name': elem.text}
                for elem in dom.findall(".//resultPage/place")]
    
          
    def getTextInfo(self, mode=None, docinfo=None):
        """reads document info, including page concordance, from text server.

        mode selects what is fetched: 'pages' stores the page concordance in
        docinfo['pageNumbers'], 'toc', 'figures' and 'handwritten' store a
        list of entries in docinfo['full_<mode>'], any other value fetches
        the general counts (numTextPages, numFigureEntries, numHandwritten,
        numTocEntries).

        Information already present in docinfo is not fetched again.
        Returns docinfo (modified in place).
        """
        logging.debug("getTextInfo mode=%s"%mode)
        
        field = ''
        if mode in ['pages', 'toc', 'figures', 'handwritten']:
            # translate mode to field param
            field = '&field=%s'%mode
        else:
            mode = None

        # check cached info
        if mode:
            # the page concordance is stored as 'pageNumbers', all lists as 'full_<mode>'
            if mode == 'pages':
                cacheKey = 'pageNumbers'
            else:
                cacheKey = 'full_%s'%mode

            if cacheKey in docinfo:
                return docinfo
            
        else:
            # cached but no toc-request?
            if 'numTextPages' in docinfo:
                return docinfo
                
        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo
                
        # fetch docinfo; all info is in the root tag <doc>
        pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field))
        doc = ET.fromstring(pagexml)

        if mode is None:
            # get general info from system-tag
            sys = doc.find('system')
            if sys is not None:
                docinfo['numTextPages'] = getInt(getText(sys.find('countPages'))) 
                docinfo['numFigureEntries'] = getInt(getText(sys.find('countFigures'))) 
                docinfo['numHandwritten'] = getInt(getText(sys.find('countHandwritten'))) 
                docinfo['numTocEntries'] = getInt(getText(sys.find('countTocEntries'))) 

            return docinfo

        # result is in list-tag
        l = doc.find('list')
        if l is None:
            return docinfo

        lt = l.get('type')
        # pageNumbers
        if lt == 'pages':
            # contains tags with page numbers
            # <item n="14" o="2" o-norm="2" file="0014"/>
            # n=scan number, o=original page no, on=normalized original page no
            # pageNumbers is a dict indexed by scan number
            pages = {}
            for i in l:
                pn = getInt(i.get('n'))
                if pn > 0:
                    pages[pn] = {'pn': pn,
                                 'no': i.get('o'),
                                 'non': i.get('o-norm')}
                
            docinfo['pageNumbers'] = pages
                        
        # toc
        elif lt == 'toc' or lt == 'figures' or lt == 'handwritten':
            # contains tags with table of contents/figures
            # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
            tocs = []
            for te in l:
                if te.tag != 'item':
                    continue

                ref = te.find('ref')
                if ref is None:
                    # an entry without page reference can not be linked
                    logging.warning("getTextInfo: item without ref: n=%s"%te.get('n'))
                    continue

                toc = {}
                toc['level-string'] = te.get('n')
                toc['level'] = te.get('lv')
                # items may have no text before the ref-tag (text is None then)
                toc['content'] = (te.text or '').strip()
                toc['pn'] = getInt(ref.text)
                toc['no'] = ref.get('o')
                toc['non'] = ref.get('o-norm')
                tocs.append(toc)
            
            # save as full_toc/full_figures
            docinfo['full_%s'%mode] = tocs

        return docinfo
        
          
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext.

        mode is a comma-separated list of layers: 'search' adds the
        highlighting parameters from pageinfo to the request, 'pundit' adds
        annotation attributes to the result and the first of 'dict', 'xml'
        or 'gis' selects the text mode (plain text is the default).

        Returns pageinfo['textPage'] if it is present, otherwise the
        serialized page element from the text server. Returns None if
        docinfo has no textURLPath, if fetching or parsing the page fails or
        if the expected element is not found in the answer.
        """
        
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        startTime = datetime.now()
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']
        
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None
        
        # just checking (a missing 'current' should not break the request)
        if pageinfo.get('current', None) != pn:
            logging.warning("getTextPage: current!=pn!")
            
        # parameters for the text server request
        textParams = {'docId': docpath,
                      'page': pn}
        
        normMode = pageinfo.get('characterNormalization', 'reg')
        # TODO: change values in form
        if normMode == 'regPlusNorm':
            normMode = 'norm'
        
        # TODO: this should not be necessary when the backend is fixed                
        textParams['normalization'] = normMode
        
        if not mode:
            # default is text
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)
                        
        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElem'] = pageinfo.get('highlightElement', '')
                textParams['highlightElemPos'] = pageinfo.get('highlightElementPos', '')
                
            # ignore mode in the following
            modes.remove('search')
                            
        # pundit mode
        punditMode = 'pundit' in modes
        if punditMode:
            # ignore mode in the following
            modes.remove('pundit')
                            
        # other modes don't combine
        if 'dict' in modes:
            textmode = 'dict'
            textParams['outputFormat'] = 'html'
        elif 'xml' in modes:
            textmode = 'xml'
            textParams['outputFormat'] = 'xmlDisplay'
            # NOTE(review): this has no effect on the request, the
            # normalization parameter has already been set above -- confirm
            # whether 'orig' should be sent to the server in xml mode
            normMode = 'orig'
        elif 'gis' in modes:
            #FIXME!
            textmode = 'gis'
        else:
            # text is default mode
            textmode = 'plain'
            textParams['outputFormat'] = 'html'
        
        try:
            # fetch the page
            pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception as e:
            logging.error("Error reading page: %s"%e)
            return None
        
        # plain text or text-with-links mode
        if textmode == "plain" or textmode == "dict":
            # the text is in div@class=text
            pagediv = dom.find(".//div[@class='text']")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                # add textmode and normMode classes
                pagediv.set('class', 'text %s %s'%(textmode, normMode))
                self._processWTags(textmode, normMode, pagediv)
                #self._processPbTag(pagediv, pageinfo)
                self._processFigures(pagediv, docinfo)
                #self._fixEmptyDivs(pagediv)
                # check all a-tags
                for l in pagediv.findall('.//a'):
                    href = l.get('href')
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # add target to open new page
                            l.set('target', '_blank')
                        
                if punditMode:
                    self._addPunditAttributes(pagediv, pageinfo, docinfo)
                    
                # TODO: move empty page text
                ep = dom.find(".//div[@class='emptyPage']")
                if ep is not None:
                    pagediv.append(ep)
                 
                s = serialize(pagediv)
                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))    
                return s
            
        # xml mode (the former 'pureXml' mode could never be selected above
        # and did exactly the same)
        elif textmode == "xml":
            # the text is in body
            pagediv = dom.find(".//body")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                return serialize(pagediv)
                  
        # gis mode FIXME!
        elif textmode == "gis":
            # the text is in div@class=text
            pagediv = dom.find(".//div[@class='text']")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                # fix empty div tags
                self._fixEmptyDivs(pagediv)
                # add our URL (assuming documentViewer is parent) as backlink
                doc = base64.b64encode(self.getLink())
                # check all a-tags
                for l in pagediv.findall(".//a"):
                    href = l.get('href')
                    if href and href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                        l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href))
                        l.set('target', '_blank')
                            
                return serialize(pagediv)
                    
        logging.error("getTextPage: error in text mode %s or text!"%(textmode))
        return None

    def _processWTags(self, textMode, normMode, pagediv):
        """selects the necessary information from w-spans and removes the rest from pagediv"""
        logging.debug("processWTags(textMode=%s,norm=%s,pagediv"%(repr(textMode),repr(normMode)))
        startTime = datetime.now()
        wtags = pagediv.findall(".//span[@class='w']")
        for wtag in wtags:
            if textMode == 'dict':
                # delete non-a-tags
                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                # delete non-matching children of a-tag and suppress remaining tag name
                atag = wtag.find("a[@class='dictionary']")
                if normMode == 'orig':
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='orig']").tag = None
                elif normMode == 'reg':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='norm']"))
                    atag.find("span[@class='reg']").tag = None
                elif normMode == 'norm':
                    atag.remove(atag.find("span[@class='orig']"))
                    atag.remove(atag.find("span[@class='reg']"))
                    atag.find("span[@class='norm']").tag = None
                    
            else:
                # delete a-tag
                wtag.remove(wtag.find("a[@class='dictionary']"))
                # delete non-matching children and suppress remaining tag name
                if normMode == 'orig':
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary orig']").tag = None
                elif normMode == 'reg':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary reg']").tag = None
                elif normMode == 'norm':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.find("span[@class='nodictionary norm']").tag = None
                
            # suppress w-tag name
            wtag.tag = None
            
        logging.debug("processWTags in %s"%(datetime.now()-startTime))
        return pagediv
        
    def _processPbTag(self, pagediv, pageinfo):
        """extracts information from pb-tag and removes it from pagediv"""
        pbdiv = pagediv.find(".//span[@class='pb']")
        if pbdiv is None:
            logging.warning("getTextPage: no pb-span!")
            return pagediv
        
        # extract running head
        rh = pbdiv.find(".//span[@class='rhead']")
        if rh is not None:
            pageinfo['pageHeaderTitle'] = getText(rh)
            
        # remove pb-div from parent
        ppdiv = pagediv.find(".//span[@class='pb']/..")
        ppdiv.remove(pbdiv)        
        return pagediv
    
    def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """add about attributes for pundit annotation tool"""
        textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
        pn = pageinfo.get('pn', '1')
        #  TODO: use pn as well?
        # check all div-tags
        divs = pagediv.findall(".//div")
        for d in divs:
            id = d.get('id')
            if id:
                # TODO: check path (cf RFC2396)
                d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,id))
                cls = d.get('class','')
                cls += ' pundit-content'
                d.set('class', cls.strip())

        return pagediv

    def _processFigures(self, pagediv, docinfo):
        """processes figure-tags"""
        # unfortunately etree can not select class.startswith('figure')
        divs = pagediv.findall(".//span[@class]")
        scalerUrl = docinfo['digilibScalerUrl']
        viewerUrl = docinfo['digilibViewerUrl']
        for d in divs:
            if not d.get('class').startswith('figure'):
                continue
            
            try:
                a = d.find('a')
                img = a.find('img')
                imgsrc = img.get('src')
                imgurl = urlparse.urlparse(imgsrc)
                imgq = imgurl.query
                imgparams = urlparse.parse_qs(imgq)
                fn = imgparams.get('fn', None)
                if fn is not None:
                    # parse_qs puts parameters in lists
                    fn = fn[0]
                    # TODO: check valid path
                    # fix img@src
                    newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn)
                    img.set('src', newsrc)
                    # fix a@href
                    newlink = '%s?fn=%s'%(viewerUrl,fn)
                    a.set('href', newlink)
                    a.set('target', '_blank')
                    
            except:
                logging.warn("processFigures: strange figure!")
                
    
    def _fixEmptyDivs(self, pagediv):
        """fixes empty div-tags by inserting a space"""
        divs = pagediv.findall('.//div')
        for d in divs:
            if len(d) == 0 and not d.text:
                # make empty divs non-empty
                d.text = ' '
  
        return pagediv


    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo.

        The hits are stored as a list of elements in docinfo['results'],
        their number in docinfo['resultSize'] and the mode and query they
        belong to in docinfo['cachedQuery']. Nothing is fetched if mode is
        "none" or if docinfo already holds the results for the same mode and
        query. Returns docinfo.
        """
        
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo
              
        #TODO: put mode into query
        
        cacheKey = '%s_%s'%(mode,query)
        if docinfo.get('cachedQuery', None) == cacheKey:
            # cached search result for the same query
            return docinfo
            
        # different query: drop the old results (which may be incomplete)
        for key in ('cachedQuery', 'resultSize', 'results'):
            docinfo.pop(key, None)
        
        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'docId': docpath,
                  'query': query,
                  'pageSize': 1000,
                  'page': 1,
                  'outputFormat': 'html'}
        pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
        results = []
        try:
            dom = ET.fromstring(pagexml)
            # page content is currently in multiple <td align=left>
            for div in dom.findall(".//tr[@class='hit']"):
                # change tr to div
                div.tag = 'div'
                # change td to span
                for d in div.findall('td'):
                    d.tag = 'span'
                    
                # TODO: can we put etree in the session?
                results.append(div)
        
        except Exception as e:
            logging.error("GetSearchResults: Error parsing search result: %s"%e)
                
        # store results in docinfo
        docinfo['resultSize'] = len(results)
        docinfo['results'] = results
        # mark the query as cached only now that the results are stored, so
        # a failed request does not leave a cache key without results
        docinfo['cachedQuery'] = cacheKey

        return docinfo
    

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents"""
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
            
        resultxml = docinfo.get('results', None)
        if not resultxml:
            logging.error("getResultPage: unable to find results")
            return "Error: no result!"
        
        if size is None:
            size = pageinfo.get('resultPageSize', 10)
            
        if start is None:
            start = (pn - 1) * size

        if resultxml is not None:
            # paginate
            first = start-1
            last = first+size
            tocdivs = resultxml[first:last]
            
            toc = ET.Element('div', attrib={'class':'queryResultPage'})
            for div in tocdivs:
                # check all a-tags
                links = div.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href:
                        # assume all links go to pages
                        linkUrl = urlparse.urlparse(href)
                        linkParams = urlparse.parse_qs(linkUrl.query)
                        # take some parameters (make sure it works even if the link was already parsed)
                        params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
                                  'highlightQuery': linkParams.get('highlightQuery',None),
                                  'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
                                  'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
                                  }
                        if not params['pn']:
                            logging.warn("getResultsPage: link has no page: %s"%href)
                            
                        url = self.getLink(params=params)
                        l.set('href', url)
                        
                toc.append(div)
                        
            return serialize(toc)
        
        return "ERROR: no results!"


    def getToc(self, mode='text', docinfo=None):
        """returns list of table of contents from docinfo.

        mode 'text' selects the table of contents, any other mode is used
        as the name of the list. The list is read via getTextInfo if it is
        not yet in docinfo; an empty list is returned if it is still
        missing afterwards.
        """
        logging.debug("getToc mode=%s"%mode)
        queryType = 'toc' if mode == 'text' else mode
        key = 'full_%s'%queryType
        if key not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get(key, [])


    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents"""
        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if len(fulltoc) < 1:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"        
        
        if size is None:
            size = pageinfo.get('tocPageSize', 30)
            
        if start is None:
            start = (pn - 1) * size

        # paginate
        first = (start - 1)
        last = first + size
        tocs = fulltoc[first:last]
        tp = '<div>'
        label = {'figures': 'Figure', 'handwritten': 'Handwritten note'}.get(mode, 'Item')
        for toc in tocs:
            pageurl = self.getLink('pn', toc['pn'])
            tp += '<div class="tocline">'
            content = toc['content']
            if content:
                tp += '<div class="toc name">[%s] %s</div>'%(toc['level-string'], toc['content'])
            else:
                tp += '<div class="toc name">[%s %s]</div>'%(label, toc['level-string'])
            
            if toc.get('no', None):
                tp += '<div class="toc page"><a href="%s">Page: %s (%s)</a></div>'%(pageurl, toc['pn'], toc['no'])
            else:
                tp += '<div class="toc page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn'])
                
            tp += '</div>\n'
            
        tp += '</div>\n'
        
        return tp
           
    
    def manage_changeMpiwgXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None):
        """change settings"""
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if repositoryType:
            self.repositoryType = repositoryType
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')
        
# management methods
def manage_addMpiwgXmlTextServerForm(self):
    """Form for adding.

    Renders the page template zpt/manage_addMpiwgXmlTextServer in the
    context of self.
    """
    template = PageTemplateFile("zpt/manage_addMpiwgXmlTextServer", globals())
    return template.__of__(self)()

def manage_addMpiwgXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """add MpiwgXmlTextServer.

    Creates a new MpiwgXmlTextServer with the given settings and stores it
    under id in the object returned by self.Destination(). Redirects to
    'manage_main' if a RESPONSE is given.
    """
    server = MpiwgXmlTextServer(id=id,title=title,serverUrl=serverUrl,timeout=timeout)
    container = self.Destination()
    container._setObject(id, server)
    if RESPONSE is None:
        return

    RESPONSE.redirect('manage_main')