view @ 630:25295ceb11b1

updated annotation templates to new digilib-annotator version 1.3.5
author casties
date Thu, 04 Jun 2015 16:23:29 +0200
parents c57d80a649ea
children 4a75a760def2
line wrap: on
line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from datetime import datetime

from SrvTxtUtils import getInt, getText, getHttpData, serialize

# mapping of fields in the output of /mpiwg-mpdl-cms-web/query/GetDocInfo to documentViewer docinfo
textinfoFieldMap = {
    # key: tag name in the GetDocInfo output, value: key used in docinfo
    'countPages': 'numTextPages',
    'countFigures': 'numFigureEntries',
    'countNotesHandwritten': 'numHandwritten',
    'countNotes': 'numNotes',
    'countPlaces': 'numPlaces',
    'countTocEntries': 'numTocEntries',
}

class MpiwgXmlTextServer(SimpleItem):
    """TextServer implementation for MPIWG-XML server"""
    # type label of this object class (presumably shown in the Zope management interface)
    meta_type="MPIWG-XML TextServer"

    # page template for the "change settings" form (see manage_changeMpiwgXmlTextServer)
    manage_changeMpiwgXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpiwgXmlTextServer", globals())
    def __init__(self,id,title="",serverUrl="", timeout=40, serverName=None, repositoryType='production'):
        self.timeout = timeout
        self.repositoryType = repositoryType
        if serverName is None:
            self.serverUrl = serverUrl
            self.serverUrl = "http://%s/mpiwg-mpdl-cms-web/"%serverName
    def getHttpData(self, url, data=None):
        """Requests url (with optional data) using this server's timeout and returns the result"""
        response = getHttpData(url, data, timeout=self.timeout)
        return response
    def getServerData(self, method, data=None):
        """Calls method (a path relative to serverUrl) on the text server with data and returns the result"""
        # the request URL is the server base URL followed by the method path
        return getHttpData(self.serverUrl + method, data, timeout=self.timeout)

    def getRepositoryType(self):
        """returns the repository type, e.g. 'production'"""
        return getattr(self, 'repositoryType', None)

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text"""
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml','.'+type)
        url = '%sdoc/GetDocument?id=%s'%(self.serverUrl.replace('interface/',''), docpath)
        return url

    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn"""
        if not 'places' in docinfo:
            self.getTextInfo('places', docinfo)
        allplaces = docinfo.get('places', None)
        if len(allplaces) == 0:
            return []
        # search for places on this page TODO: is there a better way?
        places = [p for p in allplaces if p['pn'] == pn]
        return places
        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        result = dom.findall(".//resultPage/place")
        for l in result:
            id = l.get("id")
            name = l.text
            place = {'id': id, 'name': name}

        return places"""
    def getTextInfo(self, mode=None, docinfo=None):
        """Reads document info, including page concordance, from the text server.

        mode: one of 'pages', 'toc', 'figures', 'notes', 'handwritten' or
            'places' to request the corresponding list; any other value
            (including None) requests only the general counts.
        docinfo: dict with the document path under 'textURLPath'; the results
            are stored in this dict.

        General counts are stored under the names given by textinfoFieldMap,
        the page concordance under 'pageNumbers', places under 'places' and
        the other lists under 'full_<mode>'. Returns docinfo.
        """
        logging.debug("getTextInfo mode=%s"%mode)
        field = ''
        if mode in ['pages', 'toc', 'figures', 'notes', 'handwritten', 'places']:
            # translate mode to field param
            if mode == 'handwritten':
                field = '&field=notesHandwritten'
            else:
                field = '&field=%s'%mode
        else:
            # unknown modes only fetch the general info
            mode = None

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s'%mode in docinfo:
                return docinfo
        else:
            # cached but no toc-request?
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        # fetch docinfo
        pagexml = self.getServerData("query/GetDocInfo","docId=%s%s"%(docpath,field))
        # all info is in the root tag <doc>
        doc = ET.fromstring(pagexml)
        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
            return docinfo

        if mode is None:
            # get general info from system-tag
            sysinfo = doc.find('system')
            if sysinfo is not None:
                for (k,v) in textinfoFieldMap.items():
                    # copy into docinfo (even if empty)
                    docinfo[v] = getInt(getText(sysinfo.find(k)))
            return docinfo

        # result is in list-tag
        l = doc.find('list')
        if l is None:
            return docinfo

        # look for general info
        for (k,v) in textinfoFieldMap.items():
            # copy into docinfo (only if not empty)
            s = doc.find(k)
            if s is not None:
                docinfo[v] = getInt(getText(s))

        lt = l.get('type')
        # pageNumbers
        if lt == 'pages':
            # contains tags with page numbers
            # <item n="14" o="2" o-norm="2" file="0014"/>
            # n=scan number, o=original page no, on=normalized original page no
            # pageNumbers is a dict indexed by scan number
            pages = {}
            for i in l:
                pn = getInt(i.get('n'))
                page = {'pn': pn,
                        'no': i.get('o'),
                        'non': i.get('o-norm')}
                if pn > 0:
                    pages[pn] = page
            docinfo['pageNumbers'] = pages

        # toc
        elif lt in ['toc', 'figures', 'notes', 'notesHandwritten']:
            # contains tags with table of contents/figures
            # <item n="2.1." lv="2">CAP.I. <ref o="119">132</ref></item>
            tocs = []
            for te in l:
                if te.tag != 'item':
                    continue
                ref = te.find('ref')
                if ref is None:
                    # an entry without page reference can not be linked
                    logging.warning("getTextInfo: toc item without ref!")
                    continue
                toc = {'level-string': te.get('n'),
                       'level': te.get('lv'),
                       # items may have no text of their own
                       'content': (te.text or '').strip(),
                       'pn': getInt(ref.text),
                       'no': ref.get('o'),
                       'non': ref.get('o-norm')}
                tocs.append(toc)
            # save as full_toc/full_figures
            docinfo['full_%s'%mode] = tocs

        # places
        elif lt in ['places']:
            # contains tags with place-ids
            # <item id="N40004F-01"><ref>4</ref></item>
            places = []
            for p in l:
                if p.tag != 'item':
                    continue
                ref = p.find('ref')
                if ref is None:
                    logging.warning("getTextInfo: place item without ref!")
                    continue
                places.append({'id': p.get('id'),
                               'pn': getInt(ref.text)})
            docinfo['places'] = places

        return docinfo
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """Returns a single page from the fulltext as serialized markup.

        mode: comma-separated list of modes. 'search' adds the highlighting
            parameters from pageinfo to the request, 'pundit' adds pundit
            attributes to the result. Of 'dict', 'xml' and 'gis' only the
            first one found is used; plain text is the default.
        pn: number of the page to fetch.
        docinfo: dict with document info (must contain 'textURLPath').
        pageinfo: dict with page info; a cached page under 'textPage' is
            returned directly.

        Returns None if there is no document path, the page can not be read
        or parsed, or the expected content element is missing.
        """
        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        startTime = datetime.now()
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        textParams = {'docId': docpath,
                      'page': pn}

        normMode = pageinfo.get('characterNormalization', 'reg')
        # TODO: change values in form
        if normMode == 'regPlusNorm':
            normMode = 'norm'

        # TODO: this should not be necessary when the backend is fixed
        #textParams['normalization'] = normMode

        if not mode:
            # default is text
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElem'] = pageinfo.get('highlightElement', '')
                textParams['highlightElemPos'] = pageinfo.get('highlightElementPos', '')

        # pundit and gis mode only post-process the page
        punditMode = 'pundit' in modes
        gisMode = False

        # other modes don't combine
        if 'dict' in modes:
            textmode = 'dict'
            textParams['outputFormat'] = 'html'
        elif 'xml' in modes:
            textmode = 'xml'
            textParams['outputFormat'] = 'xmlDisplay'
            normMode = 'orig'
        elif 'gis' in modes:
            gisMode = True
            # gis mode uses plain text
            textmode = 'plain'
            textParams['outputFormat'] = 'html'
        else:
            # text is default mode
            textmode = 'plain'
            textParams['outputFormat'] = 'html'

        try:
            # fetch the page
            pagexml = self.getServerData("query/GetPage",urllib.urlencode(textParams))
            dom = ET.fromstring(pagexml)
        except Exception as e:
            logging.error("Error reading page: %s"%e)
            return None

        # plain text or text-with-links mode
        if textmode == 'plain' or textmode == 'dict':
            # the text is in div@class=text
            pagediv = dom.find(".//div[@class='text']")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                # add textmode and normMode classes
                #pagediv.set('class', 'text %s %s'%(textmode, normMode))
                self._processWTags(textmode, normMode, pagediv)
                #self._processPbTag(pagediv, pageinfo)
                self._processFigures(pagediv, docinfo)
                # check all a-tags
                for l in pagediv.findall('.//a'):
                    href = l.get('href')
                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # add target to open new page
                            l.set('target', '_blank')

                if punditMode:
                    self._addPunditAttributes(pagediv, pageinfo, docinfo)

                if gisMode:
                    self._addGisTags(pagediv, pageinfo, docinfo)

                s = serialize(pagediv)
                logging.debug("getTextPage done in %s"%(datetime.now()-startTime))
                return s

        # xml mode
        elif textmode == "xml":
            # the text is in body
            pagediv = dom.find(".//body")
            logging.debug("pagediv: %s"%repr(pagediv))
            if pagediv is not None:
                return serialize(pagediv)

        logging.error("getTextPage: error in text mode %s or in text!"%(textmode))
        return None

    def _processWTags(self, textMode, normMode, pagediv):
        """selects the necessary information from w-spans and removes the rest from pagediv"""
        startTime =
        wtags = pagediv.findall(".//span[@class='w']")
        for wtag in wtags:
            if textMode == 'dict':
                # delete non-a-tags
                wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                # delete non-matching children of a-tag and suppress remaining tag name
                atag = wtag.find("*[@class='dictionary']")
                if normMode == 'orig':
                    atag.find("span[@class='orig']").tag = None
                elif normMode == 'reg':
                    atag.find("span[@class='reg']").tag = None
                elif normMode == 'norm':
                    atag.find("span[@class='norm']").tag = None
                # delete a-tag
                # delete non-matching children and suppress remaining tag name
                if normMode == 'orig':
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary orig']").tag = None
                elif normMode == 'reg':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary norm']"))
                    wtag.find("span[@class='nodictionary reg']").tag = None
                elif normMode == 'norm':
                    wtag.remove(wtag.find("span[@class='nodictionary orig']"))
                    wtag.remove(wtag.find("span[@class='nodictionary reg']"))
                    wtag.find("span[@class='nodictionary norm']").tag = None
            # suppress w-tag name
            wtag.tag = None
        logging.debug("processWTags in %s"%(
        return pagediv
    def _processPbTag(self, pagediv, pageinfo):
        """extracts information from pb-tag and removes it from pagediv"""
        pbdiv = pagediv.find(".//span[@class='pb']")
        if pbdiv is None:
            logging.warning("getTextPage: no pb-span!")
            return pagediv
        # extract running head
        rh = pbdiv.find(".//span[@class='rhead']")
        if rh is not None:
            pageinfo['pageHeaderTitle'] = getText(rh)
        # remove pb-div from parent
        ppdiv = pagediv.find(".//span[@class='pb']/..")
        return pagediv
    def _addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """add about-attributes to divs for pundit annotation tool"""
        textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
        pn = pageinfo.get('pn', '1')
        # check all div-tags
        divs = pagediv.findall(".//div")
        for d in divs:
            id = d.get('id')
            if id:
                # TODO: check path (cf RFC2396)
                d.set('about', ""%(textid,pn,id))
                cls = d.get('class','')
                cls += ' pundit-content'
                d.set('class', cls.strip())

        return pagediv

    def _addGisTags(self, pagediv, pageinfo, docinfo):
        """add links for gis places"""
        # use last part of documentPath as db-id
        docpath = docinfo.get('documentPath', '')
        textid = docpath.split('/')[-1]
        # add our URL as backlink
        selfurl = self.getLink()
        doc = base64.b64encode(selfurl)
        # check all span@class=place
        spans = pagediv.findall(".//span[@class='place']")
        for s in spans:
            id = s.get('id')
            if id:
                # make links like
                s.tag = 'a'
                # TODO: make links configurable
                url = ""%(textid,id,doc)
                s.set('href', url)
                s.set('target', '_blank')

        return pagediv

    def _processFigures(self, pagediv, docinfo):
        """processes figure-tags"""
        # unfortunately etree can not select class.startswith('figure')
        divs = pagediv.findall(".//span[@class]")
        scalerUrl = docinfo['digilibScalerUrl']
        viewerUrl = docinfo['digilibViewerUrl']
        for d in divs:
            if not d.get('class').startswith('figure'):
                a = d.find('a')
                img = a.find('img')
                imgsrc = img.get('src')
                imgurl = urlparse.urlparse(imgsrc)
                imgq = imgurl.query
                imgparams = urlparse.parse_qs(imgq)
                fn = imgparams.get('fn', None)
                if fn is not None:
                    # parse_qs puts parameters in lists
                    fn = fn[0]
                    # TODO: check valid path
                    # fix img@src
                    newsrc = '%s?fn=%s&dw=200&dh=200'%(scalerUrl,fn)
                    img.set('src', newsrc)
                    # fix a@href
                    newlink = '%s?fn=%s'%(viewerUrl,fn)
                    a.set('href', newlink)
                    a.set('target', '_blank')
                logging.warn("processFigures: strange figure!")

    def _cleanSearchResult(self, pagediv):
        """fixes search result html (change pbs and figures)"""
        # replace figure-tag with figureNumText
        for fig in pagediv.findall(".//span[@class='figure']"):
            txt = fig.findtext(".//span[@class='figureNumText']")
            tail = fig.tail
            fig.set('class', 'figure')
            fig.text = txt
            fig.tail = tail
        # replace lb-tag with "//"
        for lb in pagediv.findall(".//br[@class='lb']"):
            lb.tag = 'span'
            lb.text = '//'
        # replace pb-tag with "///"
        for pb in pagediv.findall(".//span[@class='pb']"):
            tail = pb.tail
            pb.set('class', 'pb')
            pb.text = '///'
            pb.tail = tail
        return pagediv
    def _cleanSearchResult2(self, pagediv):
        """fixes search result html (change pbs and figures)"""
        # unfortunately etree can not select class.startswith('figure')
        divs = pagediv.findall(".//span[@class]")
        for d in divs:
            cls = d.get('class')
            if cls.startswith('figure'):
                # replace figure-tag with figureNumText
                txt = d.findtext(".//span[@class='figureNumText']")
                d.set('class', 'figure')
                d.text = txt
            elif cls.startswith('pb'):
                # replace pb-tag with "//"
                d.set('class', 'pb')
                d.text = '//'
        return pagediv

    def _fixEmptyDivs(self, pagediv):
        """fixes empty div-tags by inserting a space"""
        divs = pagediv.findall('.//div')
        for d in divs:
            if len(d) == 0 and not d.text:
                # make empty divs non-empty
                d.text = ' '
        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """Loads the list of search results and stores it in docinfo.

        mode: search mode; "none" returns docinfo unchanged.
        query: query string sent to the text server.
        pageinfo: dict with page info ('characterNormalization' is read).
        docinfo: dict with document info (must contain 'textURLPath').

        The hits are stored as a list of elements under docinfo['results'],
        their number under docinfo['resultSize'] and the key of the query
        under docinfo['cachedQuery']. If the same query is already cached
        the server is not asked again. Returns docinfo.
        """
        normMode = pageinfo.get('characterNormalization', 'reg')
        logging.debug("getSearchResults mode=%s query=%s norm=%s"%(mode, query, normMode))
        if mode == "none":
            return docinfo

        #TODO: put mode into query
        queryKey = '%s_%s_%s'%(mode,query,normMode)
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == queryKey:
                # same query
                return docinfo

            # different query: forget the old results
            docinfo.pop('resultSize', None)
            docinfo.pop('results', None)

        # cache query
        docinfo['cachedQuery'] = queryKey

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'docId': docpath,
                  'query': query,
                  'pageSize': 1000,
                  'page': 1,
                  'outputFormat': 'html'}
        pagexml = self.getServerData("query/QueryDocument",urllib.urlencode(params))
        results = []
        try:
            dom = ET.fromstring(pagexml)
            # clean html output
            self._processWTags('plain', normMode, dom)
            # page content is currently in multiple <td align=left>
            for div in dom.findall(".//tr[@class='hit']"):
                # change tr to div
                div.tag = 'div'
                # change td to span
                for d in div.findall('td'):
                    d.tag = 'span'

                # TODO: can we put etree in the session?
                results.append(div)

        except Exception as e:
            logging.error("GetSearchResults: Error parsing search result: %s"%e)

        # store results in docinfo
        docinfo['resultSize'] = len(results)
        docinfo['results'] = results

        return docinfo

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Returns a single page from the list of search results.

        mode, query: passed to getSearchResults.
        pn: number of the result page (counting from 1); only used when
            start is None.
        start: position of the first result on the page (counting from 1).
        size: number of results per page; defaults to
            pageinfo['resultPageSize'] or 10.
        pageinfo, docinfo: dicts with page and document info.

        The links in the results are rewritten with self.getLink(). Returns
        the serialized page, or an error message if there are no results.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)
        resultxml = docinfo.get('results', None)
        if not resultxml:
            logging.error("getResultPage: unable to find results")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start counts from 1 (see first below)
            if pn is None:
                start = 1
            else:
                start = (pn - 1) * size + 1

        # paginate
        first = max(start - 1, 0)
        last = first + size
        toc = ET.Element('div', attrib={'class':'queryResultPage'})
        for div in resultxml[first:last]:
            # check all a-tags
            for l in div.findall(".//a"):
                href = l.get('href')
                if not href:
                    continue
                # assume all links go to pages
                linkUrl = urlparse.urlparse(href)
                linkParams = urlparse.parse_qs(linkUrl.query)
                # take some parameters (make sure it works even if the link was already parsed)
                params = {'pn': linkParams.get('page',linkParams.get('pn', None)),
                          'highlightQuery': linkParams.get('highlightQuery',None),
                          'highlightElement': linkParams.get('highlightElem',linkParams.get('highlightElement',None)),
                          'highlightElementPos': linkParams.get('highlightElemPos',linkParams.get('highlightElementPos',None))
                          }
                if not params['pn']:
                    logging.warning("getResultsPage: link has no page: %s"%href)

                url = self.getLink(params=params)
                l.set('href', url)

            toc.append(div)

        return serialize(toc)

    def getToc(self, mode='text', docinfo=None):
        """returns list of table of contents from docinfo"""
        logging.debug("getToc mode=%s"%mode)
        if mode == 'text':
            queryType = 'toc'
            queryType = mode
        if not 'full_%s'%queryType in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)
        return docinfo.get('full_%s'%queryType, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents"""
        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if len(fulltoc) < 1:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"        
        if size is None:
            size = pageinfo.get('tocPageSize', 30)
        if start is None:
            start = (pn - 1) * size

        # paginate
        first = (start - 1)
        last = first + size
        tocs = fulltoc[first:last]
        tp = '<div>'
        label = {'figures': 'Figure', 'notes': 'Note', 'handwritten': 'Handwritten note'}.get(mode, 'Item')
        for toc in tocs:
            pageurl = self.getLink('pn', toc['pn'])
            tp += '<div class="tocline">'
            content = toc['content']
            lvs = toc['level-string']
            if content:
                tp += '<div class="toc name">[%s] %s</div>'%(lvs, toc['content'])
            elif lvs:
                tp += '<div class="toc name">[%s %s]</div>'%(label, lvs)
                tp += '<div class="toc name">[%s]</div>'%(label)
            if toc.get('no', None):
                tp += '<div class="toc page"><a href="%s">Page: %s (%s)</a></div>'%(pageurl, toc['pn'], toc['no'])
                tp += '<div class="toc page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn'])
            tp += '</div>\n'
        tp += '</div>\n'
        return tp
    def manage_changeMpiwgXmlTextServer(self,title="",serverUrl="",timeout=40,repositoryType=None,RESPONSE=None):
        """change settings"""
        self.timeout = timeout
        self.serverUrl = serverUrl
        if repositoryType:
            self.repositoryType = repositoryType
        if RESPONSE is not None:
# management methods
def manage_addMpiwgXmlTextServerForm(self):
    """Renders the form for adding a MpiwgXmlTextServer"""
    template = PageTemplateFile("zpt/manage_addMpiwgXmlTextServer", globals())
    # bind the template to the calling object before rendering it
    boundTemplate = template.__of__(self)
    return boundTemplate()

def manage_addMpiwgXmlTextServer(self,id,title="",serverUrl="",timeout=40,RESPONSE=None):
    """add MpiwgXmlTextServer"""
    newObj = MpiwgXmlTextServer(id=id,title=title,serverUrl=serverUrl,timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None: