view HocrTextServer.py @ 617:7aefbddddaf9

alpha of hocr server support
author dwinter
date Wed, 23 Jul 2014 17:36:04 +0200
parents
children
line wrap: on
line source

from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile 

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from HocrTxtUtils import getInt, getText, getHttpData

def serialize(node):
    """Return node serialized as an XML snippet without XML declaration.

    node is an ElementTree element. The result is whatever ET.tostring
    produces for the 'UTF-8' encoding (a byte string), minus a leading
    "<?xml ...?>" declaration and the whitespace that follows it.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration: cut at the real end of the declaration
    # instead of relying on a fixed number of characters after "?>"
    if s.startswith(b'<?xml'):
        end = s.find(b'?>')
        if end >= 0:
            return s[end + 2:].lstrip()

    return s


class HocrTextServer(SimpleItem):
    """TextServer implementation that fetches texts from a hOCR server over HTTP.

    NOTE(review): the code looks adapted from the MPDL-XML eXist text
    server; several methods still request *.xql scripts -- confirm which
    of them the hOCR backend actually supports.
    """
    # type name of this object class
    meta_type="Hocr TextServer"

    # put a 'Config' tab in front of the management tabs of SimpleItem
    manage_options=(
        {'label':'Config','action':'manage_changeHocrTextServerForm'},
       )+SimpleItem.manage_options
    
    # page template behind the 'Config' tab
    manage_changeHocrTextServerForm = PageTemplateFile("zpt/manage_changeHocrTextServer", globals())
        
    def __init__(self,id,title="",serverUrl="http://localhost:8080/hocr", timeout=40, repositoryType='production'):
        """Store id, title and the connection settings of the text server.

        serverUrl is the URL requests are sent to, timeout is handed on to
        the HTTP helper and repositoryType is what getRepositoryType()
        reports.
        """
        # object identity
        self.id, self.title = id, title
        # connection settings
        self.serverUrl = serverUrl
        self.timeout = timeout
        self.repositoryType = repositoryType
      
    def getHttpData(self, url, data=None):
        """Fetch url (with optional data) using the configured timeout.

        Thin wrapper around the module-level getHttpData helper; returns
        whatever that helper returns.
        """
        timeout = self.timeout
        return getHttpData(url, data, timeout=timeout)
    
    def getServerData(self, pn, data=None):
        """Send a request for pn with data to the configured text server.

        The request goes to self.serverUrl; pn is handed to the
        module-level getHttpData helper as second positional argument and
        data as keyword argument, together with the configured timeout.

        NOTE(review): callers in this class pass either a page number or a
        script name such as "doc-info.xql" as pn -- confirm what the
        helper expects here.
        """
        return getHttpData(self.serverUrl, pn, data=data, timeout=self.timeout)


    def getRepositoryType(self):
        """Return the stored repository type (e.g. 'production') or None if unset."""
        try:
            return self.repositoryType
        except AttributeError:
            # no repositoryType attribute has been set on this object
            return None

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """Return a URL to download the current text, or None.

        type is the file extension that replaces '.xml' in the document
        path taken from docinfo['textURLPath']. None is returned when
        docinfo is missing or contains no text path.
        """
        # the default docinfo=None used to fail on .get()
        if docinfo is None:
            return None

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        docpath = docpath.replace('.xml', '.' + type)
        # getDoc is addressed relative to the server URL without 'interface/'
        baseurl = self.serverUrl.replace('interface/', '')
        return '%sgetDoc?doc=%s'%(baseurl, docpath)


    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Return the list of GIS places on page pn.

        Each place is a dict with the keys 'id' (id attribute of the place
        element) and 'name' (its text). None is returned when docinfo is
        missing or contains no textURLPath.
        """
        # the default docinfo=None used to fail on .get()
        if docinfo is None:
            return None

        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        return [{'id': place.get("id"), 'name': place.text}
                for place in dom.findall(".//resultPage/place")]
    
          
    def getTextInfo(self, mode='', docinfo=None):
        """Read document info, including page concordance, from the text server.

        mode 'toc' or 'figures' requests the list of entries as well; any
        other value is treated as '' (basic info only).

        The values found in the <document> element of the response are
        stored in docinfo: numTextPages, numFigureEntries, numTocEntries,
        numPlaces, pageNumbers and, for a <toc> element, 'full_<mode>'.
        docinfo is returned. It is returned unchanged when the requested
        information is already in docinfo, when there is no textURLPath,
        or when the request or parsing fails (an error is logged).
        """
        logging.debug("getTextInfo mode=%s"%mode)
        if mode not in ('toc', 'figures', ''):
            mode = ''

        # check cached info: toc-requests are marked by 'full_<mode>',
        # plain requests by 'numTextPages'
        if mode:
            cachekey = 'full_%s'%mode
        else:
            cachekey = 'numTextPages'

        if cachekey in docinfo:
            return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        try:
            # we need to set a result set size
            pagesize = 10000
            pagexml = self.getServerData("doc-info.xql","document=%s&info=%s&pageSize=%s&pn=%s"%(docpath,mode,pagesize,1))
            dom = ET.fromstring(pagexml)
            # all info in tag <document>
            doc = dom.find("document")
        except Exception as e:
            logging.error("getTextInfo: Error reading doc info: %s"%e)
            return docinfo

        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
            return docinfo

        # plain counters: tag name -> key in docinfo
        counters = {'countFigureEntries': 'numFigureEntries',
                    'countTocEntries': 'numTocEntries',
                    'countPlaces': 'numPlaces'}

        # go through all child elements
        for tag in doc:
            name = tag.tag
            if name == 'countPages':
                # only a positive page count is stored
                numpages = getInt(tag.text)
                if numpages > 0:
                    docinfo['numTextPages'] = numpages

            elif name in counters:
                docinfo[counters[name]] = getInt(tag.text)

            elif name == 'pageNumbers':
                docinfo['pageNumbers'] = self._parsePageNumbers(tag)

            elif name == 'toc':
                # save as full_toc/full_figures
                docinfo['full_%s'%mode] = self._parseTocEntries(tag)

        return docinfo

    def _parsePageNumbers(self, tag):
        """Return the page concordance from a pageNumbers element as dict indexed by scan number."""
        # contains tags with page numbers
        # <pn><n>4</n><no>4</no><non/></pn>
        # n=scan number, no=original page no, non=normalized original page no
        pages = {}
        for pn in tag:
            page = {}
            n = 0
            for p in pn:
                if p.tag == 'n':
                    n = getInt(p.text)
                    page['pn'] = n
                elif p.tag in ('no', 'non'):
                    page[p.tag] = p.text

            # entries without a positive scan number are dropped
            if n > 0:
                pages[n] = page

        return pages

    def _parseTocEntries(self, tag):
        """Return the entries of a toc element as list of dicts."""
        # contains tags with table of contents/figures
        # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
        textkeys = ('level', 'content', 'level-string', 'real-level')
        tocs = []
        for te in tag:
            toc = {}
            for t in te:
                if t.tag == 'page':
                    toc['pn'] = getInt(t.text)
                elif t.tag in textkeys:
                    toc[t.tag] = t.text

            tocs.append(toc)

        return tocs
        
          
    def processPageInfo(self, dom, docinfo, pageinfo):
        """Copy page meta data from the pageMeta div of dom into pageinfo.

        The first div found by dom.find("div") must have the class
        'pageMeta'; otherwise an error is logged and nothing is stored.
        The text of its children with class pageNumberOrig,
        pageNumberOrigNorm or pageHeaderTitle is stored in pageinfo under
        the class name. docinfo is not used. Returns None.
        """
        metadiv = dom.find("div")
        found = metadiv is not None and metadiv.get('class', '') == 'pageMeta'
        if not found:
            logging.error("processPageInfo: pageMeta div not found!")
            return

        # class names that are copied verbatim into pageinfo
        wanted = ('pageNumberOrig', 'pageNumberOrigNorm', 'pageHeaderTitle')
        for child in metadiv:
            cls = child.get('class')
            if cls in wanted:
                pageinfo[cls] = child.text

        return
         
           
    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """Return a single page of the fulltext as delivered by the server.

        mode is a comma separated list of layers. 'search' adds the
        highlight parameters from pageinfo to the request; 'dict', 'xml'
        and 'gis' select the backend mode (first match in that order),
        everything else means 'text'. 'pundit' is accepted but has no
        effect on the request.

        A text already present in pageinfo['textPage'] is returned
        without a request. None is returned when docinfo has no
        textURLPath or when the request fails (the error is logged).
        """
        logging.debug("getTextPage Hocr mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        # test before using docpath: a missing textURLPath used to raise
        # AttributeError in replace()
        if not docpath:
            return None

        docpath = docpath.replace("pages","hocr")
        logging.debug("getTextPage docpath= %s"%docpath)

        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # text is default mode
            mode = 'text'

        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # search mode only adds highlighting, it selects no backend mode
        if 'search' in modes:
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif 'xml' in modes:
            # xml mode always uses the original characters
            textmode = 'xml'
            textParams['characterNormalization'] = 'orig'
        elif 'gis' in modes:
            textmode = 'gis'
        else:
            # text is default mode
            textmode = 'text'

        textParams['mode'] = textmode

        logging.debug("getTextPage (textparams: %s"%textParams)

        try:
            # fetch the page
            return self.getServerData(pn,urllib.urlencode(textParams))
        except Exception as e:
            logging.error("getTextPage: Error reading page: %s"%e)
            return None
    
    def addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """Add 'about' attributes for the pundit annotation tool.

        Every div below pagediv that has an id gets an 'about' URL built
        from the text id (docinfo['DRI'] or "fn=<documentPath>"), the page
        number from pageinfo and the div id, and the class
        'pundit-content' appended to its class attribute. Returns pagediv.
        """
        fallbackid = "fn=%s"%docinfo.get('documentPath', '???')
        textid = docinfo.get('DRI', fallbackid)
        pn = pageinfo.get('pn', '1')
        #  TODO: use pn as well?
        for div in pagediv.findall(".//div"):
            divid = div.get('id')
            if not divid:
                # divs without id can't be addressed
                continue

            div.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,divid))
            classes = div.get('class','') + ' pundit-content'
            div.set('class', classes.strip())

        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """Load the list of search results and store it in docinfo.

        mode is sent as queryType; mode "none" does nothing. The request
        asks for the first 1000 hits. From the response the text of the
        div with class 'queryResultHits' is stored as
        docinfo['resultSize'] and the div with class 'queryResultPage' as
        serialized XML in docinfo['resultXML'].

        docinfo['cachedQuery'] remembers mode and query of the stored
        result; a call with the same mode and query returns immediately.
        Returns docinfo.
        """
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo

        querykey = '%s_%s'%(mode,query)
        if docinfo.get('cachedQuery', None) == querykey:
            # same query: cached search result
            return docinfo

        # different query: forget the old result. The entries are missing
        # when the previous response had no hits, so test before deleting.
        for key in ('cachedQuery', 'resultSize', 'resultXML'):
            if key in docinfo:
                del docinfo[key]

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        for div in dom.findall("div"):
            dc = div.get('class')
            if dc == 'queryResultPage':
                # page content div
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        # mark the query as cached only after fetching and parsing worked,
        # so that a failed request is retried on the next call
        docinfo['cachedQuery'] = querykey

        return docinfo
    

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return one page of the search results as XML snippet.

        The (cached) result list is loaded with getSearchResults and cut
        down to size entries beginning with entry number start (counting
        from 1). size defaults to pageinfo['resultPageSize'] or 10; start
        defaults to the first entry of result page pn (page 1 if pn is
        not given).

        Links in the result that carry a pn parameter are replaced by
        links created with getLink, keeping the highlight parameters.
        Returns "Error: no result!" when there is no result XML.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start counts from 1, so page pn begins at (pn-1)*size + 1;
            # the old default (pn-1)*size gave a negative index on page 1
            start = ((pn or 1) - 1) * size + 1

        results = ET.fromstring(resultxml)

        # paginate: drop everything before and after the requested window
        first = max(start - 1, 0)
        del results[:first]
        del results[size:]

        # check all a-tags
        for link in results.findall(".//a"):
            href = link.get('href')
            if not href:
                continue

            # assume all links go to pages
            linkParams = urlparse.parse_qs(urlparse.urlparse(href).query)
            if 'pn' not in linkParams:
                # not a page link: leave it alone
                continue

            # take some parameters
            params = {'pn': linkParams['pn'],
                      'highlightQuery': linkParams.get('highlightQuery',''),
                      'highlightElement': linkParams.get('highlightElement',''),
                      'highlightElementPos': linkParams.get('highlightElementPos','')
                      }
            link.set('href', self.getLink(params=params))

        return serialize(results)


    def getToc(self, mode='text', docinfo=None):
        """Return the full list of toc entries for mode from docinfo.

        mode 'text' means the table of contents ('toc'); any other mode is
        used as query type directly. The list is loaded with getTextInfo
        if docinfo doesn't contain it yet; an empty list is returned if it
        is still missing afterwards.
        """
        logging.debug("getToc mode=%s"%mode)
        queryType = mode
        if mode == 'text':
            queryType = 'toc'

        key = 'full_%s'%queryType
        if key not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get(key, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return one page of the table of contents as HTML snippet.

        The full list comes from getToc. size entries beginning with entry
        number start (counting from 1) are rendered; size defaults to
        pageinfo['tocPageSize'] or 30, start defaults to the first entry
        of toc page pn (page 1 if pn is not given). Each entry links to
        its page with a URL created by getLink.
        Returns "Error: no table of contents!" when the list is empty.
        """
        # local import: escape text from the server before putting it in HTML
        from xml.sax.saxutils import escape

        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if len(fulltoc) < 1:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            # start counts from 1, so page pn begins at (pn-1)*size + 1;
            # the old default (pn-1)*size gave a negative index on page 1
            start = ((pn or 1) - 1) * size + 1

        # paginate
        first = max(start - 1, 0)
        last = first + size

        parts = ['<div>']
        for toc in fulltoc[first:last]:
            pageurl = self.getLink('pn', toc['pn'])
            # entries may lack level-string or content (or have empty text)
            level = escape(toc.get('level-string') or '')
            content = escape(toc.get('content') or '')
            parts.append('<div class="tocline">')
            parts.append('<div class="toc name">[%s %s]</div>'%(level, content))
            parts.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']))
            parts.append('</div>\n')

        parts.append('</div>\n')

        return ''.join(parts)
           
    
    def manage_changeHocrTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None):
        """Store new settings for this text server.

        title, serverUrl and timeout are always overwritten;
        repositoryType only when a non-empty value is given. If RESPONSE
        is given the browser is redirected to 'manage_main'.
        """
        self.title = title
        self.timeout = timeout
        self.serverUrl = serverUrl
        # an empty repositoryType keeps the stored value
        if repositoryType:
            self.repositoryType = repositoryType

        if RESPONSE is None:
            return

        RESPONSE.redirect('manage_main')
        
# management methods
def manage_addHocrTextServerForm(self):
    """Render the form for adding a HocrTextServer."""
    template = PageTemplateFile("zpt/manage_addHocrTextServer", globals())
    # bind the template to the calling object before rendering it
    return template.__of__(self)()

def manage_addHocrTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """Create a HocrTextServer and add it to the destination container.

    If RESPONSE is given the browser is redirected to 'manage_main'.
    """
    # an earlier variant of this function used the default serverUrl
    # http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/
    server = HocrTextServer(id, title, serverUrl, timeout)
    self.Destination()._setObject(id, server)
    if RESPONSE is None:
        return

    RESPONSE.redirect('manage_main')