Mercurial > hg > documentViewer
view HocrTextServer.py @ 617:7aefbddddaf9
alpha of hocr server support
author | dwinter |
---|---|
date | Wed, 23 Jul 2014 17:36:04 +0200 |
parents | |
children |
line wrap: on
line source
"""Zope text-server object that reads document info and page text from a hOCR server."""
from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from HocrTxtUtils import getInt, getText, getHttpData


def serialize(node):
    """returns a string containing an XML snippet of node

    The XML declaration that ET.tostring() puts in front is cut off.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration (skips '?>' plus the one character following it)
    if s.startswith('<?xml'):
        i = s.find('?>')
        return s[i+3:]

    return s


class HocrTextServer(SimpleItem):
    """TextServer implementation that reads text from a hOCR server.

    Configuration attributes set by the constructor and the change form:
    serverUrl, timeout, repositoryType, title.
    """
    meta_type="Hocr TextServer"

    manage_options=(
        {'label':'Config','action':'manage_changeHocrTextServerForm'},
        )+SimpleItem.manage_options

    manage_changeHocrTextServerForm = PageTemplateFile("zpt/manage_changeHocrTextServer", globals())

    def __init__(self,id,title="",serverUrl="http://localhost:8080/hocr", timeout=40, repositoryType='production'):
        """constructor"""
        self.id=id
        self.title=title
        self.timeout = timeout
        self.repositoryType = repositoryType
        self.serverUrl = serverUrl

    def getHttpData(self, url, data=None):
        """returns result from url+data HTTP request"""
        return getHttpData(url,data,timeout=self.timeout)

    def getServerData(self, pn, data=None):
        """returns result from text server for pn+data

        pn is handed to HocrTxtUtils.getHttpData as second positional argument.
        NOTE(review): callers in this class pass either a page number or a
        script name such as "doc-info.xql" here -- confirm against the
        signature of HocrTxtUtils.getHttpData.
        """
        url = self.serverUrl
        return getHttpData(url,pn,data=data,timeout=self.timeout)

    def getRepositoryType(self):
        """returns the repository type, e.g. 'production'"""
        # getattr with default: the attribute may not be set on the instance
        return getattr(self, 'repositoryType', None)

    def getTextDownloadUrl(self, type='xml', docinfo=None):
        """returns a URL to download the current text

        Returns None if docinfo has no 'textURLPath'.
        """
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        # request the same document with the extension of the wanted type
        docpath = docpath.replace('.xml','.'+type)
        url = '%sgetDoc?doc=%s'%(self.serverUrl.replace('interface/',''), docpath)
        return url

    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Returns list of GIS places of page pn

        Each place is a dict with keys 'id' and 'name'.
        Returns None if docinfo has no 'textURLPath'.
        """
        docpath = docinfo.get('textURLPath',None)
        if not docpath:
            return None

        text = self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn))
        dom = ET.fromstring(text)
        places = []
        for el in dom.findall(".//resultPage/place"):
            places.append({'id': el.get("id"), 'name': el.text})

        return places

    def getTextInfo(self, mode='', docinfo=None):
        """reads document info, including page concordance, from text server

        mode is one of 'toc', 'figures' or '' (anything else is treated as '').
        The results are stored in docinfo (numTextPages, numFigureEntries,
        numTocEntries, numPlaces, pageNumbers, full_<mode>) and docinfo is
        returned. On errors docinfo is returned unchanged.
        """
        logging.debug("getTextInfo mode=%s"%mode)
        if mode not in ['toc', 'figures', '']:
            mode = ''

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s'%mode in docinfo:
                return docinfo

        else:
            # no toc-request
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        try:
            # we need to set a result set size
            pagesize = 10000
            pn = 1
            # fetch docinfo
            pagexml = self.getServerData("doc-info.xql","document=%s&info=%s&pageSize=%s&pn=%s"%(docpath,mode,pagesize,pn))
            dom = ET.fromstring(pagexml)
            # all info in tag <document>
            doc = dom.find("document")
        except Exception as e:
            logging.error("getTextInfo: Error reading doc info: %s"%e)
            return docinfo

        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
            return docinfo

        # go through all child elements
        for tag in doc:
            name = tag.tag
            # numTextPages
            if name == 'countPages':
                np = getInt(tag.text)
                if np > 0:
                    docinfo['numTextPages'] = np

            # numFigureEntries
            elif name == 'countFigureEntries':
                docinfo['numFigureEntries'] = getInt(tag.text)

            # numTocEntries
            elif name == 'countTocEntries':
                docinfo['numTocEntries'] = getInt(tag.text)

            # numPlaces
            elif name == 'countPlaces':
                docinfo['numPlaces'] = getInt(tag.text)

            # pageNumbers
            elif name == 'pageNumbers':
                # contains tags with page numbers
                # <pn><n>4</n><no>4</no><non/></pn>
                # n=scan number, no=original page no, non=normalized original page no
                # pageNumbers is a dict indexed by scan number
                pages = {}
                for pnTag in tag:
                    page = {}
                    n = 0
                    for p in pnTag:
                        if p.tag == 'n':
                            n = getInt(p.text)
                            page['pn'] = n
                        elif p.tag == 'no':
                            page['no'] = p.text
                        elif p.tag == 'non':
                            page['non'] = p.text

                    # entries without a positive scan number are dropped
                    if n > 0:
                        pages[n] = page

                docinfo['pageNumbers'] = pages

            # toc
            elif name == 'toc':
                # contains tags with table of contents/figures
                # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
                tocs = []
                for te in tag:
                    toc = {}
                    for t in te:
                        if t.tag == 'page':
                            toc['pn'] = getInt(t.text)
                        elif t.tag == 'level':
                            toc['level'] = t.text
                        elif t.tag == 'content':
                            toc['content'] = t.text
                        elif t.tag == 'level-string':
                            toc['level-string'] = t.text
                        elif t.tag == 'real-level':
                            toc['real-level'] = t.text

                    tocs.append(toc)

                # save as full_toc/full_figures
                docinfo['full_%s'%mode] = tocs

        return docinfo

    def processPageInfo(self, dom, docinfo, pageinfo):
        """processes page info divs from dom and stores in docinfo and pageinfo

        Copies the text of the pageNumberOrig, pageNumberOrigNorm and
        pageHeaderTitle divs into pageinfo. Returns None.
        """
        # assume first second level div is pageMeta
        alldivs = dom.find("div")

        if alldivs is None or alldivs.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return

        for div in alldivs:
            dc = div.get('class')

            # pageNumberOrig
            if dc == 'pageNumberOrig':
                pageinfo['pageNumberOrig'] = div.text

            # pageNumberOrigNorm
            elif dc == 'pageNumberOrigNorm':
                pageinfo['pageNumberOrigNorm'] = div.text

            # pageHeaderTitle
            elif dc == 'pageHeaderTitle':
                pageinfo['pageHeaderTitle'] = div.text

        return

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """returns single page from fulltext

        mode may be a comma separated list of layers ('search', 'pundit' plus
        one of 'dict', 'xml', 'gis', 'text'). Returns the data fetched from
        the server, the cached pageinfo['textPage'] if present, or None if
        there is no textURLPath or the request fails.
        """
        logging.debug("getTextPage Hocr mode=%s, pn=%s"%(mode,pn))
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        # check before calling replace(): docpath may be None
        if not docpath:
            return None

        docpath = docpath.replace("pages","hocr")
        logging.debug("getTextPage docpath= %s"%docpath)

        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # default is text
            mode = 'text'

        logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn))
        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s"%mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # pundit mode
        if 'pundit' in modes:
            # ignore mode in the following
            modes.remove('pundit')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif 'xml' in modes:
            # xml mode
            textmode = 'xml'
            textParams['characterNormalization'] = 'orig'
        elif 'gis' in modes:
            textmode = 'gis'
        else:
            # text is default mode
            textmode = 'text'

        textParams['mode'] = textmode

        logging.debug("getTextPage (textparams: %s"%textParams)

        try:
            # fetch the page
            return self.getServerData(pn,urllib.urlencode(textParams))
        except Exception as e:
            logging.error("getTextPage: Error reading page: %s"%e)
            return None

    def addPunditAttributes(self, pagediv, pageinfo, docinfo):
        """add about attributes for pundit annotation tool

        Every div below pagediv that has an id gets an 'about' attribute and
        the additional class 'pundit-content'. Returns pagediv.
        """
        textid = docinfo.get('DRI', "fn=%s"%docinfo.get('documentPath', '???'))
        pn = pageinfo.get('pn', '1')
        # check all div-tags
        for d in pagediv.findall(".//div"):
            divId = d.get('id')
            if divId:
                d.set('about', "http://echo.mpiwg-berlin.mpg.de/%s/pn=%s/#%s"%(textid,pn,divId))
                cls = d.get('class','')
                cls += ' pundit-content'
                d.set('class', cls.strip())

        return pagediv

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """loads list of search results and stores XML in docinfo

        Stores 'cachedQuery', 'resultSize' and 'resultXML' in docinfo and
        returns docinfo. Does nothing if mode is "none" or if the same query
        is already cached.
        """
        logging.debug("getSearchResults mode=%s query=%s"%(mode, query))
        if mode == "none":
            return docinfo

        queryKey = '%s_%s'%(mode,query)
        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == queryKey:
                # same query
                return docinfo

            # different query: drop old results.
            # pop() because the old query may not have stored both keys
            docinfo.pop('resultSize', None)
            docinfo.pop('resultXML', None)

        # cache query
        docinfo['cachedQuery'] = queryKey

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        for div in dom.findall("div"):
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        return docinfo

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the list of search results

        start is the 1-based index of the first hit to show, size the number
        of hits per page. If start is None it is computed from the 1-based
        results page number pn. Returns the serialized XML of the selected
        hits or an error string.
        """
        logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn))
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            # start is 1-based, so page 1 starts at hit 1
            if pn is None:
                start = 1
            else:
                start = (pn - 1) * size + 1

        fullresult = ET.fromstring(resultxml)
        if fullresult is None:
            return "ERROR: no results!"

        # paginate: a negative index would cut from the end instead
        first = max(start - 1, 0)
        del fullresult[:first]
        del fullresult[size:]
        tocdivs = fullresult

        # check all a-tags
        for link in tocdivs.findall(".//a"):
            href = link.get('href')
            if href:
                # assume all links go to pages
                linkUrl = urlparse.urlparse(href)
                linkParams = urlparse.parse_qs(linkUrl.query)
                # take some parameters
                params = {'pn': linkParams['pn'],
                          'highlightQuery': linkParams.get('highlightQuery',''),
                          'highlightElement': linkParams.get('highlightElement',''),
                          'highlightElementPos': linkParams.get('highlightElementPos','')
                          }
                url = self.getLink(params=params)
                link.set('href', url)

        return serialize(tocdivs)

    def getToc(self, mode='text', docinfo=None):
        """returns list of table of contents from docinfo

        mode 'text' means 'toc', any other mode is used as it is. The list is
        loaded with getTextInfo() if docinfo does not contain it yet.
        """
        logging.debug("getToc mode=%s"%mode)
        if mode == 'text':
            queryType = 'toc'
        else:
            queryType = mode

        if 'full_%s'%queryType not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get('full_%s'%queryType, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """returns single page from the table of contents

        start is the 1-based index of the first entry to show, size the
        number of entries per page. If start is None it is computed from the
        1-based toc page number pn. Returns a HTML snippet or an error string.
        NOTE(review): the toc text is put into the HTML unescaped -- confirm
        that the server content can be trusted.
        """
        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s"%(mode,repr(pn),repr(start),repr(size)))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if not fulltoc:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            # start is 1-based, so page 1 starts at entry 1
            if pn is None:
                start = 1
            else:
                start = (pn - 1) * size + 1

        # paginate: a negative index would slice from the end instead
        first = max(start - 1, 0)
        last = first + size
        tocs = fulltoc[first:last]

        parts = ['<div>']
        for toc in tocs:
            pageurl = self.getLink('pn', toc['pn'])
            parts.append('<div class="tocline">')
            parts.append('<div class="toc name">[%s %s]</div>'%(toc['level-string'], toc['content']))
            parts.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>'%(pageurl, toc['pn']))
            parts.append('</div>\n')

        parts.append('</div>\n')
        return ''.join(parts)

    def manage_changeHocrTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,repositoryType=None,RESPONSE=None):
        """change settings"""
        self.title=title
        self.timeout = timeout
        self.serverUrl = serverUrl
        # keep the old repositoryType if none is given
        if repositoryType:
            self.repositoryType = repositoryType

        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')


# management methods
def manage_addHocrTextServerForm(self):
    """Form for adding"""
    pt = PageTemplateFile("zpt/manage_addHocrTextServer", globals()).__of__(self)
    return pt()

def manage_addHocrTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None):
    """add HocrTextServer"""
    newObj = HocrTextServer(id,title,serverUrl,timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')