Mercurial > hg > documentViewer
view MpdlXmlTextServer.py @ 512:92a6443a6f16 elementtree
fix pageFlow.
| field    | value                            |
|----------|----------------------------------|
| author   | casties                          |
| date     | Tue, 28 Feb 2012 18:53:06 +0100  |
| parents  | 551ca1641a5e                     |
| children | 67095296c95a                     |
line wrap: on
line source
from OFS.SimpleItem import SimpleItem from Products.PageTemplates.PageTemplateFile import PageTemplateFile import xml.etree.ElementTree as ET import re import logging import urllib import urlparse import base64 from SrvTxtUtils import getInt, getText, getHttpData def serialize(node): """returns a string containing an XML snippet of node""" s = ET.tostring(node, 'UTF-8') # snip off XML declaration if s.startswith('<?xml'): i = s.find('?>') return s[i+3:] return s class MpdlXmlTextServer(SimpleItem): """TextServer implementation for MPDL-XML eXist server""" meta_type="MPDL-XML TextServer" manage_options=( {'label':'Config','action':'manage_changeMpdlXmlTextServerForm'}, )+SimpleItem.manage_options manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): """constructor""" self.id=id self.title=title self.timeout = timeout if serverName is None: self.serverUrl = serverUrl else: self.serverUrl = "http://%s/mpdl/interface/"%serverName def getHttpData(self, url, data=None): """returns result from url+data HTTP request""" return getHttpData(url,data,timeout=self.timeout) def getServerData(self, method, data=None): """returns result from text server for method+data""" url = self.serverUrl+method return getHttpData(url,data,timeout=self.timeout) def getPlacesOnPage(self, docinfo=None, pn=None): """Returns list of GIS places of page pn""" docpath = docinfo.get('textURLPath',None) if not docpath: return None places=[] text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn)) dom = ET.fromstring(text) result = dom.findall(".//resultPage/place") for l in result: id = l.get("id") name = l.text place = {'id': id, 'name': name} places.append(place) return places def processPageInfo(self, dom, docinfo, pageinfo): """processes page info divs from dom and stores in docinfo and pageinfo""" 
# assume first second level div is pageMeta alldivs = dom.find("div") if alldivs is None or alldivs.get('class', '') != 'pageMeta': logging.error("processPageInfo: pageMeta div not found!") return for div in alldivs: dc = div.get('class') # pageNumberOrig if dc == 'pageNumberOrig': pageinfo['pageNumberOrig'] = div.text # pageNumberOrigNorm elif dc == 'pageNumberOrigNorm': pageinfo['pageNumberOrigNorm'] = div.text # pageHeaderTitle elif dc == 'pageHeaderTitle': pageinfo['pageHeaderTitle'] = div.text # numFigureEntries elif dc == 'countFigureEntries': docinfo['numFigureEntries'] = getInt(div.text) # numTocEntries elif dc == 'countTocEntries': # WTF: s1 = int(s)/30+1 docinfo['numTocEntries'] = getInt(div.text) # numPlaces elif dc == 'countPlaces': docinfo['numPlaces'] = getInt(div.text) # numTextPages elif dc == 'countPages': np = getInt(div.text) if np > 0: docinfo['numTextPages'] = np if docinfo.get('numPages', 0) == 0: # seems to be text-only - update page count docinfo['numPages'] = np #pageinfo['end'] = min(pageinfo['end'], np) pageinfo['numgroups'] = int(np / pageinfo['groupsize']) if np % pageinfo['groupsize'] > 0: pageinfo['numgroups'] += 1 #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo)) return def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): """returns single page from fulltext""" logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) # check for cached text -- but ideally this shouldn't be called twice if pageinfo.has_key('textPage'): logging.debug("getTextPage: using cached text") return pageinfo['textPage'] docpath = docinfo['textURLPath'] # just checking if pageinfo['current'] != pn: logging.warning("getTextPage: current!=pn!") # stuff for constructing full urls selfurl = docinfo['viewerUrl'] textParams = {'document': docpath, 'pn': pn} if 'characterNormalization' in pageinfo: textParams['characterNormalization'] = pageinfo['characterNormalization'] if not mode: # default is dict mode = 'text' modes = mode.split(',') # 
check for multiple layers if len(modes) > 1: logging.debug("getTextPage: more than one mode=%s"%mode) # search mode if 'search' in modes: # add highlighting highlightQuery = pageinfo.get('highlightQuery', None) if highlightQuery: textParams['highlightQuery'] = highlightQuery textParams['highlightElement'] = pageinfo.get('highlightElement', '') textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '') # ignore mode in the following modes.remove('search') # other modes don't combine if 'dict' in modes: # dict is called textPollux in the backend textmode = 'textPollux' elif len(modes) == 0: # text is default mode textmode = 'text' else: # just take first mode textmode = modes[0] textParams['mode'] = textmode # fetch the page pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams)) dom = ET.fromstring(pagexml) # extract additional info self.processPageInfo(dom, docinfo, pageinfo) # page content is in <div class="pageContent"> pagediv = None # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] # so we look at the second level divs alldivs = dom.findall("div") for div in alldivs: dc = div.get('class') # page content div if dc == 'pageContent': pagediv = div break # plain text mode if textmode == "text": # get full url assuming documentViewer is parent selfurl = self.getLink() if pagediv is not None: links = pagediv.findall(".//a") for l in links: href = l.get('href') if href and href.startswith('#note-'): href = href.replace('#note-',"%s#note-"%selfurl) l.set('href', href) return serialize(pagediv) # text-with-links mode elif textmode == "textPollux": if pagediv is not None: viewerurl = docinfo['viewerUrl'] selfurl = self.getLink() # check all a-tags links = pagediv.findall(".//a") for l in links: href = l.get('href') if href: # is link with href linkurl = urlparse.urlparse(href) #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) if linkurl.path.endswith('GetDictionaryEntries'): #TODO: replace wordInfo page # is 
dictionary link - change href (keeping parameters) #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) # add target to open new page l.set('target', '_blank') # TODO: is this needed? # if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): # selfurl = self.absolute_url() # l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) # l.set('target', '_blank') # l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") # l.set('ondblclick', 'popupWin.focus();') if href.startswith('#note-'): # note link l.set('href', href.replace('#note-',"%s#note-"%selfurl)) return serialize(pagediv) # xml mode elif textmode == "xml": if pagediv is not None: return serialize(pagediv) # pureXml mode elif textmode == "pureXml": if pagediv is not None: return serialize(pagediv) # gis mode elif textmode == "gis": if pagediv is not None: # check all a-tags links = pagediv.findall(".//a") # add our URL as backlink selfurl = self.getLink() doc = base64.b64encode(selfurl) for l in links: href = l.get('href') if href: if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) l.set('target', '_blank') return serialize(pagediv) return None def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): """loads list of search results and stores XML in docinfo""" logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) if mode == "none": return docinfo cachedQuery = docinfo.get('cachedQuery', None) if cachedQuery is not None: # cached search result if cachedQuery == '%s_%s'%(mode,query): # same query return docinfo else: # different query del docinfo['resultSize'] del docinfo['resultXML'] # cache query docinfo['cachedQuery'] = 
'%s_%s'%(mode,query) # fetch full results docpath = docinfo['textURLPath'] params = {'document': docpath, 'mode': 'text', 'queryType': mode, 'query': query, 'queryResultPageSize': 1000, 'queryResultPN': 1, 'characterNormalization': pageinfo.get('characterNormalization', 'reg')} pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) dom = ET.fromstring(pagexml) # page content is in <div class="queryResultPage"> pagediv = None # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] alldivs = dom.findall("div") for div in alldivs: dc = div.get('class') # page content div if dc == 'queryResultPage': pagediv = div elif dc == 'queryResultHits': docinfo['resultSize'] = getInt(div.text) if pagediv is not None: # store XML in docinfo docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8') return docinfo def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): """returns single page from the table of contents""" logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) # check for cached result if not 'resultXML' in docinfo: self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) resultxml = docinfo.get('resultXML', None) if not resultxml: logging.error("getResultPage: unable to find resultXML") return "Error: no result!" 
if size is None: size = pageinfo.get('resultPageSize', 10) if start is None: start = (pn - 1) * size fullresult = ET.fromstring(resultxml) if fullresult is not None: # paginate first = start-1 len = size del fullresult[:first] del fullresult[len:] tocdivs = fullresult # check all a-tags links = tocdivs.findall(".//a") for l in links: href = l.get('href') if href: # assume all links go to pages linkUrl = urlparse.urlparse(href) linkParams = urlparse.parse_qs(linkUrl.query) # take some parameters params = {'pn': linkParams['pn'], 'highlightQuery': linkParams.get('highlightQuery',''), 'highlightElement': linkParams.get('highlightElement',''), 'highlightElementPos': linkParams.get('highlightElementPos','') } url = self.getLink(params=params) l.set('href', url) return serialize(tocdivs) return "ERROR: no results!" def getToc(self, mode="text", docinfo=None): """loads table of contents and stores XML in docinfo""" logging.debug("getToc mode=%s"%mode) if mode == "none": return docinfo if 'tocSize_%s'%mode in docinfo: # cached toc return docinfo docpath = docinfo['textURLPath'] # we need to set a result set size pagesize = 1000 pn = 1 if mode == "text": queryType = "toc" else: queryType = mode # number of entries in toc tocSize = 0 tocDiv = None # fetch full toc pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) dom = ET.fromstring(pagexml) # page content is in <div class="queryResultPage"> pagediv = None # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] alldivs = dom.findall("div") for div in alldivs: dc = div.get('class') # page content div if dc == 'queryResultPage': pagediv = div elif dc == 'queryResultHits': docinfo['tocSize_%s'%mode] = getInt(div.text) if pagediv is not None: # store XML in docinfo docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') return docinfo def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, 
docinfo=None): """returns single page from the table of contents""" logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn)) if mode == "text": queryType = "toc" else: queryType = mode # check for cached TOC if not docinfo.has_key('tocXML_%s'%mode): self.getToc(mode=mode, docinfo=docinfo) tocxml = docinfo.get('tocXML_%s'%mode, None) if not tocxml: logging.error("getTocPage: unable to find tocXML") return "Error: no table of contents!" if size is None: size = pageinfo.get('tocPageSize', 30) if start is None: start = (pn - 1) * size fulltoc = ET.fromstring(tocxml) if fulltoc is not None: # paginate first = (start - 1) * 2 len = size * 2 del fulltoc[:first] del fulltoc[len:] tocdivs = fulltoc # check all a-tags links = tocdivs.findall(".//a") for l in links: href = l.get('href') if href: # take pn from href m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) if m is not None: # and create new url (assuming parent is documentViewer) url = self.getLink('pn', m.group(1)) l.set('href', url) else: logging.warning("getTocPage: Problem with link=%s"%href) # fix two-divs-per-row with containing div newtoc = ET.Element('div', {'class':'queryResultPage'}) for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): e = ET.Element('div',{'class':'tocline'}) e.append(d1) e.append(d2) newtoc.append(e) return serialize(newtoc) return "ERROR: no table of contents!" 
def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): """change settings""" self.title=title self.timeout = timeout self.serverUrl = serverUrl if RESPONSE is not None: RESPONSE.redirect('manage_main') # management methods def manage_addMpdlXmlTextServerForm(self): """Form for adding""" pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self) return pt() def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): #def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): """add zogiimage""" newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) self.Destination()._setObject(id, newObj) if RESPONSE is not None: RESPONSE.redirect('manage_main')