Mercurial > hg > documentViewer
view MpdlXmlTextServer.py @ 538:dbf25bd05fc6
digilib buttons get icons. pid on index page.
author | casties |
---|---|
date | Mon, 30 Jul 2012 19:41:48 +0200 |
parents | 5c7433c2515c |
children | 6c529ec1b295 |
line wrap: on
line source
from OFS.SimpleItem import SimpleItem
from Products.PageTemplates.PageTemplateFile import PageTemplateFile

import xml.etree.ElementTree as ET

import re
import logging
import urllib
import urlparse
import base64

from SrvTxtUtils import getInt, getText, getHttpData


def serialize(node):
    """Return a string containing an XML snippet of node.

    The element is serialized as UTF-8 and a leading XML declaration,
    if present, is cut off so the snippet can be embedded in a page.
    """
    s = ET.tostring(node, 'UTF-8')
    # snip off XML declaration
    if s.startswith('<?xml'):
        i = s.find('?>')
        # skip '?>' plus one more character (presumably the newline
        # that follows the declaration)
        return s[i+3:]

    return s


class MpdlXmlTextServer(SimpleItem):
    """TextServer implementation for MPDL-XML eXist server"""
    meta_type = "MPDL-XML TextServer"

    manage_options = (
        {'label': 'Config', 'action': 'manage_changeMpdlXmlTextServerForm'},
    ) + SimpleItem.manage_options

    manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals())

    def __init__(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40):
        """Constructor.

        id, title: stored on the object.
        serverUrl: base URL of the text server interface; used only when
            serverName is None.
        serverName: host name; when given, the server URL is built as
            http://<serverName>/mpdl/interface/ and serverUrl is ignored.
        timeout: timeout passed on to getHttpData for every request.
        """
        self.id = id
        self.title = title
        self.timeout = timeout
        if serverName is None:
            self.serverUrl = serverUrl
        else:
            self.serverUrl = "http://%s/mpdl/interface/" % serverName

    def getHttpData(self, url, data=None):
        """Return result from url+data HTTP request (using the configured timeout)."""
        return getHttpData(url, data, timeout=self.timeout)

    def getServerData(self, method, data=None):
        """Return result from text server for method+data.

        method is appended to the configured serverUrl.
        """
        url = self.serverUrl + method
        return getHttpData(url, data, timeout=self.timeout)

    def getPlacesOnPage(self, docinfo=None, pn=None):
        """Return list of GIS places of page pn.

        Each place is a dict with the keys 'id' and 'name'.
        Returns None if docinfo has no 'textURLPath'.
        """
        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        places = []
        text = self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s" % (docpath, pn))
        dom = ET.fromstring(text)
        result = dom.findall(".//resultPage/place")
        for l in result:
            # local name chosen so the builtin id() is not shadowed
            place_id = l.get("id")
            name = l.text
            place = {'id': place_id, 'name': name}
            places.append(place)

        return places

    def getTextInfo(self, mode='', docinfo=None):
        """Read document info, including page concordance, from text server.

        mode: 'toc', 'figures' or '' (anything else is treated as '').
        docinfo: dict that is updated in place and returned.

        Results are cached in docinfo: a request with a mode is skipped if
        'full_<mode>' is already present, a request without mode is
        skipped if 'numTextPages' is already present.
        """
        logging.debug("getTextInfo mode=%s", mode)
        if mode not in ['toc', 'figures', '']:
            mode = ''

        # check cached info
        if mode:
            # cached toc-request?
            if 'full_%s' % mode in docinfo:
                return docinfo
        else:
            # no toc-request
            if 'numTextPages' in docinfo:
                return docinfo

        docpath = docinfo.get('textURLPath', None)
        if docpath is None:
            logging.error("getTextInfo: no textURLPath!")
            return docinfo

        # we need to set a result set size
        pagesize = 10000
        pn = 1
        # fetch docinfo
        pagexml = self.getServerData("doc-info.xql", "document=%s&info=%s&pageSize=%s&pn=%s" % (docpath, mode, pagesize, pn))
        dom = ET.fromstring(pagexml)
        # all info in tag <document>
        doc = dom.find("document")
        if doc is None:
            logging.error("getTextInfo: unable to find document-tag!")
        else:
            # go through all child elements
            for tag in doc:
                name = tag.tag
                # numTextPages
                if name == 'countPages':
                    np = getInt(tag.text)
                    if np > 0:
                        docinfo['numTextPages'] = np

                # numFigureEntries
                elif name == 'countFigureEntries':
                    docinfo['numFigureEntries'] = getInt(tag.text)

                # numTocEntries
                elif name == 'countTocEntries':
                    # WTF: s1 = int(s)/30+1
                    docinfo['numTocEntries'] = getInt(tag.text)

                # numPlaces
                elif name == 'countPlaces':
                    docinfo['numPlaces'] = getInt(tag.text)

                # pageNumbers
                elif name == 'pageNumbers':
                    # contains tags with page numbers
                    # <pn><n>4</n><no>4</no><non/></pn>
                    # n=scan number, no=original page no, non=normalized original page no
                    # pageNumbers is a dict indexed by scan number
                    pages = {}
                    for pn in tag:
                        page = {}
                        n = 0
                        for p in pn:
                            if p.tag == 'n':
                                n = getInt(p.text)
                                page['pn'] = n
                            elif p.tag == 'no':
                                page['no'] = p.text
                            elif p.tag == 'non':
                                page['non'] = p.text

                        # entries without a positive scan number are dropped
                        if n > 0:
                            pages[n] = page

                    docinfo['pageNumbers'] = pages
                    #logging.debug("got pageNumbers=%s"%repr(pages))

                # toc
                elif name == 'toc':
                    # contains tags with table of contents/figures
                    # <toc-entry><page>13</page><level>3</level><content>Chapter I</content><level-string>1.</level-string><real-level>1</real-level></toc-entry>
                    tocs = []
                    for te in tag:
                        toc = {}
                        for t in te:
                            if t.tag == 'page':
                                toc['pn'] = getInt(t.text)
                            elif t.tag == 'level':
                                toc['level'] = t.text
                            elif t.tag == 'content':
                                toc['content'] = t.text
                            elif t.tag == 'level-string':
                                toc['level-string'] = t.text
                            elif t.tag == 'real-level':
                                toc['real-level'] = t.text

                        tocs.append(toc)

                    # save as full_toc/full_figures
                    docinfo['full_%s' % mode] = tocs

        return docinfo

    def processPageInfo(self, dom, docinfo, pageinfo):
        """Process page info divs from dom and store them in pageinfo.

        Copies the text of the 'pageNumberOrig', 'pageNumberOrigNorm' and
        'pageHeaderTitle' divs into pageinfo under the same keys.
        Logs an error and does nothing if the first div child of dom is
        not the pageMeta div. docinfo is currently not modified.
        """
        # assume first second level div is pageMeta
        alldivs = dom.find("div")

        if alldivs is None or alldivs.get('class', '') != 'pageMeta':
            logging.error("processPageInfo: pageMeta div not found!")
            return

        for div in alldivs:
            dc = div.get('class')

            # pageNumberOrig
            if dc == 'pageNumberOrig':
                pageinfo['pageNumberOrig'] = div.text

            # pageNumberOrigNorm
            elif dc == 'pageNumberOrigNorm':
                pageinfo['pageNumberOrigNorm'] = div.text

            # pageHeaderTitle
            elif dc == 'pageHeaderTitle':
                pageinfo['pageHeaderTitle'] = div.text

        #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo))
        return

    def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None):
        """Return single page from fulltext.

        mode: comma separated list of modes ('text', 'dict', 'xml',
            'search'); 'search' only adds highlighting parameters, of the
            others only one is used (dict before xml before text).
        pn: page number to fetch.
        docinfo, pageinfo: document and page info dicts; pageinfo is
            updated with the page meta data by processPageInfo.

        Returns the serialized page content div, or None if there is no
        textURLPath or no page content div in the answer.
        """
        logging.debug("getTextPage mode=%s, pn=%s", mode, pn)
        # check for cached text -- but ideally this shouldn't be called twice
        if 'textPage' in pageinfo:
            logging.debug("getTextPage: using cached text")
            return pageinfo['textPage']

        docpath = docinfo.get('textURLPath', None)
        if not docpath:
            return None

        # just checking (a missing 'current' must not abort the request)
        if pageinfo.get('current') != pn:
            logging.warning("getTextPage: current!=pn!")

        # stuff for constructing full urls
        textParams = {'document': docpath,
                      'pn': pn}
        if 'characterNormalization' in pageinfo:
            textParams['characterNormalization'] = pageinfo['characterNormalization']

        if not mode:
            # default is dict
            mode = 'text'

        modes = mode.split(',')
        # check for multiple layers
        if len(modes) > 1:
            logging.debug("getTextPage: more than one mode=%s", mode)

        # search mode
        if 'search' in modes:
            # add highlighting
            highlightQuery = pageinfo.get('highlightQuery', None)
            if highlightQuery:
                textParams['highlightQuery'] = highlightQuery
                textParams['highlightElement'] = pageinfo.get('highlightElement', '')
                textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '')

            # ignore mode in the following
            modes.remove('search')

        # other modes don't combine
        if 'dict' in modes:
            # dict is called textPollux in the backend
            textmode = 'textPollux'
        elif 'xml' in modes:
            # xml mode
            textmode = 'xml'
            textParams['characterNormalization'] = 'orig'
        else:
            # text is default mode
            textmode = 'text'

        textParams['mode'] = textmode

        # fetch the page
        pagexml = self.getServerData("page-fragment.xql", urllib.urlencode(textParams))
        dom = ET.fromstring(pagexml)
        # extract additional info
        self.processPageInfo(dom, docinfo, pageinfo)
        # page content is in <div class="pageContent">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent']
        # so we look at the second level divs
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'pageContent':
                pagediv = div
                break

        # plain text mode
        if textmode == "text":
            # get full url assuming documentViewer is parent
            selfurl = self.getLink()
            if pagediv is not None:
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')
                    if href and href.startswith('#note-'):
                        href = href.replace('#note-', "%s#note-" % selfurl)
                        l.set('href', href)

                return serialize(pagediv)

        # text-with-links mode
        elif textmode == "textPollux":
            if pagediv is not None:
                # viewerurl is only referenced by the disabled TODO code below
                viewerurl = docinfo['viewerUrl']
                selfurl = self.getLink()
                # check all a-tags
                links = pagediv.findall(".//a")
                for l in links:
                    href = l.get('href')

                    if href:
                        # is link with href
                        linkurl = urlparse.urlparse(href)
                        #logging.debug("getTextPage: linkurl=%s"%repr(linkurl))
                        if linkurl.path.endswith('GetDictionaryEntries'):
                            #TODO: replace wordInfo page
                            # is dictionary link - change href (keeping parameters)
                            #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl))
                            # add target to open new page
                            l.set('target', '_blank')

                        # TODO: is this needed?
#                        if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
#                            selfurl = self.absolute_url()
#                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
#                            l.set('target', '_blank')
#                            l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
#                            l.set('ondblclick', 'popupWin.focus();')

                        if href.startswith('#note-'):
                            # note link
                            l.set('href', href.replace('#note-', "%s#note-" % selfurl))

                return serialize(pagediv)

        # xml mode
        elif textmode == "xml":
            if pagediv is not None:
                return serialize(pagediv)

        # NOTE(review): textmode is only ever set to 'textPollux', 'xml' or
        # 'text' above, so the two branches below are currently unreachable.

        # pureXml mode
        elif textmode == "pureXml":
            if pagediv is not None:
                return serialize(pagediv)

        # gis mode
        elif textmode == "gis":
            if pagediv is not None:
                # check all a-tags
                links = pagediv.findall(".//a")
                # add our URL as backlink
                selfurl = self.getLink()
                doc = base64.b64encode(selfurl)
                for l in links:
                    href = l.get('href')
                    if href:
                        if href.startswith('http://mappit.mpiwg-berlin.mpg.de'):
                            l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s' % doc, href))
                            l.set('target', '_blank')

                return serialize(pagediv)

        return None

    def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None):
        """Load list of search results and store the XML in docinfo.

        The result is cached in docinfo under 'cachedQuery' (mode and
        query), 'resultSize' and 'resultXML'; a repeated call with the
        same mode and query returns immediately. Returns docinfo.
        """
        logging.debug("getSearchResults mode=%s query=%s", mode, query)
        if mode == "none":
            return docinfo

        cachedQuery = docinfo.get('cachedQuery', None)
        if cachedQuery is not None:
            # cached search result
            if cachedQuery == '%s_%s' % (mode, query):
                # same query
                return docinfo

            else:
                # different query
                # the previous query may not have stored any results,
                # so the keys are not guaranteed to exist
                docinfo.pop('resultSize', None)
                docinfo.pop('resultXML', None)

        # cache query
        docinfo['cachedQuery'] = '%s_%s' % (mode, query)

        # fetch full results
        docpath = docinfo['textURLPath']
        params = {'document': docpath,
                  'mode': 'text',
                  'queryType': mode,
                  'query': query,
                  'queryResultPageSize': 1000,
                  'queryResultPN': 1,
                  'characterNormalization': pageinfo.get('characterNormalization', 'reg')}
        pagexml = self.getServerData("doc-query.xql", urllib.urlencode(params))
        #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery)))
        dom = ET.fromstring(pagexml)
        # page content is in <div class="queryResultPage">
        pagediv = None
        # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage']
        alldivs = dom.findall("div")
        for div in alldivs:
            dc = div.get('class')
            # page content div
            if dc == 'queryResultPage':
                pagediv = div

            elif dc == 'queryResultHits':
                docinfo['resultSize'] = getInt(div.text)

        if pagediv is not None:
            # store XML in docinfo
            docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8')

        return docinfo

    def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return single page from the list of search results.

        start: 1-based index of the first result to show; if None it is
            derived from the page number pn (pn=None is treated as 1).
        size: number of results per page; if None it is taken from
            pageinfo['resultPageSize'] (default 10).

        Returns the serialized result elements with all links rewritten
        to viewer links, or an error string.
        """
        logging.debug("getResultsPage mode=%s, pn=%s", mode, pn)
        # get (cached) result
        self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo)

        resultxml = docinfo.get('resultXML', None)
        if not resultxml:
            logging.error("getResultPage: unable to find resultXML")
            return "Error: no result!"

        if size is None:
            size = pageinfo.get('resultPageSize', 10)

        if start is None:
            if pn is None:
                pn = 1
            # start is 1-based (see first = start - 1 below)
            start = (pn - 1) * size + 1

        fullresult = ET.fromstring(resultxml)

        if fullresult is not None:
            # paginate
            # clamp at 0: a negative index would cut from the end instead
            first = max(start - 1, 0)
            del fullresult[:first]
            del fullresult[size:]
            tocdivs = fullresult

            # check all a-tags
            links = tocdivs.findall(".//a")
            for l in links:
                href = l.get('href')
                if href:
                    # assume all links go to pages
                    linkUrl = urlparse.urlparse(href)
                    linkParams = urlparse.parse_qs(linkUrl.query)
                    # take some parameters
                    params = {'pn': linkParams['pn'],
                              'highlightQuery': linkParams.get('highlightQuery', ''),
                              'highlightElement': linkParams.get('highlightElement', ''),
                              'highlightElementPos': linkParams.get('highlightElementPos', '')
                              }
                    url = self.getLink(params=params)
                    l.set('href', url)

            return serialize(tocdivs)

        return "ERROR: no results!"

    def getToc(self, mode='text', docinfo=None):
        """Return list of table of contents entries from docinfo.

        mode 'text' selects the 'toc' list, any other mode is used as the
        list name itself. The list is fetched with getTextInfo if it is
        not in docinfo yet; returns [] if it is not available.
        """
        logging.debug("getToc mode=%s", mode)
        if mode == 'text':
            queryType = 'toc'
        else:
            queryType = mode

        if 'full_%s' % queryType not in docinfo:
            # get new toc
            docinfo = self.getTextInfo(queryType, docinfo)

        return docinfo.get('full_%s' % queryType, [])

    def getTocPage(self, mode='text', pn=None, start=None, size=None, pageinfo=None, docinfo=None):
        """Return single page from the table of contents as HTML.

        start: 1-based index of the first entry to show; if None it is
            derived from the page number pn (pn=None is treated as 1).
        size: number of entries per page; if None it is taken from
            pageinfo['tocPageSize'] (default 30).
        """
        logging.debug("getTocPage mode=%s, pn=%s start=%s size=%s", mode, repr(pn), repr(start), repr(size))
        fulltoc = self.getToc(mode=mode, docinfo=docinfo)
        if not fulltoc:
            logging.error("getTocPage: unable to find toc!")
            return "Error: no table of contents!"

        if size is None:
            size = pageinfo.get('tocPageSize', 30)

        if start is None:
            if pn is None:
                pn = 1
            # start is 1-based (see first = start - 1 below)
            start = (pn - 1) * size + 1

        # paginate
        # clamp at 0: a negative index would slice from the end instead
        first = max(start - 1, 0)
        last = first + size
        tocs = fulltoc[first:last]
        parts = ['<div>']
        for toc in tocs:
            pageurl = self.getLink('pn', toc['pn'])
            parts.append('<div class="tocline">')
            parts.append('<div class="toc name">[%s %s]</div>' % (toc['level-string'], toc['content']))
            parts.append('<div class="toc float right page"><a href="%s">Page: %s</a></div>' % (pageurl, toc['pn']))
            parts.append('</div>\n')

        parts.append('</div>\n')

        return ''.join(parts)

    def manage_changeMpdlXmlTextServer(self, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
        """Change settings (title, server URL and timeout)."""
        self.title = title
        self.timeout = timeout
        self.serverUrl = serverUrl
        if RESPONSE is not None:
            RESPONSE.redirect('manage_main')


# management methods
def manage_addMpdlXmlTextServerForm(self):
    """Form for adding"""
    pt = PageTemplateFile("zpt/manage_addMpdlXmlTextServer", globals()).__of__(self)
    return pt()


def manage_addMpdlXmlTextServer(self, id, title="", serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", timeout=40, RESPONSE=None):
#def manage_addMpdlXmlTextServer(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None):
    """Add a new MpdlXmlTextServer object with the given id to the destination folder."""
    # timeout must be passed by keyword: the fourth positional parameter
    # of the constructor is serverName, not timeout
    newObj = MpdlXmlTextServer(id, title, serverUrl, timeout=timeout)
    self.Destination()._setObject(id, newObj)
    if RESPONSE is not None:
        RESPONSE.redirect('manage_main')