Mercurial > hg > documentViewer
diff MpdlXmlTextServer.py @ 513:67095296c95a
Merge from elementtree branch
92a6443a6f16ff25674d43814ec0d6c0a43a5e1a
author | casties |
---|---|
date | Tue, 28 Feb 2012 19:10:08 +0100 |
parents | 91daab0c219b 551ca1641a5e |
children | 7d7b639d7be7 |
line wrap: on
line diff
--- a/MpdlXmlTextServer.py Fri Feb 10 14:46:39 2012 +0000 +++ b/MpdlXmlTextServer.py Tue Feb 28 19:10:08 2012 +0100 @@ -1,15 +1,26 @@ - from OFS.SimpleItem import SimpleItem from Products.PageTemplates.PageTemplateFile import PageTemplateFile -from Ft.Xml import EMPTY_NAMESPACE, Parse -from Ft.Xml.Domlette import NonvalidatingReader -import md5 -import sys +import xml.etree.ElementTree as ET + +import re import logging import urllib -import documentViewer -from documentViewer import getTextFromNode, serializeNode +import urlparse +import base64 + +from SrvTxtUtils import getInt, getText, getHttpData + +def serialize(node): + """returns a string containing an XML snippet of node""" + s = ET.tostring(node, 'UTF-8') + # snip off XML declaration + if s.startswith('<?xml'): + i = s.find('?>') + return s[i+3:] + + return s + class MpdlXmlTextServer(SimpleItem): """TextServer implementation for MPDL-XML eXist server""" @@ -21,9 +32,7 @@ manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) - def __init__(self,id,title="",serverUrl="http://mpdl-test.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): - #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): - + def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): """constructor""" self.id=id self.title=title @@ -35,428 +44,348 @@ def getHttpData(self, url, data=None): """returns result from url+data HTTP request""" - return documentViewer.getHttpData(url,data,timeout=self.timeout) + return getHttpData(url,data,timeout=self.timeout) def getServerData(self, method, data=None): """returns result from text server for method+data""" url = self.serverUrl+method - return documentViewer.getHttpData(url,data,timeout=self.timeout) + return getHttpData(url,data,timeout=self.timeout) + - def getSearch(self, pageinfo=None, docinfo=None): - """get search list""" - docpath = docinfo['textURLPath'] - url = docinfo['url'] - pagesize = pageinfo['queryPageSize'] - pn = pageinfo.get('searchPN',1) - #sn = pageinfo['sn'] - s = pageinfo['s'] - highlightElementPos =pageinfo ['highlightElementPos'] - highlightElement = pageinfo ['highlightElement'] - - highlightQuery = pageinfo['highlightQuery'] - query =pageinfo['query'] - queryType =pageinfo['queryType'] - viewMode= pageinfo['viewMode'] - tocMode = pageinfo['tocMode'] - characterNormalization = pageinfo['characterNormalization'] - #optionToggle = pageinfo['optionToggle'] - tocPN = pageinfo['tocPN'] - selfurl = self.absolute_url() - data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) - #data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) - pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) - pagedom = Parse(pagexml) - - """ - pagedivs = pagedom.xpath("//div[@class='queryResultHits']") - if (pagedivs == pagedom.xpath("//div[@class='queryResultHits']")): - if len(pagedivs)>0: - docinfo['queryResultHits'] = int(getTextFromNode(pagedivs[0])) - s = getTextFromNode(pagedivs[0]) - s1 = int(s)/10+1 - try: - docinfo['queryResultHits'] = int(s1) - logging.debug("SEARCH ENTRIES: %s"%(s1)) - except: - docinfo['queryResultHits'] = 0 - """ - if (queryType=="fulltext")or(queryType=="xpath")or(queryType=="xquery")or(queryType=="fulltextMorphLemma"): - pagedivs = pagedom.xpath("//div[@class='queryResultPage']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - if href.startswith('page-fragment.xql'): - selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization)) - hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) - #logging.debug("PUREXML :%s"%(serializeNode(pagenode))) - return serializeNode(pagenode) - if (queryType=="fulltextMorph"): - pagedivs = pagedom.xpath("//div[@class='queryResult']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - if href.startswith('page-fragment.xql'): - selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization)) - hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) - if href.startswith('../lt/lemma.xql'): - hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") - return serializeNode(pagenode) - if (queryType=="ftIndex")or(queryType=="ftIndexMorph"): - pagedivs= pagedom.xpath("//div[@class='queryResultPage']") - if len(pagedivs)>0: - pagenode=pagedivs[0] - links=pagenode.xpath("//a") - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href = hrefNode.nodeValue - hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization)) - if href.startswith('../lt/lex.xql'): - hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - if href.startswith('../lt/lemma.xql'): - hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl)) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - return serializeNode(pagenode) - return "no text here" - - def getGisPlaces(self, docinfo=None, pageinfo=None): - """ Show all Gis Places of whole Page""" - xpath='//place' + def getPlacesOnPage(self, docinfo=None, pn=None): + """Returns list of GIS places of page pn""" docpath = docinfo.get('textURLPath',None) if not docpath: return None - url = docinfo['url'] - selfurl = self.absolute_url() - pn = pageinfo['current'] - hrefList=[] - myList= "" - text=self.getServerData("xpath.xql", "document=%s&xpath=%s&pn=%s"%(docinfo['textURLPath'],xpath,pn)) - dom = Parse(text) - result = dom.xpath("//result/resultPage/place") + places=[] + text=self.getServerData("xpath.xql", "document=%s&xpath=//place&pn=%s"%(docpath,pn)) + dom = ET.fromstring(text) + result = dom.findall(".//resultPage/place") for l in result: - hrefNode= l.getAttributeNodeNS(None, u"id") - href= hrefNode.nodeValue - hrefList.append(href) - myList = ",".join(hrefList) - #logging.debug("getGisPlaces :%s"%(myList)) - return myList + id = l.get("id") + name = l.text + place = {'id': id, 'name': name} + places.append(place) + + return places - def getAllGisPlaces (self, docinfo=None, pageinfo=None): - """Show all Gis Places of whole Book """ - xpath ='//echo:place' - docpath =docinfo['textURLPath'] - url = docinfo['url'] - selfurl =self.absolute_url() - pn =pageinfo['current'] - hrefList=[] - myList="" - text=self.getServerData("xpath.xql", "document=%s&xpath=%s"%(docinfo['textURLPath'],xpath)) - dom =Parse(text) - result = dom.xpath("//result/resultPage/place") - - for l in result: - hrefNode = l.getAttributeNodeNS(None, u"id") - href= hrefNode.nodeValue - hrefList.append(href) - myList = ",".join(hrefList) - #logging.debug("getALLGisPlaces :%s"%(myList)) - return myList - - def getTextPage(self, mode="text_dict", pn=1, docinfo=None, pageinfo=None): - """returns single page from fulltext""" - docpath = docinfo['textURLPath'] - path = docinfo['textURLPath'] - url = docinfo.get('url',None) - name = docinfo.get('name',None) - pn =pageinfo['current'] - #sn = pageinfo['sn'] - s = pageinfo['s'] - highlightElementPos =pageinfo ['highlightElementPos'] - highlightElement = pageinfo ['highlightElement'] - #optionToggle =pageinfo ['optionToggle'] - highlightQuery = pageinfo['highlightQuery'] - #mode = pageinfo ['viewMode'] - tocMode = pageinfo['tocMode'] - xpointer = pageinfo['xpointer'] - characterNormalization=pageinfo['characterNormalization'] - tocPN = pageinfo['tocPN'] - selfurl = self.absolute_url() + def processPageInfo(self, dom, docinfo, pageinfo): + """processes page info divs from dom and stores in docinfo and pageinfo""" + # assume first second level div is pageMeta + alldivs = dom.find("div") + + if alldivs is None or alldivs.get('class', '') != 'pageMeta': + logging.error("processPageInfo: pageMeta div not found!") + return - if mode == "text_dict": - textmode = "textPollux" - else: - textmode = mode - - textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s&options=withIdentifier"%(docpath,textmode,pn,characterNormalization, xpointer) - if highlightQuery is not None: - #textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) - textParam +="&highlightQuery=%s&s=%s&highlightElement=%s&highlightElementPos=%s"%(urllib.quote(highlightQuery),s, highlightElement, highlightElementPos) - - pagexml = self.getServerData("page-fragment.xql",textParam) - dom = Parse(pagexml) - #dom = NonvalidatingReader.parseStream(pagexml) - - #original Pages - pagedivs = dom.xpath("//div[@class='pageNumberOrig']") - - """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): - if len(pagedivs)>0: - docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) - logging.debug("ORIGINAL PAGE: %s"%(docinfo['pageNumberOrig'])) + for div in alldivs: + dc = div.get('class') + + # pageNumberOrig + if dc == 'pageNumberOrig': + pageinfo['pageNumberOrig'] = div.text + + # pageNumberOrigNorm + elif dc == 'pageNumberOrigNorm': + pageinfo['pageNumberOrigNorm'] = div.text + + # pageHeaderTitle + elif dc == 'pageHeaderTitle': + pageinfo['pageHeaderTitle'] = div.text + + # numFigureEntries + elif dc == 'countFigureEntries': + docinfo['numFigureEntries'] = getInt(div.text) + + # numTocEntries + elif dc == 'countTocEntries': + # WTF: s1 = int(s)/30+1 + docinfo['numTocEntries'] = getInt(div.text) + + # numPlaces + elif dc == 'countPlaces': + docinfo['numPlaces'] = getInt(div.text) + + # numTextPages + elif dc == 'countPages': + np = getInt(div.text) + if np > 0: + docinfo['numTextPages'] = np + if docinfo.get('numPages', 0) == 0: + # seems to be text-only - update page count + docinfo['numPages'] = np + #pageinfo['end'] = min(pageinfo['end'], np) + pageinfo['numgroups'] = int(np / pageinfo['groupsize']) + if np % pageinfo['groupsize'] > 0: + pageinfo['numgroups'] += 1 - #original Pages Norm - pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): - if len(pagedivs)>0: - docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) - logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) - """ - #figureEntries - pagedivs = dom.xpath("//div[@class='countFigureEntries']") - if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): - if len(pagedivs)>0: - docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countFigureEntries'] = int(s) - except: - docinfo['countFigureEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countFigureEntries'] = int(s1) - except: - docinfo['countFigureEntries'] = 0 + #logging.debug("processPageInfo: pageinfo=%s"%repr(pageinfo)) + return + + + def getTextPage(self, mode="text", pn=1, docinfo=None, pageinfo=None): + """returns single page from fulltext""" - #allPlaces - pagedivs = dom.xpath("//div[@class='countPlaces']") - if pagedivs == dom.xpath("//div[@class='countPlaces']"): - if len(pagedivs)>0: - docinfo['countPlaces']= getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - try: - docinfo['countPlaces'] = int(s) - except: - docinfo['countPlaces'] = 0 + logging.debug("getTextPage mode=%s, pn=%s"%(mode,pn)) + # check for cached text -- but ideally this shouldn't be called twice + if pageinfo.has_key('textPage'): + logging.debug("getTextPage: using cached text") + return pageinfo['textPage'] + + docpath = docinfo['textURLPath'] + # just checking + if pageinfo['current'] != pn: + logging.warning("getTextPage: current!=pn!") + + # stuff for constructing full urls + selfurl = docinfo['viewerUrl'] + textParams = {'document': docpath, + 'pn': pn} + if 'characterNormalization' in pageinfo: + textParams['characterNormalization'] = pageinfo['characterNormalization'] - #tocEntries - pagedivs = dom.xpath("//div[@class='countTocEntries']") - if pagedivs == dom.xpath("//div[@class='countTocEntries']"): - if len(pagedivs)>0: - docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countTocEntries'] = int(s) - except: - docinfo['countTocEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countTocEntries'] = int(s1) - except: - docinfo['countTocEntries'] = 0 + if not mode: + # default is dict + mode = 'text' + + modes = mode.split(',') + # check for multiple layers + if len(modes) > 1: + logging.debug("getTextPage: more than one mode=%s"%mode) + + # search mode + if 'search' in modes: + # add highlighting + highlightQuery = pageinfo.get('highlightQuery', None) + if highlightQuery: + textParams['highlightQuery'] = highlightQuery + textParams['highlightElement'] = pageinfo.get('highlightElement', '') + textParams['highlightElementPos'] = pageinfo.get('highlightElementPos', '') + + # ignore mode in the following + modes.remove('search') + + # other modes don't combine + if 'dict' in modes: + # dict is called textPollux in the backend + textmode = 'textPollux' + elif len(modes) == 0: + # text is default mode + textmode = 'text' + else: + # just take first mode + textmode = modes[0] - #numTextPages - pagedivs = dom.xpath("//div[@class='countPages']") - if pagedivs == dom.xpath("//div[@class='countPages']"): - if len(pagedivs)>0: - docinfo['numPages'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - - try: - docinfo['numPages'] = int(s) - #logging.debug("PAGE NUMBER: %s"%(s)) - - np = docinfo['numPages'] - pageinfo['end'] = min(pageinfo['end'], np) - pageinfo['numgroups'] = int(np / pageinfo['groupsize']) - if np % pageinfo['groupsize'] > 0: - pageinfo['numgroups'] += 1 - except: - docinfo['numPages'] = 0 - - else: - #no full text -- init to 0 - docinfo['pageNumberOrig'] = 0 - docinfo['countFigureEntries'] = 0 - docinfo['countPlaces'] = 0 - docinfo['countTocEntries'] = 0 - docinfo['numPages'] = 0 - docinfo['pageNumberOrigNorm'] = 0 - #return docinfo + textParams['mode'] = textmode + + # fetch the page + pagexml = self.getServerData("page-fragment.xql",urllib.urlencode(textParams)) + dom = ET.fromstring(pagexml) + # extract additional info + self.processPageInfo(dom, docinfo, pageinfo) + # page content is in <div class="pageContent"> + pagediv = None + # ElementTree 1.2 in Python 2.6 can't do div[@class='pageContent'] + # so we look at the second level divs + alldivs = dom.findall("div") + for div in alldivs: + dc = div.get('class') + # page content div + if dc == 'pageContent': + pagediv = div + break # plain text mode - if mode == "text": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links = pagenode.xpath("//a") + if textmode == "text": + # get full url assuming documentViewer is parent + selfurl = self.getLink() + if pagediv is not None: + links = pagediv.findall(".//a") for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href= hrefNode.nodeValue - if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - #if href.startswith(): - return serializeNode(pagenode) - if mode == "xml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) - if mode == "gis": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links =pagenode.xpath("//a") - for l in links: - hrefNode =l.getAttributeNodeNS(None, u"href") - if hrefNode: - href=hrefNode.nodeValue - if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): - hrefNode.nodeValue =href.replace('db/REST/db/chgis/mpdl','db/RESTdb/db/mpdl/%s'%name) - l.setAttributeNS(None, 'target', '_blank') - return serializeNode(pagenode) + href = l.get('href') + if href and href.startswith('#note-'): + href = href.replace('#note-',"%s#note-"%selfurl) + l.set('href', href) + + return serialize(pagediv) + + # text-with-links mode + elif textmode == "textPollux": + if pagediv is not None: + viewerurl = docinfo['viewerUrl'] + selfurl = self.getLink() + # check all a-tags + links = pagediv.findall(".//a") + for l in links: + href = l.get('href') - if mode == "pureXml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) - # text-with-links mode - if mode == "text_dict": - # first div contains text - #mode = pageinfo ['viewMode'] - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - # check all a-tags - links = pagenode.xpath("//a") - - for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - - if hrefNode: + if href: # is link with href - href = hrefNode.nodeValue - if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): - # is pollux link - selfurl = self.absolute_url() - # change href - hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) - # add target - l.setAttributeNS(None, 'target', '_blank') - #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - #l.setAttributeNS(None, "ondblclick", "popupWin.focus();") - #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") + linkurl = urlparse.urlparse(href) + #logging.debug("getTextPage: linkurl=%s"%repr(linkurl)) + if linkurl.path.endswith('GetDictionaryEntries'): + #TODO: replace wordInfo page + # is dictionary link - change href (keeping parameters) + #l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/template/viewer_wordinfo'%viewerurl)) + # add target to open new page + l.set('target', '_blank') - if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): - selfurl = self.absolute_url() - hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=300,height=400,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') + # TODO: is this needed? +# if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): +# selfurl = self.absolute_url() +# l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) +# l.set('target', '_blank') +# l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") +# l.set('ondblclick', 'popupWin.focus();') if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) + # note link + l.set('href', href.replace('#note-',"%s#note-"%selfurl)) - return serializeNode(pagenode) - return "no text here" - - def getOrigPages(self, docinfo=None, pageinfo=None): - docpath = docinfo['textURLPath'] - pn =pageinfo['current'] - selfurl = self.absolute_url() - pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) - dom = Parse(pagexml) - pagedivs = dom.xpath("//div[@class='pageNumberOrig']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): - if len(pagedivs)>0: - docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) - return docinfo['pageNumberOrig'] - - def getOrigPagesNorm(self, docinfo=None, pageinfo=None): - docpath = docinfo['textURLPath'] - pn =pageinfo['current'] - selfurl = self.absolute_url() - pagexml = self.getServerData("page-fragment.xql","document=%s&pn=%s"%(docpath, pn)) - dom = Parse(pagexml) - pagedivs = dom.xpath("//div[@class='pageNumberOrigNorm']") - if pagedivs == dom.xpath("//div[@class='pageNumberOrigNorm']"): - if len(pagedivs)>0: - docinfo['pageNumberOrigNorm']= getTextFromNode(pagedivs[0]) - return docinfo['pageNumberOrigNorm'] - - - def getTranslate(self, word=None, language=None, display=None): - """translate into another languages""" - data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&display="+urllib.quote(display)+"&output=html") - #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) - return data + return serialize(pagediv) + + # xml mode + elif textmode == "xml": + if pagediv is not None: + return serialize(pagediv) + + # pureXml mode + elif textmode == "pureXml": + if pagediv is not None: + return serialize(pagediv) + + # gis mode + elif textmode == "gis": + if pagediv is not None: + # check all a-tags + links = pagediv.findall(".//a") + # add our URL as backlink + selfurl = self.getLink() + doc = base64.b64encode(selfurl) + for l in links: + href = l.get('href') + if href: + if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): + l.set('href', re.sub(r'doc=[\w+/=]+', 'doc=%s'%doc, href)) + l.set('target', '_blank') + + return serialize(pagediv) + + return None - def getLemma(self, lemma=None, language=None): - """simular words lemma """ - data = self.getServerData("lt/lemma.xql","language="+str(language)+"&lemma="+urllib.quote(lemma)+"&output=html") - return data - - def getLemmaQuery(self, query=None, language=None): - """simular words lemma """ - data = self.getServerData("lt/lemma.xql","language="+str(language)+"&query="+urllib.quote(query)+"&output=html") - return data - - def getLex(self, query=None, language=None): - #simular words lemma - data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) - return data + + def getSearchResults(self, mode, query=None, pageinfo=None, docinfo=None): + """loads list of search results and stores XML in docinfo""" + + logging.debug("getSearchResults mode=%s query=%s"%(mode, query)) + if mode == "none": + return docinfo + + cachedQuery = docinfo.get('cachedQuery', None) + if cachedQuery is not None: + # cached search result + if cachedQuery == '%s_%s'%(mode,query): + # same query + return docinfo + + else: + # different query + del docinfo['resultSize'] + del docinfo['resultXML'] + + # cache query + docinfo['cachedQuery'] = '%s_%s'%(mode,query) + + # fetch full results + docpath = docinfo['textURLPath'] + params = {'document': docpath, + 'mode': 'text', + 'queryType': mode, + 'query': query, + 'queryResultPageSize': 1000, + 'queryResultPN': 1, + 'characterNormalization': pageinfo.get('characterNormalization', 'reg')} + pagexml = self.getServerData("doc-query.xql",urllib.urlencode(params)) + #pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) + dom = ET.fromstring(pagexml) + # page content is in <div class="queryResultPage"> + pagediv = None + # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] + alldivs = dom.findall("div") + for div in alldivs: + dc = div.get('class') + # page content div + if dc == 'queryResultPage': + pagediv = div + + elif dc == 'queryResultHits': + docinfo['resultSize'] = getInt(div.text) + + if pagediv is not None: + # store XML in docinfo + docinfo['resultXML'] = ET.tostring(pagediv, 'UTF-8') + + return docinfo - def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): - #number of - docpath = docinfo['textURLPath'] - pagesize = pageinfo['queryPageSize'] - pn = pageinfo['searchPN'] - query =pageinfo['query'] - queryType =pageinfo['queryType'] - tocSearch = 0 - tocDiv = None - - pagexml = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn)) - pagedom = Parse(pagexml) - numdivs = pagedom.xpath("//div[@class='queryResultHits']") - tocSearch = int(getTextFromNode(numdivs[0])) - tc=int((tocSearch/10)+1) - return tc - + + def getResultsPage(self, mode="text", query=None, pn=None, start=None, size=None, pageinfo=None, docinfo=None): + """returns single page from the table of contents""" + logging.debug("getResultsPage mode=%s, pn=%s"%(mode,pn)) + # check for cached result + if not 'resultXML' in docinfo: + self.getSearchResults(mode=mode, query=query, pageinfo=pageinfo, docinfo=docinfo) + + resultxml = docinfo.get('resultXML', None) + if not resultxml: + logging.error("getResultPage: unable to find resultXML") + return "Error: no result!" + + if size is None: + size = pageinfo.get('resultPageSize', 10) + + if start is None: + start = (pn - 1) * size + + fullresult = ET.fromstring(resultxml) + + if fullresult is not None: + # paginate + first = start-1 + len = size + del fullresult[:first] + del fullresult[len:] + tocdivs = fullresult + + # check all a-tags + links = tocdivs.findall(".//a") + for l in links: + href = l.get('href') + if href: + # assume all links go to pages + linkUrl = urlparse.urlparse(href) + linkParams = urlparse.parse_qs(linkUrl.query) + # take some parameters + params = {'pn': linkParams['pn'], + 'highlightQuery': linkParams.get('highlightQuery',''), + 'highlightElement': linkParams.get('highlightElement',''), + 'highlightElementPos': linkParams.get('highlightElementPos','') + } + url = self.getLink(params=params) + l.set('href', url) + + return serialize(tocdivs) + + return "ERROR: no results!" + + def getToc(self, mode="text", docinfo=None): - """loads table of contents and stores in docinfo""" + """loads table of contents and stores XML in docinfo""" + logging.debug("getToc mode=%s"%mode) if mode == "none": - return docinfo + return docinfo + if 'tocSize_%s'%mode in docinfo: # cached toc return docinfo @@ -472,44 +401,89 @@ # number of entries in toc tocSize = 0 tocDiv = None - + # fetch full toc pagexml = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s"%(docpath,queryType, pagesize, pn)) - - # post-processing downloaded xml - pagedom = Parse(pagexml) - # get number of entries - numdivs = pagedom.xpath("//div[@class='queryResultHits']") - if len(numdivs) > 0: - tocSize = int(getTextFromNode(numdivs[0])) - docinfo['tocSize_%s'%mode] = tocSize + dom = ET.fromstring(pagexml) + # page content is in <div class="queryResultPage"> + pagediv = None + # ElementTree 1.2 in Python 2.6 can't do div[@class='queryResultPage'] + alldivs = dom.findall("div") + for div in alldivs: + dc = div.get('class') + # page content div + if dc == 'queryResultPage': + pagediv = div + + elif dc == 'queryResultHits': + docinfo['tocSize_%s'%mode] = getInt(div.text) + + if pagediv is not None: + # store XML in docinfo + docinfo['tocXML_%s'%mode] = ET.tostring(pagediv, 'UTF-8') + return docinfo - def getTocPage(self, mode="text", pn=1, pageinfo=None, docinfo=None): + def getTocPage(self, mode="text", pn=None, start=None, size=None, pageinfo=None, docinfo=None): """returns single page from the table of contents""" - # TODO: this should use the cached TOC + logging.debug("getTocPage mode=%s, pn=%s"%(mode,pn)) if mode == "text": queryType = "toc" else: queryType = mode - docpath = docinfo['textURLPath'] - path = docinfo['textURLPath'] - pagesize = pageinfo['tocPageSize'] - pn = pageinfo['tocPN'] - url = docinfo['url'] - selfurl = self.absolute_url() - viewMode= pageinfo['viewMode'] - characterNormalization = pageinfo ['characterNormalization'] - #optionToggle =pageinfo ['optionToggle'] - tocMode = pageinfo['tocMode'] - tocPN = pageinfo['tocPN'] + + # check for cached TOC + if not docinfo.has_key('tocXML_%s'%mode): + self.getToc(mode=mode, docinfo=docinfo) + + tocxml = docinfo.get('tocXML_%s'%mode, None) + if not tocxml: + logging.error("getTocPage: unable to find tocXML") + return "Error: no table of contents!" + + if size is None: + size = pageinfo.get('tocPageSize', 30) + + if start is None: + start = (pn - 1) * size + + fulltoc = ET.fromstring(tocxml) - data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) - page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) - text = page.replace('mode=image','mode=texttool') - return text + if fulltoc is not None: + # paginate + first = (start - 1) * 2 + len = size * 2 + del fulltoc[:first] + del fulltoc[len:] + tocdivs = fulltoc + + # check all a-tags + links = tocdivs.findall(".//a") + for l in links: + href = l.get('href') + if href: + # take pn from href + m = re.match(r'page-fragment\.xql.*pn=(\d+)', href) + if m is not None: + # and create new url (assuming parent is documentViewer) + url = self.getLink('pn', m.group(1)) + l.set('href', url) + else: + logging.warning("getTocPage: Problem with link=%s"%href) + + # fix two-divs-per-row with containing div + newtoc = ET.Element('div', {'class':'queryResultPage'}) + for (d1,d2) in zip(tocdivs[::2],tocdivs[1::2]): + e = ET.Element('div',{'class':'tocline'}) + e.append(d1) + e.append(d2) + newtoc.append(e) + + return serialize(newtoc) + + return "ERROR: no table of contents!" + def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/",timeout=40,RESPONSE=None): - #def manage_changeMpdlXmlTextServer(self,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/",timeout=40,RESPONSE=None): """change settings""" self.title=title self.timeout = timeout @@ -529,4 +503,6 @@ newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) self.Destination()._setObject(id, newObj) if RESPONSE is not None: - RESPONSE.redirect('manage_main') \ No newline at end of file + RESPONSE.redirect('manage_main') + + \ No newline at end of file