--- documentViewer/MpdlXmlTextServer.py 2011/07/14 17:43:56 1.238.2.1 +++ documentViewer/MpdlXmlTextServer.py 2012/03/20 14:09:17 1.245 @@ -3,74 +3,13 @@ from OFS.SimpleItem import SimpleItem from Products.PageTemplates.PageTemplateFile import PageTemplateFile from Ft.Xml import EMPTY_NAMESPACE, Parse from Ft.Xml.Domlette import NonvalidatingReader -import Ft.Xml.Domlette -import cStringIO - -import xml.etree.ElementTree as ET import md5 import sys import logging import urllib import documentViewer -#from documentViewer import getTextFromNode, serializeNode - -def getText(node): - """get the cdata content of a node""" - if node is None: - return "" - # ET: - text = node.text or "" - for e in node: - text += gettext(e) - if e.tail: - text += e.tail - - return text - -def serialize(node): - """returns a string containing an XML snippet of node""" - s = ET.tostring(node, 'UTF-8') - # snip off XML declaration - if s.startswith('') - return s[i+3:] - - return s - - -def getTextFromNode(node): - """get the cdata content of a node""" - if node is None: - return "" - # ET: - #text = node.text or "" - #for e in node: - # text += gettext(e) - # if e.tail: - # text += e.tail - - # 4Suite: - nodelist=node.childNodes - text = "" - for n in nodelist: - if n.nodeType == node.TEXT_NODE: - text = text + n.data - - return text - -def serializeNode(node, encoding="utf-8"): - """returns a string containing node as XML""" - #s = ET.tostring(node) - - # 4Suite: - stream = cStringIO.StringIO() - Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding) - s = stream.getvalue() - stream.close() - - return s - +from documentViewer import getTextFromNode, serializeNode class MpdlXmlTextServer(SimpleItem): """TextServer implementation for MPDL-XML eXist server""" @@ -82,7 +21,7 @@ class MpdlXmlTextServer(SimpleItem): manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) - def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): + def __init__(self,id,title="",serverUrl="http://mpdl-test.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): """constructor""" @@ -109,7 +48,11 @@ class MpdlXmlTextServer(SimpleItem): url = docinfo['url'] pagesize = pageinfo['queryPageSize'] pn = pageinfo.get('searchPN',1) - sn = pageinfo['sn'] + #sn = pageinfo['sn'] + s = pageinfo['s'] + highlightElementPos =pageinfo ['highlightElementPos'] + highlightElement = pageinfo ['highlightElement'] + highlightQuery = pageinfo['highlightQuery'] query =pageinfo['query'] queryType =pageinfo['queryType'] @@ -119,7 +62,8 @@ class MpdlXmlTextServer(SimpleItem): #optionToggle = pageinfo['optionToggle'] tocPN = pageinfo['tocPN'] selfurl = self.absolute_url() - data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) + data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&s=%s&viewMode=%s&characterNormalization=%s&highlightElementPos=%s&highlightElement=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, s, viewMode,characterNormalization, highlightElementPos, highlightElement, urllib.quote(highlightQuery))) + #data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) pagedom = Parse(pagexml) @@ -167,7 +111,7 @@ class MpdlXmlTextServer(SimpleItem): if href.startswith('../lt/lemma.xql'): hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") + l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') pagedivs = pagedom.xpath("//div[@class='queryResultMorphExpansion']") return serializeNode(pagenode) @@ -189,7 +133,7 @@ class MpdlXmlTextServer(SimpleItem): if href.startswith('../lt/lemma.xql'): hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_lemma'%(selfurl)) l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=500,height=600,top=180, left=400, scrollbars=1'); return false;") + l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'contacts', 'location,width=300,height=400,top=180, left=400, scrollbars=1'); return false;") l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') return serializeNode(pagenode) return "no text here" @@ -246,29 +190,36 @@ class MpdlXmlTextServer(SimpleItem): url = docinfo.get('url',None) name = docinfo.get('name',None) pn =pageinfo['current'] - sn = pageinfo['sn'] + #sn = pageinfo['sn'] + s = pageinfo['s'] + highlightElementPos =pageinfo ['highlightElementPos'] + highlightElement = pageinfo ['highlightElement'] #optionToggle =pageinfo ['optionToggle'] highlightQuery = pageinfo['highlightQuery'] #mode = pageinfo ['viewMode'] tocMode = pageinfo['tocMode'] + xpointer = pageinfo['xpointer'] characterNormalization=pageinfo['characterNormalization'] tocPN = pageinfo['tocPN'] selfurl = self.absolute_url() + if mode == "text_dict": textmode = "textPollux" else: textmode = mode - textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) + textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s"%(docpath,textmode,pn,characterNormalization, xpointer) + #textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&xpointer=%s&options=withIdentifier"%(docpath,textmode,pn,characterNormalization, xpointer) if highlightQuery is not None: - textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) + #textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) + textParam +="&highlightQuery=%s&s=%s&highlightElement=%s&highlightElementPos=%s"%(urllib.quote(highlightQuery),s, highlightElement, highlightElementPos) pagexml = self.getServerData("page-fragment.xql",textParam) - dom = ET.fromstring(pagexml) + dom = Parse(pagexml) #dom = NonvalidatingReader.parseStream(pagexml) #original Pages - #pagedivs = dom.xpath("//div[@class='pageNumberOrig']") + pagedivs = dom.xpath("//div[@class='pageNumberOrig']") """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): if len(pagedivs)>0: @@ -283,150 +234,161 @@ class MpdlXmlTextServer(SimpleItem): logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) """ #figureEntries -# pagedivs = dom.xpath("//div[@class='countFigureEntries']") -# if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): -# if len(pagedivs)>0: -# docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) -# s = getTextFromNode(pagedivs[0]) -# if s=='0': -# try: -# docinfo['countFigureEntries'] = int(s) -# except: -# docinfo['countFigureEntries'] = 0 -# else: -# s1 = int(s)/30+1 -# try: -# docinfo['countFigureEntries'] = int(s1) -# except: -# docinfo['countFigureEntries'] = 0 -# -# #allPlaces -# pagedivs = dom.xpath("//div[@class='countPlaces']") -# if pagedivs == dom.xpath("//div[@class='countPlaces']"): -# if len(pagedivs)>0: -# docinfo['countPlaces']= getTextFromNode(pagedivs[0]) -# s = getTextFromNode(pagedivs[0]) -# try: -# docinfo['countPlaces'] = int(s) -# except: -# docinfo['countPlaces'] = 0 -# -# #tocEntries -# pagedivs = dom.xpath("//div[@class='countTocEntries']") -# if pagedivs == dom.xpath("//div[@class='countTocEntries']"): -# if len(pagedivs)>0: -# docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) -# s = getTextFromNode(pagedivs[0]) -# if s=='0': -# try: -# docinfo['countTocEntries'] = int(s) -# except: -# docinfo['countTocEntries'] = 0 -# else: -# s1 = int(s)/30+1 -# try: -# docinfo['countTocEntries'] = int(s1) -# except: -# docinfo['countTocEntries'] = 0 + pagedivs = dom.xpath("//div[@class='countFigureEntries']") + if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): + if len(pagedivs)>0: + docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) + s = getTextFromNode(pagedivs[0]) + if s=='0': + try: + docinfo['countFigureEntries'] = int(s) + except: + docinfo['countFigureEntries'] = 0 + else: + s1 = int(s)/30+1 + try: + docinfo['countFigureEntries'] = int(s1) + except: + docinfo['countFigureEntries'] = 0 + + #allPlaces + pagedivs = dom.xpath("//div[@class='countPlaces']") + if pagedivs == dom.xpath("//div[@class='countPlaces']"): + if len(pagedivs)>0: + docinfo['countPlaces']= getTextFromNode(pagedivs[0]) + s = getTextFromNode(pagedivs[0]) + try: + docinfo['countPlaces'] = int(s) + except: + docinfo['countPlaces'] = 0 + + #tocEntries + pagedivs = dom.xpath("//div[@class='countTocEntries']") + if pagedivs == dom.xpath("//div[@class='countTocEntries']"): + if len(pagedivs)>0: + docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) + s = getTextFromNode(pagedivs[0]) + if s=='0': + try: + docinfo['countTocEntries'] = int(s) + except: + docinfo['countTocEntries'] = 0 + else: + s1 = int(s)/30+1 + try: + docinfo['countTocEntries'] = int(s1) + except: + docinfo['countTocEntries'] = 0 #numTextPages - #pagedivs = dom.xpath("//div[@class='countPages']") - alldivs = dom.findall(".//div") - pagediv = None - for div in alldivs: - dc = div.get('class') - if dc == 'pageContent': - pagediv = div + pagedivs = dom.xpath("//div[@class='countPages']") + if pagedivs == dom.xpath("//div[@class='countPages']"): + if len(pagedivs)>0: + docinfo['numPages'] = getTextFromNode(pagedivs[0]) + s = getTextFromNode(pagedivs[0]) - if dc == 'countPages': try: - np = int(div.text) - docinfo['numPages'] = np + docinfo['numPages'] = int(s) + #logging.debug("PAGE NUMBER: %s"%(s)) + + np = docinfo['numPages'] pageinfo['end'] = min(pageinfo['end'], np) pageinfo['numgroups'] = int(np / pageinfo['groupsize']) if np % pageinfo['groupsize'] > 0: - pageinfo['numgroups'] += 1 - + pageinfo['numgroups'] += 1 except: docinfo['numPages'] = 0 - - break -# ROC: why? -# else: -# #no full text -- init to 0 -# docinfo['pageNumberOrig'] = 0 -# docinfo['countFigureEntries'] = 0 -# docinfo['countPlaces'] = 0 -# docinfo['countTocEntries'] = 0 -# docinfo['numPages'] = 0 -# docinfo['pageNumberOrigNorm'] = 0 -# #return docinfo + else: + #no full text -- init to 0 + docinfo['pageNumberOrig'] = 0 + docinfo['countFigureEntries'] = 0 + docinfo['countPlaces'] = 0 + docinfo['countTocEntries'] = 0 + docinfo['numPages'] = 0 + docinfo['pageNumberOrigNorm'] = 0 + #return docinfo # plain text mode if mode == "text": - #pagedivs = dom.xpath("/div") - if pagediv: - links = pagediv.findall(".//a") + # first div contains text + pagedivs = dom.xpath("/div") + if len(pagedivs) > 0: + pagenode = pagedivs[0] + links = pagenode.xpath("//a") for l in links: - href = l.get('href') - if href and href.startswith('#note-'): - href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - l.set('href', href) - logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8')) - return serialize(pagediv) - + hrefNode = l.getAttributeNodeNS(None, u"href") + if hrefNode: + href= hrefNode.nodeValue + if href.startswith('#note-'): + hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) + #if href.startswith(): + return serializeNode(pagenode) if mode == "xml": - if pagediv: - return serialize(pagediv) - - if mode == "pureXml": - if pagediv: - return serialize(pagediv) - + # first div contains text + pagedivs = dom.xpath("/div") + if len(pagedivs) > 0: + pagenode = pagedivs[0] + return serializeNode(pagenode) if mode == "gis": - if pagediv: - # check all a-tags - links = pagediv.findall(".//a") - for l in links: - href = l.get('href') - if href: - if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): - l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)) - l.set('target', '_blank') - - return serialize(pagenode) + # first div contains text + pagedivs = dom.xpath("/div") + if len(pagedivs) > 0: + pagenode = pagedivs[0] + links =pagenode.xpath("//a") + for l in links: + hrefNode =l.getAttributeNodeNS(None, u"href") + if hrefNode: + href=hrefNode.nodeValue + if href.startswith('http://mappit.mpiwg-berlin.mpg.de'): + hrefNode.nodeValue =href.replace('db/REST/db/chgis/mpdl','db/RESTdb/db/mpdl/%s'%name) + l.setAttributeNS(None, 'target', '_blank') + return serializeNode(pagenode) + if mode == "pureXml": + # first div contains text + pagedivs = dom.xpath("/div") + if len(pagedivs) > 0: + pagenode = pagedivs[0] + return serializeNode(pagenode) # text-with-links mode if mode == "text_dict": - if pagediv: + # first div contains text + #mode = pageinfo ['viewMode'] + pagedivs = dom.xpath("/div") + if len(pagedivs) > 0: + pagenode = pagedivs[0] # check all a-tags - links = pagediv.findall(".//a") + links = pagenode.xpath("//a") + for l in links: - href = l.get('href') + hrefNode = l.getAttributeNodeNS(None, u"href") - if href: + if hrefNode: # is link with href - if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): + href = hrefNode.nodeValue + if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): # is pollux link selfurl = self.absolute_url() # change href - l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl)) + hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) # add target - l.set('target', '_blank') + l.setAttributeNS(None, 'target', '_blank') + #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") + #l.setAttributeNS(None, "ondblclick", "popupWin.focus();") + #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") - if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): + if href.startswith('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): selfurl = self.absolute_url() - l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) - l.set('target', '_blank') - l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - l.set('ondblclick', 'popupWin.focus();') + hrefNode.nodeValue = href.replace('http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) + l.setAttributeNS(None, 'target', '_blank') + l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=300,height=400,top=180, left=700, toolbar=no, scrollbars=1'); return false;") + l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') if href.startswith('#note-'): - l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))) + hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - return serialize(pagediv) - + return serializeNode(pagenode) return "no text here" def getOrigPages(self, docinfo=None, pageinfo=None): @@ -454,9 +416,9 @@ class MpdlXmlTextServer(SimpleItem): return docinfo['pageNumberOrigNorm'] - def getTranslate(self, word=None, language=None): + def getTranslate(self, word=None, language=None, display=None): """translate into another languages""" - data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&output=html") + data = self.getServerData("lt/wordInfo.xql","language="+str(language)+"&word="+urllib.quote(word)+"&display="+urllib.quote(display)+"&output=html") #pagexml=self.template.fulltextclient.eval("/mpdl/interface/lt/lex.xql","document=&language="+str(language)+"&query="+url_quote(str(query))) return data @@ -568,4 +530,4 @@ def manage_addMpdlXmlTextServer(self,id, newObj = MpdlXmlTextServer(id,title,serverUrl,timeout) self.Destination()._setObject(id, newObj) if RESPONSE is not None: - RESPONSE.redirect('manage_main') \ No newline at end of file + RESPONSE.redirect('manage_main')