--- documentViewer/MpdlXmlTextServer.py 2011/04/28 09:58:39 1.235 +++ documentViewer/MpdlXmlTextServer.py 2011/07/14 17:43:56 1.238.2.1 @@ -2,13 +2,75 @@ from OFS.SimpleItem import SimpleItem from Products.PageTemplates.PageTemplateFile import PageTemplateFile from Ft.Xml import EMPTY_NAMESPACE, Parse +from Ft.Xml.Domlette import NonvalidatingReader +import Ft.Xml.Domlette +import cStringIO + +import xml.etree.ElementTree as ET import md5 import sys import logging import urllib import documentViewer -from documentViewer import getTextFromNode, serializeNode +#from documentViewer import getTextFromNode, serializeNode + +def getText(node): + """get the cdata content of a node""" + if node is None: + return "" + # ET: + text = node.text or "" + for e in node: + text += gettext(e) + if e.tail: + text += e.tail + + return text + +def serialize(node): + """returns a string containing an XML snippet of node""" + s = ET.tostring(node, 'UTF-8') + # snip off XML declaration + if s.startswith('') + return s[i+3:] + + return s + + +def getTextFromNode(node): + """get the cdata content of a node""" + if node is None: + return "" + # ET: + #text = node.text or "" + #for e in node: + # text += gettext(e) + # if e.tail: + # text += e.tail + + # 4Suite: + nodelist=node.childNodes + text = "" + for n in nodelist: + if n.nodeType == node.TEXT_NODE: + text = text + n.data + + return text + +def serializeNode(node, encoding="utf-8"): + """returns a string containing node as XML""" + #s = ET.tostring(node) + + # 4Suite: + stream = cStringIO.StringIO() + Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding) + s = stream.getvalue() + stream.close() + + return s + class MpdlXmlTextServer(SimpleItem): """TextServer implementation for MPDL-XML eXist server""" @@ -20,7 +82,7 @@ class MpdlXmlTextServer(SimpleItem): manage_changeMpdlXmlTextServerForm = PageTemplateFile("zpt/manage_changeMpdlXmlTextServer", globals()) - def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): + def __init__(self,id,title="",serverUrl="http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/interface/", serverName=None, timeout=40): #def __init__(self,id,title="",serverUrl="http://mpdl-text.mpiwg-berlin.mpg.de:30030/mpdl/interface/", serverName=None, timeout=40): """constructor""" @@ -54,10 +116,10 @@ class MpdlXmlTextServer(SimpleItem): viewMode= pageinfo['viewMode'] tocMode = pageinfo['tocMode'] characterNormalization = pageinfo['characterNormalization'] - optionToggle = pageinfo['optionToggle'] + #optionToggle = pageinfo['optionToggle'] tocPN = pageinfo['tocPN'] selfurl = self.absolute_url() - data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&optionToggle=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization,optionToggle ,urllib.quote(highlightQuery))) + data = self.getServerData("doc-query.xql","document=%s&mode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&sn=%s&viewMode=%s&characterNormalization=%s&highlightQuery=%s"%(docpath, 'text', queryType, urllib.quote(query), pagesize, pn, sn, viewMode,characterNormalization, urllib.quote(highlightQuery))) pagexml = data.replace('?document=%s'%str(docpath),'?url=%s'%url) pagedom = Parse(pagexml) @@ -85,7 +147,7 @@ class MpdlXmlTextServer(SimpleItem): href = hrefNode.nodeValue if href.startswith('page-fragment.xql'): selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,optionToggle,characterNormalization)) + pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN, characterNormalization)) hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) #logging.debug("PUREXML :%s"%(serializeNode(pagenode))) return serializeNode(pagenode) @@ -100,7 +162,7 @@ class MpdlXmlTextServer(SimpleItem): href = hrefNode.nodeValue if href.startswith('page-fragment.xql'): selfurl = self.absolute_url() - pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,optionToggle,characterNormalization)) + pagexml=href.replace('mode=text','mode=texttool&viewMode=%s&queryType=%s&query=%s&queryResultPageSize=%s&queryResultPN=%s&tocMode=%s&searchPN=%s&tocPN=%s&characterNormalization=%s'%(viewMode,queryType,urllib.quote(query),pagesize,pn,tocMode,pn,tocPN,characterNormalization)) hrefNode.nodeValue = pagexml.replace('page-fragment.xql','%s'%selfurl) if href.startswith('../lt/lemma.xql'): hrefNode.nodeValue = href.replace('../lt/lemma.xql','%s/template/head_main_query'%(selfurl)) @@ -118,7 +180,7 @@ class MpdlXmlTextServer(SimpleItem): hrefNode = l.getAttributeNodeNS(None, u"href") if hrefNode: href = hrefNode.nodeValue - hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&optionToggle=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,optionToggle,characterNormalization)) + hrefNode.nodeValue=href.replace('mode=text','mode=texttool&viewMode=%s&tocMode=%s&tocPN=%s&pn=%s&characterNormalization=%s'%(viewMode,tocMode,tocPN,pn,characterNormalization)) if href.startswith('../lt/lex.xql'): hrefNode.nodeValue = href.replace('../lt/lex.xql','%s/template/head_main_lex'%selfurl) l.setAttributeNS(None, 'target', '_blank') @@ -185,7 +247,7 @@ class MpdlXmlTextServer(SimpleItem): name = docinfo.get('name',None) pn =pageinfo['current'] sn = pageinfo['sn'] - optionToggle =pageinfo ['optionToggle'] + #optionToggle =pageinfo ['optionToggle'] highlightQuery = pageinfo['highlightQuery'] #mode = pageinfo ['viewMode'] tocMode = pageinfo['tocMode'] @@ -197,14 +259,17 @@ class MpdlXmlTextServer(SimpleItem): else: textmode = mode - textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s&optionToggle=%s"%(docpath,textmode,pn,characterNormalization,optionToggle) + textParam = "document=%s&mode=%s&pn=%s&characterNormalization=%s"%(docpath,textmode,pn,characterNormalization) if highlightQuery is not None: textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) pagexml = self.getServerData("page-fragment.xql",textParam) - dom = Parse(pagexml) + dom = ET.fromstring(pagexml) + #dom = NonvalidatingReader.parseStream(pagexml) + #original Pages - pagedivs = dom.xpath("//div[@class='pageNumberOrig']") + #pagedivs = dom.xpath("//div[@class='pageNumberOrig']") + """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): if len(pagedivs)>0: docinfo['pageNumberOrig']= getTextFromNode(pagedivs[0]) @@ -218,157 +283,150 @@ class MpdlXmlTextServer(SimpleItem): logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) """ #figureEntries - pagedivs = dom.xpath("//div[@class='countFigureEntries']") - if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): - if len(pagedivs)>0: - docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countFigureEntries'] = int(s) - except: - docinfo['countFigureEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countFigureEntries'] = int(s1) - except: - docinfo['countFigureEntries'] = 0 - - #allPlaces - pagedivs = dom.xpath("//div[@class='countPlaces']") - if pagedivs == dom.xpath("//div[@class='countPlaces']"): - if len(pagedivs)>0: - docinfo['countPlaces']= getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) - try: - docinfo['countPlaces'] = int(s) - except: - docinfo['countPlaces'] = 0 - - #tocEntries - pagedivs = dom.xpath("//div[@class='countTocEntries']") - if pagedivs == dom.xpath("//div[@class='countTocEntries']"): - if len(pagedivs)>0: - docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) - s = getTextFromNode(pagedivs[0]) - if s=='0': - try: - docinfo['countTocEntries'] = int(s) - except: - docinfo['countTocEntries'] = 0 - else: - s1 = int(s)/30+1 - try: - docinfo['countTocEntries'] = int(s1) - except: - docinfo['countTocEntries'] = 0 +# pagedivs = dom.xpath("//div[@class='countFigureEntries']") +# if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): +# if len(pagedivs)>0: +# docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) +# s = getTextFromNode(pagedivs[0]) +# if s=='0': +# try: +# docinfo['countFigureEntries'] = int(s) +# except: +# docinfo['countFigureEntries'] = 0 +# else: +# s1 = int(s)/30+1 +# try: +# docinfo['countFigureEntries'] = int(s1) +# except: +# docinfo['countFigureEntries'] = 0 +# +# #allPlaces +# pagedivs = dom.xpath("//div[@class='countPlaces']") +# if pagedivs == dom.xpath("//div[@class='countPlaces']"): +# if len(pagedivs)>0: +# docinfo['countPlaces']= getTextFromNode(pagedivs[0]) +# s = getTextFromNode(pagedivs[0]) +# try: +# docinfo['countPlaces'] = int(s) +# except: +# docinfo['countPlaces'] = 0 +# +# #tocEntries +# pagedivs = dom.xpath("//div[@class='countTocEntries']") +# if pagedivs == dom.xpath("//div[@class='countTocEntries']"): +# if len(pagedivs)>0: +# docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) +# s = getTextFromNode(pagedivs[0]) +# if s=='0': +# try: +# docinfo['countTocEntries'] = int(s) +# except: +# docinfo['countTocEntries'] = 0 +# else: +# s1 = int(s)/30+1 +# try: +# docinfo['countTocEntries'] = int(s1) +# except: +# docinfo['countTocEntries'] = 0 #numTextPages - pagedivs = dom.xpath("//div[@class='countPages']") - if pagedivs == dom.xpath("//div[@class='countPages']"): - if len(pagedivs)>0: - docinfo['numPages'] = getTextFromNode(pagedivs[0]) - s = getTextFromNode(pagedivs[0]) + #pagedivs = dom.xpath("//div[@class='countPages']") + alldivs = dom.findall(".//div") + pagediv = None + for div in alldivs: + dc = div.get('class') + if dc == 'pageContent': + pagediv = div + if dc == 'countPages': try: - docinfo['numPages'] = int(s) - #logging.debug("PAGE NUMBER: %s"%(s)) - - np = docinfo['numPages'] + np = int(div.text) + docinfo['numPages'] = np pageinfo['end'] = min(pageinfo['end'], np) pageinfo['numgroups'] = int(np / pageinfo['groupsize']) if np % pageinfo['groupsize'] > 0: - pageinfo['numgroups'] += 1 + pageinfo['numgroups'] += 1 + except: docinfo['numPages'] = 0 + + break - else: - #no full text -- init to 0 - docinfo['pageNumberOrig'] = 0 - docinfo['countFigureEntries'] = 0 - docinfo['countPlaces'] = 0 - docinfo['countTocEntries'] = 0 - docinfo['numPages'] = 0 - docinfo['pageNumberOrigNorm'] = 0 - #return docinfo +# ROC: why? +# else: +# #no full text -- init to 0 +# docinfo['pageNumberOrig'] = 0 +# docinfo['countFigureEntries'] = 0 +# docinfo['countPlaces'] = 0 +# docinfo['countTocEntries'] = 0 +# docinfo['numPages'] = 0 +# docinfo['pageNumberOrigNorm'] = 0 +# #return docinfo # plain text mode if mode == "text": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links = pagenode.xpath("//a") + #pagedivs = dom.xpath("/div") + if pagediv: + links = pagediv.findall(".//a") for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: - href= hrefNode.nodeValue - if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - return serializeNode(pagenode) + href = l.get('href') + if href and href.startswith('#note-'): + href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) + l.set('href', href) + logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8')) + return serialize(pagediv) + if mode == "xml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) + if pagediv: + return serialize(pagediv) + + if mode == "pureXml": + if pagediv: + return serialize(pagediv) + if mode == "gis": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - links =pagenode.xpath("//a") - for l in links: - hrefNode =l.getAttributeNodeNS(None, u"href") - if hrefNode: - href=hrefNode.nodeValue - if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): - hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name) - l.setAttributeNS(None, 'target', '_blank') - return serializeNode(pagenode) + if pagediv: + # check all a-tags + links = pagediv.findall(".//a") + for l in links: + href = l.get('href') + if href: + if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): + l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)) + l.set('target', '_blank') + + return serialize(pagenode) - if mode == "pureXml": - # first div contains text - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] - return serializeNode(pagenode) # text-with-links mode if mode == "text_dict": - # first div contains text - #mode = pageinfo ['viewMode'] - pagedivs = dom.xpath("/div") - if len(pagedivs) > 0: - pagenode = pagedivs[0] + if pagediv: # check all a-tags - links = pagenode.xpath("//a") + links = pagediv.findall(".//a") for l in links: - hrefNode = l.getAttributeNodeNS(None, u"href") - if hrefNode: + href = l.get('href') + + if href: # is link with href - href = hrefNode.nodeValue if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): # is pollux link selfurl = self.absolute_url() # change href - hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) + l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl)) # add target - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - l.setAttributeNS(None, "onDblclick", "popupWin.focus();") - #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") + l.set('target', '_blank') if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): selfurl = self.absolute_url() - hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) - l.setAttributeNS(None, 'target', '_blank') - l.setAttributeNS(None, 'onClick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") - l.setAttributeNS(None, 'onDblclick', 'popupWin.focus();') - + l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) + l.set('target', '_blank') + l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") + l.set('ondblclick', 'popupWin.focus();') + if href.startswith('#note-'): - hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) - return serializeNode(pagenode) + l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))) + + return serialize(pagediv) + return "no text here" def getOrigPages(self, docinfo=None, pageinfo=None): @@ -417,7 +475,7 @@ class MpdlXmlTextServer(SimpleItem): data = self.getServerData("lt/lex.xql","document=&language="+str(language)+"&query="+urllib.quote(query)) return data - def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1, optionToggle=None): + def getQuery (self, docinfo=None, pageinfo=None, query=None, queryType=None, pn=1): #number of docpath = docinfo['textURLPath'] pagesize = pageinfo['queryPageSize'] @@ -480,12 +538,12 @@ class MpdlXmlTextServer(SimpleItem): selfurl = self.absolute_url() viewMode= pageinfo['viewMode'] characterNormalization = pageinfo ['characterNormalization'] - optionToggle =pageinfo ['optionToggle'] + #optionToggle =pageinfo ['optionToggle'] tocMode = pageinfo['tocMode'] tocPN = pageinfo['tocPN'] - data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm&optionToggle=1"%(docpath,queryType, pagesize, pn)) - page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s&optionToggle=1'%(selfurl,url, viewMode, tocMode, tocPN)) + data = self.getServerData("doc-query.xql","document=%s&queryType=%s&queryResultPageSize=%s&queryResultPN=%s&characterNormalization=regPlusNorm"%(docpath,queryType, pagesize, pn)) + page = data.replace('page-fragment.xql?document=%s'%str(path),'%s?url=%s&viewMode=%s&tocMode=%s&tocPN=%s'%(selfurl,url, viewMode, tocMode, tocPN)) text = page.replace('mode=image','mode=texttool') return text