--- documentViewer/MpdlXmlTextServer.py 2011/06/14 09:57:11 1.238
+++ documentViewer/MpdlXmlTextServer.py 2011/07/14 17:43:56 1.238.2.1
@@ -3,13 +3,74 @@
 from OFS.SimpleItem import SimpleItem
 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
 from Ft.Xml import EMPTY_NAMESPACE, Parse
 from Ft.Xml.Domlette import NonvalidatingReader
+import Ft.Xml.Domlette
+import cStringIO
+
+import xml.etree.ElementTree as ET
 import md5
 import sys
 import logging
 import urllib
 import documentViewer
-from documentViewer import getTextFromNode, serializeNode
+#from documentViewer import getTextFromNode, serializeNode
+
+def getText(node):
+    """Return the text of an ElementTree node and all of its descendants ("" for None)."""
+    if node is None:
+        return ""
+    # ET:
+    text = node.text or ""
+    for e in node:
+        text += getText(e)
+        if e.tail:
+            text += e.tail
+
+    return text
+
+def serialize(node):
+    """Return an ElementTree node as a UTF-8 XML snippet without the XML declaration."""
+    s = ET.tostring(node, 'UTF-8')
+    # snip off XML declaration
+    if s.startswith('<?xml'):
+        i = s.find('?>')
+        return s[i+3:]
+
+    return s
+
+
+def getTextFromNode(node):
+    """Return the text of the direct TEXT_NODE children of a 4Suite DOM node ("" for None)."""
+    if node is None:
+        return ""
+    # ET:
+    #text = node.text or ""
+    #for e in node:
+    #    text += getText(e)
+    #    if e.tail:
+    #        text += e.tail
+
+    # 4Suite:
+    nodelist=node.childNodes
+    text = ""
+    for n in nodelist:
+        if n.nodeType == node.TEXT_NODE:
+            text = text + n.data
+
+    return text
+
+def serializeNode(node, encoding="utf-8"):
+    """Return a 4Suite DOM node printed as an XML string in the given encoding."""
+    #s = ET.tostring(node)
+
+    # 4Suite:
+    stream = cStringIO.StringIO()
+    Ft.Xml.Domlette.Print(node, stream=stream, encoding=encoding)
+    s = stream.getvalue()
+    stream.close()
+
+    return s
+
+
 class MpdlXmlTextServer(SimpleItem):
     """TextServer implementation for MPDL-XML eXist server"""
@@ -203,11 +264,11 @@ class MpdlXmlTextServer(SimpleItem):
             textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn)
         pagexml = self.getServerData("page-fragment.xql",textParam)
-        dom = Parse(pagexml)
+        dom = ET.fromstring(pagexml)
         #dom = NonvalidatingReader.parseStream(pagexml)
         #original Pages
-        pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
+        #pagedivs = dom.xpath("//div[@class='pageNumberOrig']")
         """if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"):
             if len(pagedivs)>0:
@@ -222,160 +283,150 @@ class MpdlXmlTextServer(SimpleItem):
                 logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm']))
         """
         #figureEntries
-        pagedivs = dom.xpath("//div[@class='countFigureEntries']")
-        if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
-            if len(pagedivs)>0:
-                docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
-                s = getTextFromNode(pagedivs[0])
-                if s=='0':
-                    try:
-                        docinfo['countFigureEntries'] = int(s)
-                    except:
-                        docinfo['countFigureEntries'] = 0
-                else:
-                    s1 = int(s)/30+1
-                    try:
-                        docinfo['countFigureEntries'] = int(s1)
-                    except:
-                        docinfo['countFigureEntries'] = 0
-
-        #allPlaces
-        pagedivs = dom.xpath("//div[@class='countPlaces']")
-        if pagedivs == dom.xpath("//div[@class='countPlaces']"):
-            if len(pagedivs)>0:
-                docinfo['countPlaces']= getTextFromNode(pagedivs[0])
-                s = getTextFromNode(pagedivs[0])
-                try:
-                    docinfo['countPlaces'] = int(s)
-                except:
-                    docinfo['countPlaces'] = 0
-
-        #tocEntries
-        pagedivs = dom.xpath("//div[@class='countTocEntries']")
-        if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
-            if len(pagedivs)>0:
-                docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
-                s = getTextFromNode(pagedivs[0])
-                if s=='0':
-                    try:
-                        docinfo['countTocEntries'] = int(s)
-                    except:
-                        docinfo['countTocEntries'] = 0
-                else:
-                    s1 = int(s)/30+1
-                    try:
-                        docinfo['countTocEntries'] = int(s1)
-                    except:
-                        docinfo['countTocEntries'] = 0
+#        pagedivs = dom.xpath("//div[@class='countFigureEntries']")
+#        if pagedivs == dom.xpath("//div[@class='countFigureEntries']"):
+#            if len(pagedivs)>0:
+#                docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0])
+#                s = getTextFromNode(pagedivs[0])
+#                if s=='0':
+#                    try:
+#                        docinfo['countFigureEntries'] = int(s)
+#                    except:
+#                        docinfo['countFigureEntries'] = 0
+#                else:
+#                    s1 = int(s)/30+1
+#                    try:
+#                        docinfo['countFigureEntries'] = int(s1)
+#                    except:
+#                        docinfo['countFigureEntries'] = 0
+#
+#        #allPlaces
+#        pagedivs = dom.xpath("//div[@class='countPlaces']")
+#        if pagedivs == dom.xpath("//div[@class='countPlaces']"):
+#            if len(pagedivs)>0:
+#                docinfo['countPlaces']= getTextFromNode(pagedivs[0])
+#                s = getTextFromNode(pagedivs[0])
+#                try:
+#                    docinfo['countPlaces'] = int(s)
+#                except:
+#                    docinfo['countPlaces'] = 0
+#
+#        #tocEntries
+#        pagedivs = dom.xpath("//div[@class='countTocEntries']")
+#        if pagedivs == dom.xpath("//div[@class='countTocEntries']"):
+#            if len(pagedivs)>0:
+#                docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0]))
+#                s = getTextFromNode(pagedivs[0])
+#                if s=='0':
+#                    try:
+#                        docinfo['countTocEntries'] = int(s)
+#                    except:
+#                        docinfo['countTocEntries'] = 0
+#                else:
+#                    s1 = int(s)/30+1
+#                    try:
+#                        docinfo['countTocEntries'] = int(s1)
+#                    except:
+#                        docinfo['countTocEntries'] = 0
         #numTextPages
-        pagedivs = dom.xpath("//div[@class='countPages']")
-        if pagedivs == dom.xpath("//div[@class='countPages']"):
-            if len(pagedivs)>0:
-                docinfo['numPages'] = getTextFromNode(pagedivs[0])
-                s = getTextFromNode(pagedivs[0])
+        #pagedivs = dom.xpath("//div[@class='countPages']")
+        alldivs = dom.findall(".//div")
+        pagediv = None
+        for div in alldivs:
+            dc = div.get('class')
+            if dc == 'pageContent':
+                pagediv = div
+
+            if dc == 'countPages':
                 try:
-                    docinfo['numPages'] = int(s)
-                    #logging.debug("PAGE NUMBER: %s"%(s))
-
-                    np = docinfo['numPages']
+                    np = int(div.text)
+                    docinfo['numPages'] = np
                     pageinfo['end'] = min(pageinfo['end'], np)
                     pageinfo['numgroups'] = int(np / pageinfo['groupsize'])
                     if np % pageinfo['groupsize'] > 0:
-                        pageinfo['numgroups'] += 1
+                        pageinfo['numgroups'] += 1
+
                 except:
                     docinfo['numPages'] = 0
+
+                break
-        else:
-            #no full text -- init to 0
-            docinfo['pageNumberOrig'] = 0
-            docinfo['countFigureEntries'] = 0
-            docinfo['countPlaces'] = 0
-            docinfo['countTocEntries'] = 0
-            docinfo['numPages'] = 0
-            docinfo['pageNumberOrigNorm'] = 0
-            #return docinfo
+# ROC: why?
+#        else:
+#            #no full text -- init to 0
+#            docinfo['pageNumberOrig'] = 0
+#            docinfo['countFigureEntries'] = 0
+#            docinfo['countPlaces'] = 0
+#            docinfo['countTocEntries'] = 0
+#            docinfo['numPages'] = 0
+#            docinfo['pageNumberOrigNorm'] = 0
+#            #return docinfo
         # plain text mode
         if mode == "text":
-            # first div contains text
-            pagedivs = dom.xpath("/div")
-            if len(pagedivs) > 0:
-                pagenode = pagedivs[0]
-                links = pagenode.xpath("//a")
+            #pagedivs = dom.xpath("/div")
+            if pagediv is not None:  # an Element without child elements is falsy
+                links = pagediv.findall(".//a")
                 for l in links:
-                    hrefNode = l.getAttributeNodeNS(None, u"href")
-                    if hrefNode:
-                        href= hrefNode.nodeValue
-                        if href.startswith('#note-'):
-                            hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
-                return serializeNode(pagenode)
+                    href = l.get('href')
+                    if href and href.startswith('#note-'):
+                        href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
+                        l.set('href', href)
+                logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8'))
+                return serialize(pagediv)
+
         if mode == "xml":
-            # first div contains text
-            pagedivs = dom.xpath("/div")
-            if len(pagedivs) > 0:
-                pagenode = pagedivs[0]
-                return serializeNode(pagenode)
+            if pagediv is not None:
+                return serialize(pagediv)
+
+        if mode == "pureXml":
+            if pagediv is not None:
+                return serialize(pagediv)
+
         if mode == "gis":
-            # first div contains text
-            pagedivs = dom.xpath("/div")
-            if len(pagedivs) > 0:
-                pagenode = pagedivs[0]
-                links =pagenode.xpath("//a")
-                for l in links:
-                    hrefNode =l.getAttributeNodeNS(None, u"href")
-                    if hrefNode:
-                        href=hrefNode.nodeValue
-                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
-                            hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)
-                            l.setAttributeNS(None, 'target', '_blank')
-                return serializeNode(pagenode)
+            if pagediv is not None:
+                # check all a-tags
+                links = pagediv.findall(".//a")
+                for l in links:
+                    href = l.get('href')
+                    if href:
+                        if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'):
+                            l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name))
+                            l.set('target', '_blank')
+
+                return serialize(pagediv)
-        if mode == "pureXml":
-            # first div contains text
-            pagedivs = dom.xpath("/div")
-            if len(pagedivs) > 0:
-                pagenode = pagedivs[0]
-                return serializeNode(pagenode)
         # text-with-links mode
         if mode == "text_dict":
-            # first div contains text
-            #mode = pageinfo ['viewMode']
-            pagedivs = dom.xpath("/div")
-            if len(pagedivs) > 0:
-                pagenode = pagedivs[0]
+            if pagediv is not None:
                 # check all a-tags
-                links = pagenode.xpath("//a")
-
+                links = pagediv.findall(".//a")
                 for l in links:
-                    hrefNode = l.getAttributeNodeNS(None, u"href")
+                    href = l.get('href')
-                    if hrefNode:
+                    if href:
                         # is link with href
-                        href = hrefNode.nodeValue
                         if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'):
                             # is pollux link
                             selfurl = self.absolute_url()
                             # change href
-                            hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl)
+                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl))
                             # add target
-                            l.setAttributeNS(None, 'target', '_blank')
-                            #l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
-                            #l.setAttributeNS(None, "ondblclick", "popupWin.focus();")
-                            #window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;")
+                            l.set('target', '_blank')
                         if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'):
                             selfurl = self.absolute_url()
-                            hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)
-                            l.setAttributeNS(None, 'target', '_blank')
-                            l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
-                            l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();')
+                            l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl))
+                            l.set('target', '_blank')
+                            l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;")
+                            l.set('ondblclick', 'popupWin.focus();')
                         if href.startswith('#note-'):
-                            hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))
+                            l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)))
-                return serializeNode(pagenode)
+                return serialize(pagediv)
+
         return "no text here"
     def getOrigPages(self, docinfo=None, pageinfo=None):