version 1.237, 2011/05/17 13:02:57
|
version 1.238.2.1, 2011/07/14 17:43:56
|
Line 2
|
Line 2
|
from OFS.SimpleItem import SimpleItem |
from OFS.SimpleItem import SimpleItem |
from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
from Ft.Xml import EMPTY_NAMESPACE, Parse |
from Ft.Xml import EMPTY_NAMESPACE, Parse |
|
from Ft.Xml.Domlette import NonvalidatingReader |
|
import Ft.Xml.Domlette |
|
import cStringIO |
|
|
|
import xml.etree.ElementTree as ET |
|
|
import md5 |
import md5 |
import sys |
import sys |
import logging |
import logging |
import urllib |
import urllib |
import documentViewer |
import documentViewer |
from documentViewer import getTextFromNode, serializeNode |
#from documentViewer import getTextFromNode, serializeNode |
|
|
|
def getText(node):
    """Return the concatenated text content of an ElementTree element.

    The result is the element's own leading text followed, in document
    order, by the text of every descendant and the tail text that trails
    each descendant.  The tail of `node` itself is not included.

    node -- an ElementTree element, or None.
    Returns "" when node is None.
    """
    if node is None:
        return ""

    # collect the pieces and join once instead of growing a string
    parts = [node.text or ""]
    for child in node:
        # recurse into the child (the original called the undefined
        # name `gettext`, which raised NameError for nested elements)
        parts.append(getText(child))
        if child.tail:
            parts.append(child.tail)

    return "".join(parts)
|
|
|
def serialize(node):
    """Return node as a UTF-8 encoded XML snippet without XML declaration.

    node -- an ElementTree element.
    Returns the byte string produced by ET.tostring() with a leading
    '<?xml ...?>' declaration (and the whitespace after it) removed.
    """
    s = ET.tostring(node, 'UTF-8')

    # ET.tostring() returns an encoded byte string when an encoding is
    # given, so the markers are byte literals (plain str on Python 2).
    if s.startswith(b'<?xml'):
        end = s.find(b'?>')
        if end >= 0:
            # snip off the declaration; strip whatever whitespace follows
            # instead of assuming exactly one newline character
            return s[end + 2:].lstrip()

    return s
|
|
|
|
|
def getTextFromNode(node):
    """Return the text held directly by a DOM node.

    Only the data of the node's immediate text-node children is
    concatenated; text nested inside child elements is not included.

    node -- a DOM node (4Suite style: childNodes / nodeType / data), or None.
    Returns "" when node is None or has no text children.
    """
    if node is None:
        return ""

    # 4Suite DOM: keep the data of every direct TEXT_NODE child
    pieces = [child.data
              for child in node.childNodes
              if child.nodeType == node.TEXT_NODE]
    return "".join(pieces)
|
|
|
def serializeNode(node, encoding="utf-8"):
    """Return a string containing node printed as XML.

    node     -- the DOM node handed to Ft.Xml.Domlette.Print
    encoding -- output encoding passed through to Print (default "utf-8")
    """
    # 4Suite: print into an in-memory buffer and hand back its contents
    buf = cStringIO.StringIO()
    Ft.Xml.Domlette.Print(node, stream=buf, encoding=encoding)
    xml = buf.getvalue()
    buf.close()
    return xml
|
|
|
|
class MpdlXmlTextServer(SimpleItem): |
class MpdlXmlTextServer(SimpleItem): |
"""TextServer implementation for MPDL-XML eXist server""" |
"""TextServer implementation for MPDL-XML eXist server""" |
Line 202 class MpdlXmlTextServer(SimpleItem):
|
Line 264 class MpdlXmlTextServer(SimpleItem):
|
textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) |
textParam +="&highlightQuery=%s&sn=%s"%(urllib.quote(highlightQuery),sn) |
|
|
pagexml = self.getServerData("page-fragment.xql",textParam) |
pagexml = self.getServerData("page-fragment.xql",textParam) |
dom = Parse(pagexml) |
dom = ET.fromstring(pagexml) |
|
#dom = NonvalidatingReader.parseStream(pagexml) |
|
|
#original Pages |
#original Pages |
pagedivs = dom.xpath("//div[@class='pageNumberOrig']") |
#pagedivs = dom.xpath("//div[@class='pageNumberOrig']") |
|
|
"""if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): |
"""if pagedivs == dom.xpath("//div[@class='pageNumberOrig']"): |
if len(pagedivs)>0: |
if len(pagedivs)>0: |
Line 221 class MpdlXmlTextServer(SimpleItem):
|
Line 283 class MpdlXmlTextServer(SimpleItem):
|
logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) |
logging.debug("ORIGINAL PAGE NORM: %s"%(docinfo['pageNumberOrigNorm'])) |
""" |
""" |
#figureEntries |
#figureEntries |
pagedivs = dom.xpath("//div[@class='countFigureEntries']") |
# pagedivs = dom.xpath("//div[@class='countFigureEntries']") |
if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): |
# if pagedivs == dom.xpath("//div[@class='countFigureEntries']"): |
if len(pagedivs)>0: |
# if len(pagedivs)>0: |
docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) |
# docinfo['countFigureEntries'] = getTextFromNode(pagedivs[0]) |
s = getTextFromNode(pagedivs[0]) |
# s = getTextFromNode(pagedivs[0]) |
if s=='0': |
# if s=='0': |
try: |
# try: |
docinfo['countFigureEntries'] = int(s) |
# docinfo['countFigureEntries'] = int(s) |
except: |
# except: |
docinfo['countFigureEntries'] = 0 |
# docinfo['countFigureEntries'] = 0 |
else: |
# else: |
s1 = int(s)/30+1 |
# s1 = int(s)/30+1 |
try: |
# try: |
docinfo['countFigureEntries'] = int(s1) |
# docinfo['countFigureEntries'] = int(s1) |
except: |
# except: |
docinfo['countFigureEntries'] = 0 |
# docinfo['countFigureEntries'] = 0 |
|
# |
#allPlaces |
# #allPlaces |
pagedivs = dom.xpath("//div[@class='countPlaces']") |
# pagedivs = dom.xpath("//div[@class='countPlaces']") |
if pagedivs == dom.xpath("//div[@class='countPlaces']"): |
# if pagedivs == dom.xpath("//div[@class='countPlaces']"): |
if len(pagedivs)>0: |
# if len(pagedivs)>0: |
docinfo['countPlaces']= getTextFromNode(pagedivs[0]) |
# docinfo['countPlaces']= getTextFromNode(pagedivs[0]) |
s = getTextFromNode(pagedivs[0]) |
# s = getTextFromNode(pagedivs[0]) |
try: |
# try: |
docinfo['countPlaces'] = int(s) |
# docinfo['countPlaces'] = int(s) |
except: |
# except: |
docinfo['countPlaces'] = 0 |
# docinfo['countPlaces'] = 0 |
|
# |
#tocEntries |
# #tocEntries |
pagedivs = dom.xpath("//div[@class='countTocEntries']") |
# pagedivs = dom.xpath("//div[@class='countTocEntries']") |
if pagedivs == dom.xpath("//div[@class='countTocEntries']"): |
# if pagedivs == dom.xpath("//div[@class='countTocEntries']"): |
if len(pagedivs)>0: |
# if len(pagedivs)>0: |
docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) |
# docinfo['countTocEntries'] = int(getTextFromNode(pagedivs[0])) |
s = getTextFromNode(pagedivs[0]) |
# s = getTextFromNode(pagedivs[0]) |
if s=='0': |
# if s=='0': |
try: |
# try: |
docinfo['countTocEntries'] = int(s) |
# docinfo['countTocEntries'] = int(s) |
except: |
# except: |
docinfo['countTocEntries'] = 0 |
# docinfo['countTocEntries'] = 0 |
else: |
# else: |
s1 = int(s)/30+1 |
# s1 = int(s)/30+1 |
try: |
# try: |
docinfo['countTocEntries'] = int(s1) |
# docinfo['countTocEntries'] = int(s1) |
except: |
# except: |
docinfo['countTocEntries'] = 0 |
# docinfo['countTocEntries'] = 0 |
|
|
#numTextPages |
#numTextPages |
pagedivs = dom.xpath("//div[@class='countPages']") |
#pagedivs = dom.xpath("//div[@class='countPages']") |
if pagedivs == dom.xpath("//div[@class='countPages']"): |
alldivs = dom.findall(".//div") |
if len(pagedivs)>0: |
pagediv = None |
docinfo['numPages'] = getTextFromNode(pagedivs[0]) |
for div in alldivs: |
s = getTextFromNode(pagedivs[0]) |
dc = div.get('class') |
|
if dc == 'pageContent': |
|
pagediv = div |
|
|
|
if dc == 'countPages': |
try: |
try: |
docinfo['numPages'] = int(s) |
np = int(div.text) |
#logging.debug("PAGE NUMBER: %s"%(s)) |
docinfo['numPages'] = np |
|
|
np = docinfo['numPages'] |
|
pageinfo['end'] = min(pageinfo['end'], np) |
pageinfo['end'] = min(pageinfo['end'], np) |
pageinfo['numgroups'] = int(np / pageinfo['groupsize']) |
pageinfo['numgroups'] = int(np / pageinfo['groupsize']) |
if np % pageinfo['groupsize'] > 0: |
if np % pageinfo['groupsize'] > 0: |
pageinfo['numgroups'] += 1 |
pageinfo['numgroups'] += 1 |
|
|
except: |
except: |
docinfo['numPages'] = 0 |
docinfo['numPages'] = 0 |
|
|
else: |
break |
#no full text -- init to 0 |
|
docinfo['pageNumberOrig'] = 0 |
# ROC: why? |
docinfo['countFigureEntries'] = 0 |
# else: |
docinfo['countPlaces'] = 0 |
# #no full text -- init to 0 |
docinfo['countTocEntries'] = 0 |
# docinfo['pageNumberOrig'] = 0 |
docinfo['numPages'] = 0 |
# docinfo['countFigureEntries'] = 0 |
docinfo['pageNumberOrigNorm'] = 0 |
# docinfo['countPlaces'] = 0 |
#return docinfo |
# docinfo['countTocEntries'] = 0 |
|
# docinfo['numPages'] = 0 |
|
# docinfo['pageNumberOrigNorm'] = 0 |
|
# #return docinfo |
|
|
# plain text mode |
# plain text mode |
if mode == "text": |
if mode == "text": |
# first div contains text |
#pagedivs = dom.xpath("/div") |
pagedivs = dom.xpath("/div") |
if pagediv: |
if len(pagedivs) > 0: |
links = pagediv.findall(".//a") |
pagenode = pagedivs[0] |
|
links = pagenode.xpath("//a") |
|
for l in links: |
for l in links: |
hrefNode = l.getAttributeNodeNS(None, u"href") |
href = l.get('href') |
if hrefNode: |
if href and href.startswith('#note-'): |
href= hrefNode.nodeValue |
href = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) |
if href.startswith('#note-'): |
l.set('href', href) |
hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) |
logging.debug("page=%s"%ET.tostring(pagediv, 'UTF-8')) |
return serializeNode(pagenode) |
return serialize(pagediv) |
|
|
if mode == "xml": |
if mode == "xml": |
# first div contains text |
if pagediv: |
pagedivs = dom.xpath("/div") |
return serialize(pagediv) |
if len(pagedivs) > 0: |
|
pagenode = pagedivs[0] |
if mode == "pureXml": |
return serializeNode(pagenode) |
if pagediv: |
|
return serialize(pagediv) |
|
|
if mode == "gis": |
if mode == "gis": |
# first div contains text |
if pagediv: |
pagedivs = dom.xpath("/div") |
# check all a-tags |
if len(pagedivs) > 0: |
links = pagediv.findall(".//a") |
pagenode = pagedivs[0] |
|
links =pagenode.xpath("//a") |
|
for l in links: |
for l in links: |
hrefNode =l.getAttributeNodeNS(None, u"href") |
href = l.get('href') |
if hrefNode: |
if href: |
href=hrefNode.nodeValue |
|
if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): |
if href.startswith('http://chinagis.mpiwg-berlin.mpg.de'): |
hrefNode.nodeValue =href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name) |
l.set('href', href.replace('chinagis_REST/REST/db/chgis/mpdl','chinagis/REST/db/mpdl/%s'%name)) |
l.setAttributeNS(None, 'target', '_blank') |
l.set('target', '_blank') |
return serializeNode(pagenode) |
|
|
return serialize(pagenode) |
|
|
if mode == "pureXml": |
|
# first div contains text |
|
pagedivs = dom.xpath("/div") |
|
if len(pagedivs) > 0: |
|
pagenode = pagedivs[0] |
|
return serializeNode(pagenode) |
|
# text-with-links mode |
# text-with-links mode |
if mode == "text_dict": |
if mode == "text_dict": |
# first div contains text |
if pagediv: |
#mode = pageinfo ['viewMode'] |
|
pagedivs = dom.xpath("/div") |
|
if len(pagedivs) > 0: |
|
pagenode = pagedivs[0] |
|
# check all a-tags |
# check all a-tags |
links = pagenode.xpath("//a") |
links = pagediv.findall(".//a") |
|
|
for l in links: |
for l in links: |
hrefNode = l.getAttributeNodeNS(None, u"href") |
href = l.get('href') |
|
|
if hrefNode: |
if href: |
# is link with href |
# is link with href |
href = hrefNode.nodeValue |
|
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): |
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql'): |
# is pollux link |
# is pollux link |
selfurl = self.absolute_url() |
selfurl = self.absolute_url() |
# change href |
# change href |
hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl) |
l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/interface/lt/wordInfo.xql','%s/head_main_voc'%selfurl)) |
# add target |
# add target |
l.setAttributeNS(None, 'target', '_blank') |
l.set('target', '_blank') |
l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") |
|
l.setAttributeNS(None, "ondblclick", "popupWin.focus();") |
|
#window.open("this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=yes, scrollbars=1'"); return false;") |
|
|
|
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): |
if href.startswith('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql'): |
selfurl = self.absolute_url() |
selfurl = self.absolute_url() |
hrefNode.nodeValue = href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl) |
l.set('href', href.replace('http://mpdl-proto.mpiwg-berlin.mpg.de/mpdl/lt/lemma.xql','%s/head_main_lemma'%selfurl)) |
l.setAttributeNS(None, 'target', '_blank') |
l.set('target', '_blank') |
l.setAttributeNS(None, 'onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") |
l.set('onclick',"popupWin = window.open(this.href, 'InfoWindow', 'menubar=no, location,width=500,height=600,top=180, left=700, toolbar=no, scrollbars=1'); return false;") |
l.setAttributeNS(None, 'ondblclick', 'popupWin.focus();') |
l.set('ondblclick', 'popupWin.focus();') |
|
|
if href.startswith('#note-'): |
if href.startswith('#note-'): |
hrefNode.nodeValue = href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn)) |
l.set('href', href.replace('#note-',"?url=%s&viewMode=text_dict&tocMode=%s&tocPN=%s&pn=%s#note-"%(url,tocMode,tocPN,pn))) |
|
|
|
return serialize(pagediv) |
|
|
return serializeNode(pagenode) |
|
return "no text here" |
return "no text here" |
|
|
def getOrigPages(self, docinfo=None, pageinfo=None): |
def getOrigPages(self, docinfo=None, pageinfo=None): |