version 1.4, 2004/10/06 13:02:56
|
version 1.15, 2008/09/08 11:12:41
|
Line 1
|
Line 1
|
from sys import argv |
from sys import argv |
|
|
import string |
import string |
|
import logging |
import xml.dom.minidom |
import xml.dom.minidom |
import Ft.Xml.XLink.Processor |
import Ft.Xml.XLink.Processor |
import Ft.Xml.XLink.XLinkElements |
import Ft.Xml.XLink.XLinkElements |
Line 9 from Ft.Xml import XPath
|
Line 10 from Ft.Xml import XPath
|
from Ft.Xml.XPath import Evaluate |
from Ft.Xml.XPath import Evaluate |
from Ft.Xml.XLink import XLINK_NAMESPACE |
from Ft.Xml.XLink import XLINK_NAMESPACE |
from Ft.Xml.XLink import XLinkElements |
from Ft.Xml.XLink import XLinkElements |
|
import cStringIO |
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource |
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print |
#from Ft.Xml import EMPTY_NAMESPACE |
from Ft.Xml import EMPTY_NAMESPACE |
from Ft.Lib import Uri |
from Ft.Lib import Uri |
import urllib |
import urllib |
import re |
import re |
|
from ECHO_collection import unicodify,utf8ify |
|
|
|
patternTXT=r"<\s*txt.*?>(.*?)</txt>" |
|
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL) |
patternPage=r"<\s*page.*?>(.*?)</page>" |
patternPage=r"<\s*page.*?>(.*?)</page>" |
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL) |
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL) |
|
|
xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')} |
#xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')} |
|
# |
|
#def addToDict(dict,name,value): |
|
# if name=="": |
|
# return 0 |
|
# else: |
|
# |
|
# if not dict.has_key(name): |
|
# dict[name]=[] # als array anlegen |
|
# |
|
# dict[name].append(value) |
|
# return 1 |
|
# |
|
#def proj2hash(self,xmlstring): |
|
# """wandelt xml-files fuer die projekte in ein hash""" |
|
# |
|
# dom=xml.dom.minidom.parseString(xmlstring) |
|
# |
|
# |
|
# list={} |
|
# |
|
# #gettitle |
|
# pars=Evaluate('par',dom.getElementsByTagName('part')[0]) |
|
# for par in pars: |
|
# className=par.getAttribute('class') |
|
# content=getText(self,par.childNodes) |
|
# addToDict(list,className,content) |
|
# |
|
# |
|
# sectionXPath="section" |
|
# |
|
# |
|
# sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) |
|
# |
|
# while sections: |
|
# |
|
# for section in sections: |
|
# |
|
# sec=parseSection(self,section) |
|
# |
|
# if sec[0]=="WEB_project_header": # Sonderfall project |
|
# addToDict(list,'WEB_project_header',sec[1]) # store title |
|
# addToDict(list,'WEB_project_description',sec[2]) #store description |
|
# else: # no information in heading |
|
# level=int(sec[3])+2 |
|
# aTag="<h%i>"%level |
|
# eTag="</h%i>"%level |
|
# addToDict(list,"text",aTag+sec[1]+eTag) |
|
# addToDict(list,"text",sec[2]) |
|
# sectionXPath+="/section" |
|
# sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) |
|
# return list |
|
# |
|
# |
|
#def parseSection(self,section): |
|
# type="" |
|
# header="" |
|
# level=section.getAttribute('level') |
|
# for heading in section.childNodes: |
|
# if getattr(heading,'tagName','')=="heading": |
|
# |
|
# type=heading.getAttribute('class') |
|
# header=getText(self,heading.childNodes) |
|
# |
|
# if type=="": # falls heading fehlt, pruefe ob erster par richtig |
|
# par=section.getElementsByTagName('par')[0] |
|
# type=par.getAttribute('class') |
|
# header=getText(par.childNodes) |
|
# |
|
# #print section.childNodes |
|
# #pars=Evaluate('par',section) |
|
# pars=section.childNodes |
|
# content=par2html(self,pars) |
|
# #print "CONTENT",repr(content) |
|
# return (type,header,content,level) |
|
# |
|
#def parseTable(table): |
|
# fields={} |
|
# rows=table.getElementsByTagName('html:tr') |
|
# for row in rows: |
|
# #print "ROW" |
|
# cols=row.getElementsByTagName('html:td') |
|
# |
|
# #Name des Datenfeldes einlesen |
|
# try: |
|
# field=cols[0].getElementsByTagName('par')[0].getAttribute('class') |
|
# #print "field",field |
|
# except: |
|
# print "error" |
|
# field="" |
|
# |
|
# #Wandeln der Eintrge in HTML |
|
# |
|
# #pars=cols[1].getElementsByTagName('par') |
|
# pars=cols[1].childNodes |
|
# |
|
# html=par2html(self,pars,tags=("",";")) |
|
# |
|
# addToDict(fields,field,html) |
|
# #print fields |
|
# return fields |
|
# |
|
#def par2html(self,pars,tags=None): |
|
# html="" |
|
# |
|
# for par in pars: |
|
# tagName=getattr(par,'tagName','') |
|
# if tagName in ["par","inline"]: |
|
# #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND') |
|
# #print "par",par |
|
# if not tags: |
|
# try: |
|
# tag=xml2htmlArray[par.getAttribute('class')] |
|
# except: |
|
# tag=('<p>','</p>') |
|
# else: |
|
# tag=tags |
|
# #print "TAG",tag |
|
# content=getText(self,par.childNodes,par.getAttribute('class')) |
|
# |
|
# |
|
# |
|
# #print par.getAttribute('class'),node |
|
# try: |
|
# html+=tag[0]+content+tag[1] |
|
# except: |
|
# html=+tag[0]+content+tag[1] |
|
# |
|
# elif tagName=="pb": |
|
# html+="<pb/>" |
|
# |
|
# |
|
# try: |
|
# |
|
# return html |
|
# except: |
|
# return "" |
|
|
def addToDict(dict,name,value): |
def getXlink(nodes): |
if name=="": |
"""searches xlinks and gives them back as html""" |
return 0 |
ret="" |
else: |
for node in nodes: |
|
if node.attributes: |
|
if 'xlink:type' in node.attributes.keys(): #is a xlink? |
|
ret +=xlink2html(node) |
|
return ret |
|
|
if not dict.has_key(name): |
def checkRef(self,ref): |
dict[name]=[] # als array anlegen |
"""teste ob reference angezeigt werden sollen""" |
|
dbs={'vl_literature':'AND online = \'1\'', |
|
'vl_technology':'AND complete =\'yes\'', |
|
'vl_people':'AND complete =\'yes\'', |
|
'vl_sites':'AND complete =\'yes\'', |
|
'vl_transcript':'AND complete =\'yes\'', |
|
'vl_essays':'AND online =\'yes\'', |
|
'vl_categories':'' |
|
} |
|
res=None |
|
for db in dbs.keys(): |
|
searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db])) |
|
res=res or self.search(var=searchStr) |
|
return res |
|
|
dict[name].append(value) |
def link2html(self,str): |
return 1 |
"""link2html links in html wandeln""" |
|
if str: |
|
|
def proj2hash(self,xmlstring): |
str=re.sub("\&","&",str) |
"""wandelt xml-files fuer die projekte in ein hash""" |
dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>") |
|
|
dom=xml.dom.minidom.parseString(xmlstring) |
|
|
|
|
links=dom.getElementsByTagName("link") |
|
|
list={} |
|
|
|
#gettitle |
for link in links: |
pars=Evaluate('par',dom.getElementsByTagName('part')[0]) |
link.tagName="a" |
for par in pars: |
ref=link.getAttribute("ref") |
className=par.getAttribute('class') |
pn=link.getAttribute("page") |
content=getText(self,par.childNodes) |
mk=link.getAttribute("mk") |
addToDict(list,className,content) |
href= link.getAttribute("href") |
|
if href: |
|
link.setAttribute("class","external") |
|
|
|
if self.checkRef(ref): |
|
more = "" |
|
if pn: |
|
more += "&page=%s"%pn |
|
|
sectionXPath="section" |
if mk: |
|
more += "&mk=%s"%mk |
|
|
|
link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+more) |
|
|
sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) |
|
|
|
while sections: |
newxml=dom.toxml('utf-8') |
|
|
for section in sections: |
|
|
|
sec=parseSection(self,section) |
|
|
|
if sec[0]=="WEB_project_header": # Sonderfall project |
retStr=regexpTXT.search(newxml) |
addToDict(list,'WEB_project_header',sec[1]) # store title |
retStr = retStr.group(1) |
addToDict(list,'WEB_project_description',sec[2]) #store description |
|
else: # no information in heading |
|
level=int(sec[3])+2 |
|
aTag="<h%i>"%level |
|
eTag="</h%i>"%level |
|
addToDict(list,"text",aTag+sec[1]+eTag) |
|
addToDict(list,"text",sec[2]) |
|
sectionXPath+="/section" |
|
sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) |
|
return list |
|
|
|
|
return retStr.decode('utf-8') # we return unicode |
|
|
def parseSection(self,section): |
return u"" |
type="" |
|
header="" |
|
level=section.getAttribute('level') |
|
for heading in section.childNodes: |
|
if getattr(heading,'tagName','')=="heading": |
|
|
|
type=heading.getAttribute('class') |
def related2html(self,str): |
header=getText(self,heading.childNodes) |
"""related library items: xlinks in html wandeln / mb 22.11.2006""" |
|
if str: |
|
|
if type=="": # falls heading fehlt, pruefe ob erster par richtig |
str=re.sub("\&","&",str) |
par=section.getElementsByTagName('par')[0] |
dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>") |
type=par.getAttribute('class') |
links=dom.getElementsByTagName("link") |
header=getText(par.childNodes) |
|
|
|
#print section.childNodes |
for link in links: |
#pars=Evaluate('par',section) |
link.tagName = "a" |
pars=section.childNodes |
ref = link.getAttribute("ref") |
content=par2html(self,pars) |
pn = link.getAttribute("page") |
#print "CONTENT",repr(content) |
|
return (type,header,content,level) |
|
|
|
def parseTable(table): |
searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref) |
fields={} |
res = self.search(var=searchStr) |
rows=table.getElementsByTagName('html:tr') |
|
for row in rows: |
|
#print "ROW" |
|
cols=row.getElementsByTagName('html:td') |
|
|
|
#Name des Datenfeldes einlesen |
if res: |
try: |
if res[0]['online'] == 1: |
field=cols[0].getElementsByTagName('par')[0].getAttribute('class') |
# item online verfuegbar |
#print "field",field |
if pn: |
except: |
link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn) |
print "error" |
else: |
field="" |
link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref) |
|
|
#Wandeln der Eintrge in HTML |
link.setAttribute("title", "click to view") |
|
link.removeAttribute("ref") |
|
|
#pars=cols[1].getElementsByTagName('par') |
# prefix preceding the link |
pars=cols[1].childNodes |
prefix = dom.createTextNode(U"\u2013\u0020") # = ndash + space |
|
dom.documentElement.insertBefore(prefix, link) |
|
|
html=par2html(self,pars,tags=("",";")) |
|
|
|
addToDict(fields,field,html) |
|
#print fields |
|
return fields |
|
|
|
def par2html(self,pars,tags=None): |
|
html="" |
|
|
|
for par in pars: |
|
tagName=getattr(par,'tagName','') |
|
if tagName in ["par","inline"]: |
|
#print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND') |
|
#print "par",par |
|
if not tags: |
|
try: |
|
tag=xml2htmlArray[par.getAttribute('class')] |
|
except: |
|
tag=('<p>','</p>') |
|
else: |
else: |
tag=tags |
# item nur als bibliographische angabe vorhanden |
#print "TAG",tag |
link.setAttribute("alt", unicodify(res[0]['fullreference'])) |
content=getText(self,par.childNodes,par.getAttribute('class')) |
link.setAttribute("title", "click to expand") |
|
link.setAttribute("onclick", "return toggle(this);") |
|
link.setAttribute("class", "x_offline") |
|
|
|
# prefix inside link text |
|
link.firstChild.data = '+ ' + link.firstChild.data |
|
|
#print par.getAttribute('class'),node |
|
try: |
|
html+=tag[0]+content+tag[1] |
|
except: |
|
html=+tag[0]+content+tag[1] |
|
|
|
elif tagName=="pb": |
|
html+="<pb/>" |
|
|
|
|
newxml=dom.toxml('utf-8') |
|
|
try: |
retStr=regexpTXT.search(newxml) |
|
retStr = retStr.group(1) |
|
#logging.debug("related2html out=%s"%repr(retStr)) |
|
return retStr.decode('utf-8') # we return unicode |
|
|
return html |
return u"" |
except: |
|
return "" |
|
|
|
def getXlink(nodes): |
|
"""searches xlinks and gives them back as html""" |
|
ret="" |
|
for node in nodes: |
|
if node.attributes: |
|
if 'xlink:type' in node.attributes.keys(): #is a xlink? |
|
ret +=xlink2html(node) |
|
return ret |
|
|
|
def checkRef(self,ref): |
|
dbs={'vl_literature':'AND CD LIKE \'%lise%\'','vl_technology':'','vl_people':'','vl_sites':''} |
|
res=None |
|
for db in dbs.keys(): |
|
|
|
res=res or self.search(var=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))) |
|
return res |
|
|
|
def xml2html(self,str,quote="yes"): |
def xml2html(self,str,quote="yes"): |
"""link2html fuer VLP muss hier noch raus""" |
"""link2html fuer VLP muss hier noch raus""" |
|
|
|
|
if str: |
if str: |
if quote=="yes2": |
if quote=="yes2": |
str=re.sub("\&","&",str) |
str=re.sub("\&","&",str) |
|
#dom=xml.dom.minidom.parseString(str) |
|
dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/") |
|
#links=dom.getElementsByTagName("link") |
|
links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom) |
|
for link in links: |
|
#link.tagName="a" |
|
|
str=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',str)# einfuegen anfuehrungszeichen um ref attribut, falls fehlt. |
ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref") |
#str=re.sub("ref\=([.[*^[>]]])",'XX',str) |
pn=link.getAttributeNS(EMPTY_NAMESPACE,"page") |
#print "STR::",str |
|
dom=xml.dom.minidom.parseString(str) |
cns=link.childNodes[0:] |
links=dom.getElementsByTagName("link") |
|
|
newLink=dom.createElementNS(EMPTY_NAMESPACE,"a") |
|
for x in cns: |
|
newLink.appendChild(x) |
|
|
for link in links: |
|
link.tagName="a" |
|
ref=link.getAttribute("ref") |
|
pn=link.getAttribute("page") |
|
|
|
if checkRef(self,ref): |
|
|
link.parentNode.replaceChild(newLink,link) |
|
|
|
if self.checkRef(ref): |
if pn: |
if pn: |
link.setAttribute("href",self.aq_parent.absolute_url()+"/vlp_coll?id="+ref+"&p="+pn) |
newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn) |
else: |
else: |
link.setAttribute("href",self.aq_parent.absolute_url()+"/vlp_coll?id="+ref) |
newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref) |
|
|
str= dom.toxml() |
|
|
|
|
#str= dom.toxml('utf-8') |
|
buf = cStringIO.StringIO() |
|
PrettyPrint(dom, stream=buf) |
|
str = buf.getvalue() |
|
buf.close() |
|
#str=PrettyPrint(dom.documentElement,encoding='UTF-8') |
#print link.toxml('utf-8') |
#print link.toxml('utf-8') |
|
#print type(str) |
retStr=regexpPage.search(str) |
retStr=regexpPage.search(str) |
|
|
try: |
try: # hack warum fehtl manchmal page?? |
return retStr.group(1) |
return retStr.group(1).decode('utf-8') |
except: |
except: |
exStr="""<?xml version="1.0" ?>""" |
return str |
str=re.sub("\n","",str) |
|
#str= |
|
#print repr(str) |
|
return str.replace(exStr,'') |
|
return "" |
return "" |
|
|
|
|
def xlink2html(self,xlink,parClass=None): |
def xlink2html(self,xlink,parClass=None): |
ret="" |
ret="" |
attributes=xlink.attributes |
attributes=xlink.attributes |