--- ECHO_content/vlp_xmlhelpers.py 2004/10/06 13:02:56 1.4 +++ ECHO_content/vlp_xmlhelpers.py 2008/08/05 16:17:46 1.14 @@ -1,6 +1,7 @@ from sys import argv import string +import logging import xml.dom.minidom import Ft.Xml.XLink.Processor import Ft.Xml.XLink.XLinkElements @@ -9,153 +10,156 @@ from Ft.Xml import XPath from Ft.Xml.XPath import Evaluate from Ft.Xml.XLink import XLINK_NAMESPACE from Ft.Xml.XLink import XLinkElements - -#from Ft.Xml.Domlette import NonvalidatingReader,InputSource -#from Ft.Xml import EMPTY_NAMESPACE +import cStringIO +from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print +from Ft.Xml import EMPTY_NAMESPACE from Ft.Lib import Uri import urllib import re +from ECHO_collection import unicodify,utf8ify +patternTXT=r"<\s*txt.*?>(.*?)" +regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL) patternPage=r"<\s*page.*?>(.*?)" regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL) -xml2htmlArray={'WEB_normal':('

','

'),'Normal':('

','

'),'WEB_picture':('

','

'),'WEB_figuretitle':('

','

'),'WEB_bibliography':('

','

'),'Web_kursiv':('',''),'WEB_kursiv':('',''),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('

','

'),'FigureTitle':('

','

')} - -def addToDict(dict,name,value): - if name=="": - return 0 - else: - - if not dict.has_key(name): - dict[name]=[] # als array anlegen - - dict[name].append(value) - return 1 - -def proj2hash(self,xmlstring): - """wandelt xml-files fuer die projekte in ein hash""" - - dom=xml.dom.minidom.parseString(xmlstring) - - - list={} - - #gettitle - pars=Evaluate('par',dom.getElementsByTagName('part')[0]) - for par in pars: - className=par.getAttribute('class') - content=getText(self,par.childNodes) - addToDict(list,className,content) - - - sectionXPath="section" - - - sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) - - while sections: - - for section in sections: - - sec=parseSection(self,section) - - if sec[0]=="WEB_project_header": # Sonderfall project - addToDict(list,'WEB_project_header',sec[1]) # store title - addToDict(list,'WEB_project_description',sec[2]) #store description - else: # no information in heading - level=int(sec[3])+2 - aTag=""%level - eTag=""%level - addToDict(list,"text",aTag+sec[1]+eTag) - addToDict(list,"text",sec[2]) - sectionXPath+="/section" - sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) - return list - - -def parseSection(self,section): - type="" - header="" - level=section.getAttribute('level') - for heading in section.childNodes: - if getattr(heading,'tagName','')=="heading": - - type=heading.getAttribute('class') - header=getText(self,heading.childNodes) - - if type=="": # falls heading fehlt, pruefe ob erster par richtig - par=section.getElementsByTagName('par')[0] - type=par.getAttribute('class') - header=getText(par.childNodes) - - #print section.childNodes - #pars=Evaluate('par',section) - pars=section.childNodes - content=par2html(self,pars) - #print "CONTENT",repr(content) - return (type,header,content,level) - -def parseTable(table): - fields={} - rows=table.getElementsByTagName('html:tr') - for row in rows: - #print "ROW" - cols=row.getElementsByTagName('html:td') - - #Name des Datenfeldes einlesen - try: - field=cols[0].getElementsByTagName('par')[0].getAttribute('class') - #print "field",field - except: - print "error" - field="" - - #Wandeln der Eintrge in HTML - - #pars=cols[1].getElementsByTagName('par') - pars=cols[1].childNodes - - html=par2html(self,pars,tags=("",";")) - - addToDict(fields,field,html) - #print fields - return fields - -def par2html(self,pars,tags=None): - html="" - - for par in pars: - tagName=getattr(par,'tagName','') - if tagName in ["par","inline"]: - #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND') - #print "par",par - if not tags: - try: - tag=xml2htmlArray[par.getAttribute('class')] - except: - tag=('

','

') - else: - tag=tags - #print "TAG",tag - content=getText(self,par.childNodes,par.getAttribute('class')) - - - - #print par.getAttribute('class'),node - try: - html+=tag[0]+content+tag[1] - except: - html=+tag[0]+content+tag[1] - - elif tagName=="pb": - html+="" - - - try: - - return html - except: - return "" +#xml2htmlArray={'WEB_normal':('

','

'),'Normal':('

','

'),'WEB_picture':('

','

'),'WEB_figuretitle':('

','

'),'WEB_bibliography':('

','

'),'Web_kursiv':('',''),'WEB_kursiv':('',''),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('

','

'),'FigureTitle':('

','

')} +# +#def addToDict(dict,name,value): +# if name=="": +# return 0 +# else: +# +# if not dict.has_key(name): +# dict[name]=[] # als array anlegen +# +# dict[name].append(value) +# return 1 +# +#def proj2hash(self,xmlstring): +# """wandelt xml-files fuer die projekte in ein hash""" +# +# dom=xml.dom.minidom.parseString(xmlstring) +# +# +# list={} +# +# #gettitle +# pars=Evaluate('par',dom.getElementsByTagName('part')[0]) +# for par in pars: +# className=par.getAttribute('class') +# content=getText(self,par.childNodes) +# addToDict(list,className,content) +# +# +# sectionXPath="section" +# +# +# sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) +# +# while sections: +# +# for section in sections: +# +# sec=parseSection(self,section) +# +# if sec[0]=="WEB_project_header": # Sonderfall project +# addToDict(list,'WEB_project_header',sec[1]) # store title +# addToDict(list,'WEB_project_description',sec[2]) #store description +# else: # no information in heading +# level=int(sec[3])+2 +# aTag=""%level +# eTag=""%level +# addToDict(list,"text",aTag+sec[1]+eTag) +# addToDict(list,"text",sec[2]) +# sectionXPath+="/section" +# sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0]) +# return list +# +# +#def parseSection(self,section): +# type="" +# header="" +# level=section.getAttribute('level') +# for heading in section.childNodes: +# if getattr(heading,'tagName','')=="heading": +# +# type=heading.getAttribute('class') +# header=getText(self,heading.childNodes) +# +# if type=="": # falls heading fehlt, pruefe ob erster par richtig +# par=section.getElementsByTagName('par')[0] +# type=par.getAttribute('class') +# header=getText(par.childNodes) +# +# #print section.childNodes +# #pars=Evaluate('par',section) +# pars=section.childNodes +# content=par2html(self,pars) +# #print "CONTENT",repr(content) +# return (type,header,content,level) +# +#def parseTable(table): +# fields={} +# rows=table.getElementsByTagName('html:tr') +# for row in rows: +# #print "ROW" +# cols=row.getElementsByTagName('html:td') +# +# #Name des Datenfeldes einlesen +# try: +# field=cols[0].getElementsByTagName('par')[0].getAttribute('class') +# #print "field",field +# except: +# print "error" +# field="" +# +# #Wandeln der Eintrge in HTML +# +# #pars=cols[1].getElementsByTagName('par') +# pars=cols[1].childNodes +# +# html=par2html(self,pars,tags=("",";")) +# +# addToDict(fields,field,html) +# #print fields +# return fields +# +#def par2html(self,pars,tags=None): +# html="" +# +# for par in pars: +# tagName=getattr(par,'tagName','') +# if tagName in ["par","inline"]: +# #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND') +# #print "par",par +# if not tags: +# try: +# tag=xml2htmlArray[par.getAttribute('class')] +# except: +# tag=('

','

') +# else: +# tag=tags +# #print "TAG",tag +# content=getText(self,par.childNodes,par.getAttribute('class')) +# +# +# +# #print par.getAttribute('class'),node +# try: +# html+=tag[0]+content+tag[1] +# except: +# html=+tag[0]+content+tag[1] +# +# elif tagName=="pb": +# html+="" +# +# +# try: +# +# return html +# except: +# return "" def getXlink(nodes): """searches xlinks and gives them back as html""" @@ -167,52 +171,156 @@ def getXlink(nodes): return ret def checkRef(self,ref): - dbs={'vl_literature':'AND CD LIKE \'%lise%\'','vl_technology':'','vl_people':'','vl_sites':''} + """teste ob reference angezeigt werden sollen""" + dbs={'vl_literature':'AND online = \'1\'', + 'vl_technology':'AND complete =\'yes\'', + 'vl_people':'AND complete =\'yes\'', + 'vl_sites':'AND complete =\'yes\'', + 'vl_transcript':'AND complete =\'yes\'', + 'vl_essays':'AND online =\'yes\'', + 'vl_categories':'' + } res=None for db in dbs.keys(): - - res=res or self.search(var=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))) + searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db])) + res=res or self.search(var=searchStr) return res - -def xml2html(self,str,quote="yes"): - """link2html fuer VLP muss hier noch raus""" - - + +def link2html(self,str): + """link2html links in html wandeln""" if str: - if quote=="yes2": - str=re.sub("\&","&",str) - - str=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',str)# einfuegen anfuehrungszeichen um ref attribut, falls fehlt. - #str=re.sub("ref\=([.[*^[>]]])",'XX',str) - #print "STR::",str - dom=xml.dom.minidom.parseString(str) + + str=re.sub("\&","&",str) + dom=xml.dom.minidom.parseString(""+utf8ify(str)+"") links=dom.getElementsByTagName("link") + for link in links: link.tagName="a" ref=link.getAttribute("ref") - pn=link.getAttribute("page") + pn=link.getAttribute("page") + mk=link.getAttribute("mk") + + if self.checkRef(ref): + more = "" + if pn: + more += "&page=%s"%pn + + if mk: + more += "&mk=%s"%mk + + link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+more) + + newxml=dom.toxml('utf-8') + + retStr=regexpTXT.search(newxml) + retStr = retStr.group(1) + + return retStr.decode('utf-8') # we return unicode + + return u"" + +def related2html(self,str): + """related library items: xlinks in html wandeln / mb 22.11.2006""" + if str: + + str=re.sub("\&","&",str) + dom=xml.dom.minidom.parseString(""+utf8ify(str)+"") + links=dom.getElementsByTagName("link") + + for link in links: + link.tagName = "a" + ref = link.getAttribute("ref") + pn = link.getAttribute("page") + + searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref) + res = self.search(var=searchStr) + + if res: + if res[0]['online'] == 1: + # item online verfuegbar + if pn: + link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn) + else: + link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref) + + link.setAttribute("title", "click to view") + link.removeAttribute("ref") + + # prefix preceding the link + prefix = dom.createTextNode(U"\u2013\u0020") # = ndash + space + dom.documentElement.insertBefore(prefix, link) + + else: + # item nur als bibliographische angabe vorhanden + link.setAttribute("alt", unicodify(res[0]['fullreference'])) + link.setAttribute("title", "click to expand") + link.setAttribute("onclick", "return toggle(this);") + link.setAttribute("class", "x_offline") + + # prefix inside link text + link.firstChild.data = '+ ' + link.firstChild.data + + + newxml=dom.toxml('utf-8') + + retStr=regexpTXT.search(newxml) + retStr = retStr.group(1) + #logging.debug("related2html out=%s"%repr(retStr)) + return retStr.decode('utf-8') # we return unicode + + return u"" + + - if checkRef(self,ref): - if pn: - link.setAttribute("href",self.aq_parent.absolute_url()+"/vlp_coll?id="+ref+"&p="+pn) - else: - link.setAttribute("href",self.aq_parent.absolute_url()+"/vlp_coll?id="+ref) - - str= dom.toxml() - - #print link.toxml('utf-8') - retStr=regexpPage.search(str) - try: - return retStr.group(1) +def xml2html(self,str,quote="yes"): + """link2html fuer VLP muss hier noch raus""" + if str: + if quote=="yes2": + str=re.sub("\&","&",str) + #dom=xml.dom.minidom.parseString(str) + dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/") + #links=dom.getElementsByTagName("link") + links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom) + for link in links: + #link.tagName="a" + + ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref") + pn=link.getAttributeNS(EMPTY_NAMESPACE,"page") + + cns=link.childNodes[0:] + + newLink=dom.createElementNS(EMPTY_NAMESPACE,"a") + for x in cns: + newLink.appendChild(x) + + + + link.parentNode.replaceChild(newLink,link) + + if self.checkRef(ref): + if pn: + newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn) + else: + newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref) + + #str= dom.toxml('utf-8') + buf = cStringIO.StringIO() + PrettyPrint(dom, stream=buf) + str = buf.getvalue() + buf.close() + #str=PrettyPrint(dom.documentElement,encoding='UTF-8') + #print link.toxml('utf-8') + #print type(str) + retStr=regexpPage.search(str) + + try: # hack warum fehtl manchmal page?? + return retStr.group(1).decode('utf-8') except: - exStr="""""" - str=re.sub("\n","",str) - #str= - #print repr(str) - return str.replace(exStr,'') + return str return "" + def xlink2html(self,xlink,parClass=None): ret=""