ECHO_content/vlp_xmlhelpers.py - view

File: [Repository] / ECHO_content / vlp_xmlhelpers.py
Revision 1.11: download - view: text, annotated - select for diffs - revision graph
Tue Jul 24 09:11:46 2007 UTC (16 years, 11 months ago) by casties
Branches: MAIN
CVS tags: HEAD

Fixed some problems with Unicode handling in Zope 2.10.

# Helpers that turn VLP project / reference XML fragments into HTML snippets.
#
# This is Python 2 code (cStringIO, urllib.unquote, the unicode builtin).
# Most functions take ``self`` as first argument although they are defined
# at module level; link2html, related2html, xml2html and checkRef use
# ``self.search``, ``self.REQUEST`` and ``self.checkRef``, so they are
# presumably bound as methods of a Zope object elsewhere -- TODO confirm.

from sys import argv

import string
import logging
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements

import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint, Print

from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re
from ECHO_collection import unicodify, utf8ify

# Extract the inner markup of the <txt> wrapper / the <page> element from a
# serialized document.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

# Maps the ``class`` attribute of a <par>/<inline> element to the
# (opening, closing) strings wrapped around its text by par2html.
# NOTE(review): every pair is empty here; the original HTML tags may have
# been lost when this file was exported -- verify against the repository.
xml2htmlArray = {
    'WEB_normal': ('', ''),
    'Normal': ('', ''),
    'WEB_picture': ('', ''),
    'WEB_figuretitle': ('', ''),
    'WEB_bibliography': ('', ''),
    'Web_kursiv': ('', ''),
    'WEB_kursiv': ('', ''),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('', ''),
    'FigureTitle': ('', ''),
}

# XML declaration and opening wrapper put around fragments before parsing.
_TXT_PREFIX = "<?xml version='1.0' encoding='utf-8'?><txt>"
_TXT_SUFFIX = "</txt>"

# (table, extra WHERE condition) pairs consulted by checkRef, in order.
_REF_CONDITIONS = (
    ('vl_literature', "AND online = '1'"),
    ('vl_technology', "AND complete ='yes'"),
    ('vl_people', "AND complete ='yes'"),
    ('vl_sites', "AND complete ='yes'"),
    ('vl_transcript', "AND complete ='yes'"),
    ('vl_essays', "AND online ='yes'"),
    ('vl_categories', ''),
)

# URL schemes that xlink2html treats as ready-to-use hrefs.
_URL_SCHEMES = ('http', 'https', 'file')

# Matches ``ref=value>`` only when the value does not already start with a
# quote, so already-quoted attributes are not quoted a second time.
_UNQUOTED_REF = re.compile(r"""ref=(?!["'])([^>]*)>""")


def _toUnicode(value):
    """Return value as unicode; byte strings are decoded as UTF-8 (lossy).

    None becomes the empty unicode string.
    """
    if value is None:
        return u""
    if isinstance(value, unicode):
        return value
    return value.decode('utf-8', 'replace')


def _sqlQuote(value):
    """Double single quotes so value cannot close a '...' SQL literal.

    NOTE(review): backslash handling depends on the database behind
    self.search; switch to bound parameters if that API offers them.
    """
    if isinstance(value, basestring):
        return value.replace("'", "''")
    return value


def _referenceUrl(self, ref, pn):
    """Build the /references URL for ref, adding the page when pn is set."""
    url = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
    if pn:
        url += "&page=" + pn
    return url


def _txtContent(dom):
    """Serialize dom as UTF-8 and return the inside of <txt> as unicode."""
    match = regexpTXT.search(dom.toxml('utf-8'))
    if match is None:
        # e.g. the wrapper was serialized as an empty <txt/> element
        return u""
    return match.group(1).decode('utf-8')


def addToDict(dict, name, value):
    """Append value to the list stored under name in dict.

    The list is created on first use. Returns 0 (and stores nothing) when
    name is the empty string, otherwise 1.
    """
    if name == "":
        return 0
    if name not in dict:
        dict[name] = []  # values are collected in a list
    dict[name].append(value)
    return 1


def proj2hash(self, xmlstring):
    """Convert a project XML document into a dict of lists.

    Each direct <par> child of the first <part> element is stored under its
    ``class`` attribute. Sections are then visited level by level
    ("section", "section/section", ...): a section of class
    "WEB_project_header" fills the 'WEB_project_header' and
    'WEB_project_description' entries, any other section appends an
    <hN> heading (N = level + 2) and its content to the 'text' entry.

    Raises IndexError if the document has no <part> element and ValueError
    if a non-header section has a non-numeric ``level`` attribute.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    result = {}

    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'),
                  getText(self, par.childNodes))

    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)
    while sections:
        for section in sections:
            secType, header, content, level = parseSection(self, section)
            if secType == "WEB_project_header":  # special case: project
                addToDict(result, 'WEB_project_header', header)
                addToDict(result, 'WEB_project_description', content)
            else:  # the heading carries no special information
                hLevel = int(level) + 2
                addToDict(result, "text",
                          "<h%i>" % hLevel + header + "</h%i>" % hLevel)
                addToDict(result, "text", content)
        # descend one nesting level
        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)
    return result


def parseSection(self, section):
    """Return (type, header, content, level) for a <section> element.

    type and header come from the last <heading> child (its ``class``
    attribute and text). If there is no heading with a class, the first
    <par> descendant is used instead; if there is none either, both stay
    empty. content is the HTML of all child nodes and level is the
    section's ``level`` attribute.
    """
    secType = ""
    header = ""
    level = section.getAttribute('level')
    for heading in section.childNodes:
        if getattr(heading, 'tagName', '') == "heading":
            secType = heading.getAttribute('class')
            header = getText(self, heading.childNodes)

    if secType == "":
        # heading missing: fall back to the first par
        pars = section.getElementsByTagName('par')
        if pars:
            secType = pars[0].getAttribute('class')
            header = getText(self, pars[0].childNodes)

    content = par2html(self, section.childNodes)
    return (secType, header, content, level)


def parseTable(table, self=None):
    """Convert an html table into a dict mapping field name to HTML lists.

    For every html:tr row the field name is the ``class`` attribute of the
    first <par> in the first html:td cell; the value is the HTML of the
    second cell, each paragraph terminated by ";". Rows with fewer than two
    cells are skipped, rows without a field name are not stored.

    self is only handed through to par2html and may be omitted.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            logging.warning("parseTable: row with fewer than two cells skipped")
            continue

        # read the name of the data field
        nameCells = cols[0].getElementsByTagName('par')
        if nameCells:
            field = nameCells[0].getAttribute('class')
        else:
            logging.warning("parseTable: row without field name")
            field = ""

        # convert the entries to HTML
        html = par2html(self, cols[1].childNodes, tags=("", ";"))
        addToDict(fields, field, html)
    return fields


def par2html(self, pars, tags=None):
    """Render <par>, <inline> and <pb> nodes from pars as one HTML string.

    Text of each par/inline is wrapped in tags (a (start, end) pair) or,
    when tags is not given, in the pair xml2htmlArray holds for the node's
    ``class`` (empty pair if unknown). <pb> nodes become "<pb/>"; all other
    nodes are ignored.
    """
    html = ""
    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            parClass = par.getAttribute('class')
            if tags:
                tag = tags
            else:
                tag = xml2htmlArray.get(parClass, ('', ''))
            content = getText(self, par.childNodes, parClass)
            try:
                html += tag[0] + content + tag[1]
            except UnicodeError:
                # byte strings with non-ASCII data were mixed with unicode:
                # decode everything explicitly instead of losing the text
                html = (_toUnicode(html) + _toUnicode(tag[0]) +
                        _toUnicode(content) + _toUnicode(tag[1]))
        elif tagName == "pb":
            html += "<pb/>"
    return html


def getXlink(nodes):
    """Return the HTML of all nodes that carry an xlink:type attribute."""
    ret = ""
    for node in nodes:
        if node.attributes and 'xlink:type' in node.attributes.keys():
            # xlink2html expects a self argument but only passes it on
            ret += xlink2html(None, node)
    return ret


def checkRef(self, ref):
    """Test whether the reference ref should be displayed.

    Queries the tables in _REF_CONDITIONS through self.search and returns
    the first non-empty result; if none matches, the (empty) result of the
    last query is returned.
    """
    res = None
    for table, condition in _REF_CONDITIONS:
        searchStr = str("select reference from %s where reference ='%s' %s"
                        % (table, _sqlQuote(ref), condition))
        res = self.search(var=searchStr)
        if res:
            break
    return res


def link2html(self, str):
    """Convert the <link> elements of an XML fragment into HTML <a> tags.

    Links whose ``ref`` passes self.checkRef get an href to the references
    page. Returns unicode; empty input yields u"".
    """
    if not str:
        return u""
    # NOTE(review): as written this substitution replaces "&" by "&" and so
    # changes nothing; the replacement was possibly an entity that got lost
    # when this file was exported -- verify against the repository.
    str = re.sub(r"\&", "&", str)
    dom = xml.dom.minidom.parseString(_TXT_PREFIX + utf8ify(str) + _TXT_SUFFIX)
    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        if self.checkRef(ref):
            link.setAttribute("href", _referenceUrl(self, ref, pn))
    return _txtContent(dom)  # we return unicode


def related2html(self, str):
    """Convert <link> elements of related library items into HTML <a> tags.

    Each ref is looked up in vl_literature (authorized entries only).
    Online items get an href, the title "click to view" and an en dash
    prefix in front of the link. Other items get the full reference as
    ``alt``, an onclick toggle, the class "x_offline" and a "+ " prefix
    inside the link text. Links without a database entry are only renamed.
    Returns unicode; empty input yields u"".
    """
    if not str:
        return u""
    # NOTE(review): no-op as written, see link2html.
    str = re.sub(r"\&", "&", str)
    # utf8ify as in link2html, so both functions feed the parser the same way
    dom = xml.dom.minidom.parseString(_TXT_PREFIX + utf8ify(str) + _TXT_SUFFIX)
    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        searchStr = ("select fullreference, online from vl_literature "
                     "where reference ='%s' and authorized = 1"
                     % _sqlQuote(ref))
        res = self.search(var=searchStr)
        if not res:
            continue
        if res[0]['online'] == 1:
            # item is available online
            link.setAttribute("href", _referenceUrl(self, ref, pn))
            link.setAttribute("title", "click to view")
            link.removeAttribute("ref")
            # prefix preceding the link: en dash + space; inserted into the
            # link's own parent so nested links work as well
            prefix = dom.createTextNode(u"\u2013\u0020")
            link.parentNode.insertBefore(prefix, link)
        else:
            # item exists only as a bibliographic reference
            link.setAttribute("alt", _toUnicode(res[0]['fullreference']))
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")
            # prefix inside the link text; a separate text node also works
            # for links that are empty or start with an element
            link.insertBefore(dom.createTextNode(u'+ '), link.firstChild)
    return _txtContent(dom)  # we return unicode


def xml2html(self, str, quote="yes"):
    """Replace <link> elements by <a> elements and return the page content.

    TODO: the VLP specific link handling still has to be moved out of here.

    The document is parsed with 4Suite, every <link> is replaced by an <a>
    holding the same children (with an href when self.checkRef accepts the
    ref) and the result is pretty-printed. Returns the inside of the <page>
    element as unicode; if there is no <page> element or it cannot be
    decoded, the whole serialized document is returned undecoded. Empty
    input yields "".
    """
    if not str:
        return ""
    if quote == "yes2":
        # NOTE(review): no-op as written, see link2html.
        str = re.sub(r"\&", "&", str)
    dom = NonvalidatingReader.parseString(str, "http://www.mpiwg-berlin.mpg.de/")
    links = Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
    for link in links:
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")
        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        # iterate over a copy: appendChild moves the nodes out of link
        for child in link.childNodes[0:]:
            newLink.appendChild(child)
        link.parentNode.replaceChild(newLink, link)
        if self.checkRef(ref):
            newLink.setAttributeNS(EMPTY_NAMESPACE, "href",
                                   _referenceUrl(self, ref, pn))

    buf = cStringIO.StringIO()
    try:
        PrettyPrint(dom, stream=buf)
        str = buf.getvalue()
    finally:
        buf.close()

    match = regexpPage.search(str)
    if match is None:
        # hack: sometimes the page element is missing
        return str
    try:
        return match.group(1).decode('utf-8')
    except UnicodeError:
        return str


def xlink2html(self, xlink, parClass=None):
    """Render an <image> or <link> element as HTML.

    <image> becomes an <img>. A <link> whose unquoted href starts with one
    of _URL_SCHEMES becomes an <img> (when parClass is "Picture") or an
    <a> with the element's text as label; any other href is returned as
    markup with quotes added around an unquoted ref attribute. Other
    elements yield "".
    """
    ret = ""
    tagName = xlink.tagName.lower()
    if tagName == "image":
        ret += """<img src="%s" />""" % xlink.getAttribute('href')
    elif tagName == "link":
        reference = urllib.unquote(xlink.getAttribute('href'))
        label = getText(self, xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in _URL_SCHEMES:
            if parClass == "Picture":
                ret += """<img src="%s" />""" % (reference)
            else:
                ret += """<a href="%s" >%s</a>""" % (reference, label)
        else:
            # transform: add quotes around the ref attribute if missing
            ret += _UNQUOTED_REF.sub(r'ref="\g<1>">', reference)
    return ret


def getText(self, nodelist, parClass=None):
    """Return the text/HTML of the nodes in nodelist as unicode.

    Text and CDATA nodes contribute their data, <inline> elements are
    rendered with par2html, <pb> becomes "<pb/>" and elements with a
    ``type`` attribute are rendered with xlink2html (parClass is passed
    on). Nodes without a tag name (comments etc.) are skipped.
    """
    rc = u''
    for node in nodelist:
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            rc += _toUnicode(node.data)
            continue

        tagName = getattr(node, 'tagName', None)
        if tagName is None:
            # comment, processing instruction, ...: nothing to render
            continue

        if tagName == "inline":
            rc += _toUnicode(par2html(self, [node]))
        elif tagName == "pb":
            rc += u"<pb/>"
        elif node.attributes:
            if 'type' in node.attributes.keys():  # is an xlink?
                rc += _toUnicode(xlink2html(self, node, parClass))
    return rc


# Disabled command-line test left in the file:
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)