"""Helpers that turn project XML (part/section/par/link markup) into HTML strings.

The functions take a ``self`` argument although they are module-level
functions; only ``checkRef`` and ``xml2html`` actually read attributes from it
(``self.search`` and ``self.aq_parent``), the others merely pass it along.

NOTE(review): several string literals in this module are blank or consist only
of newlines in places where HTML markup would be expected (see the individual
notes below).  They are reproduced as found unless the blank literal made the
statement raise; confirm them against the deployed version.
"""

from sys import argv
import string
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re

# NOTE(review): the capture group is lazy and nothing follows it, so group(1)
# can only ever be the empty string.  A closing delimiter appears to be missing
# from this pattern -- confirm before relying on xml2html() for <page> input.
patternPage = r"<\s*page.*?>(.*?)"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

# Maps the "class" attribute of a par/inline element to the (prefix, suffix)
# pair that par2html() wraps around the element's text.
# NOTE(review): every value is either two newlines or empty; presumably these
# were opening/closing HTML tags -- confirm.
xml2htmlArray = {
    'WEB_normal': ('\n\n', '\n\n'),
    'Normal': ('\n\n', '\n\n'),
    'WEB_picture': ('\n\n', '\n\n'),
    'WEB_figuretitle': ('\n\n', '\n\n'),
    'WEB_bibliography': ('\n\n', '\n\n'),
    'Web_kursiv': ('', ''),
    'WEB_kursiv': ('', ''),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('\n\n', '\n\n'),
    'FigureTitle': ('\n\n', '\n\n'),
}

# Matches "ref=value>" only when the value does not already start with a quote.
# The former pattern wrapped quoted values a second time (ref=""x"">), and
# because it also matches inside "href=", it broke already well-formed links.
_UNQUOTED_REF_RE = re.compile(r"ref=(?![\"'])([^>]*)>")


def _quoteRefAttribute(text):
    """Insert double quotes around an unquoted ref attribute value in *text*."""
    return _UNQUOTED_REF_RE.sub(r'ref="\g<1>">', text)


def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    The list is created on first use.  Returns 0 (and stores nothing) when
    *name* is the empty string, otherwise 1.
    """
    if name == "":
        return 0
    dict.setdefault(name, []).append(value)
    return 1


def proj2hash(self, xmlstring):
    """Convert a project XML document into a dict of lists.

    The direct ``par`` children of the first ``part`` element are stored under
    their ``class`` attribute.  Sections are then processed level by level
    (``section``, ``section/section``, ...) until a level is empty.  A section
    whose type is ``WEB_project_header`` contributes its title and description
    under dedicated keys; every other section contributes a heading and its
    content under the key ``"text"``.

    Raises IndexError when the document contains no ``part`` element.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    result = {}

    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'), getText(self, par.childNodes))

    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)
    while sections:
        for section in sections:
            sec = parseSection(self, section)
            if sec[0] == "WEB_project_header":
                # special case: project header
                addToDict(result, 'WEB_project_header', sec[1])       # store title
                addToDict(result, 'WEB_project_description', sec[2])  # store description
            else:
                # no information in heading
                level = sec[3]
                # The literals here were empty, so "%level" raised TypeError
                # ("not all arguments converted").
                # NOTE(review): heading tags reconstructed from the variable
                # names and the use of the section level -- confirm.
                aTag = "<h%s>" % level
                eTag = "</h%s>" % level
                addToDict(result, "text", aTag + sec[1] + eTag)
                addToDict(result, "text", sec[2])
        # descend one nesting level
        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)
    return result


def parseSection(self, section):
    """Return ``(type, header, content, level)`` for a ``section`` element.

    *type* and *header* come from the section's ``heading`` child (its
    ``class`` attribute and text).  If that yields no type, the first ``par``
    descendant is used instead; if there is none either, both stay empty.
    *content* is the HTML produced by par2html() for the section's children
    and *level* is the section's ``level`` attribute.
    """
    kind = ""
    header = ""
    level = section.getAttribute('level')

    for heading in section.childNodes:
        if getattr(heading, 'tagName', '') == "heading":
            kind = heading.getAttribute('class')
            header = getText(self, heading.childNodes)

    if kind == "":
        # heading is missing: check whether the first par carries the type
        pars = section.getElementsByTagName('par')
        if pars:
            kind = pars[0].getAttribute('class')
            # previously called without self, which raised TypeError
            header = getText(self, pars[0].childNodes)

    content = par2html(self, section.childNodes)
    return (kind, header, content, level)


def parseTable(table, self=None):
    """Return a dict mapping field names to lists of HTML strings for *table*.

    For every ``html:tr`` row, the field name is the ``class`` attribute of the
    first ``par`` in the first ``html:td`` cell, and the value is the second
    cell rendered by par2html() with ``("", ";")`` as tags.  Rows without a
    field name print ``error``; rows without a usable name or a second cell
    store nothing.

    *self* is optional and only forwarded to par2html(); the function used to
    reference an undefined ``self`` and failed with NameError.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')

        # read the name of the data field
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except IndexError:
            print("error")
            field = ""

        if len(cols) < 2:
            # no value cell to convert
            continue

        # convert the entries to HTML
        html = par2html(self, cols[1].childNodes, tags=("", ";"))
        addToDict(fields, field, html)
    return fields


def par2html(self, pars, tags=None):
    """Render the nodes in *pars* as one HTML string.

    ``par`` and ``inline`` elements become their text (via getText()) wrapped
    in *tags* if given, otherwise in the xml2htmlArray entry for their
    ``class`` attribute, with a two-newline pair as the fallback.  ``pb``
    elements add an empty string and ``img`` elements add ``"XXX"``; all other
    nodes are ignored.
    """
    pieces = []
    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            parClass = par.getAttribute('class')
            if tags:
                tag = tags
            else:
                tag = xml2htmlArray.get(parClass, ('\n\n', '\n\n'))
            content = getText(self, par.childNodes, parClass)
            pieces.append(tag[0] + content + tag[1])
        elif tagName == "pb":
            # NOTE(review): empty literal, so page breaks emit nothing -- confirm
            pieces.append("")
        elif tagName == "img":
            pieces.append("XXX")
    return "".join(pieces)


def getXlink(nodes, self=None):
    """Search *nodes* for xlinks and return them rendered as HTML.

    A node counts as an xlink when it has an ``xlink:type`` attribute.  *self*
    is optional and only forwarded to xlink2html(), which used to be called
    with too few arguments here.
    """
    ret = ""
    for node in nodes:
        if node.attributes and 'xlink:type' in node.attributes.keys():  # is an xlink?
            ret += xlink2html(self, node)
    return ret


def checkRef(self, ref):
    """Return the first non-empty result of looking up *ref* in the vl_* tables.

    Runs ``self.search(var=<sql>)`` for each table until one result is truthy.
    If none is, the result of the last query is returned.
    """
    dbs = {
        'vl_literature': "AND CD LIKE '%lise%'",
        'vl_technology': '',
        'vl_people': '',
        'vl_sites': '',
    }
    # ref comes from document markup; double embedded quotes so it cannot
    # terminate the SQL string literal.
    # NOTE(review): prefer bound parameters if self.search supports them.
    safeRef = ref.replace("'", "''")

    res = None
    for db in dbs:
        res = self.search(var=str("select reference from %s where reference ='%s' %s" % (db, safeRef, dbs[db])))
        if res:
            break
    return res


def xml2html(self, str, quote="yes"):
    """Rewrite ``link`` elements of an XML string as ``a`` elements.

    Every ``link`` is renamed to ``a``; when checkRef() finds its ``ref``, an
    ``href`` pointing at ``<parent url>/vlp_coll?id=<ref>`` (plus ``&p=<page>``
    if a ``page`` attribute is present) is added.  Returns group 1 of
    regexpPage if the serialized document matches, otherwise the serialized
    document without newlines.  Returns ``""`` for empty input.

    TODO (from the original author): the VLP-specific link handling still has
    to be moved out of here.
    """
    if not str:
        return ""

    text = str
    if quote == "yes2":
        # NOTE(review): replaces "&" with "&", i.e. a no-op as written;
        # presumably meant to escape ampersands -- confirm.
        text = re.sub(r"\&", "&", text)
    # insert quotes around the ref attribute if they are missing
    text = _quoteRefAttribute(text)

    dom = xml.dom.minidom.parseString(text)
    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        if checkRef(self, ref):
            target = self.aq_parent.absolute_url() + "/vlp_coll?id=" + ref
            if pn:
                target += "&p=" + pn
            link.setAttribute("href", target)

    xmlText = dom.toxml()
    match = regexpPage.search(xmlText)
    if match is not None:
        return match.group(1)

    # NOTE(review): exStr is empty, so the replace below changes nothing;
    # presumably it held a header to strip from the output -- confirm.
    exStr = ""
    xmlText = xmlText.replace("\n", "")
    return xmlText.replace(exStr, '')


def xlink2html(self, xlink, parClass=None):
    """Render an ``image`` or ``link`` xlink element as an HTML string.

    ``image`` elements become an image tag for their ``href``.  ``link``
    elements whose URL-unquoted ``href`` starts with the ``http`` or ``file``
    scheme become an image tag when *parClass* is ``"Picture"`` and an anchor
    labelled with the element's text otherwise.  Any other ``href`` is
    returned as is, with quotes added around an unquoted ``ref`` attribute.
    Other elements yield ``""``.

    NOTE(review): the three markup templates were blank, which made each "%"
    operation raise TypeError; they are reconstructed here -- confirm the
    exact tags and attributes.
    """
    ret = ""
    tagName = xlink.tagName.lower()

    if tagName == "image":
        ret += """<img src="%s" />""" % xlink.getAttribute('href')
    elif tagName == "link":
        reference = urllib.unquote(xlink.getAttribute('href'))
        label = getText(self, xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in ['http', 'file']:
            if parClass == "Picture":
                ret += """<img src="%s" />""" % (reference)
            else:
                ret += """<a href="%s">%s</a>""" % (reference, label)
        else:
            # transform: insert quotes around the ref attribute if missing
            ret += _quoteRefAttribute(reference)
    return ret


def getText(self, nodelist, parClass=None):
    """Return the concatenated text of *nodelist* as a unicode string.

    Text nodes contribute their data, ``inline`` elements are rendered through
    par2html(), ``pb`` elements contribute an empty string, and any other node
    that has a ``type`` attribute is rendered through xlink2html().  Nodes
    without a tag name (e.g. comments) are skipped; they used to raise
    AttributeError.
    """
    pieces = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            pieces.append(node.data)
            continue

        tagName = getattr(node, 'tagName', '')
        if tagName == "inline":
            pieces.append(par2html(self, [node]))
        elif tagName == "pb":
            # NOTE(review): empty literal, so page breaks emit nothing -- confirm
            pieces.append("")
        elif node.attributes and 'type' in node.attributes.keys():  # is an xlink?
            pieces.append(xlink2html(self, node, parClass))
    return u''.join(pieces)


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)