"""Convert project XML documents into HTML fragments and dictionaries.

All public functions are module-level but most take a ``self`` first
argument.  It is passed through to the helpers and is only dereferenced for
``self.search`` (in ``checkRef``) and for ``self.checkRef`` /
``self.REQUEST`` (in ``link2html`` and ``xml2html``).

NOTE(review): in the copy of this file that was reviewed, markup inside
several string literals had been lost (empty format strings, patterns
without a closing tag, literals consisting only of newlines).  Literals
that could not work at all have been reconstructed from context and are
marked with NOTE(review); the others are kept exactly as found.  Please
confirm every marked literal against version control.
"""

from sys import argv
import cStringIO
import re
import string
import urllib
import xml.dom.minidom

import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements
from Ft.Lib import Uri
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Xml import XPath
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint, Print
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
from Ft.Xml.XPath import Evaluate

# NOTE(review): both patterns ended in a bare lazy group "(.*?)", which can
# only ever capture the empty string.  The closing tags are reconstructed.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

# Matches an unquoted ``ref=...>`` attribute; values that already start with
# a quote are left alone so they are not quoted a second time.
_UNQUOTED_REF = re.compile(r'ref=(?!["\'])([^>]*)>')

# Maps a paragraph/inline ``class`` attribute to an (opening, closing) pair
# that par2html() puts around the element's text.
# NOTE(review): the pairs below contain only newlines or nothing; they look
# like HTML tags whose markup was lost.  Kept exactly as found.
xml2htmlArray = {
    'WEB_normal': ("\n\n", "\n\n"),
    'Normal': ("\n\n", "\n\n"),
    'WEB_picture': ("\n\n", "\n\n"),
    'WEB_figuretitle': ("\n\n", "\n\n"),
    'WEB_bibliography': ("\n\n", "\n\n"),
    'Web_kursiv': ('', ''),
    'WEB_kursiv': ('', ''),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ("\n\n", "\n\n"),
    'FigureTitle': ("\n\n", "\n\n"),
}

# Pair used by par2html() for classes missing from xml2htmlArray.
# NOTE(review): same caveat as xml2htmlArray.
_DEFAULT_TAGS = ("\n\n", "\n\n")


def _to_unicode(value):
    """Return *value* as unicode, leniently decoding byte strings.

    Byte strings are decoded as UTF-8 with undecodable bytes dropped
    (assumes UTF-8 input -- TODO confirm).  Other values pass through.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'ignore')
    return value


def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    The list is created on first use.  Returns 0 and leaves *dict*
    untouched when *name* is the empty string, otherwise returns 1.
    """
    if name == "":
        return 0
    dict.setdefault(name, []).append(value)
    return 1


def proj2hash(self, xmlstring):
    """Convert a project XML document into a dict of lists.

    Direct ``par`` children of the first ``part`` element are stored under
    their ``class`` attribute.  Sections are then visited level by level
    ("section", "section/section", ...).  A section of type
    ``WEB_project_header`` contributes ``WEB_project_header`` (its header)
    and ``WEB_project_description`` (its content); any other section
    contributes a heading and its content under the key ``text``.

    Raises IndexError when the document has no ``part`` element and
    ValueError when a section's ``level`` attribute is not an integer.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    result = {}

    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'),
                  getText(self, par.childNodes))

    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)
    while sections:
        for section in sections:
            kind, header, content, level = parseSection(self, section)
            if kind == "WEB_project_header":
                # Special case: the project's own title and description.
                addToDict(result, 'WEB_project_header', header)
                addToDict(result, 'WEB_project_description', content)
            else:
                # NOTE(review): the format strings here were empty, so
                # "" % level raised TypeError.  Reconstructed as an HTML
                # heading two levels below the section level.
                depth = int(level) + 2
                addToDict(result, "text",
                          "<h%d>%s</h%d>" % (depth, header, depth))
                addToDict(result, "text", content)
        # Descend one nesting level for the next pass.
        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)
    return result


def parseSection(self, section):
    """Return ``(type, header, content, level)`` for a section element.

    *type* and *header* come from the last ``heading`` child (its ``class``
    attribute and text).  When that yields an empty type, the first ``par``
    descendant is used instead, if there is one.  *content* is the HTML for
    all child nodes and *level* is the raw ``level`` attribute string.
    """
    kind = ""
    header = ""
    level = section.getAttribute('level')
    for child in section.childNodes:
        if getattr(child, 'tagName', '') == "heading":
            kind = child.getAttribute('class')
            header = getText(self, child.childNodes)
    if kind == "":
        # No usable heading: fall back to the first paragraph.
        pars = section.getElementsByTagName('par')
        if pars:
            kind = pars[0].getAttribute('class')
            header = getText(self, pars[0].childNodes)
    content = par2html(self, section.childNodes)
    return (kind, header, content, level)


def parseTable(table, self=None):
    """Return a dict mapping field names to lists of HTML strings.

    For each ``html:tr`` row, the field name is the ``class`` attribute of
    the first ``par`` inside the first ``html:td`` cell and the value is
    the second cell rendered by par2html().  Rows with fewer than two cells
    are skipped.  When the first cell has no ``par``, "error" is printed
    and the field name is empty, so addToDict() stores nothing.

    *self* is only passed through to par2html(); it was referenced but not
    defined here before, so it is now an optional parameter.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            continue
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except IndexError:
            print("error")
            field = ""
        html = par2html(self, cols[1].childNodes, tags=("", ";"))
        addToDict(fields, field, html)
    return fields


def par2html(self, pars, tags=None):
    """Render ``par`` and ``inline`` nodes in *pars* as one string.

    Each element's text is wrapped in an (opening, closing) pair: *tags*
    when given and non-empty, otherwise the xml2htmlArray entry for the
    element's ``class`` or a default pair.  Other nodes add nothing.
    """
    parts = []
    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            parClass = par.getAttribute('class')
            tag = tags or xml2htmlArray.get(parClass, _DEFAULT_TAGS)
            content = getText(self, par.childNodes, parClass)
            parts.append(tag[0] + content + tag[1])
        elif tagName == "pb":
            # NOTE(review): the literal emitted for a "pb" element is empty.
            parts.append("")
    return "".join(parts)


def getXlink(nodes, self=None):
    """Return the HTML for every node in *nodes* with an ``xlink:type``.

    *self* is only passed through to xlink2html(), which requires it as
    its first argument; the previous call omitted it.
    """
    ret = ""
    for node in nodes:
        if node.attributes and 'xlink:type' in node.attributes.keys():
            ret += xlink2html(self, node)
    return ret


def checkRef(self, ref):
    """Check whether reference *ref* should be shown.

    Runs ``self.search`` against each table with that table's visibility
    condition and returns the first truthy result.  When nothing matches,
    the last (falsy) search result is returned.
    """
    dbs = (
        ('vl_literature', 'AND online = \'1\''),
        ('vl_technology', 'AND complete =\'yes\''),
        ('vl_people', 'AND complete =\'yes\''),
        ('vl_sites', 'AND complete =\'yes\''),
        ('vl_transcript', 'AND complete =\'yes\''),
        ('vl_essays', 'AND online =\'yes\''),
    )
    # SECURITY: *ref* comes from document content and is interpolated into
    # SQL.  Doubling single quotes stops it from closing the literal.
    # NOTE(review): prefer a parameterised query if self.search allows one.
    safeRef = ref.replace("'", "''")
    res = None
    for db, condition in dbs:
        searchStr = str("select reference from %s where reference =\'%s\' %s"
                        % (db, safeRef, condition))
        res = self.search(var=searchStr)
        if res:
            break
    return res


def link2html(self, str):
    """Turn ``link`` elements in the fragment *str* into HTML anchors.

    Each ``link`` is renamed to ``a``.  When ``self.checkRef`` accepts its
    ``ref`` attribute, an ``href`` to ``/references`` is added, with the
    ``page`` attribute as a query parameter if present.  Returns the
    serialised content of the wrapper element, or "" for empty input or
    when that content cannot be found.
    """
    if not str:
        return ""
    # NOTE(review): this substitution replaces "&" with "&" and so changes
    # nothing; the replacement was probably an entity.  Kept as found.
    str = re.sub(r"\&", "&", str)
    # NOTE(review): the strings around the input were empty.  A <txt>
    # wrapper is reconstructed because regexpTXT extracts that element.
    dom = xml.dom.minidom.parseString("<txt>" + str + "</txt>")
    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            link.setAttribute("href", href)
    match = regexpTXT.search(dom.toxml('utf-8'))
    if match is None:
        return ""
    return match.group(1)


def xml2html(self, str, quote="yes"):
    """Turn ``link`` elements in the XML document *str* into HTML anchors.

    Each ``link`` is replaced by a new ``a`` element holding the same
    children.  When ``self.checkRef`` accepts the ``ref`` attribute, an
    ``href`` to ``/references`` is added, with ``page`` passed as ``p``.
    Returns the content of the ``page`` element of the pretty-printed
    result, the whole result when no ``page`` element is found, or "" for
    empty input.
    """
    if not str:
        return ""
    if quote == "yes2":
        # NOTE(review): replaces "&" with "&" and so changes nothing; the
        # replacement was probably an entity.  Kept as found.
        str = re.sub(r"\&", "&", str)
    dom = NonvalidatingReader.parseString(str, "http://www.mpiwg-berlin.mpg.de/")
    for link in Evaluate(".//link", contextNode=dom):
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")
        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        # Iterate over a copy: appendChild moves nodes out of the original.
        for child in link.childNodes[0:]:
            newLink.appendChild(child)
        link.parentNode.replaceChild(newLink, link)
        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&p=" + pn
            newLink.setAttributeNS(EMPTY_NAMESPACE, "href", href)
    buf = cStringIO.StringIO()
    try:
        PrettyPrint(dom, stream=buf)
        str = buf.getvalue()
    finally:
        buf.close()
    match = regexpPage.search(str)
    if match is None:
        # The "page" element is sometimes missing; return everything.
        return str
    return match.group(1)


def xlink2html(self, xlink, parClass=None):
    """Render an ``image`` or ``link`` element as HTML.

    An ``image`` becomes an image tag for its ``href``.  A ``link`` whose
    unquoted ``href`` starts with ``http:`` or ``file:`` becomes an image
    tag when *parClass* is "Picture" and an anchor labelled with the
    link's text otherwise.  Any other ``href`` is returned as is, with
    quotes added around an unquoted ``ref=`` value.  Other elements give "".
    """
    ret = ""
    tagName = xlink.tagName.lower()
    if tagName == "image":
        # NOTE(review): the three format strings in this function had too
        # few placeholders and raised TypeError.  Reconstructed from context.
        ret += '<img src="%s" />' % (xlink.getAttribute('href'),)
    elif tagName == "link":
        reference = urllib.unquote(xlink.getAttribute('href'))
        label = getText(self, xlink.childNodes)
        if reference.split(":")[0] in ('http', 'file'):
            # The href is already a usable URL.
            if parClass == "Picture":
                ret += '<img src="%s" />' % (reference,)
            else:
                ret += '<a href="%s">%s</a>' % (reference, label)
        else:
            # Add quotes around the ref attribute if they are missing.
            ret += _UNQUOTED_REF.sub(r'ref="\g<1>">', reference)
    return ret


def getText(self, nodelist, parClass=None):
    """Return the text of *nodelist* as a unicode string.

    Text nodes contribute their data, ``inline`` elements are rendered by
    par2html(), and elements carrying a ``type`` attribute are rendered by
    xlink2html() with *parClass*.  Other nodes add nothing.
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc += _to_unicode(node.data)
            continue
        # Comments and similar nodes have no tagName.
        tagName = getattr(node, 'tagName', '')
        if tagName == "inline":
            rc += par2html(self, [node])
        elif tagName == "pb":
            # NOTE(review): the literal emitted for a "pb" element is empty.
            rc += ""
        elif node.attributes:
            if 'type' in node.attributes.keys():  # is it an xlink?
                rc += _to_unicode(xlink2html(self, node, parClass))
    return rc

#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)