MPIWGWeb/xmlhelper.py - view

File: [Repository] / MPIWGWeb / xmlhelper.py
Revision 1.7: download - view: text, annotated - select for diffs - revision graph
Wed Feb 18 13:01:17 2009 UTC (15 years, 2 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD

*** empty log message ***

# Helpers that convert the XML description of a project into a dictionary
# of text/HTML fragments keyed by the 'class' attribute of the XML elements.

from sys import argv

import logging
import string
import xml.dom.minidom

import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements
from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri

logger = logging.getLogger(__name__)

# Maps the 'class' attribute of a <par> element to the (opening, closing)
# markup that par2html() wraps around the paragraph text.  Every entry
# currently maps to a pair of empty strings, so paragraphs are emitted
# without any surrounding markup.
xml2html = {
    'WEB_normal': ('', ''),
    'Normal': ('', ''),
    'WEB_picture': ('', ''),
    'WEB_figuretitle': ('', ''),
    'WEB_bibliography': ('', ''),
    'Web_kursiv': ('', ''),
    'WEB_kursiv': ('', ''),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
}


def addToDict(dict, name, value):
    """Append ``value`` to the list stored under ``name`` in ``dict``.

    The list is created on first use, so every stored entry is a list.

    Returns 1 when the value was stored and 0 when ``name`` is the empty
    string, in which case ``dict`` is left untouched.
    """
    if name == "":
        return 0
    dict.setdefault(name, []).append(value)
    return 1


def _storeSections(target, sections):
    """Parse each section element and file its content into ``target``."""
    for section in sections:
        kind, header, content = parseSection(section)
        if kind == "WEB_project_header":
            # Special case: the project header carries both a title and a
            # description, which are stored under separate keys.
            addToDict(target, 'WEB_project_header', header)       # title
            addToDict(target, 'WEB_project_description', content)  # description
        else:
            # The heading text itself carries no information here.
            addToDict(target, kind, content)


def proj2hash(xmlstring):
    """Convert the XML description of a project into a dictionary.

    ``xmlstring`` is parsed with minidom.  The result maps class names to
    lists of strings and is filled from three sources, in this order:

    1. the ``par`` children of the first ``part`` element,
    2. the first ``html:table`` element (see parseTable); its keys replace
       equally named keys collected in step 1,
    3. the first-level and second-level ``section`` elements below the
       first ``part`` element (see parseSection).

    Raises IndexError when the document has no ``part`` or no
    ``html:table`` element.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    result = {}

    # Paragraphs directly below the part element (title etc.).
    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'), getText(par.childNodes))

    # Fields of the data table.
    result.update(parseTable(dom.getElementsByTagName('html:table')[0]))

    # First the level-1 sections, then the sections nested one level deeper.
    for path in ('section', 'section/section'):
        _storeSections(result, Evaluate(path, part))

    return result


def parseSection(section):
    """Split a ``section`` element into ``(class, header, content)``.

    The class and header text come from the last ``heading`` child.  When
    there is no such child (or its class is empty), the first ``par``
    descendant is used instead.  ``content`` is the HTML produced by
    par2html() for the ``par`` children of the section.

    A section with neither a heading nor a paragraph yields an empty class
    name, which addToDict() ignores.
    """
    kind = ""
    header = ""
    for child in section.childNodes:
        # Text and comment nodes have no tagName, hence the getattr default.
        if getattr(child, 'tagName', '') == "heading":
            kind = child.getAttribute('class')
            header = getText(child.childNodes)

    if kind == "":
        # Heading is missing: fall back to the first paragraph, if any.
        pars = section.getElementsByTagName('par')
        if pars:
            kind = pars[0].getAttribute('class')
            header = getText(pars[0].childNodes)

    content = par2html(Evaluate('par', section))
    return (kind, header, content)


def parseTable(table):
    """Read the name/value rows of an ``html:table`` element.

    For every ``html:tr`` row the class of the first ``par`` in the first
    ``html:td`` cell is the field name, and the paragraphs of the second
    cell, each terminated by ``;``, are the value.

    Rows with fewer than two ``html:td`` cells, or without a paragraph in
    the first cell, carry no usable field and are skipped with a logged
    warning.

    Returns a dictionary mapping field names to lists of value strings.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            logger.warning("parseTable: row with fewer than two cells skipped")
            continue

        # Read the name of the data field.
        namePars = cols[0].getElementsByTagName('par')
        if not namePars:
            logger.warning("parseTable: row without a field name skipped")
            continue
        field = namePars[0].getAttribute('class')

        # Convert the entries of the value cell into HTML.
        html = par2html(cols[1].getElementsByTagName('par'), tags=("", ";"))
        addToDict(fields, field, html)
    return fields


def par2html(pars, tags=None):
    """Concatenate the text of the given elements, wrapped in markup.

    ``tags`` is an ``(opening, closing)`` pair applied to every element.
    When it is omitted (or empty), the pair is looked up in ``xml2html``
    by the element's ``class`` attribute; unknown classes get no markup.

    Returns the empty string when ``pars`` is empty.
    """
    parts = []
    for par in pars:
        if tags:
            tag = tags
        else:
            tag = xml2html.get(par.getAttribute('class'), ('', ''))
        parts.append(tag[0] + getText(par.childNodes) + tag[1])
    return "".join(parts)


def _isXlink(node):
    """Tell whether ``node`` is an element carrying an xlink:type attribute."""
    return (node.nodeType == node.ELEMENT_NODE
            and node.hasAttribute('xlink:type'))


def getXlink(nodes):
    """Search ``nodes`` for xlinks and return them rendered as HTML."""
    return "".join([xlink2html(node) for node in nodes if _isXlink(node)])


def xlink2html(xlink):
    """Render an xlink element as HTML.

    ``image`` elements become an ``<img>`` tag and ``link`` elements an
    ``<a>`` tag around the link text (tag names compared case-insensitively);
    any other element yields the empty string.
    """
    # NOTE(review): the href is inserted without HTML escaping and the img
    # src attribute is not quoted.  The output is left as it was because
    # consumers of this markup are not visible here - confirm before
    # changing it.
    kind = xlink.tagName.lower()
    href = xlink.getAttribute('xlink:href')
    if kind == "image":
        return "<img src=%s />" % href
    if kind == "link":
        return "<a href='%s' >%s</a>" % (href, getText(xlink.childNodes))
    return ""


def getText(nodelist):
    """Return the text of ``nodelist`` as one unicode string.

    Text and CDATA nodes contribute their data, ``inline`` elements are
    rendered with par2html() and xlink elements with xlink2html().  All
    other elements, as well as comments and processing instructions, are
    ignored.
    """
    parts = []
    for node in nodelist:
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            parts.append(node.data)
        elif node.nodeType != node.ELEMENT_NODE:
            # Comments and processing instructions have no tagName and
            # carry no text for the output.
            continue
        elif node.tagName == "inline":
            parts.append(par2html([node]))
        elif _isXlink(node):
            parts.append(xlink2html(node))
    return u''.join(parts)


# Manual test from the command line:
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)