--- MPIWGWeb/xmlhelper.py 2012/01/09 07:33:31 1.6.2.2 +++ MPIWGWeb/xmlhelper.py 2012/02/15 11:51:47 1.6.2.3 @@ -16,6 +16,9 @@ import xml.dom.minidom #from Ft.Lib import Uri +from xml.etree import ElementTree +import logging + xml2html={'WEB_normal':('

','

'),'Normal':('

','

'),'WEB_picture':('

','

'),'WEB_figuretitle':('

','

'),'WEB_bibliography':('

','

'),'Web_kursiv':('',''),'WEB_kursiv':('',''),'WEB_hyperlink':('',''),'Hyperlink':('','')} def addToDict(dict,name,value): @@ -32,23 +35,33 @@ def addToDict(dict,name,value): def proj2hash(xmlstring): """wandelt xml-files fuer die projekte in ein hash""" - dom=xml.dom.minidom.parseString(xmlstring) + #dom=xml.dom.minidom.parseString(xmlstring) - + tree = ElementTree.fromstring(xmlstring) + + + pars = tree.findall(".//part[0]/par") + list={} #gettitle - pars=Evaluate('par',dom.getElementsByTagName('part')[0]) + #part= dom.getElementsByTagName('part')[0] + #pars=part.getElementsByTagName('par') + #pars=Evaluate('par',dom.getElementsByTagName('part')[0]) + logging.debug(pars) for par in pars: - className=par.getAttribute('class') - content=getText(par.childNodes) + logging.debug(par) + className=par.attrib['class'] + #.getAttribute('class') + content=par.text addToDict(list,className,content) - list.update(parseTable(dom.getElementsByTagName('html:table')[0])) # Parse the Table + list.update(parseTable(tree.find('.//{http://www.w3.org/HTML/1998/html4}table'))) # Parse the Table #evaluate level 1 - - sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections + sections = tree.findall(".//part[0]/section") + #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections + #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections #print sections,dom.getElementsByTagName('part')[0] for section in sections: @@ -62,10 +75,11 @@ def proj2hash(xmlstring): addToDict(list,sec[0],sec[2]) #evaluate higher level sections - - sections=Evaluate('section/section',dom.getElementsByTagName('part')[0]) + sections = tree.findall(".//part[0]/section/section") + #sections=Evaluate('section/section',dom.getElementsByTagName('part')[0]) for section in sections: + logging.debug("sections2:"+repr(section)) sec=parseSection(section) if sec[0]=="WEB_project_header": # Sonderfall project @@ -81,63 +95,80 @@ def proj2hash(xmlstring): def parseSection(section): type="" header="" - for heading in section.childNodes: - if getattr(heading,'tagName','')=="heading": - - type=heading.getAttribute('class') - header=getText(heading.childNodes) - + #for heading in section.childNodes: + + heading=section.find(".//heading") + # if getattr(heading,'tagName','')=="heading": + + + type=heading.attrib['class'] + logging.debug("parseSection (class):"+type) + header=heading.text + logging.debug("parseSection (header):"+header) + if type=="": # falls heading fehlt, pruefe ob erster par richtig - par=section.getElementsByTagName('par')[0] - type=par.getAttribute('class') - header=getText(par.childNodes) + par=section.find(".//par") + #par=section.getElementsByTagName('par')[0] + type=par.attrib['class'] + header=par.text #print section.childNodes - pars=Evaluate('par',section) + pars=section.findall(".//par") + #pars=Evaluate('par',section) content=par2html(pars) return (type,header,content) def parseTable(table): fields={} - rows=table.getElementsByTagName('html:tr') + rows=table.findall('.//{http://www.w3.org/HTML/1998/html4}tr') + #rows=table.getElementsByTagName('html:tr') for row in rows: - #print "ROW" - cols=row.getElementsByTagName('html:td') + logging.debug("ROW") + cols=row.findall('.//{http://www.w3.org/HTML/1998/html4}td') + #cols=row.getElementsByTagName('html:td') #Name des Datenfeldes einlesen try: - field=cols[0].getElementsByTagName('par')[0].getAttribute('class') + field=cols[0].find('.//par').attrib['class'] + #field=cols[0].getElementsByTagName('par')[0].getAttribute('class') #print "field",field except: - print "error" + logging.debug("error") field="" #Wandeln der Eintrge in HTML - - pars=cols[1].getElementsByTagName('par') + + pars=cols[1].findall('.//par') + #pars=cols[1].getElementsByTagName('par') html=par2html(pars,tags=("",";")) - + logging.debug("field:"+field) + logging.debug("html:"+html) addToDict(fields,field,html) #print fields return fields def par2html(pars,tags=None): #html="" - + logging.debug("part2html:"+repr(pars)) + if pars is None: + return "" for par in pars: - #print "par",par + logging.debug("part2html:"+repr(par)) if not tags: try: - tag=xml2html[par.getAttribute('class')] + tag=xml2html[par.attrib['class']] except: tag=('

','

') else: tag=tags - content=getText(par.childNodes) + content=par.text + if content is None: + content="" + logging.debug("part2html:"+content) #print "CONTETN",content #print par.getAttribute('class'),node