"""Helpers that convert project XML files into a dict (proj2hash) and
render their paragraphs as HTML snippets."""

from sys import argv
import string
import xml.dom.minidom
#import Ft.Xml.XLink.Processor
#import Ft.Xml.XLink.XLinkElements
#
#from Ft.Xml import XPath
#from Ft.Xml.XPath import Evaluate
#from Ft.Xml.XLink import XLINK_NAMESPACE
#from Ft.Xml.XLink import XLinkElements
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE
#from Ft.Lib import Uri
from xml.etree import ElementTree
import logging

# Mapping of paragraph classes to (opening tag, closing tag) pairs used by par2html().
xml2html = {'WEB_normal': ('<p>', '</p>'),
            'Normal': ('<p>', '</p>'),
            'WEB_picture': ('<p class="picture">', '</p>'),
            'WEB_figuretitle': ('<p class="figuretitle">', '</p>'),
            'WEB_bibliography': ('<p class="bibliography">', '</p>'),
            'Web_kursiv': ('', ''),
            'WEB_kursiv': ('', ''),
            'WEB_hyperlink': ('', ''),
            'Hyperlink': ('', '')}


def addToDict(dct, name, value):
    """Append value to the list stored under name; ignore empty names."""
    if name == "":
        return 0
    else:
        if name not in dct:
            dct[name] = []  # create the entry as a list
        dct[name].append(value)
        return 1


def proj2hash(xmlstring):
    """Convert a project XML file into a dict of field name -> list of values."""
    #dom=xml.dom.minidom.parseString(xmlstring)
    tree = ElementTree.fromstring(xmlstring)
    result = {}

    # collect the paragraphs that are direct children of the first part
    part = tree.find(".//part")
    #part= dom.getElementsByTagName('part')[0]
    #pars=Evaluate('par',dom.getElementsByTagName('part')[0])
    pars = part.findall("par") if part is not None else []
    logging.debug(pars)
    for par in pars:
        logging.debug(par)
        className = par.attrib['class']  #.getAttribute('class')
        content = par.text
        addToDict(result, className, content)

    # parse the table of field name / value rows
    result.update(parseTable(tree.find('.//{http://www.w3.org/HTML/1998/html4}table')))

    # evaluate level 1 sections
    sections = part.findall("section") if part is not None else []
    #sections=Evaluate('section',dom.getElementsByTagName('part')[0])
    for section in sections:
        sec = parseSection(section)
        if sec[0] == "WEB_project_header":  # special case: project header
            addToDict(result, 'WEB_project_header', sec[1])  # store title
            addToDict(result, 'WEB_project_description', sec[2])  # store description
        else:  # no information in the heading
            addToDict(result, sec[0], sec[2])

    # evaluate higher level sections
    sections = part.findall("section/section") if part is not None else []
    #sections=Evaluate('section/section',dom.getElementsByTagName('part')[0])
    for section in sections:
        logging.debug("sections2:" + repr(section))
        sec = parseSection(section)
        if sec[0] == "WEB_project_header":  # special case: project header
            addToDict(result, 'WEB_project_header', sec[1])  # store title
            addToDict(result, 'WEB_project_description', sec[2])  # store description
        else:  # no information in the heading
            addToDict(result, sec[0], sec[2])

    return result


def parseSection(section):
    """Return (class, heading text, HTML content) for a section element."""
    sectionType = ""
    header = ""
    heading = section.find(".//heading")
    #for heading in section.childNodes:
    if heading is not None:
        sectionType = heading.attrib['class']
        logging.debug("parseSection (class):" + sectionType)
        header = heading.text or ""
        logging.debug("parseSection (header):" + header)
    if sectionType == "":  # if the heading is missing, fall back to the first par
        par = section.find(".//par")
        #par=section.getElementsByTagName('par')[0]
        if par is not None:
            sectionType = par.attrib['class']
            header = par.text
    pars = section.findall(".//par")
    #pars=Evaluate('par',section)
    content = par2html(pars)
    return (sectionType, header, content)


def parseTable(table):
    """Read an html:table of field name / value rows into a dict."""
    fields = {}
    if table is None:
        return fields
    rows = table.findall('.//{http://www.w3.org/HTML/1998/html4}tr')
    #rows=table.getElementsByTagName('html:tr')
    for row in rows:
        logging.debug("ROW")
        cols = row.findall('.//{http://www.w3.org/HTML/1998/html4}td')
        #cols=row.getElementsByTagName('html:td')
        # read the name of the data field from the first column
        try:
            field = cols[0].find('.//par').attrib['class']
            #field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except (IndexError, AttributeError, KeyError):
            logging.debug("error")
            field = ""
        # convert the entries of the second column to HTML
        pars = cols[1].findall('.//par')
        #pars=cols[1].getElementsByTagName('par')
        html = par2html(pars, tags=("", ";"))
        logging.debug("field:" + field)
        logging.debug("html:" + html)
        addToDict(fields, field, html)
    return fields


def par2html(pars, tags=None):
    """Render a list of par elements as HTML, wrapping each according to its class."""
    html = ""
    logging.debug("par2html:" + repr(pars))
    if pars is None:
        return ""
    for par in pars:
        logging.debug("par2html:" + repr(par))
        if not tags:
            try:
                tag = xml2html[par.attrib['class']]
            except (KeyError, AttributeError):
                tag = ('<p>', '</p>')  # default wrapper for unknown classes
        else:
            tag = tags
        content = par.text
        if content is None:
            content = ""
        logging.debug("par2html:" + content)
        html = html + tag[0] + content + tag[1]
    return html


def getXlink(nodes):
    """Search a DOM node list for xlinks and return them as HTML."""
    ret = ""
    for node in nodes:
        if node.attributes:
            if 'xlink:type' in node.attributes.keys():  # is it an xlink?
                ret += xlink2html(node)
    return ret


def xlink2html(xlink):
    """Render an image or link xlink DOM element as HTML."""
    ret = ""
    if xlink.tagName.lower() == "image":
        ret += '<img src="%s" />' % xlink.getAttribute('xlink:href')
    elif xlink.tagName.lower() == "link":
        ret += '<a href="%s">%s</a>' % (xlink.getAttribute('xlink:href'),
                                        getText(xlink.childNodes))
    return ret


def getText(nodelist):
    """Collect the text of a DOM node list, expanding inline elements and xlinks."""
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            try:
                rc += node.data
            except UnicodeError:
                logging.error("getText: could not decode %r" % node.data)
        elif node.tagName == "inline":
            rc += par2html([node])
        elif node.attributes:
            if 'xlink:type' in node.attributes.keys():  # is it an xlink?
                rc += xlink2html(node)
    return rc


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)
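# A minimal usage sketch (an assumption, not part of the original module): run the
# converter either on a file given on the command line, as in the commented-out
# invocation above, or on a small built-in sample document whose structure is
# inferred from the XPath expressions used by proj2hash().
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    if len(argv) > 1:
        with open(argv[1]) as f:
            xmlstring = f.read()
    else:
        # hypothetical sample input; real project files may differ
        xmlstring = """<doc>
  <part>
    <par class="WEB_normal">Introductory paragraph.</par>
    <section>
      <heading class="WEB_project_header">Project title</heading>
      <par class="WEB_normal">Project description.</par>
    </section>
  </part>
</doc>"""
    print(proj2hash(xmlstring))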