"""Convert project XML documents into a dict of HTML fragments.

The entry point is ``proj2hash``; the remaining functions are helpers that
walk a ``xml.dom.minidom`` tree and turn ``par`` elements, table cells and
xlink elements into HTML strings.
"""

import logging
from sys import argv
import string
import xml.dom.minidom

import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements
from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri

_log = logging.getLogger(__name__)

# (prefix, suffix) pair wrapped around a paragraph that has no entry in
# xml2html (see par2html).
_DEFAULT_TAGS = ('\n\n', '\n\n')

# Maps the ``class`` attribute of a paragraph to the (prefix, suffix) pair
# that par2html wraps around the paragraph's text.
# NOTE(review): every pair here is either empty or whitespace-only; this looks
# like HTML tags may have been lost before the file reached review -- confirm
# against the intended markup.
xml2html = {
    'WEB_normal': ('\n\n', '\n\n'),
    'Normal': ('\n\n', '\n\n'),
    'WEB_picture': ('\n\n', '\n\n'),
    'WEB_figuretitle': ('', ''),
    'WEB_bibliography': ('\n\n', '\n\n'),
    'Web_kursiv': ('', ''),
    'WEB_kursiv': ('', ''),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
}


def addToDict(dict, name, value):
    """Append ``value`` to the list stored under ``name`` in ``dict``.

    The list is created on first use.  An empty ``name`` is ignored.

    Returns 1 if the value was stored, 0 if ``name`` was empty.
    """
    if name == "":
        return 0
    # setdefault replaces the has_key() check, which no longer exists in
    # Python 3, and creates the list on first use.
    dict.setdefault(name, []).append(value)
    return 1


def _storeSections(result, sections):
    """Parse each section and file its content in ``result`` by heading class."""
    for section in sections:
        sectionClass, header, content = parseSection(section)
        if sectionClass == "WEB_project_header":
            # Special case "project": the heading text itself carries data.
            addToDict(result, 'WEB_project_header', header)  # store title
            addToDict(result, 'WEB_project_description', content)  # store description
        else:
            # No information in the heading; only the body is kept.
            addToDict(result, sectionClass, content)


def proj2hash(xmlstring):
    """Convert the XML text of a project file into a dict.

    Keys are ``class`` attribute values; each value is a list of strings
    (see addToDict), except that fields returned by parseTable replace any
    same-named entry collected from the top-level paragraphs.

    Raises IndexError if the document has no ``part`` or ``html:table``
    element.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    result = {}
    part = dom.getElementsByTagName('part')[0]

    # Paragraphs that are direct children of the first <part>.
    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'), getText(par.childNodes))

    # Fields of the first table; update() overwrites same-named keys.
    result.update(parseTable(dom.getElementsByTagName('html:table')[0]))

    # Level-1 sections first, then the sections nested one level below them.
    _storeSections(result, Evaluate('section', part))
    _storeSections(result, Evaluate('section/section', part))
    return result


def parseSection(section):
    """Return ``(heading class, heading text, body html)`` for a section.

    The body is built from the ``par`` elements selected by the XPath
    expression ``par`` relative to ``section``.

    Raises IndexError if the section contains no ``heading`` element.
    """
    heading = section.getElementsByTagName('heading')[0]
    sectionClass = heading.getAttribute('class')
    header = getText(heading.childNodes)
    content = par2html(Evaluate('par', section))
    return (sectionClass, header, content)


def parseTable(table):
    """Read a two-column table into a dict of field name -> list of HTML.

    For each ``html:tr`` row, the field name is the ``class`` attribute of
    the first ``par`` in the first ``html:td`` cell, and the value is the
    HTML built from the ``par`` elements of the second cell, each followed
    by ``";"``.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        # Read the name of the data field.
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except IndexError:
            _log.warning("table row without a field-name paragraph; row skipped")
            field = ""
        if field == "":
            # addToDict ignores unnamed fields, so skip the row before
            # touching the value cell (which may not exist for such rows).
            continue
        # Convert the entries to HTML.
        pars = cols[1].getElementsByTagName('par')
        addToDict(fields, field, par2html(pars, tags=("", ";")))
    return fields


def par2html(pars, tags=None):
    """Concatenate the text of ``pars``, each wrapped in a (prefix, suffix) pair.

    If ``tags`` is given (and non-empty) it is used for every paragraph;
    otherwise the pair is looked up in ``xml2html`` by the paragraph's
    ``class`` attribute, falling back to a default pair for unknown classes.

    Returns "" when ``pars`` is empty.
    """
    pieces = []
    for par in pars:
        if tags:
            tag = tags
        else:
            tag = xml2html.get(par.getAttribute('class'), _DEFAULT_TAGS)
        # Collect pieces and join once; no reliance on catching NameError to
        # initialise the accumulator, and no bare except that could silently
        # drop what was accumulated so far.
        pieces.append(tag[0] + getText(par.childNodes) + tag[1])
    return "".join(pieces)


def _isXlink(node):
    """Return True if ``node`` has attributes including ``xlink:type``."""
    attributes = node.attributes
    return bool(attributes) and 'xlink:type' in attributes.keys()


def getXlink(nodes):
    """Search ``nodes`` for xlinks and return them as HTML."""
    return "".join(xlink2html(node) for node in nodes if _isXlink(node))


def xlink2html(xlink):
    """Render an xlink element as HTML.

    ``image`` elements become an image tag and ``link`` elements an anchor
    around the link's text; any other element yields "".  The tag name is
    compared case-insensitively.
    """
    # NOTE(review): the previous format strings were "" and "%s", which raise
    # TypeError as soon as they are applied to the href (and link text).  The
    # templates below are reconstructed from the function's purpose --
    # confirm against the markup that was originally intended.
    tagName = xlink.tagName.lower()
    href = xlink.getAttribute('xlink:href')
    if tagName == "image":
        return '<img src="%s" />' % href
    if tagName == "link":
        return '<a href="%s">%s</a>' % (href, getText(xlink.childNodes))
    return ""


def getText(nodelist):
    """Return the text of ``nodelist`` with inline markup and xlinks as HTML.

    Text (and CDATA) nodes contribute their data, ``inline`` elements are
    rendered through par2html, and elements carrying an ``xlink:type``
    attribute through xlink2html.  All other nodes are ignored.
    """
    pieces = []
    for node in nodelist:
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            pieces.append(node.data)
        elif node.nodeType != node.ELEMENT_NODE:
            # Comments, processing instructions etc. have no tagName;
            # reading it would raise AttributeError.
            continue
        elif node.tagName == "inline":
            pieces.append(par2html([node]))
        elif _isXlink(node):
            pieces.append(xlink2html(node))
    return u''.join(pieces)


# Example usage (left disabled, as in the original):
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)