![]() ![]() | ![]() |
First import product for the institute's WWW site
"""Convert project description XML files into a dict of HTML fragments.

The entry point is :func:`proj2hash`.  The remaining functions are the
helpers it is built from: they walk ``xml.dom.minidom`` nodes and render
``par`` / ``inline`` / xlink elements as small HTML snippets.

The code is written to run unchanged on Python 2 and Python 3.
"""

from sys import argv
import logging
import re
import string
import xml.dom.minidom
from xml.sax.saxutils import escape

try:
    import Ft.Xml.XLink.Processor
    import Ft.Xml.XLink.XLinkElements
    from Ft.Xml import XPath
    from Ft.Xml.XPath import Evaluate
    from Ft.Xml.XLink import XLINK_NAMESPACE
    from Ft.Xml.XLink import XLinkElements
    #from Ft.Xml.Domlette import NonvalidatingReader,InputSource
    #from Ft.Xml import EMPTY_NAMESPACE
    from Ft.Lib import Uri
except ImportError:
    # 4Suite is not installable everywhere.  This module only ever evaluates
    # plain relative child paths ('par', 'section', 'section/section'), so a
    # tiny stand-in keeps it usable when the package is missing.
    _CHILD_STEP = re.compile(r'^[A-Za-z_][A-Za-z0-9_.-]*$')

    def Evaluate(expr, contextNode=None, context=None):
        """Evaluate a relative child path such as ``'a'`` or ``'a/b'``.

        Minimal replacement for ``Ft.Xml.XPath.Evaluate``.  Each step selects
        the child elements with that local name and no namespace, which is
        how XPath 1.0 treats an unprefixed name test.

        Raises:
            ValueError: if *expr* is anything other than a chain of simple
                element names separated by ``/``.
        """
        selected = [contextNode]
        for step in expr.split('/'):
            if not _CHILD_STEP.match(step):
                raise ValueError(
                    "unsupported XPath expression without 4Suite: %r" % (expr,))
            selected = [
                child
                for parent in selected
                for child in parent.childNodes
                if child.nodeType == child.ELEMENT_NODE
                and child.namespaceURI is None
                and child.localName == step
            ]
        return selected

logger = logging.getLogger(__name__)

# Maps the ``class`` attribute of a paragraph to its (open, close) HTML tags.
xml2html = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p>', '</p>'),
    'WEB_figuretitle': ('<i>', '</i>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
}

# Wrapper used for any paragraph class that is not listed in ``xml2html``.
_DEFAULT_TAG = ('<p>', '</p>')


def _escape_attr(value):
    """Escape *value* for use inside a quoted HTML attribute."""
    return escape(value, {"'": "&#39;", '"': "&quot;"})


def _is_xlink(node):
    """Return True when *node* carries an ``xlink:type`` attribute."""
    attrs = node.attributes  # None for non-element nodes
    return bool(attrs) and 'xlink:type' in attrs.keys()


def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    The list is created on first use.  An empty *name* is ignored.

    Returns:
        1 if the value was stored, 0 if *name* was the empty string.
    """
    if name == "":
        return 0
    dict.setdefault(name, []).append(value)
    return 1


def _collect_sections(result, sections):
    """Store the parsed content of each section node in *result*."""
    for section in sections:
        kind, title, content = parseSection(section)
        if kind == "WEB_project_header":
            # Special case: for a project the heading text itself is data.
            addToDict(result, 'WEB_project_header', title)
            addToDict(result, 'WEB_project_description', content)
        else:
            # The heading carries no information beyond its class.
            addToDict(result, kind, content)


def proj2hash(xmlstring):
    """Convert the XML text of a project file into a dict.

    Keys are the ``class`` names found in the document, values are lists of
    text/HTML strings.  Three sources are read from the first ``part``
    element: its direct ``par`` children, the first ``html:table`` of the
    document, and the sections on the first and second nesting level.

    Note that the table result is merged with ``dict.update``, so a class
    name that occurs both as a top-level paragraph and as a table field
    keeps only the table values.

    Raises:
        IndexError: if the document has no ``part`` or ``html:table``
            element, or a section has no ``heading``.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    result = {}

    # Top-level paragraphs (title and similar).
    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'), getText(par.childNodes))

    # The data table.
    result.update(parseTable(dom.getElementsByTagName('html:table')[0]))

    # First-level sections, then the sections nested directly inside them.
    _collect_sections(result, Evaluate('section', part))
    _collect_sections(result, Evaluate('section/section', part))
    return result


def parseSection(section):
    """Split a ``section`` element into ``(class, heading text, html)``.

    The class and heading text come from the first ``heading`` element
    below *section*; the HTML is rendered from the section's direct ``par``
    children.

    Raises:
        IndexError: if *section* contains no ``heading`` element.
    """
    heading = section.getElementsByTagName('heading')[0]
    kind = heading.getAttribute('class')
    title = getText(heading.childNodes)
    content = par2html(Evaluate('par', section))
    return (kind, title, content)


def parseTable(table):
    """Read a two-column ``html:table`` into a dict of field -> [html].

    The field name is the ``class`` of the first ``par`` in the first cell;
    the value is every ``par`` of the second cell, each terminated by
    ``";"``.  Rows that do not have this shape are skipped.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            logger.warning("skipping table row with fewer than two cells")
            continue
        # Read the name of the data field.
        name_pars = cols[0].getElementsByTagName('par')
        if not name_pars:
            logger.warning("skipping table row without a field name")
            continue
        field = name_pars[0].getAttribute('class')
        # Convert the entries to HTML.
        html = par2html(cols[1].getElementsByTagName('par'), tags=("", ";"))
        addToDict(fields, field, html)
    return fields


def par2html(pars, tags=None):
    """Render paragraph nodes as one concatenated HTML string.

    Args:
        pars: iterable of element nodes.
        tags: optional ``(open, close)`` pair applied to every paragraph.
            When omitted (or falsy) the pair is looked up in ``xml2html`` by
            the paragraph's ``class``, falling back to ``<p>…</p>``.

    Returns:
        The concatenated HTML, or ``""`` when *pars* is empty.
    """
    pieces = []
    for par in pars:
        if tags:
            tag = tags
        else:
            tag = xml2html.get(par.getAttribute('class'), _DEFAULT_TAG)
        pieces.append(tag[0] + getText(par.childNodes) + tag[1])
    return "".join(pieces)


def getXlink(nodes):
    """Return the HTML for every xlink element in *nodes*, concatenated."""
    return "".join(xlink2html(node) for node in nodes if _is_xlink(node))


def xlink2html(xlink):
    """Render an xlink element as HTML.

    ``image`` elements become ``<img>`` tags and ``link`` elements become
    ``<a>`` tags (tag names are matched case-insensitively).  Any other
    element yields ``""``.  The ``xlink:href`` value is escaped and quoted
    so that spaces, ``&`` and quotes cannot break the generated markup.
    """
    tag = xlink.tagName.lower()
    href = _escape_attr(xlink.getAttribute('xlink:href'))
    if tag == "image":
        return "<img src='%s' />" % href
    if tag == "link":
        return "<a href='%s' >%s</a>" % (href, getText(xlink.childNodes))
    return ""


def getText(nodelist):
    """Return the text of *nodelist* with inline markup rendered as HTML.

    Text (and CDATA) nodes contribute their data, ``inline`` elements are
    rendered through :func:`par2html`, and elements carrying an
    ``xlink:type`` attribute through :func:`xlink2html`.  Comments,
    processing instructions and all other elements are ignored.

    NOTE(review): text data is inserted without HTML escaping, exactly as in
    the original implementation; confirm whether downstream code relies on
    that before changing it.
    """
    parts = []
    for node in nodelist:
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
            parts.append(node.data)
        elif node.nodeType != node.ELEMENT_NODE:
            # Comments etc. have no tagName; nothing to render.
            continue
        elif node.tagName == "inline":
            parts.append(par2html([node]))
        elif _is_xlink(node):
            parts.append(xlink2html(node))
    return u''.join(parts)


# Example driver, disabled in the original source:
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)