from sys import argv
import string
import xml.dom.minidom
#import Ft.Xml.XLink.Processor
#import Ft.Xml.XLink.XLinkElements
#
#from Ft.Xml import XPath
#from Ft.Xml.XPath import Evaluate
#from Ft.Xml.XLink import XLINK_NAMESPACE
#from Ft.Xml.XLink import XLinkElements
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE
#from Ft.Lib import Uri
from xml.etree import ElementTree
import logging
# Mapping from document paragraph class names to (opening, closing) HTML
# tag pairs used by par2html when rendering <par> elements.
xml2html = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
}
def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    Creates the list on first use.  Returns 1 on success, 0 when *name*
    is the empty string (entry is skipped).

    Note: the parameter is named ``dict`` (shadowing the builtin) to keep
    the original call interface; ``has_key`` was replaced by ``setdefault``
    for Python 3 compatibility.
    """
    if name == "":
        return 0
    # create the entry as a list on first use, then append
    dict.setdefault(name, []).append(value)
    return 1
def _storeSection(result, sec):
    """Store one parsed (class, header, content) section triple in *result*."""
    if sec[0] == "WEB_project_header":  # special case: project
        addToDict(result, 'WEB_project_header', sec[1])       # store title
        addToDict(result, 'WEB_project_description', sec[2])  # store description
    else:  # no information in the heading
        addToDict(result, sec[0], sec[2])


def proj2hash(xmlstring):
    """Convert a project XML file (as a string) into a dict of lists.

    Collects the <par> elements of the first <part>, the contents of the
    HTML metadata table (if any), and the level-1 and level-2 <section>
    elements.  Sections classed 'WEB_project_header' contribute both a
    title and a description entry.
    """
    tree = ElementTree.fromstring(xmlstring)
    result = {}
    # First <part> element.  tree.iter also matches the root element
    # itself; note ElementTree path positions are 1-based, so the
    # original ".//part[0]" predicate raised SyntaxError.
    part = next(tree.iter('part'), None)
    if part is None:
        return result
    pars = part.findall('par')
    logging.debug(pars)
    for par in pars:
        logging.debug(par)
        className = par.attrib.get('class', '')
        content = par.text
        addToDict(result, className, content)
    # Parse the metadata table, if present (the original passed None to
    # parseTable when the table was missing and crashed there).
    table = tree.find('.//{http://www.w3.org/HTML/1998/html4}table')
    if table is not None:
        result.update(parseTable(table))
    # evaluate level-1 sections
    for section in part.findall('section'):
        _storeSection(result, parseSection(section))
    # evaluate higher-level (nested) sections
    for section in part.findall('section/section'):
        logging.debug("sections2:" + repr(section))
        _storeSection(result, parseSection(section))
    return result
def parseSection(section):
    """Parse a <section> element.

    Returns a tuple ``(class, header, content)``: the class and header
    text come from the section's <heading>; if the heading is missing or
    carries no class, the first <par> is used instead.  The content is
    all <par> elements rendered to HTML via par2html.
    """
    sec_type = ""
    header = ""
    heading = section.find(".//heading")
    # Guard: the original crashed with AttributeError when no <heading>
    # existed, making its own fallback below unreachable.
    if heading is not None:
        sec_type = heading.attrib.get('class', "")
        logging.debug("parseSection (class):" + sec_type)
        header = heading.text
        # repr() because heading.text may be None
        logging.debug("parseSection (header):" + repr(header))
    if sec_type == "":  # heading missing/unclassified: try the first par
        par = section.find(".//par")
        if par is not None:
            sec_type = par.attrib.get('class', "")
            header = par.text
    pars = section.findall(".//par")
    content = par2html(pars)
    return (sec_type, header, content)
def parseTable(table):
    """Parse an XHTML metadata table into a dict of lists.

    For each table row, the first cell's <par> class attribute names the
    field; the second cell's <par> elements are rendered to HTML (joined
    as "content;" fragments) and stored under that name.
    """
    fields = {}
    rows = table.findall('.//{http://www.w3.org/HTML/1998/html4}tr')
    for row in rows:
        logging.debug("ROW")
        cols = row.findall('.//{http://www.w3.org/HTML/1998/html4}td')
        if len(cols) < 2:
            # Malformed row: the original raised an uncaught IndexError here.
            logging.debug("error")
            continue
        # read the name of the data field (German: "Name des Datenfeldes")
        try:
            field = cols[0].find('.//par').attrib['class']
        except (AttributeError, KeyError):
            # no <par> in the first cell, or no class attribute
            logging.debug("error")
            field = ""
        # convert the entries to HTML (German: "Wandeln der Eintraege")
        pars = cols[1].findall('.//par')
        html = par2html(pars, tags=("", ";"))
        logging.debug("field:" + field)
        logging.debug("html:" + html)
        addToDict(fields, field, html)
    return fields
def par2html(pars, tags=None):
    """Render a sequence of <par> ElementTree elements as one HTML string.

    Each par is wrapped in the (open, close) tag pair from ``xml2html``
    keyed by its ``class`` attribute, falling back to ('<p>', '</p>').
    An explicit ``tags=(open, close)`` pair overrides the lookup for all
    pars.  Returns "" for None or an empty sequence.
    """
    logging.debug("part2html:" + repr(pars))
    if pars is None:
        return ""
    # Accumulate here; the original relied on NameError/except juggling
    # to initialize this variable.
    html = ""
    for par in pars:
        logging.debug("part2html:" + repr(par))
        if tags:
            tag = tags
        else:
            # missing 'class' attribute or unknown class -> plain <p>
            tag = xml2html.get(par.attrib.get('class'), ('<p>', '</p>'))
        content = par.text
        if content is None:
            content = ""
        logging.debug("part2html:" + content)
        html += tag[0] + content + tag[1]
    return html
def getXlink(nodes):
    """Search *nodes* for XLink elements and return them rendered as HTML."""
    fragments = []
    for node in nodes:
        attrs = node.attributes
        # text nodes have attributes == None; only elements carrying an
        # xlink:type attribute are rendered
        if attrs and 'xlink:type' in attrs.keys():
            fragments.append(xlink2html(node))
    return "".join(fragments)
def xlink2html(xlink):
    """Render a single XLink DOM element as an HTML fragment.

    <image> elements become <img> tags from their xlink:href; <link>
    elements become anchors whose body is the link's text content.
    Any other tag yields "".
    """
    ret = ""
    # hoisted: tagName.lower() was computed per branch; also dropped the
    # unused local `attributes` the original assigned and never read
    tag_name = xlink.tagName.lower()
    if tag_name == "image":
        ret += "<img src=%s />" % xlink.getAttribute('xlink:href')
    elif tag_name == "link":
        ret += "<a href='%s' >%s</a>" % (xlink.getAttribute('xlink:href'),
                                         getText(xlink.childNodes))
    return ret
def getText(nodelist):
    """Return the concatenated text content of DOM *nodelist* as HTML.

    Text nodes are appended verbatim; <inline> elements are rendered via
    par2html; any other element carrying an xlink:type attribute is
    rendered via xlink2html.
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            # The original wrapped this in nested try/except blocks ending
            # in a Python-2-only `print` statement (a SyntaxError under
            # Python 3); `node.data` of a text node is always a string,
            # so plain concatenation suffices.
            rc += node.data
        elif node.tagName == "inline":
            # NOTE(review): par2html reads .attrib/.text (ElementTree API)
            # but receives a DOM node here -- confirm against callers.
            rc += par2html([node])
        elif node.attributes:
            if 'xlink:type' in node.attributes.keys():  # is it an xlink?
                rc += xlink2html(node)
    return rc
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>