version 1.13, 2007/12/11 17:00:01
|
version 1.14, 2008/08/05 16:17:46
|
Line 23 regexpTXT = re.compile(patternTXT, re.IG
|
Line 23 regexpTXT = re.compile(patternTXT, re.IG
|
# Matches one <page ...>...</page> element; group 1 captures the text between
# the opening and the closing tag.
patternPage = r"<\s*page.*?>(.*?)</page>"
# IGNORECASE: tag case does not matter; DOTALL: the content may span lines.
regexpPage = re.compile(patternPage, re.IGNORECASE | re.DOTALL)

# Maps the "class" attribute of a par/inline node to the (opening, closing)
# HTML markup that is wrapped around the node's text.
xml2htmlArray = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class=\"picture\">', '</p>'),
    'WEB_figuretitle': ('<p class=\"picturetitle\">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('<p class=\"picture\">', '</p>'),
    'FigureTitle': ('<p class=\"picturetitle\">', '</p>'),
}
|
# |
def addToDict(dict, name, value):
    """Append ``value`` to the list stored under ``name`` in ``dict``.

    The list is created on first use, so several values can be collected
    under the same key.

    Args:
        dict: mapping that is modified in place.
        name: key to store the value under; an empty string is ignored.
        value: object appended to the list at ``dict[name]``.

    Returns:
        0 if ``name`` is the empty string (nothing is stored), otherwise 1.
    """
    if name == "":
        return 0
    # setdefault creates the list on first use; it replaces dict.has_key,
    # which no longer exists in Python 3.
    dict.setdefault(name, []).append(value)
    return 1
|
# |
def proj2hash(self, xmlstring):
    """Convert the XML description of a project into a dict of lists.

    The first ``part`` element of the document is processed in two steps:

    * every ``par`` selected by the XPath ``par`` relative to the part is
      stored under its ``class`` attribute;
    * sections are visited level by level (``section``, then
      ``section/section``, ...) until a level yields no nodes.  A section of
      type ``WEB_project_header`` supplies the project title and description;
      any other section is appended to the ``text`` entry as an ``<hN>``
      heading followed by its rendered content.

    Args:
        self: passed through unchanged to getText and parseSection.
        xmlstring: XML source accepted by xml.dom.minidom.parseString.

    Returns:
        dict mapping a class/field name to the list of collected strings.

    Raises:
        IndexError: if the document contains no ``part`` element.
        ValueError: if a section's ``level`` value cannot be converted by int().
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    # The part element is the context node of every XPath query below.
    part = dom.getElementsByTagName('part')[0]

    result = {}

    # Paragraphs selected directly under the part, keyed by their class.
    for par in Evaluate('par', part):
        className = par.getAttribute('class')
        content = getText(self, par.childNodes)
        addToDict(result, className, content)

    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)

    while sections:
        for section in sections:
            sec = parseSection(self, section)  # (type, header, content, level)

            if sec[0] == "WEB_project_header":  # special case: project header
                addToDict(result, 'WEB_project_header', sec[1])  # store title
                addToDict(result, 'WEB_project_description', sec[2])  # store description
            else:  # ordinary section: emit heading and body as text
                level = int(sec[3]) + 2
                aTag = "<h%i>" % level
                eTag = "</h%i>" % level
                addToDict(result, "text", aTag + sec[1] + eTag)
                addToDict(result, "text", sec[2])

        # Descend one nesting level; the loop ends when no sections are found.
        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)

    return result
|
# |
|
# |
def parseSection(self, section):
    """Extract type, header, rendered content and level of a section node.

    The type and header come from the ``heading`` child of the section (the
    last one wins if there are several).  If no heading supplies a type, the
    first ``par`` descendant is used instead; if there is none either, type
    and header stay empty.

    Args:
        self: passed through unchanged to getText and par2html.
        section: DOM element providing getAttribute, childNodes and
            getElementsByTagName.

    Returns:
        Tuple ``(type, header, content, level)`` where ``content`` is the
        result of par2html over the section's child nodes and ``level`` is
        the raw value of the section's ``level`` attribute.
    """
    sectionType = ""
    header = ""
    level = section.getAttribute('level')

    for child in section.childNodes:
        # Nodes without a tagName attribute get '' here and are skipped.
        if getattr(child, 'tagName', '') == "heading":
            sectionType = child.getAttribute('class')
            header = getText(self, child.childNodes)

    if sectionType == "":
        # Heading missing: check whether the first par carries the type.
        paragraphs = section.getElementsByTagName('par')
        if paragraphs:
            first = paragraphs[0]
            sectionType = first.getAttribute('class')
            # self is passed first, as in every other getText call.
            header = getText(self, first.childNodes)

    content = par2html(self, section.childNodes)
    return (sectionType, header, content, level)
|
# |
def parseTable(table, self=None):
    """Collect the rows of a two-column table into a dict of lists.

    For every ``html:tr`` row, the ``class`` attribute of the first ``par``
    in the first ``html:td`` cell is the field name, and the child nodes of
    the second cell are rendered with par2html as the field value.

    Args:
        table: DOM element providing getElementsByTagName.
        self: optional context object forwarded to par2html.  The original
            body referenced an undefined ``self``; it is now an optional
            parameter so existing one-argument calls keep working.

    Returns:
        dict mapping each field name to the list of rendered values.  Rows
        with fewer than two cells are skipped, and rows whose field name
        cannot be determined are not stored.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            # A name cell and a value cell are both required.
            continue

        # Read the name of the data field.
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except IndexError:
            # No par in the name cell; addToDict ignores the empty name.
            print("error")
            field = ""

        # Convert the entries to HTML; each one is terminated by ";".
        html = par2html(self, cols[1].childNodes, tags=("", ";"))

        addToDict(fields, field, html)
    return fields
|
# |
def par2html(self, pars, tags=None):
    """Render a sequence of DOM nodes as one HTML string.

    ``par`` and ``inline`` elements are wrapped in an (opening, closing)
    pair of markup strings around the text returned by getText; ``pb``
    elements become ``<pb/>``; every other node is ignored.

    Args:
        self: passed through unchanged to getText.
        pars: iterable of DOM nodes.
        tags: optional (opening, closing) pair used for every par/inline
            node.  When omitted or empty, the pair is looked up in
            xml2htmlArray by the node's ``class`` attribute, falling back to
            ``('<p>', '</p>')`` for unknown classes.

    Returns:
        The concatenated HTML, or an empty string if nothing was rendered.
    """
    defaultTag = ('<p>', '</p>')
    parts = []

    for par in pars:
        # Nodes without a tagName attribute get '' here and are skipped.
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            className = par.getAttribute('class')
            if tags:
                tag = tags
            else:
                tag = xml2htmlArray.get(className, defaultTag)
            content = getText(self, par.childNodes, className)
            # The original fallback "html=+..." applied unary plus to a
            # string and always raised TypeError; errors now propagate
            # directly from this concatenation.
            parts.append(tag[0] + content + tag[1])
        elif tagName == "pb":
            parts.append("<pb/>")

    return "".join(parts)
|
|
def getXlink(nodes): |
def getXlink(nodes): |
"""searches xlinks and gives them back as html""" |
"""searches xlinks and gives them back as html""" |