version 1.1.1.1, 2004/02/04 10:03:33
|
version 1.6.2.3, 2012/02/15 11:51:47
|
Line 3 from sys import argv
|
Line 3 from sys import argv
|
|
|
import string |
import string |
import xml.dom.minidom |
import xml.dom.minidom |
import Ft.Xml.XLink.Processor |
#import Ft.Xml.XLink.Processor |
import Ft.Xml.XLink.XLinkElements |
#import Ft.Xml.XLink.XLinkElements |
|
# |
from Ft.Xml import XPath |
#from Ft.Xml import XPath |
from Ft.Xml.XPath import Evaluate |
#from Ft.Xml.XPath import Evaluate |
from Ft.Xml.XLink import XLINK_NAMESPACE |
#from Ft.Xml.XLink import XLINK_NAMESPACE |
from Ft.Xml.XLink import XLinkElements |
#from Ft.Xml.XLink import XLinkElements |
|
|
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource |
#from Ft.Xml.Domlette import NonvalidatingReader,InputSource |
#from Ft.Xml import EMPTY_NAMESPACE |
#from Ft.Xml import EMPTY_NAMESPACE |
from Ft.Lib import Uri |
|
|
|
xml2html={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p>','</p>'),'WEB_figuretitle':('<i>','</i>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('','')} |
#from Ft.Lib import Uri |
|
|
|
from xml.etree import ElementTree |
|
import logging |
|
|
|
xml2html={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('','')} |
|
|
def addToDict(dict,name,value): |
def addToDict(dict,name,value): |
if name=="": |
if name=="": |
Line 31 def addToDict(dict,name,value):
|
Line 35 def addToDict(dict,name,value):
|
def proj2hash(xmlstring): |
def proj2hash(xmlstring): |
"""wandelt xml-files fuer die projekte in ein hash""" |
"""wandelt xml-files fuer die projekte in ein hash""" |
|
|
dom=xml.dom.minidom.parseString(xmlstring) |
#dom=xml.dom.minidom.parseString(xmlstring) |
|
|
|
tree = ElementTree.fromstring(xmlstring) |
|
|
|
|
|
pars = tree.findall(".//part[0]/par") |
|
|
list={} |
list={} |
|
|
#gettitle |
#gettitle |
pars=Evaluate('par',dom.getElementsByTagName('part')[0]) |
#part= dom.getElementsByTagName('part')[0] |
|
#pars=part.getElementsByTagName('par') |
|
#pars=Evaluate('par',dom.getElementsByTagName('part')[0]) |
|
logging.debug(pars) |
for par in pars: |
for par in pars: |
className=par.getAttribute('class') |
logging.debug(par) |
content=getText(par.childNodes) |
className=par.attrib['class'] |
|
#.getAttribute('class') |
|
content=par.text |
addToDict(list,className,content) |
addToDict(list,className,content) |
|
|
list.update(parseTable(dom.getElementsByTagName('html:table')[0])) # Parse the Table |
list.update(parseTable(tree.find('.//{http://www.w3.org/HTML/1998/html4}table'))) # Parse the Table |
|
|
#evaluate level 1 |
#evaluate level 1 |
|
sections = tree.findall(".//part[0]/section") |
sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections |
#sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections |
|
#sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections |
#print sections,dom.getElementsByTagName('part')[0] |
#print sections,dom.getElementsByTagName('part')[0] |
for section in sections: |
for section in sections: |
|
|
sec=parseSection(section) |
sec=parseSection(section) |
if sec[0]=="WEB_project_header": # Sonderfall project |
if sec[0]=="WEB_project_header": # Sonderfall project |
|
|
addToDict(list,'WEB_project_header',sec[1]) # store title |
addToDict(list,'WEB_project_header',sec[1]) # store title |
addToDict(list,'WEB_project_description',sec[2]) #store description |
addToDict(list,'WEB_project_description',sec[2]) #store description |
else: # no information in heading |
else: # no information in heading |
|
|
addToDict(list,sec[0],sec[2]) |
addToDict(list,sec[0],sec[2]) |
|
|
#evaluate higher level sections |
#evaluate higher level sections |
|
sections = tree.findall(".//part[0]/section/section") |
sections=Evaluate('section/section',dom.getElementsByTagName('part')[0]) |
#sections=Evaluate('section/section',dom.getElementsByTagName('part')[0]) |
|
|
for section in sections: |
for section in sections: |
|
logging.debug("sections2:"+repr(section)) |
sec=parseSection(section) |
sec=parseSection(section) |
|
|
if sec[0]=="WEB_project_header": # Sonderfall project |
if sec[0]=="WEB_project_header": # Sonderfall project |
Line 76 def proj2hash(xmlstring):
|
Line 93 def proj2hash(xmlstring):
|
|
|
|
|
def parseSection(section): |
def parseSection(section): |
heading=section.getElementsByTagName('heading')[0] |
type="" |
type=heading.getAttribute('class') |
header="" |
header=getText(heading.childNodes) |
#for heading in section.childNodes: |
|
|
|
heading=section.find(".//heading") |
|
# if getattr(heading,'tagName','')=="heading": |
|
|
|
|
|
type=heading.attrib['class'] |
|
logging.debug("parseSection (class):"+type) |
|
header=heading.text |
|
logging.debug("parseSection (header):"+header) |
|
|
|
if type=="": # falls heading fehlt, pruefe ob erster par richtig |
|
par=section.find(".//par") |
|
#par=section.getElementsByTagName('par')[0] |
|
type=par.attrib['class'] |
|
header=par.text |
|
|
#print section.childNodes |
#print section.childNodes |
pars=Evaluate('par',section) |
pars=section.findall(".//par") |
|
#pars=Evaluate('par',section) |
content=par2html(pars) |
content=par2html(pars) |
|
|
return (type,header,content) |
return (type,header,content) |
|
|
def parseTable(table): |
def parseTable(table): |
fields={} |
fields={} |
rows=table.getElementsByTagName('html:tr') |
rows=table.findall('.//{http://www.w3.org/HTML/1998/html4}tr') |
|
#rows=table.getElementsByTagName('html:tr') |
for row in rows: |
for row in rows: |
#print "ROW" |
logging.debug("ROW") |
cols=row.getElementsByTagName('html:td') |
cols=row.findall('.//{http://www.w3.org/HTML/1998/html4}td') |
|
#cols=row.getElementsByTagName('html:td') |
|
|
#Name des Datenfeldes einlesen |
#Name des Datenfeldes einlesen |
try: |
try: |
field=cols[0].getElementsByTagName('par')[0].getAttribute('class') |
field=cols[0].find('.//par').attrib['class'] |
|
#field=cols[0].getElementsByTagName('par')[0].getAttribute('class') |
#print "field",field |
#print "field",field |
except: |
except: |
print "error" |
logging.debug("error") |
field="" |
field="" |
|
|
#Wandeln der Eintrge in HTML |
#Wandeln der Eintrge in HTML |
|
|
pars=cols[1].getElementsByTagName('par') |
pars=cols[1].findall('.//par') |
|
#pars=cols[1].getElementsByTagName('par') |
|
|
|
|
html=par2html(pars,tags=("",";")) |
html=par2html(pars,tags=("",";")) |
|
logging.debug("field:"+field) |
|
logging.debug("html:"+html) |
addToDict(fields,field,html) |
addToDict(fields,field,html) |
#print fields |
#print fields |
return fields |
return fields |
|
|
def par2html(pars,tags=None): |
def par2html(pars,tags=None): |
#html="" |
#html="" |
|
logging.debug("part2html:"+repr(pars)) |
|
if pars is None: |
|
return "" |
for par in pars: |
for par in pars: |
#print "par",par |
logging.debug("part2html:"+repr(par)) |
if not tags: |
if not tags: |
try: |
try: |
tag=xml2html[par.getAttribute('class')] |
tag=xml2html[par.attrib['class']] |
except: |
except: |
tag=('<p>','</p>') |
tag=('<p>','</p>') |
else: |
else: |
tag=tags |
tag=tags |
|
|
content=getText(par.childNodes) |
content=par.text |
|
if content is None: |
|
content="" |
|
logging.debug("part2html:"+content) |
#print "CONTETN",content |
#print "CONTETN",content |
|
|
#print par.getAttribute('class'),node |
#print par.getAttribute('class'),node |
Line 165 def getText(nodelist):
|
Line 209 def getText(nodelist):
|
|
|
rc = u'' |
rc = u'' |
for node in nodelist: |
for node in nodelist: |
print "HHHH" |
|
if node.nodeType == node.TEXT_NODE: |
if node.nodeType == node.TEXT_NODE: |
#print "node",node |
#print "node",node |
#print "NODE",node.data.encode('utf-8','ignore'),"V" |
#print "NODE",node.data.encode('utf-8','ignore'),"V" |
#print "HALII" |
#print "HALII" |
try: |
try: |
try: |
try: |
print "try1" |
|
#rc += node.data.encode('utf-8','ignore') |
#rc += node.data.encode('utf-8','ignore') |
rc += node.data |
rc += node.data |
|
|
except: |
except: |
print "try2" |
|
#rc= node.data.encode('utf-8','ignore') |
#rc= node.data.encode('utf-8','ignore') |
rc=node.data |
rc=node.data |
except: |
except: |
Line 187 def getText(nodelist):
|
Line 228 def getText(nodelist):
|
node.data.encode('utf-8','ignore') |
node.data.encode('utf-8','ignore') |
#print "RC",rc |
#print "RC",rc |
elif node.tagName =="inline": |
elif node.tagName =="inline": |
print "HI", node.getAttribute('class') |
|
rc+=par2html([node]) |
rc+=par2html([node]) |
elif node.attributes: |
elif node.attributes: |
print "xlink?" |
|
if 'xlink:type' in node.attributes.keys(): #is a xlink? |
if 'xlink:type' in node.attributes.keys(): #is a xlink? |
rc +=xlink2html(node) |
rc +=xlink2html(node) |
#print "RWT",rc |
#print "RWT",rc |