1:
2: from sys import argv
3:
4: import string
5: import xml.dom.minidom
6: #import Ft.Xml.XLink.Processor
7: #import Ft.Xml.XLink.XLinkElements
8: #
9: #from Ft.Xml import XPath
10: #from Ft.Xml.XPath import Evaluate
11: #from Ft.Xml.XLink import XLINK_NAMESPACE
12: #from Ft.Xml.XLink import XLinkElements
13:
14: #from Ft.Xml.Domlette import NonvalidatingReader,InputSource
15: #from Ft.Xml import EMPTY_NAMESPACE
16:
17: #from Ft.Lib import Uri
18:
19: from xml.etree import ElementTree
20: import logging
21:
# Mapping of document paragraph classes to (opening, closing) HTML tag pairs.
xml2html = {
    'WEB_normal':       ('<p>', '</p>'),
    'Normal':           ('<p>', '</p>'),
    'WEB_picture':      ('<p class="picture">', '</p>'),
    'WEB_figuretitle':  ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv':       ('<i>', '</i>'),
    'WEB_kursiv':       ('<i>', '</i>'),
    'WEB_hyperlink':    ('', ''),
    'Hyperlink':        ('', ''),
}
23:
def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    Creates the list on first use.  Returns 1 on success, 0 when *name*
    is the empty string (entry rejected, dict unchanged).
    """
    if name == "":
        return 0
    # setdefault replaces the Py2-only has_key() check-then-create dance
    # and works on both Python 2 and 3.
    dict.setdefault(name, []).append(value)
    return 1
34:
def proj2hash(xmlstring):
    """Convert a project XML string into a dict.

    Keys are paragraph/section class names; values are lists of the
    corresponding text/HTML contents, collected from the pars, the first
    HTML table, and the first- and second-level sections of the first
    <part> element.
    """
    tree = ElementTree.fromstring(xmlstring)

    # ElementTree XPath positions are 1-based, so the old ".//part[0]/par"
    # expression raises SyntaxError.  Locate the first <part> explicitly.
    first_part = tree.find(".//part")
    pars = first_part.findall("par") if first_part is not None else []

    fields = {}  # renamed from "list" -- don't shadow the builtin

    logging.debug(pars)
    for par in pars:
        logging.debug(par)
        # each <par> carries its style class; its text becomes the value
        addToDict(fields, par.attrib['class'], par.text)

    # Parse the (first) embedded HTML table into additional fields, if any.
    table = tree.find('.//{http://www.w3.org/HTML/1998/html4}table')
    if table is not None:
        fields.update(parseTable(table))

    # Evaluate level-1 sections of the first part.
    sections = first_part.findall("section") if first_part is not None else []
    for section in sections:
        sec = parseSection(section)
        if sec[0] == "WEB_project_header":  # special case: project header
            addToDict(fields, 'WEB_project_header', sec[1])       # store title
            addToDict(fields, 'WEB_project_description', sec[2])  # store description
        else:  # no information in the heading
            addToDict(fields, sec[0], sec[2])

    # Evaluate second-level (nested) sections.
    sections = first_part.findall("section/section") if first_part is not None else []
    for section in sections:
        logging.debug("sections2:" + repr(section))
        sec = parseSection(section)
        if sec[0] == "WEB_project_header":  # special case: project header
            addToDict(fields, 'WEB_project_header', sec[1])       # store title
            addToDict(fields, 'WEB_project_description', sec[2])  # store description
        else:  # no information in the heading
            addToDict(fields, sec[0], sec[2])

    return fields
93:
94:
def parseSection(section):
    """Return (class, header text, html content) for a <section> element.

    The class and header normally come from the section's <heading>; when
    the heading is missing or unclassed, the first <par> supplies them.
    The content is the HTML rendering of all pars in the section.
    """
    sec_type = ""
    header = ""

    heading = section.find(".//heading")
    if heading is not None:  # original crashed (AttributeError) without a heading
        sec_type = heading.attrib.get('class', '')  # tolerate missing class
        logging.debug("parseSection (class):" + sec_type)
        header = heading.text
        # %s formatting tolerates header being None (empty heading element)
        logging.debug("parseSection (header):%s" % header)

    if sec_type == "":  # heading missing/unclassed: check the first par instead
        par = section.find(".//par")
        if par is not None:
            sec_type = par.attrib.get('class', '')
            header = par.text

    # Render every par of this section to HTML.
    pars = section.findall(".//par")
    content = par2html(pars)

    return (sec_type, header, content)
121:
def parseTable(table):
    """Convert an HTML table of (field name, content) rows into a dict.

    For each <tr>, the class of the first <par> in the first cell names
    the field; the pars of the second cell are rendered to HTML (joined
    with ';').  Rows with fewer than two cells are skipped.  A None
    *table* yields an empty dict.
    """
    fields = {}
    if table is None:  # tolerate documents without a table
        return fields
    ns = '{http://www.w3.org/HTML/1998/html4}'
    rows = table.findall('.//' + ns + 'tr')
    for row in rows:
        logging.debug("ROW")
        cols = row.findall('.//' + ns + 'td')
        if len(cols) < 2:
            # original crashed with IndexError on cols[1] here
            continue

        # Read the field name from the class of the first par in column 0.
        try:
            field = cols[0].find('.//par').attrib['class']
        except (AttributeError, KeyError):  # no par / par without a class
            logging.debug("error")
            field = ""

        # Render the entries of the second column to HTML.
        pars = cols[1].findall('.//par')
        html = par2html(pars, tags=("", ";"))
        logging.debug("field:" + field)
        logging.debug("html:" + html)
        addToDict(fields, field, html)
    return fields
152:
def par2html(pars, tags=None):
    """Render a sequence of <par> elements to an HTML string.

    Each par's text is wrapped in the (open, close) tag pair looked up in
    xml2html by the par's class (default ('<p>', '</p>')), or in *tags*
    when given.  Returns "" for None or an empty sequence.
    """
    logging.debug("part2html:" + repr(pars))
    if pars is None:
        return ""
    pieces = []
    for par in pars:
        logging.debug("part2html:" + repr(par))
        if tags:
            open_tag, close_tag = tags
        else:
            # missing class or unknown class both fall back to plain <p>
            open_tag, close_tag = xml2html.get(par.attrib.get('class'),
                                               ('<p>', '</p>'))
        content = par.text
        if content is None:  # empty element: render as empty string
            content = ""
        logging.debug("part2html:" + content)
        pieces.append(open_tag + content + close_tag)
    # join replaces the old NameError-driven "is html defined yet" hack
    # and naturally yields "" for an empty input.
    return "".join(pieces)
184:
def getXlink(nodes):
    """Search *nodes* for xlink elements and return their HTML rendering.

    A node counts as an xlink when it carries an 'xlink:type' attribute;
    each match is converted via xlink2html and the results concatenated.
    """
    parts = []
    for node in nodes:
        attrs = node.attributes
        if attrs and 'xlink:type' in attrs.keys():  # is this an xlink?
            parts.append(xlink2html(node))
    return "".join(parts)
193:
194: def xlink2html(xlink):
195: ret=""
196: attributes=xlink.attributes
197:
198: if xlink.tagName.lower()=="image":
199: ret +="<img src=%s />"%xlink.getAttribute('xlink:href')
200: elif xlink.tagName.lower()=="link":
201: ret +="<a href='%s' >%s</a>"%(xlink.getAttribute('xlink:href'),getText(xlink.childNodes))
202:
203:
204:
205:
206: return ret
207:
def getText(nodelist):
    """Collect the text content of a list of DOM nodes as one string.

    Text nodes contribute their character data directly; <inline>
    elements are rendered via par2html; other elements carrying an
    'xlink:type' attribute are rendered via xlink2html.  All remaining
    node types (comments, processing instructions, ...) are ignored --
    the original raised AttributeError on them.
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            # the old nested try/except around this += could never help;
            # plain accumulation is equivalent
            rc += node.data
        elif node.nodeType == node.ELEMENT_NODE:
            if node.tagName == "inline":
                # NOTE(review): par2html expects ElementTree-style objects
                # (.attrib/.text); passing a minidom node here mirrors the
                # original code but looks suspect -- confirm with callers.
                rc += par2html([node])
            elif node.attributes and 'xlink:type' in node.attributes.keys():
                rc += xlink2html(node)  # is an xlink
    return rc
238:
239:
240: #filename=argv[1]
241: #fileString=file(filename).read()
242: #print proj2hash(fileString)
243:
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>  (stray CVSweb footer; commented out -- it was a syntax error)