Annotation of ECHO_content/vlp_xmlhelpers.py, revision 1.10
1.1 dwinter 1: from sys import argv
2:
3: import string
4: import xml.dom.minidom
5: import Ft.Xml.XLink.Processor
6: import Ft.Xml.XLink.XLinkElements
7:
8: from Ft.Xml import XPath
9: from Ft.Xml.XPath import Evaluate
10: from Ft.Xml.XLink import XLINK_NAMESPACE
11: from Ft.Xml.XLink import XLinkElements
1.5 dwinter 12: import cStringIO
13: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
14: from Ft.Xml import EMPTY_NAMESPACE
1.1 dwinter 15: from Ft.Lib import Uri
16: import urllib
17: import re
18:
# Regexes that pull the payload back out of serialized XML wrapper elements.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

# Map paragraph class names to the (open, close) HTML tag pair used to wrap them.
xml2htmlArray = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('<p class="picture">', '</p>'),
    'FigureTitle': ('<p class="picturetitle">', '</p>'),
}
25:
def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    Values accumulate in lists, so repeated names collect all values.
    Returns 1 on success, 0 when *name* is empty (nothing stored).
    """
    if name == "":
        return 0
    # setdefault replaces the deprecated has_key()/manual-init dance
    dict.setdefault(name, []).append(value)
    return 1
36:
def proj2hash(self, xmlstring):
    """Convert a project XML file into a dict mapping field name -> list of values.

    Paragraphs of the first <part> are stored under their class attribute;
    nested <section> elements are flattened level by level into 'text'
    entries with <h3>..<hN> headings. The special WEB_project_header
    section is stored under its own keys instead.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    fields = {}  # renamed from `list` -- avoid shadowing the builtin

    part = dom.getElementsByTagName('part')[0]  # hoisted: looked up once

    # top-level paragraphs: store content under the paragraph's class name
    for par in Evaluate('par', part):
        addToDict(fields, par.getAttribute('class'), getText(self, par.childNodes))

    # walk sections breadth-wise by deepening the XPath:
    # "section", "section/section", ... until no nodes are found
    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)
    while sections:
        for section in sections:
            sec = parseSection(self, section)
            if sec[0] == "WEB_project_header":  # special case: project header
                addToDict(fields, 'WEB_project_header', sec[1])       # store title
                addToDict(fields, 'WEB_project_description', sec[2])  # store description
            else:  # plain section: heading becomes an <hN> in the text flow
                level = int(sec[3]) + 2
                addToDict(fields, "text", "<h%i>" % level + sec[1] + "</h%i>" % level)
                addToDict(fields, "text", sec[2])
        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)
    return fields
76:
77:
def parseSection(self, section):
    """Parse one <section> element.

    Returns a tuple (headingClass, headingText, htmlContent, level).
    If the section has no <heading> child, the class and text of its
    first <par> are used instead.
    """
    type = ""
    header = ""
    level = section.getAttribute('level')
    for heading in section.childNodes:
        if getattr(heading, 'tagName', '') == "heading":
            type = heading.getAttribute('class')
            header = getText(self, heading.childNodes)

    if type == "":  # no heading: check whether the first par qualifies
        par = section.getElementsByTagName('par')[0]
        type = par.getAttribute('class')
        # BUGFIX: original called getText(par.childNodes) without `self`,
        # which raised TypeError whenever this fallback branch ran
        header = getText(self, par.childNodes)

    content = par2html(self, section.childNodes)
    return (type, header, content, level)
99:
def parseTable(table):
    """Convert a table of html:tr/html:td rows into a dict.

    The first column's first <par> class names the data field; the second
    column's content is converted to HTML (entries joined with ';').
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')

        # read the name of the data field from the first column
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except Exception:  # row without a usable field cell
            print("error")
            field = ""

        # convert the entries of the second column to HTML
        # BUGFIX(review): the original passed an undefined `self` here
        # (NameError at runtime); par2html only forwards it down the call
        # chain where it is never used, so None is safe.
        html = par2html(None, cols[1].childNodes, tags=("", ";"))

        addToDict(fields, field, html)
    return fields
125:
def par2html(self, pars, tags=None):
    """Convert <par>/<inline> nodes to HTML; <pb> becomes '<pb/>'.

    tags: optional (open, close) pair overriding the class-based tag
    lookup in xml2htmlArray (unknown classes fall back to <p>...</p>).
    """
    html = ""
    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ["par", "inline"]:
            if not tags:
                # map the paragraph class to an HTML tag pair; default <p>
                tag = xml2htmlArray.get(par.getAttribute('class'), ('<p>', '</p>'))
            else:
                tag = tags
            content = getText(self, par.childNodes, par.getAttribute('class'))
            # BUGFIX: the original's except branch had `html=+tag[0]+...`
            # (unary-plus typo); since html is initialized above, plain
            # concatenation is all that is needed
            html += tag[0] + content + tag[1]
        elif tagName == "pb":
            html += "<pb/>"
    return html
161:
def getXlink(nodes):
    """Search *nodes* for xlink elements and return them rendered as HTML."""
    ret = ""
    for node in nodes:
        # only element nodes carrying an xlink:type attribute are xlinks
        if node.attributes and 'xlink:type' in node.attributes.keys():
            # BUGFIX: xlink2html requires a leading self argument (it is only
            # forwarded, never used); the original omitted it -> TypeError
            ret += xlink2html(None, node)
    return ret
170:
def checkRef(self, ref):
    """Check whether a reference should be displayed.

    Queries each known database for *ref* with the database-specific
    visibility condition; returns the first non-empty search result,
    otherwise the last (falsy) result / None.
    """
    # per-database extra condition meaning "is publicly visible"
    dbs = {'vl_literature': 'AND online = \'1\'',
           'vl_technology': 'AND complete =\'yes\'',
           'vl_people': 'AND complete =\'yes\'',
           'vl_sites': 'AND complete =\'yes\'',
           'vl_transcript': 'AND complete =\'yes\'',
           'vl_essays': 'AND online =\'yes\''
           }
    res = None
    for db in dbs.keys():
        # SECURITY(review): `ref` is interpolated into SQL unescaped --
        # should be parameterized if self.search supports bound variables.
        searchStr = str("select reference from %s where reference =\'%s\' %s" % (db, ref, dbs[db]))
        res = res or self.search(var=searchStr)
        if res:
            break  # found: no need to build queries for the remaining DBs
    return res
185:
def link2html(self, str):
    """Convert <link> elements in *str* (an XML fragment) to HTML <a> anchors.

    Links whose ref passes self.checkRef get an href pointing at the
    references viewer; others are left as bare <a> elements. Returns the
    converted fragment, or "" for empty input.
    """
    if str:
        # NOTE(review): this sub replaces '&' with '&' -- effectively a
        # no-op; probably '&amp;' decoding was intended. Kept as-is to
        # preserve behavior.
        str = re.sub("\&", "&", str)
        dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + str + "</txt>")
        for link in dom.getElementsByTagName("link"):
            link.tagName = "a"
            ref = link.getAttribute("ref")
            pn = link.getAttribute("page")

            # only link items that are allowed to be shown
            if self.checkRef(ref):
                if pn:
                    link.setAttribute("href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref + "&page=" + pn)
                else:
                    link.setAttribute("href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref)

        newxml = dom.toxml('utf-8')
        retStr = regexpTXT.search(newxml)
        # ROBUSTNESS: guard against a failed match instead of crashing on
        # retStr.group(1) (the sibling xml2html guards the same way)
        if retStr:
            return retStr.group(1)
        return ""
    return ""
1.7 casties 215:
def related2html(self, str):
    """Render related library items: convert <link> xlinks to HTML (mb 22.11.2006).

    Items that are online become active links into the references viewer
    (prefixed with an ndash); offline items become expandable
    bibliographic entries (prefixed with '+').
    """
    if str:
        # NOTE(review): no-op substitution kept from the original
        str = re.sub("\&", "&", str)
        dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + str + "</txt>")
        for link in dom.getElementsByTagName("link"):
            link.tagName = "a"
            ref = link.getAttribute("ref")
            pn = link.getAttribute("page")

            searchStr = "select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1" % (ref)
            res = self.search(var=searchStr)

            if res:
                if res[0]['online'] == 1:
                    # item is available online
                    if pn:
                        link.setAttribute("href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref + "&page=" + pn)
                    else:
                        link.setAttribute("href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref)

                    link.setAttribute("title", "click to view")
                    link.removeAttribute("ref")

                    # prefix preceding the link (ndash + space)
                    prefix = dom.createTextNode(u"\u2013\u0020")
                    dom.documentElement.insertBefore(prefix, link)
                else:
                    # item only available as a bibliographic entry
                    link.setAttribute("alt", res[0]['fullreference'].decode('utf-8'))
                    link.setAttribute("title", "click to expand")
                    link.setAttribute("onclick", "return toggle(this);")
                    link.setAttribute("class", "x_offline")

                    # prefix inside the link text marks the expandable entry
                    link.firstChild.data = '+ ' + link.firstChild.data

        newxml = dom.toxml('utf-8')
        retStr = regexpTXT.search(newxml)
        # ROBUSTNESS: guard against a failed match instead of crashing
        if retStr:
            return retStr.group(1)
        return ""
    return ""
265:
1.5 dwinter 266:
1.1 dwinter 267:
268:
def xml2html(self, str, quote="yes"):
    """Convert <link> elements to <a> anchors (VLP variant; link2html should move out of here).

    Parses *str* with the 4Suite Domlette reader, replaces every <link>
    with an <a> carrying the same children, and adds an href for refs
    that pass self.checkRef. Returns the content of the <page> element,
    or the whole serialized document if no <page> is found.
    """
    if not str:
        return ""
    if quote == "yes2":  # NOTE(review): never true for the default quote="yes" -- confirm intent
        str = re.sub("\&", "&", str)

    dom = NonvalidatingReader.parseString(str, "http://www.mpiwg-berlin.mpg.de/")
    for link in Ft.Xml.XPath.Evaluate(".//link", contextNode=dom):
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")

        # replace <link> with an <a> element carrying the same children
        children = link.childNodes[0:]
        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        for child in children:
            newLink.appendChild(child)
        link.parentNode.replaceChild(newLink, link)

        if self.checkRef(ref):
            if pn:
                newLink.setAttributeNS(EMPTY_NAMESPACE, "href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref + "&page=" + pn)
            else:
                newLink.setAttributeNS(EMPTY_NAMESPACE, "href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref)

    # serialize the modified document
    buf = cStringIO.StringIO()
    PrettyPrint(dom, stream=buf)
    str = buf.getvalue()
    buf.close()

    retStr = regexpPage.search(str)
    try:  # hack: why is <page> sometimes missing??
        return retStr.group(1)
    except:
        return str
1.5 dwinter 315:
1.1 dwinter 316:
def xlink2html(self, xlink, parClass=None):
    """Render a single xlink element as HTML.

    <image> becomes an <img>; <link> becomes an <a> (or an <img> when
    parClass is 'Picture'), unless its href is not an http/file URL, in
    which case the href content is emitted as inline markup. *self* is
    only forwarded to getText and may be None.
    """
    ret = ""
    # (removed: unused local `attributes = xlink.attributes`)
    if xlink.tagName.lower() == "image":
        ret += """<img src="%s" />""" % xlink.getAttribute('href')
    elif xlink.tagName.lower() == "link":
        reference = urllib.unquote(xlink.getAttribute('href'))
        label = getText(self, xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in ['http', 'file']:
            if parClass == "Picture":
                ret += """<img src="%s" />""" % (reference)
            else:
                ret += """<a href="%s" >%s</a>""" % (reference, label)
        else:  # not a URL: emit as markup; quote a bare ref= attribute if missing
            reference = re.sub("ref\=([^>]*)\>", 'ref=\"\g<1>\">', reference)
            ret += reference

    return ret
341:
def getText(self, nodelist, parClass=None):
    """Collect the text content of *nodelist* as a string.

    Text nodes are concatenated; <inline> nodes are rendered through
    par2html, <pb> becomes '<pb/>', and elements carrying a 'type'
    attribute (xlinks) are rendered through xlink2html. *self* is only
    forwarded and may be None.
    """
    rc = u''
    for node in nodelist:
        tagName = getattr(node, 'tagName', '')  # guard: not all nodes have tagName
        if node.nodeType == node.TEXT_NODE:
            # BUGFIX: the original's nested bare excepts could silently
            # overwrite rc with a single node's data or the literal
            # "ERROR", and ended with a discarded .encode() result; plain
            # concatenation is what every normal path did
            rc += node.data
        elif tagName == "inline":
            rc += par2html(self, [node])
        elif tagName == "pb":
            rc += "<pb/>"
        elif node.attributes and 'type' in node.attributes.keys():
            # xlink element; the original tried .encode('utf-8') and fell
            # back to the unencoded value -- the net effect is plain
            # concatenation
            rc += xlink2html(self, node, parClass)
    return rc
381:
382:
383: #filename=argv[1]
384: #fileString=file(filename).read()
385: #print proj2hash(fileString)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>