Annotation of ECHO_content/vlp_xmlhelpers.py, revision 1.12
1.1 dwinter 1: from sys import argv
2:
3: import string
1.11 casties 4: import logging
1.1 dwinter 5: import xml.dom.minidom
6: import Ft.Xml.XLink.Processor
7: import Ft.Xml.XLink.XLinkElements
8:
9: from Ft.Xml import XPath
10: from Ft.Xml.XPath import Evaluate
11: from Ft.Xml.XLink import XLINK_NAMESPACE
12: from Ft.Xml.XLink import XLinkElements
1.5 dwinter 13: import cStringIO
14: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
15: from Ft.Xml import EMPTY_NAMESPACE
1.1 dwinter 16: from Ft.Lib import Uri
17: import urllib
18: import re
1.11 casties 19: from ECHO_collection import unicodify,utf8ify
1.1 dwinter 20:
# Regular expressions that cut the payload back out of a serialized DOM:
# documents are wrapped in a <txt>...</txt> (resp. <page>...</page>) root
# element, and group 1 captures everything between the tags.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

# Maps the 'class' attribute of a <par> element in the source XML to the
# (opening tag, closing tag) pair used when rendering it as HTML.
xml2htmlArray = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('<p class="picture">', '</p>'),
    'FigureTitle': ('<p class="picturetitle">', '</p>'),
}
27:
def addToDict(dict, name, value):
    """Append *value* to the list stored under *name* in *dict*.

    The dictionary maps each name to a list of collected values; the
    list is created on first use.  Returns 1 on success and 0 when
    *name* is the empty string (such entries are silently ignored).

    NOTE: the first parameter shadows the builtin ``dict``; the name is
    kept for backward compatibility with existing callers.
    """
    if name == "":
        return 0
    # setdefault replaces the deprecated has_key() check and creates
    # the per-name list on demand in a single step.
    dict.setdefault(name, []).append(value)
    return 1
38:
def proj2hash(self,xmlstring):
    """Convert a project XML file into a dict (class name -> list of values).

    The 'class' attribute of every top-level <par> inside the first
    <part> element becomes a key; section elements are then walked one
    nesting level at a time and rendered to HTML that accumulates under
    the 'text' key (project header sections are stored separately).
    """

    dom=xml.dom.minidom.parseString(xmlstring)


    # NOTE: shadows the builtin 'list'; maps class name -> list of contents
    list={}

    #gettitle
    pars=Evaluate('par',dom.getElementsByTagName('part')[0])
    for par in pars:
        className=par.getAttribute('class')
        content=getText(self,par.childNodes)
        addToDict(list,className,content)


    # walk sections level by level: first all <section> children of
    # <part>, then section/section, etc., until a level yields nothing
    sectionXPath="section"


    sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])

    while sections:

        for section in sections:

            sec=parseSection(self,section)

            if sec[0]=="WEB_project_header": # special case: project header
                addToDict(list,'WEB_project_header',sec[1]) # store title
                addToDict(list,'WEB_project_description',sec[2]) #store description
            else: # no information in heading
                # heading depth: section level + 2, i.e. <h3>, <h4>, ...
                level=int(sec[3])+2
                aTag="<h%i>"%level
                eTag="</h%i>"%level
                addToDict(list,"text",aTag+sec[1]+eTag)
                addToDict(list,"text",sec[2])
        sectionXPath+="/section"
        sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
    return list
78:
79:
def parseSection(self, section):
    """Parse one <section> element of a project description.

    Returns a tuple ``(type, header, content, level)`` where *type* is
    the class of the section's <heading> (or, if no heading exists, of
    its first <par>), *header* the corresponding text, *content* the
    whole section body rendered to HTML and *level* the raw value of
    the section's 'level' attribute.

    NOTE: the local 'type' shadows the builtin; kept to stay close to
    the historical code.
    """
    type = ""
    header = ""
    level = section.getAttribute('level')
    for heading in section.childNodes:
        if getattr(heading, 'tagName', '') == "heading":
            type = heading.getAttribute('class')
            header = getText(self, heading.childNodes)

    if type == "":
        # no <heading> found: fall back to class/text of the first <par>
        par = section.getElementsByTagName('par')[0]
        type = par.getAttribute('class')
        # BUGFIX: 'self' was missing in this call, so getText() received
        # the node list in its 'self' slot and raised a TypeError.
        header = getText(self, par.childNodes)

    pars = section.childNodes
    content = par2html(self, pars)
    return (type, header, content, level)
101:
def parseTable(table):
    """Parse an XHTML-style table into a dict field name -> list of HTML.

    Each <html:tr> row is expected to carry the field name in its first
    <html:td> cell (as the 'class' attribute of that cell's first <par>)
    and the field content in the second cell.
    """
    fields = {}
    rows = table.getElementsByTagName('html:tr')
    for row in rows:
        cols = row.getElementsByTagName('html:td')

        # read the name of the data field from the first cell
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except (IndexError, AttributeError):
            # row without the expected cell/par structure
            print("error")
            field = ""

        # convert the entries of the second cell to HTML
        pars = cols[1].childNodes

        # BUGFIX: this function has no 'self' parameter, so the original
        # call par2html(self, ...) raised a NameError.  par2html only
        # forwards 'self' to getText/xlink2html, where it is never used,
        # so passing None is safe and keeps the signature unchanged.
        html = par2html(None, pars, tags=("", ";"))

        addToDict(fields, field, html)
    return fields
127:
def par2html(self, pars, tags=None):
    """Render a list of <par>/<inline>/<pb> DOM nodes to an HTML string.

    self: forwarded to getText() (unused there; may be None).
    tags: optional (open, close) tag pair that overrides the per-class
        lookup in xml2htmlArray; unknown classes fall back to
        ('<p>', '</p>').  Nodes of any other tag name are skipped.
    """
    html = ""

    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ["par", "inline"]:
            if not tags:
                # map the paragraph class to its HTML tag pair
                tag = xml2htmlArray.get(par.getAttribute('class'), ('<p>', '</p>'))
            else:
                tag = tags
            content = getText(self, par.childNodes, par.getAttribute('class'))
            # BUGFIX: the original wrapped this in try/except whose
            # handler contained a '=+' typo; since html is always bound,
            # the handler was dead code and has been removed.
            html += tag[0] + content + tag[1]

        elif tagName == "pb":
            # page-break marker is passed through literally
            html += "<pb/>"

    return html
163:
def getXlink(nodes):
    """Search *nodes* for xlink elements and return them rendered as HTML.

    A node counts as an xlink when it carries an 'xlink:type' attribute.
    """
    ret = ""
    for node in nodes:
        if node.attributes:
            if 'xlink:type' in node.attributes.keys():  # is it an xlink?
                # BUGFIX: xlink2html's signature is (self, xlink, parClass);
                # the original passed the node in the 'self' slot.  'self'
                # is unused in xlink2html, so None is passed explicitly.
                ret += xlink2html(None, node)
    return ret
172:
def checkRef(self, ref):
    """Check whether a reference should be displayed at all.

    Probes each catalogue table for *ref*, with the table-specific
    visibility condition appended, and returns the first non-empty
    search result (or the last falsy result when nothing matches).

    NOTE(review): the query is assembled by string interpolation; if
    *ref* may contain quote characters this is open to SQL injection --
    consider a parameterized query.
    """
    # table name -> extra WHERE clause restricting to visible items
    conditions = {
        'vl_literature': "AND online = '1'",
        'vl_technology': "AND complete ='yes'",
        'vl_people': "AND complete ='yes'",
        'vl_sites': "AND complete ='yes'",
        'vl_transcript': "AND complete ='yes'",
        'vl_essays': "AND online ='yes'",
        'vl_categories': '',
    }
    result = None
    for table in conditions.keys():
        # short-circuit: once a table produced a hit, stop querying
        if not result:
            query = "select reference from %s where reference ='%s' %s" % (
                table, ref, conditions[table])
            result = self.search(var=query)
    return result
188:
def link2html(self,str):
    """Convert <link> elements in *str* into HTML <a> anchors.

    Each <link ref="..." page="..."> whose reference passes checkRef()
    gets an href pointing at the site's /references view.  Returns the
    converted markup as unicode; u"" when *str* is empty.

    NOTE: the parameter shadows the builtin ``str``.
    """
    if str:

        # NOTE(review): this substitution replaces '&' with '&' and is
        # therefore a no-op; possibly '&amp;' -> '&' was intended -- verify.
        str=re.sub("\&","&",str)
        # wrap in a <txt> root element so fragments parse as a document
        dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
        links=dom.getElementsByTagName("link")


        for link in links:
            # rename the element in place; its attributes are kept
            link.tagName="a"
            ref=link.getAttribute("ref")
            pn=link.getAttribute("page")

            # only link to references that are actually visible online
            if self.checkRef(ref):
                if pn:
                    link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                else:
                    link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)


        newxml=dom.toxml('utf-8')

        # strip the <txt> wrapper again via regex on the serialized form
        retStr=regexpTXT.search(newxml)
        retStr = retStr.group(1)

        return retStr.decode('utf-8') # we return unicode

    return u""
1.7 casties 218:
def related2html(self,str):
    """Convert <link> elements of related library items to HTML. (mb 22.11.2006)

    Links whose reference is found online and authorized become normal
    anchors (prefixed with an en-dash); items that exist only as a
    bibliographic record become expandable entries carrying the full
    reference.  Returns unicode; u"" when *str* is empty.

    NOTE: the parameter shadows the builtin ``str``.
    """
    if str:

        # NOTE(review): replaces '&' with '&' -- a no-op; verify intent
        str=re.sub("\&","&",str)
        # wrap in a <txt> root element so fragments parse as a document
        dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
        links=dom.getElementsByTagName("link")

        for link in links:
            link.tagName = "a"
            ref = link.getAttribute("ref")
            pn = link.getAttribute("page")

            # only authorized literature entries are considered
            searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref)
            res = self.search(var=searchStr)

            if res:
                if res[0]['online'] == 1:
                    # item is available online
                    if pn:
                        link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                    else:
                        link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

                    link.setAttribute("title", "click to view")
                    link.removeAttribute("ref")

                    # prefix preceding the link
                    # NOTE(review): assumes the link is a direct child of
                    # the <txt> root -- confirm for nested links
                    prefix = dom.createTextNode(U"\u2013\u0020") # = ndash + space
                    dom.documentElement.insertBefore(prefix, link)

                else:
                    # item exists only as a bibliographic record
                    link.setAttribute("alt", unicodify(res[0]['fullreference']))
                    link.setAttribute("title", "click to expand")
                    link.setAttribute("onclick", "return toggle(this);")
                    link.setAttribute("class", "x_offline")

                    # prefix inside link text
                    link.firstChild.data = '+ ' + link.firstChild.data


        newxml=dom.toxml('utf-8')

        # strip the <txt> wrapper again via regex on the serialized form
        retStr=regexpTXT.search(newxml)
        retStr = retStr.group(1)
        #logging.debug("related2html out=%s"%repr(retStr))
        return retStr.decode('utf-8') # we return unicode

    return u""
1.7 casties 269:
1.5 dwinter 270:
1.1 dwinter 271:
272:
def xml2html(self,str,quote="yes"):
    """Convert <link> elements in *str* to HTML anchors (4Suite variant).

    Original note (translated): "link2html for the VLP -- still needs to
    be factored out of here".  Parses with the 4Suite reader, replaces
    every <link> by an <a> element carrying the same children, adds an
    href for references that pass checkRef(), and returns the content of
    the <page> element of the serialized result.

    quote: only the exact value "yes2" triggers the '&' substitution.
    NOTE(review): the default is "yes", so the substitution never runs
    by default -- confirm this is intended.
    """
    if str:
        if quote=="yes2":
            # NOTE(review): replaces '&' with '&' -- effectively a no-op
            str=re.sub("\&","&",str)
        #dom=xml.dom.minidom.parseString(str)
        dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
        #links=dom.getElementsByTagName("link")
        links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
        for link in links:
            #link.tagName="a"

            ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
            pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")

            # copy the child list before reparenting the children
            cns=link.childNodes[0:]

            # build a new <a> element, move the link's children into it
            newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
            for x in cns:
                newLink.appendChild(x)



            # swap the <a> in for the original <link>
            link.parentNode.replaceChild(newLink,link)

            # only add an href for references that pass the visibility check
            if self.checkRef(ref):
                if pn:
                    newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                else:
                    newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

        # serialize the DOM and cut the <page> payload back out
        #str= dom.toxml('utf-8')
        buf = cStringIO.StringIO()
        PrettyPrint(dom, stream=buf)
        str = buf.getvalue()
        buf.close()
        #str=PrettyPrint(dom.documentElement,encoding='UTF-8')

        retStr=regexpPage.search(str)

        try: # hack: why is <page> sometimes missing??
            return retStr.group(1).decode('utf-8')
        except:
            return str
    return ""
1.5 dwinter 319:
1.1 dwinter 320:
def xlink2html(self, xlink, parClass=None):
    """Render a single xlink node (<image> or <link>) as HTML.

    self: unused; may be None (forwarded to getText for link labels).
    parClass: class of the containing paragraph; inside a 'Picture'
        paragraph a URL link is rendered as an <img> instead of an <a>.
    Nodes of any other tag name yield "".
    """
    ret = ""

    if xlink.tagName.lower() == "image":
        ret += """<img src="%s" />""" % xlink.getAttribute('href')
    elif xlink.tagName.lower() == "link":
        reference = urllib.unquote(xlink.getAttribute('href'))
        label = getText(self, xlink.childNodes)

        # check if href is already a proper URL
        if reference.split(":")[0] in ['http', 'file']:
            if parClass == "Picture":
                ret += """<img src="%s" />""" % (reference)
            else:
                ret += """<a href="%s" >%s</a>""" % (reference, label)
        else:
            # not a URL: the reference itself is an XML fragment; add the
            # quotes around the ref attribute in case they are missing
            reference = re.sub(r"ref\=([^>]*)\>", r'ref="\g<1>">', reference)
            ret += reference

    return ret
345:
def getText(self,nodelist,parClass=None):
    """Collect the textual content of *nodelist* as one (unicode) string.

    Text nodes are concatenated; <inline> nodes are rendered through
    par2html, <pb> becomes a literal "<pb/>", and any other node that
    carries a 'type' attribute is treated as an xlink and rendered via
    xlink2html.  parClass is the class of the enclosing paragraph and is
    forwarded to xlink2html.

    NOTE(review): non-text nodes are assumed to be elements (to have a
    tagName); a comment node in the list would raise AttributeError --
    confirm inputs never contain comments.
    """

    rc = u''
    for node in nodelist:

        if node.nodeType == node.TEXT_NODE:

            # NOTE(review): the nested try/excepts look like leftovers of
            # an encoding experiment; since rc starts as u'' the inner
            # 'rc += node.data' should not normally fail.
            try:
                try:
                    #rc += node.data.encode('utf-8','ignore')
                    rc += node.data

                except:
                    #rc= node.data.encode('utf-8','ignore')
                    rc=node.data
            except:
                rc="ERROR"
                #node.data.decode('utf-8','ignore')

                node.data.encode('utf-8','ignore')
            #print "RC",rc
        elif node.tagName =="inline":
            # render nested inline markup through the paragraph renderer
            rc+=par2html(self,[node])

        elif node.tagName =="pb":
            # page-break marker is passed through literally
            rc+="<pb/>"
        elif node.attributes:

            if 'type' in node.attributes.keys(): #is it an xlink?

                # first try the utf-8 encoded byte string, fall back to
                # appending the unicode result as-is
                try:
                    rc +=xlink2html(self,node,parClass).encode('utf-8')

                except:
                    rc +=xlink2html(self,node,parClass)

        #print "RWT",rc
    return rc
385:
386:
387: #filename=argv[1]
388: #fileString=file(filename).read()
389: #print proj2hash(fileString)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>