Annotation of ECHO_content/vlp_xmlhelpers.py, revision 1.14
1.1 dwinter 1: from sys import argv
2:
3: import string
1.11 casties 4: import logging
1.1 dwinter 5: import xml.dom.minidom
6: import Ft.Xml.XLink.Processor
7: import Ft.Xml.XLink.XLinkElements
8:
9: from Ft.Xml import XPath
10: from Ft.Xml.XPath import Evaluate
11: from Ft.Xml.XLink import XLINK_NAMESPACE
12: from Ft.Xml.XLink import XLinkElements
1.5 dwinter 13: import cStringIO
14: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
15: from Ft.Xml import EMPTY_NAMESPACE
1.1 dwinter 16: from Ft.Lib import Uri
17: import urllib
18: import re
1.11 casties 19: from ECHO_collection import unicodify,utf8ify
1.1 dwinter 20:
# Both expressions ignore case and let "." span newlines, so a wrapper tag
# may be written in any case and its content may cover several lines.
_WRAPPER_FLAGS = re.IGNORECASE | re.DOTALL

# Group 1 is the text between the first "<txt ...>" tag and the next "</txt>".
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, _WRAPPER_FLAGS)

# Group 1 is the text between the first "<page ...>" tag and the next "</page>".
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, _WRAPPER_FLAGS)
25:
1.14 ! casties 26: #xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}
! 27: #
! 28: #def addToDict(dict,name,value):
! 29: # if name=="":
! 30: # return 0
! 31: # else:
! 32: #
! 33: # if not dict.has_key(name):
! 34: # dict[name]=[] # als array anlegen
! 35: #
! 36: # dict[name].append(value)
! 37: # return 1
! 38: #
! 39: #def proj2hash(self,xmlstring):
! 40: # """wandelt xml-files fuer die projekte in ein hash"""
! 41: #
! 42: # dom=xml.dom.minidom.parseString(xmlstring)
! 43: #
! 44: #
! 45: # list={}
! 46: #
! 47: # #gettitle
! 48: # pars=Evaluate('par',dom.getElementsByTagName('part')[0])
! 49: # for par in pars:
! 50: # className=par.getAttribute('class')
! 51: # content=getText(self,par.childNodes)
! 52: # addToDict(list,className,content)
! 53: #
! 54: #
! 55: # sectionXPath="section"
! 56: #
! 57: #
! 58: # sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
! 59: #
! 60: # while sections:
! 61: #
! 62: # for section in sections:
! 63: #
! 64: # sec=parseSection(self,section)
! 65: #
! 66: # if sec[0]=="WEB_project_header": # Sonderfall project
! 67: # addToDict(list,'WEB_project_header',sec[1]) # store title
! 68: # addToDict(list,'WEB_project_description',sec[2]) #store description
! 69: # else: # no information in heading
! 70: # level=int(sec[3])+2
! 71: # aTag="<h%i>"%level
! 72: # eTag="</h%i>"%level
! 73: # addToDict(list,"text",aTag+sec[1]+eTag)
! 74: # addToDict(list,"text",sec[2])
! 75: # sectionXPath+="/section"
! 76: # sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
! 77: # return list
! 78: #
! 79: #
! 80: #def parseSection(self,section):
! 81: # type=""
! 82: # header=""
! 83: # level=section.getAttribute('level')
! 84: # for heading in section.childNodes:
! 85: # if getattr(heading,'tagName','')=="heading":
! 86: #
! 87: # type=heading.getAttribute('class')
! 88: # header=getText(self,heading.childNodes)
! 89: #
! 90: # if type=="": # falls heading fehlt, pruefe ob erster par richtig
! 91: # par=section.getElementsByTagName('par')[0]
! 92: # type=par.getAttribute('class')
! 93: # header=getText(par.childNodes)
! 94: #
! 95: # #print section.childNodes
! 96: # #pars=Evaluate('par',section)
! 97: # pars=section.childNodes
! 98: # content=par2html(self,pars)
! 99: # #print "CONTENT",repr(content)
! 100: # return (type,header,content,level)
! 101: #
! 102: #def parseTable(table):
! 103: # fields={}
! 104: # rows=table.getElementsByTagName('html:tr')
! 105: # for row in rows:
! 106: # #print "ROW"
! 107: # cols=row.getElementsByTagName('html:td')
! 108: #
! 109: # #Name des Datenfeldes einlesen
! 110: # try:
! 111: # field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
! 112: # #print "field",field
! 113: # except:
! 114: # print "error"
! 115: # field=""
! 116: #
! 117: # #Wandeln der Eintrge in HTML
! 118: #
! 119: # #pars=cols[1].getElementsByTagName('par')
! 120: # pars=cols[1].childNodes
! 121: #
! 122: # html=par2html(self,pars,tags=("",";"))
! 123: #
! 124: # addToDict(fields,field,html)
! 125: # #print fields
! 126: # return fields
! 127: #
! 128: #def par2html(self,pars,tags=None):
! 129: # html=""
! 130: #
! 131: # for par in pars:
! 132: # tagName=getattr(par,'tagName','')
! 133: # if tagName in ["par","inline"]:
! 134: # #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
! 135: # #print "par",par
! 136: # if not tags:
! 137: # try:
! 138: # tag=xml2htmlArray[par.getAttribute('class')]
! 139: # except:
! 140: # tag=('<p>','</p>')
! 141: # else:
! 142: # tag=tags
! 143: # #print "TAG",tag
! 144: # content=getText(self,par.childNodes,par.getAttribute('class'))
! 145: #
! 146: #
! 147: #
! 148: # #print par.getAttribute('class'),node
! 149: # try:
! 150: # html+=tag[0]+content+tag[1]
! 151: # except:
! 152: # html=+tag[0]+content+tag[1]
! 153: #
! 154: # elif tagName=="pb":
! 155: # html+="<pb/>"
! 156: #
! 157: #
! 158: # try:
! 159: #
! 160: # return html
! 161: # except:
! 162: # return ""
1.1 dwinter 163:
def getXlink(nodes):
    """Search *nodes* for xlinks and give them back as html.

    A node counts as an xlink when it has attributes and one of them is
    named 'xlink:type'.  Nodes without attributes (text nodes, for
    example) are ignored.

    Returns the concatenated xlink2html() output of all xlink nodes, in
    order, or "" when there is none.
    """
    parts = []
    for node in nodes:
        attrs = node.attributes
        if attrs and 'xlink:type' in attrs.keys():  # is a xlink?
            # xlink2html is declared as (self, xlink, parClass=None); this
            # function has no `self` to hand over, and xlink2html only
            # forwards it to getText, so None is passed in its place.
            parts.append(xlink2html(None, node))
    return "".join(parts)
172:
def checkRef(self,ref):
    """Test whether the reference *ref* should be displayed.

    Each table below is queried in turn through self.search() for a row
    whose ``reference`` column equals *ref* and that also satisfies the
    table's extra condition.

    Returns the first truthy search result.  When no table yields one, the
    result of the last query is returned (None if nothing was queried).
    """
    # table name -> extra SQL condition appended to the WHERE clause
    dbs = {
        'vl_literature': "AND online = '1'",
        'vl_technology': "AND complete ='yes'",
        'vl_people': "AND complete ='yes'",
        'vl_sites': "AND complete ='yes'",
        'vl_transcript': "AND complete ='yes'",
        'vl_essays': "AND online ='yes'",
        'vl_categories': '',
    }

    found = None
    for table, condition in dbs.items():
        # NOTE(review): ref is interpolated into the SQL text unescaped;
        # confirm that callers only pass trusted values or switch to a
        # parameterised query.
        query = str("select reference from %s where reference ='%s' %s" % (table, ref, condition))
        found = self.search(var=query)
        if found:
            return found
    return found
188:
def link2html(self,str):
    """Convert the <link> elements of an XML fragment into HTML <a> elements.

    *str* is wrapped in a <txt> root element and parsed with minidom.
    Every <link> is renamed to <a>.  When self.checkRef() accepts the
    link's ``ref`` attribute, an ``href`` of the form
    ``<SERVER_URL>/references?id=<ref>[&page=<page>][&mk=<mk>]`` is added,
    where SERVER_URL is read from self.REQUEST.  The original attributes
    are left in place.

    Returns the serialised content of the <txt> wrapper as unicode, or
    u"" when *str* is empty.
    """
    if not str:
        return u""

    # Escape ampersands that do not already start an entity or character
    # reference; a bare "&" would make the XML parser reject the fragment.
    text = re.sub(r"&(?!#?\w+;)", "&amp;", str)
    # utf8ify comes from ECHO_collection; presumably it returns UTF-8 bytes
    # matching the encoding declared in the prolog -- TODO confirm.
    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(text)+"</txt>")

    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        mk = link.getAttribute("mk")

        if self.checkRef(ref):
            more = ""
            if pn:
                more += "&page=%s" % pn
            if mk:
                more += "&mk=%s" % mk
            link.setAttribute("href", self.REQUEST['SERVER_URL']+"/references?id="+ref+more)

    newxml = dom.toxml('utf-8')
    # strip the XML prolog and the <txt> wrapper again
    content = regexpTXT.search(newxml).group(1)
    return content.decode('utf-8')  # we return unicode
1.7 casties 222:
def related2html(self,str):
    """Convert <link> elements of related library items into HTML <a> elements.

    *str* is wrapped in a <txt> root element and parsed with minidom.
    Every <link> is renamed to <a> and its ``ref`` attribute is looked up
    in vl_literature through self.search():

    * no result: the element is only renamed;
    * first row has ``online == 1``: the element gets an ``href`` to
      ``<SERVER_URL>/references?id=<ref>[&page=<page>]`` and the title
      "click to view", loses its ``ref`` attribute, and is preceded by an
      en dash plus space;
    * otherwise: the element gets the full reference as ``alt``, the title
      "click to expand", an ``onclick`` handler, the class "x_offline",
      and its text is prefixed with "+ ".

    Returns the serialised content of the <txt> wrapper as unicode, or
    u"" when *str* is empty.
    """
    if not str:
        return u""

    # Escape ampersands that do not already start an entity or character
    # reference; a bare "&" would make the XML parser reject the fragment.
    text = re.sub(r"&(?!#?\w+;)", "&amp;", str)
    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(text)+"</txt>")

    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")

        # NOTE(review): ref is interpolated into the SQL text unescaped;
        # confirm the input is trusted or use a parameterised query.
        searchStr = "select fullreference, online from vl_literature where reference ='%s' and authorized = 1" % (ref)
        res = self.search(var=searchStr)
        if not res:
            continue

        if res[0]['online'] == 1:
            # item is available online
            href = self.REQUEST['SERVER_URL']+"/references?id="+ref
            if pn:
                href += "&page="+pn
            link.setAttribute("href", href)
            link.setAttribute("title", "click to view")
            link.removeAttribute("ref")

            # prefix preceding the link (ndash + space); inserted via the
            # link's own parent so that links nested below other elements
            # work as well
            prefix = dom.createTextNode(u"\u2013\u0020")
            link.parentNode.insertBefore(prefix, link)
        else:
            # item exists only as a bibliographic record
            link.setAttribute("alt", unicodify(res[0]['fullreference']))
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")

            # prefix inside the link text; added as a separate text node so
            # that empty links and links starting with an element do not
            # fail (insertBefore appends when firstChild is None)
            link.insertBefore(dom.createTextNode(u"+ "), link.firstChild)

    newxml = dom.toxml('utf-8')
    # strip the XML prolog and the <txt> wrapper again
    content = regexpTXT.search(newxml).group(1)
    return content.decode('utf-8')  # we return unicode
1.7 casties 273:
1.5 dwinter 274:
1.1 dwinter 275:
276:
def xml2html(self,str,quote="yes"):
    """Replace the <link> elements of an XML document by HTML <a> elements.

    *str* is parsed with the 4Suite NonvalidatingReader.  Every <link>
    found by the XPath ``.//link`` is replaced by a new <a> element that
    takes over the link's child nodes.  When self.checkRef() accepts the
    link's ``ref`` attribute, the <a> gets an ``href`` of the form
    ``<SERVER_URL>/references?id=<ref>[&page=<page>]``.

    *quote*: only the value "yes2" has an effect; it escapes bare
    ampersands before parsing.

    Returns the content of the first <page> element of the pretty-printed
    result, decoded from UTF-8.  If no <page> element is found or decoding
    fails, the whole pretty-printed document is returned undecoded.
    Returns "" when *str* is empty.

    NOTE: the original author remarked that the VLP-specific link2html
    handling still has to be moved out of this function.
    """
    if not str:
        return ""

    text = str
    if quote == "yes2":
        # escape ampersands that do not already start an entity or
        # character reference
        text = re.sub(r"&(?!#?\w+;)", "&amp;", text)

    dom = NonvalidatingReader.parseString(text, "http://www.mpiwg-berlin.mpg.de/")
    links = Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
    for link in links:
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")

        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        # iterate over a copy: appendChild moves each node out of `link`
        for child in link.childNodes[:]:
            newLink.appendChild(child)
        link.parentNode.replaceChild(newLink, link)

        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL']+"/references?id="+ref
            if pn:
                href += "&page="+pn
            newLink.setAttributeNS(EMPTY_NAMESPACE, "href", href)

    buf = cStringIO.StringIO()
    try:
        PrettyPrint(dom, stream=buf)
        output = buf.getvalue()
    finally:
        buf.close()

    match = regexpPage.search(output)
    if match is None:
        # hack kept from the original: <page> is sometimes missing, in
        # which case the whole document is handed back
        return output
    try:
        return match.group(1).decode('utf-8')
    except UnicodeError:
        return output
1.5 dwinter 323:
1.1 dwinter 324:
def xlink2html(self,xlink,parClass=None):
    """Render one xlink element as an HTML snippet.

    *xlink* is a DOM element; its tag name is compared case-insensitively:

    * "image": returns an <img> whose src is the ``href`` attribute.
    * "link": the ``href`` attribute is URL-unquoted.  If its scheme is
      'http' or 'file' the result is an <img> (when *parClass* is
      "Picture") or an <a> labelled with getText() of the child nodes.
      Otherwise the unquoted value itself is returned, with quotation
      marks added around an unquoted ``ref=`` value.
    * any other tag: returns "".

    *self* is only passed on to getText().
    """
    # urllib.unquote moved to urllib.parse in Python 3
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    tag = xlink.tagName.lower()

    if tag == "image":
        return """<img src="%s" />""" % xlink.getAttribute('href')

    if tag != "link":
        return ""

    reference = unquote(xlink.getAttribute('href'))

    # check if href is already a correct url
    if reference.split(":")[0] in ['http', 'file']:
        if parClass == "Picture":
            return """<img src="%s" />""" % (reference)
        # the label is only needed for this branch, so compute it here
        label = getText(self, xlink.childNodes)
        return """<a href="%s" >%s</a>""" % (reference, label)

    # transform: insert quotation marks around the ref attribute value if
    # they are missing; the lookahead leaves already quoted values alone
    return re.sub(r"ref=(?![\"'])([^>]*)>", r'ref="\g<1>">', reference)
349:
def getText(self,nodelist,parClass=None):
    """Concatenate the text of *nodelist* into one unicode string.

    Handling per node:

    * text nodes contribute their data;
    * <inline> elements are rendered through par2html();
    * <pb> elements contribute the literal "<pb/>";
    * other elements that carry a 'type' attribute are treated as xlinks
      and rendered through xlink2html() with *parClass*;
    * everything else (elements without a 'type' attribute, comments,
      processing instructions) is skipped.  Child nodes are not visited.

    *self* is only passed on to par2html() and xlink2html().
    """
    parts = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            data = node.data
            if not isinstance(data, type(u'')):
                # byte string: decode it so it cannot break or replace the
                # text collected so far
                data = data.decode('utf-8', 'ignore')
            parts.append(data)
            continue

        # comments and processing instructions have no tagName
        tagName = getattr(node, 'tagName', None)
        if tagName is None:
            continue

        if tagName == "inline":
            # NOTE(review): par2html is commented out in this revision of
            # the file, so this branch raises NameError unless the name is
            # provided elsewhere -- confirm and restore or remove.
            parts.append(par2html(self, [node]))
        elif tagName == "pb":
            parts.append("<pb/>")
        elif node.attributes and 'type' in node.attributes.keys():  # is a xlink?
            parts.append(xlink2html(self, node, parClass))

    return u''.join(parts)
389:
390:
391: #filename=argv[1]
392: #fileString=file(filename).read()
393: #print proj2hash(fileString)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>