Annotation of ECHO_content/vlp_xmlhelpers.py, revision 1.13
1.1 dwinter 1: from sys import argv
2:
3: import string
1.11 casties 4: import logging
1.1 dwinter 5: import xml.dom.minidom
6: import Ft.Xml.XLink.Processor
7: import Ft.Xml.XLink.XLinkElements
8:
9: from Ft.Xml import XPath
10: from Ft.Xml.XPath import Evaluate
11: from Ft.Xml.XLink import XLINK_NAMESPACE
12: from Ft.Xml.XLink import XLinkElements
1.5 dwinter 13: import cStringIO
14: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
15: from Ft.Xml import EMPTY_NAMESPACE
1.1 dwinter 16: from Ft.Lib import Uri
17: import urllib
18: import re
1.11 casties 19: from ECHO_collection import unicodify,utf8ify
1.1 dwinter 20:
# Regular expressions that capture everything between the opening and closing
# tag of a <txt> / <page> wrapper element.  Matching is case-insensitive and
# "." also matches newlines, so the captured markup may span several lines.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE | re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE | re.DOTALL)

# Maps the "class" attribute of a par/inline element to the
# (opening, closing) HTML markup that par2html wraps around its content.
xml2htmlArray = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('<p class="picture">', '</p>'),
    'FigureTitle': ('<p class="picturetitle">', '</p>'),
}
27:
def addToDict(dict, name, value):
    """Append value to the list stored under name in dict.

    The list is created on first use, so every stored entry is a list of
    all values added under that name, in insertion order.

    Returns 0 (and stores nothing) when name is the empty string,
    otherwise 1.
    """
    if name == "":
        return 0

    # setdefault replaces the deprecated dict.has_key() check-then-create.
    dict.setdefault(name, []).append(value)
    return 1
38:
def proj2hash(self, xmlstring):
    """Convert the XML of a project file into a dict of lists.

    Only the first <part> element of the document is read.  Its direct
    <par> children are stored under their "class" attribute.  Sections are
    then processed one nesting level at a time ("section",
    "section/section", ...): a section of type "WEB_project_header" is
    stored as 'WEB_project_header' (title) and 'WEB_project_description'
    (body); every other section contributes an <hN> heading and its body
    to the 'text' entry, with N = section level + 2.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]

    result = {}

    # direct paragraphs of the part, keyed by their class
    for par in Evaluate('par', part):
        addToDict(result, par.getAttribute('class'), getText(self, par.childNodes))

    # descend one section level per pass until a level is empty
    sectionXPath = "section"
    sections = Evaluate(sectionXPath, part)
    while sections:
        for section in sections:
            kind, title, body, level = parseSection(self, section)

            if kind == "WEB_project_header":  # special case: project header
                addToDict(result, 'WEB_project_header', title)  # store title
                addToDict(result, 'WEB_project_description', body)  # store description
            else:  # no information in heading
                depth = int(level) + 2
                addToDict(result, "text", ("<h%i>" % depth) + title + ("</h%i>" % depth))
                addToDict(result, "text", body)

        sectionXPath += "/section"
        sections = Evaluate(sectionXPath, part)

    return result
78:
79:
def parseSection(self, section):
    """Split a <section> element into (type, header, content, level).

    type and header come from the "class" attribute and the text of the
    section's <heading> child (the last one wins if there are several).
    If that leaves the type empty, the first <par> descendant is used
    instead.  content is the HTML rendering of all child nodes of the
    section, level is the raw "level" attribute string.
    """
    kind = ""
    header = ""
    level = section.getAttribute('level')

    for child in section.childNodes:
        # text/comment nodes carry no tagName, hence getattr
        if getattr(child, 'tagName', '') == "heading":
            kind = child.getAttribute('class')
            header = getText(self, child.childNodes)

    if kind == "":
        # heading missing: fall back to the first paragraph, if there is one
        pars = section.getElementsByTagName('par')
        if pars:
            kind = pars[0].getAttribute('class')
            # getText expects self as its first argument; the call used to
            # omit it and therefore always raised a TypeError
            header = getText(self, pars[0].childNodes)

    content = par2html(self, section.childNodes)
    return (kind, header, content, level)
101:
def parseTable(table, self=None):
    """Read a two-column HTML table into a dict of lists.

    For every <html:tr> row the field name is the "class" attribute of the
    first <par> inside the first <html:td> cell, and the value is the
    second cell's child nodes rendered by par2html with every paragraph
    terminated by ";".  Values are collected with addToDict, so rows
    without a field name are not stored.

    self is optional and only handed through to par2html; the body used
    to reference an undefined name ``self``, which raised a NameError.

    Rows with fewer than two cells are logged and skipped.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            logging.error("parseTable: skipping row with %d cell(s)", len(cols))
            continue

        # read the name of the data field
        try:
            field = cols[0].getElementsByTagName('par')[0].getAttribute('class')
        except IndexError:
            logging.error("parseTable: no par element in first cell")
            field = ""

        # convert the entries of the value cell to HTML
        html = par2html(self, cols[1].childNodes, tags=("", ";"))

        addToDict(fields, field, html)
    return fields
127:
def par2html(self, pars, tags=None):
    """Render a sequence of DOM nodes as an HTML string.

    "par" and "inline" elements are rendered as their text (via getText)
    wrapped in a pair of tags; "pb" elements become "<pb/>"; all other
    nodes are ignored.

    tags, when given and non-empty, is an (opening, closing) pair applied
    to every element.  Otherwise the pair is looked up in xml2htmlArray by
    the element's "class" attribute, defaulting to ('<p>', '</p>').
    """
    parts = []

    for par in pars:
        # text/comment nodes carry no tagName, hence getattr
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            parClass = par.getAttribute('class')
            if tags:
                tag = tags
            else:
                tag = xml2htmlArray.get(parClass, ('<p>', '</p>'))

            content = getText(self, par.childNodes, parClass)

            # The former try/except fallback here ("html=+...") applied a
            # unary plus to a string and so could only raise a TypeError
            # that masked the real error; concatenate directly instead.
            parts.append(tag[0] + content + tag[1])

        elif tagName == "pb":
            parts.append("<pb/>")

    return "".join(parts)
163:
def getXlink(nodes, self=None):
    """Search nodes for xlinks and return them rendered as HTML.

    A node counts as an xlink when it has an 'xlink:type' attribute; each
    such node is rendered with xlink2html and the results are concatenated.

    self is optional and only handed through to xlink2html, which requires
    it as first argument; the call used to omit it and raised a TypeError.
    """
    parts = []
    for node in nodes:
        attrs = node.attributes
        if attrs and 'xlink:type' in attrs.keys():  # is an xlink?
            parts.append(xlink2html(self, node))
    return "".join(parts)
172:
def checkRef(self, ref):
    """Test whether a reference should be displayed.

    Runs one query per table below through self.search(var=...) and
    returns the first non-empty result.  If no table yields a result the
    (empty) result of the last query is returned.

    Each table has its own extra condition that a row must fulfil to
    count, e.g. being marked online or complete.
    """
    dbs = {'vl_literature': "AND online = '1'",
           'vl_technology': "AND complete ='yes'",
           'vl_people': "AND complete ='yes'",
           'vl_sites': "AND complete ='yes'",
           'vl_transcript': "AND complete ='yes'",
           'vl_essays': "AND online ='yes'",
           'vl_categories': ''
           }

    # The query is assembled as text, so double any single quote in ref to
    # keep it from terminating the SQL string literal.
    quoted = ("%s" % (ref,)).replace("'", "''")

    res = None
    for db, condition in dbs.items():
        searchStr = str("select reference from %s where reference ='%s' %s" % (db, quoted, condition))
        res = self.search(var=searchStr)
        if res:
            # first hit decides; no need to query the remaining tables
            return res
    return res
188:
def link2html(self, str):
    """Convert <link> elements in a text fragment into HTML <a> elements.

    The fragment is wrapped in a <txt> element and parsed as XML.  Every
    <link> is renamed to <a>; if self.checkRef() accepts its "ref"
    attribute it gets an href pointing to <SERVER_URL>/references with the
    ref as id and the optional "page" and "mk" attributes as further
    query parameters.

    Returns the converted fragment as unicode, or u"" for empty input.
    """
    if not str:
        return u""

    # Escape ampersands so the fragment is well-formed XML.  The previous
    # substitution replaced "&" with itself and therefore had no effect.
    str = str.replace("&", "&amp;")
    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + utf8ify(str) + "</txt>")

    for link in dom.getElementsByTagName("link"):
        # the serializer writes tagName, so this renames the element
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        mk = link.getAttribute("mk")

        if self.checkRef(ref):
            more = ""
            if pn:
                more += "&page=%s" % pn
            if mk:
                more += "&mk=%s" % mk

            link.setAttribute("href", self.REQUEST['SERVER_URL'] + "/references?id=" + ref + more)

    newxml = dom.toxml('utf-8')

    # strip the <txt> wrapper again
    match = regexpTXT.search(newxml)
    if match is None:
        # defensive: nothing between the wrapper tags
        return u""

    return match.group(1).decode('utf-8')  # we return unicode
1.7 casties 222:
def related2html(self, str):
    """Convert the <link> elements of related library items into HTML.

    The fragment is wrapped in a <txt> element and parsed as XML.  Every
    <link> is renamed to <a> and looked up by its "ref" attribute in
    vl_literature (authorized entries only):

    * online items get an href to <SERVER_URL>/references (with the
      optional "page" attribute), lose their "ref" attribute and are
      preceded by an en dash;
    * other found items carry the full reference in "alt", are marked with
      class "x_offline" and an onclick toggle, and their text is prefixed
      with "+ ";
    * links without a database entry are only renamed.

    Returns the converted fragment as unicode, or u"" for empty input.
    """
    if not str:
        return u""

    # Escape ampersands so the fragment is well-formed XML.  The previous
    # substitution replaced "&" with itself and therefore had no effect.
    str = str.replace("&", "&amp;")
    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + utf8ify(str) + "</txt>")

    for link in dom.getElementsByTagName("link"):
        # the serializer writes tagName, so this renames the element
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")

        # The query is assembled as text, so double any single quote in
        # ref to keep it from terminating the SQL string literal.
        searchStr = "select fullreference, online from vl_literature where reference ='%s' and authorized = 1" % ref.replace("'", "''")
        res = self.search(var=searchStr)
        if not res:
            continue

        if res[0]['online'] == 1:
            # item is available online
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            link.setAttribute("href", href)

            link.setAttribute("title", "click to view")
            link.removeAttribute("ref")

            # prefix preceding the link; insert via the link's own parent
            # so links nested below the wrapper element work as well
            prefix = dom.createTextNode(u"\u2013\u0020")  # = ndash + space
            link.parentNode.insertBefore(prefix, link)

        else:
            # item exists only as a bibliographic entry
            link.setAttribute("alt", unicodify(res[0]['fullreference']))
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")

            # prefix inside link text; a separate text node also covers
            # links that are empty or start with an element
            link.insertBefore(dom.createTextNode('+ '), link.firstChild)

    newxml = dom.toxml('utf-8')

    # strip the <txt> wrapper again
    match = regexpTXT.search(newxml)
    if match is None:
        # defensive: nothing between the wrapper tags
        return u""

    return match.group(1).decode('utf-8')  # we return unicode
1.7 casties 273:
1.5 dwinter 274:
1.1 dwinter 275:
276:
def xml2html(self, str, quote="yes"):
    """Convert the <link> elements of an XML document into HTML <a> elements.

    NOTE: the link2html handling for VLP still has to be moved out of here.

    Every <link> is replaced by an <a> element with the same children; if
    self.checkRef() accepts its "ref" attribute the new element gets an
    href to <SERVER_URL>/references (with the optional "page" attribute).

    With quote == "yes2" ampersands in the input are escaped before
    parsing; any other value parses the input unchanged.

    Returns the content of the <page> element of the serialized result as
    unicode.  If there is no <page> element, or its content cannot be
    decoded as UTF-8, the whole serialized document is returned
    undecoded.  Empty input yields "".
    """
    if not str:
        return ""

    if quote == "yes2":
        # The previous substitution replaced "&" with itself and
        # therefore had no effect.
        str = str.replace("&", "&amp;")

    dom = NonvalidatingReader.parseString(str, "http://www.mpiwg-berlin.mpg.de/")
    links = Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
    for link in links:
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")

        # move the children over to a fresh <a> element; iterate over a
        # copy because appendChild removes them from the link
        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        for child in link.childNodes[0:]:
            newLink.appendChild(child)

        link.parentNode.replaceChild(newLink, link)

        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            newLink.setAttributeNS(EMPTY_NAMESPACE, "href", href)

    buf = cStringIO.StringIO()
    try:
        PrettyPrint(dom, stream=buf)
        str = buf.getvalue()
    finally:
        buf.close()

    match = regexpPage.search(str)
    if match is None:
        # hack: sometimes there is no <page> element
        return str
    try:
        return match.group(1).decode('utf-8')
    except UnicodeError:
        return str
1.5 dwinter 323:
1.1 dwinter 324:
def xlink2html(self, xlink, parClass=None):
    """Render an xlink element as HTML.

    * "image" elements become an <img> with the element's href as src.
    * "link" elements whose url-unquoted href starts with the scheme
      "http" or "file" become an <a> around the element's text, or an
      <img> when parClass is "Picture".
    * For any other "link" the unquoted href itself is returned, with
      quotes added around an unquoted ref attribute value.
    * Every other element yields "".

    Tag names are compared case-insensitively.
    """
    tag = xlink.tagName.lower()

    if tag == "image":
        return """<img src="%s" />""" % xlink.getAttribute('href')

    if tag != "link":
        return ""

    reference = urllib.unquote(xlink.getAttribute('href'))
    label = getText(self, xlink.childNodes)

    # check if href is already a correct url
    if reference.split(":")[0] in ('http', 'file'):
        if parClass == "Picture":
            return """<img src="%s" />""" % (reference)
        return """<a href="%s" >%s</a>""" % (reference, label)

    # transform: insert quotes around the ref attribute if they are missing
    return re.sub(r"ref\=([^>]*)\>", r'ref="\g<1>">', reference)
349:
def getText(self, nodelist, parClass=None):
    """Return the content of nodelist as a unicode string.

    Text nodes contribute their data, "inline" elements are rendered with
    par2html, "pb" elements become "<pb/>", and any other node that has a
    'type' attribute is rendered with xlink2html (parClass is handed
    through to it).  Everything else is skipped, without recursion.
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            # The former nested try/except here could silently throw away
            # all text collected so far; plain concatenation is enough.
            rc += node.data
            continue

        # comment nodes and the like carry no tagName, hence getattr
        tagName = getattr(node, 'tagName', '')
        if tagName == "inline":
            rc += par2html(self, [node])
        elif tagName == "pb":
            rc += "<pb/>"
        elif node.attributes and 'type' in node.attributes.keys():
            # is an xlink?
            # NOTE(review): getXlink tests for 'xlink:type' instead of
            # 'type' - confirm which attribute name is intended.
            rc += xlink2html(self, node, parClass)

    return rc
389:
390:
391: #filename=argv[1]
392: #fileString=file(filename).read()
393: #print proj2hash(fileString)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>