from sys import argv
import string
import logging
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements
from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re
from ECHO_collection import unicodify,utf8ify
# Regular expressions that capture everything between a <txt ...> (resp.
# <page ...>) start tag and the matching end tag.  Matching ignores case and
# lets "." span line breaks.
patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE | re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE | re.DOTALL)

# Maps the "class" attribute of a <par>/<inline> element to the pair of
# (opening, closing) HTML fragments that par2html wraps around its text.
xml2htmlArray = {
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
    'Picture': ('<p class="picture">', '</p>'),
    'FigureTitle': ('<p class="picturetitle">', '</p>'),
}
def addToDict(dict,name,value):
    """Append ``value`` to the list stored under ``name`` in ``dict``.

    The list is created on first use, so every key of ``dict`` maps to a
    list of all values added for it, in insertion order.

    Returns 0 (and leaves ``dict`` untouched) when ``name`` is the empty
    string, otherwise 1.
    """
    if name == "":
        return 0
    # setdefault replaces the deprecated has_key() test-then-create sequence.
    dict.setdefault(name, []).append(value)
    return 1
def proj2hash(self,xmlstring):
    """Convert a project XML document into a dictionary of lists.

    The first <part> element of the document is processed in two passes:

    * every direct <par> child is stored under its "class" attribute;
    * sections are visited level by level ("section", "section/section",
      ...) until a level yields no nodes.  A section whose type is
      "WEB_project_header" contributes its header and content under the
      keys 'WEB_project_header' and 'WEB_project_description'; any other
      section is appended to the "text" key as an <hN> heading (N is the
      section level plus 2) followed by its HTML content.
    """
    dom = xml.dom.minidom.parseString(xmlstring)
    part = dom.getElementsByTagName('part')[0]
    collected = {}

    # direct paragraphs of the part, keyed by their class
    for par in Evaluate('par', part):
        parClass = par.getAttribute('class')
        addToDict(collected, parClass, getText(self, par.childNodes))

    # descend one nesting level of <section> per iteration
    path = "section"
    while True:
        sections = Evaluate(path, part)
        if not sections:
            break
        for section in sections:
            kind, header, content, level = parseSection(self, section)
            if kind == "WEB_project_header":
                # special case: project title and description
                addToDict(collected, 'WEB_project_header', header)
                addToDict(collected, 'WEB_project_description', content)
                continue
            # ordinary section: heading tag derived from the nesting level
            depth = int(level) + 2
            addToDict(collected, "text", ("<h%i>" % depth) + header + ("</h%i>" % depth))
            addToDict(collected, "text", content)
        path += "/section"
    return collected
def parseSection(self,section):
    """Split a <section> element into (type, header, content, level).

    * type    - "class" attribute of the section's <heading> child; when no
                heading supplies one, the class of the first <par>
                descendant is used instead.
    * header  - text of that heading (or of that first <par>).
    * content - HTML produced by par2html for all child nodes.
    * level   - the section's "level" attribute, as returned by the DOM.

    type and header stay empty strings when the section has neither a
    classed heading nor any <par>.
    """
    sectionType = ""
    header = ""
    level = section.getAttribute('level')

    for heading in section.childNodes:
        # text/comment nodes have no tagName, hence getattr
        if getattr(heading, 'tagName', '') == "heading":
            sectionType = heading.getAttribute('class')
            header = getText(self, heading.childNodes)

    if sectionType == "":
        # heading missing (or without class): fall back to the first par
        pars = section.getElementsByTagName('par')
        if pars:
            sectionType = pars[0].getAttribute('class')
            # getText takes self as first argument; the call used to omit it
            header = getText(self, pars[0].childNodes)

    content = par2html(self, section.childNodes)
    return (sectionType, header, content, level)
def parseTable(table, self=None):
    """Read a two-column HTML table into a dictionary of lists.

    For every 'html:tr' row the field name is the "class" attribute of the
    first <par> inside the first 'html:td' cell, and the value is the HTML
    generated by par2html for the children of the second cell (each
    paragraph is terminated with ";").

    ``self`` is only handed through to par2html; it is optional so that
    existing ``parseTable(table)`` calls keep working.

    Rows with fewer than two cells are skipped, and rows whose first cell
    contains no <par> get an empty field name, which addToDict ignores.
    Both cases are logged.
    """
    fields = {}
    for row in table.getElementsByTagName('html:tr'):
        cols = row.getElementsByTagName('html:td')
        if len(cols) < 2:
            logging.warning("parseTable: row with fewer than two cells skipped")
            continue

        # read the name of the data field
        labelPars = cols[0].getElementsByTagName('par')
        if labelPars:
            field = labelPars[0].getAttribute('class')
        else:
            logging.warning("parseTable: no par element in first cell")
            field = ""

        # convert the entries into HTML
        html = par2html(self, cols[1].childNodes, tags=("", ";"))
        addToDict(fields, field, html)
    return fields
def par2html(self,pars,tags=None):
    """Render a sequence of DOM nodes as an HTML string.

    * <par> and <inline> elements become their text (via getText) wrapped
      in a pair of tags: ``tags`` when given, otherwise the entry of
      xml2htmlArray for the element's "class" attribute, otherwise
      ('<p>', '</p>').
    * <pb> elements become "<pb/>".
    * every other node (including nodes without a tagName) is ignored.

    Returns "" when nothing was rendered.
    """
    pieces = []
    for par in pars:
        tagName = getattr(par, 'tagName', '')
        if tagName in ("par", "inline"):
            parClass = par.getAttribute('class')
            if tags:
                tag = tags
            else:
                tag = xml2htmlArray.get(parClass, ('<p>', '</p>'))
            content = getText(self, par.childNodes, parClass)
            # The former fallback ("html=+...") applied unary plus to a
            # string and could only raise TypeError, so a failing
            # concatenation now surfaces with its real error.
            pieces.append(tag[0] + content + tag[1])
        elif tagName == "pb":
            pieces.append("<pb/>")
    return "".join(pieces)
def getXlink(nodes, self=None):
    """Search ``nodes`` for xlinks and return them rendered as HTML.

    A node counts as an xlink when it carries an 'xlink:type' attribute;
    each such node is rendered with xlink2html and the results are
    concatenated.  ``self`` is only passed through to xlink2html, which
    requires it as first argument; it is optional so that existing
    ``getXlink(nodes)`` calls keep working.
    """
    ret = ""
    for node in nodes:
        attributes = node.attributes
        if attributes and 'xlink:type' in attributes.keys():  # is an xlink?
            ret += xlink2html(self, node)
    return ret
def checkRef(self,ref):
    """Test whether the reference ``ref`` should be displayed.

    Each table listed below is queried through ``self.search`` for a row
    whose ``reference`` equals ``ref`` and that satisfies the table's extra
    condition.  The first non-empty result is returned; if no table yields
    one, the (falsy) result of the last query is returned.
    """
    dbs = {'vl_literature': "AND online = '1'",
           'vl_technology': "AND complete ='yes'",
           'vl_people': "AND complete ='yes'",
           'vl_sites': "AND complete ='yes'",
           'vl_transcript': "AND complete ='yes'",
           'vl_essays': "AND online ='yes'",
           'vl_categories': ''
           }

    # ref ends up inside a quoted SQL literal: double embedded apostrophes
    # so that they cannot terminate the literal.
    safeRef = ("%s" % (ref,)).replace("'", "''")

    res = None
    for db, condition in dbs.items():
        searchStr = str("select reference from %s where reference ='%s' %s" % (db, safeRef, condition))
        res = self.search(var=searchStr)
        if res:
            # a hit makes the remaining queries unnecessary
            break
    return res
def link2html(self,str):
    """Convert <link> elements in an XML fragment into HTML <a> elements.

    ``str`` is wrapped in a <txt> element and parsed.  Every <link> is
    renamed to <a>; when ``self.checkRef`` accepts its "ref" attribute an
    "href" pointing to "<SERVER_URL>/references?id=<ref>" is added, with
    "&page=<page>" appended when a "page" attribute is present.

    Returns the content of the <txt> wrapper as unicode, or u"" for empty
    input.
    """
    if not str:
        return u""

    # Escape ampersands that do not start a predefined XML entity or a
    # character reference; the parser rejects them otherwise.  (The former
    # substitution replaced "&" by itself and therefore did nothing.)
    escaped = re.sub(r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)", "&amp;", str)

    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + utf8ify(escaped) + "</txt>")
    for link in dom.getElementsByTagName("link"):
        # the serializer writes tagName, so this renames the element
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")
        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            link.setAttribute("href", href)

    newxml = dom.toxml('utf-8')
    match = regexpTXT.search(newxml)
    if match is None:
        return u""
    return match.group(1).decode('utf-8')  # we return unicode
def related2html(self,str):
    """Convert <link> elements of related library items into HTML.

    ``str`` is wrapped in a <txt> element and parsed.  Every <link> is
    renamed to <a> and looked up in vl_literature (authorized entries
    only) through ``self.search``:

    * no result: the element is left without further attributes;
    * item available online (``online == 1``): the link gets an "href" to
      "<SERVER_URL>/references?id=<ref>" (plus "&page=<page>" if present)
      and the title "click to view", its "ref" attribute is removed and an
      en dash plus space is inserted in front of it;
    * otherwise only the bibliographic entry exists: the full reference is
      stored in "alt", the link gets title, onclick and class attributes
      for expanding it, and its text is prefixed with "+ ".

    Returns the content of the <txt> wrapper as unicode, or u"" for empty
    input.
    """
    if not str:
        return u""

    # Escape ampersands that do not start a predefined XML entity or a
    # character reference; the parser rejects them otherwise.  (The former
    # substitution replaced "&" by itself and therefore did nothing.)
    escaped = re.sub(r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)", "&amp;", str)

    # utf8ify: same normalisation link2html applies before parsing
    dom = xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>" + utf8ify(escaped) + "</txt>")
    for link in dom.getElementsByTagName("link"):
        # the serializer writes tagName, so this renames the element
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")

        # ref goes into a quoted SQL literal: double embedded apostrophes
        searchStr = "select fullreference, online from vl_literature where reference ='%s' and authorized = 1" % (ref.replace("'", "''"))
        res = self.search(var=searchStr)
        if not res:
            continue

        if res[0]['online'] == 1:
            # item is available online
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            link.setAttribute("href", href)
            link.setAttribute("title", "click to view")
            link.removeAttribute("ref")
            # prefix preceding the link; inserted into the link's actual
            # parent so that links nested below <txt> work as well
            prefix = dom.createTextNode(u"\u2013\u0020")  # = ndash + space
            link.parentNode.insertBefore(prefix, link)
        else:
            # item exists only as a bibliographic entry
            link.setAttribute("alt", res[0]['fullreference'].decode('utf-8'))
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")
            # prefix inside the link text
            first = link.firstChild
            if first is not None and first.nodeType == first.TEXT_NODE:
                first.data = '+ ' + first.data
            else:
                # empty link or leading element: add the prefix as its own
                # text node (insertBefore with None appends)
                link.insertBefore(dom.createTextNode(u'+ '), first)

    newxml = dom.toxml('utf-8')
    match = regexpTXT.search(newxml)
    if match is None:
        return u""
    return match.group(1).decode('utf-8')  # we return unicode
def xml2html(self,str,quote="yes"):
    """Convert <link> elements of an XML document into HTML <a> elements.

    (Original note: the VLP-specific link handling still has to be moved
    out of here.)

    ``str`` is parsed with the 4Suite non-validating reader.  Every <link>
    is replaced by a new <a> element that takes over its child nodes; when
    ``self.checkRef`` accepts the "ref" attribute an "href" pointing to
    "<SERVER_URL>/references?id=<ref>" is set, with "&page=<page>"
    appended when a "page" attribute is present.

    When ``quote`` is "yes2", ampersands in ``str`` that do not start a
    predefined XML entity or a character reference are escaped before
    parsing.

    Returns the content of the <page> element of the pretty-printed result
    as unicode.  If no <page> element is found (or its content cannot be
    decoded as UTF-8) the whole serialized document is returned instead.
    Empty input yields "".
    """
    if not str:
        return ""

    if quote == "yes2":
        # The former substitution replaced "&" by itself and did nothing.
        str = re.sub(r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)", "&amp;", str)

    dom = NonvalidatingReader.parseString(str, "http://www.mpiwg-berlin.mpg.de/")
    links = Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
    for link in links:
        ref = link.getAttributeNS(EMPTY_NAMESPACE, "ref")
        pn = link.getAttributeNS(EMPTY_NAMESPACE, "page")

        # move the children over from a copy of the list, since appending
        # them elsewhere changes link.childNodes
        newLink = dom.createElementNS(EMPTY_NAMESPACE, "a")
        for child in link.childNodes[0:]:
            newLink.appendChild(child)
        link.parentNode.replaceChild(newLink, link)

        if self.checkRef(ref):
            href = self.REQUEST['SERVER_URL'] + "/references?id=" + ref
            if pn:
                href += "&page=" + pn
            newLink.setAttributeNS(EMPTY_NAMESPACE, "href", href)

    buf = cStringIO.StringIO()
    try:
        PrettyPrint(dom, stream=buf)
        serialized = buf.getvalue()
    finally:
        buf.close()

    match = regexpPage.search(serialized)
    if match is None:
        # hack kept from the original: <page> is sometimes missing
        return serialized
    try:
        return match.group(1).decode('utf-8')
    except UnicodeError:
        return serialized
def xlink2html(self,xlink,parClass=None):
    """Render a single xlink element as HTML.

    * <image> elements become ``<img src="..." />`` using their "href".
    * <link> elements whose unquoted "href" starts with the scheme http,
      https or file become ``<img src="..." />`` when ``parClass`` is
      "Picture", otherwise ``<a href="..." >label</a>`` with the element's
      text as label.
    * any other <link> href is returned as text, after quotation marks
      have been put around a "ref=...>" value.
    * every other element yields "".

    Tag names are compared case-insensitively.
    """
    tagName = xlink.tagName.lower()

    if tagName == "image":
        return """<img src="%s" />""" % xlink.getAttribute('href')
    if tagName != "link":
        return ""

    reference = urllib.unquote(xlink.getAttribute('href'))

    # check if href is already a proper url
    if reference.split(":")[0] in ('http', 'https', 'file'):
        if parClass == "Picture":
            return """<img src="%s" />""" % (reference)
        label = getText(self, xlink.childNodes)
        return """<a href="%s" >%s</a>""" % (reference, label)

    # transform: insert quotation marks around the ref attribute
    return re.sub(r"ref\=([^>]*)\>", r'ref="\g<1>">', reference)
def getText(self,nodelist,parClass=None):
    """Concatenate the textual/HTML content of ``nodelist``.

    * text nodes contribute their data;
    * <inline> elements are rendered with par2html;
    * <pb> elements become "<pb/>";
    * other elements carrying a 'type' attribute are treated as xlinks and
      rendered with xlink2html (``parClass`` is passed on to it);
    * everything else, including comments and other nodes without a
      tagName, is ignored.

    Returns the accumulated string (u'' for an empty node list).
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            try:
                rc += node.data
            except UnicodeError:
                # Byte-string data that is not plain ASCII: decode it
                # instead of discarding the text collected so far.
                rc += node.data.decode('utf-8', 'replace')
            continue

        # comments, processing instructions etc. have no tagName
        tagName = getattr(node, 'tagName', '')
        if tagName == "inline":
            rc += par2html(self, [node])
        elif tagName == "pb":
            rc += "<pb/>"
        elif getattr(node, 'attributes', None):
            # NOTE(review): getXlink tests for 'xlink:type' while this
            # tests for 'type' - confirm which attribute name is intended.
            if 'type' in node.attributes.keys():  # is an xlink?
                rc += xlink2html(self, node, parClass)
    return rc
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)
# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>