# File:  [Repository] / ECHO_content / vlp_xmlhelpers.py
# Revision 1.7: download - view: text, annotated - select for diffs - revision graph
# Tue Nov 21 16:49:58 2006 UTC (17 years, 6 months ago) by casties
# Branches: MAIN
# CVS tags: HEAD
# Log message: new function for VLP links

from sys import argv

import string
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re

# Regular expressions that extract the inner markup of a serialised
# <txt> / <page> wrapper element (case-insensitive, '.' matches newlines).
patternTXT=r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE | re.DOTALL)
patternPage=r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE | re.DOTALL)

# Maps the 'class' attribute of a <par>/<inline> element to the
# (opening, closing) HTML fragments that are wrapped around its content.
xml2htmlArray={
    'WEB_normal':       ('<p>','</p>'),
    'Normal':           ('<p>','</p>'),
    'WEB_picture':      ('<p class="picture">','</p>'),
    'WEB_figuretitle':  ('<p class="picturetitle">','</p>'),
    'WEB_bibliography': ('<p><i>','</i></p>'),
    'Web_kursiv':       ('<i>','</i>'),
    'WEB_kursiv':       ('<i>','</i>'),
    'WEB_hyperlink':    ('',''),
    'Hyperlink':        ('',''),
    'Picture':          ('<p class="picture">','</p>'),
    'FigureTitle':      ('<p class="picturetitle">','</p>'),
}

def addToDict(dict,name,value):
    """Append value to the list stored under name in dict.

    The list is created the first time a name is seen.  Entries with an
    empty name are ignored.

    Returns 1 if the value was stored, 0 if name was the empty string.
    """
    if name=="":
        return 0

    # setdefault() replaces the old has_key() test; has_key() is deprecated
    # and no longer exists on Python 3 dictionaries.
    dict.setdefault(name,[]).append(value)
    return 1

def proj2hash(self,xmlstring):
    """Convert the XML file of a project into a dictionary of lists.

    The <par> children of the first <part> element are stored under their
    'class' attribute.  The nested <section> elements are then visited one
    nesting level at a time: a section of type 'WEB_project_header' is
    stored as 'WEB_project_header' / 'WEB_project_description', every other
    section is appended to 'text' as an <hN> heading followed by its body.
    """
    dom=xml.dom.minidom.parseString(xmlstring)
    part=dom.getElementsByTagName('part')[0]

    result={}

    # title information: paragraphs directly below the part
    for par in Evaluate('par',part):
        parClass=par.getAttribute('class')
        addToDict(result,parClass,getText(self,par.childNodes))

    # walk the section tree level by level: section, section/section, ...
    xpath="section"
    sections=Evaluate(xpath,part)
    while sections:
        for section in sections:
            secType,header,content,level=parseSection(self,section)

            if secType=="WEB_project_header":
                # special case: the project header itself
                addToDict(result,'WEB_project_header',header)
                addToDict(result,'WEB_project_description',content)
                continue

            # ordinary section: heading level is the section level plus two
            depth=int(level)+2
            opening="<h%i>"%depth
            closing="</h%i>"%depth
            addToDict(result,"text",opening+header+closing)
            addToDict(result,"text",content)

        xpath+="/section"
        sections=Evaluate(xpath,part)

    return result


def parseSection(self,section):
    """Split a <section> element into its type, heading and HTML body.

    The type and header are taken from the section's <heading> child (its
    'class' attribute and its text).  If no heading yields a type, the first
    <par> inside the section is used instead.

    Returns a tuple (type, header, content, level) where content is the HTML
    produced by par2html() for the section's children and level is the
    section's 'level' attribute.  type and header stay empty strings when
    the section has neither a heading nor a paragraph.
    """
    secType=""
    header=""
    level=section.getAttribute('level')

    for heading in section.childNodes:
        # text nodes have no tagName, hence getattr with a default
        if getattr(heading,'tagName','')=="heading":
            secType=heading.getAttribute('class')
            header=getText(self,heading.childNodes)

    if secType=="":
        # heading missing: fall back to the first paragraph, if there is one
        pars=section.getElementsByTagName('par')
        if pars:
            secType=pars[0].getAttribute('class')
            # getText() takes self as its first argument; it was missing here
            header=getText(self,pars[0].childNodes)

    content=par2html(self,section.childNodes)
    return (secType,header,content,level)

def parseTable(table,self=None):
    """Read a two-column <html:tr>/<html:td> table into a dictionary of lists.

    For every row the 'class' attribute of the first <par> in the first cell
    is the field name, and the children of the second cell, converted with
    par2html() (each paragraph terminated by ';'), are the value.

    table -- DOM element containing the html:tr rows
    self  -- optional context object, only handed on to par2html(); the
             function previously referenced an undefined name 'self'

    Rows with fewer than two cells are reported with "error" and skipped;
    rows whose field name cannot be determined are reported and not stored.
    """
    fields={}

    for row in table.getElementsByTagName('html:tr'):
        cols=row.getElementsByTagName('html:td')
        if len(cols)<2:
            # without a name cell and a value cell there is nothing to store
            print("error")
            continue

        # read the name of the data field
        names=cols[0].getElementsByTagName('par')
        if names:
            field=names[0].getAttribute('class')
        else:
            print("error")
            field=""

        # convert the entries into HTML
        html=par2html(self,cols[1].childNodes,tags=("",";"))

        # addToDict ignores entries whose field name is empty
        addToDict(fields,field,html)

    return fields

def par2html(self,pars,tags=None):
    """Convert a sequence of DOM nodes into an HTML string.

    <par> and <inline> elements are wrapped in an (opening, closing) tag
    pair: the pair given in tags if supplied, otherwise the entry of
    xml2htmlArray for the element's 'class' attribute, otherwise <p>...</p>.
    <pb> elements are emitted as "<pb/>".  All other nodes are ignored.

    self -- context object, only handed on to getText()
    pars -- iterable of DOM nodes
    tags -- optional (opening, closing) pair overriding the class lookup

    Returns the concatenated HTML, "" if nothing was converted.
    """
    parts=[]

    for par in pars:
        # text nodes have no tagName, hence getattr with a default
        tagName=getattr(par,'tagName','')

        if tagName in ("par","inline"):
            parClass=par.getAttribute('class')
            if tags:
                tag=tags
            else:
                tag=xml2htmlArray.get(parClass,('<p>','</p>'))

            content=getText(self,par.childNodes,parClass)
            # The former fallback "html=+tag[0]+..." applied unary plus to a
            # string and could only raise TypeError, hiding the real error.
            parts.append(tag[0]+content+tag[1])

        elif tagName=="pb":
            parts.append("<pb/>")

    return "".join(parts)

def getXlink(nodes,self=None):
    """Search nodes for xlinks and return them as HTML.

    A node counts as an xlink when it carries an 'xlink:type' attribute.

    nodes -- iterable of DOM nodes
    self  -- optional context object, only handed on to xlink2html();
             xlink2html() requires it as first argument and the call here
             used to omit it

    Returns the concatenated HTML of all xlinks, "" if there are none.
    """
    ret=""
    for node in nodes:
        # text nodes have attributes == None, elements without attributes
        # have an empty (false) attribute map
        if node.attributes and 'xlink:type' in node.attributes.keys():
            ret+=xlink2html(self,node)
    return ret

def checkRef(self,ref):
    """Test whether a reference should be displayed.

    Each table in turn is queried through self.search() for a row whose
    'reference' column equals ref and which fulfils the table's extra
    condition.

    Returns the first non-empty search result; if no table matches, the
    (empty) result of the last query.
    """
    dbs={'vl_literature':"AND online = '1'",
         'vl_technology':"AND complete ='yes'",
         'vl_people':"AND complete ='yes'",
         'vl_sites':"AND complete ='yes'",
         'vl_transcript':"AND complete ='yes'",
         'vl_essays':"AND online ='yes'"
         }

    # Double embedded single quotes so that a reference containing an
    # apostrophe neither breaks the statement nor terminates the literal.
    # NOTE(review): this is a stop-gap; if self.search() can take bound
    # parameters, use those instead of building the SQL text by hand.
    quotedRef=("%s"%(ref,)).replace("'","''")

    res=None
    for db in dbs.keys():
        searchStr=str("select reference from %s where reference ='%s' %s"%(db,quotedRef,dbs[db]))
        res=self.search(var=searchStr)
        if res:
            # first hit decides, no need to ask the remaining tables
            break
    return res
    
def link2html(self,str):
    """Convert the <link> elements of a text fragment into HTML <a> elements.

    Ampersands in the input are escaped, the fragment is parsed inside a
    <txt> wrapper, and every <link> is renamed to <a>.  A link gets an href
    pointing to SERVER_URL/references only if self.checkRef() accepts its
    'ref' attribute; a 'page' attribute is passed on as page parameter.

    Returns the serialised content of the wrapper, "" for empty input.
    """
    if not str:
        return ""

    escaped=re.sub("\&","&amp;",str)
    dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+escaped+"</txt>")

    for link in dom.getElementsByTagName("link"):
        link.tagName="a"
        ref=link.getAttribute("ref")
        pn=link.getAttribute("page")

        if not self.checkRef(ref):
            continue

        href=self.REQUEST['SERVER_URL']+"/references?id="+ref
        if pn:
            href=href+"&page="+pn
        link.setAttribute("href",href)

    # strip the <txt> wrapper again
    match=regexpTXT.search(dom.toxml('utf-8'))
    return match.group(1)

def related2html(self,str):
    """Related library items: convert <link> elements into HTML <a> elements.

    Ampersands in the input are escaped, the fragment is parsed inside a
    <txt> wrapper, and every <link> is renamed to <a>.  For each link the
    authorized vl_literature entry for its 'ref' attribute is looked up via
    self.search():

    * online items get an href to SERVER_URL/references (with the 'page'
      attribute as page parameter, if present) and the title "click to view";
    * other items get the full reference as 'alt' plus the attributes that
      let the page expand it in place.

    Links without a search result are only renamed.

    Returns the serialised content of the wrapper, "" for empty input.
    """
    if not str:
        return ""

    # The parameter is called 'str' and hides the builtin, so the original
    # str("select ...") tried to call the input string.  Recover the builtin
    # string type for the coercion below.
    toStr=type("")

    escaped=re.sub(r"\&","&amp;",str)
    dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+escaped+"</txt>")

    for link in dom.getElementsByTagName("link"):
        link.tagName="a"
        ref=link.getAttribute("ref")
        pn=link.getAttribute("page")

        # Double embedded single quotes so an apostrophe in ref cannot end
        # the SQL literal.  NOTE(review): stop-gap; prefer bound parameters
        # if self.search() supports them.
        quotedRef=ref.replace("'","''")
        searchStr=toStr("select fullreference, online from vl_literature where reference ='%s' and authorized = 1"%quotedRef)
        res=self.search(var=searchStr)

        if not res:
            continue

        # NOTE(review): the original author was unsure whether the columns
        # can be read directly from the result like this; if self.search()
        # returns a list of rows this probably has to be res[0].online /
        # res[0].fullreference -- confirm against self.search().
        if res.online == 1:
            # item is available online
            link.setAttribute("title", "click to view")
            href=self.REQUEST['SERVER_URL']+"/references?id="+ref
            if pn:
                href=href+"&page="+pn
            link.setAttribute("href",href)
        else:
            # item exists only as a bibliographic reference
            link.setAttribute("alt", res.fullreference)
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")

    # strip the <txt> wrapper again
    match=regexpTXT.search(dom.toxml('utf-8'))
    return match.group(1)

    


def xml2html(self,str,quote="yes"):
        """Convert the <link> elements of an XML document into HTML <a> elements.

        Original note: the VLP-specific link2html logic still has to be moved
        out of here.

        str   -- XML source; parsed with the 4Suite NonvalidatingReader
        quote -- ampersands are escaped only when this is "yes2"; the default
                 "yes" leaves the input untouched

        Every <link> is replaced by a new <a> element that takes over the
        link's child nodes.  The <a> gets an href to SERVER_URL/references
        only if self.checkRef() accepts the link's 'ref' attribute.

        Returns the content of the first <page> element of the serialised
        result, the whole serialised document if no <page> is found, and ""
        for empty input.
        """
        if str:
            if quote=="yes2":
                str=re.sub("\&","&amp;",str)
            #dom=xml.dom.minidom.parseString(str)
            dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
            #links=dom.getElementsByTagName("link")
            links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
            for link in links:
                #link.tagName="a"

                ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
                pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")

                # slice copy of the child list; presumably needed because
                # appendChild below moves the nodes out of link.childNodes
                # while we iterate -- TODO confirm against Domlette
                cns=link.childNodes[0:]

                newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
                for x in cns:
                        newLink.appendChild(x)

                # swap the new <a> in for the original <link>
                link.parentNode.replaceChild(newLink,link)

                # only references accepted by checkRef get a target
                if self.checkRef(ref):
                        if pn:
                                newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                        else:
                                newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

            #str= dom.toxml('utf-8')
            # serialise the modified document into a string buffer
            buf = cStringIO.StringIO()
            PrettyPrint(dom, stream=buf)
            str = buf.getvalue()
            buf.close()
            #str=PrettyPrint(dom.documentElement,encoding='UTF-8')
            #print link.toxml('utf-8')
            #print type(str)
            retStr=regexpPage.search(str)

            try: # hack: why is <page> sometimes missing??
                    return retStr.group(1)
            except:
                    # no <page> match (retStr is None): return everything
                    return str
        return ""

    
def xlink2html(self,xlink,parClass=None):
    """Convert a single xlink element into HTML.

    self     -- context object, only handed on to getText()
    xlink    -- DOM element; only <image> and <link> (any case) are handled
    parClass -- class of the enclosing paragraph; "Picture" turns a URL
                link into an <img> instead of an <a>

    <image> becomes an <img> with the element's href as source.  For <link>
    the unquoted href is inspected: an http, https or file URL becomes an
    <a> (or <img>, see parClass) labelled with the link's text; anything
    else is returned as is, after quotes have been put around an unquoted
    ref attribute.

    Returns the HTML string, "" for any other element.
    """
    ret=""
    tagName=xlink.tagName.lower()

    if tagName=="image":
        ret+="""<img src="%s" />"""%xlink.getAttribute('href')

    elif tagName=="link":
        reference=urllib.unquote(xlink.getAttribute('href'))
        label=getText(self,xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in ('http','https','file'):
            if parClass=="Picture":
                ret+="""<img src="%s" />"""%(reference)
            else:
                ret+="""<a href="%s" >%s</a>"""%(reference,label)
        else:
            # Put quotes around the ref attribute if they are missing.  The
            # look-ahead skips values that are already quoted, which used to
            # be wrapped in a second pair of quotes.
            reference=re.sub(r"ref=(?![\"'])([^>]*)>",r'ref="\g<1>">',reference)
            ret+=reference

    return ret

def getText(self,nodelist,parClass=None):
    """Collect the text of a list of DOM nodes as a unicode string.

    self     -- context object, only handed on to par2html()/xlink2html()
    nodelist -- iterable of DOM nodes
    parClass -- class of the enclosing paragraph, handed on to xlink2html()

    Text nodes contribute their data, <inline> elements are converted with
    par2html(), <pb> elements become "<pb/>", and any other element that
    carries a 'type' attribute is treated as an xlink and converted with
    xlink2html().  Everything else (including comments) is skipped.
    """
    rc = u''
    for node in nodelist:

        if node.nodeType == node.TEXT_NODE:
            try:
                rc += node.data
            except UnicodeError:
                # Byte string with non-ASCII content: decode it explicitly.
                # The old fallback replaced rc with node.data and thereby
                # dropped everything collected so far.
                rc += node.data.decode('utf-8','replace')
            continue

        # comment and CDATA nodes have no tagName, hence getattr with default
        tagName=getattr(node,'tagName','')

        if tagName=="inline":
            rc+=par2html(self,[node])

        elif tagName=="pb":
            rc+="<pb/>"

        elif node.attributes and 'type' in node.attributes.keys(): #is a xlink?
            # The former encode('utf-8') attempt with a fallback to the plain
            # value gave the same result as appending the value directly.
            rc+=xlink2html(self,node,parClass)

    return rc


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>