# File: [Repository] / ECHO_content / vlp_xmlhelpers.py
# Revision 1.12, committed Thu Jul 26 08:29:06 2007 UTC by casties
# Branches: MAIN
# CVS tags: HEAD
# Log message: fixed VLP library update problem with commits to ZODB

from sys import argv

import string
import logging
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re
from ECHO_collection import unicodify,utf8ify

patternTXT=r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage=r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}

def addToDict(dict,name,value):
    if name=="":
        return 0
    else:
        
        if not dict.has_key(name):
            dict[name]=[] # als array anlegen

        dict[name].append(value)
        return 1    

def proj2hash(self,xmlstring):
    """wandelt xml-files fuer die projekte in ein hash"""
    
    dom=xml.dom.minidom.parseString(xmlstring)
    
        
    list={}

    #gettitle
    pars=Evaluate('par',dom.getElementsByTagName('part')[0])
    for par in pars:
        className=par.getAttribute('class')
        content=getText(self,par.childNodes)
        addToDict(list,className,content)
             

    sectionXPath="section"

    
    sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
    
    while sections:
        
        for section in sections:
            
            sec=parseSection(self,section)
            
            if sec[0]=="WEB_project_header": # Sonderfall project
                addToDict(list,'WEB_project_header',sec[1]) # store title
                addToDict(list,'WEB_project_description',sec[2]) #store description
            else: # no information in heading
                level=int(sec[3])+2
                aTag="<h%i>"%level
                eTag="</h%i>"%level
                addToDict(list,"text",aTag+sec[1]+eTag)
                addToDict(list,"text",sec[2])
        sectionXPath+="/section"
        sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
    return list


def parseSection(self,section):
    type=""
    header=""
    level=section.getAttribute('level')
    for heading in section.childNodes:
        if getattr(heading,'tagName','')=="heading":
            
            type=heading.getAttribute('class')
            header=getText(self,heading.childNodes)

    if type=="": # falls heading fehlt, pruefe ob erster par richtig
        par=section.getElementsByTagName('par')[0]
        type=par.getAttribute('class')
        header=getText(par.childNodes)

    #print section.childNodes
    #pars=Evaluate('par',section)
    pars=section.childNodes
    content=par2html(self,pars)
    #print "CONTENT",repr(content)
    return (type,header,content,level)

def parseTable(table):
    fields={}
    rows=table.getElementsByTagName('html:tr')
    for row in rows:
        #print "ROW"
        cols=row.getElementsByTagName('html:td')
        
        #Name des Datenfeldes einlesen
        try:
            field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
            #print "field",field
        except:
            print "error"
            field=""

        #Wandeln der Eintrge in HTML

        #pars=cols[1].getElementsByTagName('par')
        pars=cols[1].childNodes
        
        html=par2html(self,pars,tags=("",";"))
        
        addToDict(fields,field,html)
        #print fields
    return fields

def par2html(self,pars,tags=None):
    html=""

    for par in pars:
        tagName=getattr(par,'tagName','')
        if tagName in ["par","inline"]:
            #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
            #print "par",par
            if not tags:
                try:
                    tag=xml2htmlArray[par.getAttribute('class')]
                except:
                    tag=('<p>','</p>')
            else:
                tag=tags
            #print "TAG",tag
            content=getText(self,par.childNodes,par.getAttribute('class'))
            
            

            #print par.getAttribute('class'),node
            try:
                html+=tag[0]+content+tag[1]
            except:
                html=+tag[0]+content+tag[1]
            
        elif tagName=="pb":
            html+="<pb/>"
        
    
    try:

        return html
    except:
        return ""

def getXlink(nodes):
    """searches xlinks and gives them back as html"""
    ret=""
    for node in nodes:
        if node.attributes:
            if 'xlink:type' in node.attributes.keys(): #is a xlink?
                ret +=xlink2html(node)
    return ret

def checkRef(self,ref):
        """teste ob reference angezeigt werden sollen"""
        dbs={'vl_literature':'AND online = \'1\'',
             'vl_technology':'AND complete =\'yes\'',
             'vl_people':'AND complete =\'yes\'',
             'vl_sites':'AND complete =\'yes\'',
             'vl_transcript':'AND complete =\'yes\'',
             'vl_essays':'AND online =\'yes\'',
	     'vl_categories':''
             }
        res=None
        for db in dbs.keys():
            searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))
            res=res or self.search(var=searchStr)
        return res
    
def link2html(self,str):
        """link2html links in html wandeln"""
        if str:

            str=re.sub("\&","&amp;",str)
            dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
            links=dom.getElementsByTagName("link")
            

            for link in links:
                link.tagName="a"
                ref=link.getAttribute("ref")
                pn=link.getAttribute("page")
                        
                if self.checkRef(ref):
                        if pn:
                                link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                        else:
                                link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)


            newxml=dom.toxml('utf-8')
          
            retStr=regexpTXT.search(newxml)
            retStr = retStr.group(1)

            return retStr.decode('utf-8') # we return unicode

        return u""

def related2html(self,str):
    """related library items: xlinks in html wandeln / mb 22.11.2006"""
    if str:
                
        str=re.sub("\&","&amp;",str)
        dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
        links=dom.getElementsByTagName("link")
                
        for link in links:
            link.tagName = "a"
            ref = link.getAttribute("ref")
            pn = link.getAttribute("page")
                        
            searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref)
            res = self.search(var=searchStr)
                                        
            if res:
                if res[0]['online'] == 1: 
                    # item online verfuegbar
                    if pn:
                        link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                    else:
                        link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
                        
                    link.setAttribute("title", "click to view")
                    link.removeAttribute("ref")
                    
                    # prefix preceding the link
                    prefix = dom.createTextNode(U"\u2013\u0020") # = ndash + space
                    dom.documentElement.insertBefore(prefix, link)

                else:
                    # item nur als bibliographische angabe vorhanden
                    link.setAttribute("alt", unicodify(res[0]['fullreference']))
                    link.setAttribute("title", "click to expand")
                    link.setAttribute("onclick", "return toggle(this);")
                    link.setAttribute("class", "x_offline")
                    
                    # prefix inside link text
                    link.firstChild.data = '+ ' + link.firstChild.data
                    
            
        newxml=dom.toxml('utf-8')
                
        retStr=regexpTXT.search(newxml)
        retStr = retStr.group(1)
        #logging.debug("related2html out=%s"%repr(retStr))
        return retStr.decode('utf-8') # we return unicode

    return u""

    


def xml2html(self,str,quote="yes"):
        """link2html fuer VLP muss hier noch raus"""
        if str:
            if quote=="yes2":
                str=re.sub("\&","&amp;",str)
            #dom=xml.dom.minidom.parseString(str)
            dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
            #links=dom.getElementsByTagName("link")
            links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
            for link in links:
                #link.tagName="a"
        
                ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
                pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")

                cns=link.childNodes[0:]
                
                newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
                for x in cns:
                        newLink.appendChild(x)
                
                        
                
                link.parentNode.replaceChild(newLink,link)

                if self.checkRef(ref):
                        if pn:
                                newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                        else:
                                newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

            #str= dom.toxml('utf-8')
            buf = cStringIO.StringIO()
            PrettyPrint(dom, stream=buf)
            str = buf.getvalue()
            buf.close()
            #str=PrettyPrint(dom.documentElement,encoding='UTF-8')
            #print link.toxml('utf-8')
            #print type(str)
            retStr=regexpPage.search(str)
            
            try: # hack warum fehtl manchmal page??
                    return retStr.group(1).decode('utf-8')
            except:
                    return str
        return ""

    
def xlink2html(self,xlink,parClass=None):
    ret=""
    attributes=xlink.attributes
 
    if xlink.tagName.lower()=="image":
        ret +="""<img src="%s" />"""%xlink.getAttribute('href')
    elif xlink.tagName.lower()=="link":
        reference=urllib.unquote(xlink.getAttribute('href'))
        label=getText(self,xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in ['http','file']:
            if parClass=="Picture":
                ret +="""<img src="%s" />"""%(reference)
            else:

                ret +="""<a href="%s" >%s</a>"""%(reference,label)
        else: # transform
            #href=xml2html(self,reference)
            #print "refer",reference
            reference=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',reference)# einfuegen anfuehrungszeichen um ref attribut, falls fehlt.
            ret +=reference
            
    return ret

def getText(self,nodelist,parClass=None):
    
    rc = u''
    for node in nodelist:
        
    	if node.nodeType == node.TEXT_NODE:

            try:
                try:
                    #rc += node.data.encode('utf-8','ignore')
                    rc += node.data
                                        
                except:
                    #rc= node.data.encode('utf-8','ignore')
                    rc=node.data
            except:
                rc="ERROR"
                #node.data.decode('utf-8','ignore')

            node.data.encode('utf-8','ignore')
            #print "RC",rc
        elif node.tagName =="inline":

            rc+=par2html(self,[node])

        elif node.tagName =="pb":
            rc+="<pb/>"
        elif node.attributes:

            if 'type' in node.attributes.keys(): #is a xlink?

                try:
                    rc +=xlink2html(self,node,parClass).encode('utf-8')
                    
                except:
                    rc +=xlink2html(self,node,parClass)
                    
    #print "RWT",rc        
    return rc


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>