# File: [Repository] / ECHO_content / vlp_xmlhelpers.py
# Revision 1.15 (CVSweb export)
# Mon Sep 8 11:12:41 2008 UTC by dwinter
# Branches: MAIN
# CVS tags: HEAD
# Log message: (empty)

from sys import argv

import string
import logging
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re
from ECHO_collection import unicodify,utf8ify

# Captures the inner markup of a <txt ...>...</txt> element; matching is
# case-insensitive and "." also matches newlines.
patternTXT=r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE | re.DOTALL)
# Same idea for a <page ...>...</page> element.
patternPage=r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, flags=re.IGNORECASE | re.DOTALL)

#xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}
#
#def addToDict(dict,name,value):
#    if name=="":
#        return 0
#    else:
#        
#        if not dict.has_key(name):
#            dict[name]=[] # als array anlegen
#
#        dict[name].append(value)
#        return 1    
#
#def proj2hash(self,xmlstring):
#    """wandelt xml-files fuer die projekte in ein hash"""
#    
#    dom=xml.dom.minidom.parseString(xmlstring)
#    
#        
#    list={}
#
#    #gettitle
#    pars=Evaluate('par',dom.getElementsByTagName('part')[0])
#    for par in pars:
#        className=par.getAttribute('class')
#        content=getText(self,par.childNodes)
#        addToDict(list,className,content)
#             
#
#    sectionXPath="section"
#
#    
#    sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
#    
#    while sections:
#        
#        for section in sections:
#            
#            sec=parseSection(self,section)
#            
#            if sec[0]=="WEB_project_header": # Sonderfall project
#                addToDict(list,'WEB_project_header',sec[1]) # store title
#                addToDict(list,'WEB_project_description',sec[2]) #store description
#            else: # no information in heading
#                level=int(sec[3])+2
#                aTag="<h%i>"%level
#                eTag="</h%i>"%level
#                addToDict(list,"text",aTag+sec[1]+eTag)
#                addToDict(list,"text",sec[2])
#        sectionXPath+="/section"
#        sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
#    return list
#
#
#def parseSection(self,section):
#    type=""
#    header=""
#    level=section.getAttribute('level')
#    for heading in section.childNodes:
#        if getattr(heading,'tagName','')=="heading":
#            
#            type=heading.getAttribute('class')
#            header=getText(self,heading.childNodes)
#
#    if type=="": # falls heading fehlt, pruefe ob erster par richtig
#        par=section.getElementsByTagName('par')[0]
#        type=par.getAttribute('class')
#        header=getText(par.childNodes)
#
#    #print section.childNodes
#    #pars=Evaluate('par',section)
#    pars=section.childNodes
#    content=par2html(self,pars)
#    #print "CONTENT",repr(content)
#    return (type,header,content,level)
#
#def parseTable(table):
#    fields={}
#    rows=table.getElementsByTagName('html:tr')
#    for row in rows:
#        #print "ROW"
#        cols=row.getElementsByTagName('html:td')
#        
#        #Name des Datenfeldes einlesen
#        try:
#            field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
#            #print "field",field
#        except:
#            print "error"
#            field=""
#
#        #Wandeln der Eintrge in HTML
#
#        #pars=cols[1].getElementsByTagName('par')
#        pars=cols[1].childNodes
#        
#        html=par2html(self,pars,tags=("",";"))
#        
#        addToDict(fields,field,html)
#        #print fields
#    return fields
#
#def par2html(self,pars,tags=None):
#    html=""
#
#    for par in pars:
#        tagName=getattr(par,'tagName','')
#        if tagName in ["par","inline"]:
#            #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
#            #print "par",par
#            if not tags:
#                try:
#                    tag=xml2htmlArray[par.getAttribute('class')]
#                except:
#                    tag=('<p>','</p>')
#            else:
#                tag=tags
#            #print "TAG",tag
#            content=getText(self,par.childNodes,par.getAttribute('class'))
#            
#            
#
#            #print par.getAttribute('class'),node
#            try:
#                html+=tag[0]+content+tag[1]
#            except:
#                html=+tag[0]+content+tag[1]
#            
#        elif tagName=="pb":
#            html+="<pb/>"
#        
#    
#    try:
#
#        return html
#    except:
#        return ""

def getXlink(nodes):
    """Search *nodes* for xlink elements and return them rendered as HTML.

    A node counts as an xlink when it carries an ``xlink:type`` attribute.
    Nodes without attributes (text nodes, attribute-less elements) are
    skipped.

    Returns the concatenated HTML of all xlinks found, or "" if there are
    none.
    """
    fragments = []
    for node in nodes:
        attrs = node.attributes
        if attrs and 'xlink:type' in attrs.keys():  # is an xlink?
            # xlink2html expects a leading context argument which it only
            # forwards to getText; this function has no such context, so
            # None is passed explicitly (calling it with the node alone
            # raised a TypeError).
            fragments.append(xlink2html(None, node))
    return "".join(fragments)

def checkRef(self,ref):
    """Test whether the reference *ref* should be displayed.

    *ref* is looked up in each of the tables below via
    ``self.search(var=<sql>)``; each table has its own extra condition that
    a row must satisfy.

    Returns the first non-empty search result. If no table yields a result,
    the (falsy) result of the last query is returned.
    """
    # table name -> additional WHERE condition
    dbs={'vl_literature':'AND online = \'1\'',
         'vl_technology':'AND complete =\'yes\'',
         'vl_people':'AND complete =\'yes\'',
         'vl_sites':'AND complete =\'yes\'',
         'vl_transcript':'AND complete =\'yes\'',
         'vl_essays':'AND online =\'yes\'',
         'vl_categories':''
         }

    # The query is assembled by string formatting, so a single quote inside
    # ref would terminate the SQL string literal. Double it (standard SQL
    # escaping) before interpolation.
    # NOTE(review): a backend that additionally treats backslash as an
    # escape character needs more than this -- confirm against the database
    # in use.
    quotedRef=("%s"%(ref,)).replace("'","''")

    res=None
    for db,condition in dbs.items():
        searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,quotedRef,condition))
        res=self.search(var=searchStr)
        if res:
            # first hit decides; no need to query the remaining tables
            break
    return res
    
def link2html(self,str):
    """Convert <link> elements inside a text fragment into HTML <a> elements.

    The fragment is wrapped in a <txt> element and parsed with minidom.
    For every <link>:

    * the element is renamed to <a>;
    * if it carries an ``href`` attribute it gets ``class="external"``;
    * if it carries a ``ref`` attribute that ``self.checkRef`` accepts, its
      ``href`` is pointed at ``<SERVER_URL>/references?id=<ref>``, extended
      by ``&page=`` / ``&mk=`` when those attributes are present.

    Returns the converted fragment (the content of the <txt> wrapper) as
    unicode, or u"" for empty input.
    """
    if not str:
        return u""

    # every ampersand is escaped so that the fragment parses as XML
    str=re.sub(r"&","&amp;",str)
    dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")

    for link in dom.getElementsByTagName("link"):
        # rename the element so that it is written out as <a>
        link.tagName="a"
        ref=link.getAttribute("ref")
        pn=link.getAttribute("page")
        mk=link.getAttribute("mk")

        if link.getAttribute("href"):
            link.setAttribute("class","external")

        # getAttribute yields "" for a missing attribute. Without a ref
        # there is nothing to look up, so skip the database queries (and do
        # not risk replacing an external href).
        if ref and self.checkRef(ref):
            more = ""
            if pn:
                more += "&page=%s"%pn
            if mk:
                more += "&mk=%s"%mk

            link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+more)

    newxml=dom.toxml('utf-8')

    # strip the <txt> wrapper again
    retStr=regexpTXT.search(newxml)
    return retStr.group(1).decode('utf-8') # we return unicode

def related2html(self,str):
    """Related library items: convert <link> elements to HTML (mb 22.11.2006).

    The fragment is wrapped in a <txt> element and parsed with minidom.
    Every <link> is renamed to <a> and looked up in ``vl_literature`` (only
    rows with ``authorized = 1``) through ``self.search``:

    * item available online (``online == 1``): the link points to
      ``<SERVER_URL>/references?id=<ref>`` (plus ``&page=`` if given), the
      ``ref`` attribute is removed and an en dash + space is placed in
      front of the link;
    * item only known bibliographically: the full reference goes into the
      ``alt`` attribute, the link becomes an expandable ``x_offline`` link
      and its text is prefixed with "+ ";
    * no row found: the link is only renamed.

    Returns the converted fragment (the content of the <txt> wrapper) as
    unicode, or u"" for empty input.
    """
    if not str:
        return u""

    # every ampersand is escaped so that the fragment parses as XML
    str=re.sub(r"&","&amp;",str)
    dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")

    for link in dom.getElementsByTagName("link"):
        link.tagName = "a"
        ref = link.getAttribute("ref")
        pn = link.getAttribute("page")

        # The query is built by string formatting; double single quotes so
        # that ref cannot terminate the SQL string literal.
        # NOTE(review): a backend that also treats backslash as an escape
        # character needs more than this -- confirm against the database.
        searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref.replace("'","''"))
        res = self.search(var=searchStr)
        if not res:
            continue

        if res[0]['online'] == 1:
            # item is available online
            href = self.REQUEST['SERVER_URL']+"/references?id="+ref
            if pn:
                href += "&page="+pn
            link.setAttribute("href", href)
            link.setAttribute("title", "click to view")
            link.removeAttribute("ref")

            # prefix preceding the link; inserted into the link's own parent
            # because insertBefore fails when the link is not a direct child
            # of the node it is called on (e.g. a link nested in markup)
            prefix = dom.createTextNode(u"\u2013\u0020") # = ndash + space
            link.parentNode.insertBefore(prefix, link)
        else:
            # item exists only as a bibliographic reference
            link.setAttribute("alt", unicodify(res[0]['fullreference']))
            link.setAttribute("title", "click to expand")
            link.setAttribute("onclick", "return toggle(this);")
            link.setAttribute("class", "x_offline")

            # prefix inside link text; a separate text node is used so this
            # also works for an empty link or one that starts with an
            # element (insertBefore with a None reference child appends)
            link.insertBefore(dom.createTextNode(u'+ '), link.firstChild)

    newxml=dom.toxml('utf-8')

    # strip the <txt> wrapper again
    retStr=regexpTXT.search(newxml)
    #logging.debug("related2html out=%s"%repr(retStr))
    return retStr.group(1).decode('utf-8') # we return unicode

    


def xml2html(self,str,quote="yes"):
    """Replace <link> elements in an XML string by HTML <a> elements.

    (Original note: the VLP-specific link2html handling still has to be
    moved out of this function.)

    The string is parsed with the 4Suite NonvalidatingReader. Each <link>
    is replaced by a new <a> element that takes over the link's child
    nodes; when ``self.checkRef`` accepts the link's ``ref`` attribute the
    <a> gets an ``href`` to ``<SERVER_URL>/references?id=<ref>`` (plus
    ``&page=`` if a page is given).

    Ampersands are escaped beforehand only when *quote* is "yes2".

    Returns the content of the <page> element of the serialised result,
    decoded from UTF-8; if that cannot be extracted, the complete
    serialised document is returned instead. Empty input yields "".
    """
    if not str:
        return ""

    if quote=="yes2":
        str=re.sub("\&","&amp;",str)

    dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")

    for link in Ft.Xml.XPath.Evaluate(".//link", contextNode=dom):
        ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
        pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")

        # move the children over from a copy of the child list, then swap
        # the <link> for the new <a>
        children=link.childNodes[0:]
        anchor=dom.createElementNS(EMPTY_NAMESPACE,"a")
        for child in children:
            anchor.appendChild(child)
        link.parentNode.replaceChild(anchor,link)

        if self.checkRef(ref):
            target=self.REQUEST['SERVER_URL']+"/references?id="+ref
            if pn:
                target=target+"&page="+pn
            anchor.setAttributeNS(EMPTY_NAMESPACE,"href",target)

    buf = cStringIO.StringIO()
    PrettyPrint(dom, stream=buf)
    serialized = buf.getvalue()
    buf.close()

    match=regexpPage.search(serialized)
    try: # hack: why is <page> sometimes missing?
        result=match.group(1).decode('utf-8')
    except:
        result=serialized
    return result

    
def xlink2html(self,xlink,parClass=None):
    """Render a single xlink element as an HTML fragment.

    * ``image`` element: an <img> whose ``src`` is the element's ``href``.
    * ``link`` element: the ``href`` is URL-unquoted first. If it starts
      with the scheme ``http`` or ``file`` it becomes an <img> (when
      *parClass* is "Picture") or an <a> labelled with the element's text;
      otherwise the unquoted value itself is returned, with quotes added
      around an unquoted ``ref=`` attribute value.
    * any other element: "".

    *self* is only handed on to getText. Tag names are compared
    case-insensitively.
    """
    tag=xlink.tagName.lower()

    if tag=="image":
        return """<img src="%s" />"""%xlink.getAttribute('href')
    if tag!="link":
        return ""

    reference=urllib.unquote(xlink.getAttribute('href'))
    label=getText(self,xlink.childNodes)

    # check if href is already a correct url
    if reference.split(":")[0] in ['http','file']:
        if parClass=="Picture":
            return """<img src="%s" />"""%(reference)
        return """<a href="%s" >%s</a>"""%(reference,label)

    # transform: put quotes around the value of a ref attribute if they are
    # missing. The lookahead skips values that are already quoted, which
    # previously ended up with doubled quotes (ref=""x"").
    return re.sub(r"""ref=(?!["'])([^>]*)>""",r'ref="\g<1>">',reference)

def getText(self,nodelist,parClass=None):
    """Collect the text/HTML content of the nodes in *nodelist*.

    * text nodes contribute their data;
    * ``inline`` elements are rendered through par2html;
    * ``pb`` elements contribute the literal "<pb/>";
    * any other element with a ``type`` attribute is treated as an xlink
      and rendered through xlink2html (*parClass* is passed along);
    * everything else (comments, elements without a ``type`` attribute,
      ...) is ignored.

    Returns the concatenation of all contributions, u'' for an empty list.
    """
    parts = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            parts.append(node.data)
            continue

        # Not every node has a tagName (e.g. comment nodes); look it up
        # defensively instead of failing with an AttributeError.
        tagName = getattr(node,'tagName',None)

        if tagName == "inline":
            # NOTE(review): par2html is only present as commented-out code
            # in the visible part of this file -- confirm it is defined
            # elsewhere before relying on this branch.
            parts.append(par2html(self,[node]))
        elif tagName == "pb":
            parts.append("<pb/>")
        elif node.attributes and 'type' in node.attributes.keys(): #is a xlink?
            parts.append(xlink2html(self,node,parClass))

    return u''.join(parts)


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)

# Exported via FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>