File: ECHO_content / vlp_xmlhelpers.py
Revision 1.18
Mon Oct 11 13:14:59 2010 UTC by dwinter
Branches: MAIN
CVS tags: cleanup, Root_cleanup, HEAD
bugs in generate label and title fixed

from sys import argv

import string
import logging
import xml.dom.minidom
import Ft.Xml.XLink.Processor
import Ft.Xml.XLink.XLinkElements

from Ft.Xml import XPath
from Ft.Xml.XPath import Evaluate
from Ft.Xml.XLink import XLINK_NAMESPACE
from Ft.Xml.XLink import XLinkElements
import cStringIO
from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint, Print
from Ft.Xml import EMPTY_NAMESPACE
from Ft.Lib import Uri
import urllib
import re
from ECHO_collection import unicodify, utf8ify

patternTXT = r"<\s*txt.*?>(.*?)</txt>"
regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
patternPage = r"<\s*page.*?>(.*?)</page>"
regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)

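# Illustrative sketch (not part of the original module): these regexes are
# used further below to pull the payload back out of the <txt>/<page> wrapper
# elements after DOM serialization, e.g.
#
#   regexpTXT.search('<txt>foo <a href="#">bar</a></txt>').group(1)
#   # -> 'foo <a href="#">bar</a>'
#   regexpPage.search('<page>some text</page>').group(1)
#   # -> 'some text'
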
#xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}
#
#def addToDict(dict,name,value):
#    if name=="":
#        return 0
#    else:
#
#        if not dict.has_key(name):
#            dict[name]=[] # create as a list
#
#        dict[name].append(value)
#        return 1
#
#def proj2hash(self,xmlstring):
#    """converts xml files for the projects into a hash"""
#
#    dom=xml.dom.minidom.parseString(xmlstring)
#
#    list={}
#
#    #get title
#    pars=Evaluate('par',dom.getElementsByTagName('part')[0])
#    for par in pars:
#        className=par.getAttribute('class')
#        content=getText(self,par.childNodes)
#        addToDict(list,className,content)
#
#    sectionXPath="section"
#
#    sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
#
#    while sections:
#
#        for section in sections:
#
#            sec=parseSection(self,section)
#
#            if sec[0]=="WEB_project_header": # special case: project
#                addToDict(list,'WEB_project_header',sec[1]) # store title
#                addToDict(list,'WEB_project_description',sec[2]) # store description
#            else: # no information in heading
#                level=int(sec[3])+2
#                aTag="<h%i>"%level
#                eTag="</h%i>"%level
#                addToDict(list,"text",aTag+sec[1]+eTag)
#                addToDict(list,"text",sec[2])
#        sectionXPath+="/section"
#        sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
#    return list
#
#
#def parseSection(self,section):
#    type=""
#    header=""
#    level=section.getAttribute('level')
#    for heading in section.childNodes:
#        if getattr(heading,'tagName','')=="heading":
#
#            type=heading.getAttribute('class')
#            header=getText(self,heading.childNodes)
#
#    if type=="": # if the heading is missing, check whether the first par is the right one
#        par=section.getElementsByTagName('par')[0]
#        type=par.getAttribute('class')
#        header=getText(par.childNodes)
#
#    #print section.childNodes
#    #pars=Evaluate('par',section)
#    pars=section.childNodes
#    content=par2html(self,pars)
#    #print "CONTENT",repr(content)
#    return (type,header,content,level)
#
#def parseTable(table):
#    fields={}
#    rows=table.getElementsByTagName('html:tr')
#    for row in rows:
#        #print "ROW"
#        cols=row.getElementsByTagName('html:td')
#
#        # read the name of the data field
#        try:
#            field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
#            #print "field",field
#        except:
#            print "error"
#            field=""
#
#        # convert the entries to HTML
#
#        #pars=cols[1].getElementsByTagName('par')
#        pars=cols[1].childNodes
#
#        html=par2html(self,pars,tags=("",";"))
#
#        addToDict(fields,field,html)
#        #print fields
#    return fields
#
#def par2html(self,pars,tags=None):
#    html=""
#
#    for par in pars:
#        tagName=getattr(par,'tagName','')
#        if tagName in ["par","inline"]:
#            #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
#            #print "par",par
#            if not tags:
#                try:
#                    tag=xml2htmlArray[par.getAttribute('class')]
#                except:
#                    tag=('<p>','</p>')
#            else:
#                tag=tags
#            #print "TAG",tag
#            content=getText(self,par.childNodes,par.getAttribute('class'))
#
#            #print par.getAttribute('class'),node
#            try:
#                html+=tag[0]+content+tag[1]
#            except:
#                html=tag[0]+content+tag[1]
#
#        elif tagName=="pb":
#            html+="<pb/>"
#
#    try:
#
#        return html
#    except:
#        return ""

def getXlink(self, nodes):
    """searches xlinks and gives them back as html"""
    ret = ""
    for node in nodes:
        if node.attributes:
            if 'xlink:type' in node.attributes.keys(): # is this an xlink?
                ret += xlink2html(self, node)
    return ret

def checkRef(self,ref):
        """check whether the reference should be displayed"""
        dbs={'vl_literature':'AND online = \'1\'',
             'vl_technology':'AND complete =\'yes\'',
             'vl_people':'AND complete =\'yes\'',
             'vl_sites':'AND complete =\'yes\'',
             'vl_transcript':'AND complete =\'yes\'',
             'vl_essays':'AND online =\'yes\'',
             'vl_categories':''
             }
        res=None
        for db in dbs.keys():
            searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))
            res=res or self.search(var=searchStr)
        return res

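# Usage sketch (illustrative only; "tool" stands for any object providing the
# "search" method used above, and the reference id is made up):
#
#   if checkRef(tool, 'lit12345'):
#       pass  # reference found in one of the databases and marked online/complete
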
def link2html(self,str):
        """convert <link> elements in the given text to HTML"""
        if str:

            str=re.sub("\&","&amp;",str)
            dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")

            links=dom.getElementsByTagName("link")

            for link in links:
                link.tagName="a"
                ref=link.getAttribute("ref")
                pn=link.getAttribute("page")
                mk=link.getAttribute("mk")
                href=link.getAttribute("href")
                if href:
                    link.setAttribute("class","external")

                if self.checkRef(ref):
                    more = ""
                    if pn:
                        more += "&page=%s"%pn

                    if mk:
                        more += "&mk=%s"%mk

                    link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+more)

            newxml=dom.toxml('utf-8')

            retStr=regexpTXT.search(newxml)
            retStr=retStr.group(1)

            return retStr.decode('utf-8') # we return unicode

        return u""

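# Usage sketch (illustrative only; "tool" must provide checkRef(), search()
# and REQUEST['SERVER_URL']; the reference id is made up):
#
#   html = link2html(tool, 'see <link ref="lit123" page="7">Mach 1883</link>')
#   # html is a unicode string in which the <link> element has become
#   # <a ... href=".../references?id=lit123...">Mach 1883</a>; links whose
#   # reference is not accepted by checkRef() get no href.
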
def related2html(self,str):
    """related library items: convert xlinks to html / mb 22.11.2006"""
    if str:

        str=re.sub("\&","&amp;",str)
        dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
        links=dom.getElementsByTagName("link")

        for link in links:
            link.tagName = "a"
            ref = link.getAttribute("ref")
            pn = link.getAttribute("page")
            obj = ref[0:3]

            # extension of the related items from literature to further database objects, mb 09.06.2009
            searchStr = ''
            if obj == 'lit':
                searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref)
            elif obj == 'sit':
                searchStr="select reference from vl_sites where reference =\'%s\' and complete = 'yes'"%(ref)
            elif obj == 'per':
                searchStr="select reference from vl_people where reference =\'%s\' and complete = 'yes'"%(ref)
            elif obj == 'tec':
                searchStr="select reference from vl_technology where reference =\'%s\' and complete = 'yes'"%(ref)
            elif obj == 'exp':
                searchStr="select reference from vl_experiments where reference =\'%s\' and complete = 'yes'"%(ref)

            res = self.search(var=searchStr)

            if res:
                if obj == 'lit':
                    if res[0]['online'] == 1:
                        # literature item is available online
                        if pn:
                            link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                        else:
                            link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

                        link.setAttribute("title", "click to view!")
                        link.removeAttribute("ref")

                        # prefix preceding the link
                        prefix = dom.createTextNode(u"\u2013\u0020") # = ndash + space
                        dom.documentElement.insertBefore(prefix, link)

                    else:
                        # literature item is only available as a bibliographic record
                        link.setAttribute("alt", unicodify(res[0]['fullreference']))
                        link.setAttribute("title", "click to expand")
                        link.setAttribute("onclick", "return toggle(this);")
                        link.setAttribute("class", "x_offline")

                        # prefix inside the link text
                        link.firstChild.data = '+ ' + link.firstChild.data
                else:
                    # links to the other database objects
                    link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
                    link.setAttribute("title", "click to view")
                    link.removeAttribute("ref")

                    # prefix preceding the link
                    prefix = dom.createTextNode(u"\u2013\u0020") # = ndash + space
                    dom.documentElement.insertBefore(prefix, link)

            else:
                # object not available/released, or a (web) link with href instead of ref
                try:
                    link.removeAttribute("ref")
                    link.setAttribute("title", ref)
                except:
                    pass

                # prefix preceding the link
                prefix = dom.createTextNode(u"\u2013\u0020") # = ndash + space
                dom.documentElement.insertBefore(prefix, link)

        newxml=dom.toxml('utf-8')

        retStr=regexpTXT.search(newxml)
        retStr = retStr.group(1)
        #logging.debug("related2html out=%s"%repr(retStr))
        return retStr.decode('utf-8') # we return unicode

    return u""

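# Usage sketch (illustrative only; the reference id is made up): every
# <link ref="..."> is resolved against the database matching its prefix
# (lit/sit/per/tec/exp); resolvable items become ndash-prefixed
# <a href=".../references?id=..."> links, offline literature becomes an
# expandable "x_offline" entry, and unresolved refs keep only their text.
#
#   html = related2html(tool, '<link ref="lit123">Some title</link>')
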
def xml2html(self,str,quote="yes"):
        """link2html for the VLP; still needs to be factored out of here"""
        if str:
            if quote=="yes2":
                str=re.sub("\&","&amp;",str)
            #dom=xml.dom.minidom.parseString(str)
            logging.debug(str)
            dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
            #links=dom.getElementsByTagName("link")
            links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
            for link in links:
                #link.tagName="a"

                ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
                pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")

                cns=link.childNodes[0:]

                newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
                for x in cns:
                    newLink.appendChild(x)

                link.parentNode.replaceChild(newLink,link)

                if self.checkRef(ref):
                    if pn:
                        newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
                    else:
                        newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)

            #str= dom.toxml('utf-8')
            buf = cStringIO.StringIO()
            PrettyPrint(dom, stream=buf)
            str = buf.getvalue()
            buf.close()
            #str=PrettyPrint(dom.documentElement,encoding='UTF-8')
            #print link.toxml('utf-8')
            #print type(str)
            retStr=regexpPage.search(str)

            try: # hack: why is <page> sometimes missing??
                return retStr.group(1).decode('utf-8')
            except:
                return str
        return ""

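# Usage sketch (illustrative only; the input must be a well-formed XML
# document whose payload sits in a <page> element, and the reference id is
# made up):
#
#   html = xml2html(tool, '<page>see <link ref="per123">Somebody</link></page>')
#   # -> pretty-printed content of <page> with the <link> replaced by an <a>
#   #    element (with href if checkRef() accepts the reference)
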
def xlink2html(self,xlink,parClass=None):
    ret=""
    attributes=xlink.attributes

    if xlink.tagName.lower()=="image":
        ret +="""<img src="%s" />"""%xlink.getAttribute('href')
    elif xlink.tagName.lower()=="link":
        reference=urllib.unquote(xlink.getAttribute('href'))
        label=getText(self,xlink.childNodes)

        # check if href is already a correct url
        if reference.split(":")[0] in ['http','file']:
            if parClass=="Picture":
                ret +="""<img src="%s" />"""%(reference)
            else:
                ret +="""<a href="%s" >%s</a>"""%(reference,label)
        else: # transform
            #href=xml2html(self,reference)
            #print "refer",reference
            reference=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',reference) # add quotation marks around the ref attribute if they are missing
            ret +=reference

    return ret

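# Usage sketch (illustrative only; "imageNode" is assumed to be a minidom
# element of the form <image href="pic.jpg" xlink:type="simple"/>):
#
#   xlink2html(tool, imageNode)
#   # -> '<img src="pic.jpg" />'
#   # <link> nodes with an http:/file: href become <a> elements (or <img>
#   # when parClass == "Picture"); anything else is passed through with the
#   # quotes around its ref attribute repaired if missing.
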
def getText(self,nodelist,parClass=None):

    rc = u''
    for node in nodelist:

        if node.nodeType == node.TEXT_NODE:

            try:
                try:
                    #rc += node.data.encode('utf-8','ignore')
                    rc += node.data

                except:
                    #rc= node.data.encode('utf-8','ignore')
                    rc=node.data
            except:
                rc="ERROR"
                #node.data.decode('utf-8','ignore')

            node.data.encode('utf-8','ignore')
            #print "RC",rc
        elif node.tagName =="inline":

            rc+=par2html(self,[node])

        elif node.tagName =="pb":
            rc+="<pb/>"
        elif node.attributes:

            if 'type' in node.attributes.keys(): # is this an xlink?

                try:
                    rc +=xlink2html(self,node,parClass).encode('utf-8')

                except:
                    rc +=xlink2html(self,node,parClass)

    #print "RWT",rc
    return rc

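# Usage sketch (illustrative only; "parNode" is assumed to be a minidom <par>
# element): getText() concatenates the text of the given child nodes,
# rendering <pb> as "<pb/>", typed xlink nodes via xlink2html(), and <inline>
# nodes via par2html() (which, in this revision, only exists in the
# commented-out block above).
#
#   text = getText(tool, parNode.childNodes, parNode.getAttribute('class'))
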
#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)

