File:  [Repository] / ECHO_content / vlp_xmlhelpers.py
Revision 1.13: download - view: text, annotated - select for diffs - revision graph
Tue Dec 11 17:00:01 2007 UTC (16 years, 6 months ago) by casties
Branches: MAIN
CVS tags: HEAD
more fixes for missing mk-params in xlinks

    1: from sys import argv
    2: 
    3: import string
    4: import logging
    5: import xml.dom.minidom
    6: import Ft.Xml.XLink.Processor
    7: import Ft.Xml.XLink.XLinkElements
    8: 
    9: from Ft.Xml import XPath
   10: from Ft.Xml.XPath import Evaluate
   11: from Ft.Xml.XLink import XLINK_NAMESPACE
   12: from Ft.Xml.XLink import XLinkElements
   13: import cStringIO
   14: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
   15: from Ft.Xml import EMPTY_NAMESPACE
   16: from Ft.Lib import Uri
   17: import urllib
   18: import re
   19: from ECHO_collection import unicodify,utf8ify
   20: 
   21: patternTXT=r"<\s*txt.*?>(.*?)</txt>"
   22: regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
   23: patternPage=r"<\s*page.*?>(.*?)</page>"
   24: regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)
   25: 
   26: xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}
   27: 
   28: def addToDict(dict,name,value):
   29:     if name=="":
   30:         return 0
   31:     else:
   32:         
   33:         if not dict.has_key(name):
   34:             dict[name]=[] # als array anlegen
   35: 
   36:         dict[name].append(value)
   37:         return 1    
   38: 
   39: def proj2hash(self,xmlstring):
   40:     """wandelt xml-files fuer die projekte in ein hash"""
   41:     
   42:     dom=xml.dom.minidom.parseString(xmlstring)
   43:     
   44:         
   45:     list={}
   46: 
   47:     #gettitle
   48:     pars=Evaluate('par',dom.getElementsByTagName('part')[0])
   49:     for par in pars:
   50:         className=par.getAttribute('class')
   51:         content=getText(self,par.childNodes)
   52:         addToDict(list,className,content)
   53:              
   54: 
   55:     sectionXPath="section"
   56: 
   57:     
   58:     sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
   59:     
   60:     while sections:
   61:         
   62:         for section in sections:
   63:             
   64:             sec=parseSection(self,section)
   65:             
   66:             if sec[0]=="WEB_project_header": # Sonderfall project
   67:                 addToDict(list,'WEB_project_header',sec[1]) # store title
   68:                 addToDict(list,'WEB_project_description',sec[2]) #store description
   69:             else: # no information in heading
   70:                 level=int(sec[3])+2
   71:                 aTag="<h%i>"%level
   72:                 eTag="</h%i>"%level
   73:                 addToDict(list,"text",aTag+sec[1]+eTag)
   74:                 addToDict(list,"text",sec[2])
   75:         sectionXPath+="/section"
   76:         sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
   77:     return list
   78: 
   79: 
   80: def parseSection(self,section):
   81:     type=""
   82:     header=""
   83:     level=section.getAttribute('level')
   84:     for heading in section.childNodes:
   85:         if getattr(heading,'tagName','')=="heading":
   86:             
   87:             type=heading.getAttribute('class')
   88:             header=getText(self,heading.childNodes)
   89: 
   90:     if type=="": # falls heading fehlt, pruefe ob erster par richtig
   91:         par=section.getElementsByTagName('par')[0]
   92:         type=par.getAttribute('class')
   93:         header=getText(par.childNodes)
   94: 
   95:     #print section.childNodes
   96:     #pars=Evaluate('par',section)
   97:     pars=section.childNodes
   98:     content=par2html(self,pars)
   99:     #print "CONTENT",repr(content)
  100:     return (type,header,content,level)
  101: 
  102: def parseTable(table):
  103:     fields={}
  104:     rows=table.getElementsByTagName('html:tr')
  105:     for row in rows:
  106:         #print "ROW"
  107:         cols=row.getElementsByTagName('html:td')
  108:         
  109:         #Name des Datenfeldes einlesen
  110:         try:
  111:             field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
  112:             #print "field",field
  113:         except:
  114:             print "error"
  115:             field=""
  116: 
  117:         #Wandeln der Eintrge in HTML
  118: 
  119:         #pars=cols[1].getElementsByTagName('par')
  120:         pars=cols[1].childNodes
  121:         
  122:         html=par2html(self,pars,tags=("",";"))
  123:         
  124:         addToDict(fields,field,html)
  125:         #print fields
  126:     return fields
  127: 
  128: def par2html(self,pars,tags=None):
  129:     html=""
  130: 
  131:     for par in pars:
  132:         tagName=getattr(par,'tagName','')
  133:         if tagName in ["par","inline"]:
  134:             #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
  135:             #print "par",par
  136:             if not tags:
  137:                 try:
  138:                     tag=xml2htmlArray[par.getAttribute('class')]
  139:                 except:
  140:                     tag=('<p>','</p>')
  141:             else:
  142:                 tag=tags
  143:             #print "TAG",tag
  144:             content=getText(self,par.childNodes,par.getAttribute('class'))
  145:             
  146:             
  147: 
  148:             #print par.getAttribute('class'),node
  149:             try:
  150:                 html+=tag[0]+content+tag[1]
  151:             except:
  152:                 html=+tag[0]+content+tag[1]
  153:             
  154:         elif tagName=="pb":
  155:             html+="<pb/>"
  156:         
  157:     
  158:     try:
  159: 
  160:         return html
  161:     except:
  162:         return ""
  163: 
  164: def getXlink(nodes):
  165:     """searches xlinks and gives them back as html"""
  166:     ret=""
  167:     for node in nodes:
  168:         if node.attributes:
  169:             if 'xlink:type' in node.attributes.keys(): #is a xlink?
  170:                 ret +=xlink2html(node)
  171:     return ret
  172: 
  173: def checkRef(self,ref):
  174:         """teste ob reference angezeigt werden sollen"""
  175:         dbs={'vl_literature':'AND online = \'1\'',
  176:              'vl_technology':'AND complete =\'yes\'',
  177:              'vl_people':'AND complete =\'yes\'',
  178:              'vl_sites':'AND complete =\'yes\'',
  179:              'vl_transcript':'AND complete =\'yes\'',
  180:              'vl_essays':'AND online =\'yes\'',
  181: 	     'vl_categories':''
  182:              }
  183:         res=None
  184:         for db in dbs.keys():
  185:             searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))
  186:             res=res or self.search(var=searchStr)
  187:         return res
  188:     
  189: def link2html(self,str):
  190:         """link2html links in html wandeln"""
  191:         if str:
  192: 
  193:             str=re.sub("\&","&amp;",str)
  194:             dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
  195:             links=dom.getElementsByTagName("link")
  196:             
  197: 
  198:             for link in links:
  199:                 link.tagName="a"
  200:                 ref=link.getAttribute("ref")
  201:                 pn=link.getAttribute("page")
  202:                 mk=link.getAttribute("mk")
  203:                         
  204:                 if self.checkRef(ref):
  205:                     more = ""
  206:                     if pn:
  207:                         more += "&page=%s"%pn
  208:                         
  209:                     if mk:
  210:                         more += "&mk=%s"%mk
  211:                         
  212:                     link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+more)
  213: 
  214:             newxml=dom.toxml('utf-8')
  215:           
  216:             retStr=regexpTXT.search(newxml)
  217:             retStr = retStr.group(1)
  218: 
  219:             return retStr.decode('utf-8') # we return unicode
  220: 
  221:         return u""
  222: 
  223: def related2html(self,str):
  224:     """related library items: xlinks in html wandeln / mb 22.11.2006"""
  225:     if str:
  226:                 
  227:         str=re.sub("\&","&amp;",str)
  228:         dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+utf8ify(str)+"</txt>")
  229:         links=dom.getElementsByTagName("link")
  230:                 
  231:         for link in links:
  232:             link.tagName = "a"
  233:             ref = link.getAttribute("ref")
  234:             pn = link.getAttribute("page")
  235:                         
  236:             searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref)
  237:             res = self.search(var=searchStr)
  238:                                         
  239:             if res:
  240:                 if res[0]['online'] == 1: 
  241:                     # item online verfuegbar
  242:                     if pn:
  243:                         link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
  244:                     else:
  245:                         link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
  246:                         
  247:                     link.setAttribute("title", "click to view")
  248:                     link.removeAttribute("ref")
  249:                     
  250:                     # prefix preceding the link
  251:                     prefix = dom.createTextNode(U"\u2013\u0020") # = ndash + space
  252:                     dom.documentElement.insertBefore(prefix, link)
  253: 
  254:                 else:
  255:                     # item nur als bibliographische angabe vorhanden
  256:                     link.setAttribute("alt", unicodify(res[0]['fullreference']))
  257:                     link.setAttribute("title", "click to expand")
  258:                     link.setAttribute("onclick", "return toggle(this);")
  259:                     link.setAttribute("class", "x_offline")
  260:                     
  261:                     # prefix inside link text
  262:                     link.firstChild.data = '+ ' + link.firstChild.data
  263:                     
  264:             
  265:         newxml=dom.toxml('utf-8')
  266:                 
  267:         retStr=regexpTXT.search(newxml)
  268:         retStr = retStr.group(1)
  269:         #logging.debug("related2html out=%s"%repr(retStr))
  270:         return retStr.decode('utf-8') # we return unicode
  271: 
  272:     return u""
  273: 
  274:     
  275: 
  276: 
  277: def xml2html(self,str,quote="yes"):
  278:         """link2html fuer VLP muss hier noch raus"""
  279:         if str:
  280:             if quote=="yes2":
  281:                 str=re.sub("\&","&amp;",str)
  282:             #dom=xml.dom.minidom.parseString(str)
  283:             dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
  284:             #links=dom.getElementsByTagName("link")
  285:             links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
  286:             for link in links:
  287:                 #link.tagName="a"
  288:         
  289:                 ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
  290:                 pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")
  291: 
  292:                 cns=link.childNodes[0:]
  293:                 
  294:                 newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
  295:                 for x in cns:
  296:                         newLink.appendChild(x)
  297:                 
  298:                         
  299:                 
  300:                 link.parentNode.replaceChild(newLink,link)
  301: 
  302:                 if self.checkRef(ref):
  303:                         if pn:
  304:                                 newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
  305:                         else:
  306:                                 newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
  307: 
  308:             #str= dom.toxml('utf-8')
  309:             buf = cStringIO.StringIO()
  310:             PrettyPrint(dom, stream=buf)
  311:             str = buf.getvalue()
  312:             buf.close()
  313:             #str=PrettyPrint(dom.documentElement,encoding='UTF-8')
  314:             #print link.toxml('utf-8')
  315:             #print type(str)
  316:             retStr=regexpPage.search(str)
  317:             
  318:             try: # hack warum fehtl manchmal page??
  319:                     return retStr.group(1).decode('utf-8')
  320:             except:
  321:                     return str
  322:         return ""
  323: 
  324:     
  325: def xlink2html(self,xlink,parClass=None):
  326:     ret=""
  327:     attributes=xlink.attributes
  328:  
  329:     if xlink.tagName.lower()=="image":
  330:         ret +="""<img src="%s" />"""%xlink.getAttribute('href')
  331:     elif xlink.tagName.lower()=="link":
  332:         reference=urllib.unquote(xlink.getAttribute('href'))
  333:         label=getText(self,xlink.childNodes)
  334: 
  335:         # check if href is already a correct url
  336:         if reference.split(":")[0] in ['http','file']:
  337:             if parClass=="Picture":
  338:                 ret +="""<img src="%s" />"""%(reference)
  339:             else:
  340: 
  341:                 ret +="""<a href="%s" >%s</a>"""%(reference,label)
  342:         else: # transform
  343:             #href=xml2html(self,reference)
  344:             #print "refer",reference
  345:             reference=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',reference)# einfuegen anfuehrungszeichen um ref attribut, falls fehlt.
  346:             ret +=reference
  347:             
  348:     return ret
  349: 
  350: def getText(self,nodelist,parClass=None):
  351:     
  352:     rc = u''
  353:     for node in nodelist:
  354:         
  355:     	if node.nodeType == node.TEXT_NODE:
  356: 
  357:             try:
  358:                 try:
  359:                     #rc += node.data.encode('utf-8','ignore')
  360:                     rc += node.data
  361:                                         
  362:                 except:
  363:                     #rc= node.data.encode('utf-8','ignore')
  364:                     rc=node.data
  365:             except:
  366:                 rc="ERROR"
  367:                 #node.data.decode('utf-8','ignore')
  368: 
  369:             node.data.encode('utf-8','ignore')
  370:             #print "RC",rc
  371:         elif node.tagName =="inline":
  372: 
  373:             rc+=par2html(self,[node])
  374: 
  375:         elif node.tagName =="pb":
  376:             rc+="<pb/>"
  377:         elif node.attributes:
  378: 
  379:             if 'type' in node.attributes.keys(): #is a xlink?
  380: 
  381:                 try:
  382:                     rc +=xlink2html(self,node,parClass).encode('utf-8')
  383:                     
  384:                 except:
  385:                     rc +=xlink2html(self,node,parClass)
  386:                     
  387:     #print "RWT",rc        
  388:     return rc
  389: 
  390: 
  391: #filename=argv[1]
  392: #fileString=file(filename).read()
  393: #print proj2hash(fileString)
  394: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>