File:  [Repository] / ECHO_content / vlp_xmlhelpers.py
Revision 1.9: download - view: text, annotated - select for diffs - revision graph
Tue Nov 21 18:37:12 2006 UTC (17 years, 6 months ago) by casties
Branches: MAIN
CVS tags: HEAD
fixed utf-8 decode bug in vlp related2html

    1: from sys import argv
    2: 
    3: import string
    4: import xml.dom.minidom
    5: import Ft.Xml.XLink.Processor
    6: import Ft.Xml.XLink.XLinkElements
    7: 
    8: from Ft.Xml import XPath
    9: from Ft.Xml.XPath import Evaluate
   10: from Ft.Xml.XLink import XLINK_NAMESPACE
   11: from Ft.Xml.XLink import XLinkElements
   12: import cStringIO
   13: from Ft.Xml.Domlette import NonvalidatingReader, PrettyPrint,Print
   14: from Ft.Xml import EMPTY_NAMESPACE
   15: from Ft.Lib import Uri
   16: import urllib
   17: import re
   18: 
   19: patternTXT=r"<\s*txt.*?>(.*?)</txt>"
   20: regexpTXT = re.compile(patternTXT, re.IGNORECASE + re.DOTALL)
   21: patternPage=r"<\s*page.*?>(.*?)</page>"
   22: regexpPage = re.compile(patternPage, re.IGNORECASE + re.DOTALL)
   23: 
   24: xml2htmlArray={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('',''),'Picture':('<p class=\"picture\">','</p>'),'FigureTitle':('<p class=\"picturetitle\">','</p>')}
   25: 
   26: def addToDict(dict,name,value):
   27:     if name=="":
   28:         return 0
   29:     else:
   30:         
   31:         if not dict.has_key(name):
   32:             dict[name]=[] # als array anlegen
   33: 
   34:         dict[name].append(value)
   35:         return 1    
   36: 
   37: def proj2hash(self,xmlstring):
   38:     """wandelt xml-files fuer die projekte in ein hash"""
   39:     
   40:     dom=xml.dom.minidom.parseString(xmlstring)
   41:     
   42:         
   43:     list={}
   44: 
   45:     #gettitle
   46:     pars=Evaluate('par',dom.getElementsByTagName('part')[0])
   47:     for par in pars:
   48:         className=par.getAttribute('class')
   49:         content=getText(self,par.childNodes)
   50:         addToDict(list,className,content)
   51:              
   52: 
   53:     sectionXPath="section"
   54: 
   55:     
   56:     sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
   57:     
   58:     while sections:
   59:         
   60:         for section in sections:
   61:             
   62:             sec=parseSection(self,section)
   63:             
   64:             if sec[0]=="WEB_project_header": # Sonderfall project
   65:                 addToDict(list,'WEB_project_header',sec[1]) # store title
   66:                 addToDict(list,'WEB_project_description',sec[2]) #store description
   67:             else: # no information in heading
   68:                 level=int(sec[3])+2
   69:                 aTag="<h%i>"%level
   70:                 eTag="</h%i>"%level
   71:                 addToDict(list,"text",aTag+sec[1]+eTag)
   72:                 addToDict(list,"text",sec[2])
   73:         sectionXPath+="/section"
   74:         sections=Evaluate(sectionXPath,dom.getElementsByTagName('part')[0])
   75:     return list
   76: 
   77: 
   78: def parseSection(self,section):
   79:     type=""
   80:     header=""
   81:     level=section.getAttribute('level')
   82:     for heading in section.childNodes:
   83:         if getattr(heading,'tagName','')=="heading":
   84:             
   85:             type=heading.getAttribute('class')
   86:             header=getText(self,heading.childNodes)
   87: 
   88:     if type=="": # falls heading fehlt, pruefe ob erster par richtig
   89:         par=section.getElementsByTagName('par')[0]
   90:         type=par.getAttribute('class')
   91:         header=getText(par.childNodes)
   92: 
   93:     #print section.childNodes
   94:     #pars=Evaluate('par',section)
   95:     pars=section.childNodes
   96:     content=par2html(self,pars)
   97:     #print "CONTENT",repr(content)
   98:     return (type,header,content,level)
   99: 
  100: def parseTable(table):
  101:     fields={}
  102:     rows=table.getElementsByTagName('html:tr')
  103:     for row in rows:
  104:         #print "ROW"
  105:         cols=row.getElementsByTagName('html:td')
  106:         
  107:         #Name des Datenfeldes einlesen
  108:         try:
  109:             field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
  110:             #print "field",field
  111:         except:
  112:             print "error"
  113:             field=""
  114: 
  115:         #Wandeln der Eintrge in HTML
  116: 
  117:         #pars=cols[1].getElementsByTagName('par')
  118:         pars=cols[1].childNodes
  119:         
  120:         html=par2html(self,pars,tags=("",";"))
  121:         
  122:         addToDict(fields,field,html)
  123:         #print fields
  124:     return fields
  125: 
  126: def par2html(self,pars,tags=None):
  127:     html=""
  128: 
  129:     for par in pars:
  130:         tagName=getattr(par,'tagName','')
  131:         if tagName in ["par","inline"]:
  132:             #print repr(par.getAttribute('class')),xml2htmlArray.get(par.getAttribute('class'),'NOT FOUND')
  133:             #print "par",par
  134:             if not tags:
  135:                 try:
  136:                     tag=xml2htmlArray[par.getAttribute('class')]
  137:                 except:
  138:                     tag=('<p>','</p>')
  139:             else:
  140:                 tag=tags
  141:             #print "TAG",tag
  142:             content=getText(self,par.childNodes,par.getAttribute('class'))
  143:             
  144:             
  145: 
  146:             #print par.getAttribute('class'),node
  147:             try:
  148:                 html+=tag[0]+content+tag[1]
  149:             except:
  150:                 html=+tag[0]+content+tag[1]
  151:             
  152:         elif tagName=="pb":
  153:             html+="<pb/>"
  154:         
  155:     
  156:     try:
  157: 
  158:         return html
  159:     except:
  160:         return ""
  161: 
  162: def getXlink(nodes):
  163:     """searches xlinks and gives them back as html"""
  164:     ret=""
  165:     for node in nodes:
  166:         if node.attributes:
  167:             if 'xlink:type' in node.attributes.keys(): #is a xlink?
  168:                 ret +=xlink2html(node)
  169:     return ret
  170: 
  171: def checkRef(self,ref):
  172:         """teste ob reference angezeigt werden sollen"""
  173:         dbs={'vl_literature':'AND online = \'1\'',
  174:              'vl_technology':'AND complete =\'yes\'',
  175:              'vl_people':'AND complete =\'yes\'',
  176:              'vl_sites':'AND complete =\'yes\'',
  177:              'vl_transcript':'AND complete =\'yes\'',
  178:              'vl_essays':'AND online =\'yes\''
  179:              }
  180:         res=None
  181:         for db in dbs.keys():
  182:             searchStr=str("select reference from %s where reference =\'%s\' %s"%(db,ref,dbs[db]))
  183:             res=res or self.search(var=searchStr)
  184:         return res
  185:     
  186: def link2html(self,str):
  187:         """link2html liks in html wandeln"""
  188:         if str:
  189: 
  190:             str=re.sub("\&","&amp;",str)
  191:             dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+str+"</txt>")
  192:             links=dom.getElementsByTagName("link")
  193:             
  194: 
  195:             for link in links:
  196:                 link.tagName="a"
  197:                 ref=link.getAttribute("ref")
  198:                 pn=link.getAttribute("page")
  199:                         
  200:                 if self.checkRef(ref):
  201:                         if pn:
  202:                                 link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
  203:                         else:
  204:                                 link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
  205: 
  206: 
  207:             newxml=dom.toxml('utf-8')
  208:           
  209:             retStr=regexpTXT.search(newxml)
  210: 
  211:             return retStr.group(1)
  212: 
  213:                            
  214:         return ""
  215: 
  216: def related2html(self,str):
  217:     """related library items: xlinks in html wandeln / mb 21.11.2006"""
  218:     if str:
  219:                 
  220:         str=re.sub("\&","&amp;",str)
  221:         dom=xml.dom.minidom.parseString("<?xml version='1.0' encoding='utf-8'?><txt>"+str+"</txt>")
  222:         links=dom.getElementsByTagName("link")
  223:                 
  224:         for link in links:
  225:             link.tagName = "a"
  226:             ref = link.getAttribute("ref")
  227:             pn = link.getAttribute("page")
  228:                         
  229:             searchStr="select fullreference, online from vl_literature where reference =\'%s\' and authorized = 1"%(ref)
  230:             res = self.search(var=searchStr)
  231:                                         
  232:             if res:
  233:                 if res[0]['online'] == 1: 
  234:                     # item online verfuegbar
  235:                     link.setAttribute("title", "click to view")
  236:                     if pn:
  237:                         link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
  238:                     else:
  239:                         link.setAttribute("href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
  240:                 else:
  241:                     # item nur als bibliographische angabe vorhanden
  242:                     link.setAttribute("alt", res[0]['fullreference'].decode('utf-8'))
  243:                     link.setAttribute("title", "click to expand")
  244:                     link.setAttribute("onclick", "return toggle(this);")
  245:                     link.setAttribute("class", "x_offline")
  246:             
  247:         newxml=dom.toxml('utf-8')
  248:                 
  249:         retStr=regexpTXT.search(newxml)
  250:                 
  251:         return retStr.group(1)
  252:                                            
  253:     return ""
  254: 
  255:     
  256: 
  257: 
  258: def xml2html(self,str,quote="yes"):
  259:         """link2html fuer VLP muss hier noch raus"""
  260:         if str:
  261:             if quote=="yes2":
  262:                 str=re.sub("\&","&amp;",str)
  263:             #dom=xml.dom.minidom.parseString(str)
  264:             dom = NonvalidatingReader.parseString(str,"http://www.mpiwg-berlin.mpg.de/")
  265:             #links=dom.getElementsByTagName("link")
  266:             links=Ft.Xml.XPath.Evaluate(".//link", contextNode=dom)
  267:             for link in links:
  268:                 #link.tagName="a"
  269:         
  270:                 ref=link.getAttributeNS(EMPTY_NAMESPACE,"ref")
  271:                 pn=link.getAttributeNS(EMPTY_NAMESPACE,"page")
  272: 
  273:                 cns=link.childNodes[0:]
  274:                 
  275:                 newLink=dom.createElementNS(EMPTY_NAMESPACE,"a")
  276:                 for x in cns:
  277:                         newLink.appendChild(x)
  278:                 
  279:                         
  280:                 
  281:                 link.parentNode.replaceChild(newLink,link)
  282: 
  283:                 if self.checkRef(ref):
  284:                         if pn:
  285:                                 newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref+"&page="+pn)
  286:                         else:
  287:                                 newLink.setAttributeNS(EMPTY_NAMESPACE,"href",self.REQUEST['SERVER_URL']+"/references?id="+ref)
  288: 
  289:             #str= dom.toxml('utf-8')
  290:             buf = cStringIO.StringIO()
  291:             PrettyPrint(dom, stream=buf)
  292:             str = buf.getvalue()
  293:             buf.close()
  294:             #str=PrettyPrint(dom.documentElement,encoding='UTF-8')
  295:             #print link.toxml('utf-8')
  296:             #print type(str)
  297:             retStr=regexpPage.search(str)
  298:             
  299:             try: # hack warum fehtl manchmal page??
  300:                     return retStr.group(1)
  301:             except:
  302:                     return str
  303:         return ""
  304: 
  305:     
  306: def xlink2html(self,xlink,parClass=None):
  307:     ret=""
  308:     attributes=xlink.attributes
  309:  
  310:     if xlink.tagName.lower()=="image":
  311:         ret +="""<img src="%s" />"""%xlink.getAttribute('href')
  312:     elif xlink.tagName.lower()=="link":
  313:         reference=urllib.unquote(xlink.getAttribute('href'))
  314:         label=getText(self,xlink.childNodes)
  315: 
  316:         # check if href is already a correct url
  317:         if reference.split(":")[0] in ['http','file']:
  318:             if parClass=="Picture":
  319:                 ret +="""<img src="%s" />"""%(reference)
  320:             else:
  321: 
  322:                 ret +="""<a href="%s" >%s</a>"""%(reference,label)
  323:         else: # transform
  324:             #href=xml2html(self,reference)
  325:             #print "refer",reference
  326:             reference=re.sub("ref\=([^>]*)\>",'ref=\"\g<1>\">',reference)# einfuegen anfuehrungszeichen um ref attribut, falls fehlt.
  327:             ret +=reference
  328:             
  329:     return ret
  330: 
  331: def getText(self,nodelist,parClass=None):
  332:     
  333:     rc = u''
  334:     for node in nodelist:
  335:         
  336:     	if node.nodeType == node.TEXT_NODE:
  337: 
  338:             try:
  339:                 try:
  340:                     #rc += node.data.encode('utf-8','ignore')
  341:                     rc += node.data
  342:                                         
  343:                 except:
  344:                     #rc= node.data.encode('utf-8','ignore')
  345:                     rc=node.data
  346:             except:
  347:                 rc="ERROR"
  348:                 #node.data.decode('utf-8','ignore')
  349: 
  350:             node.data.encode('utf-8','ignore')
  351:             #print "RC",rc
  352:         elif node.tagName =="inline":
  353: 
  354:             rc+=par2html(self,[node])
  355: 
  356:         elif node.tagName =="pb":
  357:             rc+="<pb/>"
  358:         elif node.attributes:
  359: 
  360:             if 'type' in node.attributes.keys(): #is a xlink?
  361: 
  362:                 try:
  363:                     rc +=xlink2html(self,node,parClass).encode('utf-8')
  364:                     
  365:                 except:
  366:                     rc +=xlink2html(self,node,parClass)
  367:                     
  368:     #print "RWT",rc        
  369:     return rc
  370: 
  371: 
  372: #filename=argv[1]
  373: #fileString=file(filename).read()
  374: #print proj2hash(fileString)
  375: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>