File:  [Repository] / MPIWGWeb / xmlhelper.py
Revision 1.6.2.3: download - view: text, annotated - select for diffs - revision graph
Wed Feb 15 11:51:47 2012 UTC (12 years, 3 months ago) by dwinter
Branches: r2
character handling in updatePersonaWWW changed

    1: 
    2: from sys import argv
    3: 
    4: import string
    5: import xml.dom.minidom
    6: #import Ft.Xml.XLink.Processor
    7: #import Ft.Xml.XLink.XLinkElements
    8: #
    9: #from Ft.Xml import XPath
   10: #from Ft.Xml.XPath import Evaluate
   11: #from Ft.Xml.XLink import XLINK_NAMESPACE
   12: #from Ft.Xml.XLink import XLinkElements
   13: 
   14: #from Ft.Xml.Domlette import NonvalidatingReader,InputSource
   15: #from Ft.Xml import EMPTY_NAMESPACE
   16: 
   17: #from Ft.Lib import Uri
   18: 
   19: from xml.etree import ElementTree
   20: import logging
   21: 
# Mapping from document <par> class names to (open-tag, close-tag) HTML pairs;
# used by par2html() to wrap paragraph text. Unknown classes fall back to
# ('<p>', '</p>') inside par2html().
xml2html={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('','')}
   23: 
   24: def addToDict(dict,name,value):
   25:     if name=="":
   26:         return 0
   27:     else:
   28:         
   29:         if not dict.has_key(name):
   30:             dict[name]=[] # als array anlegen
   31: 
   32:         dict[name].append(value)
   33:         return 1    
   34: 
   35: def proj2hash(xmlstring):
   36:     """wandelt xml-files fuer die projekte in ein hash"""
   37:     
   38:     #dom=xml.dom.minidom.parseString(xmlstring)
   39:     
   40:     tree = ElementTree.fromstring(xmlstring)
   41:     
   42: 
   43:     pars = tree.findall(".//part[0]/par")
   44: 
   45:     list={}
   46: 
   47:     #gettitle
   48:     #part= dom.getElementsByTagName('part')[0]
   49:     #pars=part.getElementsByTagName('par')
   50:     #pars=Evaluate('par',dom.getElementsByTagName('part')[0])
   51:     logging.debug(pars)
   52:     for par in pars:
   53:         logging.debug(par)
   54:         className=par.attrib['class']
   55:         #.getAttribute('class')
   56:         content=par.text
   57:         addToDict(list,className,content)
   58:              
   59:     list.update(parseTable(tree.find('.//{http://www.w3.org/HTML/1998/html4}table'))) # Parse the Table
   60: 
   61:     #evaluate level 1
   62:     sections = tree.findall(".//part[0]/section")
   63:     #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections
   64:     #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections
   65:     #print sections,dom.getElementsByTagName('part')[0]
   66:     for section in sections:
   67: 
   68:         sec=parseSection(section)
   69:         if sec[0]=="WEB_project_header": # Sonderfall project
   70: 
   71:             addToDict(list,'WEB_project_header',sec[1]) # store title
   72:             addToDict(list,'WEB_project_description',sec[2]) #store description
   73:         else: # no information in heading
   74: 
   75:             addToDict(list,sec[0],sec[2])
   76: 
   77:     #evaluate higher level sections
   78:     sections = tree.findall(".//part[0]/section/section")
   79:     #sections=Evaluate('section/section',dom.getElementsByTagName('part')[0])
   80: 
   81:     for section in sections:
   82:         logging.debug("sections2:"+repr(section))
   83:         sec=parseSection(section)
   84:         
   85:         if sec[0]=="WEB_project_header": # Sonderfall project
   86:             addToDict(list,'WEB_project_header',sec[1]) # store title
   87:             addToDict(list,'WEB_project_description',sec[2]) #store description
   88:         else: # no information in heading
   89:             addToDict(list,sec[0],sec[2])
   90: 
   91:     
   92:     return list
   93: 
   94: 
   95: def parseSection(section):
   96:     type=""
   97:     header=""
   98:     #for heading in section.childNodes:
   99:     
  100:     heading=section.find(".//heading")
  101:        # if getattr(heading,'tagName','')=="heading":
  102:     
  103:     
  104:     type=heading.attrib['class']
  105:     logging.debug("parseSection (class):"+type)
  106:     header=heading.text
  107:     logging.debug("parseSection (header):"+header)
  108:     
  109:     if type=="": # falls heading fehlt, pruefe ob erster par richtig
  110:         par=section.find(".//par")
  111:         #par=section.getElementsByTagName('par')[0]
  112:         type=par.attrib['class']
  113:         header=par.text
  114:         
  115:     #print section.childNodes
  116:     pars=section.findall(".//par")
  117:     #pars=Evaluate('par',section)
  118:     content=par2html(pars)
  119:     
  120:     return (type,header,content)
  121: 
  122: def parseTable(table):
  123:     fields={}
  124:     rows=table.findall('.//{http://www.w3.org/HTML/1998/html4}tr')
  125:     #rows=table.getElementsByTagName('html:tr')
  126:     for row in rows:
  127:         logging.debug("ROW")
  128:         cols=row.findall('.//{http://www.w3.org/HTML/1998/html4}td')
  129:         #cols=row.getElementsByTagName('html:td')
  130:         
  131:         #Name des Datenfeldes einlesen
  132:         try:
  133:             field=cols[0].find('.//par').attrib['class']
  134:             #field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
  135:             #print "field",field
  136:         except:
  137:             logging.debug("error")
  138:             field=""
  139: 
  140:         #Wandeln der Eintrge in HTML
  141:         
  142:         pars=cols[1].findall('.//par')
  143:         #pars=cols[1].getElementsByTagName('par')
  144: 
  145:         
  146:         html=par2html(pars,tags=("",";"))
  147:         logging.debug("field:"+field)
  148:         logging.debug("html:"+html)
  149:         addToDict(fields,field,html)
  150:         #print fields
  151:     return fields
  152: 
  153: def par2html(pars,tags=None):
  154:     #html=""
  155:     logging.debug("part2html:"+repr(pars))
  156:     if pars is None:
  157:         return ""
  158:     for par in pars:
  159:         logging.debug("part2html:"+repr(par))
  160:         if not tags:
  161:             try:
  162:                 tag=xml2html[par.attrib['class']]
  163:             except:
  164:                 tag=('<p>','</p>')
  165:         else:
  166:             tag=tags
  167:         
  168:         content=par.text
  169:         if content is None:
  170:             content=""
  171:         logging.debug("part2html:"+content)
  172:         #print "CONTETN",content
  173:         
  174:         #print par.getAttribute('class'),node
  175:         try:
  176:             html=html+tag[0]+content+tag[1]
  177:         except:
  178:             html=tag[0]+content+tag[1]
  179: 
  180:     try:    
  181:         return html
  182:     except:
  183:         return ""
  184: 
  185: def getXlink(nodes):
  186:     """searches xlinks and gives them back as html"""
  187:     ret=""
  188:     for node in nodes:
  189:         if node.attributes:
  190:             if 'xlink:type' in node.attributes.keys(): #is a xlink?
  191:                 ret +=xlink2html(node)
  192:     return ret
  193:     
  194: def xlink2html(xlink):
  195:     ret=""
  196:     attributes=xlink.attributes
  197:     
  198:     if xlink.tagName.lower()=="image":
  199:         ret +="<img src=%s />"%xlink.getAttribute('xlink:href')
  200:     elif xlink.tagName.lower()=="link":
  201:         ret +="<a href='%s' >%s</a>"%(xlink.getAttribute('xlink:href'),getText(xlink.childNodes))
  202:     
  203:         
  204:         
  205:     
  206:     return ret
  207: 
def getText(nodelist):
    """Concatenate the text content of a minidom node list.

    Text nodes contribute their data; <inline> elements are rendered via
    par2html(); any other element carrying an 'xlink:type' attribute is
    rendered via xlink2html(). If appending the text data fails twice,
    the accumulated result is replaced by the literal string "ERROR"
    (Python 2 unicode-handling fallback).
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            #print "node",node
            #print "NODE",node.data.encode('utf-8','ignore'),"V"
            #print "HALII"
            try:
                try:
                    #rc += node.data.encode('utf-8','ignore')
                    rc += node.data
                                        
                except:
                    #rc= node.data.encode('utf-8','ignore')
                    rc=node.data
            except:
                rc="ERROR"
                #node.data.decode('utf-8','ignore')
                print "ERROR"
            # NOTE(review): the result of this encode() is discarded, and it
            # sits outside the try blocks above so it can itself raise --
            # looks like a leftover of the 2012 character-handling change;
            # confirm before removing.
            node.data.encode('utf-8','ignore')
            #print "RC",rc
        elif node.tagName =="inline":
            rc+=par2html([node])
        elif node.attributes:

            if 'xlink:type' in node.attributes.keys(): #is a xlink?
                rc +=xlink2html(node)
    #print "RWT",rc        
    return rc
  238: 
  239: 
  240: #filename=argv[1]
  241: #fileString=file(filename).read()
  242: #print proj2hash(fileString)
  243: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>