File:  [Repository] / MPIWGWeb / xmlhelper.py
Revision 1.6.2.3: download - view: text, annotated - select for diffs - revision graph
Wed Feb 15 11:51:47 2012 UTC (12 years, 3 months ago) by dwinter
Branches: r2
character handling in updatePersonaWWW changed

    1: 
    2: from sys import argv
    3: 
    4: import string
    5: import xml.dom.minidom
    6: #import Ft.Xml.XLink.Processor
    7: #import Ft.Xml.XLink.XLinkElements
    8: #
    9: #from Ft.Xml import XPath
   10: #from Ft.Xml.XPath import Evaluate
   11: #from Ft.Xml.XLink import XLINK_NAMESPACE
   12: #from Ft.Xml.XLink import XLinkElements
   13: 
   14: #from Ft.Xml.Domlette import NonvalidatingReader,InputSource
   15: #from Ft.Xml import EMPTY_NAMESPACE
   16: 
   17: #from Ft.Lib import Uri
   18: 
   19: from xml.etree import ElementTree
   20: import logging
   21: 
# Mapping from document <par> class names to (open-tag, close-tag) HTML pairs;
# used by par2html() to wrap paragraph text. Unknown classes fall back to
# ('<p>', '</p>') inside par2html().
xml2html={'WEB_normal':('<p>','</p>'),'Normal':('<p>','</p>'),'WEB_picture':('<p class=\"picture\">','</p>'),'WEB_figuretitle':('<p class=\"picturetitle\">','</p>'),'WEB_bibliography':('<p><i>','</i></p>'),'Web_kursiv':('<i>','</i>'),'WEB_kursiv':('<i>','</i>'),'WEB_hyperlink':('',''),'Hyperlink':('','')}
   23: 
   24: def addToDict(dict,name,value):
   25:     if name=="":
   26:         return 0
   27:     else:
   28:         
   29:         if not dict.has_key(name):
   30:             dict[name]=[] # als array anlegen
   31: 
   32:         dict[name].append(value)
   33:         return 1    
   34: 
   35: def proj2hash(xmlstring):
   36:     """wandelt xml-files fuer die projekte in ein hash"""
   37:     
   38:     #dom=xml.dom.minidom.parseString(xmlstring)
   39:     
   40:     tree = ElementTree.fromstring(xmlstring)
   41:     
   42: 
   43:     pars = tree.findall(".//part[0]/par")
   44: 
   45:     list={}
   46: 
   47:     #gettitle
   48:     #part= dom.getElementsByTagName('part')[0]
   49:     #pars=part.getElementsByTagName('par')
   50:     #pars=Evaluate('par',dom.getElementsByTagName('part')[0])
   51:     logging.debug(pars)
   52:     for par in pars:
   53:         logging.debug(par)
   54:         className=par.attrib['class']
   55:         #.getAttribute('class')
   56:         content=par.text
   57:         addToDict(list,className,content)
   58:              
   59:     list.update(parseTable(tree.find('.//{http://www.w3.org/HTML/1998/html4}table'))) # Parse the Table
   60: 
   61:     #evaluate level 1
   62:     sections = tree.findall(".//part[0]/section")
   63:     #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections
   64:     #sections=Evaluate('section',dom.getElementsByTagName('part')[0])# Parse all Sections
   65:     #print sections,dom.getElementsByTagName('part')[0]
   66:     for section in sections:
   67: 
   68:         sec=parseSection(section)
   69:         if sec[0]=="WEB_project_header": # Sonderfall project
   70: 
   71:             addToDict(list,'WEB_project_header',sec[1]) # store title
   72:             addToDict(list,'WEB_project_description',sec[2]) #store description
   73:         else: # no information in heading
   74: 
   75:             addToDict(list,sec[0],sec[2])
   76: 
   77:     #evaluate higher level sections
   78:     sections = tree.findall(".//part[0]/section/section")
   79:     #sections=Evaluate('section/section',dom.getElementsByTagName('part')[0])
   80: 
   81:     for section in sections:
   82:         logging.debug("sections2:"+repr(section))
   83:         sec=parseSection(section)
   84:         
   85:         if sec[0]=="WEB_project_header": # Sonderfall project
   86:             addToDict(list,'WEB_project_header',sec[1]) # store title
   87:             addToDict(list,'WEB_project_description',sec[2]) #store description
   88:         else: # no information in heading
   89:             addToDict(list,sec[0],sec[2])
   90: 
   91:     
   92:     return list
   93: 
   94: 
   95: def parseSection(section):
   96:     type=""
   97:     header=""
   98:     #for heading in section.childNodes:
   99:     
  100:     heading=section.find(".//heading")
  101:        # if getattr(heading,'tagName','')=="heading":
  102:     
  103:     
  104:     type=heading.attrib['class']
  105:     logging.debug("parseSection (class):"+type)
  106:     header=heading.text
  107:     logging.debug("parseSection (header):"+header)
  108:     
  109:     if type=="": # falls heading fehlt, pruefe ob erster par richtig
  110:         par=section.find(".//par")
  111:         #par=section.getElementsByTagName('par')[0]
  112:         type=par.attrib['class']
  113:         header=par.text
  114:         
  115:     #print section.childNodes
  116:     pars=section.findall(".//par")
  117:     #pars=Evaluate('par',section)
  118:     content=par2html(pars)
  119:     
  120:     return (type,header,content)
  121: 
  122: def parseTable(table):
  123:     fields={}
  124:     rows=table.findall('.//{http://www.w3.org/HTML/1998/html4}tr')
  125:     #rows=table.getElementsByTagName('html:tr')
  126:     for row in rows:
  127:         logging.debug("ROW")
  128:         cols=row.findall('.//{http://www.w3.org/HTML/1998/html4}td')
  129:         #cols=row.getElementsByTagName('html:td')
  130:         
  131:         #Name des Datenfeldes einlesen
  132:         try:
  133:             field=cols[0].find('.//par').attrib['class']
  134:             #field=cols[0].getElementsByTagName('par')[0].getAttribute('class')
  135:             #print "field",field
  136:         except:
  137:             logging.debug("error")
  138:             field=""
  139: 
  140:         #Wandeln der Eintrge in HTML
  141:         
  142:         pars=cols[1].findall('.//par')
  143:         #pars=cols[1].getElementsByTagName('par')
  144: 
  145:         
  146:         html=par2html(pars,tags=("",";"))
  147:         logging.debug("field:"+field)
  148:         logging.debug("html:"+html)
  149:         addToDict(fields,field,html)
  150:         #print fields
  151:     return fields
  152: 
  153: def par2html(pars,tags=None):
  154:     #html=""
  155:     logging.debug("part2html:"+repr(pars))
  156:     if pars is None:
  157:         return ""
  158:     for par in pars:
  159:         logging.debug("part2html:"+repr(par))
  160:         if not tags:
  161:             try:
  162:                 tag=xml2html[par.attrib['class']]
  163:             except:
  164:                 tag=('<p>','</p>')
  165:         else:
  166:             tag=tags
  167:         
  168:         content=par.text
  169:         if content is None:
  170:             content=""
  171:         logging.debug("part2html:"+content)
  172:         #print "CONTETN",content
  173:         
  174:         #print par.getAttribute('class'),node
  175:         try:
  176:             html=html+tag[0]+content+tag[1]
  177:         except:
  178:             html=tag[0]+content+tag[1]
  179: 
  180:     try:    
  181:         return html
  182:     except:
  183:         return ""
  184: 
  185: def getXlink(nodes):
  186:     """searches xlinks and gives them back as html"""
  187:     ret=""
  188:     for node in nodes:
  189:         if node.attributes:
  190:             if 'xlink:type' in node.attributes.keys(): #is a xlink?
  191:                 ret +=xlink2html(node)
  192:     return ret
  193:     
  194: def xlink2html(xlink):
  195:     ret=""
  196:     attributes=xlink.attributes
  197:     
  198:     if xlink.tagName.lower()=="image":
  199:         ret +="<img src=%s />"%xlink.getAttribute('xlink:href')
  200:     elif xlink.tagName.lower()=="link":
  201:         ret +="<a href='%s' >%s</a>"%(xlink.getAttribute('xlink:href'),getText(xlink.childNodes))
  202:     
  203:         
  204:         
  205:     
  206:     return ret
  207: 
def getText(nodelist):
    """Concatenate the text content of a minidom node list.

    Text nodes contribute their data; <inline> elements are rendered via
    par2html(); any other element carrying an 'xlink:type' attribute is
    rendered via xlink2html(). If appending the text data fails twice,
    the accumulated result is replaced by the literal string "ERROR"
    (Python 2 unicode-handling fallback).
    """
    rc = u''
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            #print "node",node
            #print "NODE",node.data.encode('utf-8','ignore'),"V"
            #print "HALII"
            try:
                try:
                    #rc += node.data.encode('utf-8','ignore')
                    rc += node.data
                                        
                except:
                    #rc= node.data.encode('utf-8','ignore')
                    rc=node.data
            except:
                rc="ERROR"
                #node.data.decode('utf-8','ignore')
                print "ERROR"
            # NOTE(review): the result of this encode() is discarded, and it
            # sits outside the try blocks above so it can itself raise --
            # looks like a leftover of the 2012 character-handling change;
            # confirm before removing.
            node.data.encode('utf-8','ignore')
            #print "RC",rc
        elif node.tagName =="inline":
            rc+=par2html([node])
        elif node.attributes:

            if 'xlink:type' in node.attributes.keys(): #is a xlink?
                rc +=xlink2html(node)
    #print "RWT",rc        
    return rc
  238: 
  239: 
  240: #filename=argv[1]
  241: #fileString=file(filename).read()
  242: #print proj2hash(fileString)
  243: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>