# Mercurial > hg > MPIWGWeb


from sys import argv

import string
import xml.dom.minidom
#import Ft.Xml.XLink.Processor
#import Ft.Xml.XLink.XLinkElements
#
#from Ft.Xml import XPath
#from Ft.Xml.XPath import Evaluate
#from Ft.Xml.XLink import XLINK_NAMESPACE
#from Ft.Xml.XLink import XLinkElements

#from Ft.Xml.Domlette import NonvalidatingReader,InputSource
#from Ft.Xml import EMPTY_NAMESPACE

#from Ft.Lib import Uri

from xml.etree import ElementTree
import logging

# Maps the "class" attribute of a <par>/<inline> element to the pair of
# (opening, closing) HTML fragments that wrap its text.
xml2html = {
    # plain paragraphs
    'WEB_normal': ('<p>', '</p>'),
    'Normal': ('<p>', '</p>'),
    # pictures and their captions
    'WEB_picture': ('<p class="picture">', '</p>'),
    'WEB_figuretitle': ('<p class="picturetitle">', '</p>'),
    # bibliography entries are rendered as italic paragraphs
    'WEB_bibliography': ('<p><i>', '</i></p>'),
    # italics; both spellings of the class name are mapped
    'Web_kursiv': ('<i>', '</i>'),
    'WEB_kursiv': ('<i>', '</i>'),
    # hyperlink classes add no wrapping markup
    'WEB_hyperlink': ('', ''),
    'Hyperlink': ('', ''),
}

def addToDict(dict,name,value):
    """Append value to the list stored under name in dict.

    The list is created the first time a name is seen, so every stored
    entry is a list of values. An empty name is ignored.

    Returns 1 if the value was stored, 0 if it was skipped because
    name is the empty string.
    """
    if name=="":
        return 0

    # the `in` operator works on Python 2 and 3; has_key() is Python 2 only
    if name not in dict:
        dict[name]=[] # create as a list

    dict[name].append(value)
    return 1

def _addSection(fields,section):
    """Parse one section element and store its result in fields."""
    sec=parseSection(section)
    if sec[0]=="WEB_project_header": # special case: project header
        addToDict(fields,'WEB_project_header',sec[1]) # store title
        addToDict(fields,'WEB_project_description',sec[2]) # store description
    else: # no information in heading
        addToDict(fields,sec[0],sec[2])


def proj2hash(xmlstring):
    """Convert a project XML document into a hash (dict).

    The result maps class names to lists of values, collected from:
      * the direct <par> children of the first <part> element
        (class attribute -> text),
      * the first html4 <table> in the document (via parseTable),
      * the <section> children and <section>/<section> grandchildren
        of the first <part> (via parseSection).

    xmlstring -- the XML document as a string.

    Returns the dict; raises whatever ElementTree.fromstring raises on
    malformed XML.
    """
    tree = ElementTree.fromstring(xmlstring)

    fields={}

    # ElementTree position predicates are 1-based, so "part[0]" is not a valid
    # way to ask for the first part. Select the first <part> in document order
    # explicitly instead (this also covers a <part> root element).
    part = next(tree.iter('part'), None)

    if part is not None:
        pars = part.findall('par')
    else:
        pars = []

    logging.debug(pars)
    for par in pars:
        logging.debug(par)
        # a par without a class attribute yields "" and is skipped by addToDict
        addToDict(fields,par.get('class',''),par.text)

    # parse the table, if the document has one
    table = tree.find('.//{http://www.w3.org/HTML/1998/html4}table')
    if table is not None:
        fields.update(parseTable(table))

    if part is None:
        return fields

    # evaluate level 1 sections
    for section in part.findall('section'):
        _addSection(fields,section)

    # evaluate nested (level 2) sections
    for section in part.findall('section/section'):
        logging.debug("sections2:"+repr(section))
        _addSection(fields,section)

    return fields


def parseSection(section):
    """Extract class, header text and HTML content from a section element.

    The class and header text come from the first <heading> found below
    section. If there is no heading, or its class is empty, the first
    <par> below section is used instead. The content is the HTML rendering
    (par2html) of every <par> found below section.

    Returns a tuple (class, header, content). class is "" when neither a
    heading nor a par supplies one; header may be None when the element
    has no text.
    """
    sectionClass=""
    header=""

    heading=section.find(".//heading")
    if heading is not None:
        sectionClass=heading.get('class','')
        header=heading.text

    # %-style arguments keep the log calls safe when header is None
    logging.debug("parseSection (class):%s",sectionClass)
    logging.debug("parseSection (header):%s",header)

    if sectionClass=="": # if the heading is missing, check whether the first par fits
        par=section.find(".//par")
        if par is not None:
            sectionClass=par.get('class','')
            header=par.text

    pars=section.findall(".//par")
    content=par2html(pars)

    return (sectionClass,header,content)

def parseTable(table):
    """Convert a two-column html4 table into a dict of lists.

    For every <tr> below table, the class attribute of the first <par> in
    the first <td> names the field, and the <par> elements of the second
    <td> are rendered with par2html, each followed by ";". Rows with fewer
    than two cells are skipped; rows whose field name cannot be determined
    are parsed but not stored (addToDict ignores empty names).

    table -- the table element, or None.

    Returns a dict mapping field names to lists of HTML strings; empty
    when table is None.
    """
    fields={}
    if table is None:
        return fields

    for row in table.findall('.//{http://www.w3.org/HTML/1998/html4}tr'):
        logging.debug("ROW")
        cols=row.findall('.//{http://www.w3.org/HTML/1998/html4}td')

        if len(cols)<2:
            # no value cell to read; nothing can be stored for this row
            logging.debug("error")
            continue

        # read the name of the data field
        namePar=cols[0].find('.//par')
        if namePar is not None:
            field=namePar.get('class','')
        else:
            field=""
        if field=="":
            logging.debug("error")

        # convert the entries to HTML
        pars=cols[1].findall('.//par')
        html=par2html(pars,tags=("",";"))
        logging.debug("field:"+field)
        logging.debug("html:"+html)
        addToDict(fields,field,html)
    return fields

def par2html(pars,tags=None):
    """Render a sequence of par elements as one HTML string.

    pars -- iterable of elements offering get() and text (ElementTree
            elements), or None.
    tags -- optional (opening, closing) pair used for every element. When
            it is empty or None, the pair is looked up in xml2html by the
            element's class attribute, falling back to ('<p>','</p>').

    Only the element's own text is used; a missing text counts as "".

    Returns the concatenated HTML, or "" when pars is None or empty.
    """
    logging.debug("part2html:%r",pars)
    if pars is None:
        return ""

    default=('<p>','</p>')
    chunks=[]
    for par in pars:
        logging.debug("part2html:%r",par)
        if tags:
            tag=tags
        else:
            # unknown or missing class -> plain paragraph
            tag=xml2html.get(par.get('class'),default)

        content=par.text
        if content is None:
            content=""
        logging.debug("part2html:"+content)

        chunks.append(tag[0]+content+tag[1])

    return "".join(chunks)

def getXlink(nodes):
    """Collect the HTML for every xlink among nodes.

    A node counts as an xlink when it has attributes and one of them is
    named 'xlink:type'; each such node is rendered with xlink2html and
    the results are concatenated in order.
    """
    def isXlink(candidate):
        attrs=candidate.attributes
        return bool(attrs) and 'xlink:type' in attrs.keys()

    return "".join([xlink2html(candidate) for candidate in nodes if isXlink(candidate)])

def xlink2html(xlink):
    """Render an xlink DOM element as an HTML fragment.

    An element whose tag name is "image" (case-insensitive) becomes an
    <img> tag, a "link" element becomes an <a> tag wrapping the HTML of
    its child nodes (getText); any other element yields "".

    xlink -- a DOM element offering tagName, getAttribute and childNodes.

    NOTE(review): the xlink:href value is inserted without HTML escaping;
    confirm that the source documents are trusted.
    """
    href=xlink.getAttribute('xlink:href')
    tagName=xlink.tagName.lower()

    if tagName=="image":
        # quoted like the href below; an unquoted value breaks the markup
        # as soon as the URL contains whitespace or '>'
        return "<img src='%s' />"%href
    if tagName=="link":
        return "<a href='%s' >%s</a>"%(href,getText(xlink.childNodes))

    return ""

def getText(nodelist):
    """Concatenate the text and inline markup of a list of DOM nodes.

    Text nodes contribute their data. <inline> elements are wrapped in
    the HTML pair that xml2html assigns to their class attribute
    (('<p>','</p>') when the class is unknown) around the text of their
    own children. Other elements carrying an 'xlink:type' attribute are
    rendered with xlink2html. Everything else (comments, processing
    instructions, elements without xlink) is skipped.

    Returns a unicode string.
    """
    parts=[]
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            parts.append(node.data)
        elif node.nodeType != node.ELEMENT_NODE:
            # comments etc. have no tagName; nothing to render
            continue
        elif node.tagName =="inline":
            # NOTE(review): par2html expects ElementTree elements and cannot
            # render a DOM node, so the inline markup is built here with the
            # same class lookup and default -- confirm this is the intended output
            tag=xml2html.get(node.getAttribute('class'),('<p>','</p>'))
            parts.append(tag[0]+getText(node.childNodes)+tag[1])
        elif node.attributes and 'xlink:type' in node.attributes.keys(): #is a xlink?
            parts.append(xlink2html(node))
    return u''.join(parts)


#filename=argv[1]
#fileString=file(filename).read()
#print proj2hash(fileString)
# author: casties
# date: Thu, 25 Jun 2015 17:44:57 +0200
# parents: bca61e893fcc
# children: