# view graphML2RDF.py @ 3:c97d43ac8fa6
#
# first release
# author dwinter
# date Mon, 23 Jul 2012 09:57:32 +0200
# parents 8190d724dc01
# children e661aabed2f9
# line wrap: on
# line source

# This tool converts GraphML files created with yEd into a graph, either in DOT format or as RDF.
# TODO: lots of ... the mapping from graph type to ontology should be configurable.
import copy
import io
import os
import os.path

from lxml import etree


# Prefix -> namespace URI mapping handed to every XPath query in this module.
namespaces = dict(
    graphML='http://graphml.graphdrawing.org/xmlns',
    y='http://www.yworks.com/xml/graphml',
)

# All essential information about a single node.
class Node:
    """A graph node.

    The class-level values below are the defaults; the constructor only
    sets ``numId`` on the instance.
    """

    # ID of the node; should be unique across all nodes.
    numId = 0
    # Type index of the node; -1 is the default.
    nodeType = -1
    # Label in yEd. It is also used to identify equal nodes when more than one
    # graph is merged, so labels must be kept consistent when drawing graphs.
    label = ""
    # Internal ID of the node inside one graph; only unique per graph.
    internalID = ""
    # Reference to an external web page (key="d4").
    externalRef = ""
    # Reference to another graph (key="d4"); external and internal references
    # are told apart by whether the path starts with http.
    internalRef = ""

    def __init__(self, numId):
        """Create a node carrying the identifier *numId*."""
        self.numId = numId
    
# Edge
class Edge:
    """A directed connection between two nodes, referenced by their IDs."""

    # Type index of the edge; -1 is the default.
    edgeType = -1
    # End points; both default to None until the constructor sets them.
    target = None
    src = None

    def __init__(self, src, target):
        """Create an edge leading from *src* to *target*."""
        self.src, self.target = src, target
        

# Creates IDs for the nodes.
class IDDispensor:
    """Hands out consecutive integer IDs; the first one returned is 1."""

    # Last ID handed out (0 means none has been handed out yet).
    currentID = 0

    def getID(self):
        """Advance the counter by one and return the new value."""
        nextID = self.currentID + 1
        self.currentID = nextID
        return nextID

# The actual graph.
class Graph:
    """Network of typed nodes and edges read from yEd GraphML files.

    Node and edge types are derived by comparing the layout (fill, border,
    shape, line style, arrows) of every element with the entries of a palette
    file loaded through ``readPalette``.

    NOTE(review): the GraphML data keys are hard-coded ("d4", "d6", "d10" for
    graphs; "d4", "d7", "d9", "d12" for the palette). They presumably match
    the key declarations of the files this tool was written for -- confirm
    before using it with other files.
    """

    # Text type used to normalise XML values: ``unicode`` on Python 2, where
    # the name exists, otherwise ``str``.
    try:
        _text = unicode
    except NameError:
        _text = str

    # Class-level declarations are kept so that ``Graph.<name>`` still
    # resolves; __init__ gives every instance its own containers.
    label2Ids = {}  # label -> node ID
    internalId2nodesID = {}  # per-file internal ID -> global node ID
    edges = set()  # set of Edge objects
    id2label = {}  # node ID -> label
    edgeStyles = {}  # edge type -> (line style attributes, arrow attributes)
    edgeDescription = {}  # edge type -> description text from the palette
    graphURI = ""
    nodeStyles = {}  # node type -> (style, fill, border, configuration/shape)
    nodeDescription = {}  # node type -> description text from the palette

    def __init__(self, dispensor):
        """Create an empty graph.

        dispensor -- object whose ``getID()`` returns a fresh node ID.

        Every container is created per instance. Previously only ``id2nodes``
        and ``edges`` were; all other lookup tables lived on the class and
        were silently shared by all graphs.
        """
        self.dispensor = dispensor
        self.graphURI = ""
        self.id2nodes = {}  # node ID -> Node
        self.edges = set()
        self.label2Ids = {}
        self.internalId2nodesID = {}
        self.id2label = {}
        self.edgeStyles = {}
        self.edgeDescription = {}
        self.nodeStyles = {}
        self.nodeDescription = {}

    # ------------------------------------------------------------ helpers

    @staticmethod
    def _lastAttrib(element, path):
        """Return the attributes of the last element matching *path* below
        *element*, or None when nothing matches."""
        found = element.xpath(path, namespaces=namespaces)
        if not found:
            return None
        return found[-1].attrib

    def _nodeStyles(self, node, dataKey):
        """Return one (style, fill, border, config) tuple per y:GenericNode
        ("g") and per y:ShapeNode ("s") stored under data key *dataKey*.

        For generic nodes ``config`` is the "configuration" attribute, for
        shape nodes it is the "type" attribute of the last y:Shape child.
        Generic nodes are listed before shape nodes.
        """
        styles = []
        prefix = './graphML:data[@key="%s"]/' % dataKey

        for gn in node.xpath(prefix + 'y:GenericNode', namespaces=namespaces):
            styles.append(("g",
                           self._lastAttrib(gn, './y:Fill'),
                           self._lastAttrib(gn, './y:BorderStyle'),
                           gn.attrib.get("configuration")))

        for sn in node.xpath(prefix + 'y:ShapeNode', namespaces=namespaces):
            shapeType = None
            for shape in sn.xpath('./y:Shape', namespaces=namespaces):
                shapeType = shape.attrib.get("type")
            styles.append(("s",
                           self._lastAttrib(sn, './y:Fill'),
                           self._lastAttrib(sn, './y:BorderStyle'),
                           shapeType))
        return styles

    @staticmethod
    def _asText(value):
        """Return *value* as text: None becomes the empty string, byte
        strings are decoded as UTF-8, anything else is formatted with %s."""
        if value is None:
            return u""
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return u"%s" % (value,)

    @staticmethod
    def _rdfEscape(text):
        """Escape backslash, double quote and line breaks for use inside a
        double-quoted literal."""
        return (text.replace(u"\\", u"\\\\")
                    .replace(u'"', u'\\"')
                    .replace(u"\n", u"\\n")
                    .replace(u"\r", u"\\r"))

    # ------------------------------------------------------------ reading

    def convertGrahml(self, filename):
        """Convert a GraphML file into a network of nodes and edges.

        Layout is mapped to node and edge types by comparing it with the
        palette (see ``readPalette``). Nodes are added to ``id2nodes``,
        ``label2Ids`` and ``internalId2nodesID``; edges to ``edges``.
        """
        tree = etree.parse(filename)

        for node in tree.xpath("//graphML:node", namespaces=namespaces):
            newNode = Node(self.dispensor.getID())

            # A missing id no longer reuses the id of the previous node.
            internalID = node.get("id")
            if internalID is not None:
                newNode.internalID = self._text(internalID)

            # The last y:NodeLabel wins; an empty label yields "" instead of
            # the text "None".
            labelString = None
            for label in node.xpath(".//y:NodeLabel", namespaces=namespaces):
                labelString = self._text(label.text or u"").strip()
            newNode.label = labelString

            for nodeRef in node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces):
                nodeRefString = nodeRef.text
                if nodeRefString is None:
                    continue

                # hack (dw): apparently some wrong URLs are in the data; skip
                # references that contain "http" anywhere but at the start.
                if nodeRefString.find("http") > 0:
                    continue
                # Leading whitespace is now ignored for all three schemes.
                if nodeRefString.lstrip().startswith(("http:", "https:", "ftp:")):
                    newNode.externalRef = nodeRefString
                else:
                    newNode.internalRef = nodeRefString

            # Read styles; the last style found decides the type.
            typeID = None
            for style, fs, bs, config in self._nodeStyles(node, "d6"):
                typeID = self.getNodeTypeFromPalette(style, fs, bs, config)
            if typeID is None:
                typeID = -1
            newNode.nodeType = typeID

            self.id2nodes[newNode.numId] = newNode
            if labelString is not None:
                self.label2Ids[labelString] = newNode.numId
            if internalID is not None:
                self.internalId2nodesID[newNode.internalID] = newNode.numId

        for edge in tree.xpath("//graphML:edge", namespaces=namespaces):
            # Missing attributes give None end points rather than the end
            # points of the previous edge.
            source = edge.get("source")
            if source is not None:
                source = self._text(source)
            target = edge.get("target")
            if target is not None:
                target = self._text(target)

            ls = self._lastAttrib(edge, './graphML:data[@key="d10"]/.//y:LineStyle')
            ars = self._lastAttrib(edge, './graphML:data[@key="d10"]/.//y:Arrows')

            newEdge = Edge(self.internalId2nodesID.get(source),
                           self.internalId2nodesID.get(target))
            newEdge.edgeType = self.getTypeFromPalette(ls, ars)
            self.edges.add(newEdge)

    def getNodeTypeFromPalette(self, style, fs, bs, config):
        """Return the key of the first entry of ``nodeStyles`` that matches
        the given style, fill, border and configuration, or -1."""
        for key, value in self.nodeStyles.items():
            styleVorlage, fsVorlage, bsVorlage, configVorlage = value
            if style != styleVorlage or config != configVorlage:
                continue
            if self.cmpDict(fs, fsVorlage) and self.cmpDict(bs, bsVorlage):
                print(key)  # trace output kept from the original
                return key
        return -1

    def getTypeFromPalette(self, ls, ars):
        """Return the key of the first entry of ``edgeStyles`` that matches
        the given line style and arrows, or -1."""
        for key, value in self.edgeStyles.items():
            lsVorlage, arsVorlage = value
            if self.cmpDict(ls, lsVorlage) and self.cmpDict(ars, arsVorlage):
                return key
        return -1

    def cmpDict(self, x, y):
        """Return True when every key of *x* has the same value in *y*.

        Keys that exist only in *y* are ignored. Returns False when either
        argument is None.
        """
        if (x is None) or (y is None):
            return False
        for key in x.keys():
            if y.get(key, None) != x.get(key):
                return False
        return True

    # ---------------------------------------------------------- exporting

    def exportAsDot(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to *filename* in DOT format, encoded as UTF-8.

        graphName    -- name used in the ``digraph`` header.
        onlyMs       -- when true, only nodes whose label starts with "Add"
                        are written (edges are always written).
        partOfGraph  -- optional mapping node ID -> iterable of graph names;
                        adds one "G_<name>" node per graph plus an edge from
                        it to the node.
        linksToGraph -- optional iterable of (node ID, graph path) pairs;
                        adds an edge from the node to "G_<last path part>".

        Types missing from the style tables use the style of type -1, and
        double quotes in labels are escaped.
        """
        type2NodeShape = {0: u'style="solid" color="blue"',
                          1: u'style="solid" color="lightblue"',
                          2: u'style="solid" color="blue"',
                          3: u'style="tapered" color="orange"',
                          4: u'style="solid" color="green"',
                          5: u'style="solid" color="sienna"',
                          6: u'style="solid" color="magenta"',
                          -1: u'style="dotted" color="red"'}

        type2EdgeShape = {0: u'style="dotted" color="blue"',
                          1: u'style="solid"',
                          2: u'style="bold"',
                          3: u'style="tapered"',
                          4: u'style="solid" color="green"',
                          5: u'style="solid" color="sienna"',
                          6: u'style="solid" color="magenta"',
                          -1: u'style="dotted" color="red"'}

        asText = self._asText
        graphDecl = u'G_%s  [label="%s" color="green" fillcolor="green" style="filled"];\n'

        with io.open(filename, "w", encoding="utf-8") as out:
            out.write(u"digraph %s {" % asText(graphName))

            for key, value in self.id2nodes.items():
                label = asText(value.label).replace(u"\n", u"")
                if onlyMs and not label.lstrip().startswith(u"Add"):
                    continue
                shape = type2NodeShape.get(value.nodeType, type2NodeShape[-1])
                out.write(u'%s [label="%s" URL="%s" %s];\n'
                          % (key, label.replace(u'"', u'\\"'), asText(value.externalRef), shape))

            for edge in self.edges:
                shape = type2EdgeShape.get(edge.edgeType, type2EdgeShape[-1])
                out.write(u"%s -> %s [%s];\n" % (edge.src, edge.target, shape))

            if partOfGraph is not None:
                for nodeID, graphList in partOfGraph.items():
                    for graph in graphList:
                        graphText = asText(graph)
                        graphNode = graphText.replace(u".", u"_")
                        out.write(u'G_%s -> %s [color="yellow"];\n' % (graphNode, nodeID))
                        out.write(graphDecl % (graphNode, graphText))

            if linksToGraph is not None:
                for nodeID, graph in linksToGraph:
                    gr = asText(graph).split(u"/")[-1]
                    print(graph)  # trace output kept from the original
                    print(gr)
                    graphNode = gr.replace(u".", u"_")
                    out.write(u'%s -> G_%s [color="green"];\n' % (nodeID, graphNode))
                    out.write(graphDecl % (graphNode, gr))

            out.write(u"}")

    def exportAsRDF(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to *filename* as one triple per line, encoded as
        UTF-8.

        graphName    -- unused; kept for signature parity with exportAsDot.
        onlyMs       -- when true, only nodes whose label starts with "Add"
                        are written (edges are always written).
        partOfGraph  -- optional mapping node ID -> iterable of graph names;
                        writes an "is_part_of" triple per pair and declares
                        the graph as a "HarriotGraph".
        linksToGraph -- optional iterable of (node ID, graph path) pairs;
                        writes "is_specified_in" when the source node is a
                        Topic, otherwise "see_also".

        Types missing from the tables map to "UNKNOWN" / "is_related_to".
        """
        base = u"http://example.org/harriotOnt/"
        rdfsLabel = u"http://www.w3.org/2000/01/rdf-schema#label"
        rdfType = u"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

        type2NodeShape = {0: base + u"Topic",
                          1: base + u"Topic",
                          2: base + u"Topic",
                          3: base + u"FolioPage",
                          4: base + u"4",
                          5: base + u"RelatedFolioPage",
                          -1: base + u"UNKNOWN"}

        type2EdgeShape = {0: base + u"has_prev_by_pagination",
                          1: base + u"has_conjectural_relation",
                          2: base + u"has_prev_by_conjection",
                          3: base + u"has_prev_by_conjection",
                          4: base + u"result_used_from",
                          5: base + u"result_used_from",
                          -1: base + u"is_related_to"}

        asText = self._asText
        escape = self._rdfEscape
        triple = u"<%s><%s><%s>.\n"
        labelLine = u'<%s> <%s> "%s".\n'
        typeLine = u"<%s> <%s><%s>.\n"

        with io.open(filename, "w", encoding="utf-8") as out:
            for key, value in self.id2nodes.items():
                label = asText(value.label).replace(u"\n", u"")
                url = asText(value.externalRef)
                if onlyMs and not label.lstrip().startswith(u"Add"):
                    continue

                ressourceURI = base + asText(key)
                s = u""
                if label != u"":
                    s += labelLine % (ressourceURI, rdfsLabel, escape(label))
                if url != u"":
                    s += u"<%s> <%s> <%s>.\n" % (ressourceURI, base + u"describes", url)
                print(value.nodeType)  # trace output kept from the original
                s += typeLine % (ressourceURI, rdfType,
                                 type2NodeShape.get(value.nodeType, type2NodeShape[-1]))
                out.write(s)

            for edge in self.edges:
                out.write(triple % (base + asText(edge.src),
                                    type2EdgeShape.get(edge.edgeType, type2EdgeShape[-1]),
                                    base + asText(edge.target)))

            if partOfGraph is not None:
                for nodeID, graphList in partOfGraph.items():
                    for graph in graphList:
                        graphText = asText(graph)
                        ressourceURI = base + graphText
                        out.write(triple % (base + asText(nodeID), base + u"is_part_of", ressourceURI))
                        out.write(labelLine % (ressourceURI, rdfsLabel, escape(graphText)))
                        out.write(typeLine % (ressourceURI, rdfType, base + u"HarriotGraph"))

            if linksToGraph is not None:
                for nodeID, graph in linksToGraph:
                    gr = asText(graph).split(u"/")[-1]
                    print(graph)  # trace output kept from the original
                    print(gr)
                    ressourceURI = base + gr

                    # The relation depends on the type of the source node,
                    # not on its ID.
                    node = self.id2nodes.get(nodeID)
                    typeSrc = None
                    if node is not None:
                        typeSrc = type2NodeShape.get(node.nodeType)
                    if typeSrc == base + u"Topic":
                        relation = u"is_specified_in"
                    else:
                        relation = u"see_also"

                    # The predicate gets the ontology base like all others.
                    out.write(triple % (base + asText(nodeID), base + relation, ressourceURI))
                    out.write(labelLine % (ressourceURI, rdfsLabel, escape(gr)))
                    out.write(typeLine % (ressourceURI, rdfType, base + u"HarriotGraph"))

    # ------------------------------------------------------------ palette

    def readPalette(self, palettePath):
        """Read the palette file and fill the style and description tables.

        Every edge of the palette becomes one edge type; every y:GenericNode
        or y:ShapeNode becomes one node type. Both are numbered from 0 in
        document order. A missing description is stored as "" and a missing
        style element as None (which never matches in ``cmpDict``).
        """
        palette = etree.parse(palettePath)

        # Load all edges of the palette; relevant for the classification are
        # line style and arrows below data key d12.
        typeNr = 0
        for edge in palette.xpath("//graphML:edge", namespaces=namespaces):
            ls = self._lastAttrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:LineStyle')
            ars = self._lastAttrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:Arrows')

            ds = ""
            for description in edge.xpath('./graphML:data[@key="d9"]', namespaces=namespaces):
                ds = description.text

            self.edgeDescription[typeNr] = ds
            self.edgeStyles[typeNr] = (ls, ars)
            typeNr += 1

        typeNr = 0
        for node in palette.xpath("//graphML:node", namespaces=namespaces):
            nodeStyles = self._nodeStyles(node, "d7")
            if not nodeStyles:
                continue

            ds = ""
            for description in node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces):
                ds = description.text

            for nodeStyle in nodeStyles:
                self.nodeDescription[typeNr] = ds
                self.nodeStyles[typeNr] = nodeStyle
                typeNr += 1
        
def merge(graphs, dispensor):
    """Merge several graphs into one; nodes with equal labels become one node.

    graphs    -- iterable of Graph objects; graphs whose ``graphURI`` is
                 'supermap.graphml' are skipped.
    dispensor -- object whose ``getID()`` returns a fresh node ID.

    Returns a tuple ``(mg, partOfGraph, linksToGraph)``:
      mg           -- the merged Graph,
      partOfGraph  -- dict: merged node ID -> set of graphURIs containing it,
      linksToGraph -- set of (merged node ID, internalRef) pairs for all
                      nodes with a non-empty ``internalRef``.

    The input graphs are left untouched: edges are copied before their end
    points are rewritten to the merged IDs.

    NOTE(review): when several nodes share a label, the Node object seen last
    is the one stored in ``mg.id2nodes``, and its ``numId`` still holds the
    pre-merge ID.
    """
    excludedGraphs = ['supermap.graphml']
    partOfGraph = {}
    linksToGraph = set()
    mergedEdges = set()

    mg = Graph(dispensor)
    for g in graphs:
        if g.graphURI in excludedGraphs:
            continue

        idalt2neu = {}  # old node ID -> merged node ID, for this graph only
        for node in g.id2nodes.values():
            label = node.label
            # Fetch the ID if the label is already known, otherwise request a
            # new one (only then, so no IDs are wasted).
            if label in mg.label2Ids:
                currentID = mg.label2Ids[label]
            else:
                currentID = dispensor.getID()
                mg.label2Ids[label] = currentID

            mg.id2label[currentID] = label
            idalt2neu[node.numId] = currentID
            mg.id2nodes[currentID] = node

            if node.internalRef != "":
                linksToGraph.add((currentID, node.internalRef))

            partOfGraph.setdefault(currentID, set()).add(g.graphURI)

        for edge in g.edges:
            # Shallow copy keeps edgeType while protecting the source graph.
            mergedEdge = copy.copy(edge)
            mergedEdge.src = idalt2neu.get(edge.src)
            mergedEdge.target = idalt2neu.get(edge.target)
            mergedEdges.add(mergedEdge)

    mg.edges = mergedEdges
    return mg, partOfGraph, linksToGraph

if __name__ == '__main__':
    # Script entry: convert every graph file of one directory, merge the
    # graphs and write the result as DOT, as RDF and as a plain label list.
    dispensor = IDDispensor()

    palettePath = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml"
    path = "/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120523/"

    # A sorted list keeps the processing order stable between runs.
    graphs = []
    for name in sorted(os.listdir(path)):
        graphPath = os.path.join(path, name)
        # Hidden entries (e.g. .DS_Store) and sub-directories are not graphs
        # and would only make the XML parser fail.
        if name.startswith(".") or not os.path.isfile(graphPath):
            continue

        g1 = Graph(dispensor)
        g1.readPalette(palettePath)
        g1.convertGrahml(graphPath)
        g1.graphURI = name
        graphs.append(g1)

    g, po, lg = merge(graphs, dispensor)

    g.exportAsDot("/tmp/out.dot", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)
    g.exportAsRDF("/tmp/out.rdf", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)

    # One quoted label per line. Writing UTF-8 text keeps labels with
    # non-ASCII characters, which used to be dropped silently.
    with io.open("/tmp/out.txt", "w", encoding="utf-8") as out2:
        for key in g.label2Ids.keys():
            if key is None:
                continue
            if isinstance(key, bytes):
                key = key.decode("utf-8", "replace")
            out2.write(u'"%s"\n' % (key,))