changeset 6:6b51bd2418b9

Version 0.3 can now deal with graphs and subgraphs as used in the Harriot analysis.
author dwinter
date Tue, 07 Aug 2012 14:02:56 +0200
parents e661aabed2f9
children 768ade75c895
files graphML2RDF.py
diffstat 1 files changed, 171 insertions(+), 44 deletions(-) [+]
line wrap: on
line diff
--- a/graphML2RDF.py	Mon Jul 30 16:36:05 2012 +0200
+++ b/graphML2RDF.py	Tue Aug 07 14:02:56 2012 +0200
@@ -3,7 +3,7 @@
 from lxml import etree
 import os.path
 import os
-
+import logging
 
 namespaces={'graphML':'http://graphml.graphdrawing.org/xmlns',
             'y':'http://www.yworks.com/xml/graphml'
@@ -53,36 +53,58 @@
     graphURI=""
     nodeStyles={}
     nodeDescription={}
+    partOfGraph={}
+    isSubGraphOf={}
   
     def __init__(self,dispensor):
         self.dispensor=dispensor;
         self.id2nodes={};
         self.edges=set();
-        
+        self.partOfGraph={};
+        self.isSubGraphOf={}
+        self.startGraphId=""
     
-    def convertGrahml(self,filename):
-        """Konvertiert ein Grahphml-File in ein Netzwerk mit Knoten und Kanten.
-        Die Abbildung von Layout auf Knoten- und Kantentypen erfolgt durch den Vergleich des Layout mit einer Palette
-        """
-        tree = etree.parse(filename)
+    def readGraph(self,graphNode,partOf="main"):
+        
+        nodes=graphNode.xpath("./graphML:node",namespaces=namespaces)
         
-        nodes=tree.xpath("//graphML:node",namespaces=namespaces)
+        graphIDs=graphNode.xpath("@id",namespaces=namespaces)
+        for graphID in graphIDs:
+            graphIDString = unicode(graphID)
+            
+        
         
+        if partOf=="main": ##startgraph
+            self.startGraphId=graphIDString
+        else:
+            self.isSubGraphOf[graphIDString]=partOf
+            
         for node in nodes:
             nodeIDs=node.xpath("@id",namespaces=namespaces)
-            labels=node.xpath(".//y:NodeLabel",namespaces=namespaces)
+            
+            
+            #labels=node.xpath(".//y:NodeLabel",namespaces=namespaces)
+            labels=node.xpath('./graphML:data[@key="d6"]/y:*/y:NodeLabel',namespaces=namespaces)
+            
             for nodeID in nodeIDs:
                 nodeIDString=unicode(nodeID)
-            
+                
+                
             labelString=None
             for label in labels:
                 labelString=unicode(label.text).lstrip().rstrip()
                 
             
             newNode = Node(self.dispensor.getID())
-            newNode.label=labelString
+            if labelString!=None:
+                newNode.label=labelString
+            else:
+                newNode.label="NODE:"+str(newNode.numId)
+            
             newNode.internalID=nodeIDString
             
+           
+            
             nodeRefs=node.xpath('./graphML:data[@key="d4"]',namespaces=namespaces)
             #nodeRefs=node.xpath("./graphML:data",namespaces=namespaces)
             for nodeRef in nodeRefs:
@@ -152,13 +174,24 @@
                 typeID=-1
             newNode.nodeType=typeID
             self.id2nodes[newNode.numId]=newNode
+            
+            self.partOfGraph[newNode.numId]=graphIDString #speichere node ist teil von
+            
             if labelString!=None:
                 self.label2Ids[labelString]=newNode.numId
             
             
             self.internalId2nodesID[newNode.internalID]=newNode.numId
             
-        edges=tree.xpath("//graphML:edge",namespaces=namespaces)
+            
+            #suche nach subgraphen
+            graphs=node.xpath("./graphML:graph",namespaces=namespaces)
+           
+            for graph in graphs:
+                self.readGraph(graph,graphIDString)
+        
+            
+        edges=graphNode.xpath("./graphML:edge",namespaces=namespaces)
         
         for edge in edges:
             srcIDs=edge.xpath("@source",namespaces=namespaces)
@@ -191,6 +224,21 @@
             
             self.edges.add(newEdge) 
     
+      
+        
+    def convertGraphml(self,filename):
+        """Konvertiert ein Grahphml-File in ein Netzwerk mit Knoten und Kanten.
+        Die Abbildung von Layout auf Knoten- und Kantentypen erfolgt durch den Vergleich des Layout mit einer Palette
+        """
+        print "converting:"+filename
+        
+        tree = etree.parse(filename)
+        
+        #lese hauptgraphen
+        maingraphs=tree.xpath('./graphML:graph',namespaces=namespaces)
+        for maingraph in maingraphs:
+            self.readGraph(maingraph)
+        
     
     def getNodeTypeFromPalette(self,style,fs,bs,config):
         for key,value in self.nodeStyles.items():
@@ -264,7 +312,10 @@
             #name=value.label.replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_")
             #name=value.numId
             name=key
-            label=value.label.replace("\n","")
+            if value.label==None:
+                label="EMPTYLABEL"
+            else:
+                label=value.label.replace("\n","")
             url=value.externalRef
             
             if onlyMs:
@@ -274,7 +325,7 @@
                 s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.decode("utf-8"),url,type2NodeShape.get(value.nodeType))
                 out.write(s)
             except:
-                s="""%s [label="%s" URL="%s" %s];\n"""%(name,repr(label),url,type2NodeShape.get(value.nodeType))
+                s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.encode("utf-8"),url,type2NodeShape.get(value.nodeType))
                 out.write(s)
             
             
@@ -292,11 +343,32 @@
                 pass
             
         
+        toGraphs=set() #sammle alle graphen
+        for fromNode,toGraph in g.partOfGraph.items():
+            
+            s = """%s -> %s [color="blue"];\n"""%(fromNode,toGraph)
+            toGraphs.add(toGraph)
+            out.write(s)
+            
+        for fromNode,toGraph in g.isSubGraphOf.items():
+            s = """%s -> %s [color="blue"];\n"""%(fromNode,toGraph)
+            toGraphs.add(toGraph)
+            out.write(s)
+           
+        for toGraph in toGraphs:
+            s = """%s  [label="%s" color="blue" fillcolor="blue" style="filled"];\n"""%(str(toGraph).replace(".","_"),toGraph)
+            
+            out.write(s)
+            
+        
+        
+        
+        
         if not partOfGraph is None:
             for nodeID,graphList in partOfGraph.items():
                 #fromNode=g.id2label.get(nodeID).replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_").replace("-","_")
                 fromNode=nodeID
-                for graph in graphList:
+                for graph in [graphList]:
                     try:
                         s = """G_%s -> %s [color="yellow"];\n"""%(graph.replace(".","_"),fromNode)
                         out.write(s)
@@ -327,11 +399,11 @@
         out.close()
         
         
-    def exportAsRDF(self,filename,graphName,onlyMs=False,partOfGraph=None,linksToGraph=None):
+    def exportAsRDF(self,filename,graphName,onlyMs=False,partOfGraph=None,linksToGraph=None,describe=True):
         out = file(filename,"w")
     
-        base="http://ontologies.mpiwg-berlin.mpg.de/reasearch/harriot.owl/1.0/"
-        ressourceBase="http://entities.mpiwg-berlin.mpg.de/reasearch/harriot.owl/1.0/"
+        base="http://ontologies.mpiwg-berlin.mpg.de/research/harriot.owl/"
+        ressourceBase="http://entities.mpiwg-berlin.mpg.de/research/harriot.owl/"
         type2NodeShape={0: base+"Topic",
                         1: base+"Topic",
                         2: base+"Topic",
@@ -355,7 +427,10 @@
             #name=value.label.replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_")
             #name=value.numId
             name=key
-            label=value.label.replace("\n","")
+            if value.label==None:
+                label="EMPTYLABEL3"
+            else:
+                label=value.label.replace("\n","")
             url=value.externalRef
             
             if onlyMs:
@@ -367,15 +442,16 @@
                 s=""
                 if label!="":
                     s+="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,label.decode("utf-8").replace('"','\"'))
-                if url!="":
-                    s+="""<%s> <%s> <%s>.\n"""%(ressourceURI,base+"describes",url)
+                if url!="" and describe:
+                    s+="""<%s> <%s> <%s>.\n"""%(ressourceURI,base+"describes",url.lstrip().rstrip())
                 print value.nodeType
                 s+="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceURI,type2NodeShape.get(value.nodeType))          
                 #s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.decode("utf-8"),url,type2NodeShape.get(value.nodeType))
                 out.write(s)
             except:
                 if label!="":
-                    s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,'CHECK_THIS')
+                    #s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,'CHECK_THIS')
+                    s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,label.encode("utf-8").replace('"','\"'))
                     #s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s"."""%(ressourceURI,repr(label).replace('"','\"'))
                 if url!="":    
                     s+="""<%s> <%s> <%s>."""%(ressourceURI,base+"describes",url)
@@ -397,21 +473,36 @@
                 out.write(s)
             except:
                 pass
+        
+        
+        toGraphs=set() #sammle alle graphen
+        for fromNode,toGraph in g.partOfGraph.items():
+            s="""<%s> <%s> <%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of_graph",ressourceBase+str(toGraph))  
+            toGraphs.add(toGraph)
+            out.write(s)
             
-        
+        for fromNode,toGraph in g.isSubGraphOf.items():
+            s="""<%s> <%s> <%s>.\n"""%(ressourceBase+str(fromNode),base+"is_subGraph_of",ressourceBase+str(toGraph))
+            toGraphs.add(toGraph)
+            out.write(s)
+           
+        for toGraph in toGraphs:
+            s="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceBase+str(toGraph),base+"SubGraph") 
+            out.write(s)
+            
         if not partOfGraph is None:
             for nodeID,graphList in partOfGraph.items():
                 #fromNode=g.id2label.get(nodeID).replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_").replace("-","_")
                 fromNode=nodeID
-                for graph in graphList:
+                for graph in [graphList]:
                     try:
-                        ressourceURI=ressourceBase+graph
-                        s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of",ressourceURI)
+                        ressourceURI=ressourceBase+graph.replace(" ","_")
+                        s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of_mainGraph",ressourceURI)
                         #s = """G_%s -> %s [color="yellow"];\n"""%(graph.replace(".","_"),fromNode)
                         out.write(s)
                         
                         s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,graph)
-                        s+="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceURI,base+"HarriotGraph")  
+                        s+="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceURI,base+"Graph")  
                         
                         #s = """G_%s  [label="%s" color="green" fillcolor="green" style="filled"];\n"""%(graph.replace(".","_"),graph)
                         out.write(s)
@@ -428,14 +519,14 @@
                 gr = splitted[-1]
                 print gr
                 ressourceURI=ressourceBase+gr
-                
+                ressourceURI.replace(" ","_")
                 
                
                 typeSrc=type2NodeShape.get(nodeID)
                 if typeSrc==base+"Topic":
-                    relation="is_specified_in"
+                    relation=base+"is_specified_in"
                 else:
-                    relation="see_also"
+                    relation=base+"see_also"
                 
                 try:
                     s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),relation,ressourceURI)
@@ -443,7 +534,7 @@
                     out.write(s)
                     
                     s="""<%s> <http://www.w3.org/2000/01/rdf-schema#label> "%s".\n"""%(ressourceURI,gr)
-                    s+="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceURI,base+"HarriotGraph")  
+                    s+="""<%s> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type><%s>.\n"""%(ressourceURI,base+"Graph")  
                        
                     #s = """G_%s  [label="%s" color="green" fillcolor="green" style="filled"];\n"""%(gr.replace(".","_"),gr)
                     out.write(s)
@@ -451,7 +542,6 @@
                     pass
         #out.write("}")
   
-   
         out.close()
     def readPalette(self,palettePath):
         typeNr=0
@@ -547,14 +637,20 @@
 def merge(graphs,dispensor):
     filter=['supermap.graphml']
     partOfGraph={}
+    partOfSubGraph={}
     linksToGraph=set()
     
     edges=set()
     mg =Graph(dispensor)
+    
+
     for g in graphs:
+        
         if g.graphURI in filter:
             continue
         idalt2neu={}
+        
+       
         for nodeid in g.id2nodes.keys():
             node=g.id2nodes.get(nodeid)
             label=node.label
@@ -569,11 +665,11 @@
             if node.internalRef!="":
                 linksToGraph.add((currentID,node.internalRef)) 
             
-            containedIn = partOfGraph.get(currentID,set())
+            #containedIn = partOfGraph.get(currentID,set())
             
             
-            containedIn.add(g.graphURI)
-            partOfGraph[currentID]=containedIn
+            #containedIn.add(g.graphURI)
+            #partOfGraph[currentID]=containedIn
             
         
         for edge in g.edges:
@@ -584,7 +680,33 @@
             edge.target=idalt2neu.get(target)
             edges.add(edge)
             
+        graphsOldToNew={}
+        for nodeID,toGraph in g.partOfGraph.items():
+            
+            node=g.id2nodes.get(nodeID)
+            label=node.label
+            currentID =mg.label2Ids.get(label,dispensor.getID()) #hole id wenn existent sonst neue
+          
+            #graphID = graphsOldToNew.get(graph,dispensor.getID()) #hole id wenn existent sonst neue
+            toGraphID = graphsOldToNew.get(toGraph,dispensor.getID()) #hole id wenn existent sonst neue
+            graphsOldToNew[toGraph]=toGraphID
+            mg.partOfGraph[currentID]=toGraphID
+        
+        
+        for fromGraph,toGraph in g.isSubGraphOf.items():
+            toGraphID = graphsOldToNew.get(toGraph,dispensor.getID()) #hole id wenn existent sonst neue
+            graphsOldToNew[toGraph]=toGraphID
+            
+            fromGraphID = graphsOldToNew.get(fromGraph,dispensor.getID()) #hole id wenn existent sonst neue
+            graphsOldToNew[fromGraph]=fromGraphID
+            mg.isSubGraphOf[fromGraphID]=toGraphID
+            
+        
+        startID=graphsOldToNew[g.startGraphId]
+        partOfGraph[startID]=g.graphURI
+       
     mg.edges=edges
+    #mg.partOfGraph=partOfSubGraph
     return mg,partOfGraph,linksToGraph
 
 if __name__ == '__main__':
@@ -595,19 +717,24 @@
     
     
     
-    path="/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120523/"
+    #path="/Users/dwinter/Documents/Projekte/Europeana/-graphml/Maps_20120523/"
+    path="/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120626/"
+    #path="/Users/dwinter/Documents/Projekte/Europeana/-graphml/Maps_short/"
     ls = os.listdir(path)
     graphs=set()
     
     for l in ls:
         
-        g1=Graph(dispensor)
-        g1.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml")
+   
+        try:
+            g1=Graph(dispensor)
+            g1.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml")
     
-        g1.convertGrahml(path+l)
-        g1.graphURI=l
-        graphs.add(g1)
-    
+            g1.convertGraphml(path+l)
+            g1.graphURI=l
+            graphs.add(g1)
+        except:
+            logging.error("Can't handle:"+l) 
     g,po,lg = merge(graphs,dispensor)
  
     
@@ -615,9 +742,9 @@
     #print len(g.label2Ids.keys())
     
     #g.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml")
-    #g.exportAsDot("/tmp/out.dot", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg)
+    #g.exportAsDot("/tmp/out.dot", "",onlyMs=False,partOfGraph=po,linksToGraph=lg)
     g.exportAsDot("/tmp/out.dot", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg)
-    g.exportAsRDF("/tmp/out.rdf", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg)
+    g.exportAsRDF("/tmp/out.rdf", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg,describe=True)
     
     out2 = file("/tmp/out.txt","w")
     for key in g.label2Ids.keys():