Mercurial > hg > graphML2RDF
view graphML2RDF.py @ 3:c97d43ac8fa6
first release
author | dwinter |
---|---|
date | Mon, 23 Jul 2012 09:57:32 +0200 |
parents | 8190d724dc01 |
children | e661aabed2f9 |
line wrap: on
line source
# This tool converts graphML files created with yEd into a graph that can be
# exported either in dot format or as RDF.
# TODO: lots of ... the mapping from graph type to ontology should be
# configurable.
import io
import logging
import os
import os.path
import sys

try:
    from lxml import etree
except ImportError:
    # lxml is only needed by the two parsing methods (Graph.convertGrahml and
    # Graph.readPalette); merging and exporting work without it.
    etree = None

try:
    _text = unicode  # Python 2
except NameError:
    _text = str  # Python 3: str already is the unicode text type

logger = logging.getLogger(__name__)

namespaces = {
    'graphML': 'http://graphml.graphdrawing.org/xmlns',
    'y': 'http://www.yworks.com/xml/graphml',
}

# Defaults used when the script is started without command line arguments.
DEFAULT_GRAPH_DIR = "/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120523/"
DEFAULT_PALETTE_PATH = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml"

# A reference starting with one of these prefixes is an external web page,
# everything else is treated as a reference to another graph.
_EXTERNAL_SCHEMES = ("http:", "https:", "ftp:")

# dot attributes per node type / edge type; -1 is the fallback.
_DOT_NODE_STYLES = {
    0: 'style="solid" color="blue"',
    1: 'style="solid" color="lightblue"',
    2: 'style="solid" color="blue"',
    3: 'style="tapered" color="orange"',
    4: 'style="solid" color="green"',
    5: 'style="solid" color="sienna"',
    6: 'style="solid" color="magenta"',
    -1: 'style="dotted" color="red"',
}
_DOT_EDGE_STYLES = {
    0: 'style="dotted" color="blue"',
    1: 'style="solid"',
    2: 'style="bold"',
    3: 'style="tapered"',
    4: 'style="solid" color="green"',
    5: 'style="solid" color="sienna"',
    6: 'style="solid" color="magenta"',
    -1: 'style="dotted" color="red"',
}

# RDF vocabulary; the local names below are appended to _RDF_BASE.
_RDF_BASE = u"http://example.org/harriotOnt/"
_RDFS_LABEL = u"http://www.w3.org/2000/01/rdf-schema#label"
_RDF_TYPE = u"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
_RDF_NODE_TYPES = {
    0: "Topic",
    1: "Topic",
    2: "Topic",
    3: "FolioPage",
    4: "4",
    5: "RelatedFolioPage",
    -1: "UNKNOWN",
}
_RDF_EDGE_TYPES = {
    0: "has_prev_by_pagination",
    1: "has_conjectural_relation",
    2: "has_prev_by_conjection",
    3: "has_prev_by_conjection",
    4: "result_used_from",
    5: "result_used_from",
    -1: "is_related_to",
}


def _parse_xml(path):
    """Parse the XML file at *path* with lxml and return the element tree."""
    if etree is None:
        raise ImportError("lxml is required to parse graphML files")
    return etree.parse(path)


def _last(items, default=None):
    """Return the last element of the list *items*, or *default* if empty."""
    return items[-1] if items else default


def _last_attrib(element, path):
    """Return the attributes of the last match of *path* below *element*.

    The attributes are returned as a plain dict; None is returned when
    nothing matches.
    """
    match = _last(element.xpath(path, namespaces=namespaces))
    # "is None" on purpose: lxml elements without children are falsy.
    return None if match is None else dict(match.attrib)


def _escape_dot(text):
    """Escape backslashes and double quotes for a double-quoted dot string."""
    return text.replace(u"\\", u"\\\\").replace(u'"', u'\\"')


def _escape_literal(text):
    """Escape *text* for use inside a double-quoted N-Triples literal."""
    replacements = (
        (u"\\", u"\\\\"),  # must come first, the others introduce backslashes
        (u'"', u'\\"'),
        (u"\n", u"\\n"),
        (u"\r", u"\\r"),
        (u"\t", u"\\t"),
    )
    for raw, escaped in replacements:
        text = text.replace(raw, escaped)
    return text


def _triple(subject, predicate, obj):
    """Format one statement whose object is a resource."""
    return u"<%s> <%s> <%s> .\n" % (subject, predicate, obj)


def _literal_triple(subject, predicate, value):
    """Format one statement whose object is a plain literal."""
    return u'<%s> <%s> "%s" .\n' % (subject, predicate, _escape_literal(value))


def _export_label(node):
    """Return the label of *node* as used by the exports (never None)."""
    return (node.label or u"").replace(u"\n", u"")


class Node(object):
    """All essential information about one node."""

    def __init__(self, numId):
        # Label in yEd. It is also used to identify equal nodes when more
        # than one graph is merged, so labels have to be kept consistent
        # when the graphs are drawn. None means the node has no label.
        self.label = ""
        # Id of the node inside its graph file; only unique per graph.
        self.internalID = ""
        # Reference to an external web page (data key "d4").
        self.externalRef = ""
        # Reference to another graph (data key "d4"). External and internal
        # references are told apart by whether the value starts with a URL
        # scheme.
        self.internalRef = ""
        # Id of the node; should be unique across all nodes.
        self.numId = numId
        # Type number from the palette, -1 if unknown.
        self.nodeType = -1


class Edge(object):
    """A directed edge between two node ids."""

    def __init__(self, src, target):
        self.src = src
        self.target = target
        # Type number from the palette, -1 if unknown.
        self.edgeType = -1


class IDDispensor(object):
    """Hands out ids for the nodes: 1, 2, 3, ..."""

    def __init__(self):
        self.currentID = 0

    def getID(self):
        """Return the next unused id."""
        self.currentID += 1
        return self.currentID


class Graph(object):
    """The graph itself: nodes, edges and the palette used to type them."""

    def __init__(self, dispensor):
        # All containers are created per instance. As class attributes they
        # would be shared by every Graph, so reading one file would leak
        # labels, internal ids and palette entries into all other graphs.
        self.dispensor = dispensor
        self.graphURI = ""
        self.label2Ids = {}           # label -> node id
        self.internalId2nodesID = {}  # id inside the graph file -> node id
        self.id2nodes = {}            # node id -> Node
        self.id2label = {}            # node id -> label
        self.edges = set()            # set of Edge
        self.edgeStyles = {}          # type number -> (line style, arrows)
        self.edgeDescription = {}     # type number -> description text
        self.nodeStyles = {}          # type number -> (kind, fill, border, config)
        self.nodeDescription = {}     # type number -> description text

    def convertGrahml(self, filename):
        """Convert a graphML file into a network of nodes and edges.

        Layout is mapped to node and edge types by comparing it with the
        palette, so readPalette() has to be called first; without a palette
        every node and edge gets type -1.
        """
        tree = _parse_xml(filename)
        for node in tree.xpath("//graphML:node", namespaces=namespaces):
            self._add_node(node)
        # Edges are resolved after all nodes so that every endpoint is known.
        for edge in tree.xpath("//graphML:edge", namespaces=namespaces):
            self._add_edge(edge)

    def _add_node(self, node):
        """Create a Node from one graphML node element and register it."""
        newNode = Node(self.dispensor.getID())

        internal_id = node.get("id")
        if internal_id is not None:
            newNode.internalID = _text(internal_id)

        # With several labels the last one wins. A missing or empty label
        # element leaves the node unlabeled (None) instead of "None".
        label_element = _last(node.xpath(".//y:NodeLabel", namespaces=namespaces))
        if label_element is not None and label_element.text is not None:
            newNode.label = _text(label_element.text).strip()
        else:
            newNode.label = None

        for ref in node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces):
            ref_text = ref.text
            if ref_text is None:
                continue
            # hack (dw): some files apparently contain broken URLs with
            # "http" somewhere after the start of the value; ignore them.
            if ref_text.find("http") > 0:
                continue
            if ref_text.lstrip().startswith(_EXTERNAL_SCHEMES):
                newNode.externalRef = ref_text
            else:
                newNode.internalRef = ref_text

        newNode.nodeType = self._node_type(node)
        self.id2nodes[newNode.numId] = newNode
        if newNode.label:
            self.label2Ids[newNode.label] = newNode.numId
        if internal_id is not None:
            self.internalId2nodesID[newNode.internalID] = newNode.numId

    def _node_type(self, node):
        """Look up the palette type of a graphML node element (-1 if none)."""
        typeID = None
        # case 1: generic node
        for gn in node.xpath('./graphML:data[@key="d6"]/y:GenericNode',
                             namespaces=namespaces):
            typeID = self.getNodeTypeFromPalette(
                "g",
                _last_attrib(gn, './y:Fill'),
                _last_attrib(gn, './y:BorderStyle'),
                gn.attrib.get("configuration"))
        # case 2: shape node
        for sn in node.xpath('./graphML:data[@key="d6"]/y:ShapeNode',
                             namespaces=namespaces):
            shape = _last(sn.xpath('./y:Shape', namespaces=namespaces))
            shapeType = None if shape is None else shape.attrib.get("type")
            typeID = self.getNodeTypeFromPalette(
                "s",
                _last_attrib(sn, './y:Fill'),
                _last_attrib(sn, './y:BorderStyle'),
                shapeType)
        return -1 if typeID is None else typeID

    def _add_edge(self, edge):
        """Create an Edge from one graphML edge element and register it."""
        source = edge.get("source")
        target = edge.get("target")
        if source is not None:
            source = _text(source)
        if target is not None:
            target = _text(target)
        ls = _last_attrib(edge, './graphML:data[@key="d10"]/.//y:LineStyle')
        ars = _last_attrib(edge, './graphML:data[@key="d10"]/.//y:Arrows')
        newEdge = Edge(self.internalId2nodesID.get(source),
                       self.internalId2nodesID.get(target))
        newEdge.edgeType = self.getTypeFromPalette(ls, ars)
        self.edges.add(newEdge)

    def getNodeTypeFromPalette(self, style, fs, bs, config):
        """Return the palette type number matching a node layout, or -1.

        style is "g" (generic node) or "s" (shape node), fs and bs are the
        fill and border style attributes, config is the configuration name
        (generic node) or the shape type (shape node). The lowest matching
        type number wins.
        """
        for key in sorted(self.nodeStyles):
            styleVorlage, fsVorlage, bsVorlage, configVorlage = self.nodeStyles[key]
            if style != styleVorlage or config != configVorlage:
                continue
            if self.cmpDict(fs, fsVorlage) and self.cmpDict(bs, bsVorlage):
                logger.debug("node type %s", key)
                return key
        return -1

    def getTypeFromPalette(self, ls, ars):
        """Return the palette type number matching an edge layout, or -1.

        ls and ars are the line style and arrow attributes of the edge. The
        lowest matching type number wins.
        """
        for key in sorted(self.edgeStyles):
            lsVorlage, arsVorlage = self.edgeStyles[key]
            if self.cmpDict(ls, lsVorlage) and self.cmpDict(ars, arsVorlage):
                return key
        return -1

    def cmpDict(self, x, y):
        """Test whether every entry of x is present with the same value in y.

        The comparison is one-directional: additional keys in y are ignored.
        Returns False if either argument is None.
        """
        if (x is None) or (y is None):
            return False
        for key in x.keys():
            if y.get(key, None) != x.get(key):
                return False
        return True

    def exportAsDot(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to *filename* in dot format (UTF-8).

        graphName    -- name of the digraph
        onlyMs       -- if true, only nodes whose label starts with "Add"
                        are written (edges are not filtered)
        partOfGraph  -- optional dict node id -> names of the graphs the
                        node is part of, as returned by merge()
        linksToGraph -- optional iterable of (node id, reference to another
                        graph) pairs, as returned by merge()
        """
        parts = [u"digraph %s {" % graphName]

        for key, node in self.id2nodes.items():
            label = _export_label(node)
            if onlyMs and not label.lstrip().startswith(u"Add"):
                continue
            parts.append(u'%s [label="%s" URL="%s" %s];\n' % (
                key, _escape_dot(label), _escape_dot(node.externalRef),
                _DOT_NODE_STYLES.get(node.nodeType, _DOT_NODE_STYLES[-1])))

        for edge in self.edges:
            # An edge whose endpoint could not be resolved would otherwise
            # create a bogus node named "None".
            if edge.src is None or edge.target is None:
                continue
            parts.append(u"%s -> %s [%s];\n" % (
                edge.src, edge.target,
                _DOT_EDGE_STYLES.get(edge.edgeType, _DOT_EDGE_STYLES[-1])))

        if partOfGraph is not None:
            for nodeID, graphList in partOfGraph.items():
                for graph in graphList:
                    graph_node = graph.replace(".", "_")
                    parts.append(u'G_%s -> %s [color="yellow"];\n' % (graph_node, nodeID))
                    parts.append(
                        u'G_%s [label="%s" color="green" fillcolor="green" style="filled"];\n'
                        % (graph_node, _escape_dot(graph)))

        if linksToGraph is not None:
            for nodeID, graph in linksToGraph:
                # Only the last path component names the linked graph.
                gr = graph.split("/")[-1]
                logger.debug("link to graph %s (%s)", graph, gr)
                graph_node = gr.replace(".", "_")
                parts.append(u'%s -> G_%s [color="green"];\n' % (nodeID, graph_node))
                parts.append(
                    u'G_%s [label="%s" color="green" fillcolor="green" style="filled"];\n'
                    % (graph_node, _escape_dot(gr)))

        parts.append(u"}")
        with io.open(filename, "w", encoding="utf-8") as out:
            out.write(u"".join(parts))

    def exportAsRDF(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to *filename* as RDF statements (UTF-8).

        One "<subject> <predicate> object ." statement is written per line.

        graphName    -- not used by this export; kept so that the signature
                        matches exportAsDot
        onlyMs       -- if true, only nodes whose label starts with "Add"
                        are written (edges are not filtered)
        partOfGraph  -- optional dict node id -> names of the graphs the
                        node is part of, as returned by merge()
        linksToGraph -- optional iterable of (node id, reference to another
                        graph) pairs, as returned by merge()
        """
        base = _RDF_BASE
        graph_type = base + u"HarriotGraph"
        parts = []

        for key, node in self.id2nodes.items():
            label = _export_label(node)
            if onlyMs and not label.lstrip().startswith(u"Add"):
                continue
            ressourceURI = base + _text(key)
            if label != u"":
                parts.append(_literal_triple(ressourceURI, _RDFS_LABEL, label))
            if node.externalRef != "":
                parts.append(_triple(ressourceURI, base + u"describes", node.externalRef))
            logger.debug("node type %s", node.nodeType)
            parts.append(_triple(
                ressourceURI, _RDF_TYPE,
                base + _RDF_NODE_TYPES.get(node.nodeType, _RDF_NODE_TYPES[-1])))

        for edge in self.edges:
            # Skip edges whose endpoints could not be resolved.
            if edge.src is None or edge.target is None:
                continue
            parts.append(_triple(
                base + _text(edge.src),
                base + _RDF_EDGE_TYPES.get(edge.edgeType, _RDF_EDGE_TYPES[-1]),
                base + _text(edge.target)))

        if partOfGraph is not None:
            for nodeID, graphList in partOfGraph.items():
                for graph in graphList:
                    ressourceURI = base + graph
                    parts.append(_triple(base + _text(nodeID), base + u"is_part_of", ressourceURI))
                    parts.append(_literal_triple(ressourceURI, _RDFS_LABEL, graph))
                    parts.append(_triple(ressourceURI, _RDF_TYPE, graph_type))

        if linksToGraph is not None:
            for nodeID, graph in linksToGraph:
                # Only the last path component names the linked graph.
                gr = graph.split("/")[-1]
                logger.debug("link to graph %s (%s)", graph, gr)
                ressourceURI = base + gr
                # The relation depends on the type of the linking node, so
                # the node has to be looked up first; its id is not a type.
                node = self.id2nodes.get(nodeID)
                nodeType = -1 if node is None else node.nodeType
                if _RDF_NODE_TYPES.get(nodeType) == "Topic":
                    relation = u"is_specified_in"
                else:
                    relation = u"see_also"
                parts.append(_triple(base + _text(nodeID), base + relation, ressourceURI))
                parts.append(_literal_triple(ressourceURI, _RDFS_LABEL, gr))
                parts.append(_triple(ressourceURI, _RDF_TYPE, graph_type))

        with io.open(filename, "w", encoding="utf-8") as out:
            out.write(u"".join(parts))

    def readPalette(self, palettePath):
        """Read the node and edge types from a yEd palette file.

        Every edge of the palette defines one edge type and every generic or
        shape node one node type; both are numbered from 0 in document
        order. A palette entry without a line style, arrows, fill or border
        is stored with None for that part and never matches.
        """
        palette = _parse_xml(palettePath)

        # Collect all edges of the palette. Relevant for the classification
        # are line style and arrows below data key "d12".
        edges = palette.xpath("//graphML:edge", namespaces=namespaces)
        for typeNr, edge in enumerate(edges):
            ls = _last_attrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:LineStyle')
            ars = _last_attrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:Arrows')
            description = _last(edge.xpath('./graphML:data[@key="d9"]', namespaces=namespaces))
            self.edgeDescription[typeNr] = "" if description is None else description.text
            self.edgeStyles[typeNr] = (ls, ars)

        typeNr = 0
        for node in palette.xpath("//graphML:node", namespaces=namespaces):
            description = _last(node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces))
            ds = "" if description is None else description.text
            # case 1: generic node
            for gn in node.xpath('./graphML:data[@key="d7"]/y:GenericNode',
                                 namespaces=namespaces):
                self.nodeDescription[typeNr] = ds
                self.nodeStyles[typeNr] = (
                    "g",
                    _last_attrib(gn, './y:Fill'),
                    _last_attrib(gn, './y:BorderStyle'),
                    gn.attrib.get("configuration"))
                typeNr += 1
            # case 2: shape node
            for sn in node.xpath('./graphML:data[@key="d7"]/y:ShapeNode',
                                 namespaces=namespaces):
                shape = _last(sn.xpath('./y:Shape', namespaces=namespaces))
                shapeType = None if shape is None else shape.attrib.get("type")
                self.nodeDescription[typeNr] = ds
                self.nodeStyles[typeNr] = (
                    "s",
                    _last_attrib(sn, './y:Fill'),
                    _last_attrib(sn, './y:BorderStyle'),
                    shapeType)
                typeNr += 1


def merge(graphs, dispensor, excluded=('supermap.graphml',)):
    """Merge several graphs into one; nodes with equal labels become one node.

    graphs    -- iterable of Graph objects; they are not modified
    dispensor -- IDDispensor that provides the ids of the merged nodes
    excluded  -- graphURI values of graphs that are left out

    Nodes without a label cannot be identified with each other, so each of
    them stays a node of its own. If several nodes share a label, the merged
    graph holds the Node object that was seen last.

    Returns a tuple (merged graph, partOfGraph, linksToGraph): partOfGraph
    maps a merged node id to the set of graphURI values the node occurs in,
    linksToGraph is a set of (merged node id, internalRef) pairs.
    """
    partOfGraph = {}
    linksToGraph = set()
    mg = Graph(dispensor)

    for g in graphs:
        if g.graphURI in excluded:
            continue
        idalt2neu = {}  # id in g -> id in the merged graph
        for oldID, node in g.id2nodes.items():
            label = node.label
            # Fetch the id if the label is already known, otherwise take a
            # new one (only then, so that no ids are wasted).
            if label and label in mg.label2Ids:
                currentID = mg.label2Ids[label]
            else:
                currentID = dispensor.getID()
                if label:
                    mg.label2Ids[label] = currentID
            mg.id2label[currentID] = label
            idalt2neu[oldID] = currentID
            mg.id2nodes[currentID] = node
            if node.internalRef != "":
                linksToGraph.add((currentID, node.internalRef))
            partOfGraph.setdefault(currentID, set()).add(g.graphURI)

        for edge in g.edges:
            # Build new edges instead of rewriting the ids of the input
            # edges, which would corrupt the source graph.
            merged = Edge(idalt2neu.get(edge.src), idalt2neu.get(edge.target))
            merged.edgeType = edge.edgeType
            mg.edges.add(merged)

    return mg, partOfGraph, linksToGraph


def main(argv=None):
    """Convert all graphML files of a directory to /tmp/out.{dot,rdf,txt}.

    argv -- optional arguments [graph directory [palette file]]; defaults to
            sys.argv[1:]. Missing arguments fall back to DEFAULT_GRAPH_DIR
            and DEFAULT_PALETTE_PATH.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    path = args[0] if len(args) > 0 else DEFAULT_GRAPH_DIR
    palettePath = args[1] if len(args) > 1 else DEFAULT_PALETTE_PATH

    dispensor = IDDispensor()
    graphs = []
    # Sorted so that the assigned ids do not depend on the directory order.
    for name in sorted(os.listdir(path)):
        # Skip everything that is not a graphML file (e.g. .DS_Store).
        if not name.lower().endswith(".graphml"):
            continue
        g1 = Graph(dispensor)
        g1.readPalette(palettePath)
        g1.convertGrahml(os.path.join(path, name))
        g1.graphURI = name
        graphs.append(g1)

    g, po, lg = merge(graphs, dispensor)
    g.exportAsDot("/tmp/out.dot", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)
    g.exportAsRDF("/tmp/out.rdf", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)

    # List of all labels, one quoted label per line.
    with io.open("/tmp/out.txt", "w", encoding="utf-8") as out2:
        for key in g.label2Ids:
            out2.write(u'"%s"\n' % key)


if __name__ == '__main__':
    main()