# HG changeset patch # User dwinter # Date 1344340976 -7200 # Node ID 6b51bd2418b9bb8c859ef38e1ebb6540199b1f10 # Parent e661aabed2f9dbd3c5e8459da7490d4d909f1913 version 0.3 can now deal with graphs and subgraphs as used in the Harriot analysis, diff -r e661aabed2f9 -r 6b51bd2418b9 graphML2RDF.py --- a/graphML2RDF.py Mon Jul 30 16:36:05 2012 +0200 +++ b/graphML2RDF.py Tue Aug 07 14:02:56 2012 +0200 @@ -3,7 +3,7 @@ from lxml import etree import os.path import os - +import logging namespaces={'graphML':'http://graphml.graphdrawing.org/xmlns', 'y':'http://www.yworks.com/xml/graphml' @@ -53,36 +53,58 @@ graphURI="" nodeStyles={} nodeDescription={} + partOfGraph={} + isSubGraphOf={} def __init__(self,dispensor): self.dispensor=dispensor; self.id2nodes={}; self.edges=set(); - + self.partOfGraph={}; + self.isSubGraphOf={} + self.startGraphId="" - def convertGrahml(self,filename): - """Konvertiert ein Grahphml-File in ein Netzwerk mit Knoten und Kanten. - Die Abbildung von Layout auf Knoten- und Kantentypen erfolgt durch den Vergleich des Layout mit einer Palette - """ - tree = etree.parse(filename) + def readGraph(self,graphNode,partOf="main"): + + nodes=graphNode.xpath("./graphML:node",namespaces=namespaces) - nodes=tree.xpath("//graphML:node",namespaces=namespaces) + graphIDs=graphNode.xpath("@id",namespaces=namespaces) + for graphID in graphIDs: + graphIDString = unicode(graphID) + + + if partOf=="main": ##startgraph + self.startGraphId=graphIDString + else: + self.isSubGraphOf[graphIDString]=partOf + for node in nodes: nodeIDs=node.xpath("@id",namespaces=namespaces) - labels=node.xpath(".//y:NodeLabel",namespaces=namespaces) + + + #labels=node.xpath(".//y:NodeLabel",namespaces=namespaces) + labels=node.xpath('./graphML:data[@key="d6"]/y:*/y:NodeLabel',namespaces=namespaces) + for nodeID in nodeIDs: nodeIDString=unicode(nodeID) - + + labelString=None for label in labels: labelString=unicode(label.text).lstrip().rstrip() newNode = Node(self.dispensor.getID()) - newNode.label=labelString + if labelString!=None: + newNode.label=labelString + else: + newNode.label="NODE:"+str(newNode.numId) + newNode.internalID=nodeIDString + + nodeRefs=node.xpath('./graphML:data[@key="d4"]',namespaces=namespaces) #nodeRefs=node.xpath("./graphML:data",namespaces=namespaces) for nodeRef in nodeRefs: @@ -152,13 +174,24 @@ typeID=-1 newNode.nodeType=typeID self.id2nodes[newNode.numId]=newNode + + self.partOfGraph[newNode.numId]=graphIDString #speichere node ist teil von + if labelString!=None: self.label2Ids[labelString]=newNode.numId self.internalId2nodesID[newNode.internalID]=newNode.numId - edges=tree.xpath("//graphML:edge",namespaces=namespaces) + + #suche nach subgraphen + graphs=node.xpath("./graphML:graph",namespaces=namespaces) + + for graph in graphs: + self.readGraph(graph,graphIDString) + + + edges=graphNode.xpath("./graphML:edge",namespaces=namespaces) for edge in edges: srcIDs=edge.xpath("@source",namespaces=namespaces) @@ -191,6 +224,21 @@ self.edges.add(newEdge) + + + def convertGraphml(self,filename): + """Konvertiert ein Grahphml-File in ein Netzwerk mit Knoten und Kanten. + Die Abbildung von Layout auf Knoten- und Kantentypen erfolgt durch den Vergleich des Layout mit einer Palette + """ + print "converting:"+filename + + tree = etree.parse(filename) + + #lese hauptgraphen + maingraphs=tree.xpath('./graphML:graph',namespaces=namespaces) + for maingraph in maingraphs: + self.readGraph(maingraph) + def getNodeTypeFromPalette(self,style,fs,bs,config): for key,value in self.nodeStyles.items(): @@ -264,7 +312,10 @@ #name=value.label.replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_") #name=value.numId name=key - label=value.label.replace("\n","") + if value.label==None: + label="EMPTYLABEL" + else: + label=value.label.replace("\n","") url=value.externalRef if onlyMs: @@ -274,7 +325,7 @@ s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.decode("utf-8"),url,type2NodeShape.get(value.nodeType)) out.write(s) except: - s="""%s [label="%s" URL="%s" %s];\n"""%(name,repr(label),url,type2NodeShape.get(value.nodeType)) + s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.encode("utf-8"),url,type2NodeShape.get(value.nodeType)) out.write(s) @@ -292,11 +343,32 @@ pass + toGraphs=set() #sammle alle graphen + for fromNode,toGraph in g.partOfGraph.items(): + + s = """%s -> %s [color="blue"];\n"""%(fromNode,toGraph) + toGraphs.add(toGraph) + out.write(s) + + for fromNode,toGraph in g.isSubGraphOf.items(): + s = """%s -> %s [color="blue"];\n"""%(fromNode,toGraph) + toGraphs.add(toGraph) + out.write(s) + + for toGraph in toGraphs: + s = """%s [label="%s" color="blue" fillcolor="blue" style="filled"];\n"""%(str(toGraph).replace(".","_"),toGraph) + + out.write(s) + + + + + if not partOfGraph is None: for nodeID,graphList in partOfGraph.items(): #fromNode=g.id2label.get(nodeID).replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_").replace("-","_") fromNode=nodeID - for graph in graphList: + for graph in [graphList]: try: s = """G_%s -> %s [color="yellow"];\n"""%(graph.replace(".","_"),fromNode) out.write(s) @@ -327,11 +399,11 @@ out.close() - def exportAsRDF(self,filename,graphName,onlyMs=False,partOfGraph=None,linksToGraph=None): + def exportAsRDF(self,filename,graphName,onlyMs=False,partOfGraph=None,linksToGraph=None,describe=True): out = file(filename,"w") - base="http://ontologies.mpiwg-berlin.mpg.de/reasearch/harriot.owl/1.0/" - ressourceBase="http://entities.mpiwg-berlin.mpg.de/reasearch/harriot.owl/1.0/" + base="http://ontologies.mpiwg-berlin.mpg.de/research/harriot.owl/" + ressourceBase="http://entities.mpiwg-berlin.mpg.de/research/harriot.owl/" type2NodeShape={0: base+"Topic", 1: base+"Topic", 2: base+"Topic", @@ -355,7 +427,10 @@ #name=value.label.replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_") #name=value.numId name=key - label=value.label.replace("\n","") + if value.label==None: + label="EMPTYLABEL3" + else: + label=value.label.replace("\n","") url=value.externalRef if onlyMs: @@ -367,15 +442,16 @@ s="" if label!="": s+="""<%s> "%s".\n"""%(ressourceURI,label.decode("utf-8").replace('"','\"')) - if url!="": - s+="""<%s> <%s> <%s>.\n"""%(ressourceURI,base+"describes",url) + if url!="" and describe: + s+="""<%s> <%s> <%s>.\n"""%(ressourceURI,base+"describes",url.lstrip().rstrip()) print value.nodeType s+="""<%s> <%s>.\n"""%(ressourceURI,type2NodeShape.get(value.nodeType)) #s="""%s [label="%s" URL="%s" %s];\n"""%(name,label.decode("utf-8"),url,type2NodeShape.get(value.nodeType)) out.write(s) except: if label!="": - s="""<%s> "%s".\n"""%(ressourceURI,'CHECK_THIS') + #s="""<%s> "%s".\n"""%(ressourceURI,'CHECK_THIS') + s="""<%s> "%s".\n"""%(ressourceURI,label.encode("utf-8").replace('"','\"')) #s="""<%s> "%s"."""%(ressourceURI,repr(label).replace('"','\"')) if url!="": s+="""<%s> <%s> <%s>."""%(ressourceURI,base+"describes",url) @@ -397,21 +473,36 @@ out.write(s) except: pass + + + toGraphs=set() #sammle alle graphen + for fromNode,toGraph in g.partOfGraph.items(): + s="""<%s> <%s> <%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of_graph",ressourceBase+str(toGraph)) + toGraphs.add(toGraph) + out.write(s) - + for fromNode,toGraph in g.isSubGraphOf.items(): + s="""<%s> <%s> <%s>.\n"""%(ressourceBase+str(fromNode),base+"is_subGraph_of",ressourceBase+str(toGraph)) + toGraphs.add(toGraph) + out.write(s) + + for toGraph in toGraphs: + s="""<%s> <%s>.\n"""%(ressourceBase+str(toGraph),base+"SubGraph") + out.write(s) + if not partOfGraph is None: for nodeID,graphList in partOfGraph.items(): #fromNode=g.id2label.get(nodeID).replace("\n","").replace(" ","_").replace(".","_").replace("(","_").replace(")","_").replace("?","_").replace("'","_").replace(",","_").replace("-","_") fromNode=nodeID - for graph in graphList: + for graph in [graphList]: try: - ressourceURI=ressourceBase+graph - s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of",ressourceURI) + ressourceURI=ressourceBase+graph.replace(" ","_") + s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),base+"is_part_of_mainGraph",ressourceURI) #s = """G_%s -> %s [color="yellow"];\n"""%(graph.replace(".","_"),fromNode) out.write(s) s="""<%s> "%s".\n"""%(ressourceURI,graph) - s+="""<%s> <%s>.\n"""%(ressourceURI,base+"HarriotGraph") + s+="""<%s> <%s>.\n"""%(ressourceURI,base+"Graph") #s = """G_%s [label="%s" color="green" fillcolor="green" style="filled"];\n"""%(graph.replace(".","_"),graph) out.write(s) @@ -428,14 +519,14 @@ gr = splitted[-1] print gr ressourceURI=ressourceBase+gr - + ressourceURI.replace(" ","_") typeSrc=type2NodeShape.get(nodeID) if typeSrc==base+"Topic": - relation="is_specified_in" + relation=base+"is_specified_in" else: - relation="see_also" + relation=base+"see_also" try: s ="""<%s><%s><%s>.\n"""%(ressourceBase+str(fromNode),relation,ressourceURI) @@ -443,7 +534,7 @@ out.write(s) s="""<%s> "%s".\n"""%(ressourceURI,gr) - s+="""<%s> <%s>.\n"""%(ressourceURI,base+"HarriotGraph") + s+="""<%s> <%s>.\n"""%(ressourceURI,base+"Graph") #s = """G_%s [label="%s" color="green" fillcolor="green" style="filled"];\n"""%(gr.replace(".","_"),gr) out.write(s) @@ -451,7 +542,6 @@ pass #out.write("}") - out.close() def readPalette(self,palettePath): typeNr=0 @@ -547,14 +637,20 @@ def merge(graphs,dispensor): filter=['supermap.graphml'] partOfGraph={} + partOfSubGraph={} linksToGraph=set() edges=set() mg =Graph(dispensor) + + for g in graphs: + if g.graphURI in filter: continue idalt2neu={} + + for nodeid in g.id2nodes.keys(): node=g.id2nodes.get(nodeid) label=node.label @@ -569,11 +665,11 @@ if node.internalRef!="": linksToGraph.add((currentID,node.internalRef)) - containedIn = partOfGraph.get(currentID,set()) + #containedIn = partOfGraph.get(currentID,set()) - containedIn.add(g.graphURI) - partOfGraph[currentID]=containedIn + #containedIn.add(g.graphURI) + #partOfGraph[currentID]=containedIn for edge in g.edges: @@ -584,7 +680,33 @@ edge.target=idalt2neu.get(target) edges.add(edge) + graphsOldToNew={} + for nodeID,toGraph in g.partOfGraph.items(): + + node=g.id2nodes.get(nodeID) + label=node.label + currentID =mg.label2Ids.get(label,dispensor.getID()) #hole id wenn existent sonst neue + + #graphID = graphsOldToNew.get(graph,dispensor.getID()) #hole id wenn existent sonst neue + toGraphID = graphsOldToNew.get(toGraph,dispensor.getID()) #hole id wenn existent sonst neue + graphsOldToNew[toGraph]=toGraphID + mg.partOfGraph[currentID]=toGraphID + + + for fromGraph,toGraph in g.isSubGraphOf.items(): + toGraphID = graphsOldToNew.get(toGraph,dispensor.getID()) #hole id wenn existent sonst neue + graphsOldToNew[toGraph]=toGraphID + + fromGraphID = graphsOldToNew.get(fromGraph,dispensor.getID()) #hole id wenn existent sonst neue + graphsOldToNew[fromGraph]=fromGraphID + mg.isSubGraphOf[fromGraphID]=toGraphID + + + startID=graphsOldToNew[g.startGraphId] + partOfGraph[startID]=g.graphURI + mg.edges=edges + #mg.partOfGraph=partOfSubGraph return mg,partOfGraph,linksToGraph if __name__ == '__main__': @@ -595,19 +717,24 @@ - path="/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120523/" + #path="/Users/dwinter/Documents/Projekte/Europeana/-graphml/Maps_20120523/" + path="/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120626/" + #path="/Users/dwinter/Documents/Projekte/Europeana/-graphml/Maps_short/" ls = os.listdir(path) graphs=set() for l in ls: - g1=Graph(dispensor) - g1.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml") + + try: + g1=Graph(dispensor) + g1.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml") - g1.convertGrahml(path+l) - g1.graphURI=l - graphs.add(g1) - + g1.convertGraphml(path+l) + g1.graphURI=l + graphs.add(g1) + except: + logging.error("Can't handle:"+l) g,po,lg = merge(graphs,dispensor) @@ -615,9 +742,9 @@ #print len(g.label2Ids.keys()) #g.readPalette("/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml") - #g.exportAsDot("/tmp/out.dot", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg) + #g.exportAsDot("/tmp/out.dot", "",onlyMs=False,partOfGraph=po,linksToGraph=lg) g.exportAsDot("/tmp/out.dot", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg) - g.exportAsRDF("/tmp/out.rdf", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg) + g.exportAsRDF("/tmp/out.rdf", "harriot",onlyMs=False,partOfGraph=po,linksToGraph=lg,describe=True) out2 = file("/tmp/out.txt","w") for key in g.label2Ids.keys():