comparison graphML2RDF.py @ 0:8190d724dc01

first release
author dwinter
date Mon, 23 Jul 2012 09:48:23 +0200
parents
children e661aabed2f9
comparison
equal deleted inserted replaced
-1:000000000000 0:8190d724dc01
# This tool converts GraphML files created with yEd into a graph, written either in DOT format or as RDF.
# TODO: lots of ... the mapping from graph type to ontology should be configurable.
3 from lxml import etree
4 import os.path
5 import os
6
7
# XML namespace prefixes used in every XPath query of this module.
namespaces = {
    'graphML': 'http://graphml.graphdrawing.org/xmlns',
    'y': 'http://www.yworks.com/xml/graphml',
}
11
# All essential information about a single node.
class Node:
    """Node of a graph, identified by the numeric id given to the constructor."""

    # Id of the node; should be unique across all nodes.
    numId = 0
    # Id of the node inside one graph; only unique per graph.
    internalID = ""
    # Label in yEd. Also used to recognise identical nodes when several
    # graphs are merged, so labels must be kept consistent when drawing.
    label = ""
    # Reference to an external web page (key="d4").
    externalRef = ""
    # Reference to another graph (key="d4"); external and internal references
    # are told apart by whether the path starts with http.
    internalRef = ""
    # Type number of the node; -1 is the default.
    nodeType = -1

    def __init__(self, numId):
        """Store numId as the id of the new node."""
        self.numId = numId
25
# Edge
class Edge:
    """Directed connection from the node id src to the node id target."""

    edgeType = -1  # type number of the edge; -1 is the default
    target = None
    src = None

    def __init__(self, src, target):
        """Remember both end points of the edge."""
        self.target = target
        self.src = src
35
36
# Generates ids for the nodes.
class IDDispensor:
    """Hands out consecutive integer ids, starting with 1."""

    currentID = 0  # last id handed out

    def getID(self):
        """Advance the counter by one and return the new value."""
        nextID = self.currentID + 1
        self.currentID = nextID
        return nextID
43
# The graph itself: nodes, edges and the style palette used to classify them.
class Graph:
    """Graph built from yEd GraphML files.

    Nodes are kept by numeric id in ``id2nodes`` and edges in ``edges``.
    Node and edge types are assigned by comparing the yEd styling of each
    element with the palette loaded through ``readPalette``.
    """

    graphURI = ""  # name of the source graph; set by the caller

    def __init__(self, dispensor):
        """Create an empty graph.

        dispensor -- object whose ``getID()`` returns a fresh node id.
        """
        self.dispensor = dispensor
        # All containers are created per instance. As class attributes they
        # would be one shared object for every Graph that exists.
        self.label2Ids = {}           # label -> node id
        self.id2nodes = {}            # node id -> node
        self.internalId2nodesID = {}  # GraphML-internal id -> node id
        self.edges = set()            # set of edges
        self.id2label = {}            # node id -> label
        self.edgeStyles = {}          # edge type -> (line style, arrows)
        self.edgeDescription = {}     # edge type -> description text
        self.nodeStyles = {}          # node type -> (style, fill, border, config)
        self.nodeDescription = {}     # node type -> description text

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _asText(value):
        """Return value as text: None becomes "", byte strings are decoded as UTF-8."""
        if value is None:
            return ""
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return value

    @staticmethod
    def _escapeQuoted(text):
        """Backslash-escape backslashes and double quotes for use inside "..."."""
        return text.replace("\\", "\\\\").replace('"', '\\"')

    @staticmethod
    def _writeOnce(out, text, written):
        """Write text to out unless the identical text was written before."""
        if text not in written:
            written.add(text)
            out.write(text)

    @staticmethod
    def _lastAttrib(parent, path):
        """Return the attributes of the last element matching path, or None."""
        found = parent.xpath(path, namespaces=namespaces)
        if found:
            return found[-1].attrib
        return None

    def _readStyleTuples(self, node, dataKey):
        """Return the (style, fill, border, config) tuples found on node.

        dataKey is the GraphML data key that holds the node graphics.
        Generic nodes (style "g", config = configuration attribute) are
        listed first, then shape nodes (style "s", config = shape type).
        """
        found = []
        genericPath = './graphML:data[@key="%s"]/y:GenericNode' % dataKey
        for gn in node.xpath(genericPath, namespaces=namespaces):
            found.append(("g",
                          self._lastAttrib(gn, './y:Fill'),
                          self._lastAttrib(gn, './y:BorderStyle'),
                          gn.attrib.get("configuration")))

        shapePath = './graphML:data[@key="%s"]/y:ShapeNode' % dataKey
        for sn in node.xpath(shapePath, namespaces=namespaces):
            shapeType = None
            for shape in sn.xpath('./y:Shape', namespaces=namespaces):
                shapeType = shape.attrib.get("type")
            found.append(("s",
                          self._lastAttrib(sn, './y:Fill'),
                          self._lastAttrib(sn, './y:BorderStyle'),
                          shapeType))
        return found

    # ------------------------------------------------------------------
    # reading
    # ------------------------------------------------------------------
    def convertGrahml(self, filename):
        """Convert a GraphML file into a network of nodes and edges.

        Layout is mapped to node and edge types by comparing it with the
        palette, so ``readPalette`` has to be called first to get types
        other than -1.
        """
        tree = etree.parse(filename)

        for node in tree.xpath("//graphML:node", namespaces=namespaces):
            internalID = self._asText(node.get("id"))

            # The last NodeLabel wins; a node without one keeps label None.
            labelString = None
            for label in node.xpath(".//y:NodeLabel", namespaces=namespaces):
                labelString = self._asText(label.text).strip()

            newNode = Node(self.dispensor.getID())
            newNode.label = labelString
            newNode.internalID = internalID

            for nodeRef in node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces):
                nodeRefString = nodeRef.text
                if nodeRefString is None:
                    continue
                # Hack kept from the original author: apparently some wrong
                # URLs are in the data, so skip references that contain
                # "http" anywhere but at the very start.
                if nodeRefString.find("http") > 0:
                    continue
                if (nodeRefString.lstrip().startswith("http:")
                        or nodeRefString.startswith(("https:", "ftp:"))):
                    newNode.externalRef = nodeRefString
                else:
                    newNode.internalRef = nodeRefString

            # Read styles; when several style elements exist the last decides.
            typeID = None
            for style, fs, bs, config in self._readStyleTuples(node, "d6"):
                typeID = self.getNodeTypeFromPalette(style, fs, bs, config)
            if typeID is None:
                typeID = -1
            newNode.nodeType = typeID

            self.id2nodes[newNode.numId] = newNode
            if labelString is not None:
                self.label2Ids[labelString] = newNode.numId
            if internalID:
                self.internalId2nodesID[internalID] = newNode.numId

        for edge in tree.xpath("//graphML:edge", namespaces=namespaces):
            source = self._asText(edge.get("source"))
            target = self._asText(edge.get("target"))

            ls = self._lastAttrib(edge, './graphML:data[@key="d10"]/.//y:LineStyle')
            ars = self._lastAttrib(edge, './graphML:data[@key="d10"]/.//y:Arrows')

            # End points that are not known nodes end up as None.
            newEdge = Edge(self.internalId2nodesID.get(source),
                           self.internalId2nodesID.get(target))
            newEdge.edgeType = self.getTypeFromPalette(ls, ars)
            self.edges.add(newEdge)

    def readPalette(self, palettePath):
        """Load the edge and node styles of a palette GraphML file.

        Edges and node styles are numbered from 0 in document order; these
        numbers are the type ids returned by ``getTypeFromPalette`` and
        ``getNodeTypeFromPalette``.
        """
        palette = etree.parse(palettePath)

        typeNr = 0
        for edge in palette.xpath("//graphML:edge", namespaces=namespaces):
            # Relevant for the classification: data key d12, line style and arrows.
            ls = self._lastAttrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:LineStyle')
            ars = self._lastAttrib(edge, './graphML:data[@key="d12"]/y:GenericEdge/y:Arrows')

            ds = ""
            for description in edge.xpath('./graphML:data[@key="d9"]', namespaces=namespaces):
                ds = description.text

            self.edgeDescription[typeNr] = ds
            # A missing style part is stored as None; cmpDict never matches
            # None, so such an entry keeps its number but is never selected.
            self.edgeStyles[typeNr] = (ls, ars)
            typeNr += 1

        typeNr = 0
        for node in palette.xpath("//graphML:node", namespaces=namespaces):
            for styleTuple in self._readStyleTuples(node, "d7"):
                ds = ""
                for description in node.xpath('./graphML:data[@key="d4"]', namespaces=namespaces):
                    ds = description.text

                self.nodeDescription[typeNr] = ds
                self.nodeStyles[typeNr] = styleTuple
                typeNr += 1

    # ------------------------------------------------------------------
    # palette lookup
    # ------------------------------------------------------------------
    def getNodeTypeFromPalette(self, style, fs, bs, config):
        """Return the type id of the first matching palette node style, or -1."""
        for key, (styleVorlage, fsVorlage, bsVorlage, configVorlage) in self.nodeStyles.items():
            if style != styleVorlage or config != configVorlage:
                continue
            if self.cmpDict(fs, fsVorlage) and self.cmpDict(bs, bsVorlage):
                return key
        return -1

    def getTypeFromPalette(self, ls, ars):
        """Return the type id of the first matching palette edge style, or -1."""
        for key, (lsVorlage, arsVorlage) in self.edgeStyles.items():
            if self.cmpDict(ls, lsVorlage) and self.cmpDict(ars, arsVorlage):
                return key
        return -1

    def cmpDict(self, x, y):
        """Check that every key of x has the same value in y.

        Keys that only exist in y are ignored. Returns False when either
        argument is None.
        """
        if x is None or y is None:
            return False
        for key in x.keys():
            if y.get(key, None) != x.get(key):
                return False
        return True

    # ------------------------------------------------------------------
    # writing
    # ------------------------------------------------------------------
    def exportAsDot(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to filename in DOT format (UTF-8).

        graphName    -- name used in the ``digraph`` header.
        onlyMs       -- if true, only nodes whose label starts with "Add"
                        are written.
        partOfGraph  -- optional mapping node id -> iterable of graph names;
                        adds one yellow edge from each graph to the node.
        linksToGraph -- optional iterable of (node id, graph path) pairs;
                        adds a green edge from the node to the graph named by
                        the last path component.
        """
        import codecs

        type2NodeShape = {0: 'style="solid" color="blue"',
                          1: 'style="solid" color="lightblue"',
                          2: 'style="solid" color="blue"',
                          3: 'style="tapered" color="orange"',
                          4: 'style="solid" color="green"',
                          5: 'style="solid" color="sienna"',
                          6: 'style="solid" color="magenta"',
                          -1: 'style="dotted" color="red"'}

        type2EdgeShape = {0: 'style="dotted" color="blue"',
                          1: 'style="solid"',
                          2: 'style="bold"',
                          3: 'style="tapered"',
                          4: 'style="solid" color="green"',
                          5: 'style="solid" color="sienna"',
                          6: 'style="solid" color="magenta"',
                          -1: 'style="dotted" color="red"'}

        graphDecl = 'G_%s [label="%s" color="green" fillcolor="green" style="filled"];\n'
        declared = set()

        with codecs.open(filename, "w", encoding="utf-8") as out:
            out.write("""digraph %s {""" % graphName)

            for key, value in self.id2nodes.items():
                label = self._asText(value.label).replace("\n", "")
                if onlyMs and not label.lstrip().startswith("Add"):
                    continue
                # Types missing from the table are drawn like the unknown type.
                shape = type2NodeShape.get(value.nodeType, type2NodeShape[-1])
                out.write('%s [label="%s" URL="%s" %s];\n'
                          % (key, self._escapeQuoted(label),
                             self._asText(value.externalRef), shape))

            for edge in self.edges:
                shape = type2EdgeShape.get(edge.edgeType, type2EdgeShape[-1])
                out.write('%s -> %s [%s];\n' % (edge.src, edge.target, shape))

            if partOfGraph is not None:
                for nodeID, graphList in partOfGraph.items():
                    for graph in graphList:
                        graph = self._asText(graph)
                        graphNode = graph.replace(".", "_")
                        out.write('G_%s -> %s [color="yellow"];\n' % (graphNode, nodeID))
                        self._writeOnce(out, graphDecl % (graphNode, self._escapeQuoted(graph)), declared)

            if linksToGraph is not None:
                for nodeID, graph in linksToGraph:
                    gr = self._asText(graph).split("/")[-1]
                    graphNode = gr.replace(".", "_")
                    out.write('%s -> G_%s [color="green"];\n' % (nodeID, graphNode))
                    self._writeOnce(out, graphDecl % (graphNode, self._escapeQuoted(gr)), declared)

            out.write("}")

    def exportAsRDF(self, filename, graphName, onlyMs=False, partOfGraph=None, linksToGraph=None):
        """Write the graph to filename as one triple per line (UTF-8).

        graphName    -- unused; kept so the signature matches exportAsDot.
        onlyMs       -- if true, only nodes whose label starts with "Add"
                        are written.
        partOfGraph  -- optional mapping node id -> iterable of graph names;
                        adds an ``is_part_of`` triple per pair.
        linksToGraph -- optional iterable of (node id, graph path) pairs;
                        adds ``is_specified_in`` for Topic nodes and
                        ``see_also`` for all others.
        """
        import codecs

        base = "http://example.org/harriotOnt/"
        rdfsLabel = "http://www.w3.org/2000/01/rdf-schema#label"
        rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

        type2NodeShape = {0: base + "Topic",
                          1: base + "Topic",
                          2: base + "Topic",
                          3: base + "FolioPage",
                          4: base + "4",
                          5: base + "RelatedFolioPage",
                          -1: base + "UNKNOWN"}

        type2EdgeShape = {0: base + "has_prev_by_pagination",
                          1: base + "has_conjectural_relation",
                          2: base + "has_prev_by_conjection",
                          3: base + "has_prev_by_conjection",
                          4: base + "result_used_from",
                          5: base + "result_used_from",
                          -1: base + "is_related_to"}

        labelTriple = '<%s> <' + rdfsLabel + '> "%s".\n'
        typeTriple = '<%s> <' + rdfType + '><%s>.\n'
        declared = set()

        def graphDeclaration(uri, name):
            # Label and type triples describing one source graph.
            return (labelTriple % (uri, self._escapeQuoted(name))
                    + typeTriple % (uri, base + "HarriotGraph"))

        with codecs.open(filename, "w", encoding="utf-8") as out:
            for key, value in self.id2nodes.items():
                label = self._asText(value.label).replace("\n", "")
                url = self._asText(value.externalRef)
                if onlyMs and not label.lstrip().startswith("Add"):
                    continue

                ressourceURI = base + str(key)
                s = ""
                if label != "":
                    s += labelTriple % (ressourceURI, self._escapeQuoted(label))
                if url != "":
                    s += '<%s> <%s> <%s>.\n' % (ressourceURI, base + "describes", url)
                # Types missing from the table are reported as UNKNOWN.
                s += typeTriple % (ressourceURI,
                                   type2NodeShape.get(value.nodeType, type2NodeShape[-1]))
                out.write(s)

            for edge in self.edges:
                relation = type2EdgeShape.get(edge.edgeType, type2EdgeShape[-1])
                out.write('<%s><%s><%s>.\n'
                          % (base + str(edge.src), relation, base + str(edge.target)))

            if partOfGraph is not None:
                for nodeID, graphList in partOfGraph.items():
                    for graph in graphList:
                        graph = self._asText(graph)
                        ressourceURI = base + graph
                        out.write('<%s><%s><%s>.\n'
                                  % (base + str(nodeID), base + "is_part_of", ressourceURI))
                        self._writeOnce(out, graphDeclaration(ressourceURI, graph), declared)

            if linksToGraph is not None:
                for nodeID, graph in linksToGraph:
                    gr = self._asText(graph).split("/")[-1]
                    ressourceURI = base + gr

                    # The relation depends on the type of the linking node,
                    # so look the node up first and then map its type.
                    srcNode = self.id2nodes.get(nodeID)
                    srcType = -1 if srcNode is None else srcNode.nodeType
                    if type2NodeShape.get(srcType) == base + "Topic":
                        relation = "is_specified_in"
                    else:
                        relation = "see_also"

                    out.write('<%s><%s><%s>.\n'
                              % (base + str(nodeID), base + relation, ressourceURI))
                    self._writeOnce(out, graphDeclaration(ressourceURI, gr), declared)
545
def merge(graphs, dispensor, excluded=('supermap.graphml',)):
    """Merge several graphs into one, unifying nodes that share a label.

    graphs    -- iterable of Graph objects; their ``graphURI`` names them.
    dispensor -- id source for the merged graph (``getID()``).
    excluded  -- graph names (``graphURI`` values) that are skipped.

    Returns ``(mergedGraph, partOfGraph, linksToGraph)``:
    partOfGraph maps each merged node id to the set of graph names the node
    occurs in; linksToGraph is a set of (merged node id, internalRef) pairs
    for nodes that reference another graph.

    The input graphs are left untouched: edges are copied before their end
    points are rewritten to merged ids, so merging twice gives the same
    result.
    """
    import copy

    partOfGraph = {}
    linksToGraph = set()
    edges = set()
    mg = Graph(dispensor)

    for g in graphs:
        if g.graphURI in excluded:
            continue

        idalt2neu = {}  # node id in g -> node id in the merged graph
        for node in g.id2nodes.values():
            label = node.label
            # Take the existing id for a known label; only ask the dispensor
            # for a new one when the label has not been seen yet.
            if label in mg.label2Ids:
                currentID = mg.label2Ids[label]
            else:
                currentID = dispensor.getID()
                mg.label2Ids[label] = currentID

            mg.id2label[currentID] = label
            idalt2neu[node.numId] = currentID
            mg.id2nodes[currentID] = node

            if node.internalRef != "":
                linksToGraph.add((currentID, node.internalRef))

            partOfGraph.setdefault(currentID, set()).add(g.graphURI)

        for edge in g.edges:
            mergedEdge = copy.copy(edge)
            mergedEdge.src = idalt2neu.get(edge.src)
            mergedEdge.target = idalt2neu.get(edge.target)
            edges.add(mergedEdge)

    mg.edges = edges
    return mg, partOfGraph, linksToGraph
588
if __name__ == '__main__':
    # Convert every GraphML file of one directory, merge the results and
    # write them as DOT, as RDF and as a plain list of labels.
    import codecs

    palettePath = "/Users/dwinter/Documents/Projekte/Diss - data-mining/eclipseWorkspace/graphML2RDF/examples/Manuscripts_3.graphml"
    path = "/Users/dwinter/Documents/Projekte/Europeana/harriot-graphml/Maps_20120523/"

    dispensor = IDDispensor()

    # A sorted list keeps the merge order, and with it the assigned ids,
    # the same from run to run.
    graphs = []
    for l in sorted(os.listdir(path)):
        # Skip directory entries that are not GraphML files; the XML parser
        # would fail on them.
        if not l.endswith(".graphml"):
            continue

        g1 = Graph(dispensor)
        g1.readPalette(palettePath)
        g1.convertGrahml(os.path.join(path, l))
        g1.graphURI = l
        graphs.append(g1)

    g, po, lg = merge(graphs, dispensor)

    g.exportAsDot("/tmp/out.dot", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)
    g.exportAsRDF("/tmp/out.rdf", "harriot", onlyMs=False, partOfGraph=po, linksToGraph=lg)

    # One quoted label per line. Writing UTF-8 keeps labels with non-ASCII
    # characters instead of dropping them.
    with codecs.open("/tmp/out.txt", "w", encoding="utf-8") as out2:
        for key in g.label2Ids.keys():
            if key is None:
                continue  # node without a label
            if isinstance(key, bytes):
                key = key.decode("utf-8", "replace")
            out2.write('"' + key + '"' + "\n")
628