diff importFromOpenMind/importer/model2model.py @ 25:5bdcb5805d29

updated openmind-networkx-neo4j conversion with dates, locations and links.
author casties
date Thu, 24 Sep 2015 18:17:41 +0200
parents 97f2da68fb5f
children 248bf8d1e2e7
line wrap: on
line diff
--- a/importFromOpenMind/importer/model2model.py	Wed Sep 23 19:47:02 2015 +0200
+++ b/importFromOpenMind/importer/model2model.py	Thu Sep 24 18:17:41 2015 +0200
@@ -1,5 +1,6 @@
 import networkx as nx
 import sys
+import csv
 
 ## configure behaviour
 
@@ -8,13 +9,24 @@
 output_fn = 'ismi_graph_mod.gpickle'
 
 # operations
-ops = ['contract', 'inv_rels']
+ops = ['locate', 'contract', 'inv_rels', 'add_links']
+
+# types of object to locate
+locate_objects_of_type = ['PLACE']
 
-# add relations to these objects as attributes with the relations name
-contract_relations_into_attributes = {'PLACE': ['label'],
+# file with place location information
+places_fn = 'ismi_places_loc.csv'
+
+# node types to remove from the graph
+#remove_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
+
+# add relations to these objects as attributes with the relation's name
+contract_relations_into_attributes = {'PLACE': ['label', 'latitude', 'longitude'],
                                       'ALIAS': ['label']}
 
 
+# add URLs to nodes: the value of the named attribute is inserted into the URL pattern
+add_link_attributes = {'ismi_id': 'https://ismi-dev.mpiwg-berlin.mpg.de/drupal-ismi/entity/%s'}
 
 
 def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
@@ -42,11 +54,158 @@
     return name
 
 
+def locatePlaces(nx_graph):
+    """add location information to objects in the graph"""
+    
+    print("Adding location information from %s to %s."%(places_fn, locate_objects_of_type))
+    cnt = 0
+    
+    # read place location file
+    locations = {}
+    with open(places_fn, encoding='utf-8') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            lat = row['Latitude']
+            lon = row['Longitude']
+            name = row['Address']
+            if lat and lon:
+                locations[name] = {'latitude': lat, 'longitude': lon}
+
+    # iterate all nodes
+    for n in nx.nodes_iter(nx_graph):
+        attrs = nx_graph.node[n]
+        if attrs['type'] in locate_objects_of_type:
+            # locatable object
+            name = attrs['label']
+            if name in locations:
+                # place name match
+                location = locations[name]
+                attrs['latitude'] = location['latitude']
+                attrs['longitude'] = location['longitude']
+                
+            else:
+                print("WARNING: no location for name '%s'"%name)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s nodes"%cnt)
+
+    
+
+def genAttName(attrs, name):
+    """Generate new attribute name.
+    """
+    while attrs.get(name, None) is not None:
+        # attribute already exists: derive another name and check again
+        if name[-1].isnumeric(): # increment last digit
+            name = name[:-1] + str(int(name[-1]) + 1)
+        else:
+            name += '2'
+    
+    return name
+
+
 def contractRelations(nx_graph):
     """contract relations into attributes"""
     
+    print("Contracting relations to attributes.")
+    cnt = 0
+    for nx_edge in nx.edges_iter(nx_graph):
+        (nx_src, nx_tar) = nx_edge
+        # get attributes of edge
+        rel_attrs = nx_graph.edge[nx_src][nx_tar][0]
+        rel_type = rel_attrs['type']
+        # get attributes of source and target nodes
+        src_attrs = nx_graph.node[nx_src]
+        tar_attrs = nx_graph.node[nx_tar]
+        
+        # contract into the source node: copy attributes of a contractable target
+        tar_type = tar_attrs['type']
+        if tar_type in contract_relations_into_attributes:
+            # get list of attributes to transfer
+            transfer_atts = contract_relations_into_attributes[tar_type]
+            for transfer_att in transfer_atts:
+                if transfer_att not in tar_attrs:
+                    # target has no attribute
+                    continue
+                
+                # name for new attribute starts with relation name
+                att_name = fixName(rel_type, att_from_rel=True)
+                # then attribute name
+                if transfer_att != 'label':
+                    att_name += "_%s"%transfer_att
+                    
+                # then generate unique name
+                att_name = genAttName(src_attrs, att_name)
+                # add target node's attribute
+                src_attrs[att_name] = tar_attrs.get(transfer_att)
+            
+        # contract into the target node: copy attributes of a contractable source
+        src_type = src_attrs['type']
+        if src_type in contract_relations_into_attributes:
+            # get list of attributes to transfer
+            transfer_atts = contract_relations_into_attributes[src_type]
+            for transfer_att in transfer_atts:
+                if transfer_att not in src_attrs:
+                    # source has no attribute
+                    continue
+                
+                # name for new attribute starts with relation name
+                att_name = fixName(rel_type, att_from_rel=True)
+                # then attribute name
+                if transfer_att != 'label':
+                    att_name += "_%s"%transfer_att
+                    
+                # then generate unique name
+                att_name = genAttName(tar_attrs, att_name)
+                # add source node's attribute to the target node
+                tar_attrs[att_name] = src_attrs.get(transfer_att)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s relations"%cnt)
+
+
+def invertRelations(nx_graph):
+    """Add inverse relations to each relation"""
     
+    print("Adding inverse relations.")
+    # copy list of edges because we add edges in the loop
+    edges = nx.edges(nx_graph)[:]
+    # iterate list
+    cnt = 0
+    for nx_edge in edges:
+        (nx_src, nx_tar) = nx_edge
+        # get attributes of edge
+        rel_attrs = nx_graph.edge[nx_src][nx_tar][0]
+        rel_type = rel_attrs['type']
+        rel_id = rel_attrs['ismi_id']
+        # create new relation
+        nx_graph.add_edge(nx_tar, nx_src, type=fixName(rel_type, is_tar_rel=True), ismi_id=-rel_id)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s relations"%cnt)
+        
     
+def addLinks(nx_graph):
+    """Add link attributes to all nodes."""
+    
+    print("Adding links: %s"%repr(add_link_attributes))
+    cnt = 0
+    for link_att, link_pattern in add_link_attributes.items():
+        # iterate all nodes
+        for n in nx.nodes_iter(nx_graph):
+            attrs = nx_graph.node[n]
+            if link_att in attrs:
+                url = link_pattern%attrs[link_att]
+                # TODO: every pattern writes to 'link'; choose distinct target attributes if several patterns are configured
+                attrs['link'] = url
+
+            cnt += 1
+            if cnt % 100 == 0:
+                print("  %s nodes"%cnt)
+        
     
 ## main
 
@@ -65,11 +224,19 @@
 
 # operate    
 for op in ops:
-    if op == 'contract':
+    if op == 'locate':
+        locatePlaces(nx_graph)
+        
+    elif op == 'contract':
         contractRelations(nx_graph)
         
     elif op == 'inv_rels':
         invertRelations(nx_graph)
 
+    elif op == 'add_links':
+        addLinks(nx_graph)
+
+print("Writing graph to %s"%output_fn)
+nx_graph = nx.write_gpickle(nx_graph, output_fn)
 
 print("Done.")