changeset 25:5bdcb5805d29

updated openmind-networkx-neo4j conversion with dates, locations and links.
author casties
date Thu, 24 Sep 2015 18:17:41 +0200
parents 97f2da68fb5f
children 248bf8d1e2e7
files importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/model2model.py importFromOpenMind/importer/model2neo4j.py
diffstat 3 files changed, 212 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2model.py	Wed Sep 23 19:47:02 2015 +0200
+++ b/importFromOpenMind/importer/ismi2model.py	Thu Sep 24 18:17:41 2015 +0200
@@ -8,13 +8,12 @@
 # output filename
 output_fn = "ismi_graph.gpickle"
 
-# contract relations to these objects into attributes with the relations' name
-#contract_relations_into_attributes = ['PLACE', 'ALIAS']
-contract_relations_into_attributes = []
-
 # OpenMind base URL
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
 
+# node types to exclude from the graph
+exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
+
 
 entsURL=baseURL+"method=get_ents&oc=%s"
 
@@ -111,7 +110,22 @@
             # date attribute
             key = att['name']
             val = att['ov']
-            print("don't know what to do with date: %s=%s"%(key,val))
+            # try to parse date object to get gregorian year
+            try:
+                year = None
+                date_json = json.loads(val)
+                if 'from' in date_json:
+                    year = date_json['from'].get('year', None)
+                elif 'date' in date_json:
+                    year = date_json['date'].get('year', None)
+                else:
+                    print("don't know what to do with date on %s: %s=%s"%(ent['id'],key,val))
+                    
+                if year is not None:
+                    attrs[key] = year
+                    
+            except:
+                print("ERROR: invalid JSON in date: %s"%repr(val))
             
         elif ct == 'old':
             # ignore attribute
@@ -128,6 +142,11 @@
         print("ERROR: entity type doesn't match!")
         return null
             
+    # rename if type attr exists
+    if 'type' in attrs:
+        attrs['type2'] = attrs['type']
+        
+    # set type
     attrs['type'] = fixName(oc)
                 
     ismi_id = ent['id']
@@ -154,6 +173,12 @@
     # go through src_rels and tar_rels
     rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
     for rel in rels:
+        src_type = rel['src_oc']
+        tar_type = rel['tar_oc']
+        if src_type in exclude_objects_of_type or tar_type in exclude_objects_of_type:
+            # skip relation to excluded objects
+            continue
+        
         rel_id = rel['id']
         if rel_id in relations:
             old_rel = relations[rel_id]
@@ -195,41 +220,6 @@
             print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
             continue
         
-        if contract_relations_into_attributes:
-            # contract source relations
-            tar_type = rel['tar_oc']
-            if tar_type in contract_relations_into_attributes:
-                att_name = fixName(rel_name, att_from_rel=True)
-                # TODO: clean up attribute names
-                while src.get(att_name, None) is not None:
-                    # attribute exists
-                    if att_name[-1].isnumeric():
-                        # increment last digit
-                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
-                    else:
-                        att_name += '2'
-                    
-                # add target node's label as attribute
-                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
-                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']
-                
-            # contract target relations
-            src_type = rel['src_oc']
-            if src_type in contract_relations_into_attributes:
-                att_name = fixName(rel_name, att_from_rel=True)
-                # TODO: clean up attribute names
-                while tar.get(att_name, None) is not None:
-                    # attribute exists
-                    if att_name[-1].isnumeric():
-                        # increment last digit
-                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
-                    else:
-                        att_name += '2'
-                    
-                # add target node's label as attribute
-                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
-                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']
-        
         # create relation with type
         nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name), ismi_id=rel_id)
         
@@ -267,7 +257,7 @@
                 print("ERROR: entity with id=%s exists!"%ismi_id)
                 return
             
-            # create neo4j node
+            # create networkx node
             node = nodeFromEnt(ent_data, etype)
             
             # save node reference
@@ -285,6 +275,10 @@
 def importAllEnts(etypes):
     
     for etype in etypes:
+        if etype in exclude_objects_of_type:
+            # skip this type
+            continue
+        
         importEnts(etype)
         
     relationsFromRels(ismi_relations, nx_nodes)
@@ -300,6 +294,9 @@
 
 # import everything
 print("Reading graph from OpenMind at %s"%baseURL)
+if exclude_objects_of_type:  # only announce a non-empty exclusion list
+    print("  Skipping objects of type %s"%exclude_objects_of_type)
+    
 importAllEnts(ismi_defs)
 #importAllEnts(['TEXT'])
 
--- a/importFromOpenMind/importer/model2model.py	Wed Sep 23 19:47:02 2015 +0200
+++ b/importFromOpenMind/importer/model2model.py	Thu Sep 24 18:17:41 2015 +0200
@@ -1,5 +1,6 @@
 import networkx as nx
 import sys
+import csv
 
 ## configure behaviour
 
@@ -8,13 +9,24 @@
 output_fn = 'ismi_graph_mod.gpickle'
 
 # operations
-ops = ['contract', 'inv_rels']
+ops = ['locate', 'contract', 'inv_rels', 'add_links']
+
+# types of object to locate
+locate_objects_of_type = ['PLACE']
 
-# add relations to these objects as attributes with the relations name
-contract_relations_into_attributes = {'PLACE': ['label'],
+# file with place location information
+places_fn = 'ismi_places_loc.csv'
+
+# node types to remove from the graph
+#remove_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
+
+# add relations to these objects as attributes with the relation's name
+contract_relations_into_attributes = {'PLACE': ['label', 'latitude', 'longitude'],
                                       'ALIAS': ['label']}
 
 
+# add URLs to nodes using an attribute in a pattern
+add_link_attributes = {'ismi_id': 'https://ismi-dev.mpiwg-berlin.mpg.de/drupal-ismi/entity/%s'}
 
 
 def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
@@ -42,11 +54,158 @@
     return name
 
 
+def locatePlaces(nx_graph):
+    """Add 'latitude'/'longitude' from the CSV file places_fn to all nodes whose
+    'type' is in locate_objects_of_type, matching CSV 'Address' to node 'label'."""
+    print("Adding location information from %s to %s."%(places_fn, locate_objects_of_type))
+    cnt = 0
+    
+    # read place location file; the csv module wants files opened with newline=''
+    # so that line breaks inside quoted fields are parsed correctly
+    locations = {}
+    with open(places_fn, encoding='utf-8', newline='') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            lat = row['Latitude']
+            lon = row['Longitude']
+            name = row['Address']
+            if lat and lon:
+                locations[name] = {'latitude': lat, 'longitude': lon}
+
+    # iterate all nodes
+    for n in nx.nodes_iter(nx_graph):
+        attrs = nx_graph.node[n]
+        if attrs['type'] in locate_objects_of_type:
+            # locatable object
+            name = attrs['label']
+            if name in locations:
+                # place name match
+                location = locations[name]
+                attrs['latitude'] = location['latitude']
+                attrs['longitude'] = location['longitude']
+            else:
+                print("WARNING: no location for name '%s'"%name)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s nodes"%cnt)
+
+    
+
+def genAttName(attrs, name):
+    """Return name if attrs holds no value for it, else name with a new number suffix."""
+    # count with the whole trailing number: 'x19' is followed by 'x20', not 'x110'
+    stem = name.rstrip('0123456789')
+    num = int(name[len(stem):] or 1)
+    while attrs.get(name, None) is not None:
+        # attribute exists (a key holding None counts as free): try the next number
+        num += 1
+        name = "%s%s"%(stem, num)
+    
+    return name
+
+
 def contractRelations(nx_graph):
     """contract relations into attributes"""
     
+    # For every relation, the attributes configured in
+    # contract_relations_into_attributes for the type of the node at one end
+    # are copied onto the node at the other end (in both directions).
+    # Nodes and relations are kept; only node attributes are added.
+    print("Contracting relations to attributes.")
+    cnt = 0
+    # Ask for the data of every single edge. Looking it up with
+    # nx_graph.edge[src][tar][0] returns the first relation between two nodes
+    # even when the iterator is at a parallel relation of another type.
+    for (nx_src, nx_tar, rel_attrs) in nx_graph.edges_iter(data=True):
+        rel_type = rel_attrs['type']
+        # get attributes of source and target nodes
+        src_attrs = nx_graph.node[nx_src]
+        tar_attrs = nx_graph.node[nx_tar]
+        
+        # contract source relations: attributes of the target go to the source
+        _contractNodeInto(rel_type, tar_attrs, src_attrs)
+        # contract target relations: attributes of the source go to the target
+        _contractNodeInto(rel_type, src_attrs, tar_attrs)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s relations"%cnt)
+
+
+def _contractNodeInto(rel_type, from_attrs, to_attrs):
+    """Copy the configured attributes of one node to a related node.
+
+    rel_type:   type (name) of the relation between the two nodes
+    from_attrs: attribute dict of the node that is contracted
+    to_attrs:   attribute dict of the node that receives the new attributes
+
+    Does nothing if the type of the contracted node is not a key of
+    contract_relations_into_attributes.
+    """
+    from_type = from_attrs['type']
+    if from_type not in contract_relations_into_attributes:
+        return
+
+    # get list of attributes to transfer
+    for transfer_att in contract_relations_into_attributes[from_type]:
+        if transfer_att not in from_attrs:
+            # contracted node has no such attribute
+            continue
+
+        # name for new attribute starts with relation name
+        att_name = fixName(rel_type, att_from_rel=True)
+        # then attribute name
+        if transfer_att != 'label':
+            att_name += "_%s"%transfer_att
+
+        # then generate unique name
+        att_name = genAttName(to_attrs, att_name)
+        # NOTE: the suffix is chosen per attribute name, so e.g. 'xxx2' and
+        # 'xxx_latitude2' need not come from the same node
+        to_attrs[att_name] = from_attrs.get(transfer_att)
+
+
+def invertRelations(nx_graph):
+    """Add an inverse edge (target -> source) for every edge of nx_graph, in place."""
     
+    print("Adding inverse relations.")
+    # Snapshot the edges together with their own data: we add edges in the loop,
+    # and nx_graph.edge[src][tar][0] would return the first relation for every
+    # parallel relation between the same two nodes.
+    edges = list(nx_graph.edges(data=True))
+    # iterate list
+    cnt = 0
+    for (nx_src, nx_tar, rel_attrs) in edges:
+        rel_type = rel_attrs['type']
+        rel_id = rel_attrs['ismi_id']
+        # create new relation in the opposite direction; it is named via
+        # fixName(..., is_tar_rel=True) and gets the negated ismi_id
+        nx_graph.add_edge(nx_tar, nx_src, type=fixName(rel_type, is_tar_rel=True), ismi_id=-rel_id)
+
+        cnt += 1
+        if cnt % 100 == 0:
+            print("  %s relations"%cnt)
+        
     
+def addLinks(nx_graph):
+    """Set a 'link' URL on every node that has an attribute named in add_link_attributes."""
+    
+    print("Adding links: %s"%repr(add_link_attributes))
+    cnt = 0
+    for link_att in add_link_attributes:
+        link_pattern = add_link_attributes[link_att]
+        # visit every node once per configured pattern
+        for n in nx.nodes_iter(nx_graph):
+            cnt += 1
+            attrs = nx_graph.node[n]
+            if link_att in attrs:
+                # TODO: which target attribute for multiple?
+                attrs['link'] = link_pattern%attrs[link_att]
+
+            if cnt % 100 == 0:
+                print("  %s nodes"%cnt)
+        
     
 ## main
 
@@ -65,11 +224,19 @@
 
 # operate    
 for op in ops:
-    if op == 'contract':
+    if op == 'locate':
+        locatePlaces(nx_graph)
+        
+    elif op == 'contract':
         contractRelations(nx_graph)
         
     elif op == 'inv_rels':
         invertRelations(nx_graph)
 
+    elif op == 'add_links':
+        addLinks(nx_graph)
+
+print("Writing graph to %s"%output_fn)
+nx.write_gpickle(nx_graph, output_fn)  # returns None: keep nx_graph bound to the graph
 
 print("Done.")
--- a/importFromOpenMind/importer/model2neo4j.py	Wed Sep 23 19:47:02 2015 +0200
+++ b/importFromOpenMind/importer/model2neo4j.py	Thu Sep 24 18:17:41 2015 +0200
@@ -8,7 +8,7 @@
 input_fn = 'ismi_graph.gpickle'
 
 # label added to all nodes
-project_label = '_ismi4'
+project_label = '_ismi6'
 
 # neo4j base URL
 neo4jBaseURL = "http://localhost:7474/db/data/"
@@ -63,7 +63,7 @@
 
         cnt += 1
         if cnt % 100 == 0:
-            print("  %s"%cnt)
+            print("  %s nodes"%cnt)
 
 
 def copyRelations(nx_graph, n4j_graph):
@@ -97,7 +97,7 @@
 
         cnt += 1
         if cnt % 100 == 0:
-            print("  %s"%cnt)
+            print("  %s relations"%cnt)
 
 
 ## main