changeset 16:de0a06eef13b

new neo4j importer for network visualisation frontend.
author casties
date Fri, 28 Aug 2015 17:24:45 +0200
parents 61767ff5ce2b
children 4dfd832e9cd9
files importFromOpenMind/importer/ismi2neo4j.py
diffstat 1 files changed, 208 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismi2neo4j.py	Fri Aug 28 17:24:45 2015 +0200
@@ -0,0 +1,208 @@
+import urllib.request
+import json
+from neo4jrestclient.client import GraphDatabase, Node
+
+# In[111]:
+# Fixed list of entity type names.  The default run at the end of the script
+# imports ismi_defs instead.
+ismi_types = [
+    "PERSON",
+    "WITNESS",
+    "CODEX",
+    "PLACE",
+    "COLLECTION",
+    "REPOSITORY",
+]
+
+# Base URL of the JSON interface; the query string is appended to it.
+baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"
+
+# URL template for listing all entities of one type (%s = value of "oc").
+entsURL = baseURL + "method=get_ents&oc=%s"
+
+# URL template for fetching one entity including its content (%s = entity id).
+entURL = baseURL + "method=get_ent&id=%s&include_content=True"
+
+
+def readJSON(url):
+    """Fetch *url* and return the parsed JSON document.
+
+    The response body is read completely, decoded as UTF-8 and passed to
+    ``json.loads``.
+
+    Args:
+        url: URL understood by ``urllib.request.urlopen``.
+
+    Returns:
+        The decoded JSON value (dict, list, str, ...).
+
+    Raises:
+        urllib.error.URLError: if the URL cannot be opened.
+        ValueError: if the body is not valid UTF-8 encoded JSON.
+    """
+    # The with-block closes the response even when reading fails; the
+    # original never closed it.
+    with urllib.request.urlopen(url) as response:
+        payload = response.read()
+
+    return json.loads(payload.decode("utf-8"))
+    
+# Ask the JSON interface for all definitions and keep the 'ov' value of each
+# one; this list is what the script imports at the end.
+defs_json = readJSON(baseURL + "method=get_defs")
+
+ismi_defs = [definition['ov'] for definition in defs_json['defs']]
+
+
+# Connection to the local Neo4j REST endpoint.
+# NOTE(review): the credentials are hard-coded in the source -- consider
+# moving them to configuration.
+gdb = GraphDatabase(
+    "http://localhost:7474/db/data/",
+    username="neo4j",
+    password="neo5j",
+)
+
+# ismi_id -> Neo4j node stored by importEnts()
+n4j_nodes = {}
+# relation id -> relation dict collected by relsFromEnt()
+ismi_relations = {}
+# relation id -> Neo4j relationship created by n4jrelationsFromRels()
+n4j_relations = {}
+
+# When True, importEnts() first looks for an existing node via getNode().
+keep_nodes = False
+
+# Attribute names that nodeFromEnt() never copies onto a node.
+ent_exclude_attrs = ['lw', 'node_type', 'nov']
+
+
+def getNode(ismi_id=None):
+    """Look up an existing Neo4j node by its ``ismi_id`` property.
+
+    Args:
+        ismi_id: id of the ISMI entity; ``None`` skips the query.
+
+    Returns:
+        The first result of the Cypher query, or ``None`` when no id was
+        given or nothing matched.
+    """
+    if ismi_id is None:
+        return None
+
+    # Bug fix: the query used to be formatted with the literal 40635, so
+    # every lookup returned the same node regardless of the argument.
+    # The id is interpolated unquoted, as before -- assumes numeric ids.
+    res = gdb.query("match (n {ismi_id: %s}) return n" % ismi_id, returns=Node)
+    if len(res) > 0:
+        # NOTE(review): neo4jrestclient may return rows (lists) here, in which
+        # case the node would be res[0][0] -- confirm against the library.
+        return res[0]
+
+    return None
+
+def nodeFromEnt(ent, etype):
+    """Create a Neo4j node for one ISMI entity and return it.
+
+    Attributes with content type 'text', 'arabic', 'bool' or 'url' are copied
+    to node properties unless their name is listed in ``ent_exclude_attrs``.
+    Attributes of any other content type (including 'date' and 'old') are
+    skipped.  The node additionally gets the properties ``type`` (the
+    entity's 'oc'), ``ismi_id`` (the entity's 'id') and, if the entity has an
+    'ov' value, ``label``.  The labels 'project_ismi' and *etype* are added.
+
+    Args:
+        ent: entity dict with the keys 'atts', 'oc', 'id' and optionally 'ov'.
+        etype: expected entity type; must equal ``ent['oc']``.
+
+    Returns:
+        The created node, or ``None`` if the entity type does not match.
+    """
+    # Check the type first so no work is done for a mismatching entity.
+    oc = ent['oc']
+    if oc != etype:
+        print("ERROR: entity type doesn't match!")
+        # Bug fix: this was `return null`, which raises NameError in Python.
+        return None
+
+    attrs = {}
+    for att in ent['atts']:
+        # Only plain-valued content types are kept; dates are not handled yet.
+        if att.get('content_type', None) not in ('text', 'arabic', 'bool', 'url'):
+            continue
+
+        key = att['name']
+        if key in ent_exclude_attrs:
+            continue
+
+        attrs[key] = att['ov']
+
+    # Base attributes are set last and therefore override equally named
+    # entity attributes.
+    attrs['type'] = oc
+    # the entity id is stored under the name ismi_id
+    attrs['ismi_id'] = ent['id']
+
+    ov = ent.get('ov', None)
+    if ov is not None:
+        # save ov as label
+        attrs['label'] = ov
+
+    # create node with attributes
+    node = gdb.nodes.create(**attrs)
+    # add labels
+    node.labels.add(['project_ismi', etype])
+    return node
+
+
+# In[77]:
+
+def relsFromEnt(ent, relations):
+    """Collect the relations of one entity into *relations*.
+
+    Every entry of the entity's 'src_rels' and 'tar_rels' lists is stored
+    under its 'id'.  If a different relation is already stored under the same
+    id, an error is printed and the stored one is kept.
+
+    Returns the (modified) *relations* dict.
+    """
+    outgoing = ent.get('src_rels', [])
+    incoming = ent.get('tar_rels', [])
+    for relation in outgoing + incoming:
+        key = relation['id']
+        if key in relations and relation != relations[key]:
+            print("ERROR: relation is different: %s != %s"%(repr(relation), repr(relations[key])))
+        else:
+            relations[key] = relation
+
+    return relations
+
+
+# In[110]:
+
+def n4jrelationsFromRels(rels, nodes):
+    """Create a Neo4j relationship for every relation in *rels*.
+
+    *rels* maps relation ids to relation dicts with the keys 'id', 'name',
+    'src_id' and 'tar_id'; *nodes* maps entity ids to Neo4j nodes.  Relations
+    whose source or target node is not in *nodes* are reported and skipped.
+    Created relationships are stored in the module-level ``n4j_relations``
+    dict, which is also the return value.  Progress is printed every 100
+    relations.
+    """
+    print("importing %s relations"%len(rels))
+    for count, rel in enumerate(rels.values(), start=1):
+        if count % 100 == 0:
+            print(" %s relations"%count)
+
+        rel_id, rel_name = rel['id'], rel['name']
+        src_id, tar_id = rel['src_id'], rel['tar_id']
+
+        source_node = nodes.get(src_id, None)
+        if source_node is None:
+            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
+            continue
+
+        target_node = nodes.get(tar_id, None)
+        if target_node is None:
+            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
+            continue
+
+        n4j_relations[rel_id] = gdb.relationships.create(source_node, rel_name, target_node)
+
+    return n4j_relations
+
+
+# In[114]:
+
+def importEnts(etype):
+    """Import all entities of type *etype* as Neo4j nodes.
+
+    The entity list is read from ``entsURL``; for every entry the full
+    entity is fetched from ``entURL`` and turned into a node, or an existing
+    node is reused when ``keep_nodes`` is set and ``getNode`` finds one.
+    Nodes are recorded in ``n4j_nodes`` and the entity's relations are
+    collected in ``ismi_relations``.  Progress is printed every 100 entities.
+
+    If an entity id is already present in ``n4j_nodes`` an error is printed
+    and the import of this type stops.
+
+    Args:
+        etype: entity type name, inserted into the ``oc`` URL parameter.
+    """
+    # Named ents_json so the imported json module is not shadowed.
+    ents_json = readJSON(entsURL%etype)
+    ents = ents_json['ents']
+    print("importing %s %ss"%(len(ents),etype))
+    for cnt, ent in enumerate(ents, start=1):
+        if cnt % 100 == 0:
+            print(" %s %ss"%(cnt, etype))
+
+        # extract ismi id
+        ismi_id = ent['id']
+
+        # Check for duplicates before fetching and creating anything; the
+        # original checked afterwards and left a stray node in the database.
+        if ismi_id in n4j_nodes:
+            print("ERROR: entity with id=%s exists!"%ismi_id)
+            return
+
+        # fetch full data for entity
+        ent_data = readJSON(entURL%ismi_id)['ent']
+
+        # reuse an existing neo4j node if requested, otherwise create one
+        node = getNode(ismi_id) if keep_nodes else None
+        if node is None:
+            node = nodeFromEnt(ent_data, etype)
+
+        # save node reference
+        n4j_nodes[ismi_id] = node
+
+        # extract relations
+        relsFromEnt(ent_data, ismi_relations)
+
+
+# In[119]:
+
+def importAllEnts(etypes):
+    """Import the entities of every type in *etypes*, then their relations.
+
+    Relationships are created only after all types have been imported, from
+    the relations collected in ``ismi_relations`` and the nodes recorded in
+    ``n4j_nodes``.
+    """
+    for entity_type in etypes:
+        importEnts(entity_type)
+
+    n4jrelationsFromRels(ismi_relations, n4j_nodes)
+
+
+# In[120]:
+
+# Alternative run: import only the fixed list of types.
+#importAllEnts(ismi_types)
+# Default run: import every type name returned by the get_defs request.
+importAllEnts(ismi_defs)