diff importFromOpenMind/importer/ismi2neo4j.py @ 19:ca1e02a2a9c4

unfilteredIsmi: OpenMind-to-JSON exporter like filterISMI. ismi2model: OpenMind importer like ismi2neo4j that saves a networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents 0827156df210
children a9bfd49355f8
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2neo4j.py	Mon Sep 07 16:57:10 2015 +0200
+++ b/importFromOpenMind/importer/ismi2neo4j.py	Wed Sep 09 17:32:42 2015 +0200
@@ -10,11 +10,8 @@
 # add relations to these objects as attributes with the relations name
 contract_relations_into_attributes = ['PLACE', 'ALIAS']
 
-# try to find and re-use existing nodes in neo4j (slow!)
-keep_nodes = False
-
 # label added to all nodes
-project_label = '_ismi2'
+project_label = '_ismi3'
 
 # OpenMind base URL
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
@@ -25,10 +22,13 @@
 
 entsURL=baseURL+"method=get_ents&oc=%s"
 
+entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"
+
 entURL=baseURL+"method=get_ent&id=%s&include_content=True"
 
 
 def readJSON(url):
+    #print("JSON loading %s"%url)
     wsh=urllib.request.urlopen(url)
     txt = wsh.read()
     return json.loads(txt.decode("utf-8"))
@@ -243,7 +243,7 @@
                     
                 # add target node's label as attribute
                 #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
-                src.set(att_name, src.get('label'))
+                tar.set(att_name, src.get('label'))
         
         if add_inverse_relations:
             n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
@@ -264,36 +264,36 @@
     json = readJSON(entsURL%etype)
     ents = json['ents']
     print("importing %s %ss"%(len(ents),etype))
+    size = 100
+    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
     cnt = 0
-    for ent in ents:
-        cnt += 1
+    for batch in batches:
+        cnt += size
         if cnt % 100 == 0:
             print(" %s %ss"%(cnt, etype))
             
-        # extract ismi id
-        ismi_id = ent['id']
-        
-        node = None
+        # extract list of ismi ids
+        ismi_ids = [str(ent['id']) for ent in batch]
         
-        # fetch full data for entity
-        ent_json = readJSON(entURL%ismi_id)
-        ent_data = ent_json['ent']
-        # create neo4j node
-        if keep_nodes:
-            node = getNode(ismi_id)
+        # fetch full data for list of entities
+        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
+        ents_data = ent_json['ents']
         
-        if ismi_id in n4j_nodes:
-            print("ERROR: entity with id=%s exists!"%ismi_id)
-            return
-        
-        if node is None:
+        # iterate through results batch
+        for ent_data in ents_data:
+            ismi_id = ent_data['id']
+            if ismi_id in n4j_nodes:
+                print("ERROR: entity with id=%s exists!"%ismi_id)
+                return
+            
+            # create neo4j node
             node = nodeFromEnt(ent_data, etype)
-        
-        # save node reference
-        n4j_nodes[ismi_id] = node
-        
-        # extract relations
-        relsFromEnt(ent_data, ismi_relations)
+            
+            # save node reference
+            n4j_nodes[ismi_id] = node
+            
+            # extract relations
+            relsFromEnt(ent_data, ismi_relations)
         
         #if cnt >= 100:
         #    return