changeset 17:4dfd832e9cd9

added automatic creation of inverse relations. added more attribute types.
author casties
date Thu, 03 Sep 2015 18:48:21 +0200
parents de0a06eef13b
children 0827156df210
files importFromOpenMind/importer/ismi2neo4j.py
diffstat 1 files changed, 61 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2neo4j.py	Fri Aug 28 17:24:45 2015 +0200
+++ b/importFromOpenMind/importer/ismi2neo4j.py	Thu Sep 03 18:48:21 2015 +0200
@@ -2,11 +2,24 @@
 import json
 from neo4jrestclient.client import GraphDatabase, Node
 
-# In[111]:
-ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
+## configure behaviour
+
+# add inverse relations as "<relation" (forward relations are then named "relation>")
+add_inverse_relations = True
+
+# try to find and re-use existing nodes in neo4j (slow!)
+keep_nodes = False
 
+# label added to all nodes
+project_label = '_ismi_inv_rel'
+
+# OpenMind base URL
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
 
+# neo4j base URL
+neo4jBaseURL = "http://localhost:7474/db/data/"
+
+
 entsURL=baseURL+"method=get_ents&oc=%s"
 
 entURL=baseURL+"method=get_ent&id=%s&include_content=True"
@@ -19,39 +32,57 @@
     
 defs_json = readJSON(baseURL+"method=get_defs")
 
+# current list of all definitions 
 ismi_defs = [atts['ov'] for atts in defs_json['defs']]
 
+#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
 
-gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j")
+
+gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")
 
 n4j_nodes = {}
 ismi_relations = {}
 n4j_relations = {}
 
-keep_nodes = False
-
 ent_exclude_attrs = [
     'lw',
     'node_type',
     'nov'
 ]
 
+def fixName(name, is_src_rel=False, is_tar_rel=False):
+    # these are too embarrassing...
+    if 'FLORUIT' in name:
+        name = name.replace('FLORUIT', 'FLOURISH')
+        
+    elif 'floruit' in name:
+        name = name.replace('floruit', 'flourish')
+        
+    if is_src_rel:
+        name = name + '>'
+        
+    if is_tar_rel:
+        name = '<' + name
+
+    return name
+
 
 def getNode(ismi_id=None):
     if ismi_id is not None:
-        res = gdb.query("match (n {ismi_id: %s}) return n"%40635, returns=(Node))
+        res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
         if len(res) > 0:
             return res[0]
     
     return None
 
+
 def nodeFromEnt(ent, etype):
     attrs = {}
     # go through all attributes
     for att in ent['atts']:
         ct = att.get('content_type', None)
-        if ct in ['text', 'arabic', 'bool', 'url']:
-            # normal text attribute
+        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
+            # normal text attribute (assume no content_type is text too...)
             key = att['name']
             val = att['ov']
             
@@ -62,6 +93,18 @@
             # keep attribute
             attrs[key] = val
             
+        elif ct == 'num':
+            # number attribute
+            key = att['name']
+            val = att['ov']
+            
+            if key in ent_exclude_attrs:
+                # exclude attribute
+                continue
+
+            # keep attribute, assume num is int
+            attrs[key] = int(val)
+            
         elif ct == 'date':
             # date attribute
             key = att['name']
@@ -73,7 +116,7 @@
             continue
             
         else:
-            #print("WARN: attribute with unknown content_type: %s"%repr(att))
+            print("WARN: attribute with unknown content_type: %s"%repr(att))
             # ignore other content types
             continue
             
@@ -83,7 +126,7 @@
         print("ERROR: entity type doesn't match!")
         return null
             
-    attrs['type'] = oc
+    attrs['type'] = fixName(oc)
                 
     ismi_id = ent['id']
     # rename id to ismi_id
@@ -97,7 +140,7 @@
     # create node with attributes
     node = gdb.nodes.create(**attrs)
     # add labels
-    node.labels.add(['project_ismi', etype])
+    node.labels.add([project_label, fixName(etype)])
     return node
 
 
@@ -144,7 +187,13 @@
             print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
             continue
         
-        n4j_rel = gdb.relationships.create(src, rel_name, tar)
+        if add_inverse_relations:
+            n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
+                       gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]
+                       
+        else:
+            n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)
+
         n4j_relations[rel_id] = n4j_rel
         
     return n4j_relations