diff importFromOpenMind/importer/ismi2model.py @ 19:ca1e02a2a9c4

unfilteredIsmi: openmind to json exporter like filterISMI. ismi2model: openmind importer like ismi2neo4j that saves networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents
children 45a823b5bf33
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismi2model.py	Wed Sep 09 17:32:42 2015 +0200
@@ -0,0 +1,303 @@
+import urllib.request
+import json
+import networkx
+
## configure behaviour

# output filename for the pickled networkx graph
output_fn = "ismi_graph.gpickle"

# contract relations to these objects into attributes with the relations' name
# (empty list disables contraction)
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# OpenMind base URL
baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template: list all entities of one object class (oc)
entsURL=baseURL+"method=get_ents&oc=%s"

# URL template: fetch full content for a comma-separated list of entity ids
entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"

# URL template: fetch a single entity with full content by id
entURL=baseURL+"method=get_ent&id=%s&include_content=True"
+
+
def readJSON(url):
    """Read the given URL and return the parsed JSON response.

    Assumes the server answers with UTF-8 encoded JSON.
    """
    #print("JSON loading %s"%url)
    # BUG FIX: the response was never closed; use a context manager so the
    # HTTP connection is released even if reading or decoding fails
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
+    
# fetch all entity class definitions from OpenMind
# NOTE: this performs network access at import time
defs_json = readJSON(baseURL+"method=get_defs")

# current list of all definitions 
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# the graph being built; MultiDiGraph because two entities can be
# connected by more than one directed relation
nx_graph = networkx.MultiDiGraph()

# maps ismi id -> networkx node (attribute dict)
nx_nodes = {}
# maps relation id -> JSON relation
ismi_relations = {}
# maps relation id -> networkx edge
nx_relations = {}

# entity attributes that are never copied onto nodes (OpenMind internals)
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]
+
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an ISMI name for use in the graph.

    Replaces the misspelled 'FLORUIT'/'floruit' by 'FLOURISH'/'flourish',
    prefixes target-relation names with '<', and strips relation verb
    fragments when the name is to be used as an attribute key.
    """
    # fix the embarrassing misspelling (upper case form takes precedence,
    # so at most one of the two replacements runs)
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')
    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    # source-relation names are kept as-is (a '>' suffix was considered
    # at some point but is disabled); target relations get a '<' prefix
    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relation names so they work as attribute keys
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name
+
+
def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Adds the node to nx_graph (keyed by the entity's ismi id) and returns
    the node's attribute dict, or None if the entity's object class does
    not match etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']
            
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val
            
        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']
            
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)
            
        elif ct == 'date':
            # date attribute -- not imported yet
            key = att['name']
            val = att['ov']
            #print("don't know what to do with date: %s=%s"%(key,val))
            
        elif ct == 'old':
            # ignore attribute
            continue
            
        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue
            
    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # BUG FIX: was "return null" (undefined name -> NameError)
        return None
            
    attrs['type'] = fixName(oc)
                
    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id
            
    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov
                        
    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    node = nx_graph.node[ismi_id]
    
    return node
+
+
def relsFromEnt(ent, relations):
    """Collect all relations of a JSON entity.

    Stores each relation's JSON in the relations dict under the
    relation's id and returns the (mutated) dict. Conflicting duplicate
    relations are reported and skipped.
    """
    # source and target relations are treated alike
    for rel in ent.get('src_rels', []) + ent.get('tar_rels', []):
        key = rel['id']
        if key in relations:
            known = relations[key]
            if rel != known:
                # same id but different content: keep the first one seen
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(known)))
                continue

        relations[key] = rel

    return relations
+
+
def relationsFromRels(rels, nodes):
    """Create graph edges from JSON relations.

    Optionally contracts relations to configured object classes into
    attributes on the opposite node (see
    contract_relations_into_attributes).

    Args:
        rels: dict of JSON relations
        nodes: dict of existing graph nodes (by ismi id)
    Returns:
        dict of created edges by relation id
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)
            
        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if not src_id in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue
        
        if not tar_id in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue
        
        if contract_relations_into_attributes:
            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUG FIX: was src.get(...) -- 'src' was never defined
                src_node = nx_graph.node[src_id]
                # find a free attribute name by appending/incrementing a digit
                while src_node.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'
                    
                # add target node's label as attribute
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                src_node[att_name] = nx_graph.node[tar_id]['label']
                
            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUG FIX: was tar.get(...) -- 'tar' was never defined
                tar_node = nx_graph.node[tar_id]
                while tar_node.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'
                    
                # add source node's label as attribute
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                tar_node[att_name] = nx_graph.node[src_id]['label']
        
        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))
        
        nx_relations[rel_id] = nx_rel
        
    return nx_relations
+
+
def importEnts(etype):
    """Import all entities of the given type.

    Fetches the ids of all entities of the type, then loads the full
    entity data in batches of 100, creating a graph node for each entity
    and collecting its relations into ismi_relations.
    """
    # read json for all entities of given type
    # (FIX: don't shadow the imported json module with a local variable)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    # fetch full content in batches to keep request URLs manageable
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))
            
        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]
        
        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']
        
        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return
            
            # create graph node
            node = nodeFromEnt(ent_data, etype)
            
            # save node reference
            nx_nodes[ismi_id] = node
            
            # extract relations
            relsFromEnt(ent_data, ismi_relations)
        
        #if cnt >= 100:
        #    return
+
+
+# In[119]:
+
def importAllEnts(etypes):
    """Import entities of every given type, then build all relations.

    Nodes of all types must exist before edges are created, so the
    relation pass runs only after the last type has been imported.
    """
    for entity_type in etypes:
        importEnts(entity_type)

    relationsFromRels(ismi_relations, nx_nodes)
+
+
# In[120]:

# run the import for all entity definitions found in OpenMind
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

# report some statistics about the resulting graph
print("Graph info: %s"%networkx.info(nx_graph))
print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote file %s"%output_fn)