changeset 19:ca1e02a2a9c4

unfilteredIsmi: openmind to json exporter like filterISMI. ismi2model: openmind importer like ismi2neo4j that saves networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents 0827156df210
children bdf91a4a40ff
files importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/ismi2neo4j.py importFromOpenMind/importer/unfilteredISMI.py
diffstat 3 files changed, 524 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismi2model.py	Wed Sep 09 17:32:42 2015 +0200
@@ -0,0 +1,303 @@
+import urllib.request
+import json
+import networkx
+
+## configure behaviour
+
+# output filename for the pickled networkx graph
+output_fn = "ismi_graph.gpickle"
+
+# contract relations to these objects into attributes with the relations' name
+#contract_relations_into_attributes = ['PLACE', 'ALIAS']
+contract_relations_into_attributes = []
+
+# OpenMind base URL
+baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
+
+
+# URL template: id list of all entities of one object class (oc)
+entsURL=baseURL+"method=get_ents&oc=%s"
+
+# URL template: full content for a comma-separated list of entity ids
+entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"
+
+# URL template: full content for a single entity id
+entURL=baseURL+"method=get_ent&id=%s&include_content=True"
+
+
def readJSON(url):
    """Fetch the given URL and return the parsed JSON data.

    Raises urllib.error.URLError on connection problems and
    json.JSONDecodeError on malformed responses.
    """
    #print("JSON loading %s"%url)
    # BUG FIX: the HTTP response was never closed - use a context
    # manager so the connection is released even on errors
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))
+    
+# definitions of all object classes, fetched from the OpenMind server
+defs_json = readJSON(baseURL+"method=get_defs")
+
+# current list of all definitions 
+ismi_defs = [atts['ov'] for atts in defs_json['defs']]
+
+#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
+
+
+# the graph to build (allows multiple directed edges between two nodes)
+nx_graph = networkx.MultiDiGraph()
+
+# node attribute dicts by ismi id
+nx_nodes = {}
+# JSON relations by relation id
+ismi_relations = {}
+# created graph edges by relation id
+nx_relations = {}
+
+# entity attributes that are not copied into graph nodes
+ent_exclude_attrs = [
+    'lw',
+    'node_type',
+    'nov'
+]
+
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an ISMI attribute or relation name.

    Replaces FLORUIT/floruit by FLOURISH/flourish, prefixes
    target-relation names with '<' and strips 'is_'/'has_'/'was_'/'_of'
    fragments when a relation name is turned into an attribute name.

    Args:
        name: the name to fix.
        is_src_rel: name is used as a source relation (currently a no-op,
            parameter kept for interface compatibility).
        is_tar_rel: name is used as a target relation.
        att_from_rel: name is a relation name used as an attribute name.
    Returns:
        the fixed name.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')

    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    # NOTE: is_src_rel intentionally leaves the name unchanged
    # (the '>' suffix was disabled; dead pass-branch removed)

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relation fragments so the name works as an attribute name
        # (note: replaces the fragments anywhere in the string, not only
        # as prefix/suffix - existing behaviour, kept for compatibility)
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name
+
+
def nodeFromEnt(ent, etype):
    """Create a networkx graph node from the given JSON entity.

    Adds the node to nx_graph keyed by the entity's ismi id and returns
    the node's attribute dict, or None if the entity's object class does
    not match etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)

        elif ct == 'date':
            # date attribute - not imported yet
            key = att['name']
            val = att['ov']
            #print("don't know what to do with date: %s=%s"%(key,val))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # BUG FIX: was "return null" (NameError) - Python uses None
        return None

    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    node = nx_graph.node[ismi_id]

    return node
+
+
def relsFromEnt(ent, relations):
    """Extract all relations from a JSON entity.

    Collects the entity's src_rels and tar_rels into the relations dict,
    keyed by relation id. The first occurrence of a relation wins;
    conflicting duplicates (same id, different content) are reported and
    skipped.

    Returns the (updated) relations dict.
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        old_rel = relations.get(rel_id)
        if old_rel is not None and rel != old_rel:
            # same id but different content - keep the first one seen
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
            continue

        relations[rel_id] = rel

    return relations
+
+
def relationsFromRels(rels, nodes):
    """Create edges in the networkx graph for all relations.

    Args:
        rels: dict of JSON relations by relation id
        nodes: dict of existing graph nodes by ismi id
    Returns:
        dict of created edges by relation id
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if not src_id in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        if not tar_id in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUG FIX: was "src.get(...)" with undefined name src
                # (left over from ismi2neo4j) - use the source node's
                # attribute dict instead
                src_node = nx_graph.node[src_id]
                while src_node.get(att_name, None) is not None:
                    # attribute exists - find a free name
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add target node's label as attribute
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                src_node[att_name] = nx_graph.node[tar_id]['label']

            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUG FIX: was "tar.get(...)" with undefined name tar -
                # use the target node's attribute dict instead
                tar_node = nx_graph.node[tar_id]
                while tar_node.get(att_name, None) is not None:
                    # attribute exists - find a free name
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add source node's label as attribute
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                tar_node[att_name] = nx_graph.node[src_id]['label']

        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))

        nx_relations[rel_id] = nx_rel

    return nx_relations
+
+
def importEnts(etype):
    """Import all entities of the given type.

    Downloads the id list for etype, then fetches the full entity
    content in batches of 100 ids, creates a graph node for every
    entity and collects its relations in ismi_relations.
    """
    # fetch the (id-only) list of all entities of this type
    data = readJSON(entsURL%etype)
    ents = data['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    cnt = 0
    # walk the entity list in slices of `size` entries
    for pos in range(0, len(ents), size):
        batch = ents[pos:pos + size]
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # ismi ids of this batch, as strings for the URL
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch the full content for the whole batch in one request
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))

        # process every returned entity
        for ent_data in ent_json['ents']:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create the graph node and remember it by its ismi id
            nx_nodes[ismi_id] = nodeFromEnt(ent_data, etype)

            # collect this entity's relations for the second pass
            relsFromEnt(ent_data, ismi_relations)
+
+
+# In[119]:
+
def importAllEnts(etypes):
    """Import all entities of every type in etypes, then create all edges."""
    for entity_type in etypes:
        importEnts(entity_type)

    relationsFromRels(ismi_relations, nx_nodes)
+
+
+# In[120]:
+
+# import all entity types defined on the OpenMind server
+importAllEnts(ismi_defs)
+#importAllEnts(['TEXT'])
+
+# report the size of the resulting graph
+print("Graph info: %s"%networkx.info(nx_graph))
+print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
+print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
+#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
+# export the graph as a pickle file
+networkx.write_gpickle(nx_graph, output_fn)
+print("Wrote file %s"%output_fn)
--- a/importFromOpenMind/importer/ismi2neo4j.py	Mon Sep 07 16:57:10 2015 +0200
+++ b/importFromOpenMind/importer/ismi2neo4j.py	Wed Sep 09 17:32:42 2015 +0200
@@ -10,11 +10,8 @@
 # add relations to these objects as attributes with the relations name
 contract_relations_into_attributes = ['PLACE', 'ALIAS']
 
-# try to find and re-use existing nodes in neo4j (slow!)
-keep_nodes = False
-
 # label added to all nodes
-project_label = '_ismi2'
+project_label = '_ismi3'
 
 # OpenMind base URL
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
@@ -25,10 +22,13 @@
 
 entsURL=baseURL+"method=get_ents&oc=%s"
 
+entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"
+
 entURL=baseURL+"method=get_ent&id=%s&include_content=True"
 
 
 def readJSON(url):
+    #print("JSON loading %s"%url)
     wsh=urllib.request.urlopen(url)
     txt = wsh.read()
     return json.loads(txt.decode("utf-8"))
@@ -243,7 +243,7 @@
                     
                 # add target node's label as attribute
                 #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
-                src.set(att_name, src.get('label'))
+                tar.set(att_name, src.get('label'))
         
         if add_inverse_relations:
             n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
@@ -264,36 +264,36 @@
     json = readJSON(entsURL%etype)
     ents = json['ents']
     print("importing %s %ss"%(len(ents),etype))
+    size = 100
+    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
     cnt = 0
-    for ent in ents:
-        cnt += 1
+    for batch in batches:
+        cnt += size
         if cnt % 100 == 0:
             print(" %s %ss"%(cnt, etype))
             
-        # extract ismi id
-        ismi_id = ent['id']
-        
-        node = None
+        # extract list of ismi ids
+        ismi_ids = [str(ent['id']) for ent in batch]
         
-        # fetch full data for entity
-        ent_json = readJSON(entURL%ismi_id)
-        ent_data = ent_json['ent']
-        # create neo4j node
-        if keep_nodes:
-            node = getNode(ismi_id)
+        # fetch full data for list of entities
+        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
+        ents_data = ent_json['ents']
         
-        if ismi_id in n4j_nodes:
-            print("ERROR: entity with id=%s exists!"%ismi_id)
-            return
-        
-        if node is None:
+        # iterate through results batch
+        for ent_data in ents_data:
+            ismi_id = ent_data['id']
+            if ismi_id in n4j_nodes:
+                print("ERROR: entity with id=%s exists!"%ismi_id)
+                return
+            
+            # create neo4j node
             node = nodeFromEnt(ent_data, etype)
-        
-        # save node reference
-        n4j_nodes[ismi_id] = node
-        
-        # extract relations
-        relsFromEnt(ent_data, ismi_relations)
+            
+            # save node reference
+            n4j_nodes[ismi_id] = node
+            
+            # extract relations
+            relsFromEnt(ent_data, ismi_relations)
         
         #if cnt >= 100:
         #    return
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/unfilteredISMI.py	Wed Sep 09 17:32:42 2015 +0200
@@ -0,0 +1,193 @@
+'''
+Created on 22.04.2014
+
+@author: dwinter
+'''
+
+import os
+import json
+import urllib.request
+
+# base URL of the OpenMind server to export from
+#ismiBaseUrl="https://ismi.mpiwg-berlin.mpg.de/om4-ismi"
+ismiBaseUrl="http://localhost:18080/ismi-richfaces"
+
class Importer:
    """Exports entities and relations from an OpenMind server as JSON files."""

    def __init__(self):
        # BUG FIX: allents/allrels were mutable class attributes shared
        # by all instances - make them per-instance state
        # all downloaded entities by ismi id
        self.allents = {}
        # all seen relations by relation id
        self.allrels = {}
        # last downloaded/parsed JSON data
        self.data = None

    def loadJSON(self, url):
        """Load JSON from URL.

        Saves JSON in data member.
        """
        #print("  loading "+url)
        # BUG FIX: readall() is not available on HTTPResponse in current
        # Python 3 - use read(); also close the response via context manager
        with urllib.request.urlopen(url) as response:
            str_response = response.read().decode('utf-8')

        self.data = json.loads(str_response)

    def loadJSONFromFile(self, fn):
        """Load JSON from file fn.json.

        Saves JSON in data member.
        """
        print("  loading "+fn+".json")
        # BUG FIX: the encoding= keyword of json.load() was removed in
        # Python 3.9 - opening the file with UTF-8 encoding is sufficient;
        # also close the file via context manager
        with open(fn+".json", 'r', encoding="utf-8") as f:
            self.data = json.load(f)

    def getEntIds(self):
        """Extract entities from data member.

        Checks all relations.
        Returns a set of ids of related objects and a list of the relations.
        """
        ents = self.data.get("ents")

        ret = set()
        rels = []

        for ent in ents:
            ret.add(str(ent.get('id')))
            if 'src_rels' in ent:
                print("src_rels: %s"%ent.get('src_rels'))
                rels.extend(ent.get('src_rels'))

            if 'tar_rels' in ent:
                print("tar_rels: %s"%ent.get('tar_rels'))
                rels.extend(ent.get('tar_rels'))

        return ret, rels

    def _registerRels(self, newRels, rels):
        """Append newRels to rels and register them in allrels by id."""
        if not newRels:
            return
        rels.extend(newRels)
        for rel in newRels:
            rel_id = rel.get('id')
            if rel_id in self.allrels:
                print("relation id=%s exists!"%rel_id)
            else:
                self.allrels[rel_id] = rel

    def loadallEnts(self, kind="tar", filterOC=[]):
        """Get related entities from OpenMind.

        Collects the ids and first-order relations from the data member via
        getEntIds(), then downloads the full entities in portions of 500 ids.
        Entities are registered in allents, relations in allrels.
        Returns the list of entities and the list of relations.

        (kind and filterOC are currently unused; kept for interface
        compatibility.)
        """
        ids, rels = self.getEntIds()

        baseUrl = ismiBaseUrl+"/jsonInterface?include_content=true&include_romanization=true&method=get_ents"

        lenId = len(ids)
        print("loading %s entities"%lenId)

        idList = list(ids)
        ents = []
        # BUG FIX: the old portion arithmetic (int(n/500)+1 requests)
        # issued one extra, empty request when n was 0 or a multiple of 500
        for start in range(0, lenId, 500):
            idsFrak = idList[start:start + 500]
            idsString = ",".join(idsFrak)

            qs = baseUrl+"&ids="+idsString
            #print("  loading ents from "+qs)
            # BUG FIX: use read() instead of readall() and close the response
            with urllib.request.urlopen(qs) as response:
                entsJ = json.loads(response.read().decode('utf-8'))
            ents += entsJ.get("ents")

            # register all entities and their relations
            for ent in entsJ.get("ents"):
                ismi_id = ent.get('id')
                if ismi_id in self.allents:
                    print("entity id=%s exists!"%ismi_id)
                else:
                    self.allents[ismi_id] = ent

                # extract this entity's relations
                self._registerRels(ent.get('src_rels'), rels)
                self._registerRels(ent.get('tar_rels'), rels)

        return ents, rels

    def saveallEnts(self, filename, kind="tar", filterOC=[]):
        """Loads all related entities and saves as JSON.

        Loads all related entities using kind and filterOC via loadallEnts().
        Saves entities in file filename.json.
        Saves relations in file filename_rels.json.
        """
        ents, rels = self.loadallEnts(kind=kind, filterOC=filterOC)

        print("  writing ", filename+".json")
        with open(filename+".json", "wb") as of:
            of.write(json.dumps({"ents":ents}).encode('utf-8'))

        print("  writing ", filename+"_rels.json")
        # BUG FIX: was text mode with the platform default encoding -
        # write UTF-8 encoded bytes like the entities file
        with open(filename+"_rels.json", "wb") as of:
            of.write(json.dumps({"rels":rels}).encode('utf-8'))
+        
+    
if __name__ == '__main__':
    imp = Importer()

    # get current list of all definitions 
    imp.loadJSON(ismiBaseUrl+"/jsonInterface?method=get_defs")
    ismi_defs = [atts['ov'] for atts in imp.data['defs']]

    # create directory for export files
    exportDir = '/tmp/ismi_data'
    # BUG FIX: os.access(dir, os.R_OK) tests readability, not existence,
    # and is racy - just create the directory if it is missing
    os.makedirs(exportDir, exist_ok=True)

    for ismi_def in ismi_defs:
        print("loading entities of type %s"%ismi_def)
        #
        # load all entities of type ismi_def
        # contains entities with attributes and first-order relations
        #
        url = ismiBaseUrl+"/jsonInterface?method=get_ents&oc=%s"%ismi_def
        imp.loadJSON(url)

        #
        # load and save all target relations of entities as entities.json
        #
        imp.saveallEnts(exportDir+"/%s"%ismi_def)

    #
    # save all entities in one file
    #
    print("  writing ", "ALL.json")
    # use context managers so the files are closed even on errors
    with open(exportDir+"/ALL.json", "wb") as of:
        allents = list(imp.allents.values())
        of.write(json.dumps({"ents":allents}).encode('utf-8'))

    print("  writing ", "ALL_rels.json")
    with open(exportDir+"/ALL_rels.json", "wb") as of:
        allrels = list(imp.allrels.values())
        of.write(json.dumps({"rels":allrels}).encode('utf-8'))