view importFromOpenMind/importer/ismi2model.py @ 19:ca1e02a2a9c4

unfilteredIsmi: OpenMind-to-JSON exporter, like filterISMI. ismi2model: OpenMind importer, like ismi2neo4j, that saves a networkx pickle file.
author casties
date Wed, 09 Sep 2015 17:32:42 +0200
parents
children 45a823b5bf33
line wrap: on
line source

import urllib.request
import json
import networkx

## configure behaviour

# output filename for the pickled networkx graph
output_fn = "ismi_graph.gpickle"

# contract relations to these objects into attributes with the relations' name
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# OpenMind base URL (assumes a locally running ismi-richfaces instance)
baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template: list all entities of one object class (%s = oc)
entsURL=baseURL+"method=get_ents&oc=%s"

# URL template: fetch full content for a comma-separated list of ids (%s = ids)
entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"

# URL template: fetch a single entity with full content (%s = id)
entURL=baseURL+"method=get_ent&id=%s&include_content=True"

def readJSON(url):
    """Fetch the given URL and parse the response body as JSON.

    Args:
        url: URL returning a UTF-8 encoded JSON document.
    Returns:
        The parsed JSON content (dict/list/...).
    """
    #print("JSON loading %s"%url)
    # close the connection even if reading or decoding fails
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))
    
# fetch entity class definitions from OpenMind
# NOTE: runs at import time and requires a reachable OpenMind server
defs_json = readJSON(baseURL+"method=get_defs")

# current list of all definitions 
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# the graph being built; nodes are keyed by ismi id
nx_graph = networkx.MultiDiGraph()

# node attribute dicts by ismi id
nx_nodes = {}
# collected JSON relations by relation id
ismi_relations = {}
# created graph edges by relation id
nx_relations = {}

# entity attributes that are not copied into the graph
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]

def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an ISMI name for use in the graph.

    Replaces FLORUIT/floruit by FLOURISH/flourish, prefixes target
    relation names with '<' and optionally strips relation verb
    fragments so the name can be used as an attribute name.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')
    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        # source relations keep their plain name
        #name = name + '>'
        pass

    if is_tar_rel:
        # mark inverse direction
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name


def nodeFromEnt(ent, etype):
    """Create a graph node from the given JSON entity.

    Copies text and numeric attributes from ent['atts'] (except those in
    ent_exclude_attrs), stores the entity type, its id (as 'ismi_id')
    and its ov (as 'label'), and adds the node to nx_graph.

    Args:
        ent: JSON entity with 'atts', 'oc', 'id' and optional 'ov'.
        etype: expected object class; must match ent['oc'].
    Returns:
        The attribute dict of the created node, or None on type mismatch.
    """ 
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']
            
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val
            
        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']
            
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)
            
        elif ct == 'date':
            # date attributes are currently dropped
            # (no graph representation for OpenMind dates yet)
            continue
            
        elif ct == 'old':
            # ignore attribute
            continue
            
        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue
            
    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was `return null` (a NameError); return None to signal failure
        return None
            
    attrs['type'] = fixName(oc)
                
    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id
            
    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov
                        
    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    node = nx_graph.node[ismi_id]
    
    return node


def relsFromEnt(ent, relations):
    """Collect all relations of a JSON entity.

    Merges the entity's src_rels and tar_rels into the given dict,
    keyed by relation id. Conflicting duplicates are reported and
    skipped.

    Returns:
        the (updated) relations dict.
    """
    for rel in ent.get('src_rels', []) + ent.get('tar_rels', []):
        key = rel['id']
        known = relations.get(key)
        if known is not None and known != rel:
            # same id seen before with different content: keep the first
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(known)))
            continue

        relations[key] = rel

    return relations


def _uniqueAttrName(node, att_name):
    """Return att_name made unique among the attributes of node.

    While att_name already exists on the node, increment a trailing
    digit (or append '2') until a free name is found.
    """
    while node.get(att_name, None) is not None:
        if att_name[-1].isnumeric():
            # increment last digit
            att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
        else:
            att_name += '2'
    return att_name


def relationsFromRels(rels, nodes):
    """Create graph edges from JSON relations.

    Args:
        rels: dict of JSON relations (by relation id)
        nodes: dict of existing graph nodes (by ismi id)
    Returns:
        dict of created edges (by relation id)
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)
            
        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        # both endpoints must have been imported already
        if not src_id in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue
        
        if not tar_id in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue
        
        if contract_relations_into_attributes:
            # contract relations pointing at configured target types
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                # was `src.get(...)` with undefined `src`; use the source node dict
                att_name = _uniqueAttrName(nx_graph.node[src_id],
                                           fixName(rel_name, att_from_rel=True))
                # add target node's label as attribute of the source node
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']
                
            # contract relations originating from configured source types
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                # was `tar.get(...)` with undefined `tar`; use the target node dict
                att_name = _uniqueAttrName(nx_graph.node[tar_id],
                                           fixName(rel_name, att_from_rel=True))
                # add source node's label as attribute of the target node
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']
        
        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))
        
        nx_relations[rel_id] = nx_rel
        
    return nx_relations


def importEnts(etype):
    """Import all entities of the given type.

    Fetches the id list from OpenMind, then loads the full entity data
    in batches of 100, creates a graph node per entity (stored in
    nx_nodes) and collects all relations (in ismi_relations).
    """
    # read json for all entities of given type
    # (named `data`, not `json`, to avoid shadowing the json module)
    data = readJSON(entsURL%etype)
    ents = data['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))
            
        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]
        
        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']
        
        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                # duplicate id: abort the import of this type
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return
            
            # create graph node
            node = nodeFromEnt(ent_data, etype)
            
            # save node reference
            nx_nodes[ismi_id] = node
            
            # extract relations
            relsFromEnt(ent_data, ismi_relations)
        
        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import all entities of the given types, then create their relations.

    Nodes are imported first so that all relation endpoints exist when
    the edges are created.
    """
    # first pass: create nodes and collect relations
    for etype in etypes:
        importEnts(etype)

    # second pass: create the edges
    relationsFromRels(ismi_relations, nx_nodes)


# In[120]:

# run the full import for every entity class defined in OpenMind
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

# print a summary of the imported graph
print("Graph info: %s"%networkx.info(nx_graph))
print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
# export pickle
# NOTE(review): networkx.info and write_gpickle were removed in networkx 3.x;
# this file appears to target the 1.x API (see nx_graph.node usage) -- confirm version
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote file %s"%output_fn)