view importFromOpenMind/importer/ismi2neo4j.py @ 28:a9bfd49355f8

updated config for ismi-dev.
author casties
date Wed, 18 Nov 2015 15:22:05 +0100
parents ca1e02a2a9c4
children
line wrap: on
line source

import urllib.request
import json
from neo4jrestclient.client import GraphDatabase, Node

## configure behaviour

# when True every relation is also created in the opposite direction,
# named "<relation"
add_inverse_relations = True

# relations from or to objects of these types are additionally stored as
# attributes (named after the relation) on the node at the other end
contract_relations_into_attributes = ['PLACE', 'ALIAS']

# label added to all nodes
project_label = '_ismi'

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# neo4j base URL
neo4jBaseURL = "http://localhost:7474/db/data/"

# URL templates for the OpenMind JSON interface, filled in with the % operator:
# list of entities of one type
entsURL = baseURL + "method=get_ents&oc=%s"
# full data of several entities by (comma separated) ids
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"
# full data of a single entity by id
entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Load the given URL and return its content parsed as JSON.

    The response body is decoded as UTF-8 before it is parsed.

    Args:
        url: URL to load
    Returns:
        the parsed JSON data
    """
    #print("JSON loading %s"%url)
    # use the response as context manager so the connection is always closed
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
    
# load the definitions from OpenMind (this request runs when the module is loaded)
defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [definition['ov'] for definition in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# client for the Neo4J REST interface
gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# created Neo4J nodes by ismi_id
n4j_nodes = {}
# JSON relations by relation id
ismi_relations = {}
# created Neo4J relations by relation id
n4j_relations = {}

# names of entity attributes that are not copied to the nodes
ent_exclude_attrs = ['lw', 'node_type', 'nov']

def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return a cleaned-up version of the given type or relation name.

    Args:
        name: original name
        is_src_rel: name is used for a forward relation (currently no change)
        is_tar_rel: name is used for an inverse relation, gets a '<' prefix
        att_from_rel: name is a relation name used as attribute name,
            the fragments 'is_', 'has_', 'was_' and '_of' are removed
    Returns:
        the fixed name
    """
    # these are too embarrassing...
    # (only the first spelling that is found gets replaced)
    for wrong, right in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if wrong in name:
            name = name.replace(wrong, right)
            break

    # forward relations keep their name (is_src_rel changes nothing)

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name


def getNode(ismi_id=None):
    """Return the Neo4J node with the given ismi_id.

    The id is inserted into the query text as it is (without quotes).

    Args:
        ismi_id: value of the node's ismi_id attribute
    Returns:
        the first matching node or None if there is no match or no id is given
    """
    if ismi_id is None:
        return None

    matches = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=Node)
    return matches[0] if len(matches) > 0 else None


def nodeFromEnt(ent, etype):
    """Create a Neo4J node from the given JSON entity.

    Creates the node in gdb and returns the node.

    Text and number attributes of the entity become attributes of the node
    (numbers are converted with int()). Attributes of type 'date' or 'old',
    of unknown type or with a name in ent_exclude_attrs are left out.
    The node also gets the attributes 'type', 'ismi_id' and (if the entity
    has an 'ov') 'label' and the labels project_label and the entity type.

    Args:
        ent: JSON entity (dict with 'oc', 'id', 'atts' and optionally 'ov')
        etype: expected type of the entity
    Returns:
        the new Neo4J node or None if the entity's type is not etype
        (no node is created in that case)
    """
    # check the type first so a wrong entity does not touch the database
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            is_num = False

        elif ct == 'num':
            # number attribute
            is_num = True

        elif ct in ('date', 'old'):
            # dates are not imported, old attributes are ignored
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

        key = att['name']
        val = att['ov']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute, assume num is int
        attrs[key] = int(val) if is_num else val

    # process base attributes (these replace entity attributes of the same name)
    attrs['type'] = fixName(oc)

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node


def relsFromEnt(ent, relations):
    """Collect the relations of a JSON entity.

    Every relation in the entity's 'src_rels' and 'tar_rels' is stored in
    the dict relations under the relation's id. If the dict already has a
    different relation with the same id an error is printed and the old
    relation is kept.

    Args:
        ent: JSON entity
        relations: dict of JSON relations by id (gets modified)
    Returns:
        the dict relations
    """
    # source relations first, then target relations
    all_rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in all_rels:
        rel_id = rel['id']
        if rel_id in relations and rel != relations[rel_id]:
            # conflicting data for a known relation: keep what we have
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(relations[rel_id])))
            continue

        relations[rel_id] = rel

    return relations


def _contractIntoAttribute(node, other, rel_name):
    """Save the label of the node other as attribute of node.

    The attribute is named after the relation. If node already has an
    attribute of that name, a '2' is appended or the last digit of the
    name is incremented until the name is unused.
    """
    att_name = fixName(rel_name, att_from_rel=True)
    # TODO: clean up attribute names
    while node.get(att_name, None) is not None:
        # attribute exists
        last = att_name[-1]
        if last.isnumeric():
            # increment last digit
            att_name = att_name[:-1] + str(int(last) + 1)
        else:
            att_name += '2'

    node.set(att_name, other.get('label'))


def n4jrelationsFromRels(rels, nodes):
    """Create relations in Neo4J.

    Relations with a missing source or target node are reported and skipped.
    Relations from or to objects of the types in
    contract_relations_into_attributes are also saved as attributes of the
    node at the other end. The created relations are stored in the
    module-level dict n4j_relations (a list of two relations per id if
    add_inverse_relations is set).

    Args:
        rels: dict of JSON relations
        nodes: dict of existing Neo4J nodes
    Returns:
        dict of Neo4J relations
    """
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), start=1):
        # show progress
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']

        # both ends of the relation have to exist
        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # target is contracted: its label becomes an attribute of the source
            if rel['tar_oc'] in contract_relations_into_attributes:
                _contractIntoAttribute(src, tar, rel_name)

            # source is contracted: its label becomes an attribute of the target
            if rel['src_oc'] in contract_relations_into_attributes:
                _contractIntoAttribute(tar, src, rel_name)

        if add_inverse_relations:
            # forward relation and inverse relation
            forward = gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar)
            inverse = gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)
            n4j_relations[rel_id] = [forward, inverse]

        else:
            n4j_relations[rel_id] = gdb.relationships.create(src, fixName(rel_name), tar)

    return n4j_relations


def importEnts(etype):
    """Import all entities of the given type.

    Reads the list of entities of the type, fetches the full data of the
    entities in batches of 100 and creates a Neo4J node for every entity.
    The nodes are saved in n4j_nodes and the relations of the entities are
    collected in ismi_relations.

    The import of the type is aborted with an error message if an entity
    id is already present in n4j_nodes.

    Args:
        etype: name of the entity type
    """
    # read json for all entities of given type
    # (the result is not called "json" because that hides the json module)
    ents = readJSON(entsURL%etype)['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    cnt = 0
    for pos in range(0, len(ents), size):
        batch = ents[pos:pos + size]

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))

        # iterate through results batch
        for ent_data in ent_json['ents']:
            ismi_id = ent_data['id']
            if ismi_id in n4j_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create neo4j node
            node = nodeFromEnt(ent_data, etype)
            if node is None:
                # no node was created for this entity, don't record it
                continue

            # save node reference
            n4j_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        # show progress: number of entities requested so far
        cnt += len(batch)
        print(" %s %ss"%(cnt, etype))


# In[119]:

def importAllEnts(etypes):
    """Import the entities of all given types and their relations.

    Args:
        etypes: list of names of entity types
    """
    for entity_type in etypes:
        importEnts(entity_type)

    # create the relations last: a relation is skipped when one of its
    # nodes is missing
    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

#importAllEnts(ismi_types)
# run the import for all definitions read from OpenMind
# (this is executed whenever the module is loaded)
importAllEnts(ismi_defs)