view importFromOpenMind/importer/ismi2neo4j.py @ 17:4dfd832e9cd9

added automatic creation of inverse relations. added more attribute types.
author casties
date Thu, 03 Sep 2015 18:48:21 +0200
parents de0a06eef13b
children 0827156df210
line wrap: on
line source

import urllib.request
import json
from neo4jrestclient.client import GraphDatabase, Node

## configuration

# --- endpoints ---

# OpenMind JSON interface; query parameters are appended directly
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# root of the neo4j REST API
neo4jBaseURL = "http://localhost:7474/db/data/"

# request templates: fill in the entity type / the entity id with "%"
entsURL = baseURL + "method=get_ents&oc=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"

# --- behaviour ---

# when True every relation is created twice: "name>" from source to target
# and "<name" from target back to source
add_inverse_relations = True

# when True look for an existing neo4j node before creating one (slow!)
keep_nodes = False

# label attached to every node created by this import
project_label = '_ismi_inv_rel'


def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The body is decoded as UTF-8 before parsing. The response is closed as
    soon as the body has been read, also when reading raises.
    """
    # the response object is a context manager; using it avoids leaking the
    # connection for every one of the many requests made during an import
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))
    
# ask OpenMind for its definitions (one request at module load)
defs_json = readJSON(baseURL + "method=get_defs")

# the 'ov' value of every entry in 'defs'; used as the list of entity types
# to import
ismi_defs = [definition['ov'] for definition in defs_json['defs']]

# hand-picked subset of types, kept for reference:
#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# client for the neo4j REST API
gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# import state shared by the functions below
# ismi id -> neo4j node
n4j_nodes = dict()
# relation id -> relation dict as delivered by OpenMind
ismi_relations = dict()
# relation id -> created neo4j relationship(s)
n4j_relations = dict()

# attribute names that are never copied to a node
ent_exclude_attrs = ['lw', 'node_type', 'nov']

def fixName(name, is_src_rel=False, is_tar_rel=False):
    """Return *name* cleaned up for use as a neo4j type or label.

    "FLORUIT" is rewritten to "FLOURISH"; only if that spelling is absent is
    "floruit" rewritten to "flourish". With is_src_rel a ">" is appended,
    with is_tar_rel a "<" is prepended.
    """
    # only the first spelling found is replaced, upper case taking precedence
    for wrong, right in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if wrong in name:
            name = name.replace(wrong, right)
            break

    prefix = '<' if is_tar_rel else ''
    suffix = '>' if is_src_rel else ''
    return prefix + name + suffix


def getNode(ismi_id=None):
    """Look up an existing neo4j node by its ``ismi_id`` property.

    Returns the first query result, or None when no id was given or nothing
    matched.
    """
    if ismi_id is None:
        return None

    # hand the id over as a Cypher parameter instead of formatting it into
    # the query text: the value keeps its type (a string id is matched as a
    # string) and cannot alter the query
    res = gdb.query("match (n {ismi_id: {ismi_id}}) return n",
                    params={"ismi_id": ismi_id}, returns=(Node))
    if len(res) > 0:
        # NOTE(review): neo4jrestclient results look like rows (one list per
        # result); confirm whether res[0] is the node or a one-element row
        return res[0]

    return None


def nodeFromEnt(ent, etype):
    """Create a neo4j node for one OpenMind entity.

    Attributes in ent['atts'] are copied to node properties depending on
    their content_type:

    * none, or text/arabic/bool/boolean/url/language (any case): value
      copied as is
    * 'num': value converted with int()
    * 'date' and 'old': skipped
    * anything else: skipped with a warning

    Attribute names listed in ent_exclude_attrs are never copied. The node
    also gets 'type' (fixed-up ent['oc']), 'ismi_id' (ent['id']) and, if the
    entity has an 'ov', 'label'. The labels project_label and the fixed-up
    etype are added to the node.

    Args:
        ent: entity dict; reads 'atts', 'oc', 'id' and optionally 'ov'.
        etype: expected entity type, must be equal to ent['oc'].

    Returns:
        The created node, or None if ent['oc'] differs from etype.
    """
    text_types = ['text', 'arabic', 'bool', 'boolean', 'url', 'language']
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            if key not in ent_exclude_attrs:
                attrs[key] = att['ov']

        elif ct == 'num':
            # number attribute, assume num is int
            key = att['name']
            if key not in ent_exclude_attrs:
                attrs[key] = int(att['ov'])

        elif ct == 'date':
            # dates are not imported (no target representation yet)
            continue

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was "return null", which raised NameError instead of returning
        return None

    attrs['type'] = fixName(oc)

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node


# In[77]:

def relsFromEnt(ent, relations):
    """Merge the relations of one entity into *relations*, keyed by id.

    Both ent['src_rels'] and ent['tar_rels'] are read (each may be absent).
    A relation whose id is already registered with different content is
    reported and left out; the registered one stays.

    Returns the same *relations* dict that was passed in.
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for rel in outgoing + incoming:
        rel_id = rel['id']
        if rel_id not in relations:
            # first time we see this relation
            relations[rel_id] = rel
            continue

        old_rel = relations[rel_id]
        if rel != old_rel:
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
        else:
            relations[rel_id] = rel

    return relations


# In[110]:

def n4jrelationsFromRels(rels, nodes):
    """Create neo4j relationships for all relations in *rels*.

    *rels* maps relation ids to relation dicts (keys 'id', 'name', 'src_id',
    'tar_id'); *nodes* maps ismi ids to neo4j nodes. A relation with a
    missing source or target node is reported and skipped.

    With add_inverse_relations set, two relationships are created per
    relation and stored as a list [forward, inverse]; otherwise a single
    relationship is stored. Results go into the module-level n4j_relations
    dict, which is also returned.
    """
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), start=1):
        # progress report
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']

        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if not add_inverse_relations:
            n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)
        else:
            # "name>" from source to target, "<name" back again
            forward = gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar)
            inverse = gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)
            n4j_rel = [forward, inverse]

        n4j_relations[rel_id] = n4j_rel

    return n4j_relations


# In[114]:

def importEnts(etype):
    """Import all OpenMind entities of type *etype* as neo4j nodes.

    Fetches the entity list for the type, then the full data of every
    entity. Each entity becomes a node (re-using an existing one if
    keep_nodes is set and one is found) registered in n4j_nodes under its
    ismi id; its relations are collected in ismi_relations.

    If an ismi id is already registered an error is printed and the import
    of this type stops.
    """
    # read json for all entities of given type
    # (not named "json" so that the json module is not shadowed)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, start=1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # check for duplicates before fetching or creating anything, so that
        # no extra node is left behind in neo4j for an id we already have
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # re-use an existing neo4j node if configured, else create one
        node = getNode(ismi_id) if keep_nodes else None
        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)

        # debugging aid: uncomment to stop after the first 100 entities
        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import the entities of every type in *etypes*, then their relations."""
    # nodes first: each type fills n4j_nodes and ismi_relations
    for entity_type in etypes:
        importEnts(entity_type)

    # relations are created last, from everything collected above
    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

# alternative: import only a hand-picked list of types (ismi_types is
# currently commented out in this file)
#importAllEnts(ismi_types)
# run the import for every definition reported by OpenMind; this executes
# whenever the module is loaded
importAllEnts(ismi_defs)