view importFromOpenMind/importer/ismi2neo4j.py @ 16:de0a06eef13b

new neo4j importer for network visualisation frontend.
author casties
date Fri, 28 Aug 2015 17:24:45 +0200
parents
children 4dfd832e9cd9
line wrap: on
line source

import urllib.request
import json
from neo4jrestclient.client import GraphDatabase, Node

# In[111]:
# Fixed list of ISMI entity type names. Only referenced by the commented-out
# importAllEnts(ismi_types) call at the end of this file.
ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

# Base URL of the ISMI JSON interface; query parameters are appended to it.
baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL template listing all entities of one type; %s is filled with the type
# name (see importEnts).
entsURL=baseURL+"method=get_ents&oc=%s"

# URL template fetching one entity with its content; %s is filled with the
# entity id (see importEnts).
entURL=baseURL+"method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch *url* and return its body parsed as JSON.

    The response body is decoded as UTF-8 before parsing. The response is
    used as a context manager so the underlying connection is closed as soon
    as the body has been read, even when reading raises.
    """
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
    
# Fetch the definitions from the JSON interface. NOTE: this performs an HTTP
# request as soon as the module is executed.
defs_json = readJSON(baseURL+"method=get_defs")

# The 'ov' value of every definition; this list is what gets passed to
# importAllEnts() at the end of the file.
ismi_defs = [atts['ov'] for atts in defs_json['defs']]


# Connection to the local Neo4j REST endpoint.
# NOTE(review): credentials are hard-coded here - consider moving them to
# configuration.
gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j")

# entity id -> Neo4j node, filled by importEnts()
n4j_nodes = {}
# relation id -> relation dict as delivered by the JSON interface,
# filled by relsFromEnt()
ismi_relations = {}
# relation id -> Neo4j relationship, filled by n4jrelationsFromRels()
n4j_relations = {}

# When True, importEnts() first looks for an existing node via getNode()
# and only creates a new node if none is found.
keep_nodes = False

# Attribute names that nodeFromEnt() does not copy onto the node.
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]


def getNode(ismi_id=None):
    """Look up an existing Neo4j node by its ``ismi_id`` property.

    Returns the first entry of the query result, or None when *ismi_id* is
    None or the query yields nothing.
    """
    if ismi_id is None:
        return None

    # Query for the id that was actually requested (a fixed id used to be
    # interpolated here, so every lookup returned the same node).
    # NOTE(review): the id is formatted into the query unquoted, so this
    # assumes numeric ids - confirm against the JSON interface.
    res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
    if len(res) > 0:
        # NOTE(review): neo4jrestclient may deliver each result as a row
        # (list of columns); if so this should be res[0][0] - confirm.
        return res[0]

    return None

def nodeFromEnt(ent, etype):
    """Create a Neo4j node from an entity dict of the JSON interface.

    Attributes with content type 'text', 'arabic', 'bool' or 'url' are copied
    onto the node unless their name is listed in ``ent_exclude_attrs``.
    Attributes of any other content type (including 'date' and 'old') are
    skipped. The entity's 'oc', 'id' and (if present) 'ov' are stored as
    'type', 'ismi_id' and 'label'; they overwrite copied attributes of the
    same name.

    The node is labelled 'project_ismi' and *etype*.

    Returns the created node, or None (after printing an error) when the
    entity's 'oc' does not equal *etype*.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct not in ('text', 'arabic', 'bool', 'url'):
            # 'date' attributes are not imported (no mapping decided yet);
            # 'old' and unknown content types are ignored as well
            continue

        # normal text attribute
        key = att['name']
        val = att['ov']

        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = val

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was `return null`, which raises NameError in Python
        return None

    attrs['type'] = oc

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node


# In[77]:

def relsFromEnt(ent, relations):
    """Collect the relations of one entity into *relations*.

    Every relation found under the entity's 'src_rels' and 'tar_rels' keys is
    stored in *relations* under its 'id'. If a relation with the same id is
    already stored but differs, an error is printed and the stored one is
    kept.

    Returns *relations* (the same dict that was passed in).
    """
    outgoing = ent.get('src_rels', [])
    incoming = ent.get('tar_rels', [])
    for candidate in outgoing + incoming:
        key = candidate['id']
        if key in relations and candidate != relations[key]:
            # conflicting duplicate: report it and keep the first version
            known = relations[key]
            print("ERROR: relation is different: %s != %s"%(repr(candidate), repr(known)))
        else:
            relations[key] = candidate

    return relations


# In[110]:

def n4jrelationsFromRels(rels, nodes):
    """Create a Neo4j relationship for every relation in *rels*.

    *rels* maps relation ids to relation dicts with the keys 'id', 'name',
    'src_id' and 'tar_id'; *nodes* maps entity ids to Neo4j nodes. A relation
    whose source or target id is not in *nodes* is reported and skipped.
    Progress is printed every 100 relations.

    Created relationships are stored in the module-level ``n4j_relations``
    dict under the relation id; that dict is returned.
    """
    print("importing %s relations"%len(rels))
    for count, relation in enumerate(rels.values(), 1):
        if count % 100 == 0:
            print(" %s relations"%count)

        rid, label, source_id, target_id = (
            relation['id'],
            relation['name'],
            relation['src_id'],
            relation['tar_id'],
        )

        source = nodes.get(source_id)
        if source is None:
            print("ERROR: relation %s src node %s missing!"%(rid,source_id))
            continue

        target = nodes.get(target_id)
        if target is None:
            print("ERROR: relation %s tar node %s missing!"%(rid,target_id))
            continue

        # both endpoints are known: create the relationship and remember it
        n4j_relations[rid] = gdb.relationships.create(source, label, target)

    return n4j_relations


# In[114]:

def importEnts(etype):
    """Import all entities of type *etype* as Neo4j nodes.

    Reads the entity list for *etype* from the JSON interface, then fetches
    each entity in full. If ``keep_nodes`` is set, an existing node is looked
    up with getNode() first; otherwise (or if none is found) a node is
    created with nodeFromEnt(). The node is stored in ``n4j_nodes`` under the
    entity id and the entity's relations are collected into
    ``ismi_relations``. Progress is printed every 100 entities.

    If an entity id is already present in ``n4j_nodes`` an error is printed
    and the import of this type stops. The check runs before the entity is
    fetched, so no node is created for the duplicate.
    """
    # read json for all entities of given type
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, 1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # refuse duplicates before touching the database
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # reuse an existing neo4j node if requested, else create one
        node = None
        if keep_nodes:
            node = getNode(ismi_id)

        if node is None:
            node = nodeFromEnt(ent_data, etype)

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)


# In[119]:

def importAllEnts(etypes):
    """Import the entities of every type in *etypes*, then their relations.

    All nodes are imported first (filling ``n4j_nodes`` and
    ``ismi_relations``); the Neo4j relationships are created afterwards from
    the collected relations.
    """
    # first pass: nodes of every requested type
    for entity_type in etypes:
        importEnts(entity_type)

    # second pass: relationships between the imported nodes
    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

# Alternative entry point: import only the fixed type list ismi_types.
#importAllEnts(ismi_types)
# Run the import for every type name obtained from the get_defs request
# (ismi_defs). NOTE: this runs as soon as the module is executed.
importAllEnts(ismi_defs)