# HG changeset patch
# User casties
# Date 1440775485 -7200
# Node ID de0a06eef13b7d0dfc1aa0347ebaf295aea181cf
# Parent  61767ff5ce2be9cdc43c03b71f023c6aa248a96d
# new neo4j importer for network visualisation frontend.
#
# Added by this changeset as importFromOpenMind/importer/ismi2neo4j.py
# (diff -r 61767ff5ce2b -r de0a06eef13b, Fri Aug 28 17:24:45 2015 +0200).
"""Import ISMI entities and relations into Neo4j.

The script reads the entity definitions and all entities from the JSON
interface at ``baseURL``, creates one Neo4j node per entity and, once all
entities are read, one Neo4j relationship per collected relation.

NOTE: all work happens at module level. Running (or importing) this module
immediately contacts the JSON interface and the Neo4j server and starts the
import.
"""
import urllib.request
import json
from neo4jrestclient.client import GraphDatabase, Node

# In[111]:
ismi_types = ["PERSON", "WITNESS", "CODEX", "PLACE", "COLLECTION", "REPOSITORY"]

baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

entsURL = baseURL + "method=get_ents&oc=%s"

entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch ``url`` and return the UTF-8 decoded response parsed as JSON."""
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))


defs_json = readJSON(baseURL + "method=get_defs")

# entity type names as delivered by the get_defs method
ismi_defs = [atts['ov'] for atts in defs_json['defs']]


# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration.
gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j")

# ismi_id -> Neo4j node created (or found) for that entity
n4j_nodes = {}
# relation id -> relation dict as read from the JSON interface
ismi_relations = {}
# relation id -> Neo4j relationship created for that relation
n4j_relations = {}

# when True, importEnts() first looks for an existing node via getNode()
keep_nodes = False

# attribute names that are never copied onto a node
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]


def getNode(ismi_id=None):
    """Return the first query result for the node with ``ismi_id``.

    Returns None when ``ismi_id`` is None or the query has no results.
    """
    if ismi_id is not None:
        # Query for the requested id (the id used to be hard-coded to 40635).
        # The id is interpolated with %s as before, so it is expected to be
        # numeric -- TODO confirm against the JSON interface.
        res = gdb.query("match (n {ismi_id: %s}) return n" % ismi_id, returns=Node)
        if len(res) > 0:
            # NOTE(review): neo4jrestclient may return a row (list) here
            # rather than a bare Node -- verify before enabling keep_nodes.
            return res[0]

    return None


def nodeFromEnt(ent, etype):
    """Create a Neo4j node from the entity dict ``ent``.

    Attributes with content_type 'text', 'arabic', 'bool' or 'url' are copied
    to the node unless their name is in ``ent_exclude_attrs``; all other
    attributes are ignored. The node also gets the properties 'type',
    'ismi_id' and (if the entity has an 'ov') 'label', and the labels
    'project_ismi' and ``etype``.

    Returns the created node, or None if the entity's 'oc' does not match
    ``etype``.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct in ['text', 'arabic', 'bool', 'url']:
            # normal text attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'date':
            # date attributes are not imported (yet)
            continue

        else:
            # ignore 'old' and all other (unknown) content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was "return null", which raises NameError in Python
        return None

    attrs['type'] = oc

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node


# In[77]:

def relsFromEnt(ent, relations):
    """Collect the relations of entity ``ent`` into the dict ``relations``.

    Reads the 'src_rels' and 'tar_rels' lists of ``ent`` and stores every
    relation under its 'id'. A relation whose id is already present is not
    stored again; an error is printed if it differs from the stored one.

    Returns ``relations`` (which is modified in place).
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s" % (repr(rel), repr(old_rel)))

            continue

        relations[rel_id] = rel

    return relations


# In[110]:

def n4jrelationsFromRels(rels, nodes):
    """Create a Neo4j relationship for every relation in ``rels``.

    ``rels`` maps relation ids to relation dicts, ``nodes`` maps ismi ids to
    Neo4j nodes. Relations whose source or target node is missing from
    ``nodes`` are reported and skipped.

    Created relationships are stored in the module-level ``n4j_relations``
    dict, which is also returned.
    """
    # go through all rels
    print("importing %s relations" % len(rels))
    for cnt, rel in enumerate(rels.values(), 1):
        if cnt % 100 == 0:
            print(" %s relations" % cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!" % (rel_id, src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!" % (rel_id, tar_id))
            continue

        n4j_rel = gdb.relationships.create(src, rel_name, tar)
        n4j_relations[rel_id] = n4j_rel

    return n4j_relations


# In[114]:

def importEnts(etype):
    """Import all entities of type ``etype`` as Neo4j nodes.

    Every entity is fetched in full, turned into a node (or looked up first
    if ``keep_nodes`` is set), remembered in ``n4j_nodes`` and its relations
    are collected in ``ismi_relations``.

    The import of this type is aborted when an entity id has been seen before.
    """
    # read json for all entities of given type
    ents_json = readJSON(entsURL % etype)
    ents = ents_json['ents']
    print("importing %s %ss" % (len(ents), etype))
    for cnt, ent in enumerate(ents, 1):
        if cnt % 100 == 0:
            print(" %s %ss" % (cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # Check for duplicates before fetching and creating anything, so that
        # no second node is created in Neo4j for an id that already exists.
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!" % ismi_id)
            return

        node = None

        # fetch full data for entity
        ent_json = readJSON(entURL % ismi_id)
        ent_data = ent_json['ent']
        # create neo4j node
        if keep_nodes:
            node = getNode(ismi_id)

        if node is None:
            node = nodeFromEnt(ent_data, etype)

        if node is None:
            # nodeFromEnt already reported the type mismatch; skip the entity
            continue

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import the entities of all ``etypes`` and then all their relations."""
    for etype in etypes:
        importEnts(etype)

    # relations are created last so that all nodes are known
    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

#importAllEnts(ismi_types)
importAllEnts(ismi_defs)