Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismi2neo4j.py @ 17:4dfd832e9cd9
added automatic creation of inverse relations.
added more attribute types.
author | casties |
---|---|
date | Thu, 03 Sep 2015 18:48:21 +0200 |
parents | de0a06eef13b |
children | 0827156df210 |
line wrap: on
line source
"""Import entities and relations from an OpenMind JSON interface into neo4j.

Every entity type listed by the OpenMind ``get_defs`` method is read, each
entity becomes a neo4j node and each relation becomes a neo4j relationship
(optionally together with an inverse relationship).

NOTE: this is a script; importing it runs the whole import.
"""
import urllib.request
import json

from neo4jrestclient.client import GraphDatabase, Node

## configure behaviour

# add inverse relations as "<relation"
add_inverse_relations = True

# try to find and re-use existing nodes in neo4j (slow!)
keep_nodes = False

# label added to all nodes
project_label = '_ismi_inv_rel'

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# neo4j base URL
neo4jBaseURL = "http://localhost:7474/db/data/"

entsURL = baseURL + "method=get_ents&oc=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The body is decoded as UTF-8. The HTTP response is closed before the
    function returns, so repeated calls do not accumulate open connections.
    """
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))


defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

# alternative: import only a fixed set of types
#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

# NOTE(review): credentials are hard-coded; consider reading them from
# configuration instead.
gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# neo4j nodes by ismi_id
n4j_nodes = {}
# OpenMind relations (JSON dicts) by relation id
ismi_relations = {}
# neo4j relationships by relation id
n4j_relations = {}

# attribute names that are not copied to neo4j nodes
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]


def fixName(name, is_src_rel=False, is_tar_rel=False):
    """Return ``name`` adapted for use as a neo4j label or relation type.

    'FLORUIT' is replaced by 'FLOURISH'; only if the upper-case form does not
    occur, 'floruit' is replaced by 'flourish'. With ``is_src_rel`` a '>' is
    appended, with ``is_tar_rel`` a '<' is prepended.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')
    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        name = name + '>'

    if is_tar_rel:
        name = '<' + name

    return name


def getNode(ismi_id=None):
    """Return the first neo4j node whose ``ismi_id`` property matches, or None.

    Returns None when ``ismi_id`` is None or the query yields no result.
    The id is interpolated into the query text unquoted, so it is presumably
    expected to be numeric -- TODO confirm.
    """
    if ismi_id is not None:
        res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
        if len(res) > 0:
            return res[0]

    return None


def nodeFromEnt(ent, etype):
    """Create a neo4j node from the OpenMind entity ``ent``.

    Attributes without content_type or with a text-like content_type are
    copied as they are, 'num' attributes are converted with ``int()``, 'date'
    and 'old' attributes are ignored and unknown content types are reported
    and ignored. Attributes named in ``ent_exclude_attrs`` are skipped.
    The node also gets the properties ``type``, ``ismi_id`` and (when the
    entity has an 'ov') ``label``, plus the labels ``project_label`` and the
    fixed type name.

    Returns the created node, or None (after printing an error, without
    creating anything) when the entity's 'oc' differs from ``etype``.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        # normal text attribute (assume no content_type is text too...)
        is_text = ct is None or ct.lower() in text_types
        if is_text or ct == 'num':
            key = att['name']
            val = att['ov']
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = val if is_text else int(val)

        elif ct == 'date':
            # date attribute: don't know what to do with dates yet, ignore
            continue

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs['type'] = fixName(oc)

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node


# In[77]:

def relsFromEnt(ent, relations):
    """Collect the relations of entity ``ent`` into the dict ``relations``.

    Both 'src_rels' and 'tar_rels' of the entity are added, keyed by relation
    id. A relation whose id is already present is not stored again; if it
    differs from the stored one an error is printed.

    Returns ``relations`` (which is modified in place).
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
            # keep the relation that is already stored
            continue

        relations[rel_id] = rel

    return relations


# In[110]:

def n4jrelationsFromRels(rels, nodes):
    """Create neo4j relationships for all OpenMind relations in ``rels``.

    ``nodes`` maps ismi ids to neo4j nodes. Relations whose source or target
    node is missing (or None) are reported and skipped. With
    ``add_inverse_relations`` two relationships are created per relation
    ("name>" from source to target and "<name" from target to source) and
    stored as a list, otherwise a single relationship is stored.

    Returns the module-level dict ``n4j_relations``, which is updated with
    the created relationships keyed by relation id.
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), start=1):
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if add_inverse_relations:
            n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
                       gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]
        else:
            n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)

        n4j_relations[rel_id] = n4j_rel

    return n4j_relations


# In[114]:

def importEnts(etype):
    """Import all OpenMind entities of type ``etype`` as neo4j nodes.

    Nodes are stored in ``n4j_nodes`` by ismi id and the relations of every
    imported entity are collected in ``ismi_relations``. With ``keep_nodes``
    an existing neo4j node with the same ismi id is re-used.

    The import of this type is aborted when an entity id has been imported
    before. Entities for which no node could be created are skipped.
    """
    # read json for all entities of given type
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, start=1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # check for duplicates before touching neo4j so that no orphaned
        # duplicate node is created
        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # create neo4j node
        node = getNode(ismi_id) if keep_nodes else None
        if node is None:
            node = nodeFromEnt(ent_data, etype)

        if node is None:
            # nodeFromEnt has already reported the problem
            continue

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)

        # for testing: stop after the first 100 entities
        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import the entities of all types in ``etypes``, then all relations.

    Relations are created last because both end nodes have to exist.
    """
    for etype in etypes:
        importEnts(etype)

    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

#importAllEnts(ismi_types)
importAllEnts(ismi_defs)