Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismi2neo4j.py @ 16:de0a06eef13b
new neo4j importer for network visualisation frontend.
author | casties |
---|---|
date | Fri, 28 Aug 2015 17:24:45 +0200 |
parents | |
children | 4dfd832e9cd9 |
line wrap: on
line source
import json
import urllib.request

from neo4jrestclient.client import GraphDatabase, Node

# In[111]:

# entity types of interest (kept for manual runs, see the end of the file)
ismi_types = ["PERSON", "WITNESS", "CODEX", "PLACE", "COLLECTION", "REPOSITORY"]

# OpenMind JSON interface endpoints
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"
entsURL = baseURL + "method=get_ents&oc=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch *url* and return the decoded JSON document.

    The response body is decoded as UTF-8 before parsing. The HTTP
    response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))


# all entity type names ('ov' of each definition) known to the server
defs_json = readJSON(baseURL + "method=get_defs")
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

# NOTE(review): credentials are hard-coded; consider moving them to config.
gdb = GraphDatabase("http://localhost:7474/db/data/", username="neo4j", password="neo5j")

# ismi_id -> neo4j node
n4j_nodes = {}
# relation id -> relation dict as delivered by the JSON interface
ismi_relations = {}
# relation id -> neo4j relationship
n4j_relations = {}

# if True, look for an existing node in neo4j before creating a new one
keep_nodes = False

# attribute names that are not copied to the neo4j node
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov',
]


def getNode(ismi_id=None):
    """Return the first neo4j query result for the given ismi_id.

    Returns None if ismi_id is None or if the query has no results.
    """
    if ismi_id is not None:
        # query for the requested id (was a hard-coded id before)
        res = gdb.query("match (n {ismi_id: %s}) return n" % ismi_id, returns=(Node))
        # NOTE(review): res[0] is returned unchanged; confirm against the
        # neo4jrestclient docs whether this is a Node or a one-element row.
        if len(res) > 0:
            return res[0]

    return None


def nodeFromEnt(ent, etype):
    """Create a neo4j node from the entity dict *ent*.

    Attributes with content_type 'text', 'arabic', 'bool' or 'url' are
    copied to the node unless their name is in ent_exclude_attrs; all
    other attributes (including dates) are ignored. The entity's 'oc',
    'id' and 'ov' are stored as 'type', 'ismi_id' and 'label'.

    Returns the new node, or None (after printing an error) if the
    entity's 'oc' does not match *etype*.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct not in ['text', 'arabic', 'bool', 'url']:
            # 'date' attributes are not handled yet, 'old' and unknown
            # content types are ignored
            continue

        # normal text attribute
        key = att['name']
        if key in ent_exclude_attrs:
            # exclude attribute
            continue

        # keep attribute
        attrs[key] = att['ov']

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs['type'] = oc

    # rename id to ismi_id
    attrs['ismi_id'] = ent['id']

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add(['project_ismi', etype])
    return node


# In[77]:

def relsFromEnt(ent, relations):
    """Collect the relations of entity *ent* into the dict *relations*.

    Reads 'src_rels' and 'tar_rels' of the entity and stores each
    relation under its 'id'. If a relation with the same id but
    different content is already present, an error is printed and the
    existing entry is kept.

    Returns *relations* (which is modified in place).
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
                continue

        relations[rel_id] = rel

    return relations


# In[110]:

def n4jrelationsFromRels(rels, nodes):
    """Create neo4j relationships for all relations in *rels*.

    *rels* maps relation ids to relation dicts with 'id', 'name',
    'src_id' and 'tar_id'; *nodes* maps ismi ids to neo4j nodes.
    Relations whose source or target node is missing from *nodes* are
    reported and skipped. Created relationships are stored in the
    module-level n4j_relations dict, which is also returned.
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    for cnt, rel in enumerate(rels.values(), 1):
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']

        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        n4j_rel = gdb.relationships.create(src, rel_name, tar)
        n4j_relations[rel_id] = n4j_rel

    return n4j_relations


# In[114]:

def importEnts(etype):
    """Import all entities of type *etype* as neo4j nodes.

    Fetches the list of entities, then the full data of each entity,
    creates (or, if keep_nodes is set, tries to reuse) a node for it,
    stores the node in n4j_nodes and collects the entity's relations in
    ismi_relations. Stops with an error message if an entity id has
    already been imported.
    """
    # read json for all entities of given type
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    for cnt, ent in enumerate(ents, 1):
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract ismi id
        ismi_id = ent['id']

        # fetch full data for entity
        ent_json = readJSON(entURL%ismi_id)
        ent_data = ent_json['ent']

        # create neo4j node
        node = None
        if keep_nodes:
            node = getNode(ismi_id)

        if node is None:
            # may still be None if the entity type doesn't match
            node = nodeFromEnt(ent_data, etype)

        if ismi_id in n4j_nodes:
            print("ERROR: entity with id=%s exists!"%ismi_id)
            return

        # save node reference
        n4j_nodes[ismi_id] = node

        # extract relations
        relsFromEnt(ent_data, ismi_relations)


# In[119]:

def importAllEnts(etypes):
    """Import all entities of the given types, then all their relations."""
    for etype in etypes:
        importEnts(etype)

    n4jrelationsFromRels(ismi_relations, n4j_nodes)


# In[120]:

#importAllEnts(ismi_types)
importAllEnts(ismi_defs)