Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismi2neo4j.py @ 31:48bbba800c03
remove unused method.
author | casties |
---|---|
date | Thu, 21 Jan 2016 18:24:51 +0100 |
parents | a9bfd49355f8 |
children |
line wrap: on
line source
"""Import the ISMI OpenMind database into Neo4J.

Reads all entities and their relations from the OpenMind JSON interface,
creates one Neo4J node per entity and afterwards one (or two, if inverse
relations are enabled) Neo4J relationship per OpenMind relation.

NOTE: this is a script; loading the module reads the definitions from
OpenMind, connects to Neo4J and runs the complete import.
"""
import urllib.request
import json

from neo4jrestclient.client import GraphDatabase, Node

## configure behaviour

# add inverse relations as "<relation"
add_inverse_relations = True

# add relations to these objects as attributes with the relations name
contract_relations_into_attributes = ['PLACE', 'ALIAS']

# label added to all nodes
project_label = '_ismi'

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# neo4j base URL
neo4jBaseURL = "http://localhost:7474/db/data/"

entsURL = baseURL + "method=get_ents&oc=%s"

entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"

entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch the given URL and return its body parsed as JSON.

    The response body is decoded as UTF-8 before parsing.
    """
    #print("JSON loading %s"%url)
    # the context manager closes the HTTP response even if read() fails
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))


defs_json = readJSON(baseURL + "method=get_defs")

# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

gdb = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j")

# Neo4J nodes by ismi_id
n4j_nodes = {}
# JSON relations by relation id
ismi_relations = {}
# Neo4J relations by relation id
n4j_relations = {}

# attribute names that are not copied to the Neo4J node
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return the cleaned-up version of an entity type or relation name.

    Args:
        name: original name.
        is_src_rel: name is used for a source relation (currently no change).
        is_tar_rel: name is used for an inverse relation; prefixes "<".
        att_from_rel: name is used as an attribute name derived from a
            relation; removes "is_", "has_", "was_" and "_of".

    Returns:
        the modified name.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')

    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name


def getNode(ismi_id=None):
    """Return the first query result for the node with the given ismi_id.

    Returns None if ismi_id is None or the query has no results.
    """
    if ismi_id is not None:
        # NOTE(review): the id is formatted directly into the Cypher query;
        # only pass trusted numeric ids.
        res = gdb.query("match (n {ismi_id: %s}) return n"%ismi_id, returns=(Node))
        if len(res) > 0:
            return res[0]

    return None


def nodeFromEnt(ent, etype):
    """Create a Neo4J node from the given JSON entity.

    Creates the node in gdb and returns the node.

    Attributes with a text-like or missing content_type are copied as they
    are, "num" attributes are converted to int. Attributes with content_type
    "date" or "old" and attributes in ent_exclude_attrs are not copied.

    Args:
        ent: JSON entity (dict with 'atts', 'oc', 'id' and optional 'ov').
        etype: expected entity type.

    Returns:
        the new Neo4J node or None if the entity's type is not etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = att['ov']

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']
            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            try:
                attrs[key] = int(val)
            except (TypeError, ValueError):
                # don't abort the whole import because of one bad value
                print("WARN: num attribute is not an integer: %s"%repr(att))
                attrs[key] = val

        elif ct == 'date':
            # date attribute: not imported yet
            #print("don't know what to do with date: %s=%s"%(key,val))
            continue

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    node = gdb.nodes.create(**attrs)
    # add labels
    node.labels.add([project_label, fixName(etype)])
    return node


def relsFromEnt(ent, relations):
    """Extract all relations from JSON entity.

    Adds JSON to dict relations under relation's id. A relation that is
    already in the dict is not replaced; an error is printed if it differs.

    Args:
        ent: JSON entity with optional 'src_rels' and 'tar_rels' lists.
        relations: dict of JSON relations (modified in place).

    Returns:
        the relations dict.
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))

            continue

        relations[rel_id] = rel

    return relations


def _contractRelation(node, other, rel_name):
    """Store the label of node other as attribute of node.

    The attribute name is derived from rel_name. If node already has an
    attribute with that name a number is appended or the last digit is
    incremented until the name is unused.
    """
    att_name = fixName(rel_name, att_from_rel=True)
    # TODO: clean up attribute names
    while node.get(att_name, None) is not None:
        # attribute exists
        if att_name[-1].isdecimal():
            # increment last digit
            att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
        else:
            att_name += '2'

    node.set(att_name, other.get('label'))


def n4jrelationsFromRels(rels, nodes):
    """Create relations in Neo4J.

    Relations to or from entities of a type listed in
    contract_relations_into_attributes are additionally stored as attribute
    of the node at the other end.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing Neo4J nodes
    Returns:
        dict of Neo4J relations
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        src = nodes.get(src_id, None)
        if src is None:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        tar = nodes.get(tar_id, None)
        if tar is None:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # contract source relations:
            # add target node's label as attribute of the source node
            if rel['tar_oc'] in contract_relations_into_attributes:
                #print("contracting tar to attribute on id=%s"%src_id)
                _contractRelation(src, tar, rel_name)

            # contract target relations:
            # add source node's label as attribute of the target node
            if rel['src_oc'] in contract_relations_into_attributes:
                #print("contracting src to attribute on id=%s"%tar_id)
                _contractRelation(tar, src, rel_name)

        if add_inverse_relations:
            n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
                       gdb.relationships.create(tar, fixName(rel_name, is_tar_rel=True), src)]

        else:
            n4j_rel = gdb.relationships.create(src, fixName(rel_name), tar)

        n4j_relations[rel_id] = n4j_rel

    return n4j_relations


def importEnts(etype):
    """Import all entities of the given type.

    Creates a Neo4J node for every entity, saves it in n4j_nodes and
    collects the entity's relations in ismi_relations. Stops importing
    this type if an entity id is already in n4j_nodes.
    """
    # read json for all entities of given type
    ents = readJSON(entsURL%etype)['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        # count the real batch length so the last batch is reported correctly
        cnt += len(batch)
        print(" %s %ss"%(cnt, etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in n4j_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create neo4j node
            node = nodeFromEnt(ent_data, etype)
            if node is None:
                # entity was rejected (error already printed)
                continue

            # save node reference
            n4j_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return


def importAllEnts(etypes):
    """Import all entities of the given types and then all their relations."""
    for etype in etypes:
        importEnts(etype)

    n4jrelationsFromRels(ismi_relations, n4j_nodes)


#importAllEnts(ismi_types)
importAllEnts(ismi_defs)