Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismi2model.py @ 24:97f2da68fb5f
first version of model2model graph manipulation tool. doesn't work yet.
author | casties |
---|---|
date | Wed, 23 Sep 2015 19:47:02 +0200 |
parents | 45a823b5bf33 |
children | 5bdcb5805d29 |
line wrap: on
line source
import urllib.request
import json
import sys

import networkx

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# contract relations to these objects into attributes with the relations' name
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

entsURL = baseURL + "method=get_ents&oc=%s"
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Read the document at url and parse it as JSON.

    Returns the parsed Python object.
    """
    #print("JSON loading %s"%url)
    # close the HTTP response even if read/parse fails
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))


defs_json = readJSON(baseURL + "method=get_defs")
# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]
#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

nx_graph = networkx.MultiDiGraph()
# ismi_id -> networkx node (attribute dict)
nx_nodes = {}
# relation id -> relation JSON
ismi_relations = {}
# relation id -> networkx edge
nx_relations = {}

# entity attributes that are never copied onto graph nodes
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an OpenMind name for use in the graph.

    is_src_rel/is_tar_rel mark relation direction (target relations get a
    '<' prefix); att_from_rel strips relation verb prefixes/suffixes so the
    name can serve as an attribute key.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')
    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name


def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Creates the node in nx_graph and returns the node (its attribute dict).
    Returns None if the entity's object class does not match etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)

        elif ct == 'date':
            # date attribute
            key = att['name']
            val = att['ov']
            print("don't know what to do with date: %s=%s"%(key,val))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # was `return null` — NameError in Python; None is the intended sentinel
        return None

    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE: .node is the networkx 1.x node-attribute accessor (renamed
    # .nodes in networkx >= 2.4)
    node = nx_graph.node[ismi_id]

    return node


def relsFromEnt(ent, relations):
    """Extract all relations from JSON entity.

    Adds JSON to dict relations under relation's id.
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
                continue

        relations[rel_id] = rel

    return relations


def relationsFromRels(rels, nodes):
    """Create edges in nx_graph for all relations.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing networkx nodes

    Returns:
        dict of networkx relations
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if src_id not in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        if tar_id not in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # was a NameError: src/tar were never bound; they must be the
            # source/target node attribute dicts
            src = nx_graph.node[src_id]
            tar = nx_graph.node[tar_id]

            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # TODO: clean up attribute names
                while src.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add target node's label as attribute
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']

            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # TODO: clean up attribute names
                while tar.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add target node's label as attribute
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']

        # create relation with type
        # NOTE: MultiDiGraph.add_edge returns None in networkx 1.x, so the
        # stored value only records that the relation was processed
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name), ismi_id=rel_id)
        nx_relations[rel_id] = nx_rel

    return nx_relations


def importEnts(etype):
    """Import all entities of the given type.

    Fetches entity ids, then full entity data in batches of 100; creates a
    graph node per entity and collects its relations into ismi_relations.
    """
    # read json for all entities of given type
    # (renamed local from `json`, which shadowed the json module)
    result = readJSON(entsURL%etype)
    ents = result['ents']
    print("importing %s %ss"%(len(ents),etype))

    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create neo4j node
            node = nodeFromEnt(ent_data, etype)

            # save node reference
            nx_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import all entities of every type in etypes, then create all edges."""
    for etype in etypes:
        importEnts(etype)

    relationsFromRels(ismi_relations, nx_nodes)


## main

print("Copy graph from OpenMind to networkx pickle")

# parse command line parameters
if len(sys.argv) > 1:
    output_fn = sys.argv[1]

# import everything
print("Reading graph from OpenMind at %s"%baseURL)
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)