Mercurial > hg > drupalISMI
diff importFromOpenMind/importer/ismi2model.py @ 19:ca1e02a2a9c4
unfilteredIsmi: openmind to json exporter like filterISMI.
ismi2model: openmind importer like ismi2neo4j that saves networkx pickle file.
author | casties |
---|---|
date | Wed, 09 Sep 2015 17:32:42 +0200 |
parents | |
children | 45a823b5bf33 |
line wrap: on
line diff
import urllib.request
import json
import networkx

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# contract relations to these objects into attributes with the relations' name
#contract_relations_into_attributes = ['PLACE', 'ALIAS']
contract_relations_into_attributes = []

# OpenMind base URL
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# URL templates for the OpenMind JSON interface
entsURL = baseURL+"method=get_ents&oc=%s"

entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"

entURL = baseURL+"method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch url and return the parsed JSON content.

    Args:
        url: URL returning UTF-8 encoded JSON
    Returns:
        parsed JSON object
    """
    #print("JSON loading %s"%url)
    # use a context manager so the connection is closed even if read fails
    # (the original left the handle open)
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()

    return json.loads(txt.decode("utf-8"))

defs_json = readJSON(baseURL+"method=get_defs")

# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


nx_graph = networkx.MultiDiGraph()

nx_nodes = {}         # ismi_id -> networkx node attribute dict
ismi_relations = {}   # relation id -> relation JSON
nx_relations = {}     # relation id -> networkx edge

# entity attributes that are never copied into the graph
ent_exclude_attrs = [
    'lw',
    'node_type',
    'nov'
]

def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an entity/relation name for use in the graph.

    Args:
        name: original name
        is_src_rel: name is a source relation (currently unchanged)
        is_tar_rel: name is a target relation (prefixed with '<')
        att_from_rel: name becomes an attribute key (strip is_/has_/was_/_of)
    Returns:
        cleaned-up name
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')

    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name


def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Creates the node in nx_graph and returns the node (its attribute dict).

    Args:
        ent: JSON entity (with 'atts', 'oc', 'id', optional 'ov')
        etype: expected object class; must match ent['oc']
    Returns:
        the new node's attribute dict, or None if the type doesn't match
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val

        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']

            if key in ent_exclude_attrs:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)

        elif ct == 'date':
            # date attribute -- not imported yet
            key = att['name']
            val = att['ov']
            #print("don't know what to do with date: %s=%s"%(key,val))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        # BUGFIX: was 'return null' (NameError); Python's null value is None
        return None

    attrs['type'] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

    # create node with attributes
    nx_graph.add_node(ismi_id, **attrs)
    node = nx_graph.node[ismi_id]

    return node


def relsFromEnt(ent, relations):
    """Extract all relations from JSON entity.

    Adds JSON to dict relations under relation's id.

    Args:
        ent: JSON entity (with optional 'src_rels' and 'tar_rels' lists)
        relations: dict to collect relations in
    Returns:
        the relations dict
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        rel_id = rel['id']
        if rel_id in relations:
            # same relation seen from the other end -- must be identical
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
                continue

        relations[rel_id] = rel

    return relations


def relationsFromRels(rels, nodes):
    """Create edges in the networkx graph.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing networkx nodes (ismi_id -> attribute dict)
    Returns:
        dict of networkx edges (nx_relations)
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print(" %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        if not src_id in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue

        if not tar_id in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue

        if contract_relations_into_attributes:
            # contract source relations
            tar_type = rel['tar_oc']
            if tar_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUGFIX: original referenced undefined local 'src' here
                src_node = nodes[src_id]
                # TODO: clean up attribute names
                while src_node.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add target node's label as attribute
                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']

            # contract target relations
            src_type = rel['src_oc']
            if src_type in contract_relations_into_attributes:
                att_name = fixName(rel_name, att_from_rel=True)
                # BUGFIX: original referenced undefined local 'tar' here
                tar_node = nodes[tar_id]
                # TODO: clean up attribute names
                while tar_node.get(att_name, None) is not None:
                    # attribute exists
                    if att_name[-1].isnumeric():
                        # increment last digit
                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
                    else:
                        att_name += '2'

                # add source node's label as attribute
                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']

        # create relation with type
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))

        nx_relations[rel_id] = nx_rel

    return nx_relations


def importEnts(etype):
    """Import all entities of the given type.

    Fetches entity ids for etype, then loads full content in batches of 100,
    creates graph nodes and collects their relations in ismi_relations.
    """
    # read json for all entities of given type
    # (local name must not shadow the global json module used elsewhere)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print(" %s %ss"%(cnt, etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create graph node
            node = nodeFromEnt(ent_data, etype)

            # save node reference
            nx_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import all entities of all given types, then create their relations."""
    for etype in etypes:
        importEnts(etype)

    relationsFromRels(ismi_relations, nx_nodes)


# In[120]:

importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

print("Graph info: %s"%networkx.info(nx_graph))
print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote file %s"%output_fn)