changeset 19:ca1e02a2a9c4
unfilteredISMI: OpenMind-to-JSON exporter like filterISMI.
ismi2model: OpenMind importer like ismi2neo4j that saves a networkx pickle file.
author    casties
date      Wed, 09 Sep 2015 17:32:42 +0200
parents   0827156df210
children  bdf91a4a40ff
files     importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/ismi2neo4j.py importFromOpenMind/importer/unfilteredISMI.py
diffstat  3 files changed, 524 insertions(+), 28 deletions(-)
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismi2model.py   Wed Sep 09 17:32:42 2015 +0200
@@ -0,0 +1,303 @@
+import urllib.request
+import json
+import networkx
+
+## configure behaviour
+
+# output filename
+output_fn = "ismi_graph.gpickle"
+
+# contract relations to these objects into attributes with the relations' name
+#contract_relations_into_attributes = ['PLACE', 'ALIAS']
+contract_relations_into_attributes = []
+
+# OpenMind base URL
+baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
+
+
+entsURL=baseURL+"method=get_ents&oc=%s"
+
+entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"
+
+entURL=baseURL+"method=get_ent&id=%s&include_content=True"
+
+
+def readJSON(url):
+    #print("JSON loading %s"%url)
+    wsh=urllib.request.urlopen(url)
+    txt = wsh.read()
+    return json.loads(txt.decode("utf-8"))
+
+defs_json = readJSON(baseURL+"method=get_defs")
+
+# current list of all definitions
+ismi_defs = [atts['ov'] for atts in defs_json['defs']]
+
+#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
+
+
+nx_graph = networkx.MultiDiGraph()
+
+nx_nodes = {}
+ismi_relations = {}
+nx_relations = {}
+
+ent_exclude_attrs = [
+    'lw',
+    'node_type',
+    'nov'
+]
+
+def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
+    # these are too embarrassing...
+    if 'FLORUIT' in name:
+        name = name.replace('FLORUIT', 'FLOURISH')
+
+    elif 'floruit' in name:
+        name = name.replace('floruit', 'flourish')
+
+    if is_src_rel:
+        #name = name + '>'
+        pass
+
+    if is_tar_rel:
+        name = '<' + name
+
+    if att_from_rel:
+        # clean up relations as attribute names
+        name = name.replace('is_', '')
+        name = name.replace('has_', '')
+        name = name.replace('was_', '')
+        name = name.replace('_of', '')
+
+    return name
+
+
+def nodeFromEnt(ent, etype):
+    """Create a networkx node from the given JSON entity.
+
+    Creates the node in nx_graph and returns the node.
+    """
+    attrs = {}
+    # go through all attributes
+    for att in ent['atts']:
+        ct = att.get('content_type', None)
+        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
+            # normal text attribute (assume no content_type is text too...)
+            key = att['name']
+            val = att['ov']
+
+            if key in ent_exclude_attrs:
+                # exclude attribute
+                continue
+
+            # keep attribute
+            attrs[key] = val
+
+        elif ct == 'num':
+            # number attribute
+            key = att['name']
+            val = att['ov']
+
+            if key in ent_exclude_attrs:
+                # exclude attribute
+                continue
+
+            # keep attribute, assume num is int
+            attrs[key] = int(val)
+
+        elif ct == 'date':
+            # date attribute
+            key = att['name']
+            val = att['ov']
+            #print("don't know what to do with date: %s=%s"%(key,val))
+
+        elif ct == 'old':
+            # ignore attribute
+            continue
+
+        else:
+            print("WARN: attribute with unknown content_type: %s"%repr(att))
+            # ignore other content types
+            continue
+
+    # process base attributes
+    oc = ent['oc']
+    if oc != etype:
+        print("ERROR: entity type doesn't match!")
+        return None
+
+    attrs['type'] = fixName(oc)
+
+    ismi_id = ent['id']
+    # rename id to ismi_id
+    attrs['ismi_id'] = ismi_id
+
+    ov = ent.get('ov', None)
+    if ov is not None:
+        # save ov as label
+        attrs['label'] = ov
+
+    # create node with attributes
+    nx_graph.add_node(ismi_id, **attrs)
+    node = nx_graph.node[ismi_id]
+
+    return node
+
+
+def relsFromEnt(ent, relations):
+    """Extract all relations from the JSON entity.
+
+    Adds the JSON to the dict relations under the relation's id.
+    """
+    # go through src_rels and tar_rels
+    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
+    for rel in rels:
+        rel_id = rel['id']
+        if rel_id in relations:
+            old_rel = relations[rel_id]
+            if rel != old_rel:
+                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
+            continue
+
+        relations[rel_id] = rel
+
+    return relations
+
+
+def relationsFromRels(rels, nodes):
+    """Create edges in the networkx graph.
+
+    Args:
+        rels: dict of JSON relations
+        nodes: dict of existing networkx nodes
+    Returns:
+        dict of networkx relations
+    """
+    # go through all rels
+    print("importing %s relations"%len(rels))
+    cnt = 0
+    for rel in rels.values():
+        cnt += 1
+        if cnt % 100 == 0:
+            print(" %s relations"%cnt)
+
+        rel_id = rel['id']
+        rel_name = rel['name']
+        src_id = rel['src_id']
+        tar_id = rel['tar_id']
+        if not src_id in nodes:
+            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
+            continue
+
+        if not tar_id in nodes:
+            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
+            continue
+
+        if contract_relations_into_attributes:
+            # contract source relations
+            tar_type = rel['tar_oc']
+            if tar_type in contract_relations_into_attributes:
+                att_name = fixName(rel_name, att_from_rel=True)
+                # TODO: clean up attribute names
+                while nx_graph.node[src_id].get(att_name, None) is not None:
+                    # attribute exists
+                    if att_name[-1].isnumeric():
+                        # increment last digit
+                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
+                    else:
+                        att_name += '2'
+
+                # add target node's label as attribute
+                #print("contracting tar to attribute %s on id=%s"%(att_name, src_id))
+                nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label']
+
+            # contract target relations
+            src_type = rel['src_oc']
+            if src_type in contract_relations_into_attributes:
+                att_name = fixName(rel_name, att_from_rel=True)
+                # TODO: clean up attribute names
+                while nx_graph.node[tar_id].get(att_name, None) is not None:
+                    # attribute exists
+                    if att_name[-1].isnumeric():
+                        # increment last digit
+                        att_name = att_name[:-1] + str(int(att_name[-1]) + 1)
+                    else:
+                        att_name += '2'
+
+                # add source node's label as attribute
+                #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
+                nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label']
+
+        # create relation with type
+        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name))
+
+        nx_relations[rel_id] = nx_rel
+
+    return nx_relations
+
+
+def importEnts(etype):
+    """Import all entities of the given type.
+    """
+    # read json for all entities of given type
+    json = readJSON(entsURL%etype)
+    ents = json['ents']
+    print("importing %s %ss"%(len(ents),etype))
+    size = 100
+    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
+    cnt = 0
+    for batch in batches:
+        cnt += size
+        if cnt % 100 == 0:
+            print(" %s %ss"%(cnt, etype))
+
+        # extract list of ismi ids
+        ismi_ids = [str(ent['id']) for ent in batch]
+
+        # fetch full data for list of entities
+        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
+        ents_data = ent_json['ents']
+
+        # iterate through results batch
+        for ent_data in ents_data:
+            ismi_id = ent_data['id']
+            if ismi_id in nx_nodes:
+                print("ERROR: entity with id=%s exists!"%ismi_id)
+                return
+
+            # create networkx node
+            node = nodeFromEnt(ent_data, etype)
+
+            # save node reference
+            nx_nodes[ismi_id] = node
+
+            # extract relations
+            relsFromEnt(ent_data, ismi_relations)
+
+    #if cnt >= 100:
+    #    return
+
+
+# In[119]:
+
+def importAllEnts(etypes):
+
+    for etype in etypes:
+        importEnts(etype)
+
+    relationsFromRels(ismi_relations, nx_nodes)
+
+
+# In[120]:
+
+importAllEnts(ismi_defs)
+#importAllEnts(['TEXT'])
+
+print("Graph info: %s"%networkx.info(nx_graph))
+print("Number of nodes: %s"%networkx.number_of_nodes(nx_graph))
+print("Number of edges: %s"%networkx.number_of_edges(nx_graph))
+#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))
+# export pickle
+networkx.write_gpickle(nx_graph, output_fn)
+print("Wrote file %s"%output_fn)
--- a/importFromOpenMind/importer/ismi2neo4j.py   Mon Sep 07 16:57:10 2015 +0200
+++ b/importFromOpenMind/importer/ismi2neo4j.py   Wed Sep 09 17:32:42 2015 +0200
@@ -10,11 +10,8 @@
 # add relations to these objects as attributes with the relations name
 contract_relations_into_attributes = ['PLACE', 'ALIAS']
 
-# try to find and re-use existing nodes in neo4j (slow!)
-keep_nodes = False
-
 # label added to all nodes
-project_label = '_ismi2'
+project_label = '_ismi3'
 
 # OpenMind base URL
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
@@ -25,10 +22,13 @@
 
 entsURL=baseURL+"method=get_ents&oc=%s"
 
+entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"
+
 entURL=baseURL+"method=get_ent&id=%s&include_content=True"
 
 
 def readJSON(url):
+    #print("JSON loading %s"%url)
     wsh=urllib.request.urlopen(url)
     txt = wsh.read()
     return json.loads(txt.decode("utf-8"))
@@ -243,7 +243,7 @@
 
                 # add target node's label as attribute
                 #print("contracting src to attribute %s on id=%s"%(att_name, tar_id))
-                src.set(att_name, src.get('label'))
+                tar.set(att_name, src.get('label'))
 
         if add_inverse_relations:
             n4j_rel = [gdb.relationships.create(src, fixName(rel_name, is_src_rel=True), tar),
@@ -264,36 +264,36 @@
     json = readJSON(entsURL%etype)
     ents = json['ents']
     print("importing %s %ss"%(len(ents),etype))
+    size = 100
+    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
     cnt = 0
-    for ent in ents:
-        cnt += 1
+    for batch in batches:
+        cnt += size
         if cnt % 100 == 0:
             print(" %s %ss"%(cnt, etype))
 
-        # extract ismi id
-        ismi_id = ent['id']
-
-        node = None
+        # extract list of ismi ids
+        ismi_ids = [str(ent['id']) for ent in batch]
 
-        # fetch full data for entity
-        ent_json = readJSON(entURL%ismi_id)
-        ent_data = ent_json['ent']
-        # create neo4j node
-        if keep_nodes:
-            node = getNode(ismi_id)
+        # fetch full data for list of entities
+        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
+        ents_data = ent_json['ents']
 
-        if ismi_id in n4j_nodes:
-            print("ERROR: entity with id=%s exists!"%ismi_id)
-            return
-
-        if node is None:
+        # iterate through results batch
+        for ent_data in ents_data:
+            ismi_id = ent_data['id']
+            if ismi_id in n4j_nodes:
+                print("ERROR: entity with id=%s exists!"%ismi_id)
+                return
+
+            # create neo4j node
             node = nodeFromEnt(ent_data, etype)
-
-        # save node reference
-        n4j_nodes[ismi_id] = node
-
-        # extract relations
-        relsFromEnt(ent_data, ismi_relations)
+
+            # save node reference
+            n4j_nodes[ismi_id] = node
+
+            # extract relations
+            relsFromEnt(ent_data, ismi_relations)
 
     #if cnt >= 100:
     #    return
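The batching introduced here (in both importers) replaces one get_ent request per entity with one get_ents request per batch of ids. A standalone sketch of the same fetch pattern, using the entsByIdURL from the diff above; the fetch_batch helper name is illustrative only:

import json
import urllib.request

baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"

def fetch_batch(ismi_ids):
    # one get_ents request for a whole batch of entity ids
    url = entsByIdURL % ','.join(str(i) for i in ismi_ids)
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode('utf-8'))['ents']

# e.g. one round trip for two entities instead of two:
# ents = fetch_batch([12345, 12346])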
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/unfilteredISMI.py   Wed Sep 09 17:32:42 2015 +0200
@@ -0,0 +1,193 @@
+'''
+Created on 22.04.2014
+
+@author: dwinter
+'''
+
+import os
+import json
+import urllib.request
+
+#ismiBaseUrl="https://ismi.mpiwg-berlin.mpg.de/om4-ismi"
+ismiBaseUrl="http://localhost:18080/ismi-richfaces"
+
+class Importer:
+
+    allents = {}
+    allrels = {}
+
+    def loadJSON(self,url):
+        """Load JSON from URL.
+
+        Saves the JSON in the data member.
+        """
+        #print(" loading "+url)
+        response = urllib.request.urlopen(url)
+        str_response = response.read().decode('utf-8')
+
+        self.data = json.loads(str_response)
+
+
+    def loadJSONFromFile(self,fn):
+        """Load JSON from file.
+
+        Saves the JSON in the data member.
+        """
+        print(" loading "+fn+".json")
+        self.data = json.load(open(fn+".json",'r', encoding="utf-8"))
+
+
+    def getEntIds(self):
+        """Extract entities from the data member.
+
+        Checks all relations.
+        Returns a set of the entities' ids and a list of their relations.
+        """
+        ents = self.data.get("ents")
+
+        ret=set()
+        rels=[]
+
+        for ent in ents:
+            ret.add(str(ent.get('id')))
+            if 'src_rels' in ent:
+                print("src_rels: %s"%ent.get('src_rels'))
+                rels.extend(ent.get('src_rels'))
+
+            if 'tar_rels' in ent:
+                print("tar_rels: %s"%ent.get('tar_rels'))
+                rels.extend(ent.get('tar_rels'))
+
+        return ret,rels
+
+
+    def loadallEnts(self,kind="tar",filterOC=[]):
+        """Get related entities from OpenMind.
+
+        Gets the entities' ids and relations via getEntIds().
+        Downloads the entities from OpenMind in batches of 500 ids.
+        Returns the entities and a list of relations.
+        """
+        ids,rels = self.getEntIds()
+
+        baseUrl=ismiBaseUrl+"/jsonInterface?include_content=true&include_romanization=true&method=get_ents"
+
+        lenId = len(ids)
+        portions = int(lenId / 500)
+        print("loading %s entities"%lenId)
+
+        ents = []
+        for p in range(portions+1):
+
+            start = p * 500
+            end = min(lenId,(p+1)*500)
+
+            idsFrak = list(ids)[start:end]
+            idsString = ",".join(idsFrak)
+
+            qs = baseUrl+"&ids="+idsString
+            #print(" loading ents from "+qs)
+            response = urllib.request.urlopen(qs)
+            entsJ = json.loads(response.read().decode('utf-8'))
+            ents += entsJ.get("ents")
+
+            # iterate all entities
+            for ent in entsJ.get("ents"):
+                ismi_id = ent.get('id')
+                if ismi_id in self.allents:
+                    print("entity id=%s exists!"%ismi_id)
+                else:
+                    self.allents[ismi_id] = ent
+
+                # extract relations
+                if 'src_rels' in ent:
+                    #print("src_rels: %s"%ent.get('src_rels'))
+                    rels.extend(ent.get('src_rels'))
+
+                    for rel in ent.get('src_rels'):
+                        rel_id = rel.get('id')
+                        if rel_id in self.allrels:
+                            print("relation id=%s exists!"%rel_id)
+                        else:
+                            self.allrels[rel_id] = rel
+
+                if 'tar_rels' in ent:
+                    #print("tar_rels: %s"%ent.get('tar_rels'))
+                    rels.extend(ent.get('tar_rels'))
+
+                    for rel in ent.get('tar_rels'):
+                        rel_id = rel.get('id')
+                        if rel_id in self.allrels:
+                            print("relation id=%s exists!"%rel_id)
+                        else:
+                            self.allrels[rel_id] = rel
+
+        #str_response = json.dumps({"ents":ents})
+        return ents,rels
+
+
+    def saveallEnts(self,filename,kind="tar",filterOC=[]):
+        """Load all related entities and save them as JSON.
+
+        Loads all related entities using kind and filterOC via loadallEnts().
+        Saves the entities in the file filename.json.
+        Saves the relations in the file filename_rels.json.
+        """
+        ents,rels = self.loadallEnts(kind=kind,filterOC=filterOC)
+
+        print(" writing ", filename+".json")
+        of = open(filename+".json","wb")
+        of.write(json.dumps({"ents":ents}).encode('utf-8'))
+        of.close()
+
+        print(" writing ", filename+"_rels.json")
+        of = open(filename+"_rels.json","w")
+        json.dump({'rels':rels},of)
+        of.close()
+
+
+if __name__ == '__main__':
+    imp = Importer()
+
+    # get current list of all definitions
+    imp.loadJSON(ismiBaseUrl+"/jsonInterface?method=get_defs")
+    ismi_defs = [atts['ov'] for atts in imp.data['defs']]
+
+    # create directory for export files
+    exportDir = '/tmp/ismi_data'
+    if not os.access(exportDir, os.R_OK):
+        # dir doesn't exist -> create
+        os.makedirs(exportDir)
+
+    for ismi_def in ismi_defs:
+        print("loading entities of type %s"%ismi_def)
+        #
+        # load all entities of type ismi_def
+        # contains entities with attributes and first-order relations
+        #
+        url = ismiBaseUrl+"/jsonInterface?method=get_ents&oc=%s"%ismi_def
+        imp.loadJSON(url)
+
+        #
+        # load and save all target relations of entities as entities.json
+        #
+        imp.saveallEnts(exportDir+"/%s"%ismi_def)
+
+    #
+    # save all entities in one file
+    #
+    print(" writing ", "ALL.json")
+    of = open(exportDir+"/ALL.json","wb")
+    allents = [ent for ent in imp.allents.values()]
+    of.write(json.dumps({"ents":allents}).encode('utf-8'))
+    of.close()
+
+    print(" writing ", "ALL_rels.json")
+    of = open(exportDir+"/ALL_rels.json","wb")
+    allrels = [rel for rel in imp.allrels.values()]
+    of.write(json.dumps({"rels":allrels}).encode('utf-8'))
+    of.close()