Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismi2model.py @ 49:f8cd7db4178c
output number of checked saves.
author | casties |
---|---|
date | Thu, 23 Feb 2017 19:57:05 +0100 |
parents | f3945ef1e6a4 |
children |
line wrap: on
line source
"""Copy the ISMI graph from an OpenMind server into a networkx pickle file.

Reads all entities and relations via OpenMind's jsonInterface and writes
a networkx MultiDiGraph as a gpickle file (filename from argv[1] or the
default below).
"""

import urllib.request
import json
import sys

import networkx

## configure behaviour

# output filename (overridable via first command line argument)
output_fn = "ismi_graph.gpickle"

# OpenMind base URL
#baseURL="http://ismi.mpiwg-berlin.mpg.de//om4-ismi/jsonInterface?"
baseURL = "http://localhost:18080/ismi-richfaces/jsonInterface?"

# node types to exclude from the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude
exclude_attributes_of_type = [
    'lw',
    'node_type',
    'nov',
    'notes_old'
]

# name of type attribute on nodes and relations
node_type_attribute = '_type'
rel_type_attribute = '_type'

# OpenMind jsonInterface query URL templates
entsURL = baseURL + "method=get_ents&oc=%s"
entsByIdURL = baseURL + "method=get_ents&include_content=True&ids=%s"
entURL = baseURL + "method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch the given URL and return its body parsed as JSON."""
    #print("JSON loading %s"%url)
    # use a context manager so the connection is closed even on errors
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))


defs_json = readJSON(baseURL + "method=get_defs")
# current list of all definitions
ismi_defs = [atts['ov'] for atts in defs_json['defs']]
#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

nx_graph = networkx.MultiDiGraph()
nx_nodes = {}
ismi_relations = {}
nx_relations = {}


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize a relation/attribute name.

    Target-relation names get a '<' prefix; names used as attributes get
    common relation affixes (is_/has_/was_/_of) stripped.
    """
    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        name = name.replace('is_', '')
        name = name.replace('has_', '')
        name = name.replace('was_', '')
        name = name.replace('_of', '')

    return name


def parseYear(val):
    """Parse an OpenMind JSON date string and return its year.

    Returns None when val is not a parseable date object (best effort).
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            print("don't know what to do with date %s"%(val))
    except (ValueError, TypeError, AttributeError, KeyError):
        # unparseable or unexpectedly-shaped date -> None (deliberate best effort)
        pass

    return year


def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Adds the node to nx_graph and returns its attribute dict,
    or None if the entity's type does not match etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        name = att.get('name', None)
        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            val = att['ov']

            # startswith also handles empty values safely (val[0] would raise)
            if val.startswith('{'):
                # try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            # keep attribute
            attrs[name] = val

            if 'nov' in att:
                # add normalized value
                attrs['_n_' + name] = att['nov']

        elif ct == 'date':
            # date attribute
            val = att['ov']
            # try to parse date object to get gregorian year
            year = parseYear(val)
            if year is not None:
                attrs[name] = year

        elif ct == 'num':
            # number attribute
            val = att['ov']
            # keep attribute, assume num is int
            attrs[name] = int(val)

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue

    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov

        if 'nov' in ent:
            # add normalized value
            attrs['_n_label'] = ent.get('nov')

    nx_graph.add_node(ismi_id, **attrs)
    # NOTE: .node is the networkx-1.x node attribute accessor
    node = nx_graph.node[ismi_id]
    return node


def relsFromEnt(ent, relations):
    """Extract all relations from a JSON entity.

    Adds each relation's JSON to dict relations under the relation's id
    (skipping relations to excluded object types) and returns relations.
    """
    # go through src_rels and tar_rels
    rels = ent.get('src_rels', []) + ent.get('tar_rels', [])
    for rel in rels:
        src_type = rel['src_oc']
        tar_type = rel['tar_oc']
        if src_type in exclude_objects_of_type or tar_type in exclude_objects_of_type:
            # skip relation to excluded objects
            continue

        rel_id = rel['id']
        if rel_id in relations:
            # same relation seen from both endpoints -- should be identical
            old_rel = relations[rel_id]
            if rel != old_rel:
                print("ERROR: relation is different: %s != %s"%(repr(rel), repr(old_rel)))
            continue

        relations[rel_id] = rel

    return relations


def relationsFromRels(rels, nodes):
    """Create graph edges for the given relations.

    Args:
        rels: dict of JSON relations
        nodes: dict of existing graph nodes (by ismi id)

    Returns:
        dict of created edges by relation id
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            print("  %s relations"%cnt)

        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        # both endpoints must have been imported as nodes
        if not src_id in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id, src_id))
            continue

        if not tar_id in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id, tar_id))
            continue

        # TODO: what about attributes of relation?
        if len(rel['atts']) > 0:
            print("INFO: relation with attributes! name=%s id=%s atts=%s"%(rel_name, rel_id, repr(rel['atts'])))

        # create relation with type
        rel_atts = {rel_type_attribute: fixName(rel_name), 'ismi_id': rel_id}
        # NOTE: attr_dict is the networkx-1.x add_edge API
        nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=rel_atts)
        nx_relations[rel_id] = nx_rel

    return nx_relations


def importEnts(etype):
    """Import all entities of the given type as graph nodes.

    Fetches entity ids for etype, then full entity data in batches,
    creating nodes and collecting their relations in ismi_relations.
    """
    # read json for all entities of given type
    # (don't shadow the json module with a local name)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents), etype))

    # fetch full content in batches of 100 ids
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]

    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            print("  %s %ss"%(cnt, etype))

        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]

        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']

        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return

            # create networkx node
            node = nodeFromEnt(ent_data, etype)

            # save node reference
            nx_nodes[ismi_id] = node

            # extract relations
            relsFromEnt(ent_data, ismi_relations)

        #if cnt >= 100:
        #    return


def importAllEnts(etypes):
    """Import all entities of the given types and then all their relations."""
    for etype in etypes:
        if etype in exclude_objects_of_type:
            # skip this type
            continue

        importEnts(etype)

    relationsFromRels(ismi_relations, nx_nodes)


## main

print("Copy graph from OpenMind to networkx pickle")

# parse command line parameters
if len(sys.argv) > 1:
    output_fn = sys.argv[1]

# import everything
print("Reading graph from OpenMind at %s"%baseURL)
if exclude_objects_of_type:
    print("  Skipping objects of type %s"%exclude_objects_of_type)
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

print("Graph info: %s"%networkx.info(nx_graph))
#print("  nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)