Mercurial > hg > drupalISMI
changeset 25:5bdcb5805d29
updated openmind-networkx-neo4j conversion with dates, locations and links.
author | casties |
---|---|
date | Thu, 24 Sep 2015 18:17:41 +0200 |
parents | 97f2da68fb5f |
children | 248bf8d1e2e7 |
files | importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/model2model.py importFromOpenMind/importer/model2neo4j.py |
diffstat | 3 files changed, 212 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2model.py Wed Sep 23 19:47:02 2015 +0200 +++ b/importFromOpenMind/importer/ismi2model.py Thu Sep 24 18:17:41 2015 +0200 @@ -8,13 +8,12 @@ # output filename output_fn = "ismi_graph.gpickle" -# contract relations to these objects into attributes with the relations' name -#contract_relations_into_attributes = ['PLACE', 'ALIAS'] -contract_relations_into_attributes = [] - # OpenMind base URL baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" +# node types to exclude from the graph +exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE'] + entsURL=baseURL+"method=get_ents&oc=%s" @@ -111,7 +110,22 @@ # date attribute key = att['name'] val = att['ov'] - print("don't know what to do with date: %s=%s"%(key,val)) + # try to parse date object to get gregorian year + try: + year = None + date_json = json.loads(val) + if 'from' in date_json: + year = date_json['from'].get('year', None) + elif 'date' in date_json: + year = date_json['date'].get('year', None) + else: + print("don't know what to do with date on %s: %s=%s"%(ent['id'],key,val)) + + if year is not None: + attrs[key] = year + + except: + print("ERROR: invalid JSON in date: %s"%repr(val)) elif ct == 'old': # ignore attribute @@ -128,6 +142,11 @@ print("ERROR: entity type doesn't match!") return null + # rename if type attr exists + if 'type' in attrs: + attrs['type2'] = attrs['type'] + + # set type attrs['type'] = fixName(oc) ismi_id = ent['id'] @@ -154,6 +173,12 @@ # go through src_rels and tar_rels rels = ent.get('src_rels', []) + ent.get('tar_rels', []) for rel in rels: + src_type = rel['src_oc'] + tar_type = rel['tar_oc'] + if src_type in exclude_objects_of_type or tar_type in exclude_objects_of_type: + # skip relation to excluded objects + continue + rel_id = rel['id'] if rel_id in relations: old_rel = relations[rel_id] @@ -195,41 +220,6 @@ print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id)) continue - if contract_relations_into_attributes: - # contract source relations - tar_type = rel['tar_oc'] - if tar_type in contract_relations_into_attributes: - att_name = fixName(rel_name, att_from_rel=True) - # TODO: clean up attribute names - while src.get(att_name, None) is not None: - # attribute exists - if att_name[-1].isnumeric(): - # increment last digit - att_name = att_name[:-1] + str(int(att_name[-1]) + 1) - else: - att_name += '2' - - # add target node's label as attribute - #print("contracting tar to attribute %s on id=%s"%(att_name, src_id)) - nx_graph.node[src_id][att_name] = nx_graph.node[tar_id]['label'] - - # contract target relations - src_type = rel['src_oc'] - if src_type in contract_relations_into_attributes: - att_name = fixName(rel_name, att_from_rel=True) - # TODO: clean up attribute names - while tar.get(att_name, None) is not None: - # attribute exists - if att_name[-1].isnumeric(): - # increment last digit - att_name = att_name[:-1] + str(int(att_name[-1]) + 1) - else: - att_name += '2' - - # add target node's label as attribute - #print("contracting src to attribute %s on id=%s"%(att_name, tar_id)) - nx_graph.node[tar_id][att_name] = nx_graph.node[src_id]['label'] - # create relation with type nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name), ismi_id=rel_id) @@ -267,7 +257,7 @@ print("ERROR: entity with id=%s exists!"%ismi_id) return - # create neo4j node + # create networkx node node = nodeFromEnt(ent_data, etype) # save node reference @@ -285,6 +275,10 @@ def importAllEnts(etypes): for etype in etypes: + if etype in exclude_objects_of_type: + # skip this type + continue + importEnts(etype) relationsFromRels(ismi_relations, nx_nodes) @@ -300,6 +294,9 @@ # import everything print("Reading graph from OpenMind at %s"%baseURL) +if len(exclude_objects_of_type) > 0: + print(" Skipping objects of type %s"%exclude_objects_of_type); + importAllEnts(ismi_defs) #importAllEnts(['TEXT'])
--- a/importFromOpenMind/importer/model2model.py Wed Sep 23 19:47:02 2015 +0200 +++ b/importFromOpenMind/importer/model2model.py Thu Sep 24 18:17:41 2015 +0200 @@ -1,5 +1,6 @@ import networkx as nx import sys +import csv ## configure behaviour @@ -8,13 +9,24 @@ output_fn = 'ismi_graph_mod.gpickle' # operations -ops = ['contract', 'inv_rels'] +ops = ['locate', 'contract', 'inv_rels', 'add_links'] + +# types of object to locate +locate_objects_of_type = ['PLACE'] -# add relations to these objects as attributes with the relations name -contract_relations_into_attributes = {'PLACE': ['label'], +# file with place location information +places_fn = 'ismi_places_loc.csv' + +# node types to remove from the graph +#remove_objects_of_type = ['DIGITALIZATION', 'REFERENCE'] + +# add relations to these objects as attributes with the relation's name +contract_relations_into_attributes = {'PLACE': ['label', 'latitude', 'longitude'], 'ALIAS': ['label']} +# add URLs to nodes using an attribute in a pattern +add_link_attributes = {'ismi_id': 'https://ismi-dev.mpiwg-berlin.mpg.de/drupal-ismi/entity/%s'} def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False): @@ -42,11 +54,158 @@ return name +def locatePlaces(nx_graph): + """add location information to objects in the graph""" + + print("Adding location information from %s to %s."%(places_fn, locate_objects_of_type)) + cnt = 0 + + # read place location file + locations = {} + with open(places_fn, encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + lat = row['Latitude'] + lon = row['Longitude'] + name = row['Address'] + if lat and lon: + locations[name] = {'latitude': lat, 'longitude': lon} + + # iterate all nodes + for n in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[n] + if attrs['type'] in locate_objects_of_type: + # locatable object + name = attrs['label'] + if name in locations: + # place name match + location = locations[name] + attrs['latitude'] = location['latitude'] + attrs['longitude'] = location['longitude'] + + else: + print("WARNING: no location for name '%s'"%name) + + cnt += 1 + if cnt % 100 == 0: + print(" %s nodes"%cnt) + + + +def genAttName(attrs, name): + """Generate new attribute name. + """ + while attrs.get(name, None) is not None: + # attribute exists + if name[-1].isnumeric(): # increment last digit + name = name[:-1] + str(int(name[-1]) + 1) + else: + name += '2' + + return name + + def contractRelations(nx_graph): """contract relations into attributes""" + print("Contracting relations to attributes.") + cnt = 0 + for nx_edge in nx.edges_iter(nx_graph): + (nx_src, nx_tar) = nx_edge + # get attributes of edge + rel_attrs = nx_graph.edge[nx_src][nx_tar][0] + rel_type = rel_attrs['type'] + # get attributes of source and target nodes + src_attrs = nx_graph.node[nx_src] + tar_attrs = nx_graph.node[nx_tar] + + # contract source relations + tar_type = tar_attrs['type'] + if tar_type in contract_relations_into_attributes: + # get list of attributes to transfer + transfer_atts = contract_relations_into_attributes[tar_type] + for transfer_att in transfer_atts: + if transfer_att not in tar_attrs: + # target has no attribute + continue + + # name for new attribute starts with relation name + att_name = fixName(rel_type, att_from_rel=True) + # then attribute name + if transfer_att != 'label': + att_name += "_%s"%transfer_att + + # then generate unique name + att_name = genAttName(src_attrs, att_name) + # add target node's attribute + src_attrs[att_name] = tar_attrs.get(transfer_att) + + # contract target relations + src_type = src_attrs['type'] + if src_type in contract_relations_into_attributes: + # get list of attributes to transfer + transfer_atts = contract_relations_into_attributes[src_type] + for transfer_att in transfer_atts: + if transfer_att not in src_attrs: + # target has no attribute + continue + + # name for new attribute starts with relation name + att_name = fixName(rel_type, att_from_rel=True) + # then attribute name + if transfer_att != 'label': + att_name += "_%s"%transfer_att + + # then generate unique name + att_name = genAttName(tar_attrs, att_name) + # add target node's attribute + tar_attrs[att_name] = src_attrs.get(transfer_att) + + cnt += 1 + if cnt % 100 == 0: + print(" %s relations"%cnt) + + +def invertRelations(nx_graph): + """Add inverse relations to each relation""" + print("Adding inverse relations.") + # copy list of edges because we add edges in the loop + edges = nx.edges(nx_graph)[:] + # iterate list + cnt = 0 + for nx_edge in edges: + (nx_src, nx_tar) = nx_edge + # get attributes of edge + rel_attrs = nx_graph.edge[nx_src][nx_tar][0] + rel_type = rel_attrs['type'] + rel_id = rel_attrs['ismi_id'] + # create new relation + nx_graph.add_edge(nx_tar, nx_src, type=fixName(rel_type, is_tar_rel=True), ismi_id=-rel_id) + + cnt += 1 + if cnt % 100 == 0: + print(" %s relations"%cnt) + +def addLinks(nx_graph): + """Add link attributes to all nodes.""" + + print("Adding links: %s"%repr(add_link_attributes)) + cnt = 0 + for link_att, link_pattern in add_link_attributes.items(): + # iterate all nodes + for n in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[n] + if link_att in attrs: + url = link_pattern%attrs[link_att] + # TODO: which target attribute for multiple? + attrs['link'] = url + + cnt += 1 + if cnt % 100 == 0: + print(" %s nodes"%cnt) + ## main @@ -65,11 +224,19 @@ # operate for op in ops: - if op == 'contract': + if op == 'locate': + locatePlaces(nx_graph) + + elif op == 'contract': contractRelations(nx_graph) elif op == 'inv_rels': invertRelations(nx_graph) + elif op == 'add_links': + addLinks(nx_graph) + +print("Writing graph to %s"%output_fn) +nx_graph = nx.write_gpickle(nx_graph, output_fn) print("Done.")
--- a/importFromOpenMind/importer/model2neo4j.py Wed Sep 23 19:47:02 2015 +0200 +++ b/importFromOpenMind/importer/model2neo4j.py Thu Sep 24 18:17:41 2015 +0200 @@ -8,7 +8,7 @@ input_fn = 'ismi_graph.gpickle' # label added to all nodes -project_label = '_ismi4' +project_label = '_ismi6' # neo4j base URL neo4jBaseURL = "http://localhost:7474/db/data/" @@ -63,7 +63,7 @@ cnt += 1 if cnt % 100 == 0: - print(" %s"%cnt) + print(" %s nodes"%cnt) def copyRelations(nx_graph, n4j_graph): @@ -97,7 +97,7 @@ cnt += 1 if cnt % 100 == 0: - print(" %s"%cnt) + print(" %s relations"%cnt) ## main