Mercurial > hg > drupalISMI
changeset 48:6625019a0c96
Renamed the old model2neo4j to model2neo4j_restclient. Added the new model2neo4j_client and model2neo4j_import scripts. Fixed ismixml2model and compare_models.
author | casties |
---|---|
date | Tue, 07 Feb 2017 21:06:13 +0100 |
parents | 378dcb66a27f |
children | f8cd7db4178c |
files | importFromOpenMind/importer/compare_models.py importFromOpenMind/importer/ismixml2model.py importFromOpenMind/importer/model2neo4j.py importFromOpenMind/importer/model2neo4j_client.py importFromOpenMind/importer/model2neo4j_import.py importFromOpenMind/importer/model2neo4j_restclient.py |
diffstat | 6 files changed, 486 insertions(+), 148 deletions(-) [+] |
line wrap: on
line diff
--- a/importFromOpenMind/importer/compare_models.py Mon Feb 06 18:44:43 2017 +0100 +++ b/importFromOpenMind/importer/compare_models.py Tue Feb 07 21:06:13 2017 +0100 @@ -1,6 +1,7 @@ import networkx as nx import sys import csv +from sqlalchemy.sql.expression import false ## configure behaviour @@ -12,6 +13,10 @@ node_type_attribute = '_type' rel_type_attribute = '_type' +# also compare attributes +check_attributes = True +check_attribute_content = False + # active log levels for logging logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'} #logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'} @@ -22,30 +27,37 @@ print("%s: %s"%(level, message)) -def invertRelations(nx_graph): - """Add inverse relations to each relation""" - - print("Adding inverse relations.") - # copy list of edges because we add edges in the loop - edges = nx.edges(nx_graph)[:] - # iterate list - cnt = 0 - for nx_edge in edges: - (nx_src, nx_tar) = nx_edge - # get attributes of edge - rel_attrs = nx_graph.edge[nx_src][nx_tar][0][:] - rel_type = rel_attrs[rel_type_attribute] - rel_id = rel_attrs['ismi_id'] - # create new relation - rel_attrs[rel_type_attribute] = fixName(rel_type, is_tar_rel=True) - rel_attrs['ismi_id': -rel_id] - nx_graph.add_edge(nx_tar, nx_src, attr_dict=invrel_atts) +def compare_attributes(attrs1, attrs2): + """compare two sets of attributes""" + for a in attrs1.keys(): + if a.startswith('_n_'): + continue + + if a not in attrs2: + log('DEBUG', "attribute %s missing in attrs2"%a) + return False + + elif check_attribute_content: + val1 = attrs1[a] + val2 = attrs2[a] + if isinstance(val1, str): + val1 = val1.replace('\r', '') + val2 = val2.replace('\r', '') + + if val1 != val2: + log('DEBUG', "attribute %s different in attrs2: \n%s\n vs \n%s\n"%(a, repr(val1), repr(val2))) + return False + + for a in attrs2.keys(): + if a.startswith('_n_'): + continue + + if a not in attrs1: + log('DEBUG', "attribute %s missing in attrs1"%a) + return False + + return True - cnt += 1 - if cnt 
% 100 == 0: - print(" %s relations"%cnt) - - def compare_nodes(nx_graph1, nx_graph2): """compare nodes of two graphs""" @@ -53,13 +65,18 @@ cnt = 0 missing_nodes1 = [] missing_nodes2 = [] + attribute_differences = [] # iterate all nodes in graph 1 for n in nx.nodes_iter(nx_graph1): - #attrs = nx_graph.node[n] - if not nx_graph2.has_node(n): missing_nodes2.append(n) + else: + attrs1 = nx_graph1.node[n] + attrs2 = nx_graph2.node[n] + if check_attributes and not compare_attributes(attrs1, attrs2): + attribute_differences.append(n) + if len(missing_nodes2) > 0: log('WARNING', "%s nodes missing in graph 2"%len(missing_nodes2)) log('DEBUG', "nodes: %s"%missing_nodes2) @@ -76,6 +93,10 @@ log('WARNING', "%s nodes missing in graph 1"%len(missing_nodes1)) log('DEBUG', "nodes: %s"%(missing_nodes1)) + if len(attribute_differences) > 0: + log('WARNING', "%s nodes with attribute differences"%len(attribute_differences)) + log('DEBUG', "nodes: %s"%(attribute_differences)) + def compare_relations(nx_graph1, nx_graph2): """compare relations of two graphs"""
--- a/importFromOpenMind/importer/ismixml2model.py Mon Feb 06 18:44:43 2017 +0100 +++ b/importFromOpenMind/importer/ismixml2model.py Tue Feb 07 21:06:13 2017 +0100 @@ -98,19 +98,19 @@ log('DEBUG', "entity has no attributes: %s"%ent_elem) else: + if atts_elem.tail is not None: + # tail belongs to parent + ov += atts_elem.tail + # go through all attributes for att_elem in atts_elem: - if att_elem.tail is not None: - # tail belongs to parent - ov += att_elem.tail - ct = att_elem.get('content-type', None) name = att_elem.get('name', None) if name in exclude_attributes_of_type: # exclude attribute continue - if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: + if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']: # normal text attribute (assume no content_type is text too...) val = att_elem.text @@ -120,11 +120,12 @@ if year is not None: val = year - # keep attribute - attrs[name] = val - #if 'nov' in att: - # # add normalized value - # attrs['_n_'+name] = att['nov'] + if val is not None: + # keep attribute + attrs[name] = val + #if 'nov' in att: + # # add normalized value + # attrs['_n_'+name] = att['nov'] elif ct == 'date': # date attribute @@ -199,19 +200,19 @@ atts_elem = rel_elem.find('attributes') if atts_elem is not None: + if atts_elem.tail is not None: + # tail belongs to parent + ov += atts_elem.tail + # go through all attributes for att_elem in atts_elem: - if att_elem.tail is not None: - # tail belongs to parent - ov += att_elem.tail - ct = att_elem.get('content-type', None) name = att_elem.get('name', None) if name in exclude_attributes_of_type: # exclude attribute continue - if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']: + if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']: # normal text attribute (assume no content_type is text too...) 
val = att_elem.text @@ -221,11 +222,12 @@ if year is not None: val = year - # keep attribute - attrs[name] = val - #if 'nov' in att: - # # add normalized value - # attrs['_n_'+name] = att['nov'] + if val is not None: + # keep attribute + attrs[name] = val + #if 'nov' in att: + # # add normalized value + # attrs['_n_'+name] = att['nov'] elif ct == 'date': # date attribute
--- a/importFromOpenMind/importer/model2neo4j.py Mon Feb 06 18:44:43 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -import networkx as nx -from neo4jrestclient.client import GraphDatabase -import sys - -## configure behaviour - -# metworkx graph file -input_fn = 'ismi_graph.gpickle' - -# label added to all nodes -project_label = '_ismi' - -# neo4j base URL -neo4jBaseURL = "http://localhost:7474/db/data/" - -# name of type attribute -node_type_attribute = '_type' -rel_type_attribute = '_type' - - -## setup - -n4j_nodes = {} -n4j_relations = {} - - -def copyNodes(nx_graph, n4j_graph): - """copy all nodes from nx_graph to n4j_graph""" - - print("Copying nodes to Neo4J") - cnt = 0 - for node_id in nx.nodes_iter(nx_graph): - attrs = nx_graph.node[node_id] - ntype = attrs[node_type_attribute] - ismi_id = attrs['ismi_id'] - # create node with attributes - n4j_node = n4j_graph.nodes.create(**attrs) - # add labels - n4j_node.labels.add([project_label, ntype]) - # save reference - n4j_nodes[ismi_id] = n4j_node - - cnt += 1 - if cnt % 100 == 0: - print(" %s nodes"%cnt) - - -def copyRelations(nx_graph, n4j_graph): - """copy all relations from nx_graph to n4j_graph""" - - print("Copying relations to Neo4J") - cnt = 0 - for nx_edge in nx.edges_iter(nx_graph): - (nx_src, nx_tar) = nx_edge - # get attributes of edge - attrs = nx_graph.edge[nx_src][nx_tar][0] - etype = attrs[rel_type_attribute] - # get ismi_id of source and target nodes - src_id = nx_graph.node[nx_src]['ismi_id'] - tar_id = nx_graph.node[nx_tar]['ismi_id'] - # get Neo4J nodes - src = n4j_nodes.get(src_id, None) - if src is None: - print("ERROR: src node %s missing!"%src_id) - break - - tar = n4j_nodes.get(tar_id, None) - if tar is None: - print("ERROR: tar node %s missing!"%tar_id) - break - - # create Neo4J relation - n4j_rel = n4j_graph.relationships.create(src, etype, tar) - # add attributes - n4j_rel.properties = attrs - - cnt += 1 - if cnt % 100 == 0: - print(" %s relations"%cnt) - 
- -## main - -print("Copy graph from networkx to Neo4J") - -# read commandline parameters -if len(sys.argv) > 1: - input_fn = sys.argv[1] - -# read networkx graph from pickle -print("Reading graph from %s"%input_fn) -nx_graph = nx.read_gpickle(input_fn) -print("Graph info: %s"%nx.info(nx_graph)) - -# open neo4j graph db -print("Opening Neo4J db at %s"%neo4jBaseURL) -n4j_graph = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j") - -copyNodes(nx_graph, n4j_graph) - -copyRelations(nx_graph, n4j_graph) - -print("Done.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/importFromOpenMind/importer/model2neo4j_client.py Tue Feb 07 21:06:13 2017 +0100 @@ -0,0 +1,144 @@ +import networkx as nx +from neo4j.v1 import GraphDatabase, basic_auth +import sys + +## configure behaviour + +# metworkx graph file +input_fn = 'ismi_graph.gpickle' + +# label added to all nodes +project_label = '_ismi' + +# neo4j base URL +neo4jBaseURL = "bolt://localhost:7687" + +# name of type attribute +node_type_attribute = '_type' +rel_type_attribute = '_type' + + +## setup + +n4j_nodes = {} +n4j_relations = {} + +# active log levels for logging +logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'INFO', 'ERROR', 'SYSMSG'} + +def log(level, message): + if level in logLevels: + print("%s: %s"%(level, message)) + + +def createIndices(nx_graph, n4j_graph): + """create indices for nodes from nx_graph in n4j_graph""" + + log('INFO', "Creating node indices in Neo4J") + cnt = 0 + ismi_types = set() + # collect types of all nodes + for node_id in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[node_id] + # get entity type + ismi_types.add(attrs[node_type_attribute]) + + # create constraints for all types + for ismi_type in ismi_types: + query = "CREATE CONSTRAINT ON (n:%s) ASSERT n.ismi_id IS UNIQUE"%ismi_type + n4j_graph.run(query) + + +def copyNodes(nx_graph, n4j_graph): + """copy all nodes from nx_graph to n4j_graph""" + + log('INFO', "Copying nodes to Neo4J") + cnt = 0 + for node_id in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[node_id] + # get entity type + ntype = attrs[node_type_attribute] + # get ismi_id + ismi_id = attrs['ismi_id'] + att_qs = ", ".join(["%s: {%s}"%(k, k) for k in attrs.keys()]) + # query to create node with attributes (parameter names are attribute keys) + cypher = "CREATE (n:%s {%s})"%(ntype, att_qs) + # run query + n4j_graph.run(cypher, parameters=attrs) + # save node id + n4j_nodes[ismi_id] = attrs + + cnt 
+= 1 + if cnt % 100 == 0: + log('INFO', "%s nodes"%cnt) + + log('INFO', "%s nodes copied"%cnt) + + +def copyRelations(nx_graph, n4j_graph): + """copy all relations from nx_graph to n4j_graph""" + + print("Copying relations to Neo4J") + cnt = 0 + for nx_edge in nx.edges_iter(nx_graph): + (nx_src, nx_tar) = nx_edge + # get attributes of edge + attrs = nx_graph.edge[nx_src][nx_tar][0] + # get relation type + rtype = attrs[rel_type_attribute] + # get ismi_id of source and target nodes + src_id = nx_graph.node[nx_src]['ismi_id'] + tar_id = nx_graph.node[nx_tar]['ismi_id'] + # get Neo4J nodes + src = n4j_nodes.get(src_id, None) + if src is None: + print("ERROR: src node %s missing!"%src_id) + break + + tar = n4j_nodes.get(tar_id, None) + if tar is None: + print("ERROR: tar node %s missing!"%tar_id) + break + + src_type = src[node_type_attribute] + tar_type = tar[node_type_attribute] + + att_qs = ", ".join(["%s: {%s}"%(k, k) for k in attrs.keys()]) + # query to create a relation with attributes + cypher = "MATCH (n1:%s),(n2:%s) WHERE n1.ismi_id = %s AND n2.ismi_id = %s CREATE (n1)-[r:%s {%s}]->(n2)"%(src_type, tar_type, src_id, tar_id, rtype, att_qs) + # run query + n4j_graph.run(cypher, attrs) + + cnt += 1 + if cnt % 100 == 0: + log('INFO', "%s relations"%cnt) + + log('INFO', "%s relations copied"%cnt) +## main + +print("Copy graph from networkx to Neo4J") + +# read commandline parameters +if len(sys.argv) > 1: + input_fn = sys.argv[1] + +# read networkx graph from pickle +print("Reading graph from %s"%input_fn) +nx_graph = nx.read_gpickle(input_fn) +print("Graph info: %s"%nx.info(nx_graph)) + +# open neo4j graph db +print("Opening Neo4J db at %s"%neo4jBaseURL) +n4j_driver = GraphDatabase.driver(neo4jBaseURL, auth=basic_auth("neo4j", "neo5j")) +# get session to pass to functions +n4j_graph = n4j_driver.session() + +createIndices(nx_graph, n4j_graph) + +copyNodes(nx_graph, n4j_graph) + +copyRelations(nx_graph, n4j_graph) + +print("Done.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/importFromOpenMind/importer/model2neo4j_import.py Tue Feb 07 21:06:13 2017 +0100 @@ -0,0 +1,171 @@ +import networkx as nx +import csv +import sys + +## configure behaviour + +# metworkx graph file +input_fn = 'ismi_graph.gpickle' + +# label added to all nodes +project_label = '_ismi' + +# neo4j import file +node_import_fn = "neo4j-nodes.csv" +relation_import_fn = "neo4j-relations.csv" + +# name of type attribute +node_type_attribute = '_type' +rel_type_attribute = '_type' + + +## setup + +n4j_nodes = {} +n4j_relations = {} + +# active log levels for logging +logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'INFO', 'ERROR', 'SYSMSG'} + +def log(level, message): + if level in logLevels: + print("%s: %s"%(level, message)) + + +def getNodeFields(nx_graph): + """returns a set of field names for nodes from nx_graph""" + + log('INFO', "Creating node field list") + ismi_fields = set() + # collect types of all nodes + for node_id in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[node_id] + # save all attribute names + for att in attrs.keys(): + ismi_fields.add(att) + + return list(ismi_fields) + + +def getRelationFields(nx_graph): + """returns a set of field names for relations from nx_graph""" + + log('INFO', "Creating node field list") + ismi_fields = set() + # collect types of all relations + for nx_edge in nx.edges_iter(nx_graph): + (nx_src, nx_tar) = nx_edge + # get attributes of edge + attrs = nx_graph.edge[nx_src][nx_tar][0].copy() + # save all attribute names + for att in attrs.keys(): + ismi_fields.add(att) + + return list(ismi_fields) + + +def writeNodes(nx_graph, node_writer): + """write all nodes from nx_graph to node_writer""" + + log('INFO', "Writing nodes to CSV file") + cnt = 0 + for node_id in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[node_id].copy() + # get entity type + ntype = attrs[node_type_attribute] + # add as 
label + attrs[':LABEL'] = ntype + # get ismi_id + ismi_id = attrs['ismi_id'] + # add ismi_id as node id + attrs[':ID'] = ismi_id + # write row + node_writer.writerow(attrs) + # save node id + n4j_nodes[ismi_id] = ismi_id + + cnt += 1 + if cnt % 100 == 0: + log('INFO', "%s nodes"%cnt) + + log('INFO', "%s nodes written"%cnt) + +def writeRelations(nx_graph, relation_writer): + """write all relations from nx_graph to node_writer""" + + log('INFO', "Writing relations to CSV file") + cnt = 0 + for nx_edge in nx.edges_iter(nx_graph): + (nx_src, nx_tar) = nx_edge + # get attributes of edge + attrs = nx_graph.edge[nx_src][nx_tar][0].copy() + # get relation type + rtype = attrs[rel_type_attribute] + # add as label + attrs[':TYPE'] = rtype + + # get ismi_id of source and target nodes + src_id = nx_graph.node[nx_src]['ismi_id'] + tar_id = nx_graph.node[nx_tar]['ismi_id'] + # check Neo4J nodes + src = n4j_nodes.get(src_id, None) + if src is None: + log("ERROR", "src node %s missing!"%src_id) + break + + tar = n4j_nodes.get(tar_id, None) + if tar is None: + log("ERROR", "tar node %s missing!"%tar_id) + break + + # use as start and end id + attrs[':START_ID'] = src_id + attrs[':END_ID'] = tar_id + + # write row + relation_writer.writerow(attrs) + + cnt += 1 + if cnt % 100 == 0: + log('INFO', "%s relations"%cnt) + + log('INFO', "%s relations written"%cnt) + +## main + +log('SYSINFO', "Copy graph from networkx to Neo4J") + +# read commandline parameters +if len(sys.argv) > 1: + input_fn = sys.argv[1] + +if len(sys.argv) > 3: + node_import_fn = sys.argv[2] + relation_import_fn = sys.argv[3] + +# read networkx graph from pickle +log('SYSINFO', "Reading graph from %s"%input_fn) +nx_graph = nx.read_gpickle(input_fn) +log('SYSINFO', "Graph info: %s"%nx.info(nx_graph)) + +# get field lists +node_fields = [':LABEL', ':ID'] + getNodeFields(nx_graph) +relation_fields = [':TYPE', ':START_ID', ':END_ID'] + getRelationFields(nx_graph) + +# write neo4j CSV import files +log('SYSINFO', "Opening 
node import file at %s"%node_import_fn) +with open(node_import_fn, 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=node_fields, dialect=csv.excel, quoting=csv.QUOTE_NONNUMERIC) + writer.writeheader() + writeNodes(nx_graph, writer) + +# write neo4j CSV import files +log('SYSINFO', "Opening relation import file at %s"%relation_import_fn) +with open(relation_import_fn, 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=relation_fields, dialect=csv.excel, quoting=csv.QUOTE_NONNUMERIC) + writer.writeheader() + writeRelations(nx_graph, writer) + +print("Done.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/importFromOpenMind/importer/model2neo4j_restclient.py Tue Feb 07 21:06:13 2017 +0100 @@ -0,0 +1,103 @@ +import networkx as nx +from neo4jrestclient.client import GraphDatabase +import sys + +## configure behaviour + +# metworkx graph file +input_fn = 'ismi_graph.gpickle' + +# label added to all nodes +project_label = '_ismi' + +# neo4j base URL +neo4jBaseURL = "http://localhost:7474/db/data/" + +# name of type attribute +node_type_attribute = '_type' +rel_type_attribute = '_type' + + +## setup + +n4j_nodes = {} +n4j_relations = {} + + +def copyNodes(nx_graph, n4j_graph): + """copy all nodes from nx_graph to n4j_graph""" + + print("Copying nodes to Neo4J") + cnt = 0 + for node_id in nx.nodes_iter(nx_graph): + attrs = nx_graph.node[node_id] + ntype = attrs[node_type_attribute] + ismi_id = attrs['ismi_id'] + # create node with attributes + n4j_node = n4j_graph.nodes.create(**attrs) + # add labels + n4j_node.labels.add([project_label, ntype]) + # save reference + n4j_nodes[ismi_id] = n4j_node + + cnt += 1 + if cnt % 100 == 0: + print(" %s nodes"%cnt) + + +def copyRelations(nx_graph, n4j_graph): + """copy all relations from nx_graph to n4j_graph""" + + print("Copying relations to Neo4J") + cnt = 0 + for nx_edge in nx.edges_iter(nx_graph): + (nx_src, nx_tar) = nx_edge + # get attributes of edge + attrs = nx_graph.edge[nx_src][nx_tar][0] + etype = attrs[rel_type_attribute] + # get ismi_id of source and target nodes + src_id = nx_graph.node[nx_src]['ismi_id'] + tar_id = nx_graph.node[nx_tar]['ismi_id'] + # get Neo4J nodes + src = n4j_nodes.get(src_id, None) + if src is None: + print("ERROR: src node %s missing!"%src_id) + break + + tar = n4j_nodes.get(tar_id, None) + if tar is None: + print("ERROR: tar node %s missing!"%tar_id) + break + + # create Neo4J relation + n4j_rel = n4j_graph.relationships.create(src, etype, tar) + # add attributes + n4j_rel.properties = attrs + + cnt += 1 + if cnt % 100 == 0: + print(" %s 
relations"%cnt) + + +## main + +print("Copy graph from networkx to Neo4J") + +# read commandline parameters +if len(sys.argv) > 1: + input_fn = sys.argv[1] + +# read networkx graph from pickle +print("Reading graph from %s"%input_fn) +nx_graph = nx.read_gpickle(input_fn) +print("Graph info: %s"%nx.info(nx_graph)) + +# open neo4j graph db +print("Opening Neo4J db at %s"%neo4jBaseURL) +n4j_graph = GraphDatabase(neo4jBaseURL, username="neo4j", password="neo5j") + +copyNodes(nx_graph, n4j_graph) + +copyRelations(nx_graph, n4j_graph) + +print("Done.")