view importFromOpenMind/importer/model2neo4j_import.py @ 52:763b5d29fa5e

add int type to ismi_id attribute in neo4j csv import file.
author casties
date Fri, 17 Mar 2017 17:51:18 +0100
parents 6625019a0c96
children
line wrap: on
line source

import networkx as nx
import csv
import sys

## configure behaviour

# networkx graph file (gpickle) that is read as input;
# replaced by the first command line argument if one is given
input_fn = 'ismi_graph.gpickle'

# neo4j import files (CSV) that are written, one for nodes and one for
# relations; replaced by the second and third command line arguments
# if both are given
node_import_fn = "neo4j-nodes.csv"
relation_import_fn = "neo4j-relations.csv"

# name of the attribute that holds the type of a node / of a relation
node_type_attribute = '_type'
rel_type_attribute = '_type'


## setup

# ismi_ids of all nodes written by writeNodes() (ismi_id -> ismi_id);
# writeRelations() uses it to check that both ends of a relation exist
n4j_nodes = {}
# NOTE(review): not referenced by any code visible in this file
n4j_relations = {}

# active log levels for logging: log() prints a message only if its level
# is a member of this set
logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
# less verbose alternatives:
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'ERROR', 'SYSMSG'}

def log(level, message):
    """Print message prefixed by its level, if that level is active.

    A message is printed as "<level>: <message>" only when level is a
    member of the module-level logLevels set; otherwise it is dropped.
    """
    if level not in logLevels:
        return

    print(level, message, sep=": ")


def getNodeFields(nx_graph):
    """Return the list of field names for the nodes of nx_graph.

    Collects the attribute names of all nodes in the graph. The attribute
    'ismi_id' is listed as 'ismi_id:int' so that the CSV header carries
    the int type for that column.

    The names are returned sorted (by their string form) so the column
    order of the generated file is the same on every run instead of
    depending on set iteration order.
    """

    log('INFO', "Creating node field list")
    ismi_fields = set()
    # collect attribute names of all nodes
    for node_id in nx.nodes_iter(nx_graph):
        for att in nx_graph.node[node_id]:
            # add type to ismi_id
            if att == 'ismi_id':
                att = 'ismi_id:int'

            ismi_fields.add(att)

    # key=str keeps sorting safe even if an attribute name is not a string
    return sorted(ismi_fields, key=str)


def getRelationFields(nx_graph):
    """Return the list of field names for the relations of nx_graph.

    Collects the attribute names of all edges in the graph, including
    every parallel edge between the same pair of nodes (not only the one
    stored under key 0). The attribute 'ismi_id' is listed as
    'ismi_id:int' so that the CSV header carries the int type for that
    column.

    The names are returned sorted (by their string form) so the column
    order of the generated file is the same on every run instead of
    depending on set iteration order.
    """

    log('INFO', "Creating relation field list")
    ismi_fields = set()
    # edges_iter(data=True) yields (source, target, attributes) once per
    # edge, so parallel edges contribute their own attribute dicts
    for _nx_src, _nx_tar, attrs in nx_graph.edges_iter(data=True):
        for att in attrs:
            # add type to ismi_id
            if att == 'ismi_id':
                att = 'ismi_id:int'

            ismi_fields.add(att)

    # key=str keeps sorting safe even if an attribute name is not a string
    return sorted(ismi_fields, key=str)


def writeNodes(nx_graph, node_writer):
    """Write one row per node of nx_graph to node_writer.

    Each row is a copy of the node's attributes extended by ':LABEL'
    (the node's type attribute) and ':ID' (the node's ismi_id); the
    'ismi_id' attribute itself is written under the key 'ismi_id:int'.
    The ismi_id of every written node is recorded in n4j_nodes.
    Progress is logged every 100 nodes.
    """

    log('INFO', "Writing nodes to CSV file")
    written = 0
    for written, node_id in enumerate(nx.nodes_iter(nx_graph), start=1):
        # work on a copy so the graph's own attribute dict stays untouched
        row = dict(nx_graph.node[node_id])

        # entity type becomes the label
        row[':LABEL'] = row[node_type_attribute]

        # ismi_id becomes the node id and is kept under the typed key
        ismi_id = row.pop('ismi_id')
        row[':ID'] = ismi_id
        row['ismi_id:int'] = ismi_id

        node_writer.writerow(row)

        # remember the node id for the relation check
        n4j_nodes[ismi_id] = ismi_id

        if written % 100 == 0:
            log('INFO', "%s nodes"%written)

    log('INFO', "%s nodes written"%written)

def writeRelations(nx_graph, relation_writer):
    """Write all relations from nx_graph to relation_writer.

    One row is written per edge, using that edge's own attributes, so
    parallel edges between the same pair of nodes are written as distinct
    relations. Each row is a copy of the edge attributes extended by
    ':TYPE' (the edge's type attribute), ':START_ID' and ':END_ID' (the
    ismi_ids of the source and target nodes). An 'ismi_id' attribute, if
    present, is written under the key 'ismi_id:int'.

    Both end nodes must already be recorded in n4j_nodes (filled by
    writeNodes); if one is missing an error is logged and writing stops.
    Progress is logged every 100 relations.
    """

    log('INFO', "Writing relations to CSV file")
    cnt = 0
    # edges_iter(data=True) yields (source, target, attributes) once per
    # edge, including each parallel edge with its own attribute dict
    for nx_src, nx_tar, edge_attrs in nx_graph.edges_iter(data=True):
        # copy so the graph's own attribute dict is not modified
        attrs = edge_attrs.copy()
        # get relation type and add it as type of the relation
        rtype = attrs[rel_type_attribute]
        attrs[':TYPE'] = rtype

        # get ismi_id of source and target nodes
        src_id = nx_graph.node[nx_src]['ismi_id']
        tar_id = nx_graph.node[nx_tar]['ismi_id']
        # check that both nodes have been written to the node file
        src = n4j_nodes.get(src_id, None)
        if src is None:
            log("ERROR", "src node %s missing!"%src_id)
            break

        tar = n4j_nodes.get(tar_id, None)
        if tar is None:
            log("ERROR", "tar node %s missing!"%tar_id)
            break

        # use as start and end id
        attrs[':START_ID'] = src_id
        attrs[':END_ID'] = tar_id

        # change ismi_id key to add type; relations without an ismi_id
        # are written without that column instead of raising KeyError
        if 'ismi_id' in attrs:
            attrs['ismi_id:int'] = attrs.pop('ismi_id')

        # write row
        relation_writer.writerow(attrs)

        cnt += 1
        if cnt % 100 == 0:
            log('INFO', "%s relations"%cnt)

    log('INFO', "%s relations written"%cnt)

## main

# system messages are logged with level 'SYSMSG', which is the level
# contained in logLevels; a level that is not in that set is not printed
log('SYSMSG', "Copy graph from networkx to Neo4J")

# read commandline parameters:
#   argv[1]             input graph file
#   argv[2] and argv[3] node and relation import files (only used if both
#                       are given)
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 3:
    node_import_fn = sys.argv[2]
    relation_import_fn = sys.argv[3]

# read networkx graph from pickle
# NOTE: a gpickle file is a Python pickle and unpickling can execute
# arbitrary code -- only read files from a trusted source.
log('SYSMSG', "Reading graph from %s"%input_fn)
nx_graph = nx.read_gpickle(input_fn)
log('SYSMSG', "Graph info: %s"%nx.info(nx_graph))

# get field lists: the special neo4j import columns come first, followed
# by the attribute names found in the graph
node_fields = [':LABEL', ':ID'] + getNodeFields(nx_graph)
relation_fields = [':TYPE', ':START_ID', ':END_ID'] + getRelationFields(nx_graph)

# write neo4j CSV import file for nodes
# (must run before the relations are written because writeRelations checks
# the node ids recorded by writeNodes)
log('SYSMSG', "Opening node import file at %s"%node_import_fn)
with open(node_import_fn, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=node_fields, dialect=csv.excel, quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()
    writeNodes(nx_graph, writer)

# write neo4j CSV import file for relations
log('SYSMSG', "Opening relation import file at %s"%relation_import_fn)
with open(relation_import_fn, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=relation_fields, dialect=csv.excel, quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()
    writeRelations(nx_graph, writer)

print("Done.")