view importFromOpenMind/importer/model2model.py @ 29:1a1877812757

include normalized attributes in neo4j with prefix "_n_"
author casties
date Thu, 10 Dec 2015 12:11:25 -0500
parents a9bfd49355f8
children 870b0b3b272f
line wrap: on
line source

import networkx as nx
import sys
import csv

## configuration

# networkx graph files read and written by this script
input_fn = "ismi_graph.gpickle"
output_fn = "ismi_graph_mod.gpickle"

# names of the operations to apply, in this order
ops = [
    "locate",
    "contract",
    "inv_rels",
    "add_links",
]

# node types that get location information
locate_objects_of_type = ["PLACE"]

# CSV file with place location information
places_fn = "ismi_places_loc.csv"

# node types to remove from the graph (currently disabled)
#remove_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# node type -> attributes of such a node that are added to related nodes
# as attributes carrying the relation's name
contract_relations_into_attributes = {
    "PLACE": ["label", "latitude", "longitude"],
    "ALIAS": ["label"],
}


# node attribute -> URL pattern; the attribute value replaces the %s
#add_link_attributes = {'ismi_id': 'https://ismi-dev.mpiwg-berlin.mpg.de/drupal-ismi/entity/%s'}
add_link_attributes = {
    "ismi_id": "https://ismi-dev.mpiwg-berlin.mpg.de/om4-ismi/public/entityDetails.xhtml?eid=%s",
}


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return a cleaned-up version of a relation or attribute name.

    name -- the original name
    is_src_rel -- accepted, but currently leaves the name unchanged
    is_tar_rel -- prefix the name with '<'
    att_from_rel -- remove every occurrence of 'is_', 'has_', 'was_' and
        '_of' (in that order) so the relation name reads like an attribute
    """
    # replace the embarrassing spelling; only the first variant that
    # occurs in the name is substituted
    for old, new in (('FLORUIT', 'FLOURISH'), ('floruit', 'flourish')):
        if old in name:
            name = name.replace(old, new)
            break

    # source relations (is_src_rel) deliberately get no marker

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name


def locatePlaces(nx_graph):
    """Add location information to objects in the graph.

    Reads the CSV file named by places_fn (columns 'Address', 'Latitude'
    and 'Longitude') and, for every node whose 'type' is listed in
    locate_objects_of_type, copies the coordinates of the row whose
    'Address' equals the node's 'label' into the node's 'latitude' and
    'longitude' attributes. Nodes without a matching row are reported
    with a warning and left unchanged.

    nx_graph -- networkx graph; its node attributes are modified in place
    """
    print("Adding location information from %s to %s."%(places_fn, locate_objects_of_type))

    # read place location file; newline='' leaves the handling of line
    # breaks inside quoted fields to the csv module
    locations = {}
    with open(places_fn, encoding='utf-8', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            lat = row['Latitude']
            lon = row['Longitude']
            name = row['Address']
            if lat and lon:
                # rows with an empty coordinate are ignored
                locations[name] = {'latitude': lat, 'longitude': lon}

    # iterate all nodes; nodes(data=True) is available in networkx 1.x and
    # 2.x (nodes_iter() and Graph.node are gone in 2.x) and yields
    # (node, attribute dict) pairs
    for cnt, (_n, attrs) in enumerate(nx_graph.nodes(data=True), start=1):
        if attrs['type'] in locate_objects_of_type:
            # locatable object
            name = attrs['label']
            location = locations.get(name)
            if location is not None:
                # place name match
                attrs['latitude'] = location['latitude']
                attrs['longitude'] = location['longitude']

            else:
                print("WARNING: no location for name '%s'"%name)

        if cnt % 100 == 0:
            print("  %s nodes"%cnt)

    

def genAttName(attrs, name):
    """Generate an attribute name that is not yet used in attrs.

    The name is returned unchanged when attrs holds no value (or only
    None) for it. Otherwise a counter is appended ('2') or, when the name
    already ends in decimal digits, that whole trailing number is
    incremented, until a free name is found.

    attrs -- dict of existing attributes
    name -- preferred attribute name
    """
    digits = '0123456789'
    while attrs.get(name, None) is not None:
        # attribute exists
        stem = name.rstrip(digits)
        counter = name[len(stem):]
        if counter:
            # increment the complete trailing number so that 'x19' is
            # followed by 'x20'; only ASCII digits count, because int()
            # rejects other "numeric" characters such as superscripts
            name = stem + str(int(counter) + 1)
        else:
            name += '2'

    return name


def _contractAttributes(rel_type, donor_attrs, receiver_attrs):
    """Copy the configured attributes of one node onto a related node.

    Looks up the donor's 'type' in contract_relations_into_attributes and
    adds each listed attribute the donor has to receiver_attrs, under a
    unique name derived from the relation type.

    rel_type -- type of the relation connecting the two nodes
    donor_attrs -- attribute dict of the node providing the values
    receiver_attrs -- attribute dict of the node that gets new attributes
    """
    # get list of attributes to transfer
    transfer_atts = contract_relations_into_attributes.get(donor_attrs['type'])
    if not transfer_atts:
        return

    for transfer_att in transfer_atts:
        if transfer_att not in donor_attrs:
            # donor has no such attribute
            continue

        # name for new attribute starts with relation name
        att_name = fixName(rel_type, att_from_rel=True)
        # then attribute name
        if transfer_att != 'label':
            att_name += "_%s"%transfer_att

        # then generate unique name
        att_name = genAttName(receiver_attrs, att_name)
        # add donor node's attribute
        receiver_attrs[att_name] = donor_attrs[transfer_att]
        # also add normalized attribute
        norm_att = '_n_'+transfer_att
        if norm_att in donor_attrs:
            receiver_attrs['_n_'+att_name] = donor_attrs[norm_att]


def contractRelations(nx_graph):
    """Contract relations into attributes.

    For every relation (edge) whose target node has a type listed in
    contract_relations_into_attributes, the configured attributes of the
    target are added to the source node; likewise the configured
    attributes of a source node of such a type are added to the target.

    nx_graph -- networkx graph; its node attributes are modified in place
    """
    print("Contracting relations to attributes.")
    # node -> attribute dict, built with an API that exists in networkx 1.x
    # and 2.x
    node_attrs = dict(nx_graph.nodes(data=True))
    # edges(data=True) yields every edge with its own attributes, so
    # parallel relations between the same two nodes are each contracted
    # (reading edge key 0 would process the first relation repeatedly)
    for cnt, (nx_src, nx_tar, rel_attrs) in enumerate(nx_graph.edges(data=True), start=1):
        rel_type = rel_attrs['type']
        # get attributes of source and target nodes
        src_attrs = node_attrs[nx_src]
        tar_attrs = node_attrs[nx_tar]

        # contract source relations: target's attributes go to the source
        _contractAttributes(rel_type, tar_attrs, src_attrs)
        # contract target relations: source's attributes go to the target
        _contractAttributes(rel_type, src_attrs, tar_attrs)

        if cnt % 100 == 0:
            print("  %s relations"%cnt)


def invertRelations(nx_graph):
    """Add inverse relations to each relation.

    For every existing edge an edge in the opposite direction is added.
    Its 'type' is the original type prefixed by fixName(is_tar_rel=True)
    and its 'ismi_id' is the negated original id.

    nx_graph -- networkx graph; edges are added in place
    """
    print("Adding inverse relations.")
    # snapshot the edges because we add edges in the loop; data=True gives
    # each (parallel) edge its own attributes instead of always reading
    # edge key 0, and list() works for networkx 1.x lists and 2.x views
    edges = list(nx_graph.edges(data=True))
    # iterate list
    for cnt, (nx_src, nx_tar, rel_attrs) in enumerate(edges, start=1):
        rel_type = rel_attrs['type']
        rel_id = rel_attrs['ismi_id']
        # create new relation
        nx_graph.add_edge(nx_tar, nx_src, type=fixName(rel_type, is_tar_rel=True), ismi_id=-rel_id)

        if cnt % 100 == 0:
            print("  %s relations"%cnt)
        
    
def addLinks(nx_graph):
    """Add link attributes to all nodes.

    For every (attribute, pattern) pair in add_link_attributes, each node
    that has the attribute gets a 'link' attribute holding the pattern
    with the attribute value filled in.

    nx_graph -- networkx graph; its node attributes are modified in place
    """
    print("Adding links: %s"%repr(add_link_attributes))
    cnt = 0
    for link_att, link_pattern in add_link_attributes.items():
        # iterate all nodes; nodes(data=True) exists in networkx 1.x and 2.x
        for _n, attrs in nx_graph.nodes(data=True):
            if link_att in attrs:
                # 1-tuple so that a tuple-valued attribute is formatted as
                # a single value
                url = link_pattern % (attrs[link_att],)
                # TODO: which target attribute for multiple?
                attrs['link'] = url

            cnt += 1
            if cnt % 100 == 0:
                print("  %s nodes"%cnt)
        
    
## main

print("Modify networkx graph")

# read commandline parameters: <input file> <output file> [<op>,<op>,...]
if len(sys.argv) > 2:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

if len(sys.argv) > 3:
    ops = sys.argv[3].split(',')

# read networkx graph from pickle
# NOTE: this unpickles the file, so only use graph files from a trusted source
print("Reading graph from %s"%input_fn)
nx_graph = nx.read_gpickle(input_fn)
print("Graph info: %s"%nx.info(nx_graph))

# operate: map operation names to the functions implementing them
operations = {
    'locate': locatePlaces,
    'contract': contractRelations,
    'inv_rels': invertRelations,
    'add_links': addLinks,
}
for op in ops:
    operation = operations.get(op)
    if operation is None:
        print("ERROR: unknown operation %s"%op)
    else:
        operation(nx_graph)

print("Writing graph to %s"%output_fn)
# the result is not assigned, so nx_graph keeps referring to the graph
nx.write_gpickle(nx_graph, output_fn)

print("Done.")