view importFromOpenMind/importer/ismi2model.py @ 29:1a1877812757

include normalized attributes in neo4j with prefix "_n_"
author casties
date Thu, 10 Dec 2015 12:11:25 -0500
parents a9bfd49355f8
children ce12475d2109
line wrap: on
line source

import urllib.request
import json
import networkx
import sys

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

# OpenMind base URL
baseURL="http://localhost:18080/om4-ismi/jsonInterface?"

# node types to exclude from the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude
# ('nov' is excluded here because normalized values are added
# separately under the "_n_" prefix, see nodeFromEnt)
exclude_attributes_of_type = [
    'lw',
    'node_type',
    'nov',
    'notes_old'
]


# URL template: list all entities of a given object class (oc)
entsURL=baseURL+"method=get_ents&oc=%s"

# URL template: full content for a comma-separated list of entity ids
entsByIdURL = baseURL+"method=get_ents&include_content=True&ids=%s"

# URL template: full content for a single entity id
entURL=baseURL+"method=get_ent&id=%s&include_content=True"


def readJSON(url):
    """Fetch *url* and return the parsed JSON response.

    The HTTP response body is decoded as UTF-8 before parsing.
    """
    #print("JSON loading %s"%url)
    # use a context manager so the HTTP connection is always closed,
    # even if read() raises (the original leaked the handle)
    with urllib.request.urlopen(url) as wsh:
        txt = wsh.read()
    return json.loads(txt.decode("utf-8"))
    
# fetch the list of entity definitions from OpenMind at import time
defs_json = readJSON(baseURL+"method=get_defs")

# current list of all definitions 
ismi_defs = [atts['ov'] for atts in defs_json['defs']]

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# the graph being built; MultiDiGraph allows parallel directed edges
nx_graph = networkx.MultiDiGraph()

# networkx node attribute dicts, keyed by ismi id
nx_nodes = {}
# raw JSON relations, keyed by relation id
ismi_relations = {}
# networkx edge keys, keyed by relation id
nx_relations = {}


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Normalize an OpenMind name for use in the graph.

    Replaces the FLORUIT/floruit spelling with FLOURISH/flourish,
    prefixes target-relation names with '<', and strips relation verb
    particles when the name is used as an attribute name.
    """
    # these are too embarrassing...
    if 'FLORUIT' in name:
        name = name.replace('FLORUIT', 'FLOURISH')
    elif 'floruit' in name:
        name = name.replace('floruit', 'flourish')

    if is_src_rel:
        # source relations keep their plain name
        #name = name + '>'
        pass

    if is_tar_rel:
        # mark inverse (target) relations
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for particle in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(particle, '')

    return name


def nodeFromEnt(ent, etype):
    """Create a networkx node from the given JSON entity.

    Translates the entity's attributes into node attributes (normalized
    values are added under a "_n_" prefix), adds the node to the global
    nx_graph and returns the node's attribute dict.

    Returns None if the entity's object class does not match etype.
    """
    attrs = {}
    # go through all attributes
    for att in ent['atts']:
        ct = att.get('content_type', None)
        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
            # normal text attribute (assume no content_type is text too...)
            key = att['name']
            val = att['ov']
            
            if key in exclude_attributes_of_type:
                # exclude attribute
                continue

            # keep attribute
            attrs[key] = val
            if 'nov' in att:
                # add normalized value under the "_n_" prefix
                attrs['_n_'+key] = att['nov']
            
        elif ct == 'num':
            # number attribute
            key = att['name']
            val = att['ov']
            
            if key in exclude_attributes_of_type:
                # exclude attribute
                continue

            # keep attribute, assume num is int
            attrs[key] = int(val)
            
        elif ct == 'date':
            # date attribute
            key = att['name']
            val = att['ov']
            # try to parse date object to get gregorian year
            try:
                year = None
                date_json = json.loads(val)
                if 'from' in date_json:
                    year = date_json['from'].get('year', None)
                elif 'date' in date_json:
                    year = date_json['date'].get('year', None)
                else:
                    print("don't know what to do with date on %s: %s=%s"%(ent['id'],key,val))
                    
                if year is not None:
                    attrs[key] = year
                    
            # was a bare except:, which also swallowed KeyboardInterrupt
            # and SystemExit; catch only what malformed data can raise
            except (ValueError, TypeError, AttributeError):
                print("ERROR: invalid JSON in date: %s"%repr(val))
            
        elif ct == 'old':
            # ignore attribute
            continue
            
        else:
            print("WARN: attribute with unknown content_type: %s"%repr(att))
            # ignore other content types
            continue
            
    # process base attributes
    oc = ent['oc']
    if oc != etype:
        print("ERROR: entity type doesn't match!")
        return None
            
    # rename existing type attr since we overwrite 'type' below
    if 'type' in attrs:
        attrs['type2'] = attrs['type']
        
    # set type
    attrs['type'] = fixName(oc)
                
    ismi_id = ent['id']
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id
            
    ov = ent.get('ov', None)
    if ov is not None:
        # save ov as label
        attrs['label'] = ov
        if 'nov' in ent:
            # add normalized value
            attrs['_n_label'] = ent.get('nov')
                        
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE: networkx 1.x API; in networkx >= 2.0 this is nx_graph.nodes
    node = nx_graph.node[ismi_id]
    
    return node


def relsFromEnt(ent, relations):
    """Collect all relations of a JSON entity into *relations*.

    Both outgoing (src_rels) and incoming (tar_rels) relations are
    stored under their relation id. Relations touching excluded object
    types are skipped; conflicting duplicates are reported and ignored.
    """
    for rel in ent.get('src_rels', []) + ent.get('tar_rels', []):
        # skip relations whose endpoints are excluded object types
        if rel['src_oc'] in exclude_objects_of_type \
                or rel['tar_oc'] in exclude_objects_of_type:
            continue

        rid = rel['id']
        known = relations.get(rid)
        if known is not None and known != rel:
            # same id but different payload: keep the first one seen
            print("ERROR: relation is different: %s != %s"%(repr(rel), repr(known)))
            continue

        relations[rid] = rel

    return relations


def relationsFromRels(rels, nodes):
    """Create edges in the networkx graph for the given relations.

    Args:
        rels: dict of JSON relations by relation id
        nodes: dict of existing networkx nodes by ismi id
    Returns:
        dict of networkx edge keys by relation id
    """
    # go through all rels
    print("importing %s relations"%len(rels))
    cnt = 0
    for rel in rels.values():
        cnt += 1
        if cnt % 100 == 0:
            # progress message
            print(" %s relations"%cnt)
            
        rel_id = rel['id']
        rel_name = rel['name']
        src_id = rel['src_id']
        tar_id = rel['tar_id']
        # both endpoints must have been imported as nodes
        if src_id not in nodes:
            print("ERROR: relation %s src node %s missing!"%(rel_id,src_id))
            continue
        
        if tar_id not in nodes:
            print("ERROR: relation %s tar node %s missing!"%(rel_id,tar_id))
            continue
        
        # create relation with type
        # (MultiDiGraph.add_edge returns the new edge's key)
        nx_rel = nx_graph.add_edge(src_id, tar_id, type=fixName(rel_name), ismi_id=rel_id)
        
        nx_relations[rel_id] = nx_rel
        
    return nx_relations


def importEnts(etype):
    """Import all entities of the given type into the graph.

    Fetches the id list for *etype*, then loads full entity content in
    batches of 100, creating a networkx node for each entity and
    collecting its relations into the global ismi_relations dict.
    Aborts the type if a duplicate entity id is encountered.
    """
    # read json for all entities of given type
    # (renamed from 'json': that local shadowed the json module)
    ents_json = readJSON(entsURL%etype)
    ents = ents_json['ents']
    print("importing %s %ss"%(len(ents),etype))
    # fetch full entity content in batches to keep the URLs short
    size = 100
    batches = [ents[pos:pos + size] for pos in range(0, len(ents), size)]
    cnt = 0
    for batch in batches:
        cnt += size
        if cnt % 100 == 0:
            # progress message
            print(" %s %ss"%(cnt, etype))
            
        # extract list of ismi ids
        ismi_ids = [str(ent['id']) for ent in batch]
        
        # fetch full data for list of entities
        ent_json = readJSON(entsByIdURL%','.join(ismi_ids))
        ents_data = ent_json['ents']
        
        # iterate through results batch
        for ent_data in ents_data:
            ismi_id = ent_data['id']
            if ismi_id in nx_nodes:
                # a duplicate id would corrupt the graph -- stop this type
                print("ERROR: entity with id=%s exists!"%ismi_id)
                return
            
            # create networkx node
            node = nodeFromEnt(ent_data, etype)
            
            # save node reference
            nx_nodes[ismi_id] = node
            
            # extract relations
            relsFromEnt(ent_data, ismi_relations)
        
        #if cnt >= 100:
        #    return


# In[119]:

def importAllEnts(etypes):
    """Import all entity types in *etypes*, then build all relations.

    Types listed in exclude_objects_of_type are skipped.
    """
    for etype in etypes:
        # honour the exclusion list
        if etype not in exclude_objects_of_type:
            importEnts(etype)

    # edges can only be created once all nodes exist
    relationsFromRels(ismi_relations, nx_nodes)


## main

print("Copy graph from OpenMind to networkx pickle")

# parse command line parameters: optional output filename
if len(sys.argv) > 1:
    output_fn = sys.argv[1]

# import everything
print("Reading graph from OpenMind at %s"%baseURL)
if exclude_objects_of_type:
    print("  Skipping objects of type %s"%exclude_objects_of_type)
    
importAllEnts(ismi_defs)
#importAllEnts(['TEXT'])

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)