# view importFromOpenMind/importer/ismixml2model.py @ 48:6625019a0c96
#
# Old model2neo4j renamed to model2neo4j_restclient. New model2neo4j_client and model2neo4j_import. Fixed ismixml2model and compare_models.
# author: casties
# date: Tue, 07 Feb 2017 21:06:13 +0100
# parents: 378dcb66a27f
# children: 5a633e875490
# line wrap: on
# line source

import xml.etree.ElementTree as ET
import json
import networkx
import sys

## configure behaviour

# default name of the OpenMind XML file to read
# (replaced by the first command line argument, if one is given)
input_fn = "openmind-data.xml"

# default name of the networkx pickle file to write
# (replaced by the second command line argument, if one is given)
output_fn = "ismi_graph.gpickle"


# entities with one of these object-classes are not imported
exclude_objects_of_type = [
    'DIGITALIZATION',
    'REFERENCE',
]

# attributes with one of these names are not copied to nodes or edges
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute key under which the type of a node is stored
node_type_attribute = '_type'
# attribute key under which the type of a relation is stored
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]


# lookup table: ismi_id of an imported entity -> its node data
nx_nodes = {}
# lookup table: id of a processed relation -> value returned for its edge
nx_relations = {}
# not filled anywhere in the visible part of this file
ismi_relations = {}

# the graph that is built up during the import and pickled at the end
nx_graph = networkx.MultiDiGraph()

# log levels that are printed; messages of any other level are dropped
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message, prefixed by level, if level is an active log level.

    level is compared against the module-level set logLevels; a message
    whose level is not in that set produces no output.
    """
    if level not in logLevels:
        return

    print("%s: %s"%(level, message))


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use in the graph.

    is_src_rel   -- currently leaves the name unchanged
    is_tar_rel   -- prefix the name with '<'
    att_from_rel -- remove every occurrence of 'is_', 'has_', 'was_' and
                    '_of' (in that order) so that a relation name can be
                    used as an attribute name

    Without any flag the name is returned as it was passed in.
    """
    # is_src_rel: nothing to do (a '>' suffix is disabled)
    #name = name + '>'
    fixed = ('<' + name) if is_tar_rel else name

    if att_from_rel:
        # clean up relations as attribute names
        for token in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(token, '')

    return fixed



def parseYear(val):
    """Extract the year from a JSON encoded date value.

    val is expected to be the JSON text of an object that has a 'from' or
    a 'date' member ('from' is preferred), which itself is an object with
    a 'year' member.

    Returns the value of that 'year' member, or None if val is not valid
    JSON, does not have the expected structure or has no year.
    """
    try:
        date_json = json.loads(val)
    except (TypeError, ValueError):
        # val is not a string or not valid JSON: this is expected for plain
        # text values, so it is deliberately not reported
        return None

    if not isinstance(date_json, dict):
        log("WARNING", "don't know what to do with date %s"%(val))
        return None

    if 'from' in date_json:
        part = date_json['from']
    elif 'date' in date_json:
        part = date_json['date']
    else:
        log("WARNING", "don't know what to do with date %s"%(val))
        return None

    if not isinstance(part, dict):
        # e.g. null or a plain string instead of a date object
        return None

    return part.get('year', None)


def _attrsFromElem(atts_elem):
    """Convert the children of an 'attributes' element to a dict.

    Every child is handled according to its 'content-type':

    - none or a text-like type: the text is kept; if it starts with '{'
      and parseYear finds a year in it, the year is kept instead
    - 'date': the year found by parseYear is kept, if there is one
    - 'num': the text converted to int is kept
    - 'old' and unknown types: ignored

    Children without text, without name or with a name listed in
    exclude_attributes_of_type are left out.

    Returns a dict that maps attribute names to values.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')
    attrs = {}

    for att_elem in atts_elem:
        name = att_elem.get('name', None)
        if name is None:
            # a None key can not be stored: it would make the **attrs
            # expansion in add_node/add_edge fail
            log("WARNING", "attribute without name: %s"%repr(att_elem))
            continue

        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        ct = att_elem.get('content-type', None)
        val = att_elem.text

        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            if val is None:
                continue

            # startswith is also safe for an empty string
            if val.startswith('{'):
                # try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            attrs[name] = val

        elif ct == 'date':
            # date attribute: try to parse date object to get the year
            if val is not None:
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            # number attribute, assume num is int
            if val is not None:
                try:
                    attrs[name] = int(val)
                except ValueError:
                    # one malformed number should not abort the whole import
                    log("ERROR", "attribute %s has non-integer num value %s"%(name, repr(val)))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # ignore other content types
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    return attrs


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    The node is added to nx_graph under the entity's id (as int). Its
    attributes are the entity's XML attributes plus the type (from
    'object-class'), 'ismi_id' and, if the entity has text, 'label'.

    Returns the attribute dict of the new node.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)
        attrs = {}

    else:
        if atts_elem.tail is not None:
            # tail belongs to parent
            ov += atts_elem.tail

        attrs = _attrsFromElem(atts_elem)

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node looks like the networkx 1.x accessor; newer
    # releases seem to offer only Graph.nodes[...] -- confirm target version
    node = nx_graph.node[ismi_id]

    return node


def relationFromRel(rel_elem):
    """Create a graph edge from the given XML relation.

    The edge runs from the node 'source-id' to the node 'target-id'. Its
    attributes are the relation's XML attributes plus the type (from
    'object-class') and 'ismi_id'. The text content of the relation is
    not stored.

    Returns None without creating an edge if one of the two nodes is not
    in nx_nodes, otherwise the return value of add_edge.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # get attributes element
    atts_elem = rel_elem.find('attributes')
    if atts_elem is None:
        attrs = {}
    else:
        attrs = _attrsFromElem(atts_elem)

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type; the attributes are passed as keywords like
    # in add_node, because the attr_dict parameter is specific to networkx 1.x
    # NOTE(review): an attribute named like an add_edge parameter (e.g.
    # 'key') would be taken as that parameter -- confirm none exists
    nx_rel = nx_graph.add_edge(src_id, tar_id, **attrs)

    return nx_rel


def importEnts(ents_elem):
    """Create a graph node for every entity below ents_elem.

    Entities whose 'object-class' is listed in exclude_objects_of_type
    are skipped. Every created node is also registered in nx_nodes under
    its id. If an id shows up twice an error is logged and the import of
    entities stops at that point.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # cnt is the 1-based position of the entity (only used for debugging)
    for cnt, ent_elem in enumerate(ents_elem, 1):
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and keep a reference to it
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)

        # debug
        #if cnt >= 100:
        #    return


def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    Every child of rels_elem is passed to relationFromRel and the result
    is stored in nx_relations under the relation's id (the result is None
    for relations that could not be created). If an id shows up twice an
    error is logged and the import of relations stops at that point.
    """
    cnt = 0
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        cnt += 1

        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # reported through log like the duplicate check for entities
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation

        # debug
        #if cnt >= 100:
        #    return


def importAll():
    """Parse the XML file input_fn and import its entities and relations.

    The entities are imported first, because relations are only created
    between nodes that already exist.

    Raises ValueError if the root element has no 'entities' or no
    'relations' child.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    ents = root.find('entities')
    rels = root.find('relations')
    # find() returns None for a missing child; check before the import
    # starts so that the problem is reported with a clear message
    # ("is None" on purpose: an element without children is falsy)
    if ents is None:
        raise ValueError("no 'entities' element in XML file %s"%input_fn)

    if rels is None:
        raise ValueError("no 'relations' element in XML file %s"%input_fn)

    importEnts(ents)
    importRels(rels)

## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: [input file [output file]]
if sys.argv[1:]:
    input_fn = sys.argv[1]

if sys.argv[2:]:
    output_fn = sys.argv[2]

# read the XML file into nx_graph
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print("  Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# write the graph as pickle file
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)