view importFromOpenMind/importer/ismixml_splitter.py @ 60:1b520696760a default tip

new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author casties
date Tue, 19 Jun 2018 21:46:49 +0200
parents
children
line wrap: on
line source

import xml.etree.ElementTree as ET
import sys

# output filename pattern; the '%s' placeholder is replaced with the
# lower-cased object class when exportAll() writes the files
# (can be overridden by the second command line argument)
output_fn = 'openmind-data-%s.xml'

# input filename (can be overridden by the first command line argument)
input_fn = 'openmind-data.xml'

# dict mapping each object class to its output DOM record, a dict with the
# keys 'tree', 'root', 'ents_elem', 'rels_elem', 'ent_cnt' and 'rel_cnt'
# (records are created by startOutputDom())
output_doms = {}

# object classes to leave out of the output: entities with a listed
# object-class and relations with a listed source-class are skipped
# exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
exclude_objects_of_type = []

# active log levels: log() only prints messages whose level is in this set
# logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
# logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with its level, unless the level is inactive.

    A level is active when it is a member of the module-level logLevels
    set; messages for any other level are dropped silently.
    """
    if level not in logLevels:
        return
    print("%s: %s" % (level, message))


def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Create an empty output XML DOM for objects of type ent_type.

    The new tree consists of a root element with two children, an entity
    container followed by a relation container. Each of the three elements
    takes its tag and attributes from the corresponding input element
    (root, ent_elem, rel_elem); no child elements are carried over.

    The result is stored in output_doms[ent_type], replacing any existing
    record for that type, as a dict with the keys:
      'tree'      -- the new ElementTree
      'root'      -- its root element
      'ents_elem' -- the (empty) entity container
      'rels_elem' -- the (empty) relation container
      'ent_cnt'   -- number of entities added so far (starts at 0)
      'rel_cnt'   -- number of relations added so far (starts at 0)
    """
    # create new root element
    new_root = ET.Element(root.tag, root.attrib)
    # wrap it in a tree via the public constructor (not the private _setroot)
    tree = ET.ElementTree(new_root)
    # create new entity container element
    new_ent_elem = ET.SubElement(new_root, ent_elem.tag, ent_elem.attrib)
    # create new relation container element
    new_rel_elem = ET.SubElement(new_root, rel_elem.tag, rel_elem.attrib)
    # save element references and counters
    output_doms[ent_type] = {
        'tree': tree,
        'root': new_root,
        'ent_cnt': 0,
        'rel_cnt': 0,
        'ents_elem': new_ent_elem,
        'rels_elem': new_rel_elem,
    }


def importEnts(root, ents_elem, rels_elem):
    """Sort all entities in ents_elem into the per-type output DOMs.

    Every child of ents_elem is appended to the entity container of the
    output DOM for its 'object-class' attribute, and that DOM's 'ent_cnt'
    is incremented. An output DOM is created on first use with
    startOutputDom(), which is why root and rels_elem are passed in as
    well. Entities whose class is listed in exclude_objects_of_type are
    skipped.

    Entities without an 'object-class' attribute are reported at ERROR
    level and skipped: exportAll() builds the file name from the class
    name and would fail on a None key.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    # iterate through the children of the entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc is None:
            # no class means no output file to put this entity into
            log('ERROR', "entity without object-class skipped: %s"
                % (ent_elem.attrib,))
            continue

        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        if oc not in output_doms:
            # first entity of this type: create new output dom
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1


def importRels(rels_elem):
    """Sort all relations in rels_elem into the per-type output DOMs.

    Every child of rels_elem is appended to the relation container of the
    output DOM for its 'source-class' attribute, and that DOM's 'rel_cnt'
    is incremented. Relations whose source class is listed in
    exclude_objects_of_type are skipped. Relations whose source class has
    no output DOM (no entity of that class was imported) are reported at
    ERROR level and skipped, so importEnts() has to run first.
    """
    xml_num = rels_elem.get('count')
    log('INFO', "XML says %s relations. Processing..." % xml_num)

    # iterate through the children of the relations element
    for rel_elem in rels_elem:
        oc = rel_elem.get('source-class')
        if oc in exclude_objects_of_type:
            # skip this relation
            continue

        if oc not in output_doms:
            log('ERROR', "relation source class unknown: %s" % oc)
            continue

        dom = output_doms[oc]
        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1
        

def importAll():
    """Parse the input XML file and sort its contents into output_doms.

    Reads the file named by input_fn, looks up the 'entities' and
    'relations' children of the root element and hands them to
    importEnts() and importRels(), in that order. Errors raised by
    ET.parse() are not caught here.

    Raises:
        ValueError: if the root element has no 'entities' or no
            'relations' child.
    """
    # parse XML file
    log('INFO', "parsing XML file %s" % input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()
    ents = root.find('entities')
    rels = root.find('relations')
    # find() returns None for a missing child; fail here with a clear
    # message instead of an AttributeError inside the import functions
    for name, elem in (('entities', ents), ('relations', rels)):
        if elem is None:
            raise ValueError("%s: root element <%s> has no <%s> child"
                             % (input_fn, root.tag, name))
    # import and process
    importEnts(root, ents, rels)
    importRels(rels)

  
def exportAll():
    """Write one XML file per object class collected in output_doms.

    For each record the 'count' attribute of the entity and relation
    containers is set to the counted number of elements, then the tree
    is written as UTF-8 to the file named by output_fn with the
    lower-cased class name filled in. One INFO line is logged per file.
    """
    for obj_class, dom in output_doms.items():
        # update counts
        n_ents = dom['ent_cnt']
        dom['ents_elem'].set('count', str(n_ents))
        n_rels = dom['rel_cnt']
        dom['rels_elem'].set('count', str(n_rels))
        # save tree
        out_name = output_fn % (obj_class.lower())
        dom['tree'].write(out_name, encoding='utf-8')
        log('INFO', "writing XML file %s (%s ents, %s rels)" % (out_name, n_ents, n_rels))


# main: script entry point, runs on execution of the file
print("Split OpenMind-XML into per-object XML files.")

# optional command line parameters: [input file [output file pattern]]
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]
    if len(sys.argv) >= 3:
        output_fn = sys.argv[2]

# read the input and sort it into the per-type DOMs
print("Reading OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print("  Skipping objects of type %s" % exclude_objects_of_type)

importAll()
# write the per-type files
exportAll()

print("Done.")