# HG changeset patch
# User casties
# Date 1529437609 -7200
# Node ID 1b520696760a35244cd1c7160142e6104e59c183
# Parent adfb57978a696c3de53e2b478d6c390bca2d3b12
# new ismixml_splitter.py that splits openmind-data.xml into separate files
# per entity type.
# (added by the changeset as importFromOpenMind/importer/ismixml_splitter.py)
"""Split an OpenMind XML file into one XML file per entity type.

Usage: ismixml_splitter.py [input_fn [output_fn_pattern]]

The input file is expected to have an 'entities' and a 'relations' element
directly below the root. Every entity is appended to the output document of
its 'object-class'; every relation is appended to the output document of its
'source-class'. The output filename pattern must contain one '%s', which is
replaced by the lower-cased type name.
"""
import xml.etree.ElementTree as ET
import sys

# output filename pattern
output_fn = 'openmind-data-%s.xml'

# input filename
input_fn = 'openmind-data.xml'

# dict of types and doms
output_doms = {}

# node types to exclude from the graph
# exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
exclude_objects_of_type = []

# active log levels for logging
# logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
# logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level if level is in logLevels."""
    if level in logLevels:
        print("%s: %s" % (level, message))


def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Creates XML DOM for type ent_type.
    Puts the DOM into output_doms.

    The new DOM gets a root element with the tag and attributes of root and
    two empty children with the tag and attributes of ent_elem (entity
    container) and rel_elem (relation container).
    """
    # create new root element and the tree around it
    new_root = ET.Element(root.tag, root.attrib)
    tree = ET.ElementTree(new_root)
    # create new entity container element
    new_ent_elem = ET.SubElement(new_root, ent_elem.tag, ent_elem.attrib)
    # create new relation container element
    new_rel_elem = ET.SubElement(new_root, rel_elem.tag, rel_elem.attrib)
    # save element references and reset counters
    output_doms[ent_type] = {'tree': tree, 'root': new_root, 'ent_cnt': 0, 'rel_cnt': 0,
                             'ents_elem': new_ent_elem, 'rels_elem': new_rel_elem}


def importEnts(root, ents_elem, rels_elem):
    """Import all entities from etree element ents_elem.

    Each child of ents_elem is appended to the entity container of the
    output DOM for its 'object-class' attribute; the DOM is created on first
    use (root and rels_elem are only used as templates for that).
    Children whose class is in exclude_objects_of_type are skipped.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc is None:
            # a None key would break the filename creation in exportAll
            log('ERROR', "entity without object-class attribute skipped")
            continue

        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        if oc not in output_doms:
            # create new output dom
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1


def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    Each child of rels_elem is appended to the relation container of the
    output DOM for its 'source-class' attribute. No DOMs are created here,
    so importEnts has to run first; relations whose source class has no DOM
    are logged as ERROR and skipped.
    """
    xml_num = rels_elem.get('count')
    log('INFO', "XML says %s relations. Processing..." % xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        oc = rel_elem.get('source-class')
        if oc in exclude_objects_of_type:
            # skip this relation
            continue

        if oc not in output_doms:
            log('ERROR', "relation source class unknown: %s"%oc)
            continue

        dom = output_doms[oc]
        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1


def importAll():
    """Parse input_fn and sort its entities and relations into output_doms.

    Raises ValueError if the root element has no 'entities' or no
    'relations' child.
    """
    # parse XML file
    log('INFO', "parsing XML file %s" % input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()
    ents = root.find('entities')
    rels = root.find('relations')
    if ents is None or rels is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError("%s: missing 'entities' or 'relations' element" % input_fn)

    # import and process
    importEnts(root, ents, rels)
    importRels(rels)


def exportAll():
    """Write all XML files"""
    for oc, dom in output_doms.items():
        # update counts to the number of elements actually put in this file
        ent_cnt = dom['ent_cnt']
        dom['ents_elem'].set('count', str(ent_cnt))
        rel_cnt = dom['rel_cnt']
        dom['rels_elem'].set('count', str(rel_cnt))
        # save tree
        fn = output_fn % (oc.lower())
        dom['tree'].write(fn, encoding='utf-8')
        log('INFO', "writing XML file %s (%s ents, %s rels)" % (fn, ent_cnt, rel_cnt))


def main():
    """Command line entry point: read parameters, import and export."""
    global input_fn, output_fn

    print("Split OpenMind-XML into per-object XML files.")

    # parse command line parameters
    if len(sys.argv) > 1:
        input_fn = sys.argv[1]

    if len(sys.argv) > 2:
        output_fn = sys.argv[2]

    # import everything
    print("Reading OpenMind-XML file %s" % input_fn)
    if exclude_objects_of_type:
        print(" Skipping objects of type %s" % exclude_objects_of_type)

    importAll()
    exportAll()

    print("Done.")


# main
if __name__ == '__main__':
    main()