Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismixml_splitter.py @ 60:1b520696760a default tip
new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author | casties |
---|---|
date | Tue, 19 Jun 2018 21:46:49 +0200 |
parents | |
children |
line wrap: on
line source
"""Split an OpenMind XML file into separate XML files per entity type.

Usage: ismixml_splitter.py [input_file [output_pattern]]

output_pattern must contain one '%s', which is replaced by the lower-cased
entity type (the value of the 'object-class' attribute).
"""
import xml.etree.ElementTree as ET
import sys

# output filename pattern ('%s' is replaced by the lower-cased entity type)
output_fn = 'openmind-data-%s.xml'
# input filename
input_fn = 'openmind-data.xml'

# dict mapping each entity type to its output record (see startOutputDom)
output_doms = {}

# node types to exclude from the output
# exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
exclude_objects_of_type = []

# active log levels for logging
# logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
# logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level if level is in logLevels."""
    if level in logLevels:
        print("%s: %s" % (level, message))


def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Create an empty output XML DOM for type ent_type.

    The new tree consists of a root element and two empty container
    elements, created with the tags and attributes of root, ent_elem and
    rel_elem respectively.

    Puts a record with the tree, its elements and zeroed counters into
    output_doms[ent_type].
    """
    # create new root element and a tree around it
    new_root = ET.Element(root.tag, root.attrib)
    tree = ET.ElementTree(new_root)
    # create new entity container element
    new_ent_elem = ET.SubElement(new_root, ent_elem.tag, ent_elem.attrib)
    # create new relation container element
    new_rel_elem = ET.SubElement(new_root, rel_elem.tag, rel_elem.attrib)
    # save element references
    output_doms[ent_type] = {
        'tree': tree,
        'root': new_root,
        'ent_cnt': 0,
        'rel_cnt': 0,
        'ents_elem': new_ent_elem,
        'rels_elem': new_rel_elem,
    }


def importEnts(root, ents_elem, rels_elem):
    """Distribute all entities in ents_elem to the per-type output DOMs.

    The type of an entity is the value of its 'object-class' attribute.
    An output DOM is created (using root, ents_elem and rels_elem as
    templates) the first time a type is seen. Entities whose type is in
    exclude_objects_of_type are skipped silently; entities without an
    'object-class' attribute are skipped with an ERROR log message.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc is None:
            # an untyped entity cannot be assigned to an output file
            log('ERROR', "entity without object-class skipped: %s"
                % (ent_elem.attrib,))
            continue

        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        if oc not in output_doms:
            # create new output dom
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1


def importRels(rels_elem):
    """Distribute all relations in rels_elem to the per-type output DOMs.

    A relation goes into the output DOM of its 'source-class' attribute.
    Relations whose source class is in exclude_objects_of_type are skipped
    silently; relations whose source class has no output DOM (i.e. no
    entity of that type was imported) are skipped with an ERROR log message.
    """
    xml_num = rels_elem.get('count')
    log('INFO', "XML says %s relations. Processing..." % xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        oc = rel_elem.get('source-class')
        if oc in exclude_objects_of_type:
            # skip this relation
            continue

        if oc not in output_doms:
            log('ERROR', "relation source class unknown: %s" % oc)
            continue

        dom = output_doms[oc]
        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1


def importAll():
    """Parse the file input_fn and fill output_doms from its contents.

    Raises ValueError if the root element has no 'entities' or no
    'relations' child element.
    """
    # parse XML file
    log('INFO', "parsing XML file %s" % input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")

    root = tree.getroot()
    ents = root.find('entities')
    rels = root.find('relations')
    if ents is None or rels is None:
        raise ValueError(
            "XML file %s needs 'entities' and 'relations' elements below "
            "the root element" % input_fn)

    # import and process
    importEnts(root, ents, rels)
    importRels(rels)


def exportAll():
    """Write one XML file per entity type in output_doms.

    The 'count' attributes of the entity and relation container elements
    are set to the number of elements collected for that type.
    """
    for oc, dom in output_doms.items():
        # update counts
        ent_cnt = dom['ent_cnt']
        rel_cnt = dom['rel_cnt']
        dom['ents_elem'].set('count', str(ent_cnt))
        dom['rels_elem'].set('count', str(rel_cnt))
        # save tree
        fn = output_fn % (oc.lower(),)
        log('INFO', "writing XML file %s (%s ents, %s rels)"
            % (fn, ent_cnt, rel_cnt))
        dom['tree'].write(fn, encoding='utf-8')


# main
if __name__ == '__main__':
    print("Split OpenMind-XML into per-object XML files.")

    # parse command line parameters
    if len(sys.argv) > 1:
        input_fn = sys.argv[1]

    if len(sys.argv) > 2:
        output_fn = sys.argv[2]

    # import everything
    print("Reading OpenMind-XML file %s" % input_fn)
    if exclude_objects_of_type:
        print(" Skipping objects of type %s" % exclude_objects_of_type)

    importAll()
    exportAll()

    print("Done.")