changeset 60:1b520696760a default tip

new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author casties
date Tue, 19 Jun 2018 21:46:49 +0200
parents adfb57978a69
children
files importFromOpenMind/importer/ismixml_splitter.py
diffstat 1 files changed, 144 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismixml_splitter.py	Tue Jun 19 21:46:49 2018 +0200
@@ -0,0 +1,144 @@
+import xml.etree.ElementTree as ET
+import sys
+
+# Output filename pattern. The '%s' placeholder is replaced by the lower-cased
+# object class of each entity type when the files are written (see exportAll).
+# Can be overridden by the second command line argument.
+output_fn = 'openmind-data-%s.xml'
+
+# Input filename. Can be overridden by the first command line argument.
+input_fn = 'openmind-data.xml'
+
+# Output DOMs by object class. Each value is a dict with the keys
+# 'tree', 'root', 'ents_elem', 'rels_elem', 'ent_cnt' and 'rel_cnt'
+# (filled in by startOutputDom).
+output_doms = {}
+
+# Object classes to exclude from the output. Entities with one of these
+# object classes and relations with one of these source classes are skipped.
+# exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
+exclude_objects_of_type = []
+
+# Active log levels: log() only prints messages with one of these levels.
+# logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+# logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+logLevels = {'INFO', 'ERROR', 'SYSMSG'}
+
+
+def log(level, message):
+    """Print message prefixed with its level if the level is active.
+
+    Messages whose level is not in logLevels are dropped.
+    """
+    if level not in logLevels:
+        return
+    print("%s: %s" % (level, message))
+
+
+def startOutputDom(root, ent_elem, rel_elem, ent_type):
+    """Create an empty output XML DOM for the entity type ent_type.
+
+    The new DOM gets a root element with two children, an entity container
+    and a relation container, that carry the same tags and attributes as
+    root, ent_elem and rel_elem. The DOM and its counters are stored in
+    output_doms[ent_type].
+    """
+    # fresh root with the same tag and attributes as the input root
+    out_root = ET.Element(root.tag, root.attrib)
+    # empty container for the entities of this type
+    ents_container = ET.SubElement(out_root, ent_elem.tag, ent_elem.attrib)
+    # empty container for the relations of this type
+    rels_container = ET.SubElement(out_root, rel_elem.tag, rel_elem.attrib)
+    # remember the tree, its containers and the running counts
+    output_doms[ent_type] = {
+        'tree': ET.ElementTree(out_root),
+        'root': out_root,
+        'ent_cnt': 0,
+        'rel_cnt': 0,
+        'ents_elem': ents_container,
+        'rels_elem': rels_container,
+    }
+
+
+def importEnts(root, ents_elem, rels_elem):
+    """Sort all entities in ents_elem into the output DOMs by object class.
+
+    root, ents_elem and rels_elem are the root, entity container and
+    relation container elements of the input document. They are passed to
+    startOutputDom as templates when the first entity of a new object class
+    is seen.
+
+    Entities whose object class is in exclude_objects_of_type are skipped.
+    Entities without an 'object-class' attribute are skipped with an ERROR
+    log message.
+    """
+    xml_num = ents_elem.get('count')
+    log('INFO', "XML says %s entities. Processing..." % xml_num)
+
+    # iterate through entities element
+    for ent_elem in ents_elem:
+        oc = ent_elem.get('object-class')
+        if oc in exclude_objects_of_type:
+            # skip this entity
+            continue
+
+        if oc is None:
+            # without a class there is no output file to put the entity in;
+            # a None key in output_doms would break the export
+            log('ERROR', "entity without object-class skipped: %s" % ent_elem.attrib)
+            continue
+
+        if oc not in output_doms:
+            # first entity of this class: create new output dom
+            startOutputDom(root, ents_elem, rels_elem, oc)
+
+        dom = output_doms[oc]
+        dom['ents_elem'].append(ent_elem)
+        dom['ent_cnt'] += 1
+
+
+def importRels(rels_elem):
+    """Sort all relations in rels_elem into the output DOMs by source class.
+
+    Each relation is appended to the output DOM of its 'source-class'.
+    Relations whose source class is in exclude_objects_of_type are skipped.
+    Relations whose source class has no output DOM are skipped with an
+    ERROR log message; output DOMs are only created while entities are
+    imported, so this has to run after importEnts.
+    """
+    xml_num = rels_elem.get('count')
+    log('INFO', "XML says %s relations. Processing..." % xml_num)
+
+    # iterate through relations element
+    for rel_elem in rels_elem:
+        oc = rel_elem.get('source-class')
+        if oc in exclude_objects_of_type:
+            # skip this relation
+            continue
+
+        if oc not in output_doms:
+            log('ERROR', "relation source class unknown: %s"%oc)
+            continue
+
+        dom = output_doms[oc]
+        dom['rels_elem'].append(rel_elem)
+        dom['rel_cnt'] += 1
+
+        
+
+def importAll():
+    """Parse the input XML file and sort its contents into the output DOMs.
+
+    Reads the file named by input_fn and passes the 'entities' and
+    'relations' children of its root element to importEnts and importRels.
+
+    Raises ValueError if the root element has no 'entities' or no
+    'relations' child.
+    """
+    # parse XML file
+    log('INFO', "parsing XML file %s" % input_fn)
+    tree = ET.parse(input_fn)
+    log('DEBUG', "etree ready")
+    root = tree.getroot()
+    ents = root.find('entities')
+    rels = root.find('relations')
+    # find() returns None for a missing element; fail with a clear message
+    # here instead of an AttributeError later. Compare with "is None"
+    # because an element without children is falsy.
+    if ents is None:
+        raise ValueError("no 'entities' element found in %s" % input_fn)
+    if rels is None:
+        raise ValueError("no 'relations' element found in %s" % input_fn)
+    # entities first: they create the output DOMs that the relations need
+    importEnts(root, ents, rels)
+    importRels(rels)
+
+  
+def exportAll():
+    """Write one XML file per object class in output_doms.
+
+    Sets the 'count' attributes of the entity and relation containers to
+    the numbers of collected elements and writes each tree UTF-8 encoded
+    to the file named by output_fn with the lower-cased object class
+    filled in.
+
+    An output DOM stored under the key None (elements without an object
+    class) has no file name and is skipped with an ERROR log message.
+    """
+    for oc, dom in output_doms.items():
+        ent_cnt = dom['ent_cnt']
+        rel_cnt = dom['rel_cnt']
+        if oc is None:
+            # oc.lower() would raise and abort the export of all other types
+            log('ERROR', "skipping %s ents and %s rels without object class" % (ent_cnt, rel_cnt))
+            continue
+
+        # update counts
+        dom['ents_elem'].set('count', str(ent_cnt))
+        dom['rels_elem'].set('count', str(rel_cnt))
+        # save tree; log first so a failing write can be attributed to its file
+        fn = output_fn % (oc.lower())
+        log('INFO', "writing XML file %s (%s ents, %s rels)" % (fn, ent_cnt, rel_cnt))
+        dom['tree'].write(fn, encoding='utf-8')
+
+
+# main
+print("Split OpenMind-XML into per-object XML files.")
+
+# parse command line parameters: [input file [output filename pattern]]
+if len(sys.argv) > 1:
+    input_fn = sys.argv[1]
+
+if len(sys.argv) > 2:
+    output_fn = sys.argv[2]
+    # The pattern is filled with one object class name per output file.
+    # Reject an unusable pattern now instead of failing in the export
+    # after the whole input has been parsed.
+    try:
+        output_fn % 'test'
+    except (TypeError, ValueError):
+        sys.exit("ERROR: output filename pattern %r must contain a single '%%s' placeholder" % output_fn)
+
+# import everything
+print("Reading OpenMind-XML file %s" % input_fn)
+if exclude_objects_of_type:
+    print("  Skipping objects of type %s" % exclude_objects_of_type)
+
+importAll()
+exportAll()
+
+print("Done.")