comparison importFromOpenMind/importer/ismixml_splitter.py @ 60:1b520696760a default tip

new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author casties
date Tue, 19 Jun 2018 21:46:49 +0200
parents
children
comparison
equal deleted inserted replaced
59:adfb57978a69 60:1b520696760a
1 import xml.etree.ElementTree as ET
2 import sys
3
# Template for the names of the generated files; the '%s' placeholder is
# filled with the lower-cased object class. The second command line
# argument overrides it.
output_fn = "openmind-data-%s.xml"

# Default file to read; the first command line argument overrides it.
input_fn = "openmind-data.xml"

# Maps each object class to the record of its output document
# (tree, root element, container elements and counters).
output_doms = dict()

# Object classes that are left out of the output,
# e.g. ['DIGITALIZATION', 'REFERENCE'].
exclude_objects_of_type = list()

# Levels that log() actually prints; add 'DEBUG' and/or 'WARNING'
# to this set for more verbose output.
logLevels = {"INFO", "ERROR", "SYSMSG"}
21
22
def log(level, message):
    """Print message prefixed with its level, if that level is active.

    Messages whose level is not contained in the module-level logLevels
    set are dropped without output.
    """
    if level not in logLevels:
        return
    print("%s: %s" % (level, message))
26
27
def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Set up an empty output document for the type ent_type.

    The document consists of a root element with two children, one
    container for entities and one for relations. Each new element takes
    its tag and attributes from the matching source element. The document
    is registered in output_doms under ent_type with both counters at zero.
    """
    # root element modelled on the source root
    out_root = ET.Element(root.tag, root.attrib)
    # container that will receive the entities of this type
    out_ents = ET.SubElement(out_root, ent_elem.tag, ent_elem.attrib)
    # container that will receive the relations of this type
    out_rels = ET.SubElement(out_root, rel_elem.tag, rel_elem.attrib)
    # wrap the root in a tree so the document can be written out later
    out_tree = ET.ElementTree(out_root)
    # remember everything needed to fill and write the document
    output_doms[ent_type] = {
        'tree': out_tree,
        'root': out_root,
        'ent_cnt': 0,
        'rel_cnt': 0,
        'ents_elem': out_ents,
        'rels_elem': out_rels,
    }
44
45
def importEnts(root, ents_elem, rels_elem):
    """Sort all entities below ents_elem into per-type output documents.

    Every child of ents_elem is appended to the entity container of the
    output document for its 'object-class' attribute. The output document
    is created through startOutputDom() the first time a class is seen.

    Args:
        root: root element of the input document, used as template for
            the root of new output documents.
        ents_elem: container element whose children are the entities.
        rels_elem: container element of the relations, used as template
            for the relation container of new output documents.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    for pos, ent_elem in enumerate(ents_elem, 1):
        oc = ent_elem.get('object-class')
        if oc is None:
            # Without this check the entity would open an output document
            # keyed None, and exportAll() would then fail on oc.lower()
            # while building the file name.
            log('ERROR', "entity #%s has no object-class, skipping" % pos)
            continue

        if oc in exclude_objects_of_type:
            # this type is configured to be left out
            continue

        if oc not in output_doms:
            # first entity of this type: create its output document
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1
69
70
def importRels(rels_elem):
    """Distribute all relations below rels_elem to the output documents.

    A relation is appended to the relation container of the document that
    belongs to its 'source-class' attribute. Relations of excluded classes
    are dropped without a message; relations whose source class has no
    output document are reported as ERROR and dropped.
    """
    log('INFO', "XML says %s relations. Processing..." % rels_elem.get('count'))

    for rel_elem in rels_elem:
        src_class = rel_elem.get('source-class')
        if src_class in exclude_objects_of_type:
            # relations of excluded types are left out as well
            continue

        dom = output_doms.get(src_class)
        if dom is None:
            # no entity of this class was seen, so there is no document
            log('ERROR', "relation source class unknown: %s"%src_class)
            continue

        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1
94
95
def importAll():
    """Parse the input file and sort its content into output_doms.

    Reads the file named by input_fn, looks up the 'entities' and
    'relations' children of the root element and passes them on to
    importEnts() and importRels().
    """
    log('INFO', "parsing XML file %s" % input_fn)
    doc_root = ET.parse(input_fn).getroot()
    log('DEBUG', "etree ready")

    entities = doc_root.find('entities')
    relations = doc_root.find('relations')

    # Entities go first: importRels() only accepts source classes that
    # already have an output document.
    importEnts(doc_root, entities, relations)
    importRels(relations)
107
108
def exportAll():
    """Write one XML file for every object class held in output_doms.

    Before a document is written, the 'count' attributes of its entity and
    relation containers are set to the numbers actually collected. The
    file name is output_fn with the lower-cased object class filled in.
    """
    for obj_class, dom in output_doms.items():
        num_ents = dom['ent_cnt']
        num_rels = dom['rel_cnt']
        # make the count attributes match the collected content
        dom['ents_elem'].set('count', str(num_ents))
        dom['rels_elem'].set('count', str(num_rels))
        # write the document to its own file
        out_name = output_fn % (obj_class.lower())
        dom['tree'].write(out_name, encoding='utf-8')
        log('INFO', "writing XML file %s (%s ents, %s rels)" % (out_name, num_ents, num_rels))
124
125
# main
print("Split OpenMind-XML into per-object XML files.")

# optional command line parameters: [input file [output file pattern]]
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]
    if len(sys.argv) >= 3:
        output_fn = sys.argv[2]

# read the input and sort it by object class
print("Reading OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s" % exclude_objects_of_type)

importAll()
# write the per-class files
exportAll()

print("Done.")