Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/ismixml_splitter.py @ 60:1b520696760a default tip
new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author | casties |
---|---|
date | Tue, 19 Jun 2018 21:46:49 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
59:adfb57978a69 | 60:1b520696760a |
---|---|
1 import xml.etree.ElementTree as ET | |
2 import sys | |
3 | |
# Filename pattern for the generated files; the '%s' placeholder is filled
# with the lower-cased entity type when the files are written.
output_fn = "openmind-data-%s.xml"

# Name of the XML file that is read and split.
input_fn = "openmind-data.xml"

# Output DOMs that have been started so far, keyed by entity type.
output_doms = {}

# Entity types that are left out of the output, for example:
# exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
exclude_objects_of_type = []

# Levels that log() prints; messages with any other level are dropped.
# More verbose alternatives:
# logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
# logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {"INFO", "ERROR", "SYSMSG"}
21 | |
22 | |
def log(level, message):
    """Print message prefixed by level, provided the level is active.

    A level is active when it is contained in the module-level logLevels
    set; messages for any other level are silently discarded.
    """
    if level not in logLevels:
        return
    print("%s: %s" % (level, message))
26 | |
27 | |
def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Create an empty output XML DOM for entities of type ent_type.

    The new tree consists of a root element carrying the tag and attributes
    of root, plus two empty child containers carrying the tag and
    attributes of ent_elem (entity container) and rel_elem (relation
    container). No child elements of the inputs are taken over.

    The tree is stored in output_doms[ent_type] as a dict with the keys
    'tree', 'root', 'ents_elem', 'rels_elem' and the counters 'ent_cnt'
    and 'rel_cnt', both starting at 0. An existing entry for ent_type is
    replaced.
    """
    # new root element with the same tag and attributes as the input root
    new_root = ET.Element(root.tag, root.attrib)
    # build the tree around the new root using the public constructor
    tree = ET.ElementTree(new_root)
    # new (empty) entity container element
    new_ent_elem = ET.SubElement(new_root, ent_elem.tag, ent_elem.attrib)
    # new (empty) relation container element
    new_rel_elem = ET.SubElement(new_root, rel_elem.tag, rel_elem.attrib)
    # remember the tree, its containers and the running counts for this type
    output_doms[ent_type] = {
        'tree': tree,
        'root': new_root,
        'ent_cnt': 0,
        'rel_cnt': 0,
        'ents_elem': new_ent_elem,
        'rels_elem': new_rel_elem,
    }
44 | |
45 | |
def importEnts(root, ents_elem, rels_elem):
    """Sort all entities below ents_elem into the per-type output DOMs.

    Each child of ents_elem is appended to the entity container of the
    output DOM for its 'object-class' attribute; the DOM is created with
    startOutputDom(root, ents_elem, rels_elem, type) the first time a type
    is seen. Entities whose type is listed in exclude_objects_of_type are
    skipped. Entities without an 'object-class' attribute are reported at
    level ERROR and skipped, because no output file can be derived for
    them.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    # iterate through the children of the entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc is None:
            # a None key in output_doms would break the export, which
            # builds the file name from the lower-cased type
            log('ERROR', "entity without object-class: %s" % (ent_elem.attrib,))
            continue

        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        if oc not in output_doms:
            # first entity of this type: create its output dom
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1
69 | |
70 | |
def importRels(rels_elem):
    """Sort all relations below rels_elem into the per-type output DOMs.

    Each child of rels_elem is appended to the relation container of the
    output DOM for its 'source-class' attribute. Relations whose source
    class is listed in exclude_objects_of_type are skipped. Relations whose
    source class has no entry in output_doms are reported at level ERROR
    and skipped; no output DOM is created here, so the entities have to be
    imported before the relations.
    """
    xml_num = rels_elem.get('count')
    log('INFO', "XML says %s relations. Processing..." % xml_num)

    # iterate through the children of the relations element
    for rel_elem in rels_elem:
        oc = rel_elem.get('source-class')
        if oc in exclude_objects_of_type:
            # skip this relation
            continue

        if oc not in output_doms:
            log('ERROR', "relation source class unknown: %s"%oc)
            continue

        dom = output_doms[oc]
        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1
94 | |
95 | |
def importAll():
    """Parse the file input_fn and sort its contents into output_doms.

    The 'entities' and 'relations' children of the document root are
    handed to importEnts() and importRels(), in that order.

    Raises:
        ValueError: if the root element has no 'entities' or no
            'relations' child.
    """
    # parse XML file
    log('INFO', "parsing XML file %s" % input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()
    ents = root.find('entities')
    rels = root.find('relations')
    # find() returns None for a missing child; report that here instead of
    # failing later with an attribute error on None
    if ents is None or rels is None:
        raise ValueError(
            "%s: root element needs an 'entities' and a 'relations' child"
            % input_fn)
    # import and process
    importEnts(root, ents, rels)
    importRels(rels)
107 | |
108 | |
def exportAll():
    """Write one XML file for every entry in output_doms.

    For each type the 'count' attributes of the entity and relation
    containers are set to the collected counts, then the tree is written
    UTF-8 encoded to output_fn with the lower-cased type filled in. One
    INFO line is logged per file.
    """
    for oc, dom in output_doms.items():
        ent_cnt = dom['ent_cnt']
        rel_cnt = dom['rel_cnt']
        # store the real numbers of elements in the containers
        dom['ents_elem'].set('count', str(ent_cnt))
        dom['rels_elem'].set('count', str(rel_cnt))
        # write the tree to its own file
        fn = output_fn % (oc.lower())
        dom['tree'].write(fn, encoding='utf-8')
        log('INFO', "writing XML file %s (%s ents, %s rels)" % (fn, ent_cnt, rel_cnt))
124 | |
125 | |
# main
print("Split OpenMind-XML into per-object XML files.")

# optional command line parameters: [input file [output filename pattern]]
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]
    # The pattern is later filled with a single string (the entity type).
    # Check that now, so a bad pattern is reported before any file is
    # parsed rather than when the first output file is written.
    try:
        _ = output_fn % 'type'
    except (TypeError, ValueError):
        sys.exit("Output filename pattern needs a single %%s placeholder: %s"
                 % (output_fn,))

# import everything
print("Reading OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s" % exclude_objects_of_type)

importAll()
exportAll()

print("Done.")