annotate importFromOpenMind/importer/ismixml_splitter.py @ 60:1b520696760a default tip

new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
author casties
date Tue, 19 Jun 2018 21:46:49 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
60
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
1 import xml.etree.ElementTree as ET
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
2 import sys
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
3
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
# Output filename pattern; exportAll() fills '%s' with the lower-cased
# entity type. Can be overridden by the second command line argument.
output_fn = 'openmind-data-%s.xml'

# Input filename. Can be overridden by the first command line argument.
input_fn = 'openmind-data.xml'

# Maps entity type -> dict with that type's output tree, its container
# elements and its entity/relation counters (see startOutputDom()).
output_doms = {}

# Entity types to leave out of the split files,
# e.g. ['DIGITALIZATION', 'REFERENCE'].
exclude_objects_of_type = []

# Log levels that log() prints. Add 'DEBUG' and/or 'WARNING' for more
# verbose output.
logLevels = {'INFO', 'ERROR', 'SYSMSG'}
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
21
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
22
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def log(level, message):
    """Print message prefixed with its level if that level is enabled.

    level: log level name, e.g. 'INFO' or 'ERROR'.
    message: text to print.

    Messages whose level is not contained in the module-level logLevels
    set are dropped silently.
    """
    if level in logLevels:
        print("%s: %s" % (level, message))
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
26
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
27
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def startOutputDom(root, ent_elem, rel_elem, ent_type):
    """Create an empty output XML tree for entity type ent_type.

    The new tree consists of a root element with one entities and one
    relations container element below it. Each of the three elements is
    created with the tag and attributes of the matching input element;
    no children are copied.

    root: root element of the input document.
    ent_elem: entities container element of the input document.
    rel_elem: relations container element of the input document.
    ent_type: entity type, used as key in output_doms.

    Stores the tree, its elements and zeroed counters in
    output_doms[ent_type]; returns nothing.
    """
    # new root element with the tag and attributes of the input root
    new_root = ET.Element(root.tag, root.attrib)
    # wrap it in a tree using the public constructor rather than the
    # private ElementTree._setroot()
    tree = ET.ElementTree(new_root)
    # new (empty) entities container element
    new_ent_elem = ET.SubElement(new_root, ent_elem.tag, ent_elem.attrib)
    # new (empty) relations container element
    new_rel_elem = ET.SubElement(new_root, rel_elem.tag, rel_elem.attrib)
    # save element references and counters
    output_doms[ent_type] = {
        'tree': tree,
        'root': new_root,
        'ent_cnt': 0,
        'rel_cnt': 0,
        'ents_elem': new_ent_elem,
        'rels_elem': new_rel_elem,
    }
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
44
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
45
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def importEnts(root, ents_elem, rels_elem):
    """Sort all entities below ents_elem into the per-type output trees.

    root: root element of the input document.
    ents_elem: entities container element whose children are processed.
    rels_elem: relations container element of the input document.

    root, ents_elem and rels_elem are handed to startOutputDom() as
    templates whenever an entity type is seen for the first time.

    Entities whose 'object-class' is listed in exclude_objects_of_type are
    skipped. Entities without an 'object-class' attribute are logged as
    ERROR and skipped, because exportAll() builds the output filename from
    the type. Updates output_doms; returns nothing.
    """
    xml_num = ents_elem.get('count')
    log('INFO', "XML says %s entities. Processing..." % xml_num)

    # iterate through the children of the entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        if oc is None:
            # no type means no output file to put the entity into
            log('ERROR', "entity without object-class: %s" % ent_elem.attrib)
            continue

        if oc not in output_doms:
            # first entity of this type: create new output dom
            startOutputDom(root, ents_elem, rels_elem, oc)

        dom = output_doms[oc]
        dom['ents_elem'].append(ent_elem)
        dom['ent_cnt'] += 1
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
69
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
70
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def importRels(rels_elem):
    """Sort all relations below rels_elem into the per-type output trees.

    rels_elem: relations container element whose children are processed.

    Every relation is appended to the output tree of its 'source-class'.
    Relations whose source class is listed in exclude_objects_of_type are
    skipped; relations whose source class has no output tree in
    output_doms are logged as ERROR and skipped. Updates output_doms;
    returns nothing.
    """
    xml_num = rels_elem.get('count')
    log('INFO', "XML says %s relations. Processing..." % xml_num)

    # iterate through the children of the relations element
    for rel_elem in rels_elem:
        oc = rel_elem.get('source-class')
        if oc in exclude_objects_of_type:
            # skip this relation
            continue

        if oc not in output_doms:
            log('ERROR', "relation source class unknown: %s" % oc)
            continue

        dom = output_doms[oc]
        dom['rels_elem'].append(rel_elem)
        dom['rel_cnt'] += 1
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
94
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
95
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def importAll():
    """Parse input_fn and sort its entities and relations into output_doms.

    Raises ValueError if the document root has no 'entities' or no
    'relations' child element. Errors raised by ET.parse() for unreadable
    or malformed input are not caught here.
    """
    # parse XML file
    log('INFO', "parsing XML file %s" % input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()
    ents = root.find('entities')
    rels = root.find('relations')
    # find() returns None when nothing matches; compare with None explicitly
    # because an element without children tests false
    if ents is None or rels is None:
        raise ValueError(
            "%s: root element <%s> needs both an 'entities' and a "
            "'relations' child element" % (input_fn, root.tag))

    # import and process
    importEnts(root, ents, rels)
    importRels(rels)
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
107
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
108
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
def exportAll():
    """Write one XML file per entity type collected in output_doms.

    For every type the 'count' attributes of the entities and relations
    container elements are set to the numbers collected during import,
    then the tree is written UTF-8 encoded to output_fn with the
    lower-cased type name filled in.
    """
    for oc, dom in output_doms.items():
        ent_cnt = dom['ent_cnt']
        rel_cnt = dom['rel_cnt']
        # update counts
        dom['ents_elem'].set('count', str(ent_cnt))
        dom['rels_elem'].set('count', str(rel_cnt))
        # save tree
        fn = output_fn % (oc.lower())
        # log before writing so that a slow or failing write can be
        # attributed to its file
        log('INFO', "writing XML file %s (%s ents, %s rels)" % (fn, ent_cnt, rel_cnt))
        dom['tree'].write(fn, encoding='utf-8')
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
124
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
125
1b520696760a new ismixml_splitter.py that splits openmind-data.xml into separate files per entity type.
casties
parents:
diff changeset
# main
print("Split OpenMind-XML into per-object XML files.")

# parse command line parameters: [input file [output filename pattern]]
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]

# exportAll() formats the pattern with a single string; check that this
# works now instead of failing after the whole input has been parsed
try:
    _ = output_fn % 'type'
except (TypeError, ValueError):
    sys.exit("ERROR: output filename pattern %r needs a single '%%s' placeholder"
             % output_fn)

# import everything
print("Reading OpenMind-XML file %s" % input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s" % exclude_objects_of_type)

importAll()
exportAll()

print("Done.")