Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismixml2model.py @ 51:5a633e875490
also read normalized fields from xml.
author | casties |
---|---|
date | Fri, 03 Mar 2017 20:11:06 +0100 |
parents | 6625019a0c96 |
children | b9a6e596ebe4 |
rev | line source |
---|---|
46 | 1 import xml.etree.ElementTree as ET |
2 import json | |
3 import networkx | |
4 import sys | |
5 | |
## configure behaviour

# default output filename (replaced by the second command line argument, if given)
output_fn = "ismi_graph.gpickle"

# default input filename (replaced by the first command line argument, if given)
input_fn = "openmind-data.xml"

# node types to exclude from the graph
exclude_objects_of_type = [
    'DIGITALIZATION',
    'REFERENCE',
]

# attributes to exclude
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# name of the attribute holding the type of a node / of a relation
node_type_attribute = '_type'
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
30 | |
31 | |
# graph that receives every imported node and edge
nx_graph = networkx.MultiDiGraph()

# lookup tables: nx_nodes and nx_relations are filled by the import
# functions, keyed by the integer id of the entity / relation.
# NOTE(review): ismi_relations is not used anywhere in the visible code.
nx_nodes, ismi_relations, nx_relations = {}, {}, {}
37 | |
# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level, but only if level is active.

    A level is active when it is a member of the module-level logLevels
    set; messages of any other level are dropped silently.
    """
    if level not in logLevels:
        return
    print("%s: %s"%(level, message))
46 | |
47 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use as a type or attribute name.

    is_src_rel   -- accepted but has no effect (source relations stay unmarked).
    is_tar_rel   -- prefix the name with '<'.
    att_from_rel -- remove every occurrence of 'is_', 'has_', 'was_' and
                    '_of' (in that order) from the name.
    """
    # source relations are left as they are (no '>' suffix is added)

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name
64 | |
65 | |
66 | |
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is parsed as JSON. If the result has a 'from' member, the 'year'
    of that member is returned; otherwise, if it has a 'date' member, the
    'year' of that member is returned.

    Returns None if val is not valid JSON, if the parsed value does not
    have the expected structure, or if the chosen member has no 'year'.
    A value that parses but has neither 'from' nor 'date' is additionally
    reported at log level WARNING.
    """
    try:
        date_json = json.loads(val)
        for key in ('from', 'date'):
            if key in date_json:
                return date_json[key].get('year', None)

    except (TypeError, ValueError, AttributeError):
        # best effort: val is not a JSON string (TypeError/ValueError) or
        # the parsed value is not shaped like a date object.
        # Only these expected failures are swallowed, so that e.g.
        # KeyboardInterrupt or programming errors are no longer hidden.
        return None

    log("WARNING", "don't know what to do with date %s"%(val))
    return None
82 | |
83 | |
def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity element.

    Adds a node to the module-level nx_graph, keyed by the integer value
    of the entity's 'id', and returns the node's attribute dict.

    The node attributes are built from the children of the entity's
    'attributes' element, depending on their 'content-type':

    * none, or one of 'text', 'arabic', 'bool', 'boolean', 'url',
      'language', 'null' (case-insensitive): the text is kept; a text
      starting with '{' is replaced by its year if parseYear() finds one.
      The text of a 'norm' child is kept as '_n_' + name.
    * 'date': only the year found by parseYear() is kept.
    * 'num': the text is converted with int().
    * 'old': ignored.
    * anything else: ignored and reported at log level WARNING.

    Attributes named in exclude_attributes_of_type are skipped. In
    addition the node gets its 'object-class' as node_type_attribute, its
    id as 'ismi_id' and, if the entity has text, that text as 'label'.

    Raises ValueError if a 'num' attribute is not an integer, and
    whatever int() raises if the entity's 'id' is missing or malformed.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    else:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0] so that an empty string
                # can not raise IndexError
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    # NOTE(review): nesting of the norm lookup under this
                    # branch was reconstructed from a listing without
                    # indentation -- confirm against the repository.
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be "WARNING" (not "WARN") to match the levels
                # used in logLevels, otherwise this is never printed
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov
        # check for normalized value
        # NOTE(review): assumed to apply only to entities that have a
        # label (nesting reconstructed) -- confirm.
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is presumably the networkx 1.x accessor,
    # matching the attr_dict style add_edge() call elsewhere in this
    # file -- confirm the targeted networkx version before changing it.
    node = nx_graph.node[ismi_id]

    return node
178 | |
179 | |
def relationFromRel(rel_elem):
    """Create a graph edge from the given XML relation element.

    Adds an edge from the relation's 'source-id' to its 'target-id' to
    the module-level nx_graph. Both ids must already be keys of nx_nodes;
    otherwise a WARNING is logged and no edge is created.

    The edge attributes are built from the children of the relation's
    'attributes' element, depending on their 'content-type':

    * none, or one of 'text', 'arabic', 'bool', 'boolean', 'url',
      'language', 'null' (case-insensitive): the text is kept; a text
      starting with '{' is replaced by its year if parseYear() finds one.
      The text of a 'norm' child is kept as '_n_' + name.
    * 'date': only the year found by parseYear() is kept.
    * 'num': the text is converted with int().
    * 'old': ignored.
    * anything else: ignored and reported at log level WARNING.

    Attributes named in exclude_attributes_of_type are skipped. In
    addition the edge gets the relation's 'object-class' as
    rel_type_attribute and its id as 'ismi_id'. The relation's own text
    value is deliberately not stored (it is not useful).

    Returns the result of nx_graph.add_edge(), or None if the source or
    target node is missing.

    Raises ValueError if a 'num' attribute is not an integer, and
    whatever int() raises if an id is missing or malformed.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    attrs = {}

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0] so that an empty string
                # can not raise IndexError
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    # NOTE(review): nesting of the norm lookup under this
                    # branch was reconstructed from a listing without
                    # indentation -- confirm against the repository.
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be "WARNING" (not "WARN") to match the levels
                # used in logLevels, otherwise this is never printed
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): attr_dict is presumably the networkx 1.x way of
    # passing edge attributes; newer versions are believed to treat it as
    # an ordinary attribute named 'attr_dict' -- confirm the targeted
    # networkx version. What add_edge() returns also depends on it.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
270 | |
271 | |
def importEnts(ents_elem):
    """Import all entities below the given etree element.

    Every child whose 'object-class' is not listed in
    exclude_objects_of_type is turned into a graph node by nodeFromEnt()
    and remembered in nx_nodes under its integer id.

    The import stops at the first entity whose id is already in
    nx_nodes; that case is reported at log level ERROR.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # iterate through entities element
    for ent_elem in ents_elem:
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)
304 | |
305 | |
def importRels(rels_elem):
    """Import all relations below the given etree element.

    Every child is passed to relationFromRel() and the result is
    remembered in nx_relations under the relation's integer id. The
    stored value is None for relations that relationFromRel() skipped
    because their source or target node is missing.

    The import stops at the first relation whose id is already in
    nx_relations; that case is reported at log level ERROR.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # reported through log() like the duplicate check in
            # importEnts, so that it obeys the configured logLevels
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation
333 | |
334 | |
def importAll():
    """Parse the XML file named by input_fn and import its content.

    The 'entities' child of the root element is imported first, then the
    'relations' child.
    """
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    # entities first: relations are only created between known nodes
    importEnts(root.find('entities'))
    importRels(root.find('relations'))
346 | |
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: [input file [output file]]
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]

if len(sys.argv) >= 3:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export the graph as networkx pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)