Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismixml2model.py @ 55:5a1a4af235eb
fix fix of transfer of ownvalue to _label attribute.
author | casties |
---|---|
date | Fri, 21 Apr 2017 19:08:09 +0200 |
parents | b9a6e596ebe4 |
children | be1c7d6814b6 |
rev | line source |
---|---|
46 | 1 import xml.etree.ElementTree as ET |
2 import json | |
3 import networkx | |
4 import sys | |
5 | |
## configure behaviour

# default file names; both can be replaced by command line parameters
input_fn = "openmind-data.xml"
output_fn = "ismi_graph.gpickle"

# entities with one of these object-classes are not added to the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes with one of these names are not copied to nodes or relations
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute key that holds the type of a node and of a relation
node_type_attribute = rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
30 | |
31 | |
# graph that is filled by the import and written to the pickle file at the end
nx_graph = networkx.MultiDiGraph()

# lookup tables keyed by integer id:
# nx_nodes holds what nodeFromEnt() returned for each imported entity,
# nx_relations what relationFromRel() returned for each imported relation
nx_nodes = dict()
# NOTE(review): ismi_relations is not used by the code visible in this file
ismi_relations = dict()
nx_relations = dict()
37 | |
# log levels that are printed by log(); alternatives with more output:
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level if level is an active log level.

    Messages with a level that is not in logLevels are dropped silently.
    """
    if level not in logLevels:
        return

    print("%s: %s"%(level, message))
46 | |
47 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adapted for use as type or attribute name in the graph.

    is_tar_rel prefixes the name with '<'. att_from_rel removes every
    occurrence of 'is_', 'has_', 'was_' and '_of' (in this order) so that a
    relation name can be used as attribute name. is_src_rel is accepted but
    currently does not change the name.
    """
    fixed = '<' + name if is_tar_rel else name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(fragment, '')

    return fixed
64 | |
65 | |
66 | |
def parseYear(val):
    """Return the year of a JSON-encoded date value, or None.

    val is parsed as JSON. If the result has a 'from' member, the 'year' of
    that member is returned, otherwise the 'year' of its 'date' member. A
    value with neither member is reported with a WARNING log message.

    Returns None if val is not valid JSON, is not structured as described
    above or contains no year.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (TypeError, ValueError, AttributeError):
        # best effort: val is not a string, not valid JSON or not shaped like
        # a date object. Anything else is a real error and must not be hidden.
        pass

    return year
82 | |
83 | |
def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity element.

    Collects the attributes of the entity, adds a node with these attributes
    to nx_graph under the integer id of the entity and returns
    nx_graph.node[id] for the new node.

    Attributes named in exclude_attributes_of_type and attributes with
    content-type 'old' or an unknown content-type are skipped. Text-like
    attributes are copied (values starting with '{' are replaced by their
    year if parseYear() finds one), 'date' attributes are reduced to their
    year and 'num' attributes are converted with int(). In addition the node
    gets the object-class under node_type_attribute, the id under 'ismi_id'
    and, if the entity has text content, that text under '_label'.

    Raises whatever int() raises if the id or a 'num' attribute is not an
    integer.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    else:
        # text content of entity element after atts_elem
        ov += atts_elem.tail or ''

        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0] so an empty string is safe
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be 'WARNING' to match the names used in logLevels
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['_label'] = ov
        # check for normalized value
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is the networkx 1.x accessor; newer networkx
    # releases only provide Graph.nodes - confirm the targeted version
    node = nx_graph.node[ismi_id]

    return node
181 | |
182 | |
def relationFromRel(rel_elem):
    """Create a graph relation (edge) from the given XML relation element.

    The edge is added to nx_graph from the node with the 'source-id' to the
    node with the 'target-id' of the relation. Its attributes are collected
    the same way as node attributes: attributes named in
    exclude_attributes_of_type and attributes with content-type 'old' or an
    unknown content-type are skipped, text-like attributes are copied, 'date'
    attributes are reduced to their year and 'num' attributes are converted
    with int(). The object-class is stored under rel_type_attribute and the
    id under 'ismi_id'. The text content of the relation is not used.

    Returns the return value of nx_graph.add_edge(), or None (after logging
    a WARNING) if source or target node are not in nx_nodes.

    Raises whatever int() raises if an id or a 'num' attribute is not an
    integer.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    attrs = {}

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null']:
                # normal text attribute (assume no content_type is text too...)
                val = att_elem.text

                # startswith() instead of val[0] so an empty string is safe
                if val is not None and val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                if val is not None:
                    # keep attribute
                    attrs[name] = val
                    # check for normalized value
                    nov = att_elem.findtext('norm')
                    if nov is not None:
                        # add normalized value
                        attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                val = att_elem.text
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                val = att_elem.text
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # level must be 'WARNING' to match the names used in logLevels
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
                # ignore other content types
                continue

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): the attr_dict parameter belongs to the networkx 1.x
    # add_edge() signature - confirm the targeted networkx version
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
273 | |
274 | |
def importEnts(ents_elem):
    """Add a graph node for every entity element below ents_elem.

    Entities whose object-class is in exclude_objects_of_type are skipped.
    What nodeFromEnt() returns for an entity is remembered in nx_nodes under
    the integer id of the entity. The import stops with an ERROR log message
    at the first entity whose id is already in nx_nodes.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # cnt is the running number of the entity, only used by the debug limit
    for cnt, ent_elem in enumerate(ents_elem, 1):
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and keep a reference to it
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)

        # debug
        #if cnt >= 100:
        #    return
307 | |
308 | |
def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    Calls relationFromRel() for every relation element below rels_elem and
    remembers its return value in nx_relations under the integer id of the
    relation. The import stops with an ERROR log message at the first
    relation whose id is already in nx_relations.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element;
    # cnt is the running number of the relation, only used by the debug limit
    for cnt, rel_elem in enumerate(rels_elem, 1):
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # report through log() like importEnts does for entities
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation

        # debug
        #if cnt >= 100:
        #    return
336 | |
337 | |
def importAll():
    """Parse the XML file input_fn and import its entities, then its relations.

    The 'entities' child of the root element is passed to importEnts() and
    afterwards the 'relations' child to importRels().
    """
    log('INFO', "parsing XML file %s"%input_fn)
    xml_tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    xml_root = xml_tree.getroot()

    # entities have to come first: relations are only created between
    # nodes that are already known
    for section, importer in (('entities', importEnts), ('relations', importRels)):
        importer(xml_root.find(section))
349 | |
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: input file name, then output file name
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]
    if len(sys.argv) >= 3:
        output_fn = sys.argv[2]

# read the XML file into nx_graph
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# write the graph as networkx pickle file
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)