Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismixml2model.py @ 48:6625019a0c96
old model2neo4j renamed to model2neo4j_restclient. new model2neo4j_client and model2neo4j_import. fixed ismixml2model and compare_models.
author | casties |
---|---|
date | Tue, 07 Feb 2017 21:06:13 +0100 |
parents | 378dcb66a27f |
children | 5a633e875490 |
rev | line source |
---|---|
46 | 1 import xml.etree.ElementTree as ET |
2 import json | |
3 import networkx | |
4 import sys | |
5 | |
## configure behaviour

# default file the networkx pickle is written to
# (replaced by the second command line argument, if given)
output_fn = "ismi_graph.gpickle"

# default OpenMind XML file to read
# (replaced by the first command line argument, if given)
input_fn = "openmind-data.xml"


# entities whose object-class is listed here are not imported as nodes
exclude_objects_of_type = [
    'DIGITALIZATION',
    'REFERENCE',
]

# attributes with these names are dropped from nodes and relations
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute key under which the type of a node / relation is stored
node_type_attribute = rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
30 | |
31 | |
# graph that receives all imported nodes and relations
nx_graph = networkx.MultiDiGraph()

# lookup tables keyed by numeric id
nx_nodes = dict()
ismi_relations = dict()
nx_relations = dict()

# log levels that log() actually prints; alternatives:
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = set(['INFO', 'ERROR', 'SYSMSG'])
42 | |
def log(level, message):
    """Print message prefixed with level, but only if level is in logLevels."""
    if level not in logLevels:
        return

    print("%s: %s" % (level, message))
46 | |
47 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use as a type or attribute name.

    is_src_rel   -- accepted, but currently changes nothing.
    is_tar_rel   -- prefix the name with '<'.
    att_from_rel -- remove every occurrence of 'is_', 'has_', 'was_' and
                    '_of' (in that order) from the name.
    """
    fixed = name

    if is_tar_rel:
        fixed = '<' + fixed

    if att_from_rel:
        # clean up relation names so they can serve as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(fragment, '')

    return fixed
64 | |
65 | |
66 | |
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is expected to be a JSON object that has either a 'from' or a
    'date' member which in turn is an object with a 'year' member.

    Returns the value of that 'year' member, or None if val can not be
    parsed as JSON, does not have the expected structure, or has no year.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError):
        # best effort: invalid JSON (ValueError) or JSON that does not have
        # the expected object structure (TypeError, AttributeError) just
        # yields no year. Anything else (e.g. KeyboardInterrupt) propagates.
        pass

    return year
82 | |
83 | |
# content types (compared lower-cased) whose value is kept as plain text
_TEXT_CONTENT_TYPES = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')


def _attrsFromElem(atts_elem):
    """Return a dict of the attributes below the given 'attributes' element.

    Shared by nodeFromEnt and relationFromRel. For every child element:
    - names listed in exclude_attributes_of_type are skipped,
    - text-like content types (or no content type) keep their text; a text
      starting with '{' is replaced by its year if parseYear finds one,
    - 'date' values are kept only as the year found by parseYear,
    - 'num' values are converted with int() (raises ValueError if the
      text is not an integer),
    - 'old' and unknown content types are ignored.
    Children without text are not added.
    """
    attrs = {}
    for att_elem in atts_elem:
        ct = att_elem.get('content-type', None)
        name = att_elem.get('name', None)
        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        val = att_elem.text

        if ct is None or ct.lower() in _TEXT_CONTENT_TYPES:
            # normal text attribute (assume no content_type is text too...)
            if val is None:
                continue

            if val.startswith('{'):
                # looks like a JSON object: try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            # keep attribute
            attrs[name] = val

        elif ct == 'date':
            # date attribute
            if val is not None:
                # try to parse date object to get the year
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            # number attribute
            if val is not None:
                # keep attribute, assume num is int
                attrs[name] = int(val)

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # use the same level name as the other warnings in this file so
            # that the message shows up when 'WARNING' is enabled
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
            # ignore other content types
            continue

    return attrs


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity element.

    Adds a node to nx_graph using the integer 'id' of the entity as node id.
    The node attributes are the entity's attributes plus the type (from
    'object-class', stored under node_type_attribute), 'ismi_id' and, if the
    entity has text content, 'label'.

    Returns the attribute dict of the new node.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)
        attrs = {}

    else:
        if atts_elem.tail is not None:
            # tail belongs to parent
            ov += atts_elem.tail

        attrs = _attrsFromElem(atts_elem)

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): `.node` looks like the networkx 1.x accessor -- confirm
    # the networkx version this script is run with.
    node = nx_graph.node[ismi_id]

    return node


def relationFromRel(rel_elem):
    """Create a graph relation (edge) from the given XML relation element.

    Adds an edge from 'source-id' to 'target-id' to nx_graph. The edge
    attributes are the relation's attributes plus the type (from
    'object-class', stored under rel_type_attribute) and 'ismi_id'.

    Returns None without adding an edge if the source or target id is not
    in nx_nodes, otherwise the return value of nx_graph.add_edge.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # the own value (text content) of a relation is not useful and is dropped

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        attrs = _attrsFromElem(atts_elem)
    else:
        attrs = {}

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): the `attr_dict=` keyword looks like networkx 1.x API --
    # confirm the networkx version this script is run with.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
268 | |
269 | |
def importEnts(ents_elem):
    """Import all entities below the given etree element as graph nodes.

    Entities whose object-class is in exclude_objects_of_type are skipped.
    Every created node is remembered in nx_nodes under its integer id.
    Stops at the first entity whose id is already in nx_nodes.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    for ent_elem in ents_elem:
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ent_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ent_id)

        if ent_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ent_id)
            return

        # create networkx node and save the node reference
        nx_nodes[ent_id] = nodeFromEnt(ent_elem)
302 | |
303 | |
def importRels(rels_elem):
    """Import all relations below the given etree element as graph edges.

    The result of relationFromRel is remembered in nx_relations under the
    integer id of the relation (this is None for relations that were not
    added). Stops at the first relation whose id is already in nx_relations.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # report through log() like importEnts does for entities
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation
331 | |
332 | |
def importAll():
    """Parse the XML file input_fn and import its entities and relations.

    Entities are imported before relations because relationFromRel only
    accepts relations between nodes that are already in nx_nodes.

    Raises ValueError if the root element has no 'entities' or no
    'relations' child element.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    # find() returns None for a missing element; fail with a clear message
    # instead of an AttributeError somewhere inside the import functions
    ents = root.find('entities')
    if ents is None:
        raise ValueError("no 'entities' element found in %s"%input_fn)

    rels = root.find('relations')
    if rels is None:
        raise ValueError("no 'relations' element found in %s"%input_fn)

    importEnts(ents)
    importRels(rels)
344 | |
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# optional command line parameters: [input file [output file]]
if sys.argv[1:]:
    input_fn = sys.argv[1]

if sys.argv[2:]:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# write the finished graph as pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)