46
|
1 import xml.etree.ElementTree as ET
|
|
2 import json
|
|
3 import networkx
|
|
4 import sys
|
|
5
|
|
## configure behaviour

# file name the finished graph is pickled to (overridable by argv[2])
output_fn = "ismi_graph.gpickle"

# file name of the OpenMind XML input (overridable by argv[1])
input_fn = "openmind-data.xml"

# node types to exclude from the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# name of the attribute that carries the type of a node
node_type_attribute = '_type'
# name of the attribute that carries the type of a relation
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
|
|
30
|
|
31
|
|
# the graph that is filled by the import
nx_graph = networkx.MultiDiGraph()

# nodes that were added to the graph, keyed by the id of their entity
nx_nodes = dict()
# NOTE(review): not referenced by the code visible in this file
ismi_relations = dict()
# results of the edge creation, keyed by the id of their relation
nx_relations = dict()
|
|
37
|
|
# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed by its level, if that level is active.

    Messages whose level is not contained in logLevels are dropped.
    """
    if level not in logLevels:
        return

    print("%s: %s" % (level, message))
|
|
46
|
|
47
|
|
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adapted for use as a type or attribute name.

    is_src_rel is accepted but leaves the name unchanged.
    is_tar_rel puts a '<' in front of the name.
    att_from_rel removes every occurrence of 'is_', 'has_', 'was_' and
    '_of' from the name, in that order.
    """
    # a '>' suffix for is_src_rel is disabled, so only is_tar_rel marks
    # the name
    fixed = '<' + name if is_tar_rel else name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(fragment, '')

    return fixed
|
|
64
|
|
65
|
|
66
|
|
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is parsed as JSON. If the result is an object with a 'from'
    member, the 'year' of that member is returned; otherwise the 'year'
    of a 'date' member is used. 'from' takes precedence even if it holds
    no year.

    Returns None if val is not valid JSON, if the JSON value (or the
    'from'/'date' member) is not an object, or if no year is present.
    An object with neither 'from' nor 'date' is reported with a WARNING
    log message.
    """
    try:
        date_json = json.loads(val)
    except (TypeError, ValueError):
        # val is not a string or not valid JSON (JSON decode errors are
        # ValueErrors); only these expected failures are ignored
        return None

    if not isinstance(date_json, dict):
        return None

    if 'from' in date_json:
        part = date_json['from']
    elif 'date' in date_json:
        part = date_json['date']
    else:
        log("WARNING", "don't know what to do with date %s"%(val))
        return None

    if not isinstance(part, dict):
        return None

    return part.get('year', None)
|
|
82
|
|
83
|
|
# content types whose value is kept as text
_TEXT_CONTENT_TYPES = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')


def _attrsFromElem(elem):
    """Collect the attributes below an entity or relation element.

    Returns a tuple (attrs, ov). attrs maps attribute names to their
    converted values. ov is the text of elem followed by the tail text
    of every child of its 'attributes' element.

    Attributes named in exclude_attributes_of_type, attributes without a
    name and attributes of content type 'old' or of an unknown content
    type are left out.
    """
    # text content of the element itself
    ov = elem.text or ''
    attrs = {}

    atts_elem = elem.find('attributes')
    if atts_elem is None:
        return attrs, ov

    for att_elem in atts_elem:
        if att_elem.tail is not None:
            # tail belongs to parent
            ov += att_elem.tail

        name = att_elem.get('name', None)
        if name in exclude_attributes_of_type:
            continue

        if name is None:
            # the attributes are passed to networkx as keyword arguments,
            # which requires string names
            log("WARNING", "attribute without name: %s"%repr(att_elem))
            continue

        ct = att_elem.get('content-type', None)
        val = att_elem.text

        if ct is None or ct.lower() in _TEXT_CONTENT_TYPES:
            # normal text attribute (assume no content_type is text too...)
            if val is not None and val.startswith('{'):
                # looks like JSON: try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            # keep attribute (also if it has no text)
            attrs[name] = val

        elif ct == 'date':
            if val is not None:
                # try to parse date object to get the year
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            if val is not None:
                # assume num is int; a malformed number must not abort
                # the whole import
                try:
                    attrs[name] = int(val)
                except ValueError:
                    log("ERROR", "attribute %s is not an integer: %s"%(name, repr(val)))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # ignore other content types
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    return attrs, ov


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    The node is added to nx_graph under the 'id' of the entity. Its
    attributes are the attributes of the entity plus the fixed-up
    'object-class' (stored under node_type_attribute), the id (stored as
    'ismi_id') and, if not empty, the own text of the entity (stored as
    'label').

    Returns the attribute dict of the node in nx_graph.
    """
    if ent_elem.find('attributes') is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    attrs, ov = _attrsFromElem(ent_elem)

    # base attributes override entity attributes of the same name
    attrs[node_type_attribute] = fixName(ent_elem.get('object-class'))

    # rename id to ismi_id
    ismi_id = ent_elem.get('id')
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov

    nx_graph.add_node(ismi_id, **attrs)

    try:
        # networkx >= 2.0 (Graph.node was removed in networkx 2.4)
        return nx_graph.nodes[ismi_id]
    except TypeError:
        # networkx 1.x: nodes is a method, the attribute dicts are in node
        return nx_graph.node[ismi_id]


def relationFromRel(rel_elem):
    """Create a graph edge from the given XML relation.

    The edge leads from the node 'source-id' to the node 'target-id'.
    Its attributes are the attributes of the relation plus the fixed-up
    'object-class' (stored under rel_type_attribute) and the id of the
    relation (stored as 'ismi_id').

    Returns None, after logging an ERROR, if the source or target id is
    not in nx_nodes; otherwise returns whatever nx_graph.add_edge()
    returns.
    """
    rel_id = rel_elem.get('id')
    src_id = rel_elem.get('source-id')
    tar_id = rel_elem.get('target-id')

    if src_id not in nx_nodes:
        log("ERROR", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("ERROR", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # the own value of a relation is not useful and is dropped
    attrs, _ov = _attrsFromElem(rel_elem)

    attrs[rel_type_attribute] = fixName(rel_elem.get('object-class'))
    attrs['ismi_id'] = rel_id
    log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))

    # keyword arguments are understood by all networkx versions, while
    # attr_dict= is stored as one attribute named 'attr_dict' since 2.0
    return nx_graph.add_edge(src_id, tar_id, **attrs)
|
|
266
|
|
267
|
|
def importEnts(ents_elem):
    """Import all entities below the etree element ents_elem.

    Every child element is turned into a graph node by nodeFromEnt() and
    the result is remembered in nx_nodes under the id of the entity.
    Entities whose 'object-class' is listed in exclude_objects_of_type
    are skipped.

    The import stops with an ERROR log message at the first entity whose
    id is already in nx_nodes.
    """
    cnt = 0
    skipped = 0
    xml_num = ents_elem.get('number')
    log('INFO', "XML says %s entities"%xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        ismi_id = ent_elem.get('id')
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ent_elem.get('object-class') in exclude_objects_of_type:
            # honour the configured (and announced) type exclusion
            skipped += 1
            continue

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)
        cnt += 1

    log('INFO', "imported %s entities, skipped %s"%(cnt, skipped))
|
|
294
|
|
295
|
|
def importRels(rels_elem):
    """Import all relations below the etree element rels_elem.

    Every child element is passed to relationFromRel() and the result is
    remembered in nx_relations under the id of the relation. The result
    is stored as it is returned, so a relation that could not be created
    is remembered as None.

    The import stops with an ERROR log message at the first relation
    whose id is already in nx_relations.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = rel_elem.get('id')
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # reported through log() like duplicate entities are
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation and save relation reference
        nx_relations[ismi_id] = relationFromRel(rel_elem)
|
|
322
|
|
323
|
|
def importAll():
    """Parse the XML file input_fn and import its entities and relations.

    Entities are imported before relations, because creating a relation
    requires its source and target to be known already.

    Raises ValueError if the root element has no 'entities' or no
    'relations' child; nothing is imported in that case.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    ents = root.find('entities')
    rels = root.find('relations')

    # fail with a clear message instead of an AttributeError on None
    if ents is None:
        raise ValueError("no 'entities' element in XML file %s"%input_fn)

    if rels is None:
        raise ValueError("no 'relations' element in XML file %s"%input_fn)

    importEnts(ents)
    importRels(rels)
|
|
335
|
|
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# parse command line parameters: [input file [output file]]
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if len(exclude_objects_of_type) > 0:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

# networkx.info() no longer exists in networkx 3.0, where str() of the
# graph gives a summary
if hasattr(networkx, 'info'):
    graph_info = networkx.info(nx_graph)
else:
    graph_info = str(nx_graph)

print("Graph info: %s"%graph_info)
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
if hasattr(networkx, 'write_gpickle'):
    networkx.write_gpickle(nx_graph, output_fn)
else:
    # networkx.write_gpickle() no longer exists in networkx 3.0; pickle
    # the graph directly instead
    import pickle
    with open(output_fn, 'wb') as pickle_file:
        pickle.dump(nx_graph, pickle_file, pickle.HIGHEST_PROTOCOL)

print("Wrote networkx pickle file %s"%output_fn)
|