Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismixml2model.py @ 48:6625019a0c96
old model2neo4j renamed to model2neo4j_restclient. new model2neo4j_client and model2neo4j_import. fixed ismixml2model and compare_models.
| author | casties |
|---|---|
| date | Tue, 07 Feb 2017 21:06:13 +0100 |
| parents | 378dcb66a27f |
| children | 5a633e875490 |
| rev | line source |
|---|---|
| 46 | 1 import xml.etree.ElementTree as ET |
| 2 import json | |
| 3 import networkx | |
| 4 import sys | |
| 5 | |
## configure behaviour

# default name of the networkx pickle file that is written
# (replaced by the second command line argument, if given)
output_fn = "ismi_graph.gpickle"

# default name of the OpenMind XML file that is read
# (replaced by the first command line argument, if given)
input_fn = "openmind-data.xml"


# entities with one of these object-classes are not imported as nodes
exclude_objects_of_type = ["DIGITALIZATION", "REFERENCE"]

# attributes with one of these names are dropped from nodes and relations
exclude_attributes_of_type = ["lw", "node_type", "nov", "notes_old"]

# attribute keys under which the type of a node / relation is stored
node_type_attribute = "_type"
rel_type_attribute = "_type"
| 28 | |
| 29 #ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"] | |
| 30 | |
| 31 | |
# the graph that is built from the XML file and finally written as pickle
nx_graph = networkx.MultiDiGraph()

# ismi_id of an imported entity -> attribute dict of its graph node
nx_nodes = dict()
# not referenced by the code visible in this file
ismi_relations = dict()
# ismi_id of a processed relation -> return value of relationFromRel()
nx_relations = dict()
| 37 | |
# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed by level, if level is an active log level.

    A level is active when it is a member of the module-level logLevels set.
    Messages with any other level are dropped without output.
    """
    if level not in logLevels:
        return

    print("%s: %s"%(level, message))
| 46 | |
| 47 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adapted for use as type or attribute name.

    is_src_rel   -- currently has no effect on the name.
    is_tar_rel   -- prefix the name with '<'.
    att_from_rel -- remove every occurrence of 'is_', 'has_', 'was_' and
                    '_of' (in this order) from the name.

    Without any flag the name is returned unchanged.
    """
    # (a '>' suffix for is_src_rel is disabled, the flag is accepted only)
    fixed = '<' + name if is_tar_rel else name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(fragment, '')

    return fixed
| 64 | |
| 65 | |
| 66 | |
def parseYear(val):
    """Return the year from a JSON-encoded date value, or None.

    val is expected to be the JSON text of an object that has a 'from' or a
    'date' member, which in turn is an object with a 'year' member. If both
    members are present, 'from' is used.

    Returns None if
    - val can not be parsed as JSON (including val being None),
    - the parsed value has neither a 'from' nor a 'date' member (a WARNING
      is logged in this case),
    - the 'from'/'date' member is not an object or has no 'year' member.
    """
    try:
        date_json = json.loads(val)
    except (TypeError, ValueError):
        # TypeError: val is not a string; ValueError: not valid JSON
        # (json.JSONDecodeError is a subclass of ValueError)
        return None

    has_known_member = (isinstance(date_json, dict)
                        and ('from' in date_json or 'date' in date_json))
    if not has_known_member:
        log("WARNING", "don't know what to do with date %s"%(val))
        return None

    part = date_json['from'] if 'from' in date_json else date_json['date']
    if not isinstance(part, dict):
        # e.g. {"from": 1200} -- no nested object to take the year from
        return None

    return part.get('year', None)
| 82 | |
| 83 | |
def _attrsFromElem(atts_elem):
    """Return a dict with the attributes below an XML 'attributes' element.

    Shared by nodeFromEnt() and relationFromRel(). For every child element
    the XML attributes 'name' and 'content-type' decide what is kept:

    - names listed in exclude_attributes_of_type are skipped,
    - no content-type or a text-like content-type: the element text is kept;
      if the text starts with '{' and parseYear() finds a year in it, the
      year is kept instead,
    - content-type 'date': the year found by parseYear() is kept, if any,
    - content-type 'num': the text converted with int() is kept (a text
      that is not an integer raises ValueError),
    - content-type 'old' is skipped silently, any other content-type is
      skipped with a WARNING.

    Elements without text are never kept. Elements that have a value but no
    name are skipped with a WARNING, because a None key can not be passed
    on as keyword argument to add_node().
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')
    attrs = {}

    # go through all attributes
    for att_elem in atts_elem:
        ct = att_elem.get('content-type', None)
        name = att_elem.get('name', None)
        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        text = att_elem.text

        if ct is None or ct.lower() in text_types:
            # normal text attribute (assume no content_type is text too...)
            val = text
            # startswith() instead of val[0]: safe for an empty string
            if val is not None and val.startswith('{'):
                # try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

        elif ct == 'date':
            # date attribute: try to parse date object to get gregorian year
            val = parseYear(text) if text is not None else None

        elif ct == 'num':
            # number attribute, assume num is int
            val = int(text) if text is not None else None

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # ignore other content types
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
            continue

        if val is None:
            # nothing to keep
            continue

        if name is None:
            log("WARNING", "attribute without name: %s"%repr(att_elem))
            continue

        # keep attribute
        attrs[name] = val

    return attrs


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    The node is added to nx_graph under the integer value of the entity's
    'id' XML attribute. Its attributes are the parsed children of the
    'attributes' element plus
    - node_type_attribute: the entity's 'object-class',
    - 'ismi_id': the integer id,
    - 'label': the entity's own text followed by the tail text of its
      'attributes' element (only set if not empty).

    Returns the attribute dict of the new node.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)
        attrs = {}

    else:
        if atts_elem.tail is not None:
            # tail belongs to parent
            ov += atts_elem.tail

        attrs = _attrsFromElem(atts_elem)

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if len(ov) > 0:
        # save ov as label
        attrs['label'] = ov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is the networkx 1.x accessor; newer networkx
    # releases only offer Graph.nodes[...] -- confirm the targeted version.
    node = nx_graph.node[ismi_id]

    return node


def relationFromRel(rel_elem):
    """Create a graph relation (edge) from the given XML relation element.

    The edge is added to nx_graph from the node with id 'source-id' to the
    node with id 'target-id'. Its attributes are the parsed children of the
    'attributes' element plus
    - rel_type_attribute: the relation's 'object-class',
    - 'ismi_id': the integer value of the relation's 'id'.
    The relation's own text is not stored.

    Returns None (after logging a WARNING) if source or target id is not in
    nx_nodes, otherwise the return value of nx_graph.add_edge().
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # get attributes element
    atts_elem = rel_elem.find('attributes')
    attrs = _attrsFromElem(atts_elem) if atts_elem is not None else {}

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): the attr_dict keyword is networkx 1.x API; newer networkx
    # releases take edge attributes as **kwargs only -- confirm the targeted
    # version before upgrading.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
| 268 | |
| 269 | |
def importEnts(ents_elem):
    """Import the entities below the given etree element as graph nodes.

    Entities whose 'object-class' is in exclude_objects_of_type are skipped.
    Every other entity is turned into a node by nodeFromEnt() and the node
    is remembered in nx_nodes under the entity's integer id.

    If an id is already present in nx_nodes an ERROR is logged and the
    import stops, i.e. the remaining entities are not imported.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # iterate through entities element (cnt is only used by the debug hook)
    for cnt, ent_elem in enumerate(ents_elem, start=1):
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save the node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)

        # debug
        #if cnt >= 100:
        #    return
| 302 | |
| 303 | |
def importRels(rels_elem):
    """Import the relations below the given etree element as graph edges.

    Every relation is passed to relationFromRel() and the result is stored
    in nx_relations under the relation's integer id. The stored value is
    None for relations that relationFromRel() rejected because of a missing
    node.

    If an id is already present in nx_relations an ERROR is logged and the
    import stops, i.e. the remaining relations are not imported.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element (cnt is only used by the debug hook)
    for cnt, rel_elem in enumerate(rels_elem, start=1):
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # use log() like importEnts() so the message honours logLevels
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation

        # debug
        #if cnt >= 100:
        #    return
| 331 | |
| 332 | |
def importAll():
    """Parse the XML file input_fn and import its entities and relations.

    The children of the root's 'entities' element are imported before the
    children of its 'relations' element, because relationFromRel() only
    accepts relations whose source and target ids are already in nx_nodes.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    xml_root = tree.getroot()

    # nodes first, then the edges between them
    importEnts(xml_root.find('entities'))
    importRels(xml_root.find('relations'))
| 344 | |
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# parse command line parameters: [input file [output file]]
if len(sys.argv) >= 2:
    input_fn = sys.argv[1]

if len(sys.argv) >= 3:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)
