Mercurial > hg > drupalISMI
annotate importFromOpenMind/importer/ismixml2model.py @ 55:5a1a4af235eb
fix fix of transfer of ownvalue to _label attribute.
| author | casties |
|---|---|
| date | Fri, 21 Apr 2017 19:08:09 +0200 |
| parents | b9a6e596ebe4 |
| children | be1c7d6814b6 |
| rev | line source |
|---|---|
| 46 | 1 import xml.etree.ElementTree as ET |
| 2 import json | |
| 3 import networkx | |
| 4 import sys | |
| 5 | |
## configure behaviour

# default name of the networkx pickle file to write
# (replaced by the second command line argument, if given)
output_fn = "ismi_graph.gpickle"

# default name of the OpenMind XML file to read
# (replaced by the first command line argument, if given)
input_fn = "openmind-data.xml"

# entities whose object-class is listed here are not imported
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes whose name is listed here are not copied to nodes or relations
exclude_attributes_of_type = ['lw', 'node_type', 'nov', 'notes_old']

# attribute name under which the object-class of a node is stored
node_type_attribute = '_type'
# attribute name under which the object-class of a relation is stored
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
| 30 | |
| 31 | |
# the graph that is filled by the import and written out at the end
nx_graph = networkx.MultiDiGraph()

# imported nodes by ismi_id
nx_nodes = {}
ismi_relations = {}
# imported relations by ismi_id
nx_relations = {}

# active log levels for logging; log() prints only messages whose level
# is a member of this set
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}
| 42 | |
def log(level, message):
    """Print message prefixed with level, if level is an active log level.

    Messages whose level is not contained in the module-level logLevels
    set are dropped silently.
    """
    if level not in logLevels:
        return

    line = "%s: %s"%(level, message)
    print(line)
| 46 | |
| 47 | |
def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adapted for use as a type or attribute name.

    is_src_rel   -- currently has no effect (a '>' suffix is disabled).
    is_tar_rel   -- prefix the name with '<'.
    att_from_rel -- strip every occurrence of 'is_', 'has_', 'was_' and
                    '_of' (in that order) from the name.
    """
    # is_src_rel is accepted but deliberately changes nothing:
    #name = name + '>'

    fixed = ('<' + name) if is_tar_rel else name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            fixed = fixed.replace(fragment, '')

    return fixed
| 64 | |
| 65 | |
| 66 | |
def parseYear(val):
    """Extract the year from a JSON-encoded date value.

    val is parsed as JSON. If the result has a 'from' member, the 'year'
    entry of that member is returned; otherwise, if it has a 'date' member,
    the 'year' entry of that member is returned. A value with neither
    member is reported with a WARNING log message.

    Returns the year, or None if val is not valid JSON, does not have the
    expected structure or contains no year.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError):
        # ValueError: val is not valid JSON.
        # TypeError/AttributeError: val is not a string or the JSON does
        # not have the expected object structure.
        # Callers treat None as "no year"; anything else (e.g. a
        # KeyboardInterrupt or a programming error) must not be hidden.
        pass

    return year
| 82 | |
| 83 | |
def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity element.

    Collects the attributes of the entity, adds a node with the entity's
    integer id to the module-level nx_graph and returns
    nx_graph.node[ismi_id].

    Attributes are handled according to their content-type:
      * no content-type, text, arabic, bool, boolean, url, language, null
        (compared case-insensitively): stored as text. A value starting
        with '{' is replaced by the year parseYear() finds in it, if any.
        The text of a 'norm' child is stored as '_n_<name>'.
      * date: only the year found by parseYear() is stored.
      * num: stored as int.
      * old and all other content-types: ignored.

    In addition the node gets the object-class (node_type_attribute), the
    id ('ismi_id') and, if the entity has text content, '_label' and
    '_n_label'.

    Raises ValueError if a num attribute is not an integer, and ValueError
    or TypeError if the entity's id is not an integer or is missing.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')

    # text content of entity element
    ov = ent_elem.text or ''

    attrs = {}

    # get attributes element
    atts_elem = ent_elem.find('attributes')

    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    else:
        # text content of entity element after atts_elem
        # (ElementTree stores it as the tail of the child)
        ov += atts_elem.tail or ''

        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            val = att_elem.text

            if ct is None or ct.lower() in text_types:
                # normal text attribute (assume no content_type is text too...)
                if val is None:
                    continue

                if val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                # keep attribute
                attrs[name] = val
                # check for normalized value
                nov = att_elem.findtext('norm')
                if nov is not None:
                    # add normalized value
                    attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # ignore other content types
                # (level must be 'WARNING' to match the names in logLevels)
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    # process base attributes
    oc = ent_elem.get('object-class')

    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if ov:
        # save ov as label
        attrs['_label'] = ov
        # check for normalized value
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node is the networkx 1.x/2.0-2.3 accessor; kept
    # because this file also uses the 1.x-style add_edge(attr_dict=...) --
    # confirm the targeted networkx version before changing it.
    node = nx_graph.node[ismi_id]

    return node
| 181 | |
| 182 | |
def relationFromRel(rel_elem):
    """Create a graph relation (edge) from the given XML relation element.

    Adds an edge from the relation's source-id to its target-id to the
    module-level nx_graph. The edge attributes are the relation's
    attributes (handled by content-type like entity attributes: text-like
    values as text, date as year, num as int, everything else ignored),
    the object-class (rel_type_attribute) and the id ('ismi_id').

    Returns the result of nx_graph.add_edge(), or None if the source or
    the target node has not been imported (reported with a WARNING log
    message).

    Raises ValueError if a num attribute is not an integer, and ValueError
    or TypeError if one of the ids is not an integer or is missing.
    """
    text_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null')

    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # the own value (text content) of a relation is not useful and
    # therefore not collected

    attrs = {}

    # get attributes element
    atts_elem = rel_elem.find('attributes')

    if atts_elem is not None:
        # go through all attributes
        for att_elem in atts_elem:
            ct = att_elem.get('content-type', None)
            name = att_elem.get('name', None)
            if name in exclude_attributes_of_type:
                # exclude attribute
                continue

            val = att_elem.text

            if ct is None or ct.lower() in text_types:
                # normal text attribute (assume no content_type is text too...)
                if val is None:
                    continue

                if val.startswith('{'):
                    # try to parse as date
                    year = parseYear(val)
                    if year is not None:
                        val = year

                # keep attribute
                attrs[name] = val
                # check for normalized value
                nov = att_elem.findtext('norm')
                if nov is not None:
                    # add normalized value
                    attrs['_n_'+name] = nov

            elif ct == 'date':
                # date attribute
                if val is not None:
                    # try to parse date object to get gregorian year
                    year = parseYear(val)
                    if year is not None:
                        attrs[name] = year

            elif ct == 'num':
                # number attribute
                if val is not None:
                    # keep attribute, assume num is int
                    attrs[name] = int(val)

            elif ct == 'old':
                # ignore attribute
                continue

            else:
                # ignore other content types
                # (level must be 'WARNING' to match the names in logLevels)
                log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): attr_dict= is the networkx 1.x way to pass edge data;
    # confirm the targeted networkx version before changing it.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel
| 273 | |
| 274 | |
def importEnts(ents_elem):
    """Import all entities below the given etree element.

    Every child of ents_elem whose object-class is not listed in
    exclude_objects_of_type is turned into a graph node by nodeFromEnt()
    and remembered in nx_nodes under its integer id.

    If an id is encountered a second time an ERROR is logged and the
    import stops, leaving the remaining entities unread.
    """
    log('INFO', "XML says %s entities"%ents_elem.get('number'))

    # iterate through entities element
    for cnt, ent_elem in enumerate(ents_elem, 1):
        if ent_elem.get('object-class') in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)

        # debug
        #if cnt >= 100:
        #    return
| 307 | |
| 308 | |
def importRels(rels_elem):
    """Import all relations below the given etree element.

    Every child of rels_elem is turned into a graph relation by
    relationFromRel() and its result is remembered in nx_relations under
    the relation's integer id.

    If an id is encountered a second time an ERROR is logged and the
    import stops, leaving the remaining relations unread.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for cnt, rel_elem in enumerate(rels_elem, 1):
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            # report through log() like importEnts does
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation
        relation = relationFromRel(rel_elem)

        # save relation reference
        nx_relations[ismi_id] = relation

        # debug
        #if cnt >= 100:
        #    return
| 336 | |
| 337 | |
def importAll():
    """Read the XML file input_fn and import its entities and relations.

    Entities are imported first because relations are only created
    between nodes that already exist.
    """
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    xml_tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    xml_root = xml_tree.getroot()

    importEnts(xml_root.find('entities'))
    importRels(xml_root.find('relations'))
| 349 | |
## main

print("Copy graph from OpenMind-XML to networkx pickle")

# parse command line parameters: [input file [output file]]
cli_args = sys.argv[1:]
if cli_args:
    input_fn = cli_args[0]

if len(cli_args) > 1:
    output_fn = cli_args[1]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)
