Mercurial > hg > drupalISMI
view importFromOpenMind/importer/ismixml2model.py @ 51:5a633e875490
also read normalized fields from xml.
author | casties |
---|---|
date | Fri, 03 Mar 2017 20:11:06 +0100 |
parents | 6625019a0c96 |
children | b9a6e596ebe4 |
line wrap: on
line source
"""Copy the ISMI graph from an OpenMind XML export into a networkx pickle.

Usage: ismixml2model.py [input_xml [output_gpickle]]

Entities become nodes and relations become edges of a
``networkx.MultiDiGraph`` which is then written with ``write_gpickle``.
"""
import xml.etree.ElementTree as ET
import json
import networkx
import sys

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"
input_fn = "openmind-data.xml"

# node types to exclude from the graph
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude
exclude_attributes_of_type = [
    'lw',
    'node_type',
    'nov',
    'notes_old',
]

# name of type attribute
node_type_attribute = '_type'
rel_type_attribute = '_type'

#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]

nx_graph = networkx.MultiDiGraph()

nx_nodes = {}
ismi_relations = {}
nx_relations = {}

# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}

# content types (lower-cased) that are stored as plain text values
_TEXT_CONTENT_TYPES = frozenset(
    ['text', 'arabic', 'bool', 'boolean', 'url', 'language', 'null'])


def log(level, message):
    """Print message prefixed with level if level is in logLevels."""
    if level in logLevels:
        print("%s: %s"%(level, message))


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use as a type or attribute name.

    is_src_rel currently changes nothing, is_tar_rel prefixes the name
    with '<' and att_from_rel strips the fragments 'is_', 'has_', 'was_'
    and '_of' from the name.
    """
    if is_src_rel:
        #name = name + '>'
        pass

    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name


def parseYear(val):
    """Return the year from a JSON-encoded date string or None.

    The year is read from the 'from' member if present, otherwise from
    the 'date' member. Returns None if val is not parseable JSON of that
    shape or if no year is given.
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError) as e:
        # not JSON or not shaped like a date object: best effort, no year
        # (deliberately narrower than a bare except so that e.g.
        # KeyboardInterrupt is not swallowed)
        log('DEBUG', "unable to parse date %s: %s"%(val, e))

    return year


def _parseAttributes(atts_elem):
    """Return a dict of the attribute values below the given XML element.

    Shared by nodeFromEnt and relationFromRel. Attributes named in
    exclude_attributes_of_type, attributes with content-type 'old' and
    attributes with unknown content-type are skipped.
    """
    attrs = {}
    for att_elem in atts_elem:
        ct = att_elem.get('content-type', None)
        name = att_elem.get('name', None)
        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        val = att_elem.text
        if ct is None or ct.lower() in _TEXT_CONTENT_TYPES:
            # normal text attribute (assume no content_type is text too...)
            if val is not None and val.startswith('{'):
                # try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            if val is not None:
                # keep attribute
                attrs[name] = val
                # check for normalized value
                nov = att_elem.findtext('norm')
                if nov is not None:
                    # add normalized value
                    attrs['_n_'+name] = nov

        elif ct == 'date':
            # date attribute
            if val is not None:
                # try to parse date object to get gregorian year
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            # number attribute
            if val is not None:
                # keep attribute, assume num is int
                attrs[name] = int(val)

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # level must be 'WARNING' to match the names used in logLevels
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
            # ignore other content types
            continue

    return attrs


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    Adds the node to nx_graph under its integer id and returns the
    node's attribute dict.
    """
    # text content of entity element
    ov = ent_elem.text or ''

    # get attributes element
    atts_elem = ent_elem.find('attributes')
    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)
        attrs = {}
    else:
        attrs = _parseAttributes(atts_elem)

    # process base attributes (these override XML attributes of the same name)
    oc = ent_elem.get('object-class')
    # set type
    attrs[node_type_attribute] = fixName(oc)

    ismi_id = int(ent_elem.get('id'))
    # rename id to ismi_id
    attrs['ismi_id'] = ismi_id

    if ov:
        # save ov as label
        attrs['label'] = ov
        # check for normalized value
        nov = ent_elem.findtext('norm')
        if nov is not None:
            # add normalized value
            attrs['_n_label'] = nov

    # create node
    log('DEBUG', "new node(%s, %s)"%(ismi_id, attrs))
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node looks like the networkx 1.x/early 2.x
    # accessor (newer releases use Graph.nodes) -- confirm installed version.
    node = nx_graph.node[ismi_id]

    return node


def relationFromRel(rel_elem):
    """Create graph relation from etree element.

    Returns None without adding an edge if the source or target id is
    not in nx_nodes; otherwise returns the result of nx_graph.add_edge.
    The relation's own text value is intentionally not stored.
    """
    rel_id = int(rel_elem.get('id'))
    rel_name = rel_elem.get('object-class')
    src_id = int(rel_elem.get('source-id'))
    tar_id = int(rel_elem.get('target-id'))
    if src_id not in nx_nodes:
        log("WARNING", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("WARNING", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # get attributes element
    atts_elem = rel_elem.find('attributes')
    if atts_elem is None:
        attrs = {}
    else:
        attrs = _parseAttributes(atts_elem)

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    #log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))

    # create relation with type
    # NOTE(review): attr_dict= looks like the networkx 1.x calling
    # convention; networkx 2.0+ is documented to drop it, which would store
    # a single edge attribute named 'attr_dict' -- confirm installed version.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel


def importEnts(ents_elem):
    """Import all entities from etree element ents_elem.

    Entities whose object-class is in exclude_objects_of_type are
    skipped. Stops at the first entity id that is already in nx_nodes.
    """
    xml_num = ents_elem.get('number')
    log('INFO', "XML says %s entities"%xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        oc = ent_elem.get('object-class')
        if oc in exclude_objects_of_type:
            # skip this entity
            continue

        ismi_id = int(ent_elem.get('id'))
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)


def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    Stops at the first relation id that is already in nx_relations.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = int(rel_elem.get('id'))
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation and save relation reference
        # (the stored value is None for relations that were skipped)
        nx_relations[ismi_id] = relationFromRel(rel_elem)


def importAll():
    """Parse the XML file input_fn and import its entities and relations."""
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()
    # entities first: relations are only created between known nodes
    ents = root.find('entities')
    importEnts(ents)
    rels = root.find('relations')
    importRels(rels)


## main
print("Copy graph from OpenMind-XML to networkx pickle")

# parse command line parameters
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if exclude_objects_of_type:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))
#print(" nodes:%s"%repr(nx_graph.nodes(data=True)))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)