Mercurial > hg > drupalISMI
changeset 46:f3945ef1e6a4
new importer for OM4XML dump file.
author | casties |
---|---|
date | Fri, 03 Feb 2017 18:46:16 +0100 |
parents | 277ea02906f9 |
children | 378dcb66a27f |
files | importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/ismixml2model.py |
diffstat | 2 files changed, 360 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2model.py Fri Dec 09 12:24:21 2016 +0100 +++ b/importFromOpenMind/importer/ismi2model.py Fri Feb 03 18:46:16 2017 +0100 @@ -9,7 +9,7 @@ output_fn = "ismi_graph.gpickle" # OpenMind base URL -#baseURL="http://localhost:18080/om4-ismi/jsonInterface?" +#baseURL="http://ismi.mpiwg-berlin.mpg.de//om4-ismi/jsonInterface?" baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?" # node types to exclude from the graph @@ -282,8 +282,6 @@ # return -# In[119]: - def importAllEnts(etypes): for etype in etypes:
# importFromOpenMind/importer/ismixml2model.py
"""Copy graph from OpenMind-XML to networkx pickle.

Reads the ``entities`` and ``relations`` elements of an OpenMind XML dump,
builds a networkx MultiDiGraph from them and writes it as a gpickle file.

Usage: ismixml2model.py [input_xml [output_gpickle]]
"""
import xml.etree.ElementTree as ET
import json
import networkx
import sys

## configure behaviour

# output filename
output_fn = "ismi_graph.gpickle"

input_fn = "openmind-data.xml"


# node types to exclude from the graph
# NOTE(review): this list is only printed by the main section below; nothing
# in this file consults it while importing -- confirm whether entities of
# these types are supposed to be dropped.
exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']

# attributes to exclude
exclude_attributes_of_type = [
    'lw',
    'node_type',
    'nov',
    'notes_old'
]

# content types (compared lower-cased) that are stored as plain text;
# attributes without any content type are treated as text too
_text_content_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')

# name of type attribute
node_type_attribute = '_type'
rel_type_attribute = '_type'


nx_graph = networkx.MultiDiGraph()

# ismi_id -> node attribute dict, for every imported entity
nx_nodes = {}
ismi_relations = {}
# ismi_id -> return value of relationFromRel(), for every relation seen
nx_relations = {}

# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'INFO', 'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message prefixed with level if level is in logLevels."""
    if level in logLevels:
        print("%s: %s"%(level, message))


def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
    """Return name adjusted for use as a type or attribute name.

    is_src_rel is accepted but currently leaves the name unchanged.
    is_tar_rel prefixes the name with '<'.
    att_from_rel strips the fragments 'is_', 'has_', 'was_' and '_of'.
    """
    if is_tar_rel:
        name = '<' + name

    if att_from_rel:
        # clean up relations as attribute names
        for fragment in ('is_', 'has_', 'was_', '_of'):
            name = name.replace(fragment, '')

    return name


def parseYear(val):
    """Return the 'year' entry of a JSON-encoded date object, or None.

    The year is looked up under the 'from' member if present, otherwise
    under the 'date' member. Anything that cannot be parsed or does not have
    the expected shape yields None (best effort).
    """
    year = None
    try:
        date_json = json.loads(val)
        if 'from' in date_json:
            year = date_json['from'].get('year', None)
        elif 'date' in date_json:
            year = date_json['date'].get('year', None)
        else:
            log("WARNING", "don't know what to do with date %s"%(val))

    except (ValueError, TypeError, AttributeError):
        # invalid JSON or not the expected object structure: callers treat
        # None as "no year", so only leave a trace for debugging
        log('DEBUG', "unable to parse date %s"%repr(val))

    return year


def _parseAttributes(atts_elem):
    """Convert the children of an 'attributes' element into a dict.

    Returns a tuple (attrs, tail_text): attrs maps attribute names to their
    converted values, tail_text is the concatenated tail text of all child
    elements. atts_elem may be None, which yields ({}, '').
    """
    attrs = {}
    tails = []
    if atts_elem is None:
        return attrs, ''

    # go through all attributes
    for att_elem in atts_elem:
        if att_elem.tail is not None:
            # tail text is collected even for excluded attributes
            tails.append(att_elem.tail)

        ct = att_elem.get('content-type', None)
        name = att_elem.get('name', None)
        if name in exclude_attributes_of_type:
            # exclude attribute
            continue

        val = att_elem.text

        if ct is None or ct.lower() in _text_content_types:
            # normal text attribute (assume no content_type is text too...)
            if val and val.startswith('{'):
                # try to parse as date
                year = parseYear(val)
                if year is not None:
                    val = year

            # keep attribute (also when it has no text at all)
            attrs[name] = val

        elif ct == 'date':
            # date attribute: only kept when a year can be extracted
            if val is not None:
                year = parseYear(val)
                if year is not None:
                    attrs[name] = year

        elif ct == 'num':
            # number attribute, assume num is int
            if val is not None:
                try:
                    attrs[name] = int(val)
                except ValueError:
                    # one malformed number should not abort the whole import
                    log("ERROR", "attribute %s has non-integer num value %s"%(name, repr(val)))

        elif ct == 'old':
            # ignore attribute
            continue

        else:
            # ignore other content types
            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))

    return attrs, ''.join(tails)


def nodeFromEnt(ent_elem):
    """Create a graph node from the given XML entity.

    Adds the node to nx_graph under the entity's 'id' and returns the
    node's attribute dict.
    """
    atts_elem = ent_elem.find('attributes')
    if atts_elem is None:
        log('DEBUG', "entity has no attributes: %s"%ent_elem)

    attrs, tail_text = _parseAttributes(atts_elem)

    # own value: text content of the entity element plus collected tail text
    ov = (ent_elem.text or '') + tail_text

    # set type from the object class
    attrs[node_type_attribute] = fixName(ent_elem.get('object-class'))

    # rename id to ismi_id
    ismi_id = ent_elem.get('id')
    attrs['ismi_id'] = ismi_id

    if ov:
        # save ov as label
        attrs['label'] = ov

    # create node
    nx_graph.add_node(ismi_id, **attrs)
    # NOTE(review): Graph.node looks like the networkx 1.x accessor --
    # confirm against the installed networkx version before upgrading.
    node = nx_graph.node[ismi_id]

    return node


def relationFromRel(rel_elem):
    """Create graph relation from etree element.

    Returns None (after logging an ERROR) when the source or target node has
    not been imported; otherwise returns whatever nx_graph.add_edge returns.
    """
    rel_id = rel_elem.get('id')
    rel_name = rel_elem.get('object-class')
    src_id = rel_elem.get('source-id')
    tar_id = rel_elem.get('target-id')
    if src_id not in nx_nodes:
        log("ERROR", "relation %s src node %s missing!"%(rel_id,src_id))
        return None

    if tar_id not in nx_nodes:
        log("ERROR", "relation %s tar node %s missing!"%(rel_id,tar_id))
        return None

    # own value of relation is not useful, so the tail text is dropped
    attrs, _ = _parseAttributes(rel_elem.find('attributes'))

    attrs[rel_type_attribute] = fixName(rel_name)
    attrs['ismi_id'] = rel_id
    log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
    # create relation with type
    # NOTE(review): attr_dict= looks like the networkx 1.x calling
    # convention -- confirm against the installed networkx version.
    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)

    return nx_rel


def importEnts(ents_elem):
    """Import all entities from etree element ents_elem.

    Stops at the first entity whose id has already been imported.
    """
    xml_num = ents_elem.get('number')
    log('INFO', "XML says %s entities"%xml_num)

    # iterate through entities element
    for ent_elem in ents_elem:
        ismi_id = ent_elem.get('id')
        log('DEBUG', "reading entity[%s]"%ismi_id)

        if ismi_id in nx_nodes:
            log("ERROR", "entity with id=%s exists!"%ismi_id)
            return

        # create networkx node and save node reference
        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)


def importRels(rels_elem):
    """Import all relations from etree element rels_elem.

    Stops at the first relation whose id has already been seen.
    """
    xml_num = rels_elem.get('number')
    log('INFO', "XML says %s relations"%xml_num)

    # iterate through relations element
    for rel_elem in rels_elem:
        ismi_id = rel_elem.get('id')
        log('DEBUG', "reading relation[%s]"%ismi_id)

        if ismi_id in nx_relations:
            log("ERROR", "relation with id=%s exists!"%ismi_id)
            return

        # create networkx relation and save relation reference (the id is
        # recorded even when the relation could not be created)
        nx_relations[ismi_id] = relationFromRel(rel_elem)


def importAll():
    """Parse input_fn and import its entities, then its relations."""
    # parse XML file
    log('INFO', "parsing XML file %s"%input_fn)
    tree = ET.parse(input_fn)
    log('DEBUG', "etree ready")
    root = tree.getroot()

    # entities first: relations are only created between known nodes
    ents = root.find('entities')
    importEnts(ents)

    rels = root.find('relations')
    importRels(rels)


## main

print("Copy graph from OpenMind-XML to networkx pickle")

# parse command line parameters
if len(sys.argv) > 1:
    input_fn = sys.argv[1]

if len(sys.argv) > 2:
    output_fn = sys.argv[2]

# import everything
print("Reading graph from OpenMind-XML file %s"%input_fn)
if len(exclude_objects_of_type) > 0:
    print(" Skipping objects of type %s"%exclude_objects_of_type)

importAll()

print("Graph info: %s"%networkx.info(nx_graph))

# export pickle
networkx.write_gpickle(nx_graph, output_fn)
print("Wrote networkx pickle file %s"%output_fn)