changeset 46:f3945ef1e6a4

new importer for OM4XML dump file.
author casties
date Fri, 03 Feb 2017 18:46:16 +0100
parents 277ea02906f9
children 378dcb66a27f
files importFromOpenMind/importer/ismi2model.py importFromOpenMind/importer/ismixml2model.py
diffstat 2 files changed, 360 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2model.py	Fri Dec 09 12:24:21 2016 +0100
+++ b/importFromOpenMind/importer/ismi2model.py	Fri Feb 03 18:46:16 2017 +0100
@@ -9,7 +9,7 @@
 output_fn = "ismi_graph.gpickle"
 
 # OpenMind base URL
-#baseURL="http://localhost:18080/om4-ismi/jsonInterface?"
+#baseURL="http://ismi.mpiwg-berlin.mpg.de//om4-ismi/jsonInterface?"
 baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
 
 # node types to exclude from the graph
@@ -282,8 +282,6 @@
         #    return
 
 
-# In[119]:
-
 def importAllEnts(etypes):
     
     for etype in etypes:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/ismixml2model.py	Fri Feb 03 18:46:16 2017 +0100
@@ -0,0 +1,359 @@
+import xml.etree.ElementTree as ET
+import json
+import networkx
+import sys
+
+## configure behaviour
+
+# output filename
+# (default path of the networkx pickle; replaced by the second command line
+# argument when one is given)
+output_fn = "ismi_graph.gpickle"
+
+# input filename
+# (default path of the OpenMind XML dump; replaced by the first command line
+# argument when one is given)
+input_fn = "openmind-data.xml"
+
+
+# node types to exclude from the graph
+# (values are compared against the 'object-class' of an entity)
+exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
+
+# attributes to exclude
+# (attribute names that are dropped when the attributes of entities and
+# relations are converted)
+exclude_attributes_of_type = [
+    'lw',
+    'node_type',
+    'nov',
+    'notes_old'
+]
+
+# name of type attribute
+# (key under which the 'object-class' is stored on nodes and on edges)
+node_type_attribute = '_type'
+rel_type_attribute = '_type'
+
+#ismi_types=["PERSON","WITNESS","CODEX","PLACE","COLLECTION","REPOSITORY"]
+
+
+# graph that receives all imported nodes and edges
+nx_graph = networkx.MultiDiGraph()
+
+# entity id -> node as returned by nodeFromEnt(); used to detect duplicate
+# entity ids and to check that both ends of a relation exist
+nx_nodes = {}
+# NOTE(review): appears to be unused in this file
+ismi_relations = {}
+# relation id -> value returned by relationFromRel(); used to detect
+# duplicate relation ids
+nx_relations = {}
+
+# active log levels for logging
+# (log() prints a message only if its level is a member of this set)
+#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+logLevels = {'INFO', 'ERROR', 'SYSMSG'}
+
+def log(level, message):
+    """Print message prefixed with its level if the level is active.
+
+    A level is active when it is a member of the module-level set logLevels;
+    messages of any other level are dropped silently.
+    """
+    if level not in logLevels:
+        return
+
+    print("%s: %s"%(level, message))
+
+
+def fixName(name, is_src_rel=False, is_tar_rel=False, att_from_rel=False):
+    """Return name adapted for use as a type or attribute name in the graph.
+
+    is_src_rel:   accepted, but leaves the name unchanged.
+    is_tar_rel:   prefixes the name with '<'.
+    att_from_rel: removes every occurrence of 'is_', 'has_', 'was_' and '_of'
+                  (in that order) so that a relation name can serve as an
+                  attribute name.
+    """
+    # is_src_rel intentionally has no effect on the result
+    fixed = '<' + name if is_tar_rel else name
+
+    if att_from_rel:
+        # clean up relations as attribute names
+        for fragment in ('is_', 'has_', 'was_', '_of'):
+            fixed = fixed.replace(fragment, '')
+
+    return fixed
+
+
+
+def parseYear(val):
+    """Extract the year from a JSON encoded date value.
+
+    val is parsed as JSON. If the result has a 'from' member the 'year' entry
+    of that member is returned, otherwise the 'year' entry of the 'date'
+    member. If neither member exists a WARNING is logged.
+
+    Returns the year, or None if val can not be parsed or holds no year.
+    Parsing is best effort: malformed values never raise.
+    """
+    year = None
+    try:
+        date_json = json.loads(val)
+        if 'from' in date_json:
+            year = date_json['from'].get('year', None)
+        elif 'date' in date_json:
+            year = date_json['date'].get('year', None)
+        else:
+            log("WARNING", "don't know what to do with date %s"%(val))
+
+    except (ValueError, TypeError, AttributeError):
+        # ValueError: val is not valid JSON
+        # TypeError/AttributeError: val is not a string or the JSON does not
+        # have the expected object structure
+        # (only these are swallowed so that e.g. KeyboardInterrupt and
+        # programming errors are no longer hidden)
+        year = None
+
+    return year
+
+
+# content types whose value is kept as plain text
+_text_content_types = ('text', 'arabic', 'bool', 'boolean', 'url', 'language')
+
+
+def _parseAttributes(atts_elem):
+    """Convert the children of an 'attributes' element into a dict.
+
+    Shared by nodeFromEnt() and relationFromRel().
+
+    Every child is handled according to its 'content-type':
+      - missing or one of the text types: the text is kept; text starting
+        with '{' is replaced by its year if parseYear() finds one
+      - 'date': only the year found by parseYear() is kept
+      - 'num': the text is converted with int() (raises ValueError if the
+        text is not an integer literal)
+      - 'old' and unknown types: ignored (unknown types log a WARNING)
+    Children whose 'name' is in exclude_attributes_of_type are skipped.
+
+    Returns a tuple (attrs, tail_text) where attrs maps attribute names to
+    values and tail_text is the concatenated tail text of all children
+    (including skipped ones).
+    """
+    attrs = {}
+    tail_text = ''
+
+    for att_elem in atts_elem:
+        if att_elem.tail is not None:
+            # tail belongs to parent
+            tail_text += att_elem.tail
+
+        ct = att_elem.get('content-type', None)
+        name = att_elem.get('name', None)
+        if name in exclude_attributes_of_type:
+            # exclude attribute
+            continue
+
+        val = att_elem.text
+
+        if ct is None or ct.lower() in _text_content_types:
+            # normal text attribute (assume no content_type is text too...)
+            # (startswith is safe for empty strings, unlike val[0])
+            if val is not None and val.startswith('{'):
+                # try to parse as date
+                year = parseYear(val)
+                if year is not None:
+                    val = year
+
+            # keep attribute
+            attrs[name] = val
+
+        elif ct == 'date':
+            # date attribute
+            if val is not None:
+                # try to parse date object to get gregorian year
+                year = parseYear(val)
+                if year is not None:
+                    attrs[name] = year
+
+        elif ct == 'num':
+            # number attribute
+            if val is not None:
+                # keep attribute, assume num is int
+                attrs[name] = int(val)
+
+        elif ct == 'old':
+            # ignore attribute
+            continue
+
+        else:
+            # ignore other content types
+            # (level is WARNING like everywhere else in this file; the former
+            # 'WARN' could never match an entry of logLevels)
+            log("WARNING", "attribute with unknown content_type: %s"%repr(att_elem))
+
+    return attrs, tail_text
+
+
+def nodeFromEnt(ent_elem):
+    """Create a graph node from the given XML entity.
+
+    Adds a node with the id of the entity to the networkx graph nx_graph.
+    The node gets the converted attributes of the entity, its 'object-class'
+    as node_type_attribute, its id as 'ismi_id' and, if not empty, the text
+    content of the entity as 'label'.
+
+    Returns nx_graph.node[id] of the new node.
+    """
+    # text content of entity element
+    ov = ent_elem.text or ''
+
+    attrs = {}
+
+    # get attributes element
+    atts_elem = ent_elem.find('attributes')
+
+    if atts_elem is None:
+        log('DEBUG', "entity has no attributes: %s"%ent_elem)
+
+    else:
+        attrs, tail_text = _parseAttributes(atts_elem)
+        ov += tail_text
+
+    # set type
+    attrs[node_type_attribute] = fixName(ent_elem.get('object-class'))
+
+    # rename id to ismi_id
+    ismi_id = ent_elem.get('id')
+    attrs['ismi_id'] = ismi_id
+
+    if len(ov) > 0:
+        # save ov as label
+        attrs['label'] = ov
+
+    # create node
+    nx_graph.add_node(ismi_id, **attrs)
+    node = nx_graph.node[ismi_id]
+
+    return node
+
+
+def relationFromRel(rel_elem):
+    """Create graph relation from etree element.
+
+    Adds an edge from the node 'source-id' to the node 'target-id' to the
+    networkx graph nx_graph. The edge gets the converted attributes of the
+    relation, its 'object-class' as rel_type_attribute and its id as
+    'ismi_id'.
+
+    Returns the result of nx_graph.add_edge(), or None (after logging an
+    ERROR) if the source or target node is not in nx_nodes.
+    """
+    rel_id = rel_elem.get('id')
+    src_id = rel_elem.get('source-id')
+    tar_id = rel_elem.get('target-id')
+    if src_id not in nx_nodes:
+        log("ERROR", "relation %s src node %s missing!"%(rel_id,src_id))
+        return None
+
+    if tar_id not in nx_nodes:
+        log("ERROR", "relation %s tar node %s missing!"%(rel_id,tar_id))
+        return None
+
+    attrs = {}
+
+    # get attributes element
+    atts_elem = rel_elem.find('attributes')
+
+    if atts_elem is not None:
+        # own value (text content) of a relation is not useful
+        attrs, _ = _parseAttributes(atts_elem)
+
+    attrs[rel_type_attribute] = fixName(rel_elem.get('object-class'))
+    attrs['ismi_id'] = rel_id
+    log('DEBUG', "new edge(%s, %s, %s)"%(src_id, tar_id, attrs))
+    # create relation with type
+    # NOTE(review): attr_dict= looks like the networkx 1.x calling
+    # convention - confirm against the installed networkx version
+    nx_rel = nx_graph.add_edge(src_id, tar_id, attr_dict=attrs)
+
+    return nx_rel
+
+
+def importEnts(ents_elem):
+    """Import all entities from etree element ents_elem.
+
+    Creates a graph node with nodeFromEnt() for every child element and
+    remembers it in nx_nodes under the id of the entity. Entities whose
+    'object-class' is listed in exclude_objects_of_type are skipped.
+
+    The import stops at the first entity whose id is already in nx_nodes
+    (an ERROR is logged). If ents_elem is None an ERROR is logged and
+    nothing is imported.
+    """
+    if ents_elem is None:
+        log("ERROR", "no entities element to import!")
+        return
+
+    xml_num = ents_elem.get('number')
+    log('INFO', "XML says %s entities"%xml_num)
+
+    num_imported = 0
+    num_skipped = 0
+
+    # iterate through entities element
+    for ent_elem in ents_elem:
+        ismi_id = ent_elem.get('id')
+        log('DEBUG', "reading entity[%s]"%ismi_id)
+
+        if ent_elem.get('object-class') in exclude_objects_of_type:
+            # the script announces that these types are skipped, so do not
+            # create nodes for them
+            log('DEBUG', "skipping excluded entity[%s]"%ismi_id)
+            num_skipped += 1
+            continue
+
+        if ismi_id in nx_nodes:
+            log("ERROR", "entity with id=%s exists!"%ismi_id)
+            return
+
+        # create networkx node and save node reference
+        nx_nodes[ismi_id] = nodeFromEnt(ent_elem)
+        num_imported += 1
+
+    # lets the user compare the result with the number given in the XML
+    log('INFO', "imported %s entities (%s skipped)"%(num_imported, num_skipped))
+
+
+def importRels(rels_elem):
+    """Import all relations from etree element rels_elem.
+
+    Calls relationFromRel() for every child element and remembers the result
+    in nx_relations under the id of the relation (the result is None for
+    relations that could not be created).
+
+    The import stops at the first relation whose id is already in
+    nx_relations (an ERROR is logged). If rels_elem is None an ERROR is
+    logged and nothing is imported.
+    """
+    if rels_elem is None:
+        log("ERROR", "no relations element to import!")
+        return
+
+    xml_num = rels_elem.get('number')
+    log('INFO', "XML says %s relations"%xml_num)
+
+    num_processed = 0
+
+    # iterate through relations element
+    for rel_elem in rels_elem:
+        ismi_id = rel_elem.get('id')
+        log('DEBUG', "reading relation[%s]"%ismi_id)
+
+        if ismi_id in nx_relations:
+            # use log() like importEnts() so the message obeys logLevels
+            log("ERROR", "relation with id=%s exists!"%ismi_id)
+            return
+
+        # create networkx relation and save relation reference
+        nx_relations[ismi_id] = relationFromRel(rel_elem)
+        num_processed += 1
+
+    # lets the user compare the result with the number given in the XML
+    log('INFO', "processed %s relations"%num_processed)
+
+
+def importAll():
+    """Read the XML file input_fn and import its entities and relations.
+
+    Entities (child 'entities' of the root element) are imported first
+    because importing the relations (child 'relations') requires that the
+    nodes at both ends of a relation are already known.
+    """
+    log('INFO', "parsing XML file %s"%input_fn)
+    root = ET.parse(input_fn).getroot()
+    log('DEBUG', "etree ready")
+
+    # nodes first, then the edges between them
+    importEnts(root.find('entities'))
+    importRels(root.find('relations'))
+
+## main
+
+print("Copy graph from OpenMind-XML to networkx pickle")
+
+# optional command line parameters: [input file [output file]]
+if len(sys.argv) >= 2:
+    input_fn = sys.argv[1]
+
+if len(sys.argv) >= 3:
+    output_fn = sys.argv[2]
+
+# read the XML dump into nx_graph
+print("Reading graph from OpenMind-XML file %s"%input_fn)
+if exclude_objects_of_type:
+    print("  Skipping objects of type %s"%exclude_objects_of_type)
+
+importAll()
+
+print("Graph info: %s"%networkx.info(nx_graph))
+
+# write the graph as networkx pickle
+networkx.write_gpickle(nx_graph, output_fn)
+print("Wrote networkx pickle file %s"%output_fn)