changeset 35:d535f11a0d81

Be more aggressive about parsing dates in text-type fields.
author casties
date Tue, 02 Feb 2016 16:54:03 +0100
parents 74dfaed3600b
children 9a9a6da1d415
files importFromOpenMind/importer/ismi2model.py
diffstat 1 files changed, 44 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/importFromOpenMind/importer/ismi2model.py	Tue Feb 02 15:16:29 2016 +0100
+++ b/importFromOpenMind/importer/ismi2model.py	Tue Feb 02 16:54:03 2016 +0100
@@ -9,8 +9,8 @@
 output_fn = "ismi_graph.gpickle"
 
 # OpenMind base URL
-baseURL="http://localhost:18080/om4-ismi/jsonInterface?"
-#baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
+#baseURL="http://localhost:18080/om4-ismi/jsonInterface?"
+baseURL="http://localhost:18080/ismi-richfaces/jsonInterface?"
 
 # node types to exclude from the graph
 exclude_objects_of_type = ['DIGITALIZATION', 'REFERENCE']
@@ -73,6 +73,24 @@
     return name
 
 
+
+def parseYear(val):
+    year = None
+    try:
+        date_json = json.loads(val)
+        if 'from' in date_json:
+            year = date_json['from'].get('year', None)
+        elif 'date' in date_json:
+            year = date_json['date'].get('year', None)
+        else:
+            print("don't know what to do with date %s"%(val))
+            
+    except:
+        pass
+    
+    return year
+
+
 def nodeFromEnt(ent, etype):
     """Create a Neo4J node from the given JSON entity.
     
@@ -82,53 +100,42 @@
     # go through all attributes
     for att in ent['atts']:
         ct = att.get('content_type', None)
-        if ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
+        name = att.get('name', None)
+        if name in exclude_attributes_of_type:
+            # exclude attribute
+            continue
+
+        # try date first since some date attributes have the wrong type
+        if ct == 'date':
+            # date attribute
+            val = att['ov']
+            # try to parse date object to get gregorian year
+            year = parseYear(val)
+            if year is not None:
+                attrs[name] = year
+            
+        elif ct is None or ct.lower() in ['text', 'arabic', 'bool', 'boolean', 'url', 'language']:
             # normal text attribute (assume no content_type is text too...)
-            key = att['name']
             val = att['ov']
             
-            if key in exclude_attributes_of_type:
-                # exclude attribute
-                continue
-
+            if val[0] == '{':
+                # try to parse as date
+                year = parseYear(val)
+                if year is not None:
+                    val = year
+                
             # keep attribute
-            attrs[key] = val
+            attrs[name] = val
             if 'nov' in att:
                 # add normalized value
-                attrs['_n_'+key] = att['nov']
+                attrs['_n_'+name] = att['nov']
             
         elif ct == 'num':
             # number attribute
-            key = att['name']
             val = att['ov']
             
-            if key in exclude_attributes_of_type:
-                # exclude attribute
-                continue
-
             # keep attribute, assume num is int
-            attrs[key] = int(val)
-            
-        elif ct == 'date':
-            # date attribute
-            key = att['name']
-            val = att['ov']
-            # try to parse date object to get gregorian year
-            try:
-                year = None
-                date_json = json.loads(val)
-                if 'from' in date_json:
-                    year = date_json['from'].get('year', None)
-                elif 'date' in date_json:
-                    year = date_json['date'].get('year', None)
-                else:
-                    print("don't know what to do with date on %s: %s=%s"%(ent['id'],key,val))
-                    
-                if year is not None:
-                    attrs[key] = year
-                    
-            except:
-                print("ERROR: invalid JSON in date: %s"%repr(val))
+            attrs[name] = int(val)
             
         elif ct == 'old':
             # ignore attribute