# HG changeset patch
# User casties
# Date 1481221142 -3600
# Node ID bb5390f752fc04153da32f5f664894d6616fe78e
# Parent  4dee9586cc44cf40fce732e05c4f60f88f81df60
new ISMI transaction log analyser.

diff -r 4dee9586cc44 -r bb5390f752fc importFromOpenMind/importer/check_ismi_transaction_log.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/importFromOpenMind/importer/check_ismi_transaction_log.py	Thu Dec 08 19:19:02 2016 +0100
@@ -0,0 +1,443 @@
+
+# coding: utf-8
+
+import sys, re
+
+# max number of lines to read (for testing)
+maxLinecnt = None
+
+# do not output deleted nodes
+omitDeleted = True
+
+# active log levels for logging
+logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
+#logLevels = {'ERROR', 'SYSMSG'}
+
+def log(level, message):
+    if level in logLevels:
+        print("%s: %s"%(level, message))
+
+
+def prettyPrintNode(node):
+    nt = node['node-type']
+    att = ''
+    if nt == 'ENTITY':
+        att = " %s=%s "%('oc',node['object-class'])
+        
+    elif nt == 'ATTRIBUTE':
+        att = " %s=%s "%('name',node['name'])
+        
+    elif nt == 'RELATION':
+        att = " %s=%s "%('oc',node['object-class'])
+
+    s = "%s%s[%s]"%(nt, att, node)
+    return s
+
+
+nodeCsvFieldList = ['node_type', 'id', 'row_id', 'object_class', 'user', 'public', 'type',
+                    'version', 'modification_time', 'system_status', 'content_type',
+                    'source_id', 'source_modif', 'source_obj_class', 'target_id', 'target_modif', 'target_obj_class',
+                    'own_value_b64']
+
+nodeCsvFieldMap = {'node_type':'node-type', 'id':'id', 'row_id':'row-id', 'object_class':'object-class', 'user':'user', 
+                   'public':'public', 'type':'type',
+                    'version':'version', 'modification_time':'mtime', 'system_status':'system-status', 'content_type':'content-type',
+                    'source_id':'source-id', 'source_modif':'source-mtime', 'source_obj_class':'source-oc', 
+                    'target_id':'target-id', 'target_modif':'target-mtime', 'target_obj_class':'target-oc',
+                    'own_value_b64':'b64-value'}
+
+
+def printHeaderCsv(outFile):
+    s = ""
+    for cf in nodeCsvFieldList:
+        if s == "":
+            s += "%s"%cf
+        else:
+            s += ",%s"%cf
+            
+    print(s, file=outFile)
+
+
+def printNodeCsv(node, outFile):
+    s = ""
+    for cf in nodeCsvFieldList:
+        nf = nodeCsvFieldMap[cf]
+        if s == "":
+            s += "%s"%(node.get(nf, ''))
+        else:
+            s += ",%s"%(node.get(nf, ''))
+            
+    print(s, file=outFile)
+
+
+def parseStart(line):
+    tstamp = None
+    tm = re.match('(\d+-\d+-\d+ \d+:\d+:\d+)', line)
+    if tm:
+        tstamp = tm.group(1)
+        
+    sm = re.search('START Saving (\w+) id=(\w+)', line)
+    if sm:
+        return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
+    
+    sm = re.search('START remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
+    if sm:
+        return {'time': tstamp, 'id': sm.group(1)}
+    
+    sm = re.search('END Saving (\w+) id=(\w+)', line)
+    if sm:
+        return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
+    
+    sm = re.search('END remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
+    if sm:
+        return {'time': tstamp, 'id': sm.group(1)}
+    
+    return None
+
+
+def parseSave(line):
+    match = re.search('([A-Z]+)\[([^\]]+)\]', line)
+    if match:
+        data = {'node-type': match.group(1)}
+        segs = match.group(2).split(', ')
+        for seg in segs:
+            k, v = seg.split('=', 1)
+            
+            data[k] = v.strip('"')
+            
+            # normalize attribute's name to object-class
+            if k == 'name':
+                data['object-class'] = v.strip('"')
+            
+        return data
+    
+    return None
+
+
+def equalNodes(prev, cur):
+    #log("DEBUG", "compare: %s vs %s"%(prev, cur))
+    if prev['id'] != cur['id']:
+        log("INFO", "node id mismatch!")
+        return False
+    
+    if prev['node-type'] != cur['node-type']:
+        log("INFO", "node node-type mismatch!")
+        return False
+    
+    if prev.get('source-id', None) != cur.get('source-id', None):
+        log("INFO", "node source_id mismatch!")
+        return False
+    
+    if prev.get('target-id', None) != cur.get('target-id', None):
+        log("INFO", "node target_id mismatch!")
+        return False
+    
+    if prev['b64-value'] != cur['b64-value']:
+        log("INFO", "node ownvalue mismatch!")
+        return False
+    
+    return True
+
+
+def getSimilarNode(prev, curList):
+    nt = prev['node-type']
+    if nt == 'ATTRIBUTE':
+        for n in curList:
+            if n['node-type'] == 'ATTRIBUTE' \
+              and prev['name'] == n['name']:
+                # attribute with same name
+                log("DEBUG", "similar attributes: %s vs %s"%(prev, n))
+                return n
+            
+    elif nt == 'RELATION':
+        for n in curList:
+            if n['node-type'] == 'RELATION' \
+              and prev['source-id'] == n['source-id'] \
+              and prev['target-id'] == n['target-id'] \
+              and prev['object-class'] == n['object-class']:
+                # relation with same source, target and type
+                log("DEBUG", "similar relations: %s vs %s"%(prev, n))
+                return n
+            
+    return None
+
+
+def compareNodeLists(prev, cur, ctx, lostFile=None):
+    """compare list of previous and current nodes.
+    
+    prints results to the log and output file.
+    """
+    prevNodes = {}
+    curNodes = {}
+    
+    #
+    # read nodes
+    #
+    for n in prev:
+        nid = n['id']
+        if nid not in prevNodes:
+            prevNodes[nid] = n
+        else:
+            log("DEBUG", "duplicate save of prev node id="+nid)
+            if isinstance(prevNodes[nid], list):
+                prevNodes[nid].append(n)
+            else:
+                prevNodes[nid] = [prevNodes[nid], n]
+        
+    for n in cur:
+        nid = n['id']
+        if nid not in curNodes:
+            curNodes[nid] = n
+        else:
+            log("DEBUG", "duplicate save of cur node id="+nid)
+            if isinstance(curNodes[nid], list):
+                curNodes[nid].append(n)
+            else:
+                curNodes[nid] = [curNodes[nid], n]
+        
+    #
+    # compare nodes
+    #
+    curNodeUnchecked = set(curNodes.keys())
+    addPrevNodes = []
+    addCurNodes = []
+    
+    for nid in prevNodes:
+        prevNode = prevNodes[nid]
+        if isinstance(prevNode, list):
+            log("DEBUG", "multi-save prev node: %s"%prevNode)
+            # use the last version(?)
+            prevNode = prevNode[-1]
+            
+        if nid not in curNodes:
+            if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '':
+                # emtpy attribute - ignore
+                continue
+                
+            else:
+                log("DEBUG", "node %s not in cur saves! %s"%(nid,prevNode))
+                addPrevNodes.append(prevNode)
+                continue
+            
+        curNode = curNodes[nid]
+        if isinstance(curNode, list):
+            log("DEBUG", "multi-save cur node: %s"%curNode)
+            # use the last version?
+            curNode = curNode[-1]
+            
+        equalNodes(prevNode, curNode)
+
+        curNodeUnchecked.remove(nid)
+        
+    # make list of additional current (=new) nodes
+    for nid in curNodeUnchecked:
+        curNode = curNodes[nid]
+        # list can contain lists
+        if isinstance(curNode, list):
+            # use the last version(?)
+            curNode = curNode[-1]
+            
+        addCurNodes.append(curNode)
+        log("DEBUG", "new node %s"%curNodes[nid])
+        
+    # compare missing and new nodes
+    for n in addPrevNodes.copy():
+        sn = getSimilarNode(n, addCurNodes)
+        if sn is not None:
+            # similar is good enough
+            addPrevNodes.remove(n)
+            addCurNodes.remove(sn)
+    
+    if len(addPrevNodes) > 0:
+        #print("ERROR: lost nodes: %s"%[prettyPrintNode(n) for n in addPrevNodes])
+        log("ERROR", "in %s"%ctx)
+        for n in addPrevNodes:
+            log("ERROR","lost node: %s"%prettyPrintNode(n))
+            if lostFile is not None:
+                printNodeCsv(n, lostFile)
+        
+    if len(addCurNodes) > 0:
+        #print("INFO: new nodes: %s"%[prettyPrintNode(n) for n in addCurNodes])
+        for n in addCurNodes:
+            log("INFO", "new node: %s"%prettyPrintNode(n))
+        
+
+
+def analyseLogfile(inFilename, outFilename=None):
+    outFile = None
+    if outFilename is not None:
+        outFile = open(outFilename, mode='w')
+        printHeaderCsv(outFile)
+    
+    with open(inFilename) as f:
+        linecnt = 0
+        saving = 0
+        openSaves = {}
+        savingPrev = 0
+        deleting = 0
+        saveCtx = None
+        deleteCtx = None
+        prevSaves = []
+        saves = []
+        
+        for line in f:
+            linecnt += 1
+            #
+            # parsing lines in order of specific match, not order of operation!
+            #
+            if '*** START Saving' in line:
+                saving += 1
+                # make sure delete is off
+                deleting = 0
+                log('DEBUG', line)
+                # parse time and id
+                saveCtx = parseStart(line)
+                saveId = saveCtx['id']
+                if saveId in openSaves:
+                    log('ERROR', "Duplicate save for same id! %s"%saveCtx)
+                    # assume the first save was spurious
+                    saving -= 1
+                    
+                openSaves[saveId] = saveCtx
+                
+                if len(openSaves) > 1:
+                    log("ERROR", "Multiple open save (%s) in #%s of %s"%(saving, linecnt, line))
+                    # TODO: what now?
+                    
+            elif '*** END Saving' in line:
+                saving -= 1
+                saveCtx = parseStart(line)
+                saveId = saveCtx['id']
+                if saveId in openSaves:
+                    log('DEBUG', "End save matches start save: %s"%saveCtx)
+                    del openSaves[saveId]
+                    
+                elif 'null' in openSaves:
+                    log('DEBUG', "Assume end save (of %s) matches start save null: %s"%(len(openSaves), saveCtx))
+                    del openSaves['null']
+                    
+                else:
+                    log("ERROR", "End save without start save! %s"%saveCtx)
+                
+                # make sure delete is off
+                deleting = 0
+                log('DEBUG', line)
+                
+                if len(openSaves) > 0:
+                    log('WARNING', "Still open saves (%s) during end save in #%s of %s"%(saving, linecnt, line))
+                
+                elif saving < 0:
+                    log('ERROR', "Too many END saves!")
+                    break
+                    
+                log("INFO", "saving %s"%saveCtx)
+                log("INFO", "prev saves: %s"%len(prevSaves))
+                log("INFO", "saves: %s"%len(saves))
+                
+                if len(prevSaves) > 0:
+                    compareNodeLists(prevSaves, saves, saveCtx, outFile)
+                
+                prevSaves = []
+                saves = []
+                            
+            elif '** START save entity' in line:
+                pass
+
+            elif '** END save entity' in line:
+                pass
+
+            elif '** START remove entity' in line:
+                deleting += 1
+                log('DEBUG', line)
+                deleteCtx = parseStart(line)
+                
+                if deleting > 1:
+                    log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
+                    # TODO: what now?
+                    break
+                    
+            elif '** END remove entity' in line:
+                deleting -= 1
+                log('DEBUG', line)
+                ctx = parseStart(line)
+                if deleteCtx['id'] != ctx['id']:
+                    log('ERROR', "END delete has different ID!")
+                    break
+                
+                if deleting > 1:
+                    log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
+                    # TODO: what now?
+                    break
+                
+                deleteCtx = None
+                    
+            elif '* START save entity' in line:
+                pass
+
+            elif '* START save previous' in line:
+                savingPrev += 1
+
+            elif '* ...END save entity' in line:
+                pass
+
+            elif '* END ...save previous' in line:                
+                savingPrev -= 1
+                
+                if saving < 0:
+                    log("ERROR", "Too many END save previous!")
+            
+            elif 'save previous' in line:
+                data = parseSave(line)
+                if data is None:
+                    log("DEBUG", "Error parsing line: %s"%line)
+                    continue
+                    
+                if omitDeleted and deleting > 0 and savingPrev > 0:
+                    # this should be a save prev from deleting
+                    delId = deleteCtx['id']
+                    # check if node is related to deleted id
+                    if data.get('id', None) == delId or data.get('source-id', None) == delId \
+                        or data.get('target-id', None) == delId:
+                        log('DEBUG', "intentionally deleted node: %s"%data)
+                    
+                    else:
+                        log('ERROR', "Node without matching id in delete! %s"%data)
+                        prevSaves.append(data)
+                        
+                else:
+                    prevSaves.append(data)
+                
+            elif 'save' in line:
+                data = parseSave(line)
+                if data is None:
+                    log("DEBUG", "Error parsing line: %s"%line)
+                    continue
+                
+                saves.append(parseSave(line))
+
+            if maxLinecnt is not None and linecnt >= maxLinecnt:
+                break
+                
+        log("SYSMSG", "%s lines of logfile scanned"%linecnt)
+        
+
+#        
+# public static void main :-)
+#
+
+input_fn = None
+output_fn = None
+
+# parse command line parameters
+if len(sys.argv) > 2:
+    input_fn = sys.argv[1]
+    output_fn = sys.argv[2]
+
+    # run analysis
+    analyseLogfile(input_fn, output_fn)
+
+else:
+    print("ERROR: missing parameters!")
+    print("use: check_ismi_log logfile csvfile")
+    exit(1)