importFromOpenMind/importer/check_ismi_transaction_log.py @ 43:bb5390f752fc

New ISMI transaction log analyser.
author: casties
date:   Thu, 08 Dec 2016 19:19:02 +0100


# coding: utf-8
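"""Analyse an ISMI OpenMind transaction log.

Scans the log for save and delete transactions, compares the set of nodes
saved before ("save previous") and after each transaction, and writes nodes
that appear to have been lost to a CSV file.
"""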

import sys, re

# max number of lines to read (for testing)
maxLinecnt = None

# do not output deleted nodes
omitDeleted = True

# active log levels for logging
logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'ERROR', 'SYSMSG'}

def log(level, message):
    if level in logLevels:
        print("%s: %s"%(level, message))


def prettyPrintNode(node):
    """Return a short human-readable representation of a node dict."""
    nt = node['node-type']
    att = ''
    if nt == 'ENTITY':
        att = " %s=%s "%('oc',node['object-class'])
        
    elif nt == 'ATTRIBUTE':
        att = " %s=%s "%('name',node['name'])
        
    elif nt == 'RELATION':
        att = " %s=%s "%('oc',node['object-class'])

    s = "%s%s[%s]"%(nt, att, node)
    return s


nodeCsvFieldList = ['node_type', 'id', 'row_id', 'object_class', 'user', 'public', 'type',
                    'version', 'modification_time', 'system_status', 'content_type',
                    'source_id', 'source_modif', 'source_obj_class', 'target_id', 'target_modif', 'target_obj_class',
                    'own_value_b64']

nodeCsvFieldMap = {'node_type':'node-type', 'id':'id', 'row_id':'row-id', 'object_class':'object-class', 'user':'user', 
                   'public':'public', 'type':'type',
                    'version':'version', 'modification_time':'mtime', 'system_status':'system-status', 'content_type':'content-type',
                    'source_id':'source-id', 'source_modif':'source-mtime', 'source_obj_class':'source-oc', 
                    'target_id':'target-id', 'target_modif':'target-mtime', 'target_obj_class':'target-oc',
                    'own_value_b64':'b64-value'}
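
# Example (hypothetical node dict, as produced by parseSave below):
#   {'node-type': 'ATTRIBUTE', 'id': '1234568', 'row-id': '42', 'name': 'title',
#    'object-class': 'title', 'b64-value': 'dGl0bGU='}
# printNodeCsv would write it with node_type, id, row_id, object_class and
# own_value_b64 filled in and all other columns left empty.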


def printHeaderCsv(outFile):
    """Write the CSV header line to outFile."""
    print(",".join(nodeCsvFieldList), file=outFile)


def printNodeCsv(node, outFile):
    """Write a node as one CSV line to outFile; missing fields are left empty."""
    s = ",".join("%s"%node.get(nodeCsvFieldMap[cf], '') for cf in nodeCsvFieldList)
    print(s, file=outFile)


def parseStart(line):
    """Parse the timestamp, object class and id from a START/END Saving or remove entity line."""
    tstamp = None
    tm = re.match(r'(\d+-\d+-\d+ \d+:\d+:\d+)', line)
    if tm:
        tstamp = tm.group(1)
        
    sm = re.search(r'START Saving (\w+) id=(\w+)', line)
    if sm:
        return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
    
    sm = re.search(r'START remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
    if sm:
        return {'time': tstamp, 'id': sm.group(1)}
    
    sm = re.search(r'END Saving (\w+) id=(\w+)', line)
    if sm:
        return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
    
    sm = re.search(r'END remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
    if sm:
        return {'time': tstamp, 'id': sm.group(1)}
    
    return None
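
# Examples of log lines handled by parseStart (timestamps, ids and user names
# are hypothetical; the format is inferred from the regular expressions above):
#   2016-12-08 19:19:02 ... *** START Saving TEXT id=1234567
#   2016-12-08 19:19:05 ... ** START remove entity: user=xy entity=Entity[rowId=42, id=1234567, ...]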


def parseSave(line):
    """Parse a node dump of the form TYPE[key=value, ...] into a dict."""
    match = re.search(r'([A-Z]+)\[([^\]]+)\]', line)
    if match:
        data = {'node-type': match.group(1)}
        segs = match.group(2).split(', ')
        for seg in segs:
            k, v = seg.split('=', 1)
            
            data[k] = v.strip('"')
            
            # normalize attribute's name to object-class
            if k == 'name':
                data['object-class'] = v.strip('"')
            
        return data
    
    return None
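
# Example of a node dump handled by parseSave (values are hypothetical; the
# TYPE[key=value, ...] layout and key names are inferred from how the rest of
# the script reads the parsed dict):
#   ... ATTRIBUTE[id=1234568, name="title", b64-value="dGl0bGU="]
# is parsed into
#   {'node-type': 'ATTRIBUTE', 'id': '1234568', 'name': 'title',
#    'object-class': 'title', 'b64-value': 'dGl0bGU='}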


def equalNodes(prev, cur):
    """Compare two node dicts field by field; log and return False on the first mismatch."""
    #log("DEBUG", "compare: %s vs %s"%(prev, cur))
    if prev['id'] != cur['id']:
        log("INFO", "node id mismatch!")
        return False
    
    if prev['node-type'] != cur['node-type']:
        log("INFO", "node node-type mismatch!")
        return False
    
    if prev.get('source-id', None) != cur.get('source-id', None):
        log("INFO", "node source_id mismatch!")
        return False
    
    if prev.get('target-id', None) != cur.get('target-id', None):
        log("INFO", "node target_id mismatch!")
        return False
    
    if prev['b64-value'] != cur['b64-value']:
        log("INFO", "node ownvalue mismatch!")
        return False
    
    return True


def getSimilarNode(prev, curList):
    """Return a node from curList that is similar to prev: an attribute with the
    same name, or a relation with the same source, target and object class."""
    nt = prev['node-type']
    if nt == 'ATTRIBUTE':
        for n in curList:
            if n['node-type'] == 'ATTRIBUTE' \
              and prev['name'] == n['name']:
                # attribute with same name
                log("DEBUG", "similar attributes: %s vs %s"%(prev, n))
                return n
            
    elif nt == 'RELATION':
        for n in curList:
            if n['node-type'] == 'RELATION' \
              and prev['source-id'] == n['source-id'] \
              and prev['target-id'] == n['target-id'] \
              and prev['object-class'] == n['object-class']:
                # relation with same source, target and type
                log("DEBUG", "similar relations: %s vs %s"%(prev, n))
                return n
            
    return None


def compareNodeLists(prev, cur, ctx, lostFile=None):
    """compare list of previous and current nodes.
    
    prints results to the log and output file.
    """
    prevNodes = {}
    curNodes = {}
    
    #
    # read nodes
    #
    for n in prev:
        nid = n['id']
        if nid not in prevNodes:
            prevNodes[nid] = n
        else:
            log("DEBUG", "duplicate save of prev node id="+nid)
            if isinstance(prevNodes[nid], list):
                prevNodes[nid].append(n)
            else:
                prevNodes[nid] = [prevNodes[nid], n]
        
    for n in cur:
        nid = n['id']
        if nid not in curNodes:
            curNodes[nid] = n
        else:
            log("DEBUG", "duplicate save of cur node id="+nid)
            if isinstance(curNodes[nid], list):
                curNodes[nid].append(n)
            else:
                curNodes[nid] = [curNodes[nid], n]
        
    #
    # compare nodes
    #
    curNodeUnchecked = set(curNodes.keys())
    addPrevNodes = []
    addCurNodes = []
    
    for nid in prevNodes:
        prevNode = prevNodes[nid]
        if isinstance(prevNode, list):
            log("DEBUG", "multi-save prev node: %s"%prevNode)
            # use the last version(?)
            prevNode = prevNode[-1]
            
        if nid not in curNodes:
            if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '':
                # empty attribute - ignore
                continue
                
            else:
                log("DEBUG", "node %s not in cur saves! %s"%(nid,prevNode))
                addPrevNodes.append(prevNode)
                continue
            
        curNode = curNodes[nid]
        if isinstance(curNode, list):
            log("DEBUG", "multi-save cur node: %s"%curNode)
            # use the last version?
            curNode = curNode[-1]
            
        # equalNodes logs any field mismatches between the two versions
        equalNodes(prevNode, curNode)

        curNodeUnchecked.remove(nid)
        
    # make list of additional current (=new) nodes
    for nid in curNodeUnchecked:
        curNode = curNodes[nid]
        # list can contain lists
        if isinstance(curNode, list):
            # use the last version(?)
            curNode = curNode[-1]
            
        addCurNodes.append(curNode)
        log("DEBUG", "new node %s"%curNodes[nid])
        
    # compare missing and new nodes
    for n in addPrevNodes.copy():
        sn = getSimilarNode(n, addCurNodes)
        if sn is not None:
            # similar is good enough
            addPrevNodes.remove(n)
            addCurNodes.remove(sn)
    
    if len(addPrevNodes) > 0:
        #print("ERROR: lost nodes: %s"%[prettyPrintNode(n) for n in addPrevNodes])
        log("ERROR", "in %s"%ctx)
        for n in addPrevNodes:
            log("ERROR","lost node: %s"%prettyPrintNode(n))
            if lostFile is not None:
                printNodeCsv(n, lostFile)
        
    if len(addCurNodes) > 0:
        #print("INFO: new nodes: %s"%[prettyPrintNode(n) for n in addCurNodes])
        for n in addCurNodes:
            log("INFO", "new node: %s"%prettyPrintNode(n))
        


def analyseLogfile(inFilename, outFilename=None):
    """Scan the transaction log inFilename and write lost nodes as CSV to outFilename."""
    outFile = None
    if outFilename is not None:
        outFile = open(outFilename, mode='w')
        printHeaderCsv(outFile)
    
    with open(inFilename) as f:
        linecnt = 0
        saving = 0
        openSaves = {}
        savingPrev = 0
        deleting = 0
        saveCtx = None
        deleteCtx = None
        prevSaves = []
        saves = []
        
        for line in f:
            linecnt += 1
            #
            # parsing lines in order of specific match, not order of operation!
            #
            if '*** START Saving' in line:
                saving += 1
                # make sure delete is off
                deleting = 0
                log('DEBUG', line)
                # parse time and id
                saveCtx = parseStart(line)
                saveId = saveCtx['id']
                if saveId in openSaves:
                    log('ERROR', "Duplicate save for same id! %s"%saveCtx)
                    # assume the first save was spurious
                    saving -= 1
                    
                openSaves[saveId] = saveCtx
                
                if len(openSaves) > 1:
                    log("ERROR", "Multiple open save (%s) in #%s of %s"%(saving, linecnt, line))
                    # TODO: what now?
                    
            elif '*** END Saving' in line:
                saving -= 1
                saveCtx = parseStart(line)
                saveId = saveCtx['id']
                if saveId in openSaves:
                    log('DEBUG', "End save matches start save: %s"%saveCtx)
                    del openSaves[saveId]
                    
                elif 'null' in openSaves:
                    log('DEBUG', "Assume end save (of %s) matches start save null: %s"%(len(openSaves), saveCtx))
                    del openSaves['null']
                    
                else:
                    log("ERROR", "End save without start save! %s"%saveCtx)
                
                # make sure delete is off
                deleting = 0
                log('DEBUG', line)
                
                if len(openSaves) > 0:
                    log('WARNING', "Still open saves (%s) during end save in #%s of %s"%(saving, linecnt, line))
                
                elif saving < 0:
                    log('ERROR', "Too many END saves!")
                    break
                    
                log("INFO", "saving %s"%saveCtx)
                log("INFO", "prev saves: %s"%len(prevSaves))
                log("INFO", "saves: %s"%len(saves))
                
                if len(prevSaves) > 0:
                    compareNodeLists(prevSaves, saves, saveCtx, outFile)
                
                prevSaves = []
                saves = []
                            
            elif '** START save entity' in line:
                pass

            elif '** END save entity' in line:
                pass

            elif '** START remove entity' in line:
                deleting += 1
                log('DEBUG', line)
                deleteCtx = parseStart(line)
                
                if deleting > 1:
                    log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
                    # TODO: what now?
                    break
                    
            elif '** END remove entity' in line:
                deleting -= 1
                log('DEBUG', line)
                ctx = parseStart(line)
                if deleteCtx['id'] != ctx['id']:
                    log('ERROR', "END delete has different ID!")
                    break
                
                if deleting > 1:
                    log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
                    # TODO: what now?
                    break
                
                deleteCtx = None
                    
            elif '* START save entity' in line:
                pass

            elif '* START save previous' in line:
                savingPrev += 1

            elif '* ...END save entity' in line:
                pass

            elif '* END ...save previous' in line:                
                savingPrev -= 1
                
                if savingPrev < 0:
                    log("ERROR", "Too many END save previous!")
            
            elif 'save previous' in line:
                data = parseSave(line)
                if data is None:
                    log("DEBUG", "Error parsing line: %s"%line)
                    continue
                    
                if omitDeleted and deleting > 0 and savingPrev > 0:
                    # this should be a save prev from deleting
                    delId = deleteCtx['id']
                    # check if node is related to deleted id
                    if data.get('id', None) == delId or data.get('source-id', None) == delId \
                        or data.get('target-id', None) == delId:
                        log('DEBUG', "intentionally deleted node: %s"%data)
                    
                    else:
                        log('ERROR', "Node without matching id in delete! %s"%data)
                        prevSaves.append(data)
                        
                else:
                    prevSaves.append(data)
                
            elif 'save' in line:
                data = parseSave(line)
                if data is None:
                    log("DEBUG", "Error parsing line: %s"%line)
                    continue
                
                saves.append(data)

            if maxLinecnt is not None and linecnt >= maxLinecnt:
                break
                
        log("SYSMSG", "%s lines of logfile scanned"%linecnt)
        

#        
# public static void main :-)
#
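# Example invocation (file names are hypothetical):
#   python3 check_ismi_transaction_log.py openmind_transaction.log lost_nodes.csv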

input_fn = None
output_fn = None

# parse command line parameters
if len(sys.argv) > 2:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

    # run analysis
    analyseLogfile(input_fn, output_fn)

else:
    print("ERROR: missing parameters!")
    print("use: check_ismi_transaction_log.py logfile csvfile")
    sys.exit(1)