# HG changeset patch # User casties # Date 1481221142 -3600 # Node ID bb5390f752fc04153da32f5f664894d6616fe78e # Parent 4dee9586cc44cf40fce732e05c4f60f88f81df60 new ISMI transaction log analyser. diff -r 4dee9586cc44 -r bb5390f752fc importFromOpenMind/importer/check_ismi_transaction_log.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/importFromOpenMind/importer/check_ismi_transaction_log.py Thu Dec 08 19:19:02 2016 +0100 @@ -0,0 +1,443 @@ + +# coding: utf-8 + +import sys, re + +# max number of lines to read (for testing) +maxLinecnt = None + +# do not output deleted nodes +omitDeleted = True + +# active log levels for logging +logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'} +#logLevels = {'ERROR', 'SYSMSG'} + +def log(level, message): + if level in logLevels: + print("%s: %s"%(level, message)) + + +def prettyPrintNode(node): + nt = node['node-type'] + att = '' + if nt == 'ENTITY': + att = " %s=%s "%('oc',node['object-class']) + + elif nt == 'ATTRIBUTE': + att = " %s=%s "%('name',node['name']) + + elif nt == 'RELATION': + att = " %s=%s "%('oc',node['object-class']) + + s = "%s%s[%s]"%(nt, att, node) + return s + + +nodeCsvFieldList = ['node_type', 'id', 'row_id', 'object_class', 'user', 'public', 'type', + 'version', 'modification_time', 'system_status', 'content_type', + 'source_id', 'source_modif', 'source_obj_class', 'target_id', 'target_modif', 'target_obj_class', + 'own_value_b64'] + +nodeCsvFieldMap = {'node_type':'node-type', 'id':'id', 'row_id':'row-id', 'object_class':'object-class', 'user':'user', + 'public':'public', 'type':'type', + 'version':'version', 'modification_time':'mtime', 'system_status':'system-status', 'content_type':'content-type', + 'source_id':'source-id', 'source_modif':'source-mtime', 'source_obj_class':'source-oc', + 'target_id':'target-id', 'target_modif':'target-mtime', 'target_obj_class':'target-oc', + 'own_value_b64':'b64-value'} + + +def printHeaderCsv(outFile): + s = "" + for cf in nodeCsvFieldList: + if s == "": + s += "%s"%cf + else: + s += ",%s"%cf + + print(s, file=outFile) + + +def printNodeCsv(node, outFile): + s = "" + for cf in nodeCsvFieldList: + nf = nodeCsvFieldMap[cf] + if s == "": + s += "%s"%(node.get(nf, '')) + else: + s += ",%s"%(node.get(nf, '')) + + print(s, file=outFile) + + +def parseStart(line): + tstamp = None + tm = re.match('(\d+-\d+-\d+ \d+:\d+:\d+)', line) + if tm: + tstamp = tm.group(1) + + sm = re.search('START Saving (\w+) id=(\w+)', line) + if sm: + return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)} + + sm = re.search('START remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line) + if sm: + return {'time': tstamp, 'id': sm.group(1)} + + sm = re.search('END Saving (\w+) id=(\w+)', line) + if sm: + return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)} + + sm = re.search('END remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line) + if sm: + return {'time': tstamp, 'id': sm.group(1)} + + return None + + +def parseSave(line): + match = re.search('([A-Z]+)\[([^\]]+)\]', line) + if match: + data = {'node-type': match.group(1)} + segs = match.group(2).split(', ') + for seg in segs: + k, v = seg.split('=', 1) + + data[k] = v.strip('"') + + # normalize attribute's name to object-class + if k == 'name': + data['object-class'] = v.strip('"') + + return data + + return None + + +def equalNodes(prev, cur): + #log("DEBUG", "compare: %s vs %s"%(prev, cur)) + if prev['id'] != cur['id']: + log("INFO", "node id mismatch!") + return False + + if prev['node-type'] != cur['node-type']: + log("INFO", "node node-type mismatch!") + return False + + if prev.get('source-id', None) != cur.get('source-id', None): + log("INFO", "node source_id mismatch!") + return False + + if prev.get('target-id', None) != cur.get('target-id', None): + log("INFO", "node target_id mismatch!") + return False + + if prev['b64-value'] != cur['b64-value']: + log("INFO", "node ownvalue mismatch!") + return False + + return True + + +def getSimilarNode(prev, curList): + nt = prev['node-type'] + if nt == 'ATTRIBUTE': + for n in curList: + if n['node-type'] == 'ATTRIBUTE' \ + and prev['name'] == n['name']: + # attribute with same name + log("DEBUG", "similar attributes: %s vs %s"%(prev, n)) + return n + + elif nt == 'RELATION': + for n in curList: + if n['node-type'] == 'RELATION' \ + and prev['source-id'] == n['source-id'] \ + and prev['target-id'] == n['target-id'] \ + and prev['object-class'] == n['object-class']: + # relation with same source, target and type + log("DEBUG", "similar relations: %s vs %s"%(prev, n)) + return n + + return None + + +def compareNodeLists(prev, cur, ctx, lostFile=None): + """compare list of previous and current nodes. + + prints results to the log and output file. + """ + prevNodes = {} + curNodes = {} + + # + # read nodes + # + for n in prev: + nid = n['id'] + if nid not in prevNodes: + prevNodes[nid] = n + else: + log("DEBUG", "duplicate save of prev node id="+nid) + if isinstance(prevNodes[nid], list): + prevNodes[nid].append(n) + else: + prevNodes[nid] = [prevNodes[nid], n] + + for n in cur: + nid = n['id'] + if nid not in curNodes: + curNodes[nid] = n + else: + log("DEBUG", "duplicate save of cur node id="+nid) + if isinstance(curNodes[nid], list): + curNodes[nid].append(n) + else: + curNodes[nid] = [curNodes[nid], n] + + # + # compare nodes + # + curNodeUnchecked = set(curNodes.keys()) + addPrevNodes = [] + addCurNodes = [] + + for nid in prevNodes: + prevNode = prevNodes[nid] + if isinstance(prevNode, list): + log("DEBUG", "multi-save prev node: %s"%prevNode) + # use the last version(?) + prevNode = prevNode[-1] + + if nid not in curNodes: + if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '': + # emtpy attribute - ignore + continue + + else: + log("DEBUG", "node %s not in cur saves! %s"%(nid,prevNode)) + addPrevNodes.append(prevNode) + continue + + curNode = curNodes[nid] + if isinstance(curNode, list): + log("DEBUG", "multi-save cur node: %s"%curNode) + # use the last version? + curNode = curNode[-1] + + equalNodes(prevNode, curNode) + + curNodeUnchecked.remove(nid) + + # make list of additional current (=new) nodes + for nid in curNodeUnchecked: + curNode = curNodes[nid] + # list can contain lists + if isinstance(curNode, list): + # use the last version(?) + curNode = curNode[-1] + + addCurNodes.append(curNode) + log("DEBUG", "new node %s"%curNodes[nid]) + + # compare missing and new nodes + for n in addPrevNodes.copy(): + sn = getSimilarNode(n, addCurNodes) + if sn is not None: + # similar is good enough + addPrevNodes.remove(n) + addCurNodes.remove(sn) + + if len(addPrevNodes) > 0: + #print("ERROR: lost nodes: %s"%[prettyPrintNode(n) for n in addPrevNodes]) + log("ERROR", "in %s"%ctx) + for n in addPrevNodes: + log("ERROR","lost node: %s"%prettyPrintNode(n)) + if lostFile is not None: + printNodeCsv(n, lostFile) + + if len(addCurNodes) > 0: + #print("INFO: new nodes: %s"%[prettyPrintNode(n) for n in addCurNodes]) + for n in addCurNodes: + log("INFO", "new node: %s"%prettyPrintNode(n)) + + + +def analyseLogfile(inFilename, outFilename=None): + outFile = None + if outFilename is not None: + outFile = open(outFilename, mode='w') + printHeaderCsv(outFile) + + with open(inFilename) as f: + linecnt = 0 + saving = 0 + openSaves = {} + savingPrev = 0 + deleting = 0 + saveCtx = None + deleteCtx = None + prevSaves = [] + saves = [] + + for line in f: + linecnt += 1 + # + # parsing lines in order of specific match, not order of operation! + # + if '*** START Saving' in line: + saving += 1 + # make sure delete is off + deleting = 0 + log('DEBUG', line) + # parse time and id + saveCtx = parseStart(line) + saveId = saveCtx['id'] + if saveId in openSaves: + log('ERROR', "Duplicate save for same id! %s"%saveCtx) + # assume the first save was spurious + saving -= 1 + + openSaves[saveId] = saveCtx + + if len(openSaves) > 1: + log("ERROR", "Multiple open save (%s) in #%s of %s"%(saving, linecnt, line)) + # TODO: what now? + + elif '*** END Saving' in line: + saving -= 1 + saveCtx = parseStart(line) + saveId = saveCtx['id'] + if saveId in openSaves: + log('DEBUG', "End save matches start save: %s"%saveCtx) + del openSaves[saveId] + + elif 'null' in openSaves: + log('DEBUG', "Assume end save (of %s) matches start save null: %s"%(len(openSaves), saveCtx)) + del openSaves['null'] + + else: + log("ERROR", "End save without start save! %s"%saveCtx) + + # make sure delete is off + deleting = 0 + log('DEBUG', line) + + if len(openSaves) > 0: + log('WARNING', "Still open saves (%s) during end save in #%s of %s"%(saving, linecnt, line)) + + elif saving < 0: + log('ERROR', "Too many END saves!") + break + + log("INFO", "saving %s"%saveCtx) + log("INFO", "prev saves: %s"%len(prevSaves)) + log("INFO", "saves: %s"%len(saves)) + + if len(prevSaves) > 0: + compareNodeLists(prevSaves, saves, saveCtx, outFile) + + prevSaves = [] + saves = [] + + elif '** START save entity' in line: + pass + + elif '** END save entity' in line: + pass + + elif '** START remove entity' in line: + deleting += 1 + log('DEBUG', line) + deleteCtx = parseStart(line) + + if deleting > 1: + log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line)) + # TODO: what now? + break + + elif '** END remove entity' in line: + deleting -= 1 + log('DEBUG', line) + ctx = parseStart(line) + if deleteCtx['id'] != ctx['id']: + log('ERROR', "END delete has different ID!") + break + + if deleting > 1: + log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line)) + # TODO: what now? + break + + deleteCtx = None + + elif '* START save entity' in line: + pass + + elif '* START save previous' in line: + savingPrev += 1 + + elif '* ...END save entity' in line: + pass + + elif '* END ...save previous' in line: + savingPrev -= 1 + + if saving < 0: + log("ERROR", "Too many END save previous!") + + elif 'save previous' in line: + data = parseSave(line) + if data is None: + log("DEBUG", "Error parsing line: %s"%line) + continue + + if omitDeleted and deleting > 0 and savingPrev > 0: + # this should be a save prev from deleting + delId = deleteCtx['id'] + # check if node is related to deleted id + if data.get('id', None) == delId or data.get('source-id', None) == delId \ + or data.get('target-id', None) == delId: + log('DEBUG', "intentionally deleted node: %s"%data) + + else: + log('ERROR', "Node without matching id in delete! %s"%data) + prevSaves.append(data) + + else: + prevSaves.append(data) + + elif 'save' in line: + data = parseSave(line) + if data is None: + log("DEBUG", "Error parsing line: %s"%line) + continue + + saves.append(parseSave(line)) + + if maxLinecnt is not None and linecnt >= maxLinecnt: + break + + log("SYSMSG", "%s lines of logfile scanned"%linecnt) + + +# +# public static void main :-) +# + +input_fn = None +output_fn = None + +# parse command line parameters +if len(sys.argv) > 2: + input_fn = sys.argv[1] + output_fn = sys.argv[2] + + # run analysis + analyseLogfile(input_fn, output_fn) + +else: + print("ERROR: missing parameters!") + print("use: check_ismi_log logfile csvfile") + exit(1)