Mercurial > hg > drupalISMI
view importFromOpenMind/importer/check_ismi_transaction_log.py @ 57:be1c7d6814b6
small change to xml format.
author | casties |
---|---|
date | Mon, 10 Jul 2017 17:18:39 +0200 |
parents | 1f160e595167 |
children |
line wrap: on
line source
# coding: utf-8
"""Check an ISMI OpenMind transaction log for lost nodes.

Reads a server logfile, collects the "save previous" node dumps (state
before a save transaction) and the following "save" node dumps (state
after the transaction) per thread, compares both sets when the
transaction ends and reports nodes that exist in the previous state but
not in the new one (i.e. nodes lost by the save). Lost nodes can
optionally be written to a CSV file.

Usage: check_ismi_log logfile csvfile
"""
import sys, re

# max number of lines to read (None = unlimited; set a number for testing)
maxLinecnt = None

# do not output deleted nodes
omitDeleted = True

# active log levels for logging
#logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
logLevels = {'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message to stdout if level is in the active logLevels."""
    if level in logLevels:
        print("%s: %s"%(level, message))


def prettyPrintNode(node):
    """Return a short human-readable string for a node dict.

    Shows the node type, its most distinguishing attribute
    (object-class or name) and the full dict.
    """
    nt = node['node-type']
    att = ''
    if nt == 'ENTITY':
        att = " %s=%s "%('oc', node['object-class'])
    elif nt == 'ATTRIBUTE':
        att = " %s=%s "%('name', node['name'])
    elif nt == 'RELATION':
        att = " %s=%s "%('oc', node['object-class'])
    s = "%s%s[%s]"%(nt, att, node)
    return s


# CSV column names (output order) for lost-node rows.
nodeCsvFieldList = ['_logtime', 'node_type', 'id', 'row_id', 'object_class',
                    'user', 'public', 'type', 'version', 'modification_time',
                    'system_status', 'content_type', 'source_id', 'source_modif',
                    'source_obj_class', 'target_id', 'target_modif',
                    'target_obj_class', 'own_value_b64']

# Maps CSV column name -> key in the parsed node dict.
nodeCsvFieldMap = {'_logtime': '_logtime', 'node_type': 'node-type', 'id': 'id',
                   'row_id': 'row-id', 'object_class': 'object-class',
                   'user': 'user', 'public': 'public', 'type': 'type',
                   'version': 'version', 'modification_time': 'mtime',
                   'system_status': 'system-status', 'content_type': 'content-type',
                   'source_id': 'source-id', 'source_modif': 'source-mtime',
                   'source_obj_class': 'source-oc', 'target_id': 'target-id',
                   'target_modif': 'target-mtime', 'target_obj_class': 'target-oc',
                   'own_value_b64': 'b64-value'}


def printHeaderCsv(outFile):
    """Write the CSV header row with all column names to outFile."""
    print(",".join(nodeCsvFieldList), file=outFile)


def printNodeCsv(node, outFile):
    """Write one node as a CSV row to outFile.

    Missing fields become empty strings.
    NB: values are not quoted or escaped (matches original format).
    """
    print(",".join("%s"%(node.get(nodeCsvFieldMap[cf], '')) for cf in nodeCsvFieldList),
          file=outFile)


def parseThread(line):
    """Parse timestamp and thread id from the start of a log line.

    Returns {'time': ..., 'tid': ...} or None if the line does not
    start with a "YYYY-MM-DD HH:MM:SS,mmm tid" prefix.
    """
    tm = re.match(r'(\d+-\d+-\d+ \d+:\d+:\d+),\d+ (\S+)', line)
    if tm:
        tstamp = tm.group(1)
        tid = tm.group(2)
        return {'time': tstamp, 'tid': tid}
    return None


def parseStart(line):
    """Parse a START/END Saving or START/END remove entity marker line.

    Returns a context dict with 'tid', 'time', 'id' (and 'oc' for save
    lines) or None if no marker matches.
    NB: assumes the line has a parseable timestamp prefix (raises
    TypeError otherwise — markers are only matched on server log lines).
    """
    pt = parseThread(line)
    tstamp = pt['time']
    tid = pt['tid']
    sm = re.search(r'START Saving (\w+) id=(\w+)', line)
    if sm:
        return {'tid': tid, 'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
    sm = re.search(r'START remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
    if sm:
        return {'tid': tid, 'time': tstamp, 'id': sm.group(1)}
    sm = re.search(r'END Saving (\w+) id=(\w+)', line)
    if sm:
        return {'tid': tid, 'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
    sm = re.search(r'END remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line)
    if sm:
        return {'tid': tid, 'time': tstamp, 'id': sm.group(1)}
    return None


def parseSave(line):
    """Parse a node dump like ENTITY[id=1, ...] from a save log line.

    Returns a dict with 'node-type' plus all key=value pairs inside the
    brackets (quotes stripped), or None if the line has no node dump.
    """
    match = re.search(r'([A-Z]+)\[([^\]]+)\]', line)
    if match:
        data = {'node-type': match.group(1)}
        segs = match.group(2).split(', ')
        for seg in segs:
            k, v = seg.split('=', 1)
            data[k] = v.strip('"')
            # normalize attribute's name to object-class
            if k == 'name':
                data['object-class'] = v.strip('"')
        return data
    return None


def equalNodes(prev, cur):
    """Compare two nodes by id, type, source/target ids and own-value.

    Logs an INFO message for the first mismatching field.
    Returns True if all compared fields match.
    """
    #log("DEBUG", "compare: %s vs %s"%(prev, cur))
    if prev['id'] != cur['id']:
        log("INFO", "node id mismatch!")
        return False
    if prev['node-type'] != cur['node-type']:
        log("INFO", "node node-type mismatch!")
        return False
    if prev.get('source-id', None) != cur.get('source-id', None):
        log("INFO", "node source_id mismatch!")
        return False
    if prev.get('target-id', None) != cur.get('target-id', None):
        log("INFO", "node target_id mismatch!")
        return False
    if prev['b64-value'] != cur['b64-value']:
        log("INFO", "node ownvalue mismatch!")
        return False
    return True


def getSimilarNode(prev, curList):
    """Find a node in curList that is "similar" to prev.

    Similar means: an attribute with the same name, or a relation with
    the same source id, target id and object class. Returns the similar
    node or None.
    """
    nt = prev['node-type']
    if nt == 'ATTRIBUTE':
        for n in curList:
            if n['node-type'] == 'ATTRIBUTE' \
                    and prev['name'] == n['name']:
                # attribute with same name
                log("DEBUG", "similar attributes: %s vs %s"%(prev, n))
                return n
    elif nt == 'RELATION':
        for n in curList:
            if n['node-type'] == 'RELATION' \
                    and prev['source-id'] == n['source-id'] \
                    and prev['target-id'] == n['target-id'] \
                    and prev['object-class'] == n['object-class']:
                # relation with same source, target and type
                log("DEBUG", "similar relations: %s vs %s"%(prev, n))
                return n
    return None


def compareNodeLists(prev, cur, ctx, lostFile=None):
    """compare list of previous and current nodes.

    prints results to the log and output file.

    prev/cur are lists of node dicts, ctx is the save context (for error
    messages), lostFile is an optional open file for CSV rows of lost
    nodes.
    """
    prevNodes = {}
    curNodes = {}
    #
    # read nodes (index by id; repeated saves of one id become a list)
    #
    for n in prev:
        nid = n['id']
        if nid not in prevNodes:
            prevNodes[nid] = n
        else:
            log("DEBUG", "duplicate save of prev node id="+nid)
            if isinstance(prevNodes[nid], list):
                prevNodes[nid].append(n)
            else:
                prevNodes[nid] = [prevNodes[nid], n]
    for n in cur:
        nid = n['id']
        if nid not in curNodes:
            curNodes[nid] = n
        else:
            log("DEBUG", "duplicate save of cur node id="+nid)
            if isinstance(curNodes[nid], list):
                curNodes[nid].append(n)
            else:
                curNodes[nid] = [curNodes[nid], n]
    #
    # compare nodes
    #
    curNodeUnchecked = set(curNodes.keys())
    addPrevNodes = []
    addCurNodes = []
    for nid in prevNodes:
        prevNode = prevNodes[nid]
        if isinstance(prevNode, list):
            log("DEBUG", "multi-save prev node: %s"%prevNode)
            # use the last version(?)
            prevNode = prevNode[-1]
        if nid not in curNodes:
            if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '':
                # empty attribute - ignore
                continue
            else:
                log("DEBUG", "node %s not in cur saves! %s"%(nid, prevNode))
                addPrevNodes.append(prevNode)
            continue
        curNode = curNodes[nid]
        if isinstance(curNode, list):
            log("DEBUG", "multi-save cur node: %s"%curNode)
            # use the last version?
            curNode = curNode[-1]
        # result only logged (INFO) inside equalNodes; not treated as lost
        equalNodes(prevNode, curNode)
        curNodeUnchecked.remove(nid)
    # make list of additional current (=new) nodes
    for nid in curNodeUnchecked:
        curNode = curNodes[nid]
        # list can contain lists
        if isinstance(curNode, list):
            # use the last version(?)
            curNode = curNode[-1]
        addCurNodes.append(curNode)
        log("DEBUG", "new node %s"%curNodes[nid])
    # compare missing and new nodes (iterate a copy; we remove members)
    for n in addPrevNodes.copy():
        sn = getSimilarNode(n, addCurNodes)
        if sn is not None:
            # similar is good enough
            addPrevNodes.remove(n)
            addCurNodes.remove(sn)
    if len(addPrevNodes) > 0:
        #print("ERROR: lost nodes: %s"%[prettyPrintNode(n) for n in addPrevNodes])
        log("ERROR", "in %s"%ctx)
        for n in addPrevNodes:
            log("ERROR", "lost node: %s"%prettyPrintNode(n))
            if lostFile is not None:
                printNodeCsv(n, lostFile)
    if len(addCurNodes) > 0:
        #print("INFO: new nodes: %s"%[prettyPrintNode(n) for n in addCurNodes])
        for n in addCurNodes:
            log("INFO", "new node: %s"%prettyPrintNode(n))


def analyseLogfile(inFilename, outFilename=None):
    """Analyse the logfile inFilename for lost nodes.

    If outFilename is given, lost nodes are additionally written to it
    as CSV. The output file is always closed on exit (fix: the original
    leaked the file handle).
    """
    outFile = None
    if outFilename is not None:
        outFile = open(outFilename, mode='w')
        printHeaderCsv(outFile)
    try:
        with open(inFilename) as f:
            linecnt = 0
            savecnt = 0
            saving = 0
            openSaves = {}
            savingPrev = 0
            deleting = 0
            saveCtx = None
            deleteCtx = None
            prevSaves = {}
            saves = {}
            for line in f:
                linecnt += 1
                #
                # parsing lines in order of specific match, not order of operation!
                #
                if '*** START Saving' in line:
                    savecnt += 1
                    saving += 1
                    # make sure delete is off
                    deleting = 0
                    log('DEBUG', line)
                    # parse time and id
                    saveCtx = parseStart(line)
                    tid = saveCtx['tid']
                    saveId = "%s@%s"%(saveCtx['id'], tid)
                    if saveId in openSaves:
                        log('ERROR', "Duplicate save for same id! %s"%saveCtx)
                        # assume the first save was spurious
                        saving -= 1
                    openSaves[saveId] = saveCtx
                    if len(openSaves) > 1:
                        log("ERROR", "Multiple open save (%s) in #%s of %s"%(saving, linecnt, line))
                        # TODO: what now?
                elif '*** END Saving' in line:
                    saving -= 1
                    saveCtx = parseStart(line)
                    tid = saveCtx['tid']
                    saveId = "%s@%s"%(saveCtx['id'], tid)
                    if saveId in openSaves:
                        log('DEBUG', "End save matches start save: %s"%saveCtx)
                        del openSaves[saveId]
                    elif "null@%s"%saveCtx['tid'] in openSaves:
                        # new entities are saved with id=null first
                        log('DEBUG', "Assume end save (of %s) matches start save null: %s"%(len(openSaves), saveCtx))
                        del openSaves["null@%s"%saveCtx['tid']]
                    else:
                        log("ERROR", "End save without start save! %s"%saveCtx)
                    # make sure delete is off
                    deleting = 0
                    log('DEBUG', line)
                    if len(openSaves) > 0:
                        log('WARNING', "Still open saves (%s) during end save in #%s of %s"%(saving, linecnt, line))
                    elif saving < 0:
                        log('ERROR', "Too many END saves!")
                        break
                    if tid not in saves:
                        saves[tid] = []
                    if tid not in prevSaves:
                        prevSaves[tid] = []
                    log("INFO", "saving %s"%saveCtx)
                    log("INFO", "prev saves: %s"%len(prevSaves[tid]))
                    log("INFO", "saves: %s"%len(saves[tid]))
                    # transaction complete: compare before/after node sets
                    if len(prevSaves[tid]) > 0:
                        compareNodeLists(prevSaves[tid], saves[tid], saveCtx, outFile)
                    del prevSaves[tid]
                    del saves[tid]
                elif '** START save entity' in line:
                    pass
                elif '** END save entity' in line:
                    pass
                elif '** START remove entity' in line:
                    deleting += 1
                    log('DEBUG', line)
                    deleteCtx = parseStart(line)
                    if deleting > 1:
                        log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
                        # TODO: what now?
                        break
                elif '** END remove entity' in line:
                    deleting -= 1
                    log('DEBUG', line)
                    ctx = parseStart(line)
                    if deleteCtx['id'] != ctx['id']:
                        log('ERROR', "END delete has different ID!")
                        break
                    if deleting > 1:
                        log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line))
                        # TODO: what now?
                        break
                    deleteCtx = None
                elif '* START save entity' in line:
                    pass
                elif '* START save previous' in line:
                    savingPrev += 1
                elif '* ...END save entity' in line:
                    pass
                elif '* END ...save previous' in line:
                    savingPrev -= 1
                    if savingPrev < 0:
                        log("ERROR", "Too many END save previous!")
                elif 'save previous' in line:
                    # node dump of the previous (pre-save) state
                    # check thread ID
                    td = parseThread(line)
                    tid = td['tid']
                    data = parseSave(line)
                    if data is None:
                        log("DEBUG", "Error parsing line: %s"%line)
                        continue
                    # add log time
                    data['_logtime'] = td['time']
                    if omitDeleted and deleting > 0 and savingPrev > 0:
                        # this should be a save prev from deleting
                        delId = deleteCtx['id']
                        # check if node is related to deleted id
                        if data.get('id', None) == delId or data.get('source-id', None) == delId \
                                or data.get('target-id', None) == delId:
                            log('DEBUG', "intentionally deleted node: %s"%data)
                        else:
                            log('ERROR', "Node without matching id in delete! %s"%data)
                            if tid in prevSaves:
                                prevSaves[tid].append(data)
                            else:
                                prevSaves[tid] = [data]
                    else:
                        if tid in prevSaves:
                            prevSaves[tid].append(data)
                        else:
                            prevSaves[tid] = [data]
                elif 'save' in line:
                    # node dump of the new (post-save) state
                    # check thread ID
                    td = parseThread(line)
                    tid = td['tid']
                    data = parseSave(line)
                    if data is None:
                        log("DEBUG", "Error parsing line: %s"%line)
                        continue
                    # add log time
                    data['_logtime'] = td['time']
                    if tid in saves:
                        saves[tid].append(data)
                    else:
                        saves[tid] = [data]
                if maxLinecnt is not None and linecnt >= maxLinecnt:
                    break
            if len(saves) > 0:
                log('ERROR', "%s unprocessed saves!"%len(saves))
                log('DEBUG', saves)
            if len(prevSaves) > 0:
                log('ERROR', "%s unprocessed previous saves!"%len(prevSaves))
                log('DEBUG', prevSaves)
        log("SYSMSG", "%s form saves in %s lines of logfile"%(savecnt, linecnt))
    finally:
        # fix: close the CSV output file (was leaked in the original)
        if outFile is not None:
            outFile.close()


#
# public static void main :-)
#
if __name__ == '__main__':
    # fix: guard main so importing this module has no side effects
    input_fn = None
    output_fn = None
    # parse command line parameters
    if len(sys.argv) > 2:
        input_fn = sys.argv[1]
        output_fn = sys.argv[2]
        # run analysis
        analyseLogfile(input_fn, output_fn)
    else:
        print("ERROR: missing parameters!")
        print("use: check_ismi_log logfile csvfile")
        sys.exit(1)