Mercurial > hg > drupalISMI
view importFromOpenMind/importer/check_ismi_log.py @ 41:5b3cd0b66b30
fix bug with multiple-saved nodes.
author | casties |
---|---|
date | Wed, 02 Nov 2016 16:56:27 +0100 |
parents | f38ca3eb1088 |
children | 4dee9586cc44 |
line wrap: on
line source
# coding: utf-8
"""Check an ISMI/OpenMind transaction logfile for lost nodes.

Scans the log for "START Saving" ... "END Saving" blocks, collects the
nodes saved as "previous" state and the newly saved nodes, compares the
two sets and reports nodes that disappeared ("lost nodes"). Lost nodes
can additionally be written to a CSV file.

Usage: check_ismi_log logfile csvfile
"""

import re
import sys

# max number of lines to read (None = no limit; set for testing)
maxLinecnt = None

# do not output nodes that were intentionally deleted
omitDeleted = True

# active log levels for logging
logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'}
#logLevels = {'ERROR', 'SYSMSG'}


def log(level, message):
    """Print message to stdout if level is active in logLevels."""
    if level in logLevels:
        print("%s: %s"%(level, message))


def prettyPrintNode(node):
    """Return a short human-readable string describing node."""
    nt = node['node-type']
    att = ''
    if nt == 'ENTITY':
        att = " %s=%s "%('oc', node['object-class'])
    elif nt == 'ATTRIBUTE':
        att = " %s=%s "%('name', node['name'])
    elif nt == 'RELATION':
        att = " %s=%s "%('oc', node['object-class'])
    s = "%s%s[%s]"%(nt, att, node)
    return s


# column order of the CSV output file
nodeCsvFieldList = ['node_type', 'id', 'row_id', 'object_class', 'user',
                    'public', 'type', 'version', 'modification_time',
                    'system_status', 'content_type', 'source_id',
                    'source_modif', 'source_obj_class', 'target_id',
                    'target_modif', 'target_obj_class', 'own_value_b64']

# maps CSV column name -> key in the parsed node dict
nodeCsvFieldMap = {'node_type': 'node-type', 'id': 'id', 'row_id': 'row-id',
                   'object_class': 'object-class', 'user': 'user',
                   'public': 'public', 'type': 'type', 'version': 'version',
                   'modification_time': 'mtime',
                   'system_status': 'system-status',
                   'content_type': 'content-type', 'source_id': 'source-id',
                   'source_modif': 'source-mtime',
                   'source_obj_class': 'source-oc',
                   'target_id': 'target-id', 'target_modif': 'target-mtime',
                   'target_obj_class': 'target-oc',
                   'own_value_b64': 'b64-value'}


def printHeaderCsv(outFile):
    """Write the CSV header line to outFile."""
    print(",".join(nodeCsvFieldList), file=outFile)


def printNodeCsv(node, outFile):
    """Write node as one CSV line to outFile (columns per nodeCsvFieldList).

    Missing fields are written as empty strings.
    """
    print(",".join("%s"%node.get(nodeCsvFieldMap[cf], '')
                   for cf in nodeCsvFieldList),
          file=outFile)


def parseStart(line):
    """Parse a 'START Saving' or 'Deleting entity' log line.

    Returns {'time', 'oc', 'id'} for a save, {'time', 'id'} for a
    delete, or None if the line matches neither pattern.
    """
    tstamp = None
    tm = re.match(r'(\d+-\d+-\d+ \d+:\d+:\d+)', line)
    if tm:
        tstamp = tm.group(1)
    sm = re.search(r'START Saving (\w+) \[ID=(\d*)', line)
    if sm:
        return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)}
    sm = re.search(r'Deleting entity \[ID=(\d*)', line)
    if sm:
        return {'time': tstamp, 'id': sm.group(1)}
    return None


def parseSave(line):
    """Parse a node-save log line of the form 'NODETYPE[k=v, k=v, ...]'.

    Returns a dict of the node's attributes plus 'node-type', or None
    if the line does not match.
    """
    match = re.search(r'([A-Z]+)\[([^\]]+)\]', line)
    if match is None:
        return None
    data = {'node-type': match.group(1)}
    for seg in match.group(2).split(', '):
        k, v = seg.split('=', 1)
        # fix bug with relation's object-class parameter
        if k == 'object_class':
            k = 'object-class'
        data[k] = v.strip('"')
        # normalize attribute's name to object-class
        if k == 'name':
            data['object-class'] = v.strip('"')
    return data


def equalNodes(prev, cur):
    """Return True if prev and cur describe the same node content.

    Compares id, node-type, source/target ids and the base64 own-value;
    the first mismatch is logged at INFO level.
    """
    log("DEBUG", "compare: %s vs %s"%(prev, cur))
    if prev['id'] != cur['id']:
        log("INFO", "node id mismatch!")
        return False
    if prev['node-type'] != cur['node-type']:
        log("INFO", "node node-type mismatch!")
        return False
    if prev.get('source-id', None) != cur.get('source-id', None):
        log("INFO", "node source_id mismatch!")
        return False
    if prev.get('target-id', None) != cur.get('target-id', None):
        log("INFO", "node target_id mismatch!")
        return False
    if prev['b64-value'] != cur['b64-value']:
        log("INFO", "node ownvalue mismatch!")
        return False
    return True


def getSimilarNode(prev, curList):
    """Return a node from curList that is similar to prev, or None.

    Attributes are similar when they have the same name; relations when
    they have the same source, target and object-class.
    """
    nt = prev['node-type']
    if nt == 'ATTRIBUTE':
        for n in curList:
            if n['node-type'] == 'ATTRIBUTE' \
                and prev['name'] == n['name']:
                # attribute with same name
                log("DEBUG", "similar attributes: %s vs %s"%(prev, n))
                return n
    elif nt == 'RELATION':
        for n in curList:
            if n['node-type'] == 'RELATION' \
                and prev['source-id'] == n['source-id'] \
                and prev['target-id'] == n['target-id'] \
                and prev['object-class'] == n['object-class']:
                # relation with same source, target and type
                log("DEBUG", "similar relations: %s vs %s"%(prev, n))
                return n
    return None


def _collectNodesById(nodes, label):
    """Index a list of node dicts by id.

    Ids that were saved more than once map to a list of all their
    saves. label ('prev' or 'cur') is only used for log messages.
    """
    byId = {}
    for n in nodes:
        nid = n['id']
        if nid not in byId:
            byId[nid] = n
        else:
            log("DEBUG", "duplicate save of %s node id=%s"%(label, nid))
            if isinstance(byId[nid], list):
                byId[nid].append(n)
            else:
                byId[nid] = [byId[nid], n]
    return byId


def compareNodeLists(prev, cur, ctx, lostFile=None):
    """Compare the lists of previous and current node saves.

    Logs lost nodes (present in prev but missing from cur without a
    similar replacement) as errors and new nodes at INFO level. Lost
    nodes are also written as CSV to lostFile if given. ctx describes
    the enclosing save operation (used in error messages).
    """
    #
    # read nodes
    #
    prevNodes = _collectNodesById(prev, 'prev')
    curNodes = _collectNodesById(cur, 'cur')
    #
    # compare nodes
    #
    curNodeUnchecked = set(curNodes.keys())
    addPrevNodes = []
    addCurNodes = []
    for nid in prevNodes:
        prevNode = prevNodes[nid]
        if isinstance(prevNode, list):
            log("DEBUG", "multi-save prev node: %s"%prevNode)
            # use the last version(?)
            prevNode = prevNode[-1]
        if nid not in curNodes:
            if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '':
                # empty attribute - ignore
                continue
            else:
                log("DEBUG", "node %s not in cur saves! %s"%(nid, prevNode))
                addPrevNodes.append(prevNode)
                continue
        curNode = curNodes[nid]
        if isinstance(curNode, list):
            log("DEBUG", "multi-save cur node: %s"%curNode)
            # use the last version?
            curNode = curNode[-1]
        # result intentionally ignored: equalNodes logs any mismatch
        equalNodes(prevNode, curNode)
        curNodeUnchecked.remove(nid)
    # make list of additional current (=new) nodes
    for nid in curNodeUnchecked:
        curNode = curNodes[nid]
        # list can contain lists
        if isinstance(curNode, list):
            # use the last version(?)
            curNode = curNode[-1]
        addCurNodes.append(curNode)
        log("DEBUG", "new node %s"%curNodes[nid])
    # compare missing and new nodes; a similar new node replaces a lost one
    for n in addPrevNodes.copy():
        sn = getSimilarNode(n, addCurNodes)
        if sn is not None:
            # similar is good enough
            addPrevNodes.remove(n)
            addCurNodes.remove(sn)
    if len(addPrevNodes) > 0:
        log("ERROR", "in %s"%ctx)
        for n in addPrevNodes:
            log("ERROR", "lost node: %s"%prettyPrintNode(n))
            if lostFile is not None:
                printNodeCsv(n, lostFile)
    if len(addCurNodes) > 0:
        for n in addCurNodes:
            log("INFO", "new node: %s"%prettyPrintNode(n))


def analyseLogfile(inFilename, outFilename=None):
    """Scan the logfile inFilename for save operations and check them.

    For every completed save block the previously saved nodes are
    compared with the newly saved nodes (see compareNodeLists). If
    outFilename is given, lost nodes are written to it as CSV.
    """
    outFile = None
    if outFilename is not None:
        outFile = open(outFilename, mode='w')
        printHeaderCsv(outFile)
    try:
        with open(inFilename) as f:
            linecnt = 0
            saving = 0        # nesting level of START/END Saving
            savingPrev = 0    # nesting level of START/End save previous
            deleting = 0      # nesting level of Deleting entity
            saveCtx = None
            deleteCtx = None
            prevSaves = []
            saves = []
            for line in f:
                linecnt += 1
                if '*************** START Saving' in line:
                    saving += 1
                    # make sure delete is off
                    deleting = 0
                    log('DEBUG', line)
                    # parse time and id
                    saveCtx = parseStart(line)
                    if saving > 1:
                        log("ERROR", "Concurrent save (%s) in #%s of %s"%(saving, linecnt, line))
                        # TODO: what now?
                elif 'Deleting entity' in line:
                    deleting += 1
                    log('DEBUG', line)
                    deleteCtx = parseStart(line)
                    if deleting > 1:
                        # BUGFIX: report the delete counter, not the save counter
                        log("ERROR", "Concurrent delete (%s) in #%s of %s"%(deleting, linecnt, line))
                        # TODO: what now?
                        break
                elif 'transactionlog' in line:
                    if '* START save previous' in line:
                        savingPrev += 1
                    elif '* End ...save previous' in line:
                        if deleting > 0 and savingPrev > 0:
                            # this should be the end of the save prev from deleting
                            deleting -= 1
                            deleteCtx = None
                        savingPrev -= 1
                        # BUGFIX: check the save-previous counter, not the save counter
                        if savingPrev < 0:
                            log("ERROR", "Too many END save previous!")
                    elif 'save previous' in line:
                        data = parseSave(line)
                        if data is None:
                            log("DEBUG", "Error parsing line: %s"%line)
                            continue
                        if omitDeleted and deleting > 0 and savingPrev > 0:
                            # this should be a save prev from deleting
                            delId = deleteCtx['id']
                            # check if node is related to deleted id
                            if data.get('id', None) == delId \
                                or data.get('source-id', None) == delId \
                                or data.get('target-id', None) == delId:
                                log('DEBUG', "intentionally deleted node: %s"%data)
                            else:
                                log('ERROR', "Node without matching id in delete! %s"%data)
                                prevSaves.append(data)
                        else:
                            prevSaves.append(data)
                    elif 'save' in line:
                        data = parseSave(line)
                        if data is None:
                            log("DEBUG", "Error parsing line: %s"%line)
                            continue
                        # BUGFIX: re-use the parsed data instead of parsing the line twice
                        saves.append(data)
                elif '*************** END Saving' in line:
                    saving -= 1
                    # make sure delete is off
                    deleting = 0
                    log('DEBUG', line)
                    if saving > 0:
                        log("ERROR", "Concurrent end save (%s) in #%s of %s"%(saving, linecnt, line))
                    elif saving < 0:
                        log("ERROR", "Too many END saves!")
                        break
                    log("INFO", "saving %s"%saveCtx)
                    log("INFO", "prev saves: %s"%len(prevSaves))
                    log("INFO", "saves: %s"%len(saves))
                    if len(prevSaves) > 0:
                        compareNodeLists(prevSaves, saves, saveCtx, outFile)
                    prevSaves = []
                    saves = []
                if maxLinecnt is not None and linecnt >= maxLinecnt:
                    break
            log("SYSMSG", "%s lines of logfile scanned"%linecnt)
    finally:
        # BUGFIX: the CSV output file was never closed
        if outFile is not None:
            outFile.close()


#
# public static void main :-)
#
if __name__ == "__main__":
    # parse command line parameters
    if len(sys.argv) > 2:
        input_fn = sys.argv[1]
        output_fn = sys.argv[2]
        # run analysis
        analyseLogfile(input_fn, output_fn)
    else:
        print("ERROR: missing parameters!")
        print("use: check_ismi_log logfile csvfile")
        sys.exit(1)