Mercurial > hg > drupalISMI
comparison importFromOpenMind/importer/check_ismi_transaction_log.py @ 43:bb5390f752fc
new ISMI transaction log analyser.
author | casties |
---|---|
date | Thu, 08 Dec 2016 19:19:02 +0100 |
parents | |
children | 4692e90215eb |
comparison
equal
deleted
inserted
replaced
42:4dee9586cc44 | 43:bb5390f752fc |
---|---|
1 | |
2 # coding: utf-8 | |
3 | |
4 import sys, re | |
5 | |
6 # max number of lines to read (for testing) | |
7 maxLinecnt = None | |
8 | |
9 # do not output deleted nodes | |
10 omitDeleted = True | |
11 | |
12 # active log levels for logging | |
13 logLevels = {'DEBUG', 'INFO', 'WARNING', 'ERROR', 'SYSMSG'} | |
14 #logLevels = {'INFO', 'WARNING', 'ERROR', 'SYSMSG'} | |
15 #logLevels = {'ERROR', 'SYSMSG'} | |
16 | |
17 def log(level, message): | |
18 if level in logLevels: | |
19 print("%s: %s"%(level, message)) | |
20 | |
21 | |
22 def prettyPrintNode(node): | |
23 nt = node['node-type'] | |
24 att = '' | |
25 if nt == 'ENTITY': | |
26 att = " %s=%s "%('oc',node['object-class']) | |
27 | |
28 elif nt == 'ATTRIBUTE': | |
29 att = " %s=%s "%('name',node['name']) | |
30 | |
31 elif nt == 'RELATION': | |
32 att = " %s=%s "%('oc',node['object-class']) | |
33 | |
34 s = "%s%s[%s]"%(nt, att, node) | |
35 return s | |
36 | |
37 | |
38 nodeCsvFieldList = ['node_type', 'id', 'row_id', 'object_class', 'user', 'public', 'type', | |
39 'version', 'modification_time', 'system_status', 'content_type', | |
40 'source_id', 'source_modif', 'source_obj_class', 'target_id', 'target_modif', 'target_obj_class', | |
41 'own_value_b64'] | |
42 | |
43 nodeCsvFieldMap = {'node_type':'node-type', 'id':'id', 'row_id':'row-id', 'object_class':'object-class', 'user':'user', | |
44 'public':'public', 'type':'type', | |
45 'version':'version', 'modification_time':'mtime', 'system_status':'system-status', 'content_type':'content-type', | |
46 'source_id':'source-id', 'source_modif':'source-mtime', 'source_obj_class':'source-oc', | |
47 'target_id':'target-id', 'target_modif':'target-mtime', 'target_obj_class':'target-oc', | |
48 'own_value_b64':'b64-value'} | |
49 | |
50 | |
51 def printHeaderCsv(outFile): | |
52 s = "" | |
53 for cf in nodeCsvFieldList: | |
54 if s == "": | |
55 s += "%s"%cf | |
56 else: | |
57 s += ",%s"%cf | |
58 | |
59 print(s, file=outFile) | |
60 | |
61 | |
62 def printNodeCsv(node, outFile): | |
63 s = "" | |
64 for cf in nodeCsvFieldList: | |
65 nf = nodeCsvFieldMap[cf] | |
66 if s == "": | |
67 s += "%s"%(node.get(nf, '')) | |
68 else: | |
69 s += ",%s"%(node.get(nf, '')) | |
70 | |
71 print(s, file=outFile) | |
72 | |
73 | |
74 def parseStart(line): | |
75 tstamp = None | |
76 tm = re.match('(\d+-\d+-\d+ \d+:\d+:\d+)', line) | |
77 if tm: | |
78 tstamp = tm.group(1) | |
79 | |
80 sm = re.search('START Saving (\w+) id=(\w+)', line) | |
81 if sm: | |
82 return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)} | |
83 | |
84 sm = re.search('START remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line) | |
85 if sm: | |
86 return {'time': tstamp, 'id': sm.group(1)} | |
87 | |
88 sm = re.search('END Saving (\w+) id=(\w+)', line) | |
89 if sm: | |
90 return {'time': tstamp, 'oc': sm.group(1), 'id': sm.group(2)} | |
91 | |
92 sm = re.search('END remove entity: user=\S+ entity=Entity\[rowId=\d+, id=(\d+)', line) | |
93 if sm: | |
94 return {'time': tstamp, 'id': sm.group(1)} | |
95 | |
96 return None | |
97 | |
98 | |
99 def parseSave(line): | |
100 match = re.search('([A-Z]+)\[([^\]]+)\]', line) | |
101 if match: | |
102 data = {'node-type': match.group(1)} | |
103 segs = match.group(2).split(', ') | |
104 for seg in segs: | |
105 k, v = seg.split('=', 1) | |
106 | |
107 data[k] = v.strip('"') | |
108 | |
109 # normalize attribute's name to object-class | |
110 if k == 'name': | |
111 data['object-class'] = v.strip('"') | |
112 | |
113 return data | |
114 | |
115 return None | |
116 | |
117 | |
118 def equalNodes(prev, cur): | |
119 #log("DEBUG", "compare: %s vs %s"%(prev, cur)) | |
120 if prev['id'] != cur['id']: | |
121 log("INFO", "node id mismatch!") | |
122 return False | |
123 | |
124 if prev['node-type'] != cur['node-type']: | |
125 log("INFO", "node node-type mismatch!") | |
126 return False | |
127 | |
128 if prev.get('source-id', None) != cur.get('source-id', None): | |
129 log("INFO", "node source_id mismatch!") | |
130 return False | |
131 | |
132 if prev.get('target-id', None) != cur.get('target-id', None): | |
133 log("INFO", "node target_id mismatch!") | |
134 return False | |
135 | |
136 if prev['b64-value'] != cur['b64-value']: | |
137 log("INFO", "node ownvalue mismatch!") | |
138 return False | |
139 | |
140 return True | |
141 | |
142 | |
143 def getSimilarNode(prev, curList): | |
144 nt = prev['node-type'] | |
145 if nt == 'ATTRIBUTE': | |
146 for n in curList: | |
147 if n['node-type'] == 'ATTRIBUTE' \ | |
148 and prev['name'] == n['name']: | |
149 # attribute with same name | |
150 log("DEBUG", "similar attributes: %s vs %s"%(prev, n)) | |
151 return n | |
152 | |
153 elif nt == 'RELATION': | |
154 for n in curList: | |
155 if n['node-type'] == 'RELATION' \ | |
156 and prev['source-id'] == n['source-id'] \ | |
157 and prev['target-id'] == n['target-id'] \ | |
158 and prev['object-class'] == n['object-class']: | |
159 # relation with same source, target and type | |
160 log("DEBUG", "similar relations: %s vs %s"%(prev, n)) | |
161 return n | |
162 | |
163 return None | |
164 | |
165 | |
166 def compareNodeLists(prev, cur, ctx, lostFile=None): | |
167 """compare list of previous and current nodes. | |
168 | |
169 prints results to the log and output file. | |
170 """ | |
171 prevNodes = {} | |
172 curNodes = {} | |
173 | |
174 # | |
175 # read nodes | |
176 # | |
177 for n in prev: | |
178 nid = n['id'] | |
179 if nid not in prevNodes: | |
180 prevNodes[nid] = n | |
181 else: | |
182 log("DEBUG", "duplicate save of prev node id="+nid) | |
183 if isinstance(prevNodes[nid], list): | |
184 prevNodes[nid].append(n) | |
185 else: | |
186 prevNodes[nid] = [prevNodes[nid], n] | |
187 | |
188 for n in cur: | |
189 nid = n['id'] | |
190 if nid not in curNodes: | |
191 curNodes[nid] = n | |
192 else: | |
193 log("DEBUG", "duplicate save of cur node id="+nid) | |
194 if isinstance(curNodes[nid], list): | |
195 curNodes[nid].append(n) | |
196 else: | |
197 curNodes[nid] = [curNodes[nid], n] | |
198 | |
199 # | |
200 # compare nodes | |
201 # | |
202 curNodeUnchecked = set(curNodes.keys()) | |
203 addPrevNodes = [] | |
204 addCurNodes = [] | |
205 | |
206 for nid in prevNodes: | |
207 prevNode = prevNodes[nid] | |
208 if isinstance(prevNode, list): | |
209 log("DEBUG", "multi-save prev node: %s"%prevNode) | |
210 # use the last version(?) | |
211 prevNode = prevNode[-1] | |
212 | |
213 if nid not in curNodes: | |
214 if prevNode['node-type'] == 'ATTRIBUTE' and prevNode['b64-value'] == '': | |
215 # emtpy attribute - ignore | |
216 continue | |
217 | |
218 else: | |
219 log("DEBUG", "node %s not in cur saves! %s"%(nid,prevNode)) | |
220 addPrevNodes.append(prevNode) | |
221 continue | |
222 | |
223 curNode = curNodes[nid] | |
224 if isinstance(curNode, list): | |
225 log("DEBUG", "multi-save cur node: %s"%curNode) | |
226 # use the last version? | |
227 curNode = curNode[-1] | |
228 | |
229 equalNodes(prevNode, curNode) | |
230 | |
231 curNodeUnchecked.remove(nid) | |
232 | |
233 # make list of additional current (=new) nodes | |
234 for nid in curNodeUnchecked: | |
235 curNode = curNodes[nid] | |
236 # list can contain lists | |
237 if isinstance(curNode, list): | |
238 # use the last version(?) | |
239 curNode = curNode[-1] | |
240 | |
241 addCurNodes.append(curNode) | |
242 log("DEBUG", "new node %s"%curNodes[nid]) | |
243 | |
244 # compare missing and new nodes | |
245 for n in addPrevNodes.copy(): | |
246 sn = getSimilarNode(n, addCurNodes) | |
247 if sn is not None: | |
248 # similar is good enough | |
249 addPrevNodes.remove(n) | |
250 addCurNodes.remove(sn) | |
251 | |
252 if len(addPrevNodes) > 0: | |
253 #print("ERROR: lost nodes: %s"%[prettyPrintNode(n) for n in addPrevNodes]) | |
254 log("ERROR", "in %s"%ctx) | |
255 for n in addPrevNodes: | |
256 log("ERROR","lost node: %s"%prettyPrintNode(n)) | |
257 if lostFile is not None: | |
258 printNodeCsv(n, lostFile) | |
259 | |
260 if len(addCurNodes) > 0: | |
261 #print("INFO: new nodes: %s"%[prettyPrintNode(n) for n in addCurNodes]) | |
262 for n in addCurNodes: | |
263 log("INFO", "new node: %s"%prettyPrintNode(n)) | |
264 | |
265 | |
266 | |
267 def analyseLogfile(inFilename, outFilename=None): | |
268 outFile = None | |
269 if outFilename is not None: | |
270 outFile = open(outFilename, mode='w') | |
271 printHeaderCsv(outFile) | |
272 | |
273 with open(inFilename) as f: | |
274 linecnt = 0 | |
275 saving = 0 | |
276 openSaves = {} | |
277 savingPrev = 0 | |
278 deleting = 0 | |
279 saveCtx = None | |
280 deleteCtx = None | |
281 prevSaves = [] | |
282 saves = [] | |
283 | |
284 for line in f: | |
285 linecnt += 1 | |
286 # | |
287 # parsing lines in order of specific match, not order of operation! | |
288 # | |
289 if '*** START Saving' in line: | |
290 saving += 1 | |
291 # make sure delete is off | |
292 deleting = 0 | |
293 log('DEBUG', line) | |
294 # parse time and id | |
295 saveCtx = parseStart(line) | |
296 saveId = saveCtx['id'] | |
297 if saveId in openSaves: | |
298 log('ERROR', "Duplicate save for same id! %s"%saveCtx) | |
299 # assume the first save was spurious | |
300 saving -= 1 | |
301 | |
302 openSaves[saveId] = saveCtx | |
303 | |
304 if len(openSaves) > 1: | |
305 log("ERROR", "Multiple open save (%s) in #%s of %s"%(saving, linecnt, line)) | |
306 # TODO: what now? | |
307 | |
308 elif '*** END Saving' in line: | |
309 saving -= 1 | |
310 saveCtx = parseStart(line) | |
311 saveId = saveCtx['id'] | |
312 if saveId in openSaves: | |
313 log('DEBUG', "End save matches start save: %s"%saveCtx) | |
314 del openSaves[saveId] | |
315 | |
316 elif 'null' in openSaves: | |
317 log('DEBUG', "Assume end save (of %s) matches start save null: %s"%(len(openSaves), saveCtx)) | |
318 del openSaves['null'] | |
319 | |
320 else: | |
321 log("ERROR", "End save without start save! %s"%saveCtx) | |
322 | |
323 # make sure delete is off | |
324 deleting = 0 | |
325 log('DEBUG', line) | |
326 | |
327 if len(openSaves) > 0: | |
328 log('WARNING', "Still open saves (%s) during end save in #%s of %s"%(saving, linecnt, line)) | |
329 | |
330 elif saving < 0: | |
331 log('ERROR', "Too many END saves!") | |
332 break | |
333 | |
334 log("INFO", "saving %s"%saveCtx) | |
335 log("INFO", "prev saves: %s"%len(prevSaves)) | |
336 log("INFO", "saves: %s"%len(saves)) | |
337 | |
338 if len(prevSaves) > 0: | |
339 compareNodeLists(prevSaves, saves, saveCtx, outFile) | |
340 | |
341 prevSaves = [] | |
342 saves = [] | |
343 | |
344 elif '** START save entity' in line: | |
345 pass | |
346 | |
347 elif '** END save entity' in line: | |
348 pass | |
349 | |
350 elif '** START remove entity' in line: | |
351 deleting += 1 | |
352 log('DEBUG', line) | |
353 deleteCtx = parseStart(line) | |
354 | |
355 if deleting > 1: | |
356 log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line)) | |
357 # TODO: what now? | |
358 break | |
359 | |
360 elif '** END remove entity' in line: | |
361 deleting -= 1 | |
362 log('DEBUG', line) | |
363 ctx = parseStart(line) | |
364 if deleteCtx['id'] != ctx['id']: | |
365 log('ERROR', "END delete has different ID!") | |
366 break | |
367 | |
368 if deleting > 1: | |
369 log("ERROR", "Concurrent delete (%s) in #%s of %s"%(saving, linecnt, line)) | |
370 # TODO: what now? | |
371 break | |
372 | |
373 deleteCtx = None | |
374 | |
375 elif '* START save entity' in line: | |
376 pass | |
377 | |
378 elif '* START save previous' in line: | |
379 savingPrev += 1 | |
380 | |
381 elif '* ...END save entity' in line: | |
382 pass | |
383 | |
384 elif '* END ...save previous' in line: | |
385 savingPrev -= 1 | |
386 | |
387 if saving < 0: | |
388 log("ERROR", "Too many END save previous!") | |
389 | |
390 elif 'save previous' in line: | |
391 data = parseSave(line) | |
392 if data is None: | |
393 log("DEBUG", "Error parsing line: %s"%line) | |
394 continue | |
395 | |
396 if omitDeleted and deleting > 0 and savingPrev > 0: | |
397 # this should be a save prev from deleting | |
398 delId = deleteCtx['id'] | |
399 # check if node is related to deleted id | |
400 if data.get('id', None) == delId or data.get('source-id', None) == delId \ | |
401 or data.get('target-id', None) == delId: | |
402 log('DEBUG', "intentionally deleted node: %s"%data) | |
403 | |
404 else: | |
405 log('ERROR', "Node without matching id in delete! %s"%data) | |
406 prevSaves.append(data) | |
407 | |
408 else: | |
409 prevSaves.append(data) | |
410 | |
411 elif 'save' in line: | |
412 data = parseSave(line) | |
413 if data is None: | |
414 log("DEBUG", "Error parsing line: %s"%line) | |
415 continue | |
416 | |
417 saves.append(parseSave(line)) | |
418 | |
419 if maxLinecnt is not None and linecnt >= maxLinecnt: | |
420 break | |
421 | |
422 log("SYSMSG", "%s lines of logfile scanned"%linecnt) | |
423 | |
424 | |
425 # | |
426 # public static void main :-) | |
427 # | |
428 | |
429 input_fn = None | |
430 output_fn = None | |
431 | |
432 # parse command line parameters | |
433 if len(sys.argv) > 2: | |
434 input_fn = sys.argv[1] | |
435 output_fn = sys.argv[2] | |
436 | |
437 # run analysis | |
438 analyseLogfile(input_fn, output_fn) | |
439 | |
440 else: | |
441 print("ERROR: missing parameters!") | |
442 print("use: check_ismi_log logfile csvfile") | |
443 exit(1) |