Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix sorting of None values in NMPVCD.do_persist(). #1321

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import unittest
from aminer.parsing.ParserMatch import ParserMatch
from aminer.parsing.MatchElement import MatchElement
from aminer.parsing.OptionalMatchModelElement import OptionalMatchModelElement
from aminer.analysis.NewMatchPathValueComboDetector import NewMatchPathValueComboDetector
from aminer.input.LogAtom import LogAtom
import time
Expand All @@ -24,6 +26,13 @@ class NewMatchPathValueComboDetectorTest(TestBase):
match_element2 = seq2.get_match_element("", match_context)
match_element3 = fdme3.get_match_element("/seq", match_context)

match_context = DummyMatchContext(b"ddd 25538ddd ")
fdme5 = DummyFixedDataModelElement("s1", b"ddd ")
fdme6 = OptionalMatchModelElement("o", DummyFixedDataModelElement("d1", b"25539"))
seq3 = DummySequenceModelElement("seq", [fdme5, fdme6])
match_element4 = seq3.get_match_element("", match_context)
match_element5 = fdme5.get_match_element("/seq", match_context)

def test1receive_atom(self):
"""
Test if log atoms are processed correctly and the detector is learning (learn_mode=True) and stops if learn_mode=False.
Expand Down Expand Up @@ -139,9 +148,11 @@ def test4persistence(self):
t = round(time.time(), 3)
log_atom1 = LogAtom(self.match_element1.match_string, ParserMatch(self.match_element1), t, nmpvcd)
log_atom2 = LogAtom(self.match_element2.match_string, ParserMatch(self.match_element2), t, nmpvcd)
log_atom3 = LogAtom(self.match_element4.match_string, ParserMatch(self.match_element4), t, nmpvcd)

self.assertTrue(nmpvcd.receive_atom(log_atom1))
self.assertTrue(nmpvcd.receive_atom(log_atom2))
self.assertFalse(nmpvcd.receive_atom(log_atom3))
self.assertEqual(nmpvcd.known_values_set, {(b"ddd ", b"25538"), (b" pid=", b"25537")})
nmpvcd.do_persist()
with open(nmpvcd.persistence_file_name, "r") as f:
Expand All @@ -154,6 +165,23 @@ def test4persistence(self):
other = NewMatchPathValueComboDetector(self.aminer_config, [self.match_element1.path, self.match_element2.path], [self.stream_printer_event_handler])
self.assertEqual(nmpvcd.known_values_set, other.known_values_set)

nmpvcd = NewMatchPathValueComboDetector(self.aminer_config, ["/seq/s1", "/seq/d1"], [self.stream_printer_event_handler], learn_mode=True, output_logline=False, allow_missing_values_flag=True)
self.assertTrue(nmpvcd.receive_atom(log_atom1))
self.assertTrue(nmpvcd.receive_atom(log_atom2))
self.assertTrue(nmpvcd.receive_atom(log_atom3))
self.assertEqual(nmpvcd.known_values_set, {(b"ddd ", b"25538"), (b" pid=", b"25537"), (b"ddd ", None)})
nmpvcd.known_values_set = {(b"ddd ", b"25538"), (b" pid=", b"25537"), (b"ddd ", None)}
nmpvcd.do_persist()
with open(nmpvcd.persistence_file_name, "r") as f:
self.assertEqual(f.read(), '[["bytes: pid=", "bytes:25537"], ["bytes:ddd ", null], ["bytes:ddd ", "bytes:25538"]]')

nmpvcd.known_values_set = set()
nmpvcd.load_persistence_data()
self.assertEqual(nmpvcd.known_values_set, {(b"ddd ", b"25538"), (b" pid=", b"25537"), (b"ddd ", None)})

other = NewMatchPathValueComboDetector(self.aminer_config, [self.match_element1.path, self.match_element2.path], [self.stream_printer_event_handler])
self.assertEqual(nmpvcd.known_values_set, other.known_values_set)

def test5validate_parameters(self):
"""Test all initialization parameters for the detector. Input parameters must be validated in the class."""
self.assertRaises(ValueError, NewMatchPathValueComboDetector, self.aminer_config, [""], [self.stream_printer_event_handler])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,11 @@ def do_timer(self, trigger_time):

def do_persist(self):
"""Immediately write persistence data to storage."""
PersistenceUtil.store_json(self.persistence_file_name, sorted(list(self.known_values_set)))
try:
PersistenceUtil.store_json(self.persistence_file_name, sorted(list(self.known_values_set),
key=lambda L: tuple(el if el is not None else b'-' for el in L)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not understand why the lambda expression is necessary here. When there is None in the list, sorting will fail.

>>> x = [b'a', None, b'1']
>>> sorted(list(x), key=lambda L: tuple(el if el is not None else b'-' for el in L))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 1, in <lambda>
TypeError: 'NoneType' object is not iterable

Also, when replacing None with '-' this may be problematic if the value observed in the log lines can also be '-' (which is not unusual, e.g. in Apache Access logs). It is therefore important that None is persisted, and not some string replacement.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

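Thank you for pointing out that this code needs a comment.
The lambda function is used only as a sort key, so that tuples containing None values can be sorted. Without it, sorting raises a TypeError, because None cannot be compared with bytes.
The data itself is not modified by the key function; it is afterwards stored as JSON as usual (None becomes null).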

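The data structure of known_values_set differs from the flat list in your example: it is a set of tuples, so the lambda receives a whole tuple (not a single value) and iterates over its elements. This case is covered in NewMatchPathValueComboDetectorTest.py, lines 168-180.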

Here is a simple test:
Python 3.8.10 (default, Jul 29 2024, 17:02:10)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.

>>> x = {(b"ddd ", b"25538"), (b" pid=", b"25537"), (b"ddd ", None)}
>>> sorted(list(x), key=lambda L: tuple(el if el is not None else b'-' for el in L))
[(b' pid=', b'25537'), (b'ddd ', None), (b'ddd ', b'25538')]

I don't think there is any bug in the code.

except TypeError:
PersistenceUtil.store_json(self.persistence_file_name, list(self.known_values_set))
logging.getLogger(DEBUG_LOG_NAME).debug("%s persisted data.", self.__class__.__name__)

def allowlist_event(self, event_type, event_data, allowlisting_data):
Expand Down