Skip to content

Commit

Permalink
Truncate large report text data (#1020)
Browse files Browse the repository at this point in the history
Truncate large report text data, sets the maximum size that the report can be before truncating it.
  • Loading branch information
aarontp authored Mar 18, 2022
1 parent 5d6e9e9 commit 414655d
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 4 deletions.
4 changes: 2 additions & 2 deletions turbinia/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,8 +684,8 @@ def process_tasks(self):
timeout = self.check_task_timeout(task)
if timeout:
log.warning(
'Task {0:s} timed on server out after {1:d} seconds. Auto-closing Task.'
.format(celery_task.id, timeout))
'Task {0:s} timed out on server after {1:d} seconds. '
'Auto-closing Task.'.format(celery_task.id, timeout))
task = self.timeout_task(task, timeout)
completed_tasks.append(task)

Expand Down
19 changes: 18 additions & 1 deletion turbinia/workers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@
from prometheus_client import Histogram

METRICS = {}
# Set the maximum size that the report can be before truncating it. This is a
# best effort estimate and not a guarantee and comes from the limit for
# datastore entities[1] less some overhead for the rest of the attributes that
# will be saved in the response.
# [1]https://cloud.google.com/datastore/docs/concepts/limits
REPORT_MAXSIZE = int(1048572 * 0.75)

log = logging.getLogger('turbinia')

Expand Down Expand Up @@ -220,6 +226,18 @@ def close(self, task, success, status=None):
'Evidence {0:s} has empty source_path so '
'not saving.'.format(evidence.name))

# Truncate report text data if it is approaching the size of the max
# datastore entity size (See REPORT_MAXSIZE definition for details).
if (hasattr(evidence, 'text_data') and evidence.text_data and
len(evidence.text_data) > REPORT_MAXSIZE):
message = (
'The text_data attribute has a size {0:d} larger than the max '
'size {1:d} so truncating the response.'.format(
len(evidence.text_data), REPORT_MAXSIZE))
self.log(message, logging.WARN)
evidence.text_data = evidence.text_data[:REPORT_MAXSIZE] + '\n'
evidence.text_data += message

if not evidence.request_id:
evidence.request_id = self.request_id

Expand Down Expand Up @@ -659,7 +677,6 @@ def execute(

ret = proc.returncode

result.error['stdout'] = str(stdout)
result.error['stderr'] = str(stderr)

if stderr_file and not stderr:
Expand Down
13 changes: 12 additions & 1 deletion turbinia/workers/workers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,18 @@ def unregisterMetrics(self):
class TestTurbiniaTask(TestTurbiniaTaskBase):
"""Test TurbiniaTask class."""

def testTurbiniaTaskCloseTruncate(self):
"""Tests that the close method will truncate large report output."""
evidence_ = evidence.ReportText(source_path='/no/path')
max_size = 2**20
evidence_.text_data = 'A' * max_size
self.result.add_evidence(evidence_, self.task._evidence_config)
self.result.close(self.task, success=True)
self.remove_files.append(
os.path.join(self.task.base_output_dir, 'worker-log.txt'))
self.assertIn('truncating', evidence_.text_data[-100:])
self.assertTrue(len(evidence_.text_data) <= (max_size * 0.8))

def testTurbiniaTaskSerialize(self):
"""Test that we can properly serialize/deserialize tasks."""
out_dict = self.plaso_task.serialize()
Expand Down Expand Up @@ -267,7 +279,6 @@ def testTurbiniaTaskExecute(self, popen_mock):
# Command was executed, has the correct output saved and
# TurbiniaTaskResult.close() was called with successful status.
popen_mock.assert_called_with(cmd, stdout=-1, stderr=-1, cwd=None, env=None)
self.assertEqual(self.result.error['stdout'], str(output[0]))
self.assertEqual(self.result.error['stderr'], str(output[1]))
self.assertEqual(stdout_data, output[0])
self.result.close.assert_called_with(self.task, success=True)
Expand Down

0 comments on commit 414655d

Please sign in to comment.