Truncate large report text data (#1020)

Truncates large report text data by setting a maximum size that a report can reach before it is truncated.
google · Mar 18, 2022 · 414655d · 414655d
1 parent 5d6e9e9
commit 414655d
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 4 deletions.
diff --git a/turbinia/task_manager.py b/turbinia/task_manager.py
@@ -684,8 +684,8 @@ def process_tasks(self):
         timeout = self.check_task_timeout(task)
         if timeout:
           log.warning(
-              'Task {0:s} timed on server out after {1:d} seconds. Auto-closing Task.'
-              .format(celery_task.id, timeout))
+              'Task {0:s} timed out on server after {1:d} seconds. '
+              'Auto-closing Task.'.format(celery_task.id, timeout))
           task = self.timeout_task(task, timeout)
           completed_tasks.append(task)
 

diff --git a/turbinia/workers/__init__.py b/turbinia/workers/__init__.py
@@ -50,6 +50,12 @@
 from prometheus_client import Histogram
 
 METRICS = {}
+# Maximum size that a report can reach before it is truncated.  This is a
+# best-effort estimate, not a guarantee: it is the datastore entity size
+# limit[1] less some overhead for the rest of the attributes that will be
+# saved in the response.
+# [1] https://cloud.google.com/datastore/docs/concepts/limits
+REPORT_MAXSIZE = int(1048572 * 0.75)
 
 log = logging.getLogger('turbinia')
 
@@ -220,6 +226,18 @@ def close(self, task, success, status=None):
             'Evidence {0:s} has empty source_path so '
             'not saving.'.format(evidence.name))
 
+      # Truncate report text data if it is approaching the size of the max
+      # datastore entity size (See REPORT_MAXSIZE definition for details).
+      if (hasattr(evidence, 'text_data') and evidence.text_data and
+          len(evidence.text_data) > REPORT_MAXSIZE):
+        message = (
+            'The text_data attribute has a size {0:d} larger than the max '
+            'size {1:d} so truncating the response.'.format(
+                len(evidence.text_data), REPORT_MAXSIZE))
+        self.log(message, logging.WARN)
+        evidence.text_data = evidence.text_data[:REPORT_MAXSIZE] + '\n'
+        evidence.text_data += message
+
       if not evidence.request_id:
         evidence.request_id = self.request_id
 
@@ -659,7 +677,6 @@ def execute(
 
       ret = proc.returncode
 
-    result.error['stdout'] = str(stdout)
     result.error['stderr'] = str(stderr)
 
     if stderr_file and not stderr:

diff --git a/turbinia/workers/workers_test.py b/turbinia/workers/workers_test.py
@@ -125,6 +125,18 @@ def unregisterMetrics(self):
 class TestTurbiniaTask(TestTurbiniaTaskBase):
   """Test TurbiniaTask class."""
 
+  def testTurbiniaTaskCloseTruncate(self):
+    """Tests that close() truncates oversized report text and notes it."""
+    evidence_ = evidence.ReportText(source_path='/no/path')
+    max_size = 2**20  # Larger than REPORT_MAXSIZE, so close() must truncate.
+    evidence_.text_data = 'A' * max_size
+    self.result.add_evidence(evidence_, self.task._evidence_config)
+    self.result.close(self.task, success=True)
+    self.remove_files.append(
+        os.path.join(self.task.base_output_dir, 'worker-log.txt'))
+    self.assertIn('truncating', evidence_.text_data[-100:])  # Appended notice.
+    self.assertLessEqual(len(evidence_.text_data), max_size * 0.8)
+
   def testTurbiniaTaskSerialize(self):
     """Test that we can properly serialize/deserialize tasks."""
     out_dict = self.plaso_task.serialize()
@@ -267,7 +279,6 @@ def testTurbiniaTaskExecute(self, popen_mock):
     # Command was executed, has the correct output saved and
     # TurbiniaTaskResult.close() was called with successful status.
     popen_mock.assert_called_with(cmd, stdout=-1, stderr=-1, cwd=None, env=None)
-    self.assertEqual(self.result.error['stdout'], str(output[0]))
     self.assertEqual(self.result.error['stderr'], str(output[1]))
     self.assertEqual(stdout_data, output[0])
     self.result.close.assert_called_with(self.task, success=True)