Work around pandas DataFrame.apply(raw=True) not passing args and kwargs through

CountESS-Project · Oct 16, 2023 · 9cfe6d9 · 9cfe6d9
1 parent 68052c0
commit 9cfe6d9
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 6 deletions.
diff --git a/countess/core/logger.py b/countess/core/logger.py
@@ -35,7 +35,9 @@ def error(self, message: str, detail: Optional[str] = None):
 
  def exception(self, exception: Exception):
  # Slightly odd call signature, kept for compatibility with Python 3.9 and 3.10
+ # XXX format more nicely (message is a list, so += extends it char by char; the join below restores the text)
  message = traceback.format_exception(None, value=exception, tb=None)
+ message += "\n\n" + "".join(traceback.format_tb(exception.__traceback__))
  self.error(str(exception), detail="".join(message))
 
  def clear(self):

diff --git a/countess/core/plugins.py b/countess/core/plugins.py
@@ -413,12 +413,15 @@ class PandasTransformDictToXMixin:
  """Transformer which takes a row as a dictionary"""
 
  def dataframe_to_series(self, dataframe: pd.DataFrame, logger: Logger) -> pd.Series:
+ # XXX Pandas 2.1.x has a bug which prevents args and kwargs from
+ # being passed through to the function when raw=True, so they are
+ # bound in a lambda instead. This seems to be fixed in Pandas 2.2.0.dev,
+ # after which the lambda can go. NOTE: it rebuilds the column list per row.
+ # https://github.com/pandas-dev/pandas/issues/55009
  return dataframe.apply(
- self.process_raw,
+ lambda x: self.process_raw(x, list(dataframe.columns), logger),
  axis=1,
  raw=True,
- columns=list(dataframe.columns),
- logger=logger,
  )
 
  def process_dict(self, data, logger: Logger):

diff --git a/countess/plugins/python.py b/countess/plugins/python.py
@@ -1,3 +1,5 @@
+from types import CodeType
+
 import pandas as pd
 
 from countess import VERSION
@@ -18,7 +20,6 @@
 # PandasTransformDictToDictPlugin
 # which is a bit more efficient.
 
-
 class PythonPlugin(PandasTransformRowToDictPlugin):
  name = "Python Code"
  description = "Apply python code to each row."
@@ -35,13 +36,18 @@ class PythonPlugin(PandasTransformRowToDictPlugin):
  "code": TextParam("Python Code"),
  }
 
+ code_object = None
+
  def process_row(self, row: pd.Series, logger: Logger):
  assert isinstance(self.parameters["code"], TextParam)
  assert isinstance(self.parameters["columns"], PerColumnArrayParam)
- code_object = compile(self.parameters["code"].value, "<PythonPlugin>", mode="exec")
+ assert isinstance(self.code_object, CodeType)
 
  row_dict = dict(row)
- exec(code_object, {}, row_dict) # pylint: disable=exec-used
+ try:
+ exec(self.code_object, {}, row_dict) # pylint: disable=exec-used
+ except Exception as exc: # pylint: disable=broad-exception-caught
+ logger.exception(exc)
 
  column_parameters = list(zip(self.input_columns, self.parameters["columns"].params))
  columns_to_remove = set(col for col, param in column_parameters if not param.value)
@@ -53,6 +59,9 @@ def process_dataframe(self, dataframe: pd.DataFrame, logger: Logger) -> pd.DataF
  the indexes so we can use their values easily and
  b) we don't need to merge afterwards"""
 
+ # XXX cache this?
+ self.code_object = compile(self.parameters["code"].value, "<PythonPlugin>", mode="exec")
+
  dataframe = dataframe.reset_index(drop=False)
  series = self.dataframe_to_series(dataframe, logger)
  dataframe = self.series_to_dataframe(series)