From c4b6ed19c80e079b54da9a4ef322b86b9188b1cf Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Fri, 23 Aug 2024 18:40:00 +1000 Subject: [PATCH] More tests for python and csv plugins, bz2 handling for csv writer --- countess/core/cmd.py | 1 - countess/plugins/csv.py | 5 ++ tests/input1.csv.bz2 | Bin 0 -> 82 bytes tests/input1.csv.gz | Bin 0 -> 73 bytes tests/plugins/test_csv.py | 98 ++++++++++++++++++++++++++++++++++- tests/plugins/test_python.py | 32 ++++++++++++ tests/test_cmd.py | 14 +++-- 7 files changed, 145 insertions(+), 5 deletions(-) create mode 100644 tests/input1.csv.bz2 create mode 100644 tests/input1.csv.gz diff --git a/countess/core/cmd.py b/countess/core/cmd.py index e934a20..c49242c 100644 --- a/countess/core/cmd.py +++ b/countess/core/cmd.py @@ -5,7 +5,6 @@ from .config import read_config - def process_ini(config_filename) -> None: graph = read_config(config_filename) graph.run() diff --git a/countess/plugins/csv.py b/countess/plugins/csv.py index 89bec0b..41261b7 100644 --- a/countess/plugins/csv.py +++ b/countess/plugins/csv.py @@ -1,3 +1,4 @@ +import bz2 import csv import gzip import logging @@ -140,6 +141,8 @@ def prepare(self, sources: list[str], row_limit: Optional[int] = None): filename = str(self.filename) if filename.endswith(".gz"): self.filehandle = gzip.open(filename, "wb") + elif filename.endswith(".bz2"): + self.filehandle = bz2.open(filename, "wb") else: self.filehandle = open(filename, "wb") else: @@ -182,3 +185,5 @@ def process(self, data: pd.DataFrame, source: str): def finalize(self): if isinstance(self.filehandle, BytesIO): yield self.filehandle.getvalue().decode("utf-8") + else: + self.filehandle.close() diff --git a/tests/input1.csv.bz2 b/tests/input1.csv.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..4ab66d26b09ad49bab8bdd5d17b80e49123fe6a8 GIT binary patch literal 82 zcmV-Y0ImN*T4*^jL0KkKSvOl*EC2u$S%3f#00cWAIpMZYAOJBy1|v;0XheE}sPq#w o%1D-{wRhS2mxz`EAq^HtN4j3W8I8OYdsFduBvXY619i2Mz!$Y05&!@I literal 0 
HcmV?d00001 diff --git a/tests/input1.csv.gz b/tests/input1.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..0088c32e50a757eabc3c29ff5efc94babd496dbb GIT binary patch literal 73 zcmV-P0Ji@hiwFpEy>4Uz18Ht>b#yT)@E6K>rOV>%xFU>3AO3TmJF*GtX;z~*^ f(lIgy(N#L8W~N+)r4>4smX=%qOaectEdT%jGwmEO literal 0 HcmV?d00001 diff --git a/tests/plugins/test_csv.py b/tests/plugins/test_csv.py index 2da199d..d2cc699 100644 --- a/tests/plugins/test_csv.py +++ b/tests/plugins/test_csv.py @@ -1,5 +1,9 @@ -from countess.plugins.csv import LoadCsvPlugin +import bz2 +import gzip +import pandas as pd + +from countess.plugins.csv import LoadCsvPlugin, SaveCsvPlugin def test_load_csv(): plugin = LoadCsvPlugin() @@ -9,6 +13,37 @@ def test_load_csv(): assert len(output_df) == 4 +def test_load_csv_index(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input1.csv") + plugin.set_parameter("columns.0.name", "whatever") + plugin.set_parameter("columns.0.type", "string") + plugin.set_parameter("columns.0.index", True) + plugin.set_parameter("columns.1.name", "stuff") + plugin.set_parameter("columns.1.type", "integer") + plugin.set_parameter("columns.1.index", False) + output_df = next(plugin.load_file(0)) + assert output_df.index.name == "whatever" + assert list(output_df.columns) == ["stuff"] + assert len(output_df) == 4 + + +def test_load_csv_gz(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input1.csv.gz") + output_df = next(plugin.load_file(0)) + assert list(output_df.columns) == ["thing", "count"] + assert len(output_df) == 4 + + +def test_load_csv_bz2(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input1.csv.bz2") + output_df = next(plugin.load_file(0)) + assert list(output_df.columns) == ["thing", "count"] + assert len(output_df) == 4 + + def test_load_tsv(): plugin = LoadCsvPlugin() plugin.set_parameter("files.0.filename", "tests/input1.tsv") @@ -59,3 +94,64 @@ def 
test_filename_column():
     output_df = next(plugin.load_file(0))
     assert "filename" in output_df.columns
     assert output_df["filename"].iloc[1] == "input1"
+
+
+df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['a','b','c'])  # frame shared by the save tests below
+
+def test_save_csv():
+
+    plugin = SaveCsvPlugin()
+    plugin.set_parameter("header", True)
+    plugin.set_parameter("filename", "tests/output1.csv")
+    plugin.prepare(["test"], None)
+    plugin.process(df, 'test')
+    list(plugin.finalize())  # finalize() is a generator: it must be consumed or the file is never closed
+
+    with open("tests/output1.csv", "r", encoding="utf-8") as fh:
+        text = fh.read()
+        assert text == "a,b,c\n1,2,3\n4,5,6\n7,8,9\n"
+
+
+def test_save_csv_gz():
+
+    plugin = SaveCsvPlugin()
+    plugin.set_parameter("header", True)
+    plugin.set_parameter("filename", "tests/output1.csv.gz")
+    plugin.prepare(["test"], None)
+    plugin.process(df, 'test')
+    list(plugin.finalize())  # drain the generator so the gzip stream is closed before reading back
+
+    with gzip.open("tests/output1.csv.gz", "rt") as fh:
+        text = fh.read()
+        assert text == "a,b,c\n1,2,3\n4,5,6\n7,8,9\n"
+
+
+def test_save_csv_bz2():
+
+    plugin = SaveCsvPlugin()
+    plugin.set_parameter("header", True)
+    plugin.set_parameter("filename", "tests/output1.csv.bz2")
+    plugin.prepare(["test"], None)
+    plugin.process(df, 'test')
+    list(plugin.finalize())  # drain the generator so the bz2 stream is closed before reading back
+
+    with bz2.open("tests/output1.csv.bz2", "rt") as fh:
+        text = fh.read()
+        assert text == "a,b,c\n1,2,3\n4,5,6\n7,8,9\n"
+
+
+df2 = pd.DataFrame([[10,11,12]], columns=['a','b','d'])  # same 'a','b' as df, but 'd' in place of 'c'
+
+
+def test_save_csv_multi():
+    plugin = SaveCsvPlugin()
+    plugin.set_parameter("header", True)
+    plugin.set_parameter("filename", "tests/output2.csv")
+    plugin.prepare(["test"], None)
+    plugin.process(df, 'test')
+    plugin.process(df2, 'test2')
+    list(plugin.finalize())  # finalize() is a generator: it must be consumed or the file is never closed
+
+    with open("tests/output2.csv", "r", encoding="utf-8") as fh:
+        text = fh.read()
+        assert text == "a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,,12\n"
diff --git a/tests/plugins/test_python.py b/tests/plugins/test_python.py
index 28e376a..f63e4c9 100644
--- a/tests/plugins/test_python.py
+++ b/tests/plugins/test_python.py
@@ -53,3 +53,35 @@ def
test_python_dropna(): assert any(np.isnan(dfo["d"])) assert not any(np.isnan(dfo["b"])) + + +def test_python_filter(): + plugin = PythonPlugin() + plugin.set_parameter( + "code", + """ +__filter = d < 10 and a % 2 + """, + ) + + plugin.prepare(["test"], None) + dfo = plugin.process_dataframe(dfi) + + assert "__filter" not in dfo.columns + assert len(dfo) == 2 + + +def test_python_exception(caplog): + plugin = PythonPlugin() + plugin.set_parameter( + "code", + """ +e = 1/0 + """, + ) + + plugin.prepare(["test"], None) + dfo = plugin.process_dataframe(dfi) + assert len(dfo) == 5 + + assert "Exception" in caplog.text diff --git a/tests/test_cmd.py b/tests/test_cmd.py index 8f71892..381b727 100644 --- a/tests/test_cmd.py +++ b/tests/test_cmd.py @@ -1,8 +1,10 @@ import csv +from unittest.mock import patch import pytest -from countess.core.cmd import run as cmd_run +import countess.core.cmd +from countess.core.cmd import run, main expected_output = """"thing","foo","bar","baz","qux","number","zz" "bar",10,2,1,4,232,0.08620689655172414 @@ -13,8 +15,14 @@ @pytest.mark.slow def test_command_invocation(): - cmd_run(["countess_cmd", "tests/simple.ini"]) + run(["countess_cmd", "tests/simple.ini"]) - with open("tests/output.csv", "r") as fh: + with open("tests/output.csv", "r", encoding="utf-8") as fh: output = fh.read() assert output == expected_output + + +def test_main(): + with patch.object(countess.core.cmd, 'run') as p: + main() + p.assert_called_once()