diff --git a/countess/plugins/hgvs_parser.py b/countess/plugins/hgvs_parser.py index d5a2fa6..4669b78 100644 --- a/countess/plugins/hgvs_parser.py +++ b/countess/plugins/hgvs_parser.py @@ -88,6 +88,7 @@ def series_to_dataframe(self, series: pd.Series) -> pd.DataFrame: dataframe = super().series_to_dataframe(series) if self.parameters["multi"].value: + if self.parameters["split"].value: dataframe = dataframe.explode(["var", "loc"]) else: diff --git a/docs/included-plugins/index.md b/docs/included-plugins/index.md index b7b9f68..c17877d 100644 --- a/docs/included-plugins/index.md +++ b/docs/included-plugins/index.md @@ -217,3 +217,22 @@ The reference sequence can either be provided directly as a configuration parame *See also: [countess-minimap2 plugin](https://github.com/CountESS-Project/countess-minimap2), a variant caller which uses 'minimap2' to find sequences within a genome.* +#### Parameters + +Input Column +: the input column with the variant sequence + +Reference +: (optional) select column which contains the reference sequence ... + +Sequence +: (optional) ... 
or supply a reference sequence as a value + +Output Column +: Column name for HGVS string + +Max Mutations +: Maximum number of mutations. If no variant with this number or fewer mutations is found, a null value is returned for the output + +Drop +: Drop rows which would have null values for output diff --git a/script/run-tests b/script/run-tests index 0d538a9..81948de 100755 --- a/script/run-tests +++ b/script/run-tests @@ -1,6 +1,6 @@ #!/bin/bash -coverage run --source=countess -m pytest --doctest-modules countess/ tests/ +xvfb-run coverage run --source=countess -m pytest --doctest-modules countess/ tests/ coverage report --skip-empty --sort=-cover diff --git a/tests/input1.tsv b/tests/input1.tsv new file mode 100644 index 0000000..1adbea5 --- /dev/null +++ b/tests/input1.tsv @@ -0,0 +1,5 @@ +thing count +foo 1212 +bar 232 +baz 565 +qux 999 diff --git a/tests/input1.txt b/tests/input1.txt new file mode 100644 index 0000000..e68a43f --- /dev/null +++ b/tests/input1.txt @@ -0,0 +1,5 @@ +thing count +foo 1212 +bar 232 +baz 565 +qux 999 diff --git a/tests/input2.csv b/tests/input2.csv new file mode 100644 index 0000000..a1a1876 --- /dev/null +++ b/tests/input2.csv @@ -0,0 +1,4 @@ +x,y +1,"this line has a comma, and a double "" quote" +2,"this line has a comma, and an escaped \" quote" +3,this line has a comment # don't read this diff --git a/tests/plugins/test_csv.py b/tests/plugins/test_csv.py index 8eaca3f..0bee0c1 100644 --- a/tests/plugins/test_csv.py +++ b/tests/plugins/test_csv.py @@ -1,6 +1,3 @@ -import pandas as pd -import pytest - from countess.core.logger import MultiprocessLogger from countess.plugins.csv import LoadCsvPlugin @@ -9,10 +6,53 @@ def test_load_csv(): plugin = LoadCsvPlugin() - plugin.set_parameter("files.0.filename", "tests/input1.csv") + output_df = next(plugin.load_file(0, logger)) + assert list(output_df.columns) == ["thing", "count"] + assert len(output_df) == 4 +def test_load_tsv(): + plugin = LoadCsvPlugin() + 
plugin.set_parameter("files.0.filename", "tests/input1.tsv") + plugin.set_parameter("delimiter", "TAB") output_df = next(plugin.load_file(0, logger)) + assert list(output_df.columns) == ["thing", "count"] + assert len(output_df) == 4 +def test_load_txt(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input1.txt") + plugin.set_parameter("delimiter", "WHITESPACE") + output_df = next(plugin.load_file(0, logger)) assert list(output_df.columns) == ["thing", "count"] assert len(output_df) == 4 + +def test_load_quoting_double(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input2.csv") + plugin.set_parameter("quoting", "Double-Quote") + output_df = next(plugin.load_file(0, logger)) + assert output_df['y'].iloc[0] == 'this line has a comma, and a double " quote' + +def test_load_quoting_escaped(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input2.csv") + plugin.set_parameter("quoting", "Quote with Escape") + output_df = next(plugin.load_file(0, logger)) + assert output_df['y'].iloc[1] == 'this line has a comma, and an escaped " quote' + +def test_load_comment(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input2.csv") + plugin.set_parameter("quoting", "Double-Quote") + plugin.set_parameter("comment", "#") + output_df = next(plugin.load_file(0, logger)) + assert output_df['y'].iloc[2] == "this line has a comment " + +def test_filename_column(): + plugin = LoadCsvPlugin() + plugin.set_parameter("files.0.filename", "tests/input1.csv") + plugin.set_parameter("filename_column", 'filename') + output_df = next(plugin.load_file(0, logger)) + assert 'filename' in output_df.columns + assert output_df['filename'].iloc[1] == "./tests/input1.csv" diff --git a/tests/plugins/test_hgvs_parser.py b/tests/plugins/test_hgvs_parser.py new file mode 100644 index 0000000..5e902ea --- /dev/null +++ b/tests/plugins/test_hgvs_parser.py @@ -0,0 +1,78 @@ + +import pandas 
as pd + +from countess.plugins.hgvs_parser import HgvsParserPlugin +from countess.core.logger import MultiprocessLogger + +logger = MultiprocessLogger() + +df1 = pd.DataFrame([ + {'hgvs': 'NC_000017.11:g.[43124022G>C;43124175C>T;43124111A>G]', + 'guides': '43124022G>C;43124111A>G' } +]) + +def test_hgvs_parser(): + + plugin = HgvsParserPlugin() + plugin.set_parameter('column', 'hgvs') + plugin.set_parameter('guides_col', 'guides') + + df = plugin.process_dataframe(df1, logger) + + assert df['var_1'].iloc[0] == '43124175C>T' + assert df['guide_1'].iloc[0] == True + assert df['guide_2'].iloc[0] == True + +def test_hgvs_parser_guides_str(): + + plugin = HgvsParserPlugin() + plugin.set_parameter('column', 'hgvs') + plugin.set_parameter('guides_str', '43124022G>C;43124111A>G') + + df = plugin.process_dataframe(df1, logger) + + assert df['var_1'].iloc[0] == '43124175C>T' + assert df['guide_1'].iloc[0] == True + assert df['guide_2'].iloc[0] == True + +def test_hgvs_parser_split(): + plugin = HgvsParserPlugin() + plugin.set_parameter('column', 'hgvs') + plugin.set_parameter('guides_col', 'guides') + plugin.set_parameter('split', True) + + df = plugin.process_dataframe(df1, logger) + + assert df['loc_1'].iloc[0] == '43124175' + assert df['var_1'].iloc[0] == 'C>T' + assert df['guide_1'].iloc[0] == True + assert df['guide_2'].iloc[0] == True + +def test_hgvs_parser_multi(): + + plugin = HgvsParserPlugin() + plugin.set_parameter('column', 'hgvs') + plugin.set_parameter('guides_str', '43124022G>C') + plugin.set_parameter('multi', True) + plugin.set_parameter('max_var', 2) + + df = plugin.process_dataframe(df1, logger) + + assert df['var'].iloc[0] == '43124175C>T' + assert df['var'].iloc[1] == '43124111A>G' + +def test_hgvs_parser_split_and_multi(): + + plugin = HgvsParserPlugin() + plugin.set_parameter('column', 'hgvs') + plugin.set_parameter('guides_str', '43124022G>C') + plugin.set_parameter('split', True) + plugin.set_parameter('multi', True) + 
plugin.set_parameter('max_var', 2) + + df = plugin.process_dataframe(df1, logger) + + assert df['var'].iloc[0] == 'C>T' + assert df['var'].iloc[1] == 'A>G' + assert df['loc'].iloc[0] == '43124175' + assert df['loc'].iloc[1] == '43124111'