From eb606323555a567440b20e38ab4847ddb38c2f89 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 19:32:10 +0200 Subject: [PATCH 01/15] Test and fix rename_dim --- fast_plotter/postproc/functions.py | 2 +- tests/postproc/test_functions.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index 4d63e2e..aa15a55 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -190,7 +190,7 @@ def rename_dim(df, mapping): """ Rename one or more dimensions """ - df.index.names = [mapping.get(n, n) for n in df.df.index.names] + df.index.names = [mapping.get(n, n) for n in df.index.names] return df diff --git a/tests/postproc/test_functions.py b/tests/postproc/test_functions.py index 08b5a74..0fd8071 100644 --- a/tests/postproc/test_functions.py +++ b/tests/postproc/test_functions.py @@ -70,9 +70,9 @@ def test_combine_cols_AND_split_dimension(binned_df): # #def rename_cols(df, mapping): # pass -# def test_rename_dim(): -# #def rename_dim(df, mapping): -# pass +def test_rename_dim(binned_df): + result = funcs.rename_dim(binned_df, {"int": "integers", "cat": "CATEGORICALS"}) + assert result.index.names == ["integers", "CATEGORICALS", "interval"] # def test_split(): # #def split(df, axis, keep_split_dim, return_meta=True): From ddba0b71237a307cf5802c55e9508c4636ef6daf Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 19:34:27 +0200 Subject: [PATCH 02/15] Update CHANGELOG --- CHANGELOG.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4967fdb..4838e9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [0.5.0] - 2020-03-30 +## [0.5.1] - 2020-03-30 +### Fixed +- Bugs in post-processing modules, PR #29 [@benkrikler](github.com/benkrikler) + +## [0.5.0] - 2020-03-29 ### Added -- New post-processing command to reshape outputs of fast-carpenter +- New post-processing command to reshape outputs of fast-carpenter from PR #28 [@benkrikler](github.com/benkrikler) ## [0.4.0] - 2020-02-26 - Many changes from PR #26 [@benkrikler](github.com/benkrikler) From e12d39eb542f4d7b0a108c172cec0eaa579c9ff6 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 19:34:34 +0200 Subject: [PATCH 03/15] =?UTF-8?q?Bump=20version:=200.5.0=20=E2=86=92=200.5?= =?UTF-8?q?.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fast_plotter/version.py | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fast_plotter/version.py b/fast_plotter/version.py index d1947dd..cd7cc73 100644 --- a/fast_plotter/version.py +++ b/fast_plotter/version.py @@ -12,5 +12,5 @@ def split_version(version): return tuple(result) -__version__ = '0.5.0' +__version__ = '0.5.1' version_info = split_version(__version__) # noqa diff --git a/setup.cfg b/setup.cfg index 99f64d1..644ec11 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0 +current_version = 0.5.1 commit = True tag = False From cfec0c5ff9fe6e0bc6e3cdd7720cd4175d3440d7 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 19:35:13 +0200 Subject: [PATCH 04/15] Fix pep8 --- tests/postproc/test_functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/postproc/test_functions.py b/tests/postproc/test_functions.py index 0fd8071..ace720c 100644 --- a/tests/postproc/test_functions.py +++ b/tests/postproc/test_functions.py @@ -70,10 +70,12 @@ def test_combine_cols_AND_split_dimension(binned_df): # #def rename_cols(df, mapping): # pass + def test_rename_dim(binned_df): result = funcs.rename_dim(binned_df, {"int": 
"integers", "cat": "CATEGORICALS"}) assert result.index.names == ["integers", "CATEGORICALS", "interval"] + # def test_split(): # #def split(df, axis, keep_split_dim, return_meta=True): # pass From 8e81027327aaa8f9dd91e6af95251bd11dcdbb89 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 20:37:52 +0200 Subject: [PATCH 05/15] Handle when no bins need to be dropped --- fast_plotter/postproc/functions.py | 2 ++ tests/postproc/test_functions.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index aa15a55..b942b19 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -119,6 +119,8 @@ def split_dimension(df, axis, delimeter=";"): def keep_bins(df, axis, keep): """Keep bins on the single dimension, dropping others""" others = {val for val in df.index.unique(axis) if val not in keep} + if not others: + return df logger.info("Dropping values for '%s': %s", axis, str(others)) out = df.drop(others, level=axis, axis="index") return out diff --git a/tests/postproc/test_functions.py b/tests/postproc/test_functions.py index ace720c..a176ca6 100644 --- a/tests/postproc/test_functions.py +++ b/tests/postproc/test_functions.py @@ -33,9 +33,9 @@ def test_query(binned_df): def test_rebin(binned_df): - result = funcs.rebin(binned_df.copy(), axis="int", mapping=dict(zip(range(4), [0, 2] * 2))) + result = funcs.rebin(binned_df.copy(), rename="hruff", axis="int", mapping=dict(zip(range(4), [0, 2] * 2))) assert len(result) == 20 - assert list(result.index.unique("int")) == [0, 2] + assert list(result.index.unique("hruff")) == [0, 2] mapping = {0: dict(bar="foo"), 2: dict(foo="bar"), 3: dict(foo="BAZ", bar="BAZ")} result = funcs.rebin(binned_df.copy(), axis=["int", 'cat'], mapping=mapping) @@ -43,9 +43,14 @@ def test_rebin(binned_df): assert set(result.index.unique("cat")) == {"bar", "BAZ", "foo"} -# def test_keep_bins(): -# #def 
keep_bins(df, axis, keep): -# pass +def test_keep_bins(binned_df): + result = funcs.keep_bins(binned_df.copy(), "int", keep=[0, 2]) + assert len(result) == 20 + + result = funcs.keep_bins(binned_df.copy(), "int", keep=binned_df.index.unique("int")) + assert len(result) == 40 + + # def test_keep_specific_bins(): # #def keep_specific_bins(df, axis, keep, expansions={}): From 05a44bbbe9a229230706079f6924d9ccac64a9b8 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Sun, 29 Mar 2020 20:38:20 +0200 Subject: [PATCH 06/15] Fix whitespace --- tests/postproc/test_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/postproc/test_functions.py b/tests/postproc/test_functions.py index a176ca6..d345c58 100644 --- a/tests/postproc/test_functions.py +++ b/tests/postproc/test_functions.py @@ -51,11 +51,11 @@ def test_keep_bins(binned_df): assert len(result) == 40 - # def test_keep_specific_bins(): # #def keep_specific_bins(df, axis, keep, expansions={}): # pass + def test_combine_cols_AND_split_dimension(binned_df): result = funcs.combine_cols(binned_df, {"a;b": "{a};{b}"}) assert len(result.columns) == 3 From 1ff9fa7d5eb791e8cccf43f2f69440ae2fcb3d68 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 10:13:51 +0200 Subject: [PATCH 07/15] Fix things for dataframes with only unweighted counts --- fast_plotter/__main__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fast_plotter/__main__.py b/fast_plotter/__main__.py index 3c9e625..c0e173a 100644 --- a/fast_plotter/__main__.py +++ b/fast_plotter/__main__.py @@ -104,12 +104,11 @@ def process_one_file(infile, args): for weight in weights: if args.weights and weight not in args.weights: continue + df_filtered = df.copy() if weight == "n": - df_filtered = df.filter(weight, axis="columns").copy() - df_filtered.rename({weight: "sumw"}, axis="columns", inplace=True) - df_filtered["sumw2"] = df_filtered.sumw + df_filtered["sumw"] = df_filtered.n + df_filtered["sumw2"] 
= df_filtered.n else: - df_filtered = df.copy() if "n" in df.columns: data_rows = mask_rows(df_filtered, regex=args.data, From 8f01420676f3a967425680df6f708220252f9bdc Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 10:28:16 +0200 Subject: [PATCH 08/15] Add option to change output file format --- fast_plotter/postproc/functions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index b942b19..89d8f28 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -376,7 +376,7 @@ def open_many(file_list, return_meta=True): return dfs -def write_out(df, meta, filename="tbl_{dims}--{name}.csv", out_dir=None): +def write_out(df, meta, filename="tbl_{dims}--{name}", out_dir=None, filetype="csv"): """ Write a dataframe to disk """ meta = meta.copy() @@ -387,5 +387,10 @@ def write_out(df, meta, filename="tbl_{dims}--{name}.csv", out_dir=None): complete_file = os.path.join(out_dir, complete_file) os.makedirs(os.path.dirname(complete_file), exist_ok=True) logger.info("Writing out file '%s'", complete_file) - df.to_csv(complete_file) + if not complete_file.endswith(filetype): + complete_file += "." 
+ filetype + if filetype == "csv": + df.to_csv(complete_file) + elif filetype == "hd5": + df.to_hdf(complete_file) return df From 5a8eaa79464366df2341813a9fdfe0805f20a00e Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 10:50:49 +0200 Subject: [PATCH 09/15] Add option to write out with hd5 --- fast_plotter/postproc/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index 89d8f28..c6bb0b3 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -392,5 +392,5 @@ def write_out(df, meta, filename="tbl_{dims}--{name}", out_dir=None, filetype="c if filetype == "csv": df.to_csv(complete_file) elif filetype == "hd5": - df.to_hdf(complete_file) + df.to_hdf(complete_file, key="df") return df From 71c31f372e7263a28a9576a2c99cdd2a929d4416 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 10:59:12 +0200 Subject: [PATCH 10/15] Add option to control the number of debugging lines to show --- fast_plotter/postproc/__main__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fast_plotter/postproc/__main__.py b/fast_plotter/postproc/__main__.py index 023352e..3248de8 100644 --- a/fast_plotter/postproc/__main__.py +++ b/fast_plotter/postproc/__main__.py @@ -13,6 +13,8 @@ def make_parser(): help="Print a dataframe after each step") parser.add_argument("--debug-dfs-query", default=None, help="Provide a query to select rows from the debugged dataframe") + parser.add_argument("--debug-num", default=5, + help="Number of rows to dump from debugging dataframe") parser.add_argument("-p", "--post-process", default=None, required=True, help="A yaml to configure the post-processing step") parser.add_argument("-o", "--outdir", default=".", @@ -57,7 +59,7 @@ def main(args=None): if args.debug_dfs: debug_df = dump_debug_df(dfs, args.debug_dfs_query) if debug_df is not None: - 
logger.debug(debug_df.head().to_string()) + logger.debug(debug_df.head(args.debug_num).to_string()) if __name__ == "__main__": From 4fd6435c08b036ea97223774b470432373c88031 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 11:00:29 +0200 Subject: [PATCH 11/15] Fix debug-num --> debug-rows command --- fast_plotter/postproc/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_plotter/postproc/__main__.py b/fast_plotter/postproc/__main__.py index 3248de8..ff1e33d 100644 --- a/fast_plotter/postproc/__main__.py +++ b/fast_plotter/postproc/__main__.py @@ -13,7 +13,7 @@ def make_parser(): help="Print a dataframe after each step") parser.add_argument("--debug-dfs-query", default=None, help="Provide a query to select rows from the debugged dataframe") - parser.add_argument("--debug-num", default=5, + parser.add_argument("--debug-rows", default=5, type=int, help="Number of rows to dump from debugging dataframe") parser.add_argument("-p", "--post-process", default=None, required=True, help="A yaml to configure the post-processing step") From 0ed39069b7ab653330b70e916d8092530d9a951b Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 31 Mar 2020 11:02:27 +0200 Subject: [PATCH 12/15] Fix up the debug rows option (again) --- fast_plotter/postproc/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_plotter/postproc/__main__.py b/fast_plotter/postproc/__main__.py index ff1e33d..41b0123 100644 --- a/fast_plotter/postproc/__main__.py +++ b/fast_plotter/postproc/__main__.py @@ -59,7 +59,7 @@ def main(args=None): if args.debug_dfs: debug_df = dump_debug_df(dfs, args.debug_dfs_query) if debug_df is not None: - logger.debug(debug_df.head(args.debug_num).to_string()) + logger.debug(debug_df.head(args.debug_rows).to_string()) if __name__ == "__main__": From bcc2204a52e6a3362195171c3056f79712f53c7c Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 7 Apr 2020 11:03:32 +0200 Subject: [PATCH 13/15] Extend split 
method to work with multiple split dimensions --- fast_plotter/postproc/functions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index c6bb0b3..9666b73 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -201,6 +201,10 @@ def split(df, axis, keep_split_dim, return_meta=True): split the dataframe into a list of dataframes using a given binning dimensions """ + if isinstance(axis, (list, tuple)): + axis = tuple(axis) + else: + axis = (axis, ) logger.info("Splitting on axis: '%s'", axis) out_dfs = [] groups = df.groupby(level=axis, group_keys=keep_split_dim) @@ -209,8 +213,9 @@ def split(df, axis, keep_split_dim, return_meta=True): group.index = group.index.droplevel(axis) result = group.copy() if return_meta: - meta = {"split_name": "%s_%s" % (axis, split_val), - axis: split_val} + meta = dict(zip(axis, split_val)) + split_name = "--".join(map("_".join, meta.items())) + meta["split_name"] = split_name result = (result, meta) out_dfs.append(result) return out_dfs From 3de2d824c8d31615d574d6898c5e4d2c31f6c24c Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 7 Apr 2020 11:06:28 +0200 Subject: [PATCH 14/15] Also convert splitval to tuple --- fast_plotter/postproc/functions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index 9666b73..4388a3c 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -201,14 +201,17 @@ def split(df, axis, keep_split_dim, return_meta=True): split the dataframe into a list of dataframes using a given binning dimensions """ - if isinstance(axis, (list, tuple)): - axis = tuple(axis) - else: - axis = (axis, ) + def to_tuple(obj): + if isinstance(obj, (list, tuple)): + return tuple(obj) + else: + return (obj, ) + axis = to_tuple(axis) logger.info("Splitting on 
axis: '%s'", axis) out_dfs = [] groups = df.groupby(level=axis, group_keys=keep_split_dim) for split_val, group in groups: + split_val = to_tuple(split_val) if not keep_split_dim: group.index = group.index.droplevel(axis) result = group.copy() From 2f966ad20e7c640f3cf6dda39ebf8973362846e4 Mon Sep 17 00:00:00 2001 From: Ben Krikler Date: Tue, 7 Apr 2020 11:27:27 +0200 Subject: [PATCH 15/15] Update CHANGELOG --- CHANGELOG.md | 2 +- fast_plotter/postproc/functions.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4838e9e..3ed6d6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.5.1] - 2020-03-30 +## [0.5.1] - 2020-04-07 ### Fixed - Bugs in post-processing modules, PR #29 [@benkrikler](github.com/benkrikler) diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py index 4388a3c..5592739 100644 --- a/fast_plotter/postproc/functions.py +++ b/fast_plotter/postproc/functions.py @@ -206,6 +206,7 @@ def to_tuple(obj): return tuple(obj) else: return (obj, ) + axis = to_tuple(axis) logger.info("Splitting on axis: '%s'", axis) out_dfs = []