Skip to content

Commit

Permalink
Merge pull request #29 from FAST-HEP/BK_add_postproc_module
Browse files Browse the repository at this point in the history
Fix bugs in post-processing module
  • Loading branch information
benkrikler authored Apr 7, 2020
2 parents 66c6f4f + 2f966ad commit f518ea1
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 22 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.5.0] - 2020-03-30
## [0.5.1] - 2020-04-07
### Fixed
- Bugs in post-processing modules, PR #29 [@benkrikler](https://github.com/benkrikler)

## [0.5.0] - 2020-03-29
### Added
- New post-processing command to reshape outputs of fast-carpenter
- New post-processing command to reshape outputs of fast-carpenter from PR #28 [@benkrikler](https://github.com/benkrikler)

## [0.4.0] - 2020-02-26
- Many changes from PR #26 [@benkrikler](https://github.com/benkrikler)
Expand Down
7 changes: 3 additions & 4 deletions fast_plotter/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,11 @@ def process_one_file(infile, args):
for weight in weights:
if args.weights and weight not in args.weights:
continue
df_filtered = df.copy()
if weight == "n":
df_filtered = df.filter(weight, axis="columns").copy()
df_filtered.rename({weight: "sumw"}, axis="columns", inplace=True)
df_filtered["sumw2"] = df_filtered.sumw
df_filtered["sumw"] = df_filtered.n
df_filtered["sumw2"] = df_filtered.n
else:
df_filtered = df.copy()
if "n" in df.columns:
data_rows = mask_rows(df_filtered,
regex=args.data,
Expand Down
4 changes: 3 additions & 1 deletion fast_plotter/postproc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def make_parser():
help="Print a dataframe after each step")
parser.add_argument("--debug-dfs-query", default=None,
help="Provide a query to select rows from the debugged dataframe")
parser.add_argument("--debug-rows", default=5, type=int,
help="Number of rows to dump from debugging dataframe")
parser.add_argument("-p", "--post-process", default=None, required=True,
help="A yaml to configure the post-processing step")
parser.add_argument("-o", "--outdir", default=".",
Expand Down Expand Up @@ -57,7 +59,7 @@ def main(args=None):
if args.debug_dfs:
debug_df = dump_debug_df(dfs, args.debug_dfs_query)
if debug_df is not None:
logger.debug(debug_df.head().to_string())
logger.debug(debug_df.head(args.debug_rows).to_string())


if __name__ == "__main__":
Expand Down
26 changes: 21 additions & 5 deletions fast_plotter/postproc/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ def split_dimension(df, axis, delimeter=";"):
def keep_bins(df, axis, keep):
    """Restrict a single index dimension to the given bins, discarding all others."""
    # Collect the index values on this dimension that are not requested
    to_drop = set()
    for value in df.index.unique(axis):
        if value not in keep:
            to_drop.add(value)
    if not to_drop:
        # Everything is kept: hand back the original frame untouched
        return df
    logger.info("Dropping values for '%s': %s", axis, str(to_drop))
    return df.drop(to_drop, level=axis, axis="index")
Expand Down Expand Up @@ -190,7 +192,7 @@ def rename_dim(df, mapping):
"""
Rename one or more dimensions
"""
df.index.names = [mapping.get(n, n) for n in df.df.index.names]
df.index.names = [mapping.get(n, n) for n in df.index.names]
return df


Expand All @@ -199,16 +201,25 @@ def split(df, axis, keep_split_dim, return_meta=True):
split the dataframe into a list of dataframes using a given binning
dimensions
"""
def to_tuple(obj):
if isinstance(obj, (list, tuple)):
return tuple(obj)
else:
return (obj, )

axis = to_tuple(axis)
logger.info("Splitting on axis: '%s'", axis)
out_dfs = []
groups = df.groupby(level=axis, group_keys=keep_split_dim)
for split_val, group in groups:
split_val = to_tuple(split_val)
if not keep_split_dim:
group.index = group.index.droplevel(axis)
result = group.copy()
if return_meta:
meta = {"split_name": "%s_%s" % (axis, split_val),
axis: split_val}
meta = dict(zip(axis, split_val))
split_name = "--".join(map("_".join, meta.items()))
meta["split_name"] = split_name
result = (result, meta)
out_dfs.append(result)
return out_dfs
Expand Down Expand Up @@ -374,7 +385,7 @@ def open_many(file_list, return_meta=True):
return dfs


def write_out(df, meta, filename="tbl_{dims}--{name}.csv", out_dir=None):
def write_out(df, meta, filename="tbl_{dims}--{name}", out_dir=None, filetype="csv"):
""" Write a dataframe to disk
"""
meta = meta.copy()
Expand All @@ -385,5 +396,10 @@ def write_out(df, meta, filename="tbl_{dims}--{name}.csv", out_dir=None):
complete_file = os.path.join(out_dir, complete_file)
os.makedirs(os.path.dirname(complete_file), exist_ok=True)
logger.info("Writing out file '%s'", complete_file)
df.to_csv(complete_file)
if not complete_file.endswith(filetype):
complete_file += "." + filetype
if filetype == "csv":
df.to_csv(complete_file)
elif filetype == "hd5":
df.to_hdf(complete_file, key="df")
return df
2 changes: 1 addition & 1 deletion fast_plotter/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ def split_version(version):
return tuple(result)


__version__ = '0.5.0'
__version__ = '0.5.1'
version_info = split_version(__version__) # noqa
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.0
current_version = 0.5.1
commit = True
tag = False

Expand Down
23 changes: 15 additions & 8 deletions tests/postproc/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,29 @@ def test_query(binned_df):


def test_rebin(binned_df):
result = funcs.rebin(binned_df.copy(), axis="int", mapping=dict(zip(range(4), [0, 2] * 2)))
result = funcs.rebin(binned_df.copy(), rename="hruff", axis="int", mapping=dict(zip(range(4), [0, 2] * 2)))
assert len(result) == 20
assert list(result.index.unique("int")) == [0, 2]
assert list(result.index.unique("hruff")) == [0, 2]

mapping = {0: dict(bar="foo"), 2: dict(foo="bar"), 3: dict(foo="BAZ", bar="BAZ")}
result = funcs.rebin(binned_df.copy(), axis=["int", 'cat'], mapping=mapping)
assert len(result) == 25
assert set(result.index.unique("cat")) == {"bar", "BAZ", "foo"}


# def test_keep_bins():
# #def keep_bins(df, axis, keep):
# pass
def test_keep_bins(binned_df):
    # Keeping only bins 0 and 2 on the "int" dimension halves the fixture
    subset = funcs.keep_bins(binned_df.copy(), "int", keep=[0, 2])
    assert len(subset) == 20

    # Keeping every existing bin must be a no-op on the row count
    untouched = funcs.keep_bins(binned_df.copy(), "int", keep=binned_df.index.unique("int"))
    assert len(untouched) == 40


# def test_keep_specific_bins():
# #def keep_specific_bins(df, axis, keep, expansions={}):
# pass


def test_combine_cols_AND_split_dimension(binned_df):
result = funcs.combine_cols(binned_df, {"a;b": "{a};{b}"})
assert len(result.columns) == 3
Expand All @@ -70,9 +75,11 @@ def test_combine_cols_AND_split_dimension(binned_df):
# #def rename_cols(df, mapping):
# pass

# def test_rename_dim():
# #def rename_dim(df, mapping):
# pass

def test_rename_dim(binned_df):
    # Rename two of the three index dimensions; the third keeps its name
    mapping = {"int": "integers", "cat": "CATEGORICALS"}
    renamed = funcs.rename_dim(binned_df, mapping)
    assert renamed.index.names == ["integers", "CATEGORICALS", "interval"]


# def test_split():
# #def split(df, axis, keep_split_dim, return_meta=True):
Expand Down

0 comments on commit f518ea1

Please sign in to comment.