Merge pull request #34 from FAST-HEP/BK_small_improvements

Several small improvements
FAST-HEP · May 7, 2020 · 11ce33b · 11ce33b
2 parents 21e45e5 + 98fda78
commit 11ce33b
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.4] - 2020-05-07
+### Added
+- New postprocessing stage to filter columns, PR #34 [@benkrikler](https://github.com/benkrikler)
+- New `as_index` option when combining columns (`combine_cols`) to append the listed columns to the index, PR #34
+
+### Fixed
+- ToDatacardInputs had been broken in a previous update, PR #34
+
 ## [0.6.3] - 2020-04-29
 ### Added
 - Add GenericPandas and UnstackWeights stages, PR #33 [@benkrikler](https://github.com/benkrikler)

diff --git a/fast_plotter/__main__.py b/fast_plotter/__main__.py
@@ -6,6 +6,7 @@
 import logging
 import matplotlib
 matplotlib.use('Agg')
+matplotlib.rcParams.update({'figure.autolayout': True})
 from .version import __version__ # noqa
 from .utils import read_binned_df, weighting_vars # noqa
 from .utils import decipher_filename, mask_rows  # noqa
@@ -134,8 +135,9 @@ def process_one_file(infile, args):
     return ran_ok
 
 
-def dress_main_plots(plots, annotations=[], yscale=None, ylabel=None, legend={}, limits={}, **kwargs):
-    for main_ax, _ in plots.values():
+def dress_main_plots(plots, annotations=[], yscale=None, ylabel=None, legend={},
+                     limits={}, xtickrotation=None, **kwargs):
+    for main_ax, summary_ax in plots.values():
         add_annotations(annotations, main_ax)
         if yscale:
             main_ax.set_yscale(yscale)
@@ -151,6 +153,8 @@ def dress_main_plots(plots, annotations=[], yscale=None, ylabel=None, legend={},
                     getattr(main_ax, "set_%slim" % axis)(*lims)
             elif lims.endswith("%"):
                 main_ax.margins(**{axis: float(lims[:-1])})
+        if xtickrotation:
+            matplotlib.pyplot.xticks(rotation=xtickrotation)
 
 
 def save_plots(infile, weight, plots, outdir, extensions):

diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py
@@ -152,7 +152,31 @@ def keep_specific_bins(df, axis, keep, expansions={}):
     return out_df
 
 
-def combine_cols(df, format_strings):
+def filter_cols(df, items=None, like=None, regex=None, drop_not_keep=False):
+    """Select columns by name; optionally drop the matches instead of keeping them.
+
+    Parameters:
+      items (list-like): A list of column names to filter with
+      like (str, list[string]): A string or list of strings; a column is selected if
+            any of them is found in its name (original column order is preserved)
+      regex (str): A regular expression to match column names to
+      drop_not_keep (bool): Inverts the selection if true so that matched columns are dropped
+    """
+    if not like or not isinstance(like, (tuple, list)):
+        df_filtered = df.filter(items=items, like=like, regex=regex)
+    else:
+        if items:
+            raise RuntimeError("Can only use one of 'items', 'like', or 'regex'")
+        matched = [col for col in df.columns if any(i in col for i in like)]
+        df_filtered = df.filter(items=matched, regex=regex)
+
+    if drop_not_keep:
+        # drop() removes row labels by default, so target the columns explicitly
+        return df.drop(columns=df_filtered.columns)
+    return df_filtered
+
+
+def combine_cols(df, format_strings, as_index=[]):
     """Combine columns together using format strings"""
     logger.info("Combining columns based on: %s", str(format_strings))
     result_names = list(format_strings.keys())
@@ -166,6 +190,8 @@ def apply_fmt(row):
     results.columns = result_names
     new_df = new_df.assign(**results)
     new_df.set_index(index, inplace=True, drop=True)
+    if as_index:
+        new_df.set_index(as_index, inplace=True, append=True)
     return new_df
 
 
@@ -288,6 +314,8 @@ def to_datacard_inputs(df, select_data, rename_syst_vars=False):
     df["content"] = df.n
     df["content"][~data_mask] = df.sumw
     df["error"] = df.content / np.sqrt(df.n)
+    df.drop(["n", "sumw", "sumw2"], inplace=True, axis="columns")
+    return df
 
 
 def generic_pandas(df, func, *args, **kwargs):

diff --git a/fast_plotter/postproc/stages.py b/fast_plotter/postproc/stages.py
@@ -111,6 +111,11 @@ class AssignCol(BaseManipulator):
     func = "assign_col"
 
 
+class FilterCols(BaseManipulator):  # postprocessing stage that filters columns
+    cardinality = "one-to-one"
+    func = "filter_cols"  # presumably resolved by BaseManipulator to functions.filter_cols -- confirm
+
+
 class AssignDim(BaseManipulator):
     cardinality = "one-to-one"
     func = "assign_dim"

diff --git a/fast_plotter/version.py b/fast_plotter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.6.3'
+__version__ = '0.6.4'
 version_info = split_version(__version__) # noqa
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.3
+current_version = 0.6.4
 commit = True
 tag = False
 

diff --git a/tests/postproc/test_functions.py b/tests/postproc/test_functions.py
@@ -91,6 +91,30 @@ def test_split(binned_df):
     assert all([r[0].index.nlevels == 3 for r in results])
 
 
+def test_filter_cols(binned_df):  # NOTE(review): drop_not_keep=True is not exercised here
+    df = binned_df.index.to_frame()  # turn the index levels into columns to filter on
+
+    result = funcs.filter_cols(df, items=["int"])  # exact name, single column
+    assert len(result.columns) == 1
+    assert result.columns[0] == "int"
+
+    result = funcs.filter_cols(df, items=["int", "cat"])  # exact names, several columns
+    assert len(result.columns) == 2
+    assert set(result.columns) == set(("int", "cat"))
+
+    result = funcs.filter_cols(df, like="int")  # substring match, so "interval" is kept too
+    assert len(result.columns) == 2
+    assert set(result.columns) == set(("int", "interval"))
+
+    result = funcs.filter_cols(df, like=["int", "cat"])  # list of substrings: union of the matches
+    assert len(result.columns) == 3
+    assert set(result.columns) == set(("int", "cat", "interval"))
+
+    result = funcs.filter_cols(df, regex="^int.*")  # regex anchored at the start of the name
+    assert len(result.columns) == 2
+    assert set(result.columns) == set(("int", "interval"))
+
+
 # def test_reorder_dimensions():
 #     #def reorder_dimensions(df, order):
 #     pass