From 39b63945dca6c0ddfd905ed1a0569e953401d6a6 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Tue, 12 May 2020 23:53:36 +0200
Subject: [PATCH 1/8] Add function to scale datasets using functionality from
 curator

---
 fast_plotter/postproc/scale_datasets.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 fast_plotter/postproc/scale_datasets.py

diff --git a/fast_plotter/postproc/scale_datasets.py b/fast_plotter/postproc/scale_datasets.py
new file mode 100644
index 0000000..3862bc2
--- /dev/null
+++ b/fast_plotter/postproc/scale_datasets.py
@@ -0,0 +1,20 @@
+import pandas as pd
+from fast_curator import read
+from .functions import multiply_dataframe
+
+
+def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset"):
+    if isinstance(curator_cfg, list):
+        dataset_cfg = curator_cfg
+    else:
+        dataset_cfg = read.from_yaml(curator_cfg)
+
+    scale = [1] * len(dataset_cfg)
+    for dataset in dataset_cfg:
+        for m in multiply_by:
+            scale *= getattr(dataset, m)
+        for d in divide_by:
+            scale /= getattr(dataset, d)
+
+    scale = pd.Series(scale, index=[d.name for d in dataset_cfg], name=dataset_col)
+    return multiply_dataframe(df, scale)
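The quantity built here is one multiplicative factor per dataset: the product of the `multiply_by` fields divided by the product of the `divide_by` fields (for example cross-section over generated events, the usual MC normalisation). Note that `scale` above is a plain Python list, so `scale *= getattr(...)` rescales the list object itself rather than one entry per dataset; the per-dataset accumulation the series converges on in PATCH 4/8 looks roughly like this minimal sketch, in which `cross_section` and `nevents` are hypothetical field names:

    # Minimal sketch of the intended per-dataset arithmetic; any attribute
    # carried by a fast-curator dataset entry can be named in multiply_by
    # or divide_by.
    def per_dataset_factor(dataset, multiply_by=(), divide_by=()):
        factor = 1.0
        for field in multiply_by:
            factor *= float(getattr(dataset, field))  # e.g. "cross_section"
        for field in divide_by:
            factor /= float(getattr(dataset, field))  # e.g. "nevents"
        return factor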
From 7db45083da26935ec81574dd82dcd78f8f79a585 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Wed, 13 May 2020 00:01:28 +0200
Subject: [PATCH 2/8] Set up stage for scaling datasets

---
 fast_plotter/postproc/functions.py      | 10 ++++++++++
 fast_plotter/postproc/scale_datasets.py |  8 +++++---
 fast_plotter/postproc/stages.py         |  5 +++++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py
index c132396..0f0708f 100644
--- a/fast_plotter/postproc/functions.py
+++ b/fast_plotter/postproc/functions.py
@@ -3,6 +3,7 @@
 import re
 import numpy as np
 import pandas as pd
+from .scale_datasets import prepare_datasets_scale_factor
 import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -412,6 +413,15 @@ def multiply_dataframe(df, multiply_df, use_column=None):
     return out
 
 
+def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc", use_column=None):
+    """
+    Pull fields from a fast-curator config for datasets, and use these to normalise inputs
+    """
+    scale = prepare_datasets_scale_factor(curator_cfg, multiply_by, divide_by, dataset_col, eventtype)
+    result = multiply_dataframe(df, scale, use_column=use_column)
+    return result
+
+
 def normalise_group(df, groupby_dimensions, apply_if=None, use_column=None):
     logger.info("Normalising within groups defined by: %s", str(groupby_dimensions))
     norm_to = 1 / df.groupby(level=groupby_dimensions).sum()
diff --git a/fast_plotter/postproc/scale_datasets.py b/fast_plotter/postproc/scale_datasets.py
index 3862bc2..01cb109 100644
--- a/fast_plotter/postproc/scale_datasets.py
+++ b/fast_plotter/postproc/scale_datasets.py
@@ -1,9 +1,8 @@
 import pandas as pd
 from fast_curator import read
-from .functions import multiply_dataframe
 
 
-def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset"):
+def prepare_datasets_scale_factor(curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc"):
     if isinstance(curator_cfg, list):
         dataset_cfg = curator_cfg
     else:
@@ -11,10 +10,13 @@ def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset"):
 
     scale = [1] * len(dataset_cfg)
     for dataset in dataset_cfg:
+        if eventtype and dataset.eventtype not in eventtype:
+            continue
+
         for m in multiply_by:
             scale *= getattr(dataset, m)
         for d in divide_by:
             scale /= getattr(dataset, d)
 
     scale = pd.Series(scale, index=[d.name for d in dataset_cfg], name=dataset_col)
-    return multiply_dataframe(df, scale)
+    return scale
diff --git a/fast_plotter/postproc/stages.py b/fast_plotter/postproc/stages.py
index d97bc7c..d886ac1 100644
--- a/fast_plotter/postproc/stages.py
+++ b/fast_plotter/postproc/stages.py
@@ -131,6 +131,11 @@ class MultiplyValues(BaseManipulator):
     func = "multiply_values"
 
 
+class ScaleDatasets(BaseManipulator):
+    cardinality = "one-to-one"
+    func = "scale_datasets"
+
+
 class NormaliseGroup(BaseManipulator):
     cardinality = "one-to-one"
     func = "normalise_group"

From 450e275b1217b257722e5925e1bfbac2a588dc9d Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Wed, 13 May 2020 00:01:36 +0200
Subject: [PATCH 3/8] Actually mention we now require curator and flow in
 setup.py

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 81eba23..2679c80 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,8 @@ def get_version():
     return _globals["__version__"]
 
 
-requirements = ['matplotlib', 'pandas', 'numpy', 'scipy']
+requirements = ['matplotlib', 'pandas', 'numpy', 'scipy',
+                'fast-curator', 'fast-flow']
 
 setup_requirements = ['pytest-runner', ]
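With the stage registered and the new requirements declared, the function behind it can also be driven directly. A hedged sketch of such a call, assuming a fast-curator file `datasets.yml` whose entries carry `cross_section` and `nevents` fields (both names, and the file name, are illustrative):

    from fast_plotter.postproc import functions

    # df: a binned dataframe whose index carries the dataset names
    scaled = functions.scale_datasets(
        df,
        "datasets.yml",
        multiply_by=["cross_section"],
        divide_by=["nevents"],
        eventtype="mc",  # only entries whose eventtype matches are rescaled
    )

The `eventtype` guard is a substring test (`dataset.eventtype not in eventtype`), so real-data datasets fall through and keep their original normalisation.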
From b379e37793aaa91294dba0242acb92d8420023f5 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Thu, 14 May 2020 20:28:56 +0100
Subject: [PATCH 4/8] Use curator configs in postproc

---
 fast_plotter/postproc/__init__.py       |  1 +
 fast_plotter/postproc/functions.py      | 25 ++++++++---
 fast_plotter/postproc/scale_datasets.py | 58 ++++++++++++++++++++-----
 fast_plotter/postproc/stages.py         |  5 +++
 4 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/fast_plotter/postproc/__init__.py b/fast_plotter/postproc/__init__.py
index e69de29..339cbbd 100644
--- a/fast_plotter/postproc/__init__.py
+++ b/fast_plotter/postproc/__init__.py
@@ -0,0 +1 @@
+from .functions import open_many
diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py
index 0f0708f..c35af20 100644
--- a/fast_plotter/postproc/functions.py
+++ b/fast_plotter/postproc/functions.py
@@ -3,7 +3,7 @@
 import re
 import numpy as np
 import pandas as pd
-from .scale_datasets import prepare_datasets_scale_factor
+from .scale_datasets import prepare_datasets_scale_factor, make_dataset_map
 import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -103,6 +103,18 @@ def explode(mapping, expect_depth, prefix="", depth=0):
     return out_df
 
 
+def rebin_by_curator_cfg(df, curator_cfg, map_from="name", map_to="eventtype",
+                         column_from="dataset", column_to=None,
+                         default_from=None, default_to=None, error_all_missing=True):
+    mapping = make_dataset_map(curator_cfg,
+                               map_from=map_from, map_to=map_to,
+                               default_from=default_from,
+                               default_to=default_to,
+                               error_all_missing=error_all_missing)
+    df = rebin(df, axis=column_from, mapping=mapping, rename=column_to)
+    return df
+
+
 def split_dimension(df, axis, delimeter=";"):
     """
     Split up a binning dimensions
@@ -401,24 +413,25 @@ def multiply_values(df, constant=0, mapping={}, weight_by_dataframes=[], apply_if=None):
     return df
 
 
-def multiply_dataframe(df, multiply_df, use_column=None):
+def multiply_dataframe(df, multiply_df, use_column=None, level=None):
     if isinstance(multiply_df, six.string_types):
         multiply_df = open_many([multiply_df], return_meta=False)[0]
     if use_column is not None:
         multiply_df = multiply_df[use_column]
     if isinstance(multiply_df, pd.Series):
-        out = df.mul(multiply_df, axis=0)
+        out = df.mul(multiply_df, axis=0, level=level)
     else:
-        out = df * multiply_df
+        out = df.mul(multiply_df, level=level)
     return out
 
 
-def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc", use_column=None):
+def scale_datasets(df, curator_cfg, multiply_by=[], divide_by=[],
+                   dataset_col="dataset", eventtype="mc", use_column=None):
     """
     Pull fields from a fast-curator config for datasets, and use these to normalise inputs
     """
     scale = prepare_datasets_scale_factor(curator_cfg, multiply_by, divide_by, dataset_col, eventtype)
-    result = multiply_dataframe(df, scale, use_column=use_column)
+    result = multiply_dataframe(df, scale, use_column=use_column, level=dataset_col)
     return result
 
 
diff --git a/fast_plotter/postproc/scale_datasets.py b/fast_plotter/postproc/scale_datasets.py
index 01cb109..495321d 100644
--- a/fast_plotter/postproc/scale_datasets.py
+++ b/fast_plotter/postproc/scale_datasets.py
@@ -2,21 +2,59 @@
 from fast_curator import read
 
 
-def prepare_datasets_scale_factor(curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc"):
-    if isinstance(curator_cfg, list):
-        dataset_cfg = curator_cfg
-    else:
-        dataset_cfg = read.from_yaml(curator_cfg)
+def _get_cfg(cfg):
+    if isinstance(cfg, list):
+        return cfg
+    return read.from_yaml(cfg)
 
-    scale = [1] * len(dataset_cfg)
+
+def prepare_datasets_scale_factor(curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc"):
+    dataset_cfg = _get_cfg(curator_cfg)
+
+    sfs = {}
     for dataset in dataset_cfg:
         if eventtype and dataset.eventtype not in eventtype:
+            sfs[dataset.name] = 1
             continue
 
+        scale = 1
         for m in multiply_by:
-            scale *= getattr(dataset, m)
+            scale *= float(getattr(dataset, m))
         for d in divide_by:
-            scale /= getattr(dataset, d)
+            scale /= float(getattr(dataset, d))
+        sfs[dataset.name] = scale
+
+    sfs = pd.Series(sfs, name=dataset_col)
+    return sfs
+
+
+def make_dataset_map(curator_cfg, map_from="name", map_to="eventtype",
+                     default_from=None, default_to=None, error_all_missing=True):
+    dataset_cfg = _get_cfg(curator_cfg)
+
+    mapping = {}
+    missing_from = 0
+    missing_to = 0
+    for dataset in dataset_cfg:
+        if hasattr(dataset, map_from):
+            key = getattr(dataset, map_from)
+        else:
+            key = default_from
+            missing_from += 1
+
+        if hasattr(dataset, map_to):
+            value = getattr(dataset, map_to)
+        else:
+            value = default_to
+            missing_to += 1
+
+        mapping[key] = value
+
+    if missing_from == len(dataset_cfg) and error_all_missing:
+        msg = "None of the datasets contain the 'from' field, '%s'"
+        raise RuntimeError(msg % map_from)
+
+    if missing_to == len(dataset_cfg) and error_all_missing:
+        msg = "None of the datasets contain the 'to' field, '%s'"
+        raise RuntimeError(msg % map_to)
 
-    scale = pd.Series(scale, index=[d.name for d in dataset_cfg], name=dataset_col)
-    return scale
+    return mapping
diff --git a/fast_plotter/postproc/stages.py b/fast_plotter/postproc/stages.py
index d886ac1..612896e 100644
--- a/fast_plotter/postproc/stages.py
+++ b/fast_plotter/postproc/stages.py
@@ -136,6 +136,11 @@ class ScaleDatasets(BaseManipulator):
     func = "scale_datasets"
 
 
+class RebinByCuratorCfg(BaseManipulator):
+    cardinality = "one-to-one"
+    func = "rebin_by_curator_cfg"
+
+
 class NormaliseGroup(BaseManipulator):
     cardinality = "one-to-one"
     func = "normalise_group"
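The `level=` plumbing added to `multiply_dataframe` is what lets the per-dataset Series from `prepare_datasets_scale_factor` broadcast across every bin of a multi-indexed dataframe. A self-contained illustration of the underlying pandas behaviour, with hypothetical dataset names:

    import pandas as pd

    df = pd.DataFrame(
        {"sumw": [10.0, 20.0, 30.0]},
        index=pd.MultiIndex.from_tuples(
            [("ttbar", 0), ("ttbar", 1), ("wjets", 0)],
            names=["dataset", "bin"]),
    )
    scale = pd.Series({"ttbar": 0.5, "wjets": 2.0})

    # Broadcast each factor across all bins of its dataset
    out = df.mul(scale, axis=0, level="dataset")
    # ttbar bins become 5.0 and 10.0; the wjets bin becomes 60.0

Index values absent from the Series would come out as NaN, which is presumably why `prepare_datasets_scale_factor` records an explicit factor of 1 for skipped (non-MC) datasets rather than omitting them.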
From 91d7f26241effb8fb602b55157b7825e078c259c Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Thu, 14 May 2020 20:33:05 +0100
Subject: [PATCH 5/8] More appropriate curator-interacting module name

---
 fast_plotter/postproc/functions.py                            | 2 +-
 fast_plotter/postproc/{scale_datasets.py => query_curator.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename fast_plotter/postproc/{scale_datasets.py => query_curator.py} (100%)

diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py
index c35af20..8bae4cb 100644
--- a/fast_plotter/postproc/functions.py
+++ b/fast_plotter/postproc/functions.py
@@ -3,7 +3,7 @@
 import re
 import numpy as np
 import pandas as pd
-from .scale_datasets import prepare_datasets_scale_factor, make_dataset_map
+from .query_curator import prepare_datasets_scale_factor, make_dataset_map
 import logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
diff --git a/fast_plotter/postproc/scale_datasets.py b/fast_plotter/postproc/query_curator.py
similarity index 100%
rename from fast_plotter/postproc/scale_datasets.py
rename to fast_plotter/postproc/query_curator.py

From 0869b8f0b982d9a8ca43c19d171140b0bc3765e1 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Thu, 14 May 2020 20:34:49 +0100
Subject: [PATCH 6/8] Fix pep8

---
 fast_plotter/postproc/__init__.py      | 3 +++
 fast_plotter/postproc/functions.py     | 2 +-
 fast_plotter/postproc/query_curator.py | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fast_plotter/postproc/__init__.py b/fast_plotter/postproc/__init__.py
index 339cbbd..052d6ed 100644
--- a/fast_plotter/postproc/__init__.py
+++ b/fast_plotter/postproc/__init__.py
@@ -1 +1,4 @@
 from .functions import open_many
+
+
+__all__ = ["open_many"]
diff --git a/fast_plotter/postproc/functions.py b/fast_plotter/postproc/functions.py
index 8bae4cb..d2b8992 100644
--- a/fast_plotter/postproc/functions.py
+++ b/fast_plotter/postproc/functions.py
@@ -104,7 +104,7 @@ def explode(mapping, expect_depth, prefix="", depth=0):
 
 def rebin_by_curator_cfg(df, curator_cfg, map_from="name", map_to="eventtype",
-                         column_from="dataset", column_to=None, 
+                         column_from="dataset", column_to=None,
                          default_from=None, default_to=None, error_all_missing=True):
     mapping = make_dataset_map(curator_cfg,
                                map_from=map_from, map_to=map_to,
diff --git a/fast_plotter/postproc/query_curator.py b/fast_plotter/postproc/query_curator.py
index 495321d..ba0bdb0 100644
--- a/fast_plotter/postproc/query_curator.py
+++ b/fast_plotter/postproc/query_curator.py
@@ -17,7 +17,7 @@ def prepare_datasets_scale_factor(curator_cfg, multiply_by=[], divide_by=[], dataset_col="dataset", eventtype="mc"):
             sfs[dataset.name] = 1
             continue
 
-        scale = 1 
+        scale = 1
         for m in multiply_by:
             scale *= float(getattr(dataset, m))
         for d in divide_by:
             scale /= float(getattr(dataset, d))

From fd3f8b057aa56645a177362168580b56c3346295 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Thu, 14 May 2020 20:37:15 +0100
Subject: [PATCH 7/8] Bump version: 0.6.5 → 0.7.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fast_plotter/version.py | 2 +-
 setup.cfg               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fast_plotter/version.py b/fast_plotter/version.py
index 148d646..bf21ade 100644
--- a/fast_plotter/version.py
+++ b/fast_plotter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.6.5'
+__version__ = '0.7.0'
 version_info = split_version(__version__)  # noqa
diff --git a/setup.cfg b/setup.cfg
index c65bee9..9ffd9b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.5
+current_version = 0.7.0
 commit = True
 tag = False
From 1f7a04593633420011bcf26e04d2fbb9c0892b64 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Thu, 14 May 2020 20:38:13 +0100
Subject: [PATCH 8/8] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c425c32..f116f51 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.7.0] - 2020-05-14
+### Added
+- Post-processing functions that interact with fast-curator configs to scale or rebin datasets, PR #36 [@benkrikler](https://github.com/benkrikler)
+
 ## [0.6.5] - 2020-05-12
 ### Added
 - Implement the multiply_values with a mapping, PR #35 [@benkrikler](https://github.com/benkrikler)
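As a closing illustration, the mapping helper behind `rebin_by_curator_cfg` can be exercised on plain records, since `_get_cfg` passes a list through untouched. The records below are hypothetical stand-ins for fast-curator entries:

    from collections import namedtuple

    from fast_plotter.postproc.query_curator import make_dataset_map

    Dataset = namedtuple("Dataset", "name eventtype")
    cfg = [
        Dataset("ttbar", "mc"),
        Dataset("wjets", "mc"),
        Dataset("single_muon", "data"),
    ]

    mapping = make_dataset_map(cfg, map_from="name", map_to="eventtype")
    # {'ttbar': 'mc', 'wjets': 'mc', 'single_muon': 'data'}

Passed through `rebin_by_curator_cfg`, such a mapping relabels the dataset axis by eventtype, letting all MC samples be treated as a single group downstream.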