Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: drop use of awkward in yield uncertainty calculation #408

Merged
merged 19 commits into from
Jul 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 62 additions & 45 deletions src/cabinetry/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
import logging
from typing import Any, DefaultDict, Dict, List, NamedTuple, Optional, Tuple, Union

try:
# use awkward v2
import awkward._v2 as ak
except ModuleNotFoundError: # pragma: no cover
# fallback if the _v2 submodule disappears after the full v2 release
import awkward as ak # pragma: no cover
import numpy as np
import pyhf

Expand Down Expand Up @@ -300,16 +294,26 @@ def yield_stdev(
up_comb = np.vstack((up_comb, np.sum(up_comb, axis=0)))
# turn into list of channels (keep all samples, select correct bins per channel)
# indices: channel, sample, bin
up_yields = [
up_yields_per_channel = [
up_comb[:, model.config.channel_slices[ch]] for ch in model.config.channels
]
# append list of yields summed per channel
up_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in up_yields
]
# indices: variation, channel, sample, bin
up_variations.append(up_yields)
# calculate list of yields summed per channel
up_yields_channel_sum = np.stack(
[
np.sum(chan_yields, axis=-1, keepdims=True)
for chan_yields in up_yields_per_channel
]
)
# reshape to drop bin axis, transpose to turn channel axis into new bin axis
# (channel, sample, bin) -> (sample, bin) where "bin" becomes channel sums
up_yields_channel_sum = up_yields_channel_sum.reshape(
up_yields_channel_sum.shape[:-1]
).T

# concatenate per-channel sums to up_comb (along bin axis)
up_yields = np.concatenate((up_comb, up_yields_channel_sum), axis=1)
# indices: variation, sample, bin
up_variations.append(up_yields.tolist())

# model distribution per sample with this parameter varied down
down_comb = pyhf.tensorlib.to_numpy(
Expand All @@ -318,68 +322,81 @@ def yield_stdev(
# add total prediction (sum over samples)
down_comb = np.vstack((down_comb, np.sum(down_comb, axis=0)))
# turn into list of channels
down_yields = [
down_yields_per_channel = [
down_comb[:, model.config.channel_slices[ch]]
for ch in model.config.channels
]
# append list of yields summed per channel
down_yields += [
np.asarray(np.sum(chan_yields, axis=-1, keepdims=True))
for chan_yields in down_yields
]

# calculate list of yields summed per channel
down_yields_channel_sum = np.stack(
[
np.sum(chan_yields, axis=-1, keepdims=True)
for chan_yields in down_yields_per_channel
]
)
down_yields_channel_sum = down_yields_channel_sum.reshape(
down_yields_channel_sum.shape[:-1]
).T

down_yields = np.concatenate((down_comb, down_yields_channel_sum), axis=1)
down_variations.append(down_yields)

# convert to awkward arrays for further processing
up_variations_ak = ak.from_iter(up_variations)
down_variations_ak = ak.from_iter(down_variations)
# convert to numpy arrays for further processing
up_variations_np = np.asarray(up_variations)
down_variations_np = np.asarray(down_variations)

# calculate symmetric uncertainties for all components
# indices: variation, channel (last entries sums), sample (last entry sum), bin
sym_uncs = (up_variations_ak - down_variations_ak) / 2
sym_uncs = (up_variations_np - down_variations_np) / 2

# calculate total variance, indexed by channel, sample, bin (per-channel numbers act
# like additional channels with one bin each)
# calculate total variance, indexed by sample, bin (per-channel numbers act like
# additional channels with one bin each)
if np.count_nonzero(corr_mat - np.diagflat(np.ones_like(parameters))) == 0:
# no off-diagonal contributions from correlation matrix (e.g. pre-fit)
total_variance = ak.sum(np.power(sym_uncs, 2), axis=0)
total_variance = np.sum(np.power(sym_uncs, 2), axis=0)
else:
# full calculation including off-diagonal contributions
# with v as vector of variations (each element contains yields under variation)
# and M as correlation matrix, calculate variance as follows:
# variance = sum_i sum_j v[i] * M[i, j] * v[j]
# where the product between elements of v again is elementwise (multiplying bin
# yields), and the final variance shape is the same as element of v (yield
# uncertainties per bin, sample, channel)
# uncertainties per sample, bin)

# possible optimizations that could be considered here:
# - skipping staterror-staterror terms for per-bin calculation (orthogonal)
# - taking advantage of correlation matrix symmetry
# - (optional) skipping combinations with correlations below threshold

# calculate M[i, j] * v[j] first
# indices: pars (i), pars (j), channel, sample, bin
m_times_v = (
corr_mat[..., np.newaxis, np.newaxis, np.newaxis]
* sym_uncs[np.newaxis, ...]
)
# now multiply by v[i] as well, indices: pars(i), pars(j), channel, sample, bin
# indices: pars (i), pars (j), sample, bin
m_times_v = corr_mat[..., np.newaxis, np.newaxis] * sym_uncs[np.newaxis, ...]
# now multiply by v[i] as well, indices: pars(i), pars(j), sample, bin
v_times_m_times_v = sym_uncs[:, np.newaxis, ...] * m_times_v
# finally perform sums over i and j, remaining indices: channel, sample, bin
total_variance = ak.sum(ak.sum(v_times_m_times_v, axis=1), axis=0)
# finally perform sums over i and j, remaining indices: sample, bin
total_variance = np.sum(np.sum(v_times_m_times_v, axis=1), axis=0)

# get number of bins from model config
n_bins = sum(model.config.channel_nbins.values())

# convert to standard deviations per bin and per channel
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
n_channels = len(model.config.channels)
# add back outer channel dimension for per-bin uncertainty
# indices: (channel, sample, bin)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
total_stdev_per_bin = np.sqrt(total_variance[:n_channels])
total_stdev_per_bin = [
np.sqrt(total_variance[:, :n_bins][:, model.config.channel_slices[ch]]).tolist()
for ch in model.config.channels
]
# per-channel: transpose to flip remaining dimensions (channel sums acted as
# individual bins before)
# indices: (channel, sample)
total_stdev_per_channel = ak.sum(np.sqrt(total_variance[n_channels:]), axis=-1)
# log total stdev per bin / channel (-1 index for sample sum)
log.debug(f"total stdev is {total_stdev_per_bin[:, -1, :]}")
log.debug(f"total stdev per channel is {total_stdev_per_channel[:, -1]}")
total_stdev_per_channel = np.sqrt(total_variance[:, n_bins:].T).tolist()

# convert to lists
total_stdev_per_bin = ak.to_list(total_stdev_per_bin)
total_stdev_per_channel = ak.to_list(total_stdev_per_channel)
# log total stdev per bin / channel (-1 index for sample sum)
ekauffma marked this conversation as resolved.
Show resolved Hide resolved
n_channels = len(model.config.channels)
total_stdev_bin = [total_stdev_per_bin[i][-1] for i in range(n_channels)]
log.debug(f"total stdev is {total_stdev_bin}")
total_stdev_chan = [total_stdev_per_channel[i][-1] for i in range(n_channels)]
log.debug(f"total stdev per channel is {total_stdev_chan}")

# save to cache
_YIELD_STDEV_CACHE.update(
Expand Down