diff --git a/docs/source/dataset_tools.rst b/docs/source/dataset_tools.rst new file mode 100644 index 000000000..15ef21295 --- /dev/null +++ b/docs/source/dataset_tools.rst @@ -0,0 +1,13 @@ +.. _dataset-tools: + +Dataset Tools +************* + +This page contains documentation for parts of the ``coffea.dataset_tools`` +package that are not included in the ``coffea`` namespace. That is, they +must be explicitly imported. + +.. automodule:: coffea.dataset_tools.dataset_query + :members: +.. automodule:: coffea.dataset_tools.rucio_utils + :members: diff --git a/docs/source/reference.rst b/docs/source/reference.rst index e5b5a9a17..155f45c64 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -9,6 +9,11 @@ When executing a subset of the full coffea package is imported into the python environment. Some packages must be imported explicitly, so as to avoid importing unnecessary and/or heavy dependencies. Below lists the packages available in the ``coffea`` namespace. +Under that, we list documentation for some of the coffea packages that need to be +imported explicitly. + +In coffea Namespace +----------------------- .. autosummary:: :toctree: modules @@ -28,3 +33,10 @@ and/or heavy dependencies. Below lists the packages available in the ``coffea`` coffea.nanoevents.methods.vector coffea.processor coffea.util + +Not in coffea Namespace +--------------------------- +Here is documentation for some of the packages that are not automatically +imported on a call to ``import coffea``. + +* :ref:`dataset-tools`. diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index a256e535f..c0862889e 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -22,6 +22,24 @@ class WeightStatistics: + """ + Container for statistics about the weight, including the sum of squared weights + and number of entries. + + Parameters + ---------- + sumw: float + The sum of weights + sumw2: float + The sum of squared weights + minw: float + The minimum weight + maxw: float + The maximum weight + n: int + The number of entries + """ + def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -36,6 +54,17 @@ def identity(self): return WeightStatistics() def add(self, other): + """Add two WeightStatistics objects together. + + Adds the sum of weights, the sum of squared weights, and the number of entries. + Takes the minimum and maximum across the two WeightStatistics objects. Modifies + this object in place. + + Parameters + ---------- + other: WeightStatistics + The other WeightStatistics object to add to this one + """ self.sumw += other.sumw self.sumw2 += other.sumw2 self.minw = min(self.minw, other.minw) @@ -76,6 +105,8 @@ def __init__(self, size, storeIndividual=False): @property def weightStatistics(self): + """Statistics about the weight, including the sum of squared weights + and number of entries.""" return self._weightStats def __add_eager(self, name, weight, weightUp, weightDown, shift): @@ -348,7 +379,7 @@ def __add_variation( self.__add_variation_delayed(name, weight, weightUp, weightDown, shift) def weight(self, modifier=None): - """Current event weight vector + """Returns the current event weight vector Parameters ---------- @@ -1100,6 +1131,14 @@ def names(self): @property def delayed_mode(self): + """ + Is the PackedSelection in delayed mode? 
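+        The mode is inferred from the selections given to ``add``: dask_awkward inputs
+        put the ``PackedSelection`` in delayed mode, while numpy/awkward inputs keep it
+        eager. A minimal sketch (``events`` is a hypothetical NanoEvents array)::
+
+            selection = PackedSelection()
+            selection.add("twoJets", events.nJet >= 2)
+            selection.delayed_mode  # True only if ``events`` is dask-backed
+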
+ + Returns + ------- + res: bool + True if the PackedSelection is in delayed mode + """ if isinstance(self._data, dask_awkward.Array): return True elif isinstance(self._data, numpy.ndarray): @@ -1112,6 +1151,14 @@ def delayed_mode(self): @property def maxitems(self): + """ + What is the maximum supported number of selections in this PackedSelection? + + Returns + ------- + res: bool + The maximum supported number of selections + """ return PackedSelection._supported_types[self._dtype] def __add_delayed(self, name, selection, fill_value): diff --git a/src/coffea/btag_tools/btagscalefactor.py b/src/coffea/btag_tools/btagscalefactor.py index 6c91588e4..72ac8b995 100644 --- a/src/coffea/btag_tools/btagscalefactor.py +++ b/src/coffea/btag_tools/btagscalefactor.py @@ -19,6 +19,23 @@ class BTagScaleFactor: Defaults to 'comb,comb,incl' keep_df : bool, optional If set true, keep the parsed dataframe as an attribute (.df) for later inspection + + Attributes + ---------- + LOOSE: int + Value is 0. This is the integer for the loose WP + MEDIUM: int + Value is 1. This is the integer for the medium WP + TIGHT: int + Value is 2. This is the integer for the tight WP + RESHAPE: int + Value is 3. This is the integer for the reshape WP + FLAV_B: int + Value is 0. This is the integer to represent the b flavor. Input choice to some methods. + FLAV_C: int + Value is 1. This is the integer to represent the c flavor. Input choice to some methods. + FLAV_UDSG: int + Value is 2. This is the integer to represent u, d, and s flavors, as well as gluons. Input choice to some methods. """ LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 489b8b6f0..71612210b 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -40,6 +40,7 @@ def apply_to_dataset( ) -> DaskOutputType | tuple[DaskOutputType, dask_awkward.Array]: """ Apply the supplied function or processor to the supplied dataset. + Parameters ---------- data_manipulation : ProcessorABC or GenericHEPAnalysis @@ -97,6 +98,7 @@ def apply_to_fileset( ) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: """ Apply the supplied function or processor to the supplied fileset (set of datasets). + Parameters ---------- data_manipulation : ProcessorABC or GenericHEPAnalysis diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index af199e857..47d4d7d6c 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -4,7 +4,7 @@ import os import random from collections import defaultdict -from typing import List +from typing import Dict, List import yaml from dask.distributed import Client @@ -18,7 +18,26 @@ from .preprocess import preprocess -def print_dataset_query(query, dataset_list, console, selected=[]): +def print_dataset_query( + query: str, + dataset_list: Dict[str, Dict[str, list[str]]], + console: Console, + selected: list[str] = [], +) -> None: + """ + Pretty-print the results of a rucio query in a table. 
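+    A minimal usage sketch (the query string is illustrative)::
+
+        from rich.console import Console
+        from coffea.dataset_tools import rucio_utils
+        from coffea.dataset_tools.dataset_query import print_dataset_query
+
+        client = rucio_utils.get_rucio_client()
+        outlist, outtree = rucio_utils.query_dataset(
+            "/ZMuMu*/*NanoAODv11*/NANOAOD", client=client, tree=True
+        )
+        print_dataset_query("/ZMuMu*/*NanoAODv11*/NANOAOD", outtree, Console())
+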
+ + Parameters + ---------- + query: str + The query given to rucio + dataset_list: dict[str, dict[str,list[str]]] + The second output of a call to query_dataset with tree=True + console: Console + A Console object to print to + selected: list[str], default [] + A list of selected datasets + """ table = Table(title=f"Query: [bold red]{query}") table.add_column("Name", justify="left", style="cyan", no_wrap=True) table.add_column("Tag", style="magenta", no_wrap=True) @@ -88,6 +107,12 @@ def get_indices_query(input_str: str, maxN: int) -> List[int]: class DataDiscoveryCLI: + """ + Simplifies dataset query, replicas, filters, and uproot preprocessing with Dask. + It can be accessed in a Python script or interpreter via this class, or from the + command line (as in `python -m coffea.dataset_tools.dataset_query --help`). + """ + def __init__(self): self.console = Console() self.rucio_client = None @@ -140,7 +165,7 @@ def start_cli(self): - [bold cyan]query-results[/]: List the results of the last dataset query - [bold cyan]list-selected[/]: Print a list of the selected datasets - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites-filters[/]: show the active sites filters and ask to clear them + - [bold cyan]sites-filters[/]: Show the active sites filters and ask to clear them - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" @@ -198,7 +223,14 @@ def do_whoami(self): print(self.rucio_client.whoami()) def do_query(self, query=None): - # Your code here + """ + Look for datasets with * wildcards (like in DAS) + + Parameters + ---------- + query: str | None, default None + The query to pass to rucio. If None, will prompt the user for an input. + """ if query is None: query = Prompt.ask( "[yellow bold]Query for[/]", @@ -218,6 +250,7 @@ def do_query(self, query=None): print("Use the command [bold red]select[/] to selected the datasets") def do_query_results(self): + """List the results of the last dataset query""" if self.last_query_list: print_dataset_query( self.last_query, @@ -229,8 +262,19 @@ def do_query_results(self): print("First [bold red]query (Q)[/] for a dataset") def do_select(self, selection=None, metadata=None): - """Selected the datasets from the list of query results. Input a list of indices - also with range 4-6 or "all".""" + """ + Selected the datasets from the list of query results. Input a list of indices + also with range 4-6 or "all". + + Parameters + ---------- + selection: list[str] | None, default None + A list of indices corresponding to selected datasets. Should be a + string, with indices separated by spaces. Can include ranges (like "4-6") + or "all". + metadata: dict[Hashable,Any], default None + Metadata to store in associated with selected datasets. 
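+        A minimal sketch (run after a query; the query string and indices are illustrative)::
+
+            cli = DataDiscoveryCLI()
+            cli.do_query("/ZMuMu*/*NanoAODv11*/NANOAOD")
+            cli.do_select(selection="1 4-6")
+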
+ """ if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return @@ -260,6 +304,7 @@ def do_select(self, selection=None, metadata=None): ) def do_list_selected(self): + """Print a list of the selected datasets""" print("[cyan]Selected datasets:") table = Table(title="Selected datasets") table.add_column("Index", justify="left", style="cyan", no_wrap=True) @@ -280,12 +325,20 @@ def do_list_selected(self): self.console.print(table) def do_replicas(self, mode=None, selection=None): - """Query Rucio for replicas. - mode: - None: ask the user about the mode - - round-robin (take files randomly from available sites), - - choose: ask the user to choose from a list of sites - - first: take the first site from the rucio query - selection: list of indices or 'all' to select all the selected datasets for replicas query + """ + Query Rucio for replicas. + + Parameters + ---------- + mode: str, default None + One of the following + - None: ask the user about the mode + - round-robin (take files randomly from available sites), + - choose: ask the user to choose from a list of sites + - first: take the first site from the rucio query + selection: str, default None + list of indices or 'all' to select all the selected datasets for + replicas query """ if selection is None: selection = Prompt.ask( @@ -433,6 +486,16 @@ def as_dict(self): return self.final_output def do_allowlist_sites(self, sites=None): + """ + Restrict the grid sites available for replicas query only to the requested list + + Parameters + ---------- + sites: list[str] | None, default None + The sites to allow the replicas query to look at. If passing in a list, + elements of the list are sites. If passing in None, the prompt requires + a single string containing a comma-separated listing. + """ if sites is None: sites = Prompt.ask( "[yellow]Restrict the available sites to (comma-separated list)" @@ -446,6 +509,16 @@ def do_allowlist_sites(self, sites=None): print(f"- {s}") def do_blocklist_sites(self, sites=None): + """ + Exclude grid sites from the available sites for replicas query + + Parameters + ---------- + sites: list[str] | None, default None + The sites to prevent the replicas query from looking at. If passing in a + list elements of the list are sites. If passing in None, the prompt + requires a single string containing a comma-separated listing. + """ if sites is None: sites = Prompt.ask( "[yellow]Exclude the sites (comma-separated list)" @@ -459,6 +532,14 @@ def do_blocklist_sites(self, sites=None): print(f"- {s}") def do_regex_sites(self, regex=None): + r""" + Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" + + Parameters + ---------- + regex: str | None, default None + Sites to use for replica queries, described with a regex string. + """ if regex is None: regex = Prompt.ask("[yellow]Regex to restrict the available sites") if len(regex): @@ -466,6 +547,16 @@ def do_regex_sites(self, regex=None): print(f"New sites regex: [cyan]{self.sites_regex}") def do_sites_filters(self, ask_clear=True): + """ + Show the active sites filters (allowed, disallowed, and regex) and ask to clear + them + + Parameters + ---------- + ask_clear: bool, default True + If True, ask the user via prompt if allow, disallow, and regex filters + should be cleared. 
+ """ print("[green bold]Allow-listed sites:") if self.sites_allowlist: for s in self.sites_allowlist: @@ -479,13 +570,14 @@ def do_sites_filters(self, ask_clear=True): print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") if ask_clear: - if Confirm.ask("Clear sites restrinction?", default=False): + if Confirm.ask("Clear sites restriction?", default=False): self.sites_allowlist = None self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") def do_list_replicas(self): + """Print the selected files replicas for the selected dataset""" selection = Prompt.ask( "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" ) @@ -507,7 +599,13 @@ def do_list_replicas(self): self.console.print(tree) def do_save(self, filename=None): - """Save the replica information in yaml format""" + """ + Save the replica information in yaml format + + Parameters: + filename: str | None, default None + The name of the file to save the information into + """ if not filename: filename = Prompt.ask( "[yellow bold]Output file name (.yaml or .json)", default="output.json" @@ -536,8 +634,21 @@ def do_preprocess( step_size_safety_factor=0.5, allow_empty_datasets=False, ): - """Perform preprocessing for concrete fileset extraction. - Args: output_file [step_size] [align to file cluster boundaries] [dask scheduler url] + """ + Perform preprocessing for concrete fileset extraction into a file, compressed + with gzip. + + Parameters + ---------- + output_file: str | None, default None + The name of the file to write the preprocessed file into + step_size: int | None, default None + The chunk size for file splitting + align_to_clusters: bool | None, default None + Whether or not round to the cluster size in a root file. See + align_clusters parameter in coffea.dataset_tools.preprocess. + scheduler_url: str | None, default None + Dask scheduler URL where the preprocessing should take place """ if not output_file: output_file = Prompt.ask( @@ -600,12 +711,25 @@ def load_dataset_definition( Initialize the DataDiscoverCLI by querying a set of datasets defined in `dataset_definitions` and selected results and replicas following the options. - - query_results_strategy: "all" or "manual" to be prompt for selection - - replicas_strategy: - - "round-robin": select randomly from the available sites for each file - - "choose": filter the sites with a list of indices for all the files - - "first": take the first result returned by rucio - - "manual": to be prompt for manual decision dataset by dataset + Parameters + ---------- + dataset_definition: Dict[str,Dict[Hashable,Any]] + Keys are dataset queries (ie: something that can be passed to do_query()) + query_results_strategy: str, default "all" + How to decide which datasets to select. If "manual", user will be prompted + for selection + replicas_strategy: str, default "round-robin" + Options are: + - "round-robin": select randomly from the available sites for each file + - "choose": filter the sites with a list of indices for all the files + - "first": take the first result returned by rucio + - "manual": to be prompt for manual decision dataset by dataset + + Returns + ------- + out_replicas: FilesetSpecOptional + An uproot-readable fileset. At this point, the fileset is not fully + preprocessed, but this can be done with do_preprocess(). 
""" for dataset_query, dataset_meta in dataset_definition.items(): print(f"\nProcessing query: {dataset_query}") diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e515ce2bf..c037e7f5c 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -12,6 +12,7 @@ def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpec: """ Modify the input dataset so that only the first "maxchunks" chunks of each file will be processed. + Parameters ---------- fileset: FilesetSpec @@ -30,6 +31,7 @@ def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpe def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: """ Modify the input dataset so that only the chunks of each file specified by the input slice are processed. + Parameters ---------- fileset: FilesetSpec @@ -56,6 +58,7 @@ def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSp def max_files(fileset: FilesetSpec, maxfiles: int | None = None) -> FilesetSpec: """ Modify the input dataset so that only the first "maxfiles" files of each dataset will be processed. + Parameters ---------- fileset: FilesetSpec @@ -74,6 +77,7 @@ def max_files(fileset: FilesetSpec, maxfiles: int | None = None) -> FilesetSpec: def slice_files(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: """ Modify the input dataset so that only the files of each dataset specified by the input slice are processed. + Parameters ---------- fileset: FilesetSpec @@ -111,6 +115,7 @@ def filter_files( ) -> FilesetSpec: """ Modify the input dataset so that only the files of each dataset that pass the filter remain. + Parameters ---------- fileset: FilesetSpec @@ -134,6 +139,7 @@ def get_failed_steps_for_dataset( ) -> DatasetSpec: """ Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters ---------- dataset: DatasetSpec @@ -190,6 +196,7 @@ def get_failed_steps_for_fileset( ): """ Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters ---------- fileset: FilesetSpec diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 328f0daf3..65f232da9 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -32,6 +32,7 @@ def get_steps( ) -> awkward.Array | dask_awkward.Array: """ Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. + Parameters ---------- normed_files: awkward.Array | dask_awkward.Array diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index a4d3bc9ce..ea6e8eb0d 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -157,13 +157,22 @@ def get_dataset_files_replicas( ---------- dataset: str + The dataset to search for. allowlist_sites: list + List of sites to select from. If the file is not found there, raise an Exception. blocklist_sites: list + List of sites to avoid. If the file has no left site, raise an Exception. regex_sites: list + Regex expression to restrict the list of sites. mode: str, default "full" + One of "full", "first", "best", or "roundrobin". Behavior of each described above. 
client: rucio Client, optional + The rucio client to use. If not provided, one will be generated for you. partial_allowed: bool, default False + If False, throws an exception if any file in the dataset cannot be found. If True, + will find as many files from the dataset as it can. scope: rucio scope, "cms" + The scope for rucio to search through. Returns ------- @@ -292,18 +301,21 @@ def query_dataset( Parameters --------- - query: str = query to filter datasets / containers with the rucio list_dids functions - client: rucio client - tree: bool = if True return the results splitting the dataset name in parts parts - datatype: "container/dataset": rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. - scope: "cms". Rucio instance + query: str + Query to filter datasets / containers with the rucio list_dids functions + client: rucio Client + The rucio client to use. If not provided, one will be generated for you + tree: bool, default False + If True, return the results splitting the dataset name in parts + datatype: str, default "container" + Options are "container", "dataset". rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. + scope: str, default "cms" + Rucio instance Returns ------- - list of containers/datasets - - if tree==True, returns the list of dataset and also a dictionary decomposing the datasets - names in the 1st command part and a list of available 2nd parts. + List of containers/datasets. If tree==True, returns the list of dataset and also a dictionary decomposing + the datasets names in the 1st command part and a list of available 2nd parts. """ client = client if client else get_rucio_client() diff --git a/src/coffea/jetmet_tools/CorrectedJetsFactory.py b/src/coffea/jetmet_tools/CorrectedJetsFactory.py index a7637c69b..5e118f536 100644 --- a/src/coffea/jetmet_tools/CorrectedJetsFactory.py +++ b/src/coffea/jetmet_tools/CorrectedJetsFactory.py @@ -129,6 +129,27 @@ def getfunction(layout, depth, **kwargs): class CorrectedJetsFactory: + """ + Factory class for applying corrections to jets, including organizing variations + (eg: JES up and down). It is constructed from a name map, which translates between + field names for corrections and field names in inputs, and a JECStack, which + contains the actual correction functions. + + Once a CorrectedJetsFactory is constructed, the `build` method can produce corrected + jets from an input array of jets. + + Parameters + ---------- + name_map: dict[str,str] + Keys are argument names in the various corrections' signatures (eg: the `signature` + attribute of a `FactorizedJJetCorrector` object). Values are the names of the + corresponding fields as they would appear in the jet array passed to the `build` + method. + jec_stack: JECStack + Contains the corrections that will be applied to the input jet array when calling + `build`. + """ + def __init__(self, name_map, jec_stack): # from PhysicsTools/PatUtils/interface/SmearedJetProducerT.h#L283 self.forceStochastic = False @@ -173,12 +194,34 @@ def __init__(self, name_map, jec_stack): self.jec_stack = jec_stack def uncertainties(self): + """ + Returns a list of the sources of uncertainty included in the stack. + + Returns + ------- + list[str] + A list of the sources of uncertainty. 
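+        A sketch of a typical return value (the exact JES sources depend on the
+        corrections loaded into the JECStack)::
+
+            factory.uncertainties()
+            # ["JER", "JES_AbsoluteMPFBias", "JES_AbsoluteScale", ...]
+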
+ """ out = ["JER"] if self.jec_stack.jer is not None else [] if self.jec_stack.junc is not None: out.extend([f"JES_{unc}" for unc in self.jec_stack.junc.levels]) return out def build(self, injets): + """ + Apply the corrections to the array of jets, returning an array of corrected + jets. + + Parameters + ---------- + injets: (Awkward array[jets]) + An array of uncorrected jets, to which we want to apply corrections. + + Returns + ------- + Awkward array of jets, representing the corrected jets, with shape matching + `injets`. + """ if not isinstance(injets, (awkward.highlevel.Array, dask_awkward.Array)): raise Exception("input jets must be an (dask_)awkward array of some kind!") diff --git a/src/coffea/jetmet_tools/CorrectedMETFactory.py b/src/coffea/jetmet_tools/CorrectedMETFactory.py index 5853dc5b2..8eb74eab0 100644 --- a/src/coffea/jetmet_tools/CorrectedMETFactory.py +++ b/src/coffea/jetmet_tools/CorrectedMETFactory.py @@ -19,6 +19,30 @@ def corrected_polar_met( class CorrectedMETFactory: + """ + Factory class for propagating corrections made to jets into a corrected value + of MET. This includes organizing different variations associated with uncertainties + in MET from unclustered energy. + + Once the `CorrectedMETFactory` is constructed, an array of corrected MET values and + variations can be produced with the `build` method, which requires an array of + uncorrected MET and an array of corrected jets. + + Parameters + ---------- + name_map: dict[str,str] + Keys must include at least the following: + - METpt + - METphi + - JetPt + - JetPhi + - ptRaw + - UnClusteredEnergyDeltaX + - UnClusteredEnergyDeltaY + and each of those must be mapped to the corresponding field name of the input + arrays `in_MET` and `in_corrected_jets` for the `build` method. + """ + def __init__(self, name_map): for name in [ "METpt", @@ -37,6 +61,21 @@ def __init__(self, name_map): self.name_map = name_map def build(self, in_MET, in_corrected_jets): + """ + Produce an array of corrected MET values from an array of uncorrected MET + values and an array of corrected jets. + + Parameters + ---------- + in_MET: (Awkward array[float]) + An array of raw (uncorrected) MET values. + in_corrected_jets: (Awkward array[jets]) + An array of corrected jets, as produced by `CorrectedJetsFactory`. + + Returns + ------- + Awkward array of corrected MET values, with shape matching `in_MET`. + """ if not isinstance( in_MET, (awkward.highlevel.Array, dask_awkward.Array) ) or not isinstance( @@ -171,4 +210,12 @@ def create_variants(raw_met, corrected_jets_or_variants, dx, dy): return out def uncertainties(self): + """ + Returns a list of the sources of uncertainty included in the stack. + + Returns + ------- + list[str] + A list of the sources of uncertainty. + """ return ["MET_UnclusteredEnergy"] diff --git a/src/coffea/jetmet_tools/FactorizedJetCorrector.py b/src/coffea/jetmet_tools/FactorizedJetCorrector.py index 0f752e49e..22946d6df 100644 --- a/src/coffea/jetmet_tools/FactorizedJetCorrector.py +++ b/src/coffea/jetmet_tools/FactorizedJetCorrector.py @@ -62,15 +62,19 @@ class FactorizedJetCorrector: You can use this class as follows:: fjc = FactorizedJetCorrector(name1=corrL1,...) - jetCorrs = fjc(JetParameter1=jet.parameter1,...) + jetCorrs = fjc.getCorrection(JetParameter1=jet.parameter1,...) + in which `jetCorrs` are the corrected jet scaled factors, with the same shape as + the input parameters. In order to see what parameters must be passed to + `getCorrection()`, one can do `fjc.signature`. 
+ + You construct a FactorizedJetCorrector by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a FactorizedJetCorrector by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] diff --git a/src/coffea/jetmet_tools/JECStack.py b/src/coffea/jetmet_tools/JECStack.py index d76124f65..9e98ce602 100644 --- a/src/coffea/jetmet_tools/JECStack.py +++ b/src/coffea/jetmet_tools/JECStack.py @@ -8,12 +8,35 @@ class JECStack: + """ + Mostly used as an input to `CorrectedJetsFactory`. Hosts and organizes multiple + corrections under one object. + + jec, junc, etc. can be explicitly set by passing in the appropriate corrector class + (eg: FactorizedJetCorrector). If they are not set, correctors will be created, using + the info in `corrections` as input. + + Parameters + --------- + corrections: dict[str,lookup_base] + A dict-like of function names and functions. The function depends on the type + of correction (eg: for JEC, should be jme_standard_function). We expect JEC + names to be formatted as their filenames. + jec: FactorizedJetCorrector, optional + If provided, overrides the jec that would be created from `corrections` in + the stack. + junc: JetCorrectionUncertainty, optional + If provided, overrides the junc that would be created from `corrections` in + the stack. + jer: JetResolution, optional + If provided, overrides the jer that would be created from `corrections` in + the stack. + jersf: JetResolutionScaleFactor, optional + If provided, overrides the jersf that would be created from `corrections` in + the stack. + """ + def __init__(self, corrections, jec=None, junc=None, jer=None, jersf=None): - """ - corrections is a dict-like of function names and functions - we expect JEC names to be formatted as their filenames - jecs, etc. can be overridden by passing in the appropriate corrector class. - """ self._jec = None self._junc = None self._jer = None @@ -99,6 +122,10 @@ def __init__(self, corrections, jec=None, junc=None, jer=None, jersf=None): @property def blank_name_map(self): + """ + A dictionary in the form of the `name_map` input parameter for + `CorrectedJetsFactory`, with all keys mapped to None. + """ out = { "massRaw", "ptRaw", @@ -126,16 +153,28 @@ def blank_name_map(self): @property def jec(self): + """ + The stack's FactorizedJetCorrector object. + """ return self._jec @property def junc(self): + """ + The stack's JetCorrectionUncertainty object. + """ return self._junc @property def jer(self): + """ + The stack's JetResolution object. + """ return self._jer @property def jersf(self): + """ + The stack's JetResolutionScaleFactor object. + """ return self._jersf diff --git a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py index 26f43f873..4bc8868cd 100644 --- a/src/coffea/jetmet_tools/JetCorrectionUncertainty.py +++ b/src/coffea/jetmet_tools/JetCorrectionUncertainty.py @@ -67,8 +67,16 @@ class JetCorrectionUncertainty: You can use this class as follows:: jcu = JetCorrectionUncertainty(name1=corrL1,...) - jetUncs = jcu(JetParameter1=jet.parameter1,...) + jetUncs = jcu.getUncertainty(JetParameter1=jet.parameter1,...) + in which `jetUncs` are the uncertainties, with the same shape as the input parameters. 
+ In order to see which parameters must be passed to `getUncertainty`, one can do + `jcu.signature`. + + You construct a JetCorrectionUncertainty by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): diff --git a/src/coffea/jetmet_tools/JetResolution.py b/src/coffea/jetmet_tools/JetResolution.py index 18cb0ca02..59c5c4bd4 100644 --- a/src/coffea/jetmet_tools/JetResolution.py +++ b/src/coffea/jetmet_tools/JetResolution.py @@ -45,15 +45,19 @@ class JetResolution: You can use this class as follows:: jr = JetResolution(name1=corrL1,...) - jetRes = jr(JetParameter1=jet.parameter1,...) + jetRes = jr.getResolution(JetParameter1=jet.parameter1,...) + in which `jetRes` are the resolutions, with the same shape as the input parameters. + In order to see what parameters must be passed to `getResolution`, one can do + `jr.signature`. + + You construct a JetResolution object by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a JetResolution by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] diff --git a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py index 40f7c3b5d..73311991c 100644 --- a/src/coffea/jetmet_tools/JetResolutionScaleFactor.py +++ b/src/coffea/jetmet_tools/JetResolutionScaleFactor.py @@ -45,15 +45,19 @@ class JetResolutionScaleFactor: You can use this class as follows:: jersf = JetResolutionScaleFactor(name1=corrL1,...) - jetResSF = jersf(JetParameter1=jet.parameter1,...) + jetResSF = jersf.getScaleFactor(JetParameter1=jet.parameter1,...) + in which `jetResSF` are the scale factors, with the same shape as the input parameters. + In order to see which parameters must be passed to `getScaleFactor`, one can do + `jersf.signature`. + + You construct a JetResolutionScaleFactor by passing in a dict of names and functions. + Names must be formatted as '____'. You + can use coffea.lookup_tools' `extractor` and `evaluator` to get the functions from + some input files. """ def __init__(self, **kwargs): - """ - You construct a JetResolutionScaleFactor by passing in a dict of names and functions. - Names must be formatted as '____'. - """ jettype = None levels = [] funcs = [] diff --git a/src/coffea/lookup_tools/evaluator.py b/src/coffea/lookup_tools/evaluator.py index 50ad6e674..27dd78ba0 100644 --- a/src/coffea/lookup_tools/evaluator.py +++ b/src/coffea/lookup_tools/evaluator.py @@ -33,9 +33,23 @@ class evaluator: evaluator = extractor.make_evaluator() out = evaluator["testSF2d"](eta, pt) - The returned value has the same shape as the input arguments. + The returned value has the same shape as the input arguments. `lookup_types` is a map of possible + constructors for extracted data. The arguments used when calling the evaluator depend on which named + weight is being used (eg. in the above example, the "testSF2d" weight requires `eta` and `pt` be + passed when calling the evaluator). - lookup_types is a map of possible constructors for extracted data + It is recommended to construct an evaluator from an extractor, so ensure that inputs to the + constructor are properly ordered and formatted. 
+ + Parameters + ---------- + names: dict[str, int] + A dictionary mapping the names of weights to the index of that weight in `primitives`. + types: list[str] + A list of the types of weights, ordered in the same way as `primitives`. + primitives: list[Varies] + A list of primitives, whose type and structure depend on types. Should be order in the + same way as `primitives`. """ def __init__(self, names, types, primitives): diff --git a/src/coffea/lookup_tools/extractor.py b/src/coffea/lookup_tools/extractor.py index 3d73e5586..7b217601f 100644 --- a/src/coffea/lookup_tools/extractor.py +++ b/src/coffea/lookup_tools/extractor.py @@ -65,7 +65,18 @@ def __init__(self): self._finalized = False def add_weight_set(self, local_name, thetype, weights): - """adds one extracted weight to the extractor""" + """ + Adds one extracted weight to the extractor. + + Parameters + ---------- + local_name: str + The name of the weight. + thetype: str + The type of weight (eg: jme_standard_function). + weights: Varies + The weights themselves. Type and structure depends on thetype. + """ if self._finalized: raise Exception("extractor is finalized cannot add new weights!") if local_name in self._names.keys(): @@ -76,8 +87,13 @@ def add_weight_set(self, local_name, thetype, weights): def add_weight_sets(self, weightsdescs): """ - expects a list of text lines to be formatted as ' ' - allows * * and * to do easy imports of whole file + Add multiple weight sets at once, coming from one or more files. + + Parameters + ---------- + weightsdescs: Iterable[str] + Expects a list of text lines to be formatted as ' '. + Allows * * and * to do easy imports of whole file. """ for weightdesc in weightsdescs: if weightdesc[0] == "#": @@ -110,7 +126,14 @@ def add_weight_sets(self, weightsdescs): self._names[local_name] = 0 def import_file(self, thefile): - """cache the whole contents of a file for later processing""" + """ + Cache the whole contents of a file for later processing + + Parameters + ---------- + thefile: str + The path to the file to be imported + """ if thefile not in self._filecache.keys(): drop_gz = thefile.replace(".gz", "") file_dots = os.path.basename(drop_gz).split(".") @@ -127,7 +150,16 @@ def import_file(self, thefile): self._filecache[thefile] = file_converters[theformat][thetype](thefile) def extract_from_file(self, thefile, name): - """import a file and then extract a lookup set""" + """ + Import a file and then extract a lookup set + + Parameters + ---------- + thefile: str + The path to the file to import + name: str + The name of the weights to extract, as named in the file + """ self.import_file(thefile) weights = self._filecache[thefile] names = {key[0]: key[1] for key in weights.keys()} @@ -137,8 +169,14 @@ def extract_from_file(self, thefile, name): def finalize(self, reduce_list=None): """ - stop any further imports and if provided pare down - the stored histograms to those specified in reduce_list + Stop any further imports and, if requested, pare down + the stored histograms to those specified in reduce_list. + + Parameters + ---------- + reduce_list: list[str], optional + Reduce the weights contained in this extractor to only those with names + in reduce_list. If not provided, no such reduction takes place. 
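+        A minimal sketch of the intended call order (file and weight names are illustrative)::
+
+            ext = extractor()
+            ext.add_weight_sets(["testSF2d scalefactors_Tight_Electron testSF2d.histo.root"])
+            ext.finalize(reduce_list=["testSF2d"])
+            evaluator = ext.make_evaluator()
+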
""" if self._finalized: raise Exception("extractor is already finalized!") @@ -159,7 +197,13 @@ def finalize(self, reduce_list=None): self._finalized = True def make_evaluator(self): - """produce an evaluator based on the finalized extractor""" + """ + Produce an evaluator based on the finalized extractor + + Returns + ------- + An evaluator based on the names, weight types, and weights of the finalized extractor. + """ if self._finalized: return evaluator(self._names, self._types, self._weights) else: diff --git a/src/coffea/lumi_tools/lumi_tools.py b/src/coffea/lumi_tools/lumi_tools.py index 0dfc3aae2..29170d554 100644 --- a/src/coffea/lumi_tools/lumi_tools.py +++ b/src/coffea/lumi_tools/lumi_tools.py @@ -33,14 +33,18 @@ class LumiData: Parameters ---------- lumi_csv : str - The path the the luminosity csv output file + The path to the luminosity csv file to read from. Generally, this is the output file from brilcalc. + is_inst_lumi: bool, default False + If True, treats the values read in from `lumi_csv` as average instantaneous luminosities, instead of integrated luminosities. - The values are extracted from the csv output as returned by brilcalc, e.g. with a command such as:: + The values are extracted from the csv output as returned by brilcalc_, e.g. with a command such as:: brilcalc lumi -c /cvmfs/cms.cern.ch/SITECONF/local/JobConfig/site-local-config.xml \ -b "STABLE BEAMS" --normtag=/cvmfs/cms-bril.cern.ch/cms-lumi-pog/Normtags/normtag_PHYSICS.json \ -u /pb --byls --output-style csv -i Cert_294927-306462_13TeV_PromptReco_Collisions17_JSON.txt > lumi2017.csv + .. _brilcalc: https://cms-service-lumi.web.cern.ch/cms-service-lumi/brilwsdoc.html + Note that some brilcalc files may be in different units than inverse picobarns, including possibly average instantaneous luminosity. You should make sure that you understand the units of the LumiData file you are using before calculating luminosity with this tool. If you are using a LumiData file containing avg. inst. luminosity, make sure to set is_inst_lumi=True in the constructor of this class. @@ -73,7 +77,12 @@ def get_lumi(self, runlumis): ---------- runlumis : numpy.ndarray or LumiList A 2d numpy array of ``[[run,lumi], [run,lumi], ...]`` or `LumiList` object - of the lumiSections to integrate over. + of the lumiSections to integrate over, where `run` is a run number and `lumi` is a + lumisection number. + + Returns + ------- + (float) The total integrated luminosity of the runs and lumisections indicated in `runlumis`. """ if self.index is None: self.index = Dict.empty( @@ -132,14 +141,15 @@ def _get_lumi_kernel(runs, lumis, index, tot_lumi): class LumiMask: - """Holds a luminosity mask index, and provides vectorized lookup + """ + Holds a luminosity mask index, and provides vectorized lookup, retaining only valid (run,lumisection) pairs. Parameters ---------- jsonfile : str Path the the 'golden json' file or other valid lumiSection database in json format. - This class parses a CMS lumi json into an efficient valid lumiSection lookup table + This class parses a CMS lumi json into an efficient valid lumiSection lookup table. """ def __init__(self, jsonfile): @@ -154,7 +164,8 @@ def __init__(self, jsonfile): self._masks[numpy.uint32(run)] = mask def __call__(self, runs, lumis): - """Check if run and lumi are valid + """ + Check pairs of runs and lumis for validity, and produce a mask retaining the valid pairs. 
Parameters ---------- diff --git a/src/coffea/ml_tools/helper.py b/src/coffea/ml_tools/helper.py index c5d8f90d9..59887ee7d 100644 --- a/src/coffea/ml_tools/helper.py +++ b/src/coffea/ml_tools/helper.py @@ -136,13 +136,13 @@ class numpy_call_wrapper(abc.ABC): For tools outside the coffea package (like for ML inference), the inputs typically expect a numpy-like input. This class wraps up the user-level - awkward->numpy data mangling and the underling numpy evaluation calls to + awkward->numpy data mangling and the underlying numpy evaluation calls to recognizable to dask. For the class to be fully functional, the user must overload these methods: - - numpy_call: How the evaluation using all numpy tool be performed - - prepare_awkward: How awkward arrays should be translated to the a numpy + - numpy_call: How the evaluation using all-numpy tool be performed + - prepare_awkward: How awkward arrays should be translated to a numpy format that is compatible with the numpy_call Additionally, the following helper functions can be omitted, but will help @@ -150,7 +150,7 @@ class numpy_call_wrapper(abc.ABC): - validate_numpy_input: makes sure the computation routine understand the input. - - numpy_to_awkward: Additional translation to convert numpy outputs to + - postprocess_awkward: Additional translation to convert numpy outputs to awkward (defaults to a simple `awkward.from_numpy` conversion) """ @@ -233,7 +233,7 @@ def postprocess_awkward(self, return_array, *args, **kwargs): def _call_awkward(self, *args, **kwargs): """ The common routine of prepare_awkward conversion, numpy evaluation, - then numpy_to_awkward conversion. + then postprocess_awkward conversion. """ ak_args, ak_kwargs = self.prepare_awkward(*args, **kwargs) (np_args, np_kwargs), _ = self._ak_to_np_(*ak_args, **ak_kwargs) @@ -246,7 +246,7 @@ def _call_dask(self, *args, **kwargs): Wrapper required for dask awkward calls. Here we create a new callable class (_callable_wrap) that packs the - prepare_awkward/numpy_call/numpy_to_awkward call routines to be + prepare_awkward/numpy_call/postprocess_awkward call routines to be passable to the dask_awkward.map_partition method. In addition, because map_partition by default expects the callable's diff --git a/src/coffea/ml_tools/torch_wrapper.py b/src/coffea/ml_tools/torch_wrapper.py index 36625d244..e09f92afb 100644 --- a/src/coffea/ml_tools/torch_wrapper.py +++ b/src/coffea/ml_tools/torch_wrapper.py @@ -14,24 +14,38 @@ class torch_wrapper(nonserializable_attribute, numpy_call_wrapper): """ Wrapper for running pytorch with awkward/dask-awkward inputs. - """ - def __init__(self, torch_jit: str): - """ - As torch models are not guaranteed to be serializable we load the model - using torch save-state files. Notice that we only support TorchScript - files for this wrapper class [1]. If the user is attempting to run on - the clusters, the TorchScript file will need to be passed to the worker - nodes in a way which preserves the file path. + As torch models are not guaranteed to be serializable we load the model + using torch save-state files. Notice that we only support TorchScript + files for this wrapper class [1]. If the user is attempting to run on + the clusters, the TorchScript file will need to be passed to the worker + nodes in a way which preserves the file path. 
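+    In practice one subclasses the wrapper and overrides ``prepare_awkward`` (its
+    contract is described below). A minimal sketch, assuming a TorchScript model
+    that takes two flat 1D float32 tensors (all names here are illustrative)::
+
+        import awkward as ak
+
+        class jet_scorer(torch_wrapper):
+            def prepare_awkward(self, jets):
+                # flatten the jagged jet collection and hand the model flat feature arrays
+                flat = ak.flatten(jets)
+                return (
+                    ak.values_astype(flat.pt, "float32"),
+                    ak.values_astype(flat.eta, "float32"),
+                ), {}
+
+        scorer = jet_scorer("model.pt")  # path to a TorchScript file
+        scores = scorer(events.Jet)
+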
- [1] - https://pytorch.org/tutorials/beginner/saving_loading_models.html#export-load-model-in-torchscript-format + Once an instance `wrapper` of this class is created, it can be called on inputs + like `wrapper(*args)`, where `args` are the inputs to `prepare_awkward` (see + next paragraph). - Parameters - ---------- + In order to actually use the class, the user must override the method + `prepare_awkward`. The input to this method is an arbitrary number of awkward + arrays or dask awkward arrays (but never a mix of dask/non-dask array). The + output is two objects: a tuple `a` and a dictionary `b` such that the underlying + `pytorch` model instance calls like `model(*a,**b)`. The contents of a and b + should be numpy-compatible awkward-like arrays: if the inputs are non-dask awkward + arrays, the return should also be non-dask awkward arrays that can be trivially + converted to numpy arrays via a ak.to_numpy call; if the inputs are dask awkward + arrays, the return should be still be dask awkward arrays that can be trivially + converted via a to_awkward().to_numpy() call. - - torch_jit: Path to the TorchScript file to load - """ + [1] + https://pytorch.org/tutorials/beginner/saving_loading_models.html#export-load-model-in-torchscript-format + + Parameters + ---------- + torch_jit: str + Path to the TorchScript file to load + """ + + def __init__(self, torch_jit: str): if _torch_import_error is not None: warnings.warn( "Users should make sure the torch package is installed before proceeding!\n" diff --git a/src/coffea/ml_tools/triton_wrapper.py b/src/coffea/ml_tools/triton_wrapper.py index 29b78986d..c92180b1c 100644 --- a/src/coffea/ml_tools/triton_wrapper.py +++ b/src/coffea/ml_tools/triton_wrapper.py @@ -21,9 +21,37 @@ class triton_wrapper(nonserializable_attribute, numpy_call_wrapper): Wrapper for running triton inference. The target of this class is such that all triton specific operations are - wrapped and abstracted-away from the users. The users should then only needs + wrapped and abstracted-away from the users. The user should then only need to handle awkward-level operations to mangle the arrays into the expected - input format required by the the model of interest. + input format required by the the model of interest. This must be done by + overriding the `prepare_awkward` method. + + Once an instance `wrapper` of this class is created, it can be called on inputs + like `wrapper(*args)`, where `args` are the inputs to `prepare_awkward` (see + next paragraph). + + In order to actually use the class, the user must override the method + `prepare_awkward`. The input to this method is an arbitrary number of awkward + arrays or dask awkward arrays (but never a mix of dask/non-dask array). The + output is two objects: a tuple `a` and a dictionary `b` such that the underlying + `tritonclient` instance calls like `client(*a,**b)`. The contents of a and b + should be numpy-compatible awkward-like arrays: if the inputs are non-dask awkward + arrays, the return should also be non-dask awkward arrays that can be trivially + converted to numpy arrays via a ak.to_numpy call; if the inputs are dask awkward + arrays, the return should be still be dask awkward arrays that can be trivially + converted via a to_awkward().to_numpy() call. + + Parameters + ---------- + model_url: str + A string in the format of: `triton+://
//` + + client_args: dict[str,str], optional + Optional keyword arguments to pass to the underlying `InferenceServerClient` objects. + + batch_size: int, default -1 + How the input arrays should be split up for analysis processing. Leave negative to + have this automatically resolved. """ batch_size_fallback = 10 # Fall back should batch size not be determined. @@ -32,19 +60,6 @@ class triton_wrapper(nonserializable_attribute, numpy_call_wrapper): def __init__( self, model_url: str, client_args: Optional[Dict] = None, batch_size=-1 ): - """ - Parameters - ---------- - - - model_url: A string in the format of: - triton+://
// - - - client_args: optional keyword arguments to pass to the underlying - `InferenceServerClient` objects. - - - batch_size: How the input arrays should be split up for analysis - processing. Leave negative to have this automatically resolved. - """ if _triton_import_error is not None: warnings.warn( "Users should make sure the tritonclient package is installed before proceeding!\n" @@ -157,8 +172,28 @@ def validate_numpy_input( self, output_list: List[str], input_dict: Dict[str, numpy.array] ) -> None: """ - tritonclient can return the expected input array dimensions and - available output values. + Check that tritonclient can return the expected input array dimensions and + available output values. Can be useful when ensuring that data is being properly + mangled for Triton. This method is called just before passing to the Triton client + when an inference request is made. + + If no errors are raised, it is understood that the input is validated by this function. + + Parameters + ---------- + output_list: list[str] + List of string corresponding to the name of the outputs + of interest. These strings will be automatically translated into the + required `tritonclient.InferRequestedOutput` objects. This is identical + to the first argument the user passes in when calling the `triton_wrapper` + instance. + + input_dict: dict[str,np.array] + Dictionary with the model's input-names as the key and the + appropriate numpy array as the dictionary value. This dictionary is + automatically translated into a list of `tritonclient.InferInput` + objects. This is identical to the second argument the user passes in when + calling the `triton_wrapper` instance. """ # Input value checking for iname, iarr in input_dict.items(): @@ -213,22 +248,23 @@ def numpy_call( """ Parameters ---------- - - - output_list: List of string corresponding to the name of the outputs - of interest. These strings will be automatically translated into the - required `tritonclient.InferRequestedOutput` objects. - - - input_dict: Dictionary with the model's input-names as the key and the - appropriate numpy array as the dictionary value. This dictionary is - automatically translated into a list of `tritonclient.InferInput` - objects. - - - Return - ------ - - The return will be the dictionary of numpy arrays that have the - output_list arguments as keys. + output_list: list[str] + List of string corresponding to the name of the outputs + of interest. These strings will be automatically translated into the + required `tritonclient.InferRequestedOutput` objects. + + input_dict: dict[str,np.array] + Dictionary with the model's input-names as the key and the + appropriate numpy array as the dictionary value. This dictionary is + automatically translated into a list of `tritonclient.InferInput` + objects. + + + Returns + ------- + dict[str,np.array] + The return will be the dictionary of numpy arrays that have the + output_list arguments as keys. """ # Setting up the inference input containers diff --git a/src/coffea/ml_tools/xgboost_wrapper.py b/src/coffea/ml_tools/xgboost_wrapper.py index 86aa6fd7f..c9c489c62 100644 --- a/src/coffea/ml_tools/xgboost_wrapper.py +++ b/src/coffea/ml_tools/xgboost_wrapper.py @@ -16,6 +16,12 @@ class xgboost_wrapper(numpy_call_wrapper, nonserializable_attribute): """ Very simple wrapper for xgbooster inference. The xgboost.Booster object is nonserializable, so the users should pass in the xgboost model file. 
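+    A minimal sketch of a concrete subclass (the model file and feature layout are
+    hypothetical; the booster here expects a flat ``(n, 2)`` feature matrix)::
+
+        import awkward as ak
+        import numpy as np
+
+        class my_bdt(xgboost_wrapper):
+            def prepare_awkward(self, muons):
+                # flatten the jagged collection and stack the two features column-wise
+                flat = ak.flatten(muons)
+                feats = ak.concatenate(
+                    [flat.pt[:, np.newaxis], flat.eta[:, np.newaxis]], axis=1
+                )
+                return (feats,), {}
+
+        bdt = my_bdt("bdt.model")
+        scores = bdt(events.Muon)
+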
+ + Parameters + ---------- + fname: str + Path to the xgboost model file, such that an `xgbooster` can be created + via `xgboost.Booster(model_file=fname)`. """ def __init__(self, fname): @@ -43,11 +49,25 @@ def validate_numpy_input( predict_args: Optional[Dict] = None, ): """ + Check that the arguments to be passed into the actual xgboost inference + request are valid. + The inner most dimension of the data array should be smaller than the number of features of the xgboost model. (Will raise a warning if mismatched). We will not attempt to parse the kwargs passed to the construction of a DMatrix, or the predict call, as those advanced features are expected to be properly handled by the user. + + Parameters + ---------- + data: np.ndarray + The data to pass into the `xgboost.DMatrix` construction. + dmat_args: dict[str,str], optional + Keyword arguments to pass into the `xgboost.DMatrix` construction. + predict_args: dict[str,str], optional + Keyword arguments to pass to the actual prediction step of `xgboost`, + ie: the `predict` method of `xgbooster.Booster.predict`. Note that the + first argument of that method is handled by this method. """ ndims = data.shape[-1] nfeat = self.xgbooster.num_features() @@ -68,10 +88,21 @@ def numpy_call( predict_args: Optional[Dict] = None, ): """ - Passing the numpy array data as-is to the construction of an + Pass the numpy array data as-is to the construction of an xgboost.DMatrix constructor (with additional keyword arguments should - they be specified), the run the xgboost.Booster.predict method (with + they be specified), then run the xgboost.Booster.predict method (with additional keyword arguments). + + Parameters + ---------- + data: np.ndarray + The data to pass into the `xgboost.DMatrix` construction. + dmat_args: dict[str,str], optional + Keyword arguments to pass into the `xgboost.DMatrix` construction. + predict_args: dict[str,str], optional + Keyword arguments to pass to the actual prediction step of `xgboost`, + ie: the `predict` method of `xgbooster.Booster.predict`. Note that the + first argument of that method is handled by this method. """ if dmat_args is None: dmat_args = {} diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 34866a11b..9ef58a409 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -207,7 +207,12 @@ def __call__(self, form): class NanoEventsFactory: - """A factory class to build NanoEvents objects""" + """ + A factory class to build NanoEvents objects. + + For most users, it is advisable to construct instances via methods like `from_root` so that + the constructor args are properly set. + """ def __init__(self, schema, mapping, partition_key, cache=None, is_dask=False): self._is_dask = is_dask @@ -292,6 +297,11 @@ def from_root( see: https://github.com/scikit-hep/uproot5/blob/main/src/uproot/_dask.py#L109 interpretation_executor (None or Executor with a ``submit`` method): see: https://github.com/scikit-hep/uproot5/blob/main/src/uproot/_dask.py#L113 + + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from the file at `file`. """ if treepath is not uproot._util.unset and not isinstance( @@ -442,6 +452,11 @@ def from_parquet( Pass a list instance to record which branches were lazily accessed by this instance delayed: Nanoevents will use dask as a backend to construct a delayed task graph representing your analysis. 
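+        A minimal usage sketch (the file path is illustrative)::
+
+            from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
+
+            events = NanoEventsFactory.from_parquet(
+                "nanoevents.parquet", schemaclass=NanoAODSchema
+            ).events()
+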
+ + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from the file at `file`. """ import pyarrow import pyarrow.dataset as ds @@ -581,6 +596,11 @@ def from_preloaded( Arbitrary metadata to add to the `base.NanoEvents` object access_log : list, optional Pass a list instance to record which branches were lazily accessed by this instance + + Returns + ------- + out: NanoEventsFactory + A NanoEventsFactory instance built from information in `array_source`. """ if not isinstance(array_source, Mapping): raise TypeError( @@ -679,7 +699,18 @@ def __len__(self): return stop - start def events(self): - """Build events""" + """ + Build events + + Returns + ------- + out: + If the NanoEventsFactory is running in delayed mode (Dask), this is + a Dask awkward array of the events. If the mapping also produces a + report, the output will be a tuple (events, report). + If the factory is not running in delayed mode, this is an awkward + array of the events. + """ if self._is_dask: events = self._mapping(form_mapping=self._schema) report = None diff --git a/src/coffea/nanoevents/methods/base.py b/src/coffea/nanoevents/methods/base.py index 896f53a25..3e06162da 100644 --- a/src/coffea/nanoevents/methods/base.py +++ b/src/coffea/nanoevents/methods/base.py @@ -26,14 +26,22 @@ def __call__(self, coll: awkward.Array, *args: Any, **kwargs: Any) -> awkward.Ar @awkward.mixin_class(behavior) class Systematic: - """A base mixin class to describe and build variations on a feature of an nanoevents object.""" + """A base mixin class to describe and build variations on a feature of a nanoevents object.""" _systematic_kinds = set() @classmethod def add_kind(cls, kind: str): """ - Register a type of systematic variation, it must fulfill the base class interface. + Register a type of systematic variation, which must fulfill the base class interface. Types of + systematic variations must be registered here before an actual systematic of that type can be + added. For example, by default an up/down systematic is registered, as described in + `coffea.nanoevents.methods.systematics.UpDownSystematic`. + + Parameters + ---------- + kind: str + The name of the type of systematic described by this class """ cls._systematic_kinds.add(kind) @@ -96,10 +104,19 @@ def add_systematic( varying_function: Callable, ): """ - name: str, name of the systematic variation / uncertainty source - kind: str, the name of the kind of systematic variation - what: Union[str, List[str], Tuple[str]], name what gets varied, this could be a list or tuple of column names - varying_function: Union[function, bound method], a function that describes how 'what' is varied, it must close over all non-event-data arguments. + Add a systematic to the nanoevents object's `systematics` field, with field name `name`, of kind `kind` (must be registered + with `add_kind` already), and varying the objects under field(s) `what` with a function `varying_function`. + + Parameters + ---------- + name: str + Name of the systematic variation / uncertainty source + kind: str + The name of the kind of systematic variation + what: Union[str, List[str], Tuple[str]] + Name what gets varied, this could be a list or tuple of column names + varying_function: Union[function, bound method] + A function that describes how 'what' is varied, it must close over all non-event-data arguments. 
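+        A minimal sketch (field and function names are illustrative; ``vary_pt`` must
+        follow the convention of the registered kind, here the built-in ``UpDownSystematic``)::
+
+            events.add_systematic("PtScale", "UpDownSystematic", "pt", vary_pt)
+
+        after which the variations can be retrieved through the ``systematics`` field.
+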
""" self._ensure_systematics() diff --git a/src/coffea/nanoevents/methods/nanoaod.py b/src/coffea/nanoevents/methods/nanoaod.py index 13b1782ff..539e940e4 100644 --- a/src/coffea/nanoevents/methods/nanoaod.py +++ b/src/coffea/nanoevents/methods/nanoaod.py @@ -106,6 +106,9 @@ def hasFlags(self, *flags): @dask_property def parent(self): + """ + Accessor to the direct parent of this particle. + """ return self._events().GenPart._apply_global_index(self.genPartIdxMotherG) @parent.dask @@ -116,6 +119,9 @@ def parent(self, dask_array): @dask_property def distinctParent(self): + """ + Accessor to distinct (different PDG id) parent particle. + """ return self._events().GenPart._apply_global_index(self.distinctParentIdxG) @distinctParent.dask @@ -126,6 +132,10 @@ def distinctParent(self, dask_array): @dask_property def children(self): + """ + Accessor to direct children of this particle (not grandchildren). Includes particles + with the same PDG ID as this particle. + """ return self._events().GenPart._apply_global_index(self.childrenIdxG) @children.dask @@ -134,6 +144,12 @@ def children(self, dask_array): @dask_property def distinctChildren(self): + """ + Accessor to direct children of this particle which do not have the same PDG ID as + this particle. Note that this implies the summed four-momentum of the distinctChildren + may not sum to the four-momentum of this particle (for example, if this particle + radiates another particle type). If that behavior is desired, see `distinctChildrenDeep`. + """ return self._events().GenPart._apply_global_index(self.distinctChildrenIdxG) @distinctChildren.dask @@ -144,7 +160,12 @@ def distinctChildren(self, dask_array): @dask_property def distinctChildrenDeep(self): - """Accessor to distinct child particles with different PDG id, or last ones in the chain""" + """ + Accessor to distinct child particles with different PDG id, or last ones in the chain. + Note that this does not always find the correct children, since this sometimes depends + on the MC generator! See `here ` for more + information. + """ warnings.warn( "distinctChildrenDeep may not give correct answers for all generators!" 
) @@ -240,6 +261,7 @@ def isTight(self): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -248,6 +270,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -256,6 +279,7 @@ def matched_jet(self, dask_array): @dask_property def matched_photon(self): + """The associated photon as determined by the NanoAOD branch photonIdx""" return self._events().Photon._apply_global_index(self.photonIdxG) @matched_photon.dask @@ -281,6 +305,7 @@ class LowPtElectron(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Syst @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -289,6 +314,7 @@ def matched_gen(self, dask_array): @dask_property def matched_electron(self): + """The matched gen-level electron as determined by the NanoAOD branch electronIdx""" return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electron.dask @@ -299,6 +325,7 @@ def matched_electron(self, dask_array): @dask_property def matched_photon(self): + """The associated photon as determined by the NanoAOD branch photonIdx""" return self._events().Photon._apply_global_index(self.photonIdxG) @matched_photon.dask @@ -322,6 +349,7 @@ class Muon(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Systematic): @dask_property def matched_fsrPhoton(self): + """The matched FSR photon with the lowest dR/ET2. 
Accessed via the NanoAOD branch fsrPhotonIdx""" return self._events().FsrPhoton._apply_global_index(self.fsrPhotonIdxG) @matched_fsrPhoton.dask @@ -332,6 +360,7 @@ def matched_fsrPhoton(self, dask_array): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -340,6 +369,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -363,6 +393,7 @@ class Tau(candidate.PtEtaPhiMCandidate, base.NanoCollection, base.Systematic): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -371,6 +402,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -435,6 +467,7 @@ def isTight(self): @dask_property def matched_electron(self): + """The matched electron as determined by the NanoAOD branch electronIdx""" return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electron.dask @@ -445,6 +478,7 @@ def matched_electron(self, dask_array): @dask_property def matched_gen(self): + """The matched gen-level particle as determined by the NanoAOD branch genPartIdx""" return self._events().GenPart._apply_global_index(self.genPartIdxG) @matched_gen.dask @@ -453,6 +487,7 @@ def matched_gen(self, dask_array): @dask_property def matched_jet(self): + """The matched jet as determined by the NanoAOD branch jetIdx""" return self._events().Jet._apply_global_index(self.jetIdxG) @matched_jet.dask @@ -478,6 +513,7 @@ class FsrPhoton(candidate.PtEtaPhiMCandidate, base.NanoCollection): @dask_property def matched_muon(self): + """The matched muon as determined by the NanoAOD branch muonIdx""" return self._events().Muon._apply_global_index(self.muonIdxG) @matched_muon.dask @@ -527,6 +563,11 @@ def isTightLeptonVeto(self): @dask_property def matched_electrons(self): + """ + The matched electrons as determined by the NanoAOD branch electronIdx. The resulting awkward + array has two entries per jet, where if there are fewer than 2 electrons matched to a jet, the + innermost dimensions are padded with None to be of size 2. + """ return self._events().Electron._apply_global_index(self.electronIdxG) @matched_electrons.dask @@ -537,6 +578,11 @@ def matched_electrons(self, dask_array): @dask_property def matched_muons(self): + """ + The matched muons as determined by the NanoAOD branch muonIdx. The resulting awkward + array has two entries per jet, where if there are fewer than 2 muons matched to a jet, the + innermost dimensions are padded with None to be of size 2. 
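+
+        For example, a short sketch (assuming ``events`` was built with a NanoAOD
+        schema, and ``ak`` is the ``awkward`` package)::
+
+            mu_pairs = events.Jet.matched_muons   # two slots per jet, padded with None
+            real_mu = ak.drop_none(mu_pairs)      # drop the padding, keep actual matches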
+ """ return self._events().Muon._apply_global_index(self.muonIdxG) @matched_muons.dask @@ -545,6 +591,9 @@ def matched_muons(self, dask_array): @dask_property def matched_gen(self): + """ + AK4 jets made with visible genparticles, matched to this jet via the NanoAOD branch genJetIdx + """ return self._events().GenJet._apply_global_index(self.genJetIdxG) @matched_gen.dask @@ -616,6 +665,7 @@ def subjets(self, dask_array): @dask_property def matched_gen(self): + """AK8 jets made of visible genparticles, matched via the NanoAOD branch genJetAK8Idx""" return self._events().GenJetAK8._apply_global_index(self.genJetAK8IdxG) @matched_gen.dask @@ -655,6 +705,7 @@ class MissingET(vector.PolarTwoVector, base.NanoCollection, base.Systematic): @property def r(self): + """Distance from origin in XY plane""" return self["pt"] diff --git a/src/coffea/nanoevents/schemas/base.py b/src/coffea/nanoevents/schemas/base.py index 59a4b4285..4b1705c9b 100644 --- a/src/coffea/nanoevents/schemas/base.py +++ b/src/coffea/nanoevents/schemas/base.py @@ -122,12 +122,12 @@ def __init__(self, base_form, *args, **kwargs): @property def form(self): - """Awkward form of this schema""" + """Awkward form of this schema (dict)""" return self._form @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import base return base.behavior diff --git a/src/coffea/nanoevents/schemas/delphes.py b/src/coffea/nanoevents/schemas/delphes.py index 5b1095ee6..0e708e806 100644 --- a/src/coffea/nanoevents/schemas/delphes.py +++ b/src/coffea/nanoevents/schemas/delphes.py @@ -68,6 +68,9 @@ class DelphesSchema(BaseSchema): "Rho": "Rho", "ScalarHT": "ScalarHT", } + """ + Default configuration for mixin types, based on the collection name. + """ # These are stored as length-1 vectors unnecessarily singletons = [ @@ -79,6 +82,10 @@ class DelphesSchema(BaseSchema): "ScalarHT", "MissingET", ] + """ + Fields that are stored as length-1 vectors in Delphes, to be flattened out in nanoevents + (removing an unnecessary level of nesting). 
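+
+    For example, after flattening, a singleton such as ``ScalarHT`` should be
+    readable as one record per event (an illustrative sketch; the field names
+    follow the Delphes output)::
+
+        events.ScalarHT.HT   # one value per event, no length-1 list to unwrap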
+ """ docstrings = { "AlphaQCD": "value of the QCD coupling used in the event, see hep-ph/0109068", @@ -197,6 +204,9 @@ class DelphesSchema(BaseSchema): "ZOuter": "position (z component) at the edge", "Zd": "Z coordinate of point of closest approach to vertex", } + """ + The docstrings for each field in the resulting nanoevents + """ def __init__(self, base_form, version="latest", *args, **kwargs): super().__init__(base_form) @@ -310,7 +320,7 @@ def _preprocess_branch_form(objname, form): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import delphes return delphes.behavior diff --git a/src/coffea/nanoevents/schemas/nanoaod.py b/src/coffea/nanoevents/schemas/nanoaod.py index 601b7f34e..8e731731b 100644 --- a/src/coffea/nanoevents/schemas/nanoaod.py +++ b/src/coffea/nanoevents/schemas/nanoaod.py @@ -43,8 +43,8 @@ class NanoAODSchema(BaseSchema): """ __dask_capable__ = True - warn_missing_crossrefs = True - error_missing_event_ids = True + warn_missing_crossrefs = True # If True, issues a warning when a missing global index cross-ref target is encountered + error_missing_event_ids = True # If True, raises an exception when 'run', 'event', or 'luminosityBlock' fields are missing event_ids = ["run", "luminosityBlock", "event"] """List of NanoAOD event IDs @@ -189,17 +189,34 @@ def v7(cls, base_form): For example, one can use ``NanoEventsFactory.from_root("file.root", schemaclass=NanoAODSchema.v7)`` to ensure NanoAODv7 compatibility. + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv7 """ return cls(base_form, version="7") @classmethod def v6(cls, base_form): - """Build the NanoEvents assuming NanoAODv6""" + """Build the NanoEvents assuming NanoAODv6 + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv6 + """ return cls(base_form, version="6") @classmethod def v5(cls, base_form): - """Build the NanoEvents assuming NanoAODv5""" + """Build the NanoEvents assuming NanoAODv5 + + Returns + ------- + out: NanoAODSchema + Schema assuming NanoAODv5 + """ return cls(base_form, version="5") def _build_collections(self, field_names, input_contents): @@ -327,7 +344,7 @@ def _build_collections(self, field_names, input_contents): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import nanoaod return nanoaod.behavior diff --git a/src/coffea/nanoevents/schemas/physlite.py b/src/coffea/nanoevents/schemas/physlite.py index 7405b96aa..64ffc2e5b 100644 --- a/src/coffea/nanoevents/schemas/physlite.py +++ b/src/coffea/nanoevents/schemas/physlite.py @@ -159,7 +159,7 @@ def _create_eventindex_form(base_form, key): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import physlite return physlite.behavior diff --git a/src/coffea/nanoevents/schemas/treemaker.py b/src/coffea/nanoevents/schemas/treemaker.py index a90bd609b..754ad186d 100644 --- a/src/coffea/nanoevents/schemas/treemaker.py +++ b/src/coffea/nanoevents/schemas/treemaker.py @@ -166,7 +166,7 @@ def _build_collections(self, branch_forms): @classmethod def behavior(cls): - """Behaviors necessary to implement this schema""" + """Behaviors necessary to implement this schema (dict)""" from coffea.nanoevents.methods import base, vector behavior = {} @@ 
-181,6 +181,17 @@ def uproot_writeable(cls, events):
         writeable. Based off the discussion thread here [1], but added specific
         cased to handled the nested structures define for TreeMaker n-tuples.
         [1] https://github.com/CoffeaTeam/coffea/discussions/735
+
+        Parameters
+        ----------
+        events: TreeMakerSchema events
+            The TreeMakerSchema events to be converted into an uproot-writeable form.
+
+        Returns
+        -------
+        out: dict
+            An uproot-writeable dictionary representing the same information as the
+            input TreeMakerSchema events.
         """
         import awkward as ak