From 9051437e85e2fafcc6513817170a3c5031cfc04a Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 20 Jun 2024 11:30:46 -0400 Subject: [PATCH 01/21] Update gen processor --- .../mc_validation/mc_validation_gen_processor.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/analysis/mc_validation/mc_validation_gen_processor.py b/analysis/mc_validation/mc_validation_gen_processor.py index e0e68c7d4..8d0657732 100644 --- a/analysis/mc_validation/mc_validation_gen_processor.py +++ b/analysis/mc_validation/mc_validation_gen_processor.py @@ -2,14 +2,18 @@ import numpy as np import awkward as ak np.seterr(divide='ignore', invalid='ignore', over='ignore') -from coffea import hist, processor +from coffea import processor +import hist from topcoffea.modules.GetValuesFromJsons import get_lumi from topcoffea.modules.objects import * #from topcoffea.modules.corrections import get_ht_sf from topcoffea.modules.selection import * -from topcoffea.modules.HistEFT import HistEFT +from topcoffea.modules.histEFT import HistEFT import topcoffea.modules.eft_helper as efth +from topcoffea.modules.get_param_from_jsons import GetParam +from topcoffea.modules.paths import topcoffea_path +get_tc_param = GetParam(topcoffea_path("params/params.json")) class AnalysisProcessor(processor.ProcessorABC): @@ -100,7 +104,7 @@ def process(self, events): # Jet object selection genjet = genjet[genjet.pt > 30] - is_clean_jet = isClean(genjet, gen_e, drmin=0.4) & isClean(genjet, gen_m, drmin=0.4) & isClean(genjet, gen_t, drmin=0.4) + is_clean_jet = te_os.isClean(genjet, gen_e, drmin=0.4) & te_os.isClean(genjet, gen_m, drmin=0.4) & te_os.isClean(genjet, gen_t, drmin=0.4) genjet_clean = genjet[is_clean_jet] njets = ak.num(genjet_clean) @@ -171,8 +175,8 @@ def process(self, events): event_weight_cut = event_weight[isnotnone_mask] eft_coeffs_cut = eft_coeffs if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] - eft_w2_coeffs_cut = eft_w2_coeffs - if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] + #eft_w2_coeffs_cut = eft_w2_coeffs + #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] # Fill the histos axes_fill_info_dict = { @@ -180,7 +184,7 @@ def process(self, events): "sample" : histAxisName, "weight" : event_weight_cut, "eft_coeff" : eft_coeffs_cut, - "eft_err_coeff" : eft_w2_coeffs_cut, + #"eft_err_coeff" : eft_w2_coeffs_cut, } hout[dense_axis_name].fill(**axes_fill_info_dict) From 829468e5da0a056562384aead2b912c0392039fe Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 20 Jun 2024 11:51:47 -0400 Subject: [PATCH 02/21] Updated gen processor --- .../mc_validation_gen_processor.py | 37 +- analysis/mc_validation/run_analysis.py | 365 ++++++++++++++++++ 2 files changed, 383 insertions(+), 19 deletions(-) create mode 100644 analysis/mc_validation/run_analysis.py diff --git a/analysis/mc_validation/mc_validation_gen_processor.py b/analysis/mc_validation/mc_validation_gen_processor.py index 8d0657732..eccb83b9b 100644 --- a/analysis/mc_validation/mc_validation_gen_processor.py +++ b/analysis/mc_validation/mc_validation_gen_processor.py @@ -5,10 +5,7 @@ from coffea import processor import hist -from topcoffea.modules.GetValuesFromJsons import get_lumi -from topcoffea.modules.objects import * -#from topcoffea.modules.corrections import get_ht_sf -from topcoffea.modules.selection import * +import topeft.modules.object_selection as te_os from topcoffea.modules.histEFT import HistEFT import topcoffea.modules.eft_helper as 
efth from topcoffea.modules.get_param_from_jsons import GetParam @@ -18,24 +15,26 @@ class AnalysisProcessor(processor.ProcessorABC): - def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, muonSyst='nominal', dtype=np.float32): + def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, dtype=np.float32): self._samples = samples self._wc_names_lst = wc_names_lst self._dtype = dtype # Create the histograms - self._accumulator = processor.dict_accumulator({ - "mll_fromzg_e" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("mll_fromzg_e", "invmass ee from z/gamma", 40, 0, 200)), - "mll_fromzg_m" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("mll_fromzg_m", "invmass mm from z/gamma", 40, 0, 200)), - "mll_fromzg_t" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("mll_fromzg_t", "invmass tautau from z/gamma", 40, 0, 200)), - "mll" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("mll", "Invmass l0l1", 60, 0, 600)), - "ht" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("ht", "Scalar sum of genjet pt", 100, 0, 1000)), - "ht_clean" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("ht_clean", "Scalar sum of clean genjet pt", 100, 0, 1000)), - "tops_pt" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("tops_pt", "Pt of the sum of the tops", 50, 0, 500)), - "tX_pt" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("tX_pt", "Pt of the t(t)X system", 40, 0, 400)), - "njets" : HistEFT("Events", wc_names_lst, hist.Cat("sample", "sample"), hist.Bin("njets", "njets", 10, 0, 10)), - }) + proc_axis = hist.axis.StrCategory([], name="process", growth=True) + self._accumulator = { + "mll_fromzg_e" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_m" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_t" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll" : HistEFT(proc_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), + "ht" : HistEFT(proc_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), + "ht_clean" : HistEFT(proc_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), + "tops_pt" : HistEFT(proc_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "tX_pt" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, 
rebin=False), + } # Set the list of hists to fill if hist_lst is None: @@ -154,7 +153,7 @@ def process(self, events): # If this is not an eft sample, get the genWeight if eft_coeffs is None: genw = events["genWeight"] else: genw = np.ones_like(events["event"]) - lumi = get_lumi(year)*1000.0 + lumi = 1000.0*get_tc_param(f"lumi_{year}") event_weight = lumi*xsec*genw/sow # Example of reweighting based on Ht @@ -165,7 +164,7 @@ def process(self, events): ### Loop over the hists we want to fill ### - hout = self.accumulator.identity() + hout = self.accumulator for dense_axis_name, dense_axis_vals in dense_axis_dict.items(): @@ -181,7 +180,7 @@ def process(self, events): # Fill the histos axes_fill_info_dict = { dense_axis_name : dense_axis_vals_cut, - "sample" : histAxisName, + "process" : histAxisName, "weight" : event_weight_cut, "eft_coeff" : eft_coeffs_cut, #"eft_err_coeff" : eft_w2_coeffs_cut, diff --git a/analysis/mc_validation/run_analysis.py b/analysis/mc_validation/run_analysis.py new file mode 100644 index 000000000..650a8dbac --- /dev/null +++ b/analysis/mc_validation/run_analysis.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python + +import argparse +import json +import time +import cloudpickle +import gzip +import os + +from coffea import processor +from coffea.nanoevents import NanoAODSchema + +import topcoffea.modules.utils as utils +import topcoffea.modules.remote_environment as remote_environment + +from topeft.modules.dataDrivenEstimation import DataDrivenProducer +from topeft.modules.get_renormfact_envelope import get_renormfact_envelope +import mc_validation_gen_processor as gen_processor + +LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"] + +WGT_VAR_LST = [ + "nSumOfWeights_ISRUp", + "nSumOfWeights_ISRDown", + "nSumOfWeights_FSRUp", + "nSumOfWeights_FSRDown", + "nSumOfWeights_renormUp", + "nSumOfWeights_renormDown", + "nSumOfWeights_factUp", + "nSumOfWeights_factDown", + "nSumOfWeights_renormfactUp", + "nSumOfWeights_renormfactDown", +] + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='You can customize your run') + parser.add_argument('jsonFiles' , nargs='?', default='', help = 'Json file(s) containing files and metadata') + parser.add_argument('--executor','-x' , default='work_queue', help = 'Which executor to use') + parser.add_argument('--prefix', '-r' , nargs='?', default='', help = 'Prefix or redirector to look for the files') + parser.add_argument('--test','-t' , action='store_true' , help = 'To perform a test, run over a few events in a couple of chunks') + parser.add_argument('--pretend' , action='store_true', help = 'Read json files but, not execute the analysis') + parser.add_argument('--nworkers','-n' , default=8 , help = 'Number of workers') + parser.add_argument('--chunksize','-s' , default=100000, help = 'Number of events per chunk') + parser.add_argument('--nchunks','-c' , default=None, help = 'You can choose to run only a number of chunks') + parser.add_argument('--outname','-o' , default='plotsTopEFT', help = 'Name of the output file with histograms') + parser.add_argument('--outpath','-p' , default='histos', help = 'Name of the output directory') + parser.add_argument('--treename' , default='Events', help = 'Name of the tree inside the files') + parser.add_argument('--do-errors' , action='store_true', help = 'Save the w**2 coefficients') + parser.add_argument('--do-systs', action='store_true', help = 'Compute systematic variations') + parser.add_argument('--split-lep-flavor', action='store_true', help = 'Split up categories by 
lepton flavor') + parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories') + parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories') + parser.add_argument('--do-np' , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. Note that signal, background and data samples should all be processed together in order for this option to make sense.') + parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified with the the same name as the original.') + parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.') + parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.') + parser.add_argument('--ecut', default=None , help = 'Energy cut threshold i.e. throw out events above this (GeV)') + parser.add_argument('--port', default='9123-9130', help = 'Specify the Work Queue port. An integer PORT or an integer range PORT_MIN-PORT_MAX.') + + + args = parser.parse_args() + jsonFiles = args.jsonFiles + prefix = args.prefix + executor = args.executor + dotest = args.test + nworkers = int(args.nworkers) + chunksize = int(args.chunksize) + nchunks = int(args.nchunks) if not args.nchunks is None else args.nchunks + outname = args.outname + outpath = args.outpath + pretend = args.pretend + treename = args.treename + do_errors = args.do_errors + do_systs = args.do_systs + split_lep_flavor = args.split_lep_flavor + skip_sr = args.skip_sr + skip_cr = args.skip_cr + do_np = args.do_np + do_renormfact_envelope = args.do_renormfact_envelope + wc_lst = args.wc_list if args.wc_list is not None else [] + + # Check if we have valid options + if executor not in LST_OF_KNOWN_EXECUTORS: + raise Exception(f"The \"{executor}\" executor is not known. Please specify an executor from the known executors ({LST_OF_KNOWN_EXECUTORS}). Exiting.") + if do_renormfact_envelope: + if not do_systs: + raise Exception("Error: Cannot specify do_renormfact_envelope if we are not including systematics.") + if not do_np: + raise Exception("Error: Cannot specify do_renormfact_envelope if we have not already done the integration across the appl axis that occurs in the data driven estimator step.") + if dotest: + if executor == "futures": + nchunks = 2 + chunksize = 10000 + nworkers = 1 + print('Running a fast test with %i workers, %i chunks of %i events'%(nworkers, nchunks, chunksize)) + else: + raise Exception(f"The \"test\" option is not set up to work with the {executor} executor. 
Exiting.") + + + # Set the threshold for the ecut (if not applying a cut, should be None) + ecut_threshold = args.ecut + if ecut_threshold is not None: ecut_threshold = float(args.ecut) + + if executor == "work_queue": + # construct wq port range + port = list(map(int, args.port.split('-'))) + if len(port) < 1: + raise ValueError("At least one port value should be specified.") + if len(port) > 2: + raise ValueError("More than one port range was specified.") + if len(port) == 1: + # convert single values into a range of one element + port.append(port[0]) + + # Figure out which hists to include + if args.hist_list == ["ana"]: + # Here we hardcode a list of hists used for the analysis + hist_lst = ["cosZ", "cosq", "azim", "tops_pt", "tX_pt", "njets"] + elif args.hist_list == ["cr"]: + # Here we hardcode a list of hists used for the CRs + hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"] + else: + # We want to specify a custom list + # If we don't specify this argument, it will be None, and the processor will fill all hists + hist_lst = args.hist_list + + + ### Load samples from json + samplesdict = {} + allInputFiles = [] + + def LoadJsonToSampleName(jsonFile, prefix): + sampleName = jsonFile if not '/' in jsonFile else jsonFile[jsonFile.rfind('/')+1:] + if sampleName.endswith('.json'): sampleName = sampleName[:-5] + with open(jsonFile) as jf: + samplesdict[sampleName] = json.load(jf) + samplesdict[sampleName]['redirector'] = prefix + + if isinstance(jsonFiles, str) and ',' in jsonFiles: + jsonFiles = jsonFiles.replace(' ', '').split(',') + elif isinstance(jsonFiles, str): + jsonFiles = [jsonFiles] + for jsonFile in jsonFiles: + if os.path.isdir(jsonFile): + if not jsonFile.endswith('/'): jsonFile+='/' + for f in os.path.listdir(jsonFile): + if f.endswith('.json'): allInputFiles.append(jsonFile+f) + else: + allInputFiles.append(jsonFile) + + # Read from cfg files + for f in allInputFiles: + if not os.path.isfile(f): + raise Exception(f'[ERROR] Input file {f} not found!') + # This input file is a json file, not a cfg + if f.endswith('.json'): + LoadJsonToSampleName(f, prefix) + # Open cfg files + else: + with open(f) as fin: + print(' >> Reading json from cfg file...') + lines = fin.readlines() + for l in lines: + if '#' in l: + l=l[:l.find('#')] + l = l.replace(' ', '').replace('\n', '') + if l == '': continue + if ',' in l: + l = l.split(',') + for nl in l: + if not os.path.isfile(l): + prefix = nl + else: + LoadJsonToSampleName(nl, prefix) + else: + if not os.path.isfile(l): + prefix = l + else: + LoadJsonToSampleName(l, prefix) + + flist = {} + nevts_total = 0 + for sname in samplesdict.keys(): + redirector = samplesdict[sname]['redirector'] + flist[sname] = [(redirector+f) for f in samplesdict[sname]['files']] + samplesdict[sname]['year'] = samplesdict[sname]['year'] + samplesdict[sname]['xsec'] = float(samplesdict[sname]['xsec']) + samplesdict[sname]['nEvents'] = int(samplesdict[sname]['nEvents']) + nevts_total += samplesdict[sname]['nEvents'] + samplesdict[sname]['nGenEvents'] = int(samplesdict[sname]['nGenEvents']) + samplesdict[sname]['nSumOfWeights'] = float(samplesdict[sname]['nSumOfWeights']) + if not samplesdict[sname]["isData"]: + for wgt_var in WGT_VAR_LST: + # Check that MC samples have all needed weight sums (only needed if doing systs) + if do_systs: + if (wgt_var not in samplesdict[sname]): + raise Exception(f"Missing weight variation \"{wgt_var}\".") + else: + samplesdict[sname][wgt_var] = 
+        # Print file info
+        print('>> '+sname)
+        print('   - isData?      : %s' %('YES' if samplesdict[sname]['isData'] else 'NO'))
+        print('   - year         : %s' %samplesdict[sname]['year'])
+        print('   - xsec         : %f' %samplesdict[sname]['xsec'])
+        print('   - histAxisName : %s' %samplesdict[sname]['histAxisName'])
+        print('   - options      : %s' %samplesdict[sname]['options'])
+        print('   - tree         : %s' %samplesdict[sname]['treeName'])
+        print('   - nEvents      : %i' %samplesdict[sname]['nEvents'])
+        print('   - nGenEvents   : %i' %samplesdict[sname]['nGenEvents'])
+        print('   - SumWeights   : %i' %samplesdict[sname]['nSumOfWeights'])
+        if not samplesdict[sname]["isData"]:
+            for wgt_var in WGT_VAR_LST:
+                if wgt_var in samplesdict[sname]:
+                    print(f'   - {wgt_var}: {samplesdict[sname][wgt_var]}')
+        print('   - Prefix       : %s' %samplesdict[sname]['redirector'])
+        print('   - nFiles       : %i' %len(samplesdict[sname]['files']))
+        for fname in samplesdict[sname]['files']: print('     %s'%fname)
+
+    if pretend:
+        print('pretending...')
+        exit()
+
+    # Extract the list of all WCs, as long as we haven't already specified one.
+    if len(wc_lst) == 0:
+        for k in samplesdict.keys():
+            for wc in samplesdict[k]['WCnames']:
+                if wc not in wc_lst:
+                    wc_lst.append(wc)
+
+    if len(wc_lst) > 0:
+        # Yes, why not have the output be in correct English?
+        if len(wc_lst) == 1:
+            wc_print = wc_lst[0]
+        elif len(wc_lst) == 2:
+            wc_print = wc_lst[0] + ' and ' + wc_lst[1]
+        else:
+            wc_print = ', '.join(wc_lst[:-1]) + ', and ' + wc_lst[-1]
+        print('Wilson Coefficients: {}.'.format(wc_print))
+    else:
+        print('No Wilson coefficients specified')
+
+    processor_instance = gen_processor.AnalysisProcessor(samplesdict,wc_lst,hist_lst,ecut_threshold,do_errors,do_systs,split_lep_flavor,skip_sr,skip_cr)
+
+    if executor == "work_queue":
+        executor_args = {
+            'master_name': '{}-workqueue-coffea'.format(os.environ['USER']),
+
+            # find a port to run work queue in this range:
+            'port': port,
+
+            'debug_log': 'debug.log',
+            'transactions_log': 'tr.log',
+            'stats_log': 'stats.log',
+            'tasks_accum_log': 'tasks.log',
+
+            'environment_file': remote_environment.get_environment(
+                extra_pip_local = {"topeft": ["topeft", "setup.py"]},
+            ),
+            'filepath': f'/project01/ndcms/{os.environ["USER"]}',
+            'extra_input_files': ["mc_validation_gen_processor.py"],
+
+            'retries': 5,
+
+            # use mid-range compression for chunk results. 9 is the default for work
+            # queue in coffea. Valid values are 0 (minimum compression, less memory
+            # usage) to 16 (maximum compression, more memory usage).
+            'compression': 9,
+
+            # automatically find an adequate resource allocation for tasks.
+            # tasks are first tried using the maximum resources seen of previously run
+            # tasks. on resource exhaustion, they are retried with the maximum resource
+            # values, if specified below. if a maximum is not specified, the task waits
+            # forever until a larger worker connects.
+            'resource_monitor': True,
+            'resources_mode': 'auto',
+
+            # these resource values may be omitted when using
+            # resources_mode: 'auto', but they do make the initial portion
+            # of a workflow run a little bit faster.
+            # Rather than using whole workers in the exploratory mode of
+            # resources_mode: auto, tasks are forever limited to a maximum
+            # of 8GB of mem and disk.
+            #
+            # NOTE: The very first tasks in the exploratory
+            # mode will use the values specified here, so workers need to be at least
+            # this large. If left unspecified, tasks will use whole workers in the
+            # exploratory mode.
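+            # For example (illustrative, using the commented values below): declaring
+            # 'cores': 1 and 'memory': 10000 makes the first exploratory tasks request
+            # exactly that much, so no worker smaller than 10000 MB can receive them.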
+            # 'cores': 1,
+            # 'disk': 8000,   #MB
+            # 'memory': 10000, #MB
+
+            # control the size of accumulation tasks. Results are
+            # accumulated in groups of size chunks_per_accum, keeping at
+            # most chunks_per_accum at the same time in memory per task.
+            'chunks_per_accum': 25,
+            'chunks_accum_in_mem': 2,
+
+            # terminate workers on which tasks have been running longer than average.
+            # This is useful for temporary conditions on worker nodes where a task would
+            # finish faster if run on another worker.
+            # the time limit is computed by multiplying the average runtime of tasks
+            # by the value of 'fast_terminate_workers'. Since some tasks can be
+            # legitimately slow, no task can trigger the termination of workers twice.
+            #
+            # warning: small values (e.g. close to 1) may cause the workflow to misbehave,
+            # as most tasks will be terminated.
+            #
+            # Less than 1 disables it.
+            'fast_terminate_workers': 0,
+
+            # print messages when tasks are submitted, finished, etc.,
+            # together with their resource allocation and usage. If a task
+            # fails, its standard output is also printed, so we can turn
+            # off print_stdout for all tasks.
+            'verbose': True,
+            'print_stdout': False,
+        }
+
+    # Run the processor and get the output
+    tstart = time.time()
+
+    if executor == "futures":
+        exec_instance = processor.futures_executor(workers=nworkers)
+        runner = processor.Runner(exec_instance, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks)
+    elif executor == "work_queue":
+        executor = processor.WorkQueueExecutor(**executor_args)
+        runner = processor.Runner(executor, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks, skipbadfiles=False, xrootdtimeout=180)
+
+    output = runner(flist, treename, processor_instance)
+
+    dt = time.time() - tstart
+
+    if executor == "work_queue":
+        print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))
+
+    #nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    #nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    #print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))
+
+    if executor == "futures":
+        print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, ))
+
+    # Save the output
+    if not os.path.isdir(outpath): os.system("mkdir -p %s"%outpath)
+    out_pkl_file = os.path.join(outpath,outname+".pkl.gz")
+    print(f"\nSaving output in {out_pkl_file}...")
+    with gzip.open(out_pkl_file, "wb") as fout:
+        cloudpickle.dump(output, fout)
+    print("Done!")
+
+    # Run the data driven estimation, save the output
+    if do_np:
+        print("\nDoing the nonprompt estimation...")
+        out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
+        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
+        print(f"Saving output in {out_pkl_file_name_np}...")
+        ddp.dumpToPickle()
+        print("Done!")
+
+    # Run the renorm fact envelope calculation
+    if do_renormfact_envelope:
+        print("\nDoing the renorm. fact. 
envelope calculation...") + dict_of_histos = utils.get_hist_from_pkl(out_pkl_file_name_np,allow_empty=False) + dict_of_histos_after_applying_envelope = get_renormfact_envelope(dict_of_histos) + utils.dump_to_pkl(out_pkl_file_name_np,dict_of_histos_after_applying_envelope) From 1cf2c604002a576e7016143b2923c4d144bd81b6 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 20 Jun 2024 12:37:23 -0400 Subject: [PATCH 03/21] Fix lint error (duplicate line) --- analysis/mc_validation/mc_validation_gen_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/analysis/mc_validation/mc_validation_gen_processor.py b/analysis/mc_validation/mc_validation_gen_processor.py index eccb83b9b..6012b4015 100644 --- a/analysis/mc_validation/mc_validation_gen_processor.py +++ b/analysis/mc_validation/mc_validation_gen_processor.py @@ -33,7 +33,6 @@ def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, "tops_pt" : HistEFT(proc_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), "tX_pt" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), "njets" : HistEFT(proc_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), - "njets" : HistEFT(proc_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), } # Set the list of hists to fill From c86ed658feca4842b227a4ef79edbac3e9a5ecf1 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 20 Jun 2024 14:06:49 -0400 Subject: [PATCH 04/21] Add l0pt and j0pt --- analysis/mc_validation/mc_validation_gen_processor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis/mc_validation/mc_validation_gen_processor.py b/analysis/mc_validation/mc_validation_gen_processor.py index 6012b4015..4c64a7b90 100644 --- a/analysis/mc_validation/mc_validation_gen_processor.py +++ b/analysis/mc_validation/mc_validation_gen_processor.py @@ -31,6 +31,8 @@ def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, "ht" : HistEFT(proc_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), "ht_clean" : HistEFT(proc_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), "tops_pt" : HistEFT(proc_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "l0_pt" : HistEFT(proc_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), + "j0_pt" : HistEFT(proc_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), "tX_pt" : HistEFT(proc_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), "njets" : HistEFT(proc_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), } @@ -134,6 +136,8 @@ def process(self, events): "ht_clean" : ht_clean, "tX_pt" : tX_pt, "tops_pt" : tops_pt, + "l0_pt" : ak.firsts(gen_l.pt), + "j0_pt" : ak.firsts(genjet.pt), "njets" : njets, } From 12f023242c02006fdf2493308426766daaa282a9 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Fri, 28 Jun 2024 16:37:37 -0400 Subject: [PATCH 05/21] Gen photon processor --- 
analysis/mc_validation/README.md              |   3 +
 analysis/mc_validation/gen_photon_rocessor.py | 268 ++++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 analysis/mc_validation/gen_photon_rocessor.py

diff --git a/analysis/mc_validation/README.md b/analysis/mc_validation/README.md
index e31ffc4d6..b4a7ce1a4 100644
--- a/analysis/mc_validation/README.md
+++ b/analysis/mc_validation/README.md
@@ -16,5 +16,8 @@ This directory contains scripts from the validation studies of the FullR2 privat
 - Should be run on the output of the topeft processor
 - Was used during the June 2022 MC validation studies (for TOP-22-006 pre approval checks)

+* `gen_photon_rocessor.py`:
+  - This script produces a pkl file compatible with the datacard maker using nanoGEN files.
+

diff --git a/analysis/mc_validation/gen_photon_rocessor.py b/analysis/mc_validation/gen_photon_rocessor.py
new file mode 100644
index 000000000..f3b6228da
--- /dev/null
+++ b/analysis/mc_validation/gen_photon_rocessor.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+import numpy as np
+import awkward as ak
+np.seterr(divide='ignore', invalid='ignore', over='ignore')
+from coffea import processor
+from coffea.analysis_tools import PackedSelection
+import hist
+
+import topeft.modules.object_selection as te_os
+from topcoffea.modules.histEFT import HistEFT
+import topcoffea.modules.eft_helper as efth
+from topcoffea.modules.get_param_from_jsons import GetParam
+from topcoffea.modules.paths import topcoffea_path
+get_tc_param = GetParam(topcoffea_path("params/params.json"))
+def construct_cat_name(chan_str,njet_str=None,flav_str=None):
+
+    # Get the component strings
+    nlep_str = chan_str.split("_")[0] # Assumes n leps comes first in the str
+    chan_str = "_".join(chan_str.split("_")[1:]) # The rest of the channel name is everything that comes after nlep
+    if chan_str == "": chan_str = None # So that we properly skip this in the for loop below
+    if flav_str is not None:
+        flav_str = flav_str
+    if njet_str is not None:
+        njet_str = njet_str[-2:] # Assumes number of n jets comes at the end of the string
+        if "j" not in njet_str:
+            # The njet string should really have a "j" in it
+            raise Exception(f"Something went wrong while trying to construct the channel name, is \"{njet_str}\" an njet string?")
+
+    # Put the component strings into the channel name
+    ret_str = nlep_str
+    for component in [flav_str,chan_str,njet_str]:
+        if component is None: continue
+        ret_str = "_".join([ret_str,component])
+    return ret_str
+
+
+class AnalysisProcessor(processor.ProcessorABC):
+
+    def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, dtype=np.float32):
+
+        self._samples = samples
+        self._wc_names_lst = wc_names_lst
+        self._dtype = dtype
+
+        # Create the histograms
+        proc_axis = hist.axis.StrCategory([], name="process", growth=True)
+        chan_axis = hist.axis.StrCategory([], name="channel", growth=True)
+        syst_axis = hist.axis.StrCategory([], name="systematic", growth=True)
+        self._accumulator = {
+            "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False),
+            "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False),
+            "mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, 
hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), + "ht" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), + "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), + "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), + "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), + "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), + "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), + } + + # Set the list of hists to fill + if hist_lst is None: + # If the hist list is none, assume we want to fill all hists + self._hist_lst = list(self._accumulator.keys()) + else: + # Otherwise, just fill the specified subset of hists + for hist_to_include in hist_lst: + if hist_to_include not in self._accumulator.keys(): + raise Exception(f"Error: Cannot specify hist \"{hist_to_include}\", it is not defined in the processor.") + self._hist_lst = hist_lst # Which hists to fill + + # Set the booleans + self._do_errors = do_errors # Whether to calculate and store the w**2 coefficients + + + @property + def accumulator(self): + return self._accumulator + + @property + def columns(self): + return self._columns + + # Main function: run on a given dataset + def process(self, events): + + ### Dataset parameters ### + dataset = events.metadata["dataset"] + + isData = self._samples[dataset]["isData"] + histAxisName = self._samples[dataset]["histAxisName"] + year = self._samples[dataset]["year"] + xsec = self._samples[dataset]["xsec"] + sow = self._samples[dataset]["nSumOfWeights"] + + if isData: raise Exception("Error: This processor is not for data") + + selections = PackedSelection(dtype='uint64') + + ### Get gen particles collection ### + genpart = events.GenPart + genjet = events.GenJet + + + ### Lepton object selection ### + + is_final_mask = genpart.hasFlags(["fromHardProcess","isLastCopy"]) + + gen_top = ak.pad_none(genpart[is_final_mask & (abs(genpart.pdgId) == 6)],2) + + gen_l = genpart[is_final_mask & ((abs(genpart.pdgId) == 11) | (abs(genpart.pdgId) == 13) | (abs(genpart.pdgId) == 15))] + gen_e = genpart[is_final_mask & (abs(genpart.pdgId) == 11)] + gen_m = genpart[is_final_mask & (abs(genpart.pdgId) == 13)] + gen_t = genpart[is_final_mask & (abs(genpart.pdgId) == 15)] + + gen_p = genpart[is_final_mask & (abs(genpart.pdgId) == 22)] + + gen_l = 
gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)]
+        gen_l = ak.pad_none(gen_l,2)
+        gen_e = gen_e[ak.argsort(gen_e.pt, axis=-1, ascending=False)]
+        gen_m = gen_m[ak.argsort(gen_m.pt, axis=-1, ascending=False)]
+        gen_t = gen_t[ak.argsort(gen_t.pt, axis=-1, ascending=False)]
+
+        gen_p = gen_p[ak.argsort(gen_p.pt, axis=-1, ascending=False)]
+
+        gen_l_from_zg = ak.pad_none(gen_l[(gen_l.distinctParent.pdgId == 23) | (gen_l.distinctParent.pdgId == 22)], 2)
+        gen_e_from_zg = ak.pad_none(gen_e[(gen_e.distinctParent.pdgId == 23) | (gen_e.distinctParent.pdgId == 22)], 2)
+        gen_m_from_zg = ak.pad_none(gen_m[(gen_m.distinctParent.pdgId == 23) | (gen_m.distinctParent.pdgId == 22)], 2)
+        gen_t_from_zg = ak.pad_none(gen_t[(gen_t.distinctParent.pdgId == 23) | (gen_t.distinctParent.pdgId == 22)], 2)
+
+
+        # Jet object selection
+        genjet = genjet[genjet.pt > 30]
+        is_clean_jet = te_os.isClean(genjet, gen_e, drmin=0.4) & te_os.isClean(genjet, gen_m, drmin=0.4) & te_os.isClean(genjet, gen_t, drmin=0.4)
+        genjet_clean = genjet[is_clean_jet]
+        genbjet_clean = genjet[np.abs(genjet.partonFlavour)==5]
+        njets = ak.num(genjet_clean)
+        nbjets = ak.num(genbjet_clean)
+
+
+        is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13)
+        is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13)
+        selections.add("2los_CRtt", (ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ak.any(is_em | is_me) & (nbjets==2)) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement
+        #selections.add("2los_CRtt", (events.is2l_nozeeveto & charge2l_0 & events.is_em & bmask_exactly2med & pass_trg)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement
+        selections.add("atmost_3j" , (njets<=3))
+        ### Get dense axis variables ###
+
+        ht = ak.sum(genjet.pt,axis=-1)
+        ht_clean = ak.sum(genjet_clean.pt,axis=-1)
+
+        tops_pt = gen_top.sum().pt
+
+        # Pt of the t(t)X system
+        tX_system = ak.concatenate([gen_top,gen_l_from_zg],axis=1)
+        tX_pt = tX_system.sum().pt
+
+        # Invmass distributions
+        mll_e_from_zg = (gen_e_from_zg[:,0] + gen_e_from_zg[:,1]).mass
+        mll_m_from_zg = (gen_m_from_zg[:,0] + gen_m_from_zg[:,1]).mass
+        mll_t_from_zg = (gen_t_from_zg[:,0] + gen_t_from_zg[:,1]).mass
+        mll_l0l1 = (gen_l[:,0] + gen_l[:,1]).mass
+
+        # Dictionary of dense axis values
+        dense_axis_dict = {
+            "mll_fromzg_e" : mll_e_from_zg,
+            "mll_fromzg_m" : mll_m_from_zg,
+            "mll_fromzg_t" : mll_t_from_zg,
+            "mll"          : mll_l0l1,
+            "ht"           : ht,
+            "ht_clean"     : ht_clean,
+            "tX_pt"        : tX_pt,
+            "tops_pt"      : tops_pt,
+            "photon_pt"    : ak.firsts(gen_p).pt,
+            "l0_pt"        : ak.firsts(gen_l.pt),
+            "j0_pt"        : ak.firsts(genjet.pt),
+            "njets"        : njets,
+        }
+        selections.add("atleast_1j" , (njets>=1))
+        cat_dict = {
+                "2los_CRtt" : {
+                    "atmost_3j" : {
+                        "lep_chan_lst" : ["2los_CRtt_photon"],
+                        "lep_flav_lst" : ["em"],
+                        "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                    },
+                    "atleast_1j" : {
+                        "lep_chan_lst" : ["2los_CRtt_photon"],
+                        "lep_flav_lst" : ["em"],
+                        "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                    },
+                },
+        }
+
+
+        ### Get weights ###
+
+        # Extract the EFT quadratic coefficients and optionally use them to calculate the coefficients on the w**2 quartic function
+        # eft_coeffs is never Jagged so convert immediately to numpy for ease of use.
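+        # Illustrative sketch of the structure assumed here: for Wilson coefficients
+        # c_1..c_n the per-event weight is quadratic, w(c) = sum_{i<=j} s_ij*c_i*c_j
+        # with c_0 = 1, so EFTfitCoefficients holds the (n+1)*(n+2)/2 fit constants
+        # s_ij per event; squaring that polynomial gives the w**2 quartic coefficients.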
+        eft_coeffs = ak.to_numpy(events["EFTfitCoefficients"]) if hasattr(events, "EFTfitCoefficients") else None
+        if eft_coeffs is not None:
+            # Check to see if the ordering of WCs for this sample matches what we want
+            if self._samples[dataset]["WCnames"] != self._wc_names_lst:
+                eft_coeffs = efth.remap_coeffs(self._samples[dataset]["WCnames"], self._wc_names_lst, eft_coeffs)
+        eft_w2_coeffs = efth.calc_w2_coeffs(eft_coeffs,self._dtype) if (self._do_errors and eft_coeffs is not None) else None
+
+        # If this is not an eft sample, get the genWeight
+        if eft_coeffs is None: genw = events["genWeight"]
+        else: genw = np.ones_like(events["event"])
+        lumi = 1000.0*get_tc_param(f"lumi_{year}")
+        event_weight = lumi*xsec*genw/sow
+
+        # Example of reweighting based on Ht
+        #if "private" in histAxisName:
+        #    ht_sf = get_ht_sf(ht,histAxisName)
+        #    event_weight = event_weight*ht_sf
+
+
+        ### Loop over the hists we want to fill ###
+
+        hout = self.accumulator
+
+        for dense_axis_name, dense_axis_vals in dense_axis_dict.items():
+            if dense_axis_name not in self._hist_lst:
+                print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.")
+                continue
+            for chan in cat_dict:
+                flav_ch = None
+                njet_ch = None
+                njets_any_mask = selections.any(*cat_dict[chan].keys())
+                for njet_val in cat_dict[chan]:
+                    cuts_lst = [chan, njet_val]
+                    if dense_axis_name == "njets":
+                        all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask)
+                    else:
+                        njet_ch = njet_val
+                        all_cuts_mask = (selections.all(*cuts_lst))
+                    ch_name = construct_cat_name(chan,njet_str=njet_ch,flav_str=flav_ch)
+
+                    # Mask out the none values
+                    isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False))
+                    isnotnone_mask = isnotnone_mask & all_cuts_mask
+                    dense_axis_vals_cut = dense_axis_vals[isnotnone_mask]
+                    event_weight_cut = event_weight[isnotnone_mask]
+                    eft_coeffs_cut = eft_coeffs
+                    if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask]
+                    #eft_w2_coeffs_cut = eft_w2_coeffs
+                    #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask]
+
+                    # Fill the histos
+                    axes_fill_info_dict = {
+                        dense_axis_name : dense_axis_vals_cut,
+                        "process"    : histAxisName,
+                        "channel"    : ch_name,
+                        "systematic" : "nominal",
+                        "weight"     : event_weight_cut,
+                        "eft_coeff"  : eft_coeffs_cut,
+                        #"eft_err_coeff" : eft_w2_coeffs_cut,
+                    }
+
+                    hout[dense_axis_name].fill(**axes_fill_info_dict)
+
+        return hout
+
+    def postprocess(self, accumulator):
+        return accumulator

From d53dd21f7b4c1c1901c1564f7da5982d2747a844 Mon Sep 17 00:00:00 2001
From: Brent Yates
Date: Fri, 28 Jun 2024 17:20:18 -0400
Subject: [PATCH 06/21] Fix lint

---
 analysis/mc_validation/gen_photon_rocessor.py | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/analysis/mc_validation/gen_photon_rocessor.py b/analysis/mc_validation/gen_photon_rocessor.py
index f3b6228da..694cfd5e2 100644
--- a/analysis/mc_validation/gen_photon_rocessor.py
+++ b/analysis/mc_validation/gen_photon_rocessor.py
@@ -180,18 +180,18 @@ def process(self, events):
         }
         selections.add("atleast_1j" , (njets>=1))
         cat_dict = {
-                "2los_CRtt" : {
-                    "atmost_3j" : {
-                        "lep_chan_lst" : ["2los_CRtt_photon"],
-                        "lep_flav_lst" : ["em"],
-                        "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
-                    },
-                    "atleast_1j" : {
-                        "lep_chan_lst" : ["2los_CRtt_photon"],
-                        "lep_flav_lst" : ["em"],
-                        "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
-                    },
+            "2los_CRtt" : {
+                "atmost_3j" : {
+                    "lep_chan_lst" : ["2los_CRtt_photon"],
+                    "lep_flav_lst" : ["em"],
+                    "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                },
+                "atleast_1j" : {
+                    "lep_chan_lst" : ["2los_CRtt_photon"],
+                    "lep_flav_lst" : ["em"],
+                    "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                },
             },
         }

From c0d92be2c74c9879716f6631d8a7d55fc240bb7a Mon Sep 17 00:00:00 2001
From: Brent Yates
Date: Mon, 1 Jul 2024 11:02:17 -0400
Subject: [PATCH 07/21] Adding run gen script

---
 analysis/mc_validation/run_gen_analysis.py | 365 +++++++++++++++++++++
 1 file changed, 365 insertions(+)
 create mode 100644 analysis/mc_validation/run_gen_analysis.py

diff --git a/analysis/mc_validation/run_gen_analysis.py b/analysis/mc_validation/run_gen_analysis.py
new file mode 100644
index 000000000..650a8dbac
--- /dev/null
+++ b/analysis/mc_validation/run_gen_analysis.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import time
+import cloudpickle
+import gzip
+import os
+
+from coffea import processor
+from coffea.nanoevents import NanoAODSchema
+
+import topcoffea.modules.utils as utils
+import topcoffea.modules.remote_environment as remote_environment
+
+from topeft.modules.dataDrivenEstimation import DataDrivenProducer
+from topeft.modules.get_renormfact_envelope import get_renormfact_envelope
+import mc_validation_gen_processor as gen_processor
+
+LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"]
+
+WGT_VAR_LST = [
+    "nSumOfWeights_ISRUp",
+    "nSumOfWeights_ISRDown",
+    "nSumOfWeights_FSRUp",
+    "nSumOfWeights_FSRDown",
+    "nSumOfWeights_renormUp",
+    "nSumOfWeights_renormDown",
+    "nSumOfWeights_factUp",
+    "nSumOfWeights_factDown",
+    "nSumOfWeights_renormfactUp",
+    "nSumOfWeights_renormfactDown",
+]
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='You can customize your run')
+    parser.add_argument('jsonFiles'        , nargs='?', default='', help = 'Json file(s) containing files and metadata')
+    parser.add_argument('--executor','-x'  , default='work_queue', help = 'Which executor to use')
+    parser.add_argument('--prefix', '-r'   , nargs='?', default='', help = 'Prefix or redirector to look for the files')
+    parser.add_argument('--test','-t'      , action='store_true'  , help = 'To perform a test, run over a few events in a couple of chunks')
+    parser.add_argument('--pretend'        , action='store_true', help = 'Read json files but do not execute the analysis')
+    parser.add_argument('--nworkers','-n'  , default=8  , help = 'Number of workers')
+    parser.add_argument('--chunksize','-s' , default=100000, help = 'Number of events per chunk')
+    parser.add_argument('--nchunks','-c'   , default=None, help = 'You can choose to run only a number of chunks')
+    parser.add_argument('--outname','-o'   , default='plotsTopEFT', help = 'Name of the output file with histograms')
+    parser.add_argument('--outpath','-p'   , default='histos', help = 'Name of the output directory')
+    parser.add_argument('--treename'       , default='Events', help = 'Name of the tree inside the files')
+    parser.add_argument('--do-errors'      , action='store_true', help = 'Save the w**2 coefficients')
+    parser.add_argument('--do-systs', action='store_true', help = 'Compute systematic variations')
+    parser.add_argument('--split-lep-flavor', action='store_true', help = 'Split up categories by lepton flavor')
+    parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories')
+    parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories')
+    parser.add_argument('--do-np'  , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. Note that signal, background and data samples should all be processed together in order for this option to make sense.')
+    parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified histogram with the same name as the original).')
+    parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.')
+    parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.')
+    parser.add_argument('--ecut', default=None  , help = 'Energy cut threshold i.e. throw out events above this (GeV)')
+    parser.add_argument('--port', default='9123-9130', help = 'Specify the Work Queue port. An integer PORT or an integer range PORT_MIN-PORT_MAX.')
+
+
+    args = parser.parse_args()
+    jsonFiles  = args.jsonFiles
+    prefix     = args.prefix
+    executor   = args.executor
+    dotest     = args.test
+    nworkers   = int(args.nworkers)
+    chunksize  = int(args.chunksize)
+    nchunks    = int(args.nchunks) if not args.nchunks is None else args.nchunks
+    outname    = args.outname
+    outpath    = args.outpath
+    pretend    = args.pretend
+    treename   = args.treename
+    do_errors  = args.do_errors
+    do_systs   = args.do_systs
+    split_lep_flavor = args.split_lep_flavor
+    skip_sr    = args.skip_sr
+    skip_cr    = args.skip_cr
+    do_np      = args.do_np
+    do_renormfact_envelope = args.do_renormfact_envelope
+    wc_lst = args.wc_list if args.wc_list is not None else []
+
+    # Check if we have valid options
+    if executor not in LST_OF_KNOWN_EXECUTORS:
+        raise Exception(f"The \"{executor}\" executor is not known. Please specify an executor from the known executors ({LST_OF_KNOWN_EXECUTORS}). Exiting.")
+    if do_renormfact_envelope:
+        if not do_systs:
+            raise Exception("Error: Cannot specify do_renormfact_envelope if we are not including systematics.")
+        if not do_np:
+            raise Exception("Error: Cannot specify do_renormfact_envelope if we have not already done the integration across the appl axis that occurs in the data driven estimator step.")
+    if dotest:
+        if executor == "futures":
+            nchunks = 2
+            chunksize = 10000
+            nworkers = 1
+            print('Running a fast test with %i workers, %i chunks of %i events'%(nworkers, nchunks, chunksize))
+        else:
+            raise Exception(f"The \"test\" option is not set up to work with the {executor} executor. Exiting.")
+
+
+    # Set the threshold for the ecut (if not applying a cut, should be None)
+    ecut_threshold = args.ecut
+    if ecut_threshold is not None: ecut_threshold = float(args.ecut)
+
+    if executor == "work_queue":
+        # construct wq port range
+        port = list(map(int, args.port.split('-')))
+        if len(port) < 1:
+            raise ValueError("At least one port value should be specified.")
+        if len(port) > 2:
+            raise ValueError("More than one port range was specified.")
+        if len(port) == 1:
+            # convert single values into a range of one element
+            port.append(port[0])
+
+    # Figure out which hists to include
+    if args.hist_list == ["ana"]:
+        # Here we hardcode a list of hists used for the analysis
+        hist_lst = ["cosZ", "cosq", "azim", "tops_pt", "tX_pt", "njets"]
+    elif args.hist_list == ["cr"]:
+        # Here we hardcode a list of hists used for the CRs
+        hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"]
+    else:
+        # We want to specify a custom list
+        # If we don't specify this argument, it will be None, and the processor will fill all hists
+        hist_lst = args.hist_list
+
+
+    ### Load samples from json
+    samplesdict = {}
+    allInputFiles = []
+
+    def LoadJsonToSampleName(jsonFile, prefix):
+        sampleName = jsonFile if not '/' in jsonFile else jsonFile[jsonFile.rfind('/')+1:]
+        if sampleName.endswith('.json'): sampleName = sampleName[:-5]
+        with open(jsonFile) as jf:
+            samplesdict[sampleName] = json.load(jf)
+            samplesdict[sampleName]['redirector'] = prefix
+
+    if isinstance(jsonFiles, str) and ',' in jsonFiles:
+        jsonFiles = jsonFiles.replace(' ', '').split(',')
+    elif isinstance(jsonFiles, str):
+        jsonFiles = [jsonFiles]
+    for jsonFile in jsonFiles:
+        if os.path.isdir(jsonFile):
+            if not jsonFile.endswith('/'): jsonFile+='/'
+            for f in os.listdir(jsonFile):
+                if f.endswith('.json'): allInputFiles.append(jsonFile+f)
+        else:
+            allInputFiles.append(jsonFile)
+
+    # Read from cfg files
+    for f in allInputFiles:
+        if not os.path.isfile(f):
+            raise Exception(f'[ERROR] Input file {f} not found!')
+        # This input file is a json file, not a cfg
+        if f.endswith('.json'):
+            LoadJsonToSampleName(f, prefix)
+        # Open cfg files
+        else:
+            with open(f) as fin:
+                print(' >> Reading json from cfg file...')
+                lines = fin.readlines()
+                for l in lines:
+                    if '#' in l:
+                        l=l[:l.find('#')]
+                    l = l.replace(' ', '').replace('\n', '')
+                    if l == '': continue
+                    if ',' in l:
+                        l = l.split(',')
+                        for nl in l:
+                            if not os.path.isfile(nl):
+                                prefix = nl
+                            else:
+                                LoadJsonToSampleName(nl, prefix)
+                    else:
+                        if not os.path.isfile(l):
+                            prefix = l
+                        else:
+                            LoadJsonToSampleName(l, prefix)
+
+    flist = {}
+    nevts_total = 0
+    for sname in samplesdict.keys():
+        redirector = samplesdict[sname]['redirector']
+        flist[sname] = [(redirector+f) for f in samplesdict[sname]['files']]
+        samplesdict[sname]['year'] = samplesdict[sname]['year']
+        samplesdict[sname]['xsec'] = float(samplesdict[sname]['xsec'])
+        samplesdict[sname]['nEvents'] = int(samplesdict[sname]['nEvents'])
+        nevts_total += samplesdict[sname]['nEvents']
+        samplesdict[sname]['nGenEvents'] = int(samplesdict[sname]['nGenEvents'])
+        samplesdict[sname]['nSumOfWeights'] = float(samplesdict[sname]['nSumOfWeights'])
+        if not samplesdict[sname]["isData"]:
+            for wgt_var in WGT_VAR_LST:
+                # Check that MC samples have all needed weight sums (only needed if doing systs)
+                if do_systs:
+                    if (wgt_var not in samplesdict[sname]):
+                        raise Exception(f"Missing weight variation \"{wgt_var}\".")
+                    else:
+                        samplesdict[sname][wgt_var] = float(samplesdict[sname][wgt_var])
+        # Print file info
+        print('>> '+sname)
+        print('   - isData?      : %s' %('YES' if samplesdict[sname]['isData'] else 'NO'))
+        print('   - year         : %s' %samplesdict[sname]['year'])
+        print('   - xsec         : %f' %samplesdict[sname]['xsec'])
+        print('   - histAxisName : %s' %samplesdict[sname]['histAxisName'])
+        print('   - options      : %s' %samplesdict[sname]['options'])
+        print('   - tree         : %s' %samplesdict[sname]['treeName'])
+        print('   - nEvents      : %i' %samplesdict[sname]['nEvents'])
+        print('   - nGenEvents   : %i' %samplesdict[sname]['nGenEvents'])
+        print('   - SumWeights   : %i' %samplesdict[sname]['nSumOfWeights'])
+        if not samplesdict[sname]["isData"]:
+            for wgt_var in WGT_VAR_LST:
+                if wgt_var in samplesdict[sname]:
+                    print(f'   - {wgt_var}: {samplesdict[sname][wgt_var]}')
+        print('   - Prefix       : %s' %samplesdict[sname]['redirector'])
+        print('   - nFiles       : %i' %len(samplesdict[sname]['files']))
+        for fname in samplesdict[sname]['files']: print('     %s'%fname)
+
+    if pretend:
+        print('pretending...')
+        exit()
+
+    # Extract the list of all WCs, as long as we haven't already specified one.
+    if len(wc_lst) == 0:
+        for k in samplesdict.keys():
+            for wc in samplesdict[k]['WCnames']:
+                if wc not in wc_lst:
+                    wc_lst.append(wc)
+
+    if len(wc_lst) > 0:
+        # Yes, why not have the output be in correct English?
+        if len(wc_lst) == 1:
+            wc_print = wc_lst[0]
+        elif len(wc_lst) == 2:
+            wc_print = wc_lst[0] + ' and ' + wc_lst[1]
+        else:
+            wc_print = ', '.join(wc_lst[:-1]) + ', and ' + wc_lst[-1]
+        print('Wilson Coefficients: {}.'.format(wc_print))
+    else:
+        print('No Wilson coefficients specified')
+
+    processor_instance = gen_processor.AnalysisProcessor(samplesdict,wc_lst,hist_lst,ecut_threshold,do_errors,do_systs,split_lep_flavor,skip_sr,skip_cr)
+
+    if executor == "work_queue":
+        executor_args = {
+            'master_name': '{}-workqueue-coffea'.format(os.environ['USER']),
+
+            # find a port to run work queue in this range:
+            'port': port,
+
+            'debug_log': 'debug.log',
+            'transactions_log': 'tr.log',
+            'stats_log': 'stats.log',
+            'tasks_accum_log': 'tasks.log',
+
+            'environment_file': remote_environment.get_environment(
+                extra_pip_local = {"topeft": ["topeft", "setup.py"]},
+            ),
+            'filepath': f'/project01/ndcms/{os.environ["USER"]}',
+            'extra_input_files': ["mc_validation_gen_processor.py"],
+
+            'retries': 5,
+
+            # use mid-range compression for chunk results. 9 is the default for work
+            # queue in coffea. Valid values are 0 (minimum compression, less memory
+            # usage) to 16 (maximum compression, more memory usage).
+            'compression': 9,
+
+            # automatically find an adequate resource allocation for tasks.
+            # tasks are first tried using the maximum resources seen of previously run
+            # tasks. on resource exhaustion, they are retried with the maximum resource
+            # values, if specified below. if a maximum is not specified, the task waits
+            # forever until a larger worker connects.
+            'resource_monitor': True,
+            'resources_mode': 'auto',
+
+            # these resource values may be omitted when using
+            # resources_mode: 'auto', but they do make the initial portion
+            # of a workflow run a little bit faster.
+            # Rather than using whole workers in the exploratory mode of
+            # resources_mode: auto, tasks are forever limited to a maximum
+            # of 8GB of mem and disk.
+            #
+            # NOTE: The very first tasks in the exploratory
+            # mode will use the values specified here, so workers need to be at least
+            # this large. If left unspecified, tasks will use whole workers in the
+            # exploratory mode.
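+            # Illustrative note: in this mode a task that exceeds its measured
+            # allocation is resubmitted with the larger limits described above, so an
+            # occasional memory-hungry chunk is retried rather than failed outright.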
+            # 'cores': 1,
+            # 'disk': 8000,   #MB
+            # 'memory': 10000, #MB
+
+            # control the size of accumulation tasks. Results are
+            # accumulated in groups of size chunks_per_accum, keeping at
+            # most chunks_per_accum at the same time in memory per task.
+            'chunks_per_accum': 25,
+            'chunks_accum_in_mem': 2,
+
+            # terminate workers on which tasks have been running longer than average.
+            # This is useful for temporary conditions on worker nodes where a task would
+            # finish faster if run on another worker.
+            # the time limit is computed by multiplying the average runtime of tasks
+            # by the value of 'fast_terminate_workers'. Since some tasks can be
+            # legitimately slow, no task can trigger the termination of workers twice.
+            #
+            # warning: small values (e.g. close to 1) may cause the workflow to misbehave,
+            # as most tasks will be terminated.
+            #
+            # Less than 1 disables it.
+            'fast_terminate_workers': 0,
+
+            # print messages when tasks are submitted, finished, etc.,
+            # together with their resource allocation and usage. If a task
+            # fails, its standard output is also printed, so we can turn
+            # off print_stdout for all tasks.
+            'verbose': True,
+            'print_stdout': False,
+        }
+
+    # Run the processor and get the output
+    tstart = time.time()
+
+    if executor == "futures":
+        exec_instance = processor.futures_executor(workers=nworkers)
+        runner = processor.Runner(exec_instance, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks)
+    elif executor == "work_queue":
+        executor = processor.WorkQueueExecutor(**executor_args)
+        runner = processor.Runner(executor, schema=NanoAODSchema, chunksize=chunksize, maxchunks=nchunks, skipbadfiles=False, xrootdtimeout=180)
+
+    output = runner(flist, treename, processor_instance)
+
+    dt = time.time() - tstart
+
+    if executor == "work_queue":
+        print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))
+
+    #nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    #nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    #print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))
+
+    if executor == "futures":
+        print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, ))
+
+    # Save the output
+    if not os.path.isdir(outpath): os.system("mkdir -p %s"%outpath)
+    out_pkl_file = os.path.join(outpath,outname+".pkl.gz")
+    print(f"\nSaving output in {out_pkl_file}...")
+    with gzip.open(out_pkl_file, "wb") as fout:
+        cloudpickle.dump(output, fout)
+    print("Done!")
+
+    # Run the data driven estimation, save the output
+    if do_np:
+        print("\nDoing the nonprompt estimation...")
+        out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
+        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
+        print(f"Saving output in {out_pkl_file_name_np}...")
+        ddp.dumpToPickle()
+        print("Done!")
+
+    # Run the renorm fact envelope calculation
+    if do_renormfact_envelope:
+        print("\nDoing the renorm. fact. 
+    # Run the data driven estimation, save the output
+    if do_np:
+        print("\nDoing the nonprompt estimation...")
+        out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
+        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
+        print(f"Saving output in {out_pkl_file_name_np}...")
+        ddp.dumpToPickle()
+        print("Done!")
+        # Run the renorm fact envelope calculation
+        if do_renormfact_envelope:
+            print("\nDoing the renorm. fact. envelope calculation...")
+            dict_of_histos = utils.get_hist_from_pkl(out_pkl_file_name_np,allow_empty=False)
+            dict_of_histos_after_applying_envelope = get_renormfact_envelope(dict_of_histos)
+            utils.dump_to_pkl(out_pkl_file_name_np,dict_of_histos_after_applying_envelope)

From 8653a462a7bc2499d62f45ba52528d32ffb144ec Mon Sep 17 00:00:00 2001
From: Brent Yates
Date: Mon, 1 Jul 2024 12:40:03 -0400
Subject: [PATCH 08/21] Point to gen photon processor

---
 analysis/mc_validation/run_gen_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/mc_validation/run_gen_analysis.py b/analysis/mc_validation/run_gen_analysis.py
index 650a8dbac..5a9c2b6c9 100644
--- a/analysis/mc_validation/run_gen_analysis.py
+++ b/analysis/mc_validation/run_gen_analysis.py
@@ -15,7 +15,7 @@
 from topeft.modules.dataDrivenEstimation import DataDrivenProducer
 from topeft.modules.get_renormfact_envelope import get_renormfact_envelope
 
-import mc_validation_gen_processor as gen_processor
+import gen_photon_processor as gen_processor
 
 LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"]

From 63d8b160977a05e3b8e27b5654966f69a187d157 Mon Sep 17 00:00:00 2001
From: Brent Yates
Date: Mon, 1 Jul 2024 12:40:40 -0400
Subject: [PATCH 09/21] Correct name

---
 .../mc_validation/gen_photon_processor.py | 268 ++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 analysis/mc_validation/gen_photon_processor.py

diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py
new file mode 100644
index 000000000..694cfd5e2
--- /dev/null
+++ b/analysis/mc_validation/gen_photon_processor.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+import numpy as np
+import awkward as ak
+np.seterr(divide='ignore', invalid='ignore', over='ignore')
+from coffea import processor
+from coffea.analysis_tools import PackedSelection
+import hist
+
+import topeft.modules.object_selection as te_os
+from topcoffea.modules.histEFT import HistEFT
+import topcoffea.modules.eft_helper as efth
+from topcoffea.modules.get_param_from_jsons import GetParam
+from topcoffea.modules.paths import topcoffea_path
+get_tc_param = GetParam(topcoffea_path("params/params.json"))
+
+def construct_cat_name(chan_str,njet_str=None,flav_str=None):
+
+    # Get the component strings
+    nlep_str = chan_str.split("_")[0] # Assumes n leps comes first in the str
+    chan_str = "_".join(chan_str.split("_")[1:]) # The rest of the channel name is everything that comes after nlep
+    if chan_str == "": chan_str = None # So that we properly skip this in the for loop below
+    if flav_str is not None:
+        flav_str = flav_str
+    if njet_str is not None:
+        njet_str = njet_str[-2:] # Assumes number of n jets comes at the end of the string
+        if "j" not in njet_str:
+            # The njet string should really have a "j" in it
+            raise Exception(f"Something went wrong while trying to construct channel name, is \"{njet_str}\" an njet string?")
+
+    # Put the component strings into the channel name
+    ret_str = nlep_str
+    for component in [flav_str,chan_str,njet_str]:
+        if component is None: continue
+        ret_str = "_".join([ret_str,component])
+    return ret_str
+
+
+class AnalysisProcessor(processor.ProcessorABC):
+
+    def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, dtype=np.float32):
+
+        self._samples = samples
+        self._wc_names_lst = wc_names_lst
+        self._dtype = dtype
+
+        # Create the histograms
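+        # Each HistEFT below stores, per histogram bin, the full set of
+        # quadratic EFT structure constants rather than a single count, so the
+        # binned yields can later be evaluated at arbitrary Wilson coefficient
+        # points (via HistEFT's eval interface) without re-running the processor.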
proc_axis = hist.axis.StrCategory([], name="process", growth=True) + chan_axis = hist.axis.StrCategory([], name="channel", growth=True) + syst_axis = hist.axis.StrCategory([], name="systematic", growth=True) + self._accumulator = { + "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), + "ht" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), + "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), + "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), + "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), + "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), + "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), + } + + # Set the list of hists to fill + if hist_lst is None: + # If the hist list is none, assume we want to fill all hists + self._hist_lst = list(self._accumulator.keys()) + else: + # Otherwise, just fill the specified subset of hists + for hist_to_include in hist_lst: + if hist_to_include not in self._accumulator.keys(): + raise Exception(f"Error: Cannot specify hist \"{hist_to_include}\", it is not defined in the processor.") + self._hist_lst = hist_lst # Which hists to fill + + # Set the booleans + self._do_errors = do_errors # Whether to calculate and store the w**2 coefficients + + + @property + def accumulator(self): + return self._accumulator + + @property + def columns(self): + return self._columns + + # Main function: run on a given dataset + def process(self, events): + + ### Dataset parameters ### + dataset = events.metadata["dataset"] + + isData = self._samples[dataset]["isData"] + histAxisName = self._samples[dataset]["histAxisName"] + year = self._samples[dataset]["year"] + xsec = self._samples[dataset]["xsec"] + sow = self._samples[dataset]["nSumOfWeights"] + + if isData: raise Exception("Error: This processor is not for data") + + selections = PackedSelection(dtype='uint64') + + ### Get gen particles 
collection ### + genpart = events.GenPart + genjet = events.GenJet + + + ### Lepton object selection ### + + is_final_mask = genpart.hasFlags(["fromHardProcess","isLastCopy"]) + + gen_top = ak.pad_none(genpart[is_final_mask & (abs(genpart.pdgId) == 6)],2) + + gen_l = genpart[is_final_mask & ((abs(genpart.pdgId) == 11) | (abs(genpart.pdgId) == 13) | (abs(genpart.pdgId) == 15))] + gen_e = genpart[is_final_mask & (abs(genpart.pdgId) == 11)] + gen_m = genpart[is_final_mask & (abs(genpart.pdgId) == 13)] + gen_t = genpart[is_final_mask & (abs(genpart.pdgId) == 15)] + + gen_p = genpart[is_final_mask & (abs(genpart.pdgId) == 22)] + + gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] + gen_l = ak.pad_none(gen_l,2) + gen_e = gen_e[ak.argsort(gen_e.pt, axis=-1, ascending=False)] + gen_m = gen_m[ak.argsort(gen_m.pt, axis=-1, ascending=False)] + gen_t = gen_t[ak.argsort(gen_t.pt, axis=-1, ascending=False)] + + gen_p = gen_p[ak.argsort(gen_p.pt, axis=-1, ascending=False)] + + gen_l_from_zg = ak.pad_none(gen_l[(gen_l.distinctParent.pdgId == 23) | (gen_l.distinctParent.pdgId == 22)], 2) + gen_e_from_zg = ak.pad_none(gen_e[(gen_e.distinctParent.pdgId == 23) | (gen_e.distinctParent.pdgId == 22)], 2) + gen_m_from_zg = ak.pad_none(gen_m[(gen_m.distinctParent.pdgId == 23) | (gen_m.distinctParent.pdgId == 22)], 2) + gen_t_from_zg = ak.pad_none(gen_t[(gen_t.distinctParent.pdgId == 23) | (gen_t.distinctParent.pdgId == 22)], 2) + + + # Jet object selection + genjet = genjet[genjet.pt > 30] + is_clean_jet = te_os.isClean(genjet, gen_e, drmin=0.4) & te_os.isClean(genjet, gen_m, drmin=0.4) & te_os.isClean(genjet, gen_t, drmin=0.4) + genjet_clean = genjet[is_clean_jet] + genbjet_clean = genjet[np.abs(genjet.partonFlavour)==5] + njets = ak.num(genjet_clean) + nbjets = ak.num(genbjet_clean) + + + is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) + is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) + selections.add("2los_CRtt", (ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ak.any(is_em | is_me) & (nbjets==2)) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + #selections.add("2los_CRtt", (events.is2l_nozeeveto & charge2l_0 & events.is_em & bmask_exactly2med & pass_trg)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("atmost_3j" , (njets<=3)) + ### Get dense axis variables ### + + ht = ak.sum(genjet.pt,axis=-1) + ht_clean = ak.sum(genjet_clean.pt,axis=-1) + + tops_pt = gen_top.sum().pt + + # Pt of the t(t)X system + tX_system = ak.concatenate([gen_top,gen_l_from_zg],axis=1) + tX_pt = tX_system.sum().pt + + # Invmass distributions + mll_e_from_zg = (gen_e_from_zg[:,0] + gen_e_from_zg[:,1]).mass + mll_m_from_zg = (gen_m_from_zg[:,0] + gen_m_from_zg[:,1]).mass + mll_t_from_zg = (gen_t_from_zg[:,0] + gen_t_from_zg[:,1]).mass + mll_l0l1 = (gen_l[:,0] + gen_l[:,1]).mass + + # Dictionary of dense axis values + dense_axis_dict = { + "mll_fromzg_e" : mll_e_from_zg, + "mll_fromzg_m" : mll_m_from_zg, + "mll_fromzg_t" : mll_t_from_zg, + "mll" : mll_l0l1, + "ht" : ht, + "ht_clean" : ht_clean, + "tX_pt" : tX_pt, + "tops_pt" : tops_pt, + "photon_pt" : ak.firsts(gen_p).pt, + "l0_pt" : ak.firsts(gen_l.pt), + "j0_pt" : ak.firsts(genjet.pt), + "njets" : njets, + } + 
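+        # (PackedSelection stores each named cut above as one bit per event, so
+        # the selections.any()/selections.all() calls in the fill loop below are
+        # cheap bitwise OR/AND reductions. A minimal standalone example, with
+        # illustrative names only:
+        #     sel = PackedSelection()
+        #     sel.add("two_lep", nlep_arr == 2)
+        #     sel.add("two_bjets", nbjets_arr == 2)
+        #     mask = sel.all("two_lep", "two_bjets")
+        # )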
selections.add("atmost_3j" , (njets<=3))
+        cat_dict = {
+            "2los_CRtt" : {
+                "atmost_3j" : {
+                    "lep_chan_lst" : ["2los_CRtt_photon"],
+                    "lep_flav_lst" : ["em"],
+                    "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                },
+                "atleast_1j" : {
+                    "lep_chan_lst" : ["2los_CRtt_photon"],
+                    "lep_flav_lst" : ["em"],
+                    "appl_lst"     : ["isSR_2lOS" , "isAR_2lOS"],
+                },
+            },
+        }
+
+
+        ### Get weights ###
+
+        # Extract the EFT quadratic coefficients and optionally use them to calculate the coefficients on the w**2 quartic function
+        # eft_coeffs is never Jagged so convert immediately to numpy for ease of use.
+        eft_coeffs = ak.to_numpy(events["EFTfitCoefficients"]) if hasattr(events, "EFTfitCoefficients") else None
+        if eft_coeffs is not None:
+            # Check to see if the ordering of WCs for this sample matches what we want
+            if self._samples[dataset]["WCnames"] != self._wc_names_lst:
+                eft_coeffs = efth.remap_coeffs(self._samples[dataset]["WCnames"], self._wc_names_lst, eft_coeffs)
+        eft_w2_coeffs = efth.calc_w2_coeffs(eft_coeffs,self._dtype) if (self._do_errors and eft_coeffs is not None) else None
+
+        # If this is not an eft sample, get the genWeight
+        if eft_coeffs is None: genw = events["genWeight"]
+        else: genw = np.ones_like(events["event"])
+        lumi = 1000.0*get_tc_param(f"lumi_{year}")
+        event_weight = lumi*xsec*genw/sow
+
+        # Example of reweighting based on Ht
+        #if "private" in histAxisName:
+        #    ht_sf = get_ht_sf(ht,histAxisName)
+        #    event_weight = event_weight*ht_sf
+
+
+        ### Loop over the hists we want to fill ###
+
+        hout = self.accumulator
+
+        for dense_axis_name, dense_axis_vals in dense_axis_dict.items():
+            if dense_axis_name not in self._hist_lst:
+                print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.")
+                continue
+            for chan in cat_dict:
+                flav_ch = None
+                njet_ch = None
+                njets_any_mask = selections.any(*cat_dict[chan].keys())
+                for njet_val in cat_dict[chan]:
+                    cuts_lst = [chan, njet_val]
+                    if dense_axis_name == "njets":
+                        all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask)
+                    else:
+                        njet_ch = njet_val
+                        all_cuts_mask = (selections.all(*cuts_lst))
+                    ch_name = construct_cat_name(chan,njet_str=njet_ch,flav_str=flav_ch)
+
+                    # Mask out the none values
+                    isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False))
+                    isnotnone_mask = isnotnone_mask & all_cuts_mask
+                    dense_axis_vals_cut = dense_axis_vals[isnotnone_mask]
+                    event_weight_cut = event_weight[isnotnone_mask]
+                    eft_coeffs_cut = eft_coeffs
+                    if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask]
+                    #eft_w2_coeffs_cut = eft_w2_coeffs
+                    #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask]
+
+                    # Fill the histos
+                    axes_fill_info_dict = {
+                        dense_axis_name : dense_axis_vals_cut,
+                        "process"       : histAxisName,
+                        "channel"       : ch_name,
+                        "systematic"    : "nominal",
+                        "weight"        : event_weight_cut,
+                        "eft_coeff"     : eft_coeffs_cut,
+                        #"eft_err_coeff" : eft_w2_coeffs_cut,
+                    }
+
+                    hout[dense_axis_name].fill(**axes_fill_info_dict)
+
+        return hout
+
+    def postprocess(self, accumulator):
+        return accumulator

From f785767bd6d96635d152ca8d64e0fd753a5acd5e Mon Sep 17 00:00:00 2001
From: Brent Yates
Date: Mon, 1 Jul 2024 13:52:22 -0400
Subject: [PATCH 10/21] Fix to processor

---
 analysis/mc_validation/gen_photon_processor.py |   7 +-
 analysis/mc_validation/gen_photon_rocessor.py  | 268 ------------------
 2 files changed, 1 insertion(+), 274 deletions(-)
 delete mode 100644 analysis/mc_validation/gen_photon_rocessor.py

diff --git a/analysis/mc_validation/gen_photon_processor.py
b/analysis/mc_validation/gen_photon_processor.py index 694cfd5e2..10b318ef5 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -186,11 +186,6 @@ def process(self, events): "lep_flav_lst" : ["em"], "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], }, - "atleast_1j" : { - "lep_chan_lst" : ["2los_CRtt_photon"], - "lep_flav_lst" : ["em"], - "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], - }, }, } @@ -224,7 +219,7 @@ def process(self, events): for dense_axis_name, dense_axis_vals in dense_axis_dict.items(): if dense_axis_name not in self._hist_lst: - print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.") + #print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.") continue for chan in cat_dict: flav_ch = None diff --git a/analysis/mc_validation/gen_photon_rocessor.py b/analysis/mc_validation/gen_photon_rocessor.py deleted file mode 100644 index 694cfd5e2..000000000 --- a/analysis/mc_validation/gen_photon_rocessor.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python -import numpy as np -import awkward as ak -np.seterr(divide='ignore', invalid='ignore', over='ignore') -from coffea import processor -from coffea.analysis_tools import PackedSelection -import hist - -import topeft.modules.object_selection as te_os -from topcoffea.modules.histEFT import HistEFT -import topcoffea.modules.eft_helper as efth -from topcoffea.modules.get_param_from_jsons import GetParam -from topcoffea.modules.paths import topcoffea_path -get_tc_param = GetParam(topcoffea_path("params/params.json")) -def construct_cat_name(chan_str,njet_str=None,flav_str=None): - - # Get the component strings - nlep_str = chan_str.split("_")[0] # Assumes n leps comes first in the str - chan_str = "_".join(chan_str.split("_")[1:]) # The rest of the channel name is everything that comes after nlep - if chan_str == "": chan_str = None # So that we properly skip this in the for loop below - if flav_str is not None: - flav_str = flav_str - if njet_str is not None: - njet_str = njet_str[-2:] # Assumes number of n jets comes at the end of the string - if "j" not in njet_str: - # The njet string should really have a "j" in it - raise Exception(f"Something when wrong while trying to consturct channel name, is \"{njet_str}\" an njet string?") - - # Put the component strings into the channel name - ret_str = nlep_str - for component in [flav_str,chan_str,njet_str]: - if component is None: continue - ret_str = "_".join([ret_str,component]) - return ret_str - - -class AnalysisProcessor(processor.ProcessorABC): - - def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, do_errors=False, do_systematics=False, split_by_lepton_flavor=False, skip_signal_regions=False, skip_control_regions=False, dtype=np.float32): - - self._samples = samples - self._wc_names_lst = wc_names_lst - self._dtype = dtype - - # Create the histograms - proc_axis = hist.axis.StrCategory([], name="process", growth=True) - chan_axis = hist.axis.StrCategory([], name="channel", growth=True) - syst_axis = hist.axis.StrCategory([], name="systematic", growth=True) - self._accumulator = { - "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), - "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), - 
"mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), - "mll" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), - "ht" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), - "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), - "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), - "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), - "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), - "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), - "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), - "njets" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), - } - - # Set the list of hists to fill - if hist_lst is None: - # If the hist list is none, assume we want to fill all hists - self._hist_lst = list(self._accumulator.keys()) - else: - # Otherwise, just fill the specified subset of hists - for hist_to_include in hist_lst: - if hist_to_include not in self._accumulator.keys(): - raise Exception(f"Error: Cannot specify hist \"{hist_to_include}\", it is not defined in the processor.") - self._hist_lst = hist_lst # Which hists to fill - - # Set the booleans - self._do_errors = do_errors # Whether to calculate and store the w**2 coefficients - - - @property - def accumulator(self): - return self._accumulator - - @property - def columns(self): - return self._columns - - # Main function: run on a given dataset - def process(self, events): - - ### Dataset parameters ### - dataset = events.metadata["dataset"] - - isData = self._samples[dataset]["isData"] - histAxisName = self._samples[dataset]["histAxisName"] - year = self._samples[dataset]["year"] - xsec = self._samples[dataset]["xsec"] - sow = self._samples[dataset]["nSumOfWeights"] - - if isData: raise Exception("Error: This processor is not for data") - - selections = PackedSelection(dtype='uint64') - - ### Get gen particles collection ### - genpart = events.GenPart - genjet = events.GenJet - - - ### Lepton object selection ### - - is_final_mask = genpart.hasFlags(["fromHardProcess","isLastCopy"]) - - gen_top = ak.pad_none(genpart[is_final_mask & (abs(genpart.pdgId) == 6)],2) - - gen_l = genpart[is_final_mask & ((abs(genpart.pdgId) == 11) | (abs(genpart.pdgId) == 13) | (abs(genpart.pdgId) == 15))] - gen_e = genpart[is_final_mask & (abs(genpart.pdgId) == 11)] - gen_m = genpart[is_final_mask & (abs(genpart.pdgId) == 13)] - gen_t = genpart[is_final_mask & (abs(genpart.pdgId) == 15)] - - gen_p = genpart[is_final_mask 
& (abs(genpart.pdgId) == 22)] - - gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] - gen_l = ak.pad_none(gen_l,2) - gen_e = gen_e[ak.argsort(gen_e.pt, axis=-1, ascending=False)] - gen_m = gen_m[ak.argsort(gen_m.pt, axis=-1, ascending=False)] - gen_t = gen_t[ak.argsort(gen_t.pt, axis=-1, ascending=False)] - - gen_p = gen_p[ak.argsort(gen_p.pt, axis=-1, ascending=False)] - - gen_l_from_zg = ak.pad_none(gen_l[(gen_l.distinctParent.pdgId == 23) | (gen_l.distinctParent.pdgId == 22)], 2) - gen_e_from_zg = ak.pad_none(gen_e[(gen_e.distinctParent.pdgId == 23) | (gen_e.distinctParent.pdgId == 22)], 2) - gen_m_from_zg = ak.pad_none(gen_m[(gen_m.distinctParent.pdgId == 23) | (gen_m.distinctParent.pdgId == 22)], 2) - gen_t_from_zg = ak.pad_none(gen_t[(gen_t.distinctParent.pdgId == 23) | (gen_t.distinctParent.pdgId == 22)], 2) - - - # Jet object selection - genjet = genjet[genjet.pt > 30] - is_clean_jet = te_os.isClean(genjet, gen_e, drmin=0.4) & te_os.isClean(genjet, gen_m, drmin=0.4) & te_os.isClean(genjet, gen_t, drmin=0.4) - genjet_clean = genjet[is_clean_jet] - genbjet_clean = genjet[np.abs(genjet.partonFlavour)==5] - njets = ak.num(genjet_clean) - nbjets = ak.num(genbjet_clean) - - - is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) - is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) - selections.add("2los_CRtt", (ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ak.any(is_em | is_me) & (nbjets==2)) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement - #selections.add("2los_CRtt", (events.is2l_nozeeveto & charge2l_0 & events.is_em & bmask_exactly2med & pass_trg)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement - selections.add("atmost_3j" , (njets<=3)) - ### Get dense axis variables ### - - ht = ak.sum(genjet.pt,axis=-1) - ht_clean = ak.sum(genjet_clean.pt,axis=-1) - - tops_pt = gen_top.sum().pt - - # Pt of the t(t)X system - tX_system = ak.concatenate([gen_top,gen_l_from_zg],axis=1) - tX_pt = tX_system.sum().pt - - # Invmass distributions - mll_e_from_zg = (gen_e_from_zg[:,0] + gen_e_from_zg[:,1]).mass - mll_m_from_zg = (gen_m_from_zg[:,0] + gen_m_from_zg[:,1]).mass - mll_t_from_zg = (gen_t_from_zg[:,0] + gen_t_from_zg[:,1]).mass - mll_l0l1 = (gen_l[:,0] + gen_l[:,1]).mass - - # Dictionary of dense axis values - dense_axis_dict = { - "mll_fromzg_e" : mll_e_from_zg, - "mll_fromzg_m" : mll_m_from_zg, - "mll_fromzg_t" : mll_t_from_zg, - "mll" : mll_l0l1, - "ht" : ht, - "ht_clean" : ht_clean, - "tX_pt" : tX_pt, - "tops_pt" : tops_pt, - "photon_pt" : ak.firsts(gen_p).pt, - "l0_pt" : ak.firsts(gen_l.pt), - "j0_pt" : ak.firsts(genjet.pt), - "njets" : njets, - } - selections.add("atmost_3j" , (njets<=3)) - cat_dict = { - "2los_CRtt" : { - "atmost_3j" : { - "lep_chan_lst" : ["2los_CRtt_photon"], - "lep_flav_lst" : ["em"], - "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], - }, - "atleast_1j" : { - "lep_chan_lst" : ["2los_CRtt_photon"], - "lep_flav_lst" : ["em"], - "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], - }, - }, - } - - - ### Get weights ### - - # Extract the EFT quadratic coefficients and optionally use them to calculate the coefficients on the w**2 quartic function - # eft_coeffs is never Jagged so convert immediately to numpy for ease of use. 
- eft_coeffs = ak.to_numpy(events["EFTfitCoefficients"]) if hasattr(events, "EFTfitCoefficients") else None - if eft_coeffs is not None: - # Check to see if the ordering of WCs for this sample matches what want - if self._samples[dataset]["WCnames"] != self._wc_names_lst: - eft_coeffs = efth.remap_coeffs(self._samples[dataset]["WCnames"], self._wc_names_lst, eft_coeffs) - eft_w2_coeffs = efth.calc_w2_coeffs(eft_coeffs,self._dtype) if (self._do_errors and eft_coeffs is not None) else None - - # If this is not an eft sample, get the genWeight - if eft_coeffs is None: genw = events["genWeight"] - else: genw = np.ones_like(events["event"]) - lumi = 1000.0*get_tc_param(f"lumi_{year}") - event_weight = lumi*xsec*genw/sow - - # Example of reweighting based on Ht - #if "private" in histAxisName: - # ht_sf = get_ht_sf(ht,histAxisName) - # event_weight = event_weight*ht_sf - - - ### Loop over the hists we want to fill ### - - hout = self.accumulator - - for dense_axis_name, dense_axis_vals in dense_axis_dict.items(): - if dense_axis_name not in self._hist_lst: - print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.") - continue - for chan in cat_dict: - flav_ch = None - njet_ch = None - njets_any_mask = selections.any(*cat_dict[chan].keys()) - for njet_val in cat_dict[chan]: - cuts_lst = [chan, njet_val] - if dense_axis_name == "njets": - all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) - else: - njet_ch = njet_val - all_cuts_mask = (selections.all(*cuts_lst)) - ch_name = construct_cat_name(chan,njet_str=njet_ch,flav_str=flav_ch) - - # Mask out the none values - isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) - isnotnone_mask = isnotnone_mask & all_cuts_mask - dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] - event_weight_cut = event_weight[isnotnone_mask] - eft_coeffs_cut = eft_coeffs - if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] - #eft_w2_coeffs_cut = eft_w2_coeffs - #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] - - # Fill the histos - axes_fill_info_dict = { - dense_axis_name : dense_axis_vals_cut, - "process" : histAxisName, - "channel" : ch_name, - "systematic" : "nominal", - "weight" : event_weight_cut, - "eft_coeff" : eft_coeffs_cut, - #"eft_err_coeff" : eft_w2_coeffs_cut, - } - - hout[dense_axis_name].fill(**axes_fill_info_dict) - - return hout - - def postprocess(self, accumulator): - return accumulator From e170c788401c74b1306a5a64b4ad28e18330c336 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 09:56:23 -0400 Subject: [PATCH 11/21] Matching some channels from photon analysis, adding appl --- .../mc_validation/gen_photon_processor.py | 183 +++++++++++++----- 1 file changed, 133 insertions(+), 50 deletions(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index 10b318ef5..6436e62b5 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -9,6 +9,7 @@ import topeft.modules.object_selection as te_os from topcoffea.modules.histEFT import HistEFT import topcoffea.modules.eft_helper as efth +import topcoffea.modules.event_selection as tc_es from topcoffea.modules.get_param_from_jsons import GetParam from topcoffea.modules.paths import topcoffea_path get_tc_param = GetParam(topcoffea_path("params/params.json")) @@ -41,24 +42,27 @@ def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, self._samples = samples 
self._wc_names_lst = wc_names_lst self._dtype = dtype + self._skip_signal_regions = skip_signal_regions # Whether to skip the SR categories + self._skip_control_regions = skip_control_regions # Whether to skip the CR categories # Create the histograms proc_axis = hist.axis.StrCategory([], name="process", growth=True) chan_axis = hist.axis.StrCategory([], name="channel", growth=True) syst_axis = hist.axis.StrCategory([], name="systematic", growth=True) + appl_axis = hist.axis.StrCategory([], name="appl", label=r"AR/SR", growth=True) self._accumulator = { - "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), - "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), - "mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), - "mll" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), - "ht" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), - "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), - "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), - "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), - "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), - "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), - "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), - "njets" : HistEFT(proc_axis, chan_axis, syst_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_e" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_e", label=r"invmass ee from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_m" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_m", label=r"invmass mm from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll_fromzg_t" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 200, name="mll_fromzg_t", label=r"invmass tautau from z/gamma"), wc_names=wc_names_lst, rebin=False), + "mll" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(60, 0, 600, name="mll", label=r"Invmass l0l1"), wc_names=wc_names_lst, rebin=False), + "ht" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, 
rebin=False), + "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), + "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), + "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), + "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), + "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), + "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), + "njets" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(10, 0, 10, name="njets", label=r"njets"), wc_names=wc_names_lst, rebin=False), } # Set the list of hists to fill @@ -116,6 +120,12 @@ def process(self, events): gen_m = genpart[is_final_mask & (abs(genpart.pdgId) == 13)] gen_t = genpart[is_final_mask & (abs(genpart.pdgId) == 15)] + # Object selection before padding + is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) + is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) + is2lOS = ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ((ak.num(gen_e) + ak.num(gen_m))==2) + is2lOS_em = is2lOS & ak.any(is_em | is_me) + gen_p = genpart[is_final_mask & (abs(genpart.pdgId) == 22)] gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] @@ -139,13 +149,50 @@ def process(self, events): genbjet_clean = genjet[np.abs(genjet.partonFlavour)==5] njets = ak.num(genjet_clean) nbjets = ak.num(genbjet_clean) + atleast_1ph = ak.num(gen_p)>0 - is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) - is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) - selections.add("2los_CRtt", (ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ak.any(is_em | is_me) & (nbjets==2)) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_CRtt", is2lOS_em & (nbjets==2) & (~atleast_1ph)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement #selections.add("2los_CRtt", (events.is2l_nozeeveto & charge2l_0 & events.is_em & bmask_exactly2med & pass_trg)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + selections.add("2los_ph", is2lOS & (nbjets==2) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement + #selections.add("2los_sf_ph", (retainedbyOverlap & events.is2l & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask 
& events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & exactly_1ph)) + sfosz_2los_ttg_mask = tc_es.get_Z_peak_mask(gen_l[:,0:2],pt_window=15.0) + Zllgamma_SF_mask = (abs( (gen_l[:,0] + gen_l[:,1] + gen_p[:,0]).mass -91.2) > 15) + selections.add("2los_sf_Zg_CR_ULttg", (is2lOS_em & ~sfosz_2los_ttg_mask & Zllgamma_SF_mask & (nbjets==2) & atleast_1ph)) + #selections.add("2los_sf_Zg_CR_ULttg", (retainedbyOverlap & events.is2l_nozeeveto & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask & ~events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & atleast_1ph)) + + + # Njets selection + selections.add("exactly_0j", (njets==0)) + selections.add("exactly_1j", (njets==1)) + selections.add("exactly_2j", (njets==2)) + selections.add("exactly_3j", (njets==3)) + selections.add("exactly_4j", (njets==4)) + selections.add("exactly_5j", (njets==5)) + selections.add("exactly_6j", (njets==6)) + selections.add("atleast_1j", (njets>=1)) + selections.add("atleast_2j", (njets>=2)) + selections.add("atleast_3j", (njets>=3)) + selections.add("atleast_4j", (njets>=4)) + selections.add("atleast_5j", (njets>=5)) + selections.add("atleast_7j", (njets>=7)) + selections.add("atleast_0j", (njets>=0)) + selections.add("atmost_1j" , (njets<=1)) selections.add("atmost_3j" , (njets<=3)) + + + # AR/SR categories + #selections.add("isSR_2lSS", ( events.is2l_SR) & charge2l_1) + #selections.add("isAR_2lSS", (~events.is2l_SR) & charge2l_1) + #selections.add("isAR_2lSS_OS", ( events.is2l_SR) & charge2l_0) # Sideband for the charge flip + selections.add("isSR_2lOS", is2lOS)#( events.is2l_SR) & charge2l_0) + selections.add("isAR_2lOS", is2lOS)#(~events.is2l_SR) & charge2l_0) + + #selections.add("isSR_3l", events.is3l_SR) + #selections.add("isAR_3l", ~events.is3l_SR) + #selections.add("isSR_4l", events.is4l_SR) + + ### Get dense axis variables ### ht = ak.sum(genjet.pt,axis=-1) @@ -178,16 +225,33 @@ def process(self, events): "j0_pt" : ak.firsts(genjet.pt), "njets" : njets, } - selections.add("atmost_3j" , (njets<=3)) cat_dict = { "2los_CRtt" : { "atmost_3j" : { - "lep_chan_lst" : ["2los_CRtt_photon"], + "lep_chan_lst" : ["2los_ph"], "lep_flav_lst" : ["em"], "appl_lst" : ["isSR_2lOS" , "isAR_2lOS"], }, }, } + sr_cat_dict = { + "2los_ph" : { + "atleast_1j" : { + "lep_chan_lst" : ["2los_ph"], + "lep_flav_lst" : ["ee", "mm", "em"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + }, + } + cr_cat_dict = { + "2los_CR_Zg_ULttg" : { + "atleast_2j" : { + "lep_chan_lst" : ["2los_sf_Zg_CR_ULttg"], + "lep_flav_lst" : ["ee", "mm"], + "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], + }, + }, + } ### Get weights ### @@ -221,41 +285,60 @@ def process(self, events): if dense_axis_name not in self._hist_lst: #print(f"Skipping \"{dense_axis_name}\", it is not in the list of hists to include.") continue - for chan in cat_dict: + if not self._skip_signal_regions: + cat_dict.update(sr_cat_dict) + if not self._skip_control_regions: + cat_dict.update(cr_cat_dict) + if (not self._skip_signal_regions and not self._skip_control_regions): + for k in sr_cat_dict: + if k in cr_cat_dict: + raise Exception(f"The key {k} is in both CR and SR dictionaries.") + for nlep_cat in cat_dict.keys(): flav_ch = None njet_ch = None - njets_any_mask = selections.any(*cat_dict[chan].keys()) - for njet_val in cat_dict[chan]: - cuts_lst = [chan, njet_val] - if dense_axis_name == "njets": - all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) - else: - njet_ch = njet_val - all_cuts_mask = (selections.all(*cuts_lst)) - ch_name = 
construct_cat_name(chan,njet_str=njet_ch,flav_str=flav_ch) - - # Mask out the none values - isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) - isnotnone_mask = isnotnone_mask & all_cuts_mask - dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] - event_weight_cut = event_weight[isnotnone_mask] - eft_coeffs_cut = eft_coeffs - if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] - #eft_w2_coeffs_cut = eft_w2_coeffs - #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] - - # Fill the histos - axes_fill_info_dict = { - dense_axis_name : dense_axis_vals_cut, - "process" : histAxisName, - "channel" : ch_name, - "systematic" : "nominal", - "weight" : event_weight_cut, - "eft_coeff" : eft_coeffs_cut, - #"eft_err_coeff" : eft_w2_coeffs_cut, - } - - hout[dense_axis_name].fill(**axes_fill_info_dict) + njets_any_mask = selections.any(*cat_dict[nlep_cat].keys()) + # Loop over the njets list for each channel + for njet_val in cat_dict[nlep_cat]: + + # Loop over the appropriate AR and SR for this channel + for appl in cat_dict[nlep_cat][njet_val]["appl_lst"]: + # Loop over the channels in each nlep cat (e.g. "3l_m_offZ_1b") + for lep_chan in cat_dict[nlep_cat][njet_val]["lep_chan_lst"]: + + # Loop over the lep flavor list for each channel + for lep_flav in cat_dict[nlep_cat][njet_val]["lep_flav_lst"]: + #cuts_lst = [chan, njet_val] + cuts_lst = [appl,lep_chan] + if dense_axis_name == "njets": + all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) + else: + njet_ch = njet_val + all_cuts_mask = (selections.all(*cuts_lst)) + ch_name = construct_cat_name(lep_chan,njet_str=njet_ch,flav_str=flav_ch) + + # Mask out the none values + isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) + isnotnone_mask = isnotnone_mask & all_cuts_mask + dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] + event_weight_cut = event_weight[isnotnone_mask] + eft_coeffs_cut = eft_coeffs + if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] + #eft_w2_coeffs_cut = eft_w2_coeffs + #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] + + # Fill the histos + axes_fill_info_dict = { + dense_axis_name : dense_axis_vals_cut, + "process" : histAxisName, + "appl" : appl, + "channel" : ch_name, + "systematic" : "nominal", + "weight" : event_weight_cut, + "eft_coeff" : eft_coeffs_cut, + #"eft_err_coeff" : eft_w2_coeffs_cut, + } + + hout[dense_axis_name].fill(**axes_fill_info_dict) return hout From 45bc60a3d40147b673fd0a0196be526b294b0d3e Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 11:04:15 -0400 Subject: [PATCH 12/21] Correct categorization, add random numbers for testing SF NPs --- .../mc_validation/gen_photon_processor.py | 128 +++++++++++------- 1 file changed, 82 insertions(+), 46 deletions(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index 6436e62b5..cbd0f3f45 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +import copy +import coffea import numpy as np import awkward as ak np.seterr(divide='ignore', invalid='ignore', over='ignore') @@ -42,6 +44,7 @@ def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, self._samples = samples self._wc_names_lst = wc_names_lst self._dtype = dtype + self._do_systematics = do_systematics # Whether to process systematic samples self._skip_signal_regions = 
skip_signal_regions # Whether to skip the SR categories self._skip_control_regions = skip_control_regions # Whether to skip the CR categories @@ -127,6 +130,12 @@ def process(self, events): is2lOS_em = is2lOS & ak.any(is_em | is_me) gen_p = genpart[is_final_mask & (abs(genpart.pdgId) == 22)] + ######### Systematics ########### + + # Define the lists of systematics we include + wgt_correction_syst_lst = [ + "phoSFUp","phoSFDown" # Exp systs + ] gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] gen_l = ak.pad_none(gen_l,2) @@ -271,6 +280,14 @@ def process(self, events): lumi = 1000.0*get_tc_param(f"lumi_{year}") event_weight = lumi*xsec*genw/sow + weights_dict = {} + # For both data and MC + weights_obj_base = coffea.analysis_tools.Weights(len(events),storeIndividual=True) + weights_obj_base.add("norm",(xsec/sow)*genw*lumi) + for ch_name in ["2los_CRtt", "2los_ph", "2los_CR_Zg_ULttg"]: + weights_dict[ch_name] = copy.deepcopy(weights_obj_base) + weights_dict[ch_name].add("phoSF", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) + # Example of reweighting based on Ht #if "private" in histAxisName: # ht_sf = get_ht_sf(ht,histAxisName) @@ -293,52 +310,71 @@ def process(self, events): for k in sr_cat_dict: if k in cr_cat_dict: raise Exception(f"The key {k} is in both CR and SR dictionaries.") - for nlep_cat in cat_dict.keys(): - flav_ch = None - njet_ch = None - njets_any_mask = selections.any(*cat_dict[nlep_cat].keys()) - # Loop over the njets list for each channel - for njet_val in cat_dict[nlep_cat]: - - # Loop over the appropriate AR and SR for this channel - for appl in cat_dict[nlep_cat][njet_val]["appl_lst"]: - # Loop over the channels in each nlep cat (e.g. 
"3l_m_offZ_1b") - for lep_chan in cat_dict[nlep_cat][njet_val]["lep_chan_lst"]: - - # Loop over the lep flavor list for each channel - for lep_flav in cat_dict[nlep_cat][njet_val]["lep_flav_lst"]: - #cuts_lst = [chan, njet_val] - cuts_lst = [appl,lep_chan] - if dense_axis_name == "njets": - all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) - else: - njet_ch = njet_val - all_cuts_mask = (selections.all(*cuts_lst)) - ch_name = construct_cat_name(lep_chan,njet_str=njet_ch,flav_str=flav_ch) - - # Mask out the none values - isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) - isnotnone_mask = isnotnone_mask & all_cuts_mask - dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] - event_weight_cut = event_weight[isnotnone_mask] - eft_coeffs_cut = eft_coeffs - if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] - #eft_w2_coeffs_cut = eft_w2_coeffs - #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] - - # Fill the histos - axes_fill_info_dict = { - dense_axis_name : dense_axis_vals_cut, - "process" : histAxisName, - "appl" : appl, - "channel" : ch_name, - "systematic" : "nominal", - "weight" : event_weight_cut, - "eft_coeff" : eft_coeffs_cut, - #"eft_err_coeff" : eft_w2_coeffs_cut, - } - - hout[dense_axis_name].fill(**axes_fill_info_dict) + + # Set up the list of syst wgt variations to loop over + wgt_var_lst = ["nominal"] + if self._do_systematics: + if not isData: + wgt_var_lst = wgt_var_lst + wgt_correction_syst_lst + + for wgt_fluct in wgt_var_lst: + for nlep_cat in cat_dict.keys(): + flav_ch = None + njet_ch = None + njets_any_mask = selections.any(*cat_dict[nlep_cat].keys()) + weights_object = weights_dict[nlep_cat] + if (wgt_fluct == "nominal"): + # In the case of "nominal", or the jet energy systematics, no weight systematic variation is used + weight = weights_object.weight(None) + else: + # Otherwise get the weight from the Weights object + if wgt_fluct in weights_object.variations: + weight = weights_object.weight(wgt_fluct) + else: + # Note in this case there is no up/down fluct for this cateogry, so we don't want to fill a hist for it + continue + # Loop over the njets list for each channel + for njet_val in cat_dict[nlep_cat]: + + # Loop over the appropriate AR and SR for this channel + for appl in cat_dict[nlep_cat][njet_val]["appl_lst"]: + # Loop over the channels in each nlep cat (e.g. 
"3l_m_offZ_1b") + for lep_chan in cat_dict[nlep_cat][njet_val]["lep_chan_lst"]: + + # Loop over the lep flavor list for each channel + for lep_flav in cat_dict[nlep_cat][njet_val]["lep_flav_lst"]: + #cuts_lst = [chan, njet_val] + cuts_lst = [appl,lep_chan] + if dense_axis_name == "njets": + all_cuts_mask = (selections.all(*cuts_lst) & njets_any_mask) + else: + njet_ch = njet_val + all_cuts_mask = (selections.all(*cuts_lst)) + ch_name = construct_cat_name(lep_chan,njet_str=njet_ch,flav_str=flav_ch) + + # Mask out the none values + isnotnone_mask = (ak.fill_none((dense_axis_vals != None),False)) + isnotnone_mask = isnotnone_mask & all_cuts_mask + dense_axis_vals_cut = dense_axis_vals[isnotnone_mask] + event_weight_cut = weight[isnotnone_mask] + eft_coeffs_cut = eft_coeffs + if eft_coeffs is not None: eft_coeffs_cut = eft_coeffs[isnotnone_mask] + #eft_w2_coeffs_cut = eft_w2_coeffs + #if eft_w2_coeffs is not None: eft_w2_coeffs_cut = eft_w2_coeffs[isnotnone_mask] + + # Fill the histos + axes_fill_info_dict = { + dense_axis_name : dense_axis_vals_cut, + "process" : histAxisName, + "appl" : appl, + "channel" : ch_name, + "systematic" : wgt_fluct, + "weight" : event_weight_cut, + "eft_coeff" : eft_coeffs_cut, + #"eft_err_coeff" : eft_w2_coeffs_cut, + } + + hout[dense_axis_name].fill(**axes_fill_info_dict) return hout From 35ff202f7bab63b13964984eaf863ce8dde239e6 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 11:06:42 -0400 Subject: [PATCH 13/21] 1j -> 2j --- analysis/mc_validation/gen_photon_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index cbd0f3f45..af38181d9 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -245,7 +245,7 @@ def process(self, events): } sr_cat_dict = { "2los_ph" : { - "atleast_1j" : { + "atleast_3j" : { "lep_chan_lst" : ["2los_ph"], "lep_flav_lst" : ["ee", "mm", "em"], "appl_lst" : ["isSR_2lOS", "isAR_2lOS"], From 7ef3f207bb5f7bce08a9b9a054a3245531f0f5bb Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 11:17:18 -0400 Subject: [PATCH 14/21] Remove sow requirements --- analysis/mc_validation/run_gen_analysis.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/analysis/mc_validation/run_gen_analysis.py b/analysis/mc_validation/run_gen_analysis.py index 5a9c2b6c9..96524d78d 100644 --- a/analysis/mc_validation/run_gen_analysis.py +++ b/analysis/mc_validation/run_gen_analysis.py @@ -20,16 +20,16 @@ LST_OF_KNOWN_EXECUTORS = ["futures","work_queue"] WGT_VAR_LST = [ - "nSumOfWeights_ISRUp", - "nSumOfWeights_ISRDown", - "nSumOfWeights_FSRUp", - "nSumOfWeights_FSRDown", - "nSumOfWeights_renormUp", - "nSumOfWeights_renormDown", - "nSumOfWeights_factUp", - "nSumOfWeights_factDown", - "nSumOfWeights_renormfactUp", - "nSumOfWeights_renormfactDown", + #"nSumOfWeights_ISRUp", + #"nSumOfWeights_ISRDown", + #"nSumOfWeights_FSRUp", + #"nSumOfWeights_FSRDown", + #"nSumOfWeights_renormUp", + #"nSumOfWeights_renormDown", + #"nSumOfWeights_factUp", + #"nSumOfWeights_factDown", + #"nSumOfWeights_renormfactUp", + #"nSumOfWeights_renormfactDown", ] if __name__ == '__main__': From 66dbf09331c2caaa8f4e965b61779b05fbe1d1a0 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 11:23:48 -0400 Subject: [PATCH 15/21] Use CMS naming convention for photon SFs --- analysis/mc_validation/gen_photon_processor.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index af38181d9..4b6250daf 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -286,7 +286,7 @@ def process(self, events): weights_obj_base.add("norm",(xsec/sow)*genw*lumi) for ch_name in ["2los_CRtt", "2los_ph", "2los_CR_Zg_ULttg"]: weights_dict[ch_name] = copy.deepcopy(weights_obj_base) - weights_dict[ch_name].add("phoSF", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) + weights_dict[ch_name].add("CMS_eff_g", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) # Example of reweighting based on Ht #if "private" in histAxisName: From 8b3ecfa6b1be071436d8705ffcbac9982bc858d0 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Wed, 3 Jul 2024 11:27:35 -0400 Subject: [PATCH 16/21] More CMS naming for photon SFs --- analysis/mc_validation/gen_photon_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index 4b6250daf..d22171fc0 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -134,7 +134,7 @@ def process(self, events): # Define the lists of systematics we include wgt_correction_syst_lst = [ - "phoSFUp","phoSFDown" # Exp systs + "CMS_eff_gUp","CMS_eff_gDown" # Exp systs ] gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] From 41f125440a962d605ca4ebd32598693cc2ee4ffc Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 4 Jul 2024 10:16:51 -0400 Subject: [PATCH 17/21] Move things around again --- analysis/mc_validation/gen_photon_processor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index d22171fc0..926688d7a 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -123,12 +123,6 @@ def process(self, events): gen_m = genpart[is_final_mask & (abs(genpart.pdgId) == 13)] gen_t = genpart[is_final_mask & (abs(genpart.pdgId) == 15)] - # Object selection before padding - is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) - is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) - is2lOS = ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ((ak.num(gen_e) + ak.num(gen_m))==2) - is2lOS_em = is2lOS & ak.any(is_em | is_me) - gen_p = genpart[is_final_mask & (abs(genpart.pdgId) == 22)] ######### Systematics ########### @@ -143,6 +137,13 @@ def process(self, events): gen_m = gen_m[ak.argsort(gen_m.pt, axis=-1, ascending=False)] gen_t = gen_t[ak.argsort(gen_t.pt, axis=-1, ascending=False)] + # Object selection after padding + is_em = (np.abs(gen_l[:,0].pdgId)==11) & (np.abs(gen_l[:,1].pdgId)==13) + is_me = (np.abs(gen_l[:,1].pdgId)==11) & (np.abs(gen_l[:,0].pdgId)==13) + is2lOS = ak.any(gen_l[:,0].pdgId/np.abs(gen_l[:,1].pdgId)+gen_l[:,1].pdgId/np.abs(gen_l[:,1].pdgId)==0) & ((ak.num(gen_e) + ak.num(gen_m))==2) + is2lOS_em = is2lOS & ak.any(is_em | is_me) + + gen_p = ak.fill_none(ak.pad_none(gen_p,1), 0) gen_p = gen_p[ak.argsort(gen_p.pt, axis=-1, 
ascending=False)] gen_l_from_zg = ak.pad_none(gen_l[(gen_l.distinctParent.pdgId == 23) | (gen_l.distinctParent.pdgId == 22)], 2) From 7cab3c3f20611bb4ebe065d4be8ca4f774ea45d8 Mon Sep 17 00:00:00 2001 From: "Brent R. Yates" Date: Mon, 8 Jul 2024 10:05:11 -0400 Subject: [PATCH 18/21] Fix lint issue --- analysis/mc_validation/gen_photon_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index 926688d7a..7cbcffd91 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -166,7 +166,7 @@ def process(self, events): #selections.add("2los_CRtt", (events.is2l_nozeeveto & charge2l_0 & events.is_em & bmask_exactly2med & pass_trg)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement selections.add("2los_ph", is2lOS & (nbjets==2) & (ak.num(gen_p)>0) & (ak.firsts(gen_p).pt>20)) # Explicitly add the em requirement here, so we don't have to rely on running with _split_by_lepton_flavor turned on to enforce this requirement #selections.add("2los_sf_ph", (retainedbyOverlap & events.is2l & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask & events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & exactly_1ph)) - sfosz_2los_ttg_mask = tc_es.get_Z_peak_mask(gen_l[:,0:2],pt_window=15.0) + sfosz_2los_ttg_mask = tc_es.get_Z_peak_mask(gen_l[:,0:2],pt_window=15.0) Zllgamma_SF_mask = (abs( (gen_l[:,0] + gen_l[:,1] + gen_p[:,0]).mass -91.2) > 15) selections.add("2los_sf_Zg_CR_ULttg", (is2lOS_em & ~sfosz_2los_ttg_mask & Zllgamma_SF_mask & (nbjets==2) & atleast_1ph)) #selections.add("2los_sf_Zg_CR_ULttg", (retainedbyOverlap & events.is2l_nozeeveto & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask & ~events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & atleast_1ph)) From 917d3df93fe6101e60b4b53f6aea7d912c8442b2 Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Tue, 9 Jul 2024 09:39:46 -0400 Subject: [PATCH 19/21] Match processor naming for now --- analysis/mc_validation/gen_photon_processor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index 7cbcffd91..d0ea03ed0 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -61,7 +61,7 @@ def __init__(self, samples, wc_names_lst=[], hist_lst=None, ecut_threshold=None, "ht" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht", label=r"Scalar sum of genjet pt"), wc_names=wc_names_lst, rebin=False), "ht_clean" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(100, 0, 1000, name="ht_clean", label=r"Scalar sum of clean genjet pt"), wc_names=wc_names_lst, rebin=False), "tops_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="tops_pt", label=r"Pt of the sum of the tops"), wc_names=wc_names_lst, rebin=False), - "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"Pt of the sum of the photon"), wc_names=wc_names_lst, rebin=False), + "photon_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Variable([20,35,50,70,100,170,200,250,300], name="photon_pt", label=r"$p_{T}$ $\gamma$ 
(GeV)"), wc_names=wc_names_lst, rebin=False), "l0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="l0_pt", label=r"Pt of leading lepton"), wc_names=wc_names_lst, rebin=False), "j0_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(50, 0, 500, name="j0_pt", label=r"Pt of leading jet"), wc_names=wc_names_lst, rebin=False), "tX_pt" : HistEFT(proc_axis, chan_axis, syst_axis, appl_axis, hist.axis.Regular(40, 0, 400, name="tX_pt", label=r"Pt of the t(t)X system"), wc_names=wc_names_lst, rebin=False), @@ -128,7 +128,8 @@ def process(self, events): # Define the lists of systematics we include wgt_correction_syst_lst = [ - "CMS_eff_gUp","CMS_eff_gDown" # Exp systs + "phoSFUp","phoSFDown" # Exp systs + #"CMS_effUp","CMS_effDown" # Exp systs ] gen_l = gen_l[ak.argsort(gen_l.pt, axis=-1, ascending=False)] @@ -287,7 +288,8 @@ def process(self, events): weights_obj_base.add("norm",(xsec/sow)*genw*lumi) for ch_name in ["2los_CRtt", "2los_ph", "2los_CR_Zg_ULttg"]: weights_dict[ch_name] = copy.deepcopy(weights_obj_base) - weights_dict[ch_name].add("CMS_eff_g", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) + weights_dict[ch_name].add("phoSF", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) + #weights_dict[ch_name].add("CMS_eff_g", ak.ones_like(ak.firsts(gen_p).pt), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape), np.random.rand(*ak.to_numpy(ak.firsts(gen_p).pt).shape)) # Example of reweighting based on Ht #if "private" in histAxisName: From 95fd95bacad0ae5959a8ac1eda9341cccd98ac3e Mon Sep 17 00:00:00 2001 From: Brent Yates Date: Thu, 11 Jul 2024 15:15:30 -0400 Subject: [PATCH 20/21] Update run script --- analysis/mc_validation/run_gen_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/run_gen_analysis.py b/analysis/mc_validation/run_gen_analysis.py index 96524d78d..d28565152 100644 --- a/analysis/mc_validation/run_gen_analysis.py +++ b/analysis/mc_validation/run_gen_analysis.py @@ -258,7 +258,7 @@ def LoadJsonToSampleName(jsonFile, prefix): extra_pip_local = {"topeft": ["topeft", "setup.py"]}, ), 'filepath': f'/project01/ndcms/{os.environ["USER"]}', - 'extra_input_files': ["mc_validation_gen_processor.py"], + 'extra_input_files': ["gen_photon_processor.py"], 'retries': 5, From bc9d0b76953a0a516af33fc79f0bb943fca34657 Mon Sep 17 00:00:00 2001 From: "Brent R. 
Yates" Date: Mon, 18 Nov 2024 13:47:40 -0500 Subject: [PATCH 21/21] CR invert ZllGamma mask --- analysis/mc_validation/gen_photon_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/mc_validation/gen_photon_processor.py b/analysis/mc_validation/gen_photon_processor.py index d0ea03ed0..ab4f9af4b 100644 --- a/analysis/mc_validation/gen_photon_processor.py +++ b/analysis/mc_validation/gen_photon_processor.py @@ -169,7 +169,7 @@ def process(self, events): #selections.add("2los_sf_ph", (retainedbyOverlap & events.is2l & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask & events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & exactly_1ph)) sfosz_2los_ttg_mask = tc_es.get_Z_peak_mask(gen_l[:,0:2],pt_window=15.0) Zllgamma_SF_mask = (abs( (gen_l[:,0] + gen_l[:,1] + gen_p[:,0]).mass -91.2) > 15) - selections.add("2los_sf_Zg_CR_ULttg", (is2lOS_em & ~sfosz_2los_ttg_mask & Zllgamma_SF_mask & (nbjets==2) & atleast_1ph)) + selections.add("2los_sf_Zg_CR_ULttg", (is2lOS_em & ~sfosz_2los_ttg_mask & ~Zllgamma_SF_mask & (nbjets==2) & atleast_1ph)) #selections.add("2los_sf_Zg_CR_ULttg", (retainedbyOverlap & events.is2l_nozeeveto & charge2l_0 & (events.is_ee | events.is_mm) & ~sfosz_2los_ttg_mask & ~events.mask_SF_Zllgamma & bmask_atleast1med & pass_trg & atleast_1ph))