-
Notifications
You must be signed in to change notification settings - Fork 3
/
ttZ_2l_Analysis.py
2000 lines (1612 loc) · 119 KB
/
ttZ_2l_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# # Machine Learning for ttZ Opposite-sign dilepton analysis
# This notebook uses ATLAS Open Data http://opendata.atlas.cern to show you the steps to implement Machine Learning in the ttZ Opposite-sign dilepton analysis, following the ATLAS published paper [Measurement of the ttZ and ttW cross sections in proton-proton collisions at sqrt{s} = 13 TeV with the ATLAS detector](https://journals.aps.org/prd/pdf/10.1103/PhysRevD.99.072009).
#
# The whole code takes a few hours to follow through.
#
# Notebooks are a perfect platform to develop Machine Learning for your work, since you'll need exactly those 3 things: code, visualisations and narrative text!
#
# We're interested in Machine Learning because we can design an algorithm to figure out for itself how to do various analyses, potentially saving us countless human-hours of design and analysis work.
#
# Machine Learning use within ATLAS includes:
# * particle tracking
# * particle identification
# * signal/background classification
# * and more!
#
# This code will focus on signal/background classification.
#
# By the end of this code you will be able to:
# 1. run Boosted Decision Trees to classify signal and background
# 2. know some things you can change to improve your Boosted Decision Tree
#
# ## Introduction (from Section 1)
#
# Properties of the top quark have been explored by the
# Large Hadron Collider (LHC) and previous collider experiments in great detail.
#
# Other properties of the top quark are
# now becoming accessible, owing to the large centerof-mass energy and luminosity at the LHC.
#
# Measurements of top-quark pairs in association with a Z boson (ttZ) provide a direct probe of the
# weak couplings of the top quark. These couplings
# may be modified in the presence of physics beyond the
# Standard Model (BSM). Measurements of the ttZ production cross sections, sigma_{ttZ}, can be used to
# set constraints on the weak couplings of the top quark.
#
# The production of ttZ is often an important
# background in searches involving final states with multiple
# leptons and b-quarks. These processes also constitute an
# important background in measurements of the associated
# production of the Higgs boson with top quarks.
#
# This paper presents measurements of the ttZ cross section using proton–proton (pp) collision data
# at a center-of mass energy sqrt{s} = 13 TeV.
#
# The final states of top-quark pairs produced in association with a
# Z boson contain up to four isolated, prompt leptons. In this analysis, events with two opposite-sign
# (OS) leptons are considered. The dominant backgrounds
# in this channel are Z+jets and ttbar,
#
# (In this paper, lepton is used to denote electron or muon, and prompt lepton is used to denote a lepton produced in a Z or W
# boson decay, or in the decay of a τ-lepton which arises from a Z or W boson decay.)
# ## Data and simulated samples (from Section 3)
#
# The data were collected with the ATLAS detector at a proton–proton (pp) collision
# energy of 13 TeV.
#
# Monte Carlo (MC) simulation samples are used to model the expected signal and background distributions
# in the different control, validation and signal regions described below. All samples were processed through the
# same reconstruction software as used for the data.
# ## Opposite-sign dilepton analysis (from Section 5A)
#
# The OS dilepton analysis targets the ttZ process, where both top quarks decay hadronically and the Z boson
# decays to a pair of leptons (electrons or muons). Events are required to have exactly two OSSF leptons.
# Events with additional isolated leptons are rejected. The invariant mass of the lepton pair is required to be in the Z boson mass window, |m_{ll} − m_Z| < 10 GeV. The leading (subleading) lepton is required to have a
# transverse momentum of at least 30 (15) GeV.
#
# The OS dilepton analysis is affected by large backgrounds from Z+jets or ttbar production, both characterized
# by the presence of two leptons. In order to improve the signal-to-background ratio and constrain these
# backgrounds from data, three separate analysis regions are considered, depending on the number of
# jets (n_jets) and number of b-tagged jets (n_b-tags): 2l-Z-5j2b, 2l-Z-6j1b and 2l-Z-6j2b. The signal region
# requirements are summarized in Table 1 below.
# | Variable | 2l-Z-6j1b | 2l-Z-5j2b | 2l-Z-6j2b |
# |-----------------------------|-------------------------------------|-------------------------------------|-------------------------------------|
# | Leptons | = 2, same flavour and opposite sign | = 2, same flavour and opposite sign | = 2, same flavour and opposite sign |
# | m_ll | |m_ll − m_Z | < 10 GeV | |m_ll − m_Z |$ < 10 GeV | |m_ll − m_Z | < 10 GeV |
# | p_T (leading lepton) | > 30 GeV | > 30 GeV | > 30 GeV |
# | p_T (subleading lepton) | > 15 GeV | > 15 GeV | > 15 GeV |
# | n_b-tags | 1 | >=2 | >=2 |
# | n_jets | >=6 | 5 | >=6 |
#
# Table 1: Summary of the event selection requirements in the OS dilepton signal regions.
#
# This is Table 2 of the ATLAS published paper [Measurement of the ttZ and ttW cross sections in proton-proton collisions at sqrt{s} = 13 TeV with the ATLAS detector](https://journals.aps.org/prd/pdf/10.1103/PhysRevD.99.072009).
#
# In signal region 2l-Z-5j2b, exactly five jets
# are required, of which at least two must be b-tagged. In 2l-Z-6j1b (2l-Z-6j2b), at least six jets are required with
# exactly one (at least two) being b-tagged jets.
# Contents:
#
# To setup everytime
# Lumi, fraction, file path
# Samples to process
# Get data from files
# Find the good jets!
# Find the good leptons!
# Let's calculate some variables
# Can we process the data yet?!
# Samples to plot
# Function to plot Data/MC
#
# Boosted Decision Tree (BDT) in 6j2b Region
# Training and Testing split
# Training
# Signal Region plot
#
# Boosted Decision Tree (BDT) in 5j2b Region
# Training and Testing split
# Training
# Signal Region plot
#
# Boosted Decision Tree (BDT) in 6j1b Region
# Training and Testing split
# Training
# Signal Region plot
#
# Control Region plots
# 6j2b Control Region plot
# 5j2b Control Region plot
# 6j1b Control Region plot
#
# Data-driven ttbar estimate
# Function to plot data from histograms
# 6j2b Signal Region plot
# 5j2b Signal Region plot
# 6j1b Signal Region plot
#
# BDT feature importances
#
# Going further
# ## To setup everytime
#
# We're going to be using a number of tools to help us:
# * uproot: lets us read .root files typically used in particle physics into data formats used in Machine Learning
# * pandas: lets us store data as dataframes, a format widely used in Machine Learning
# * numpy: provides numerical calculations such as histogramming
# * matplotlib: common tool for making plots, figures, images, visualisations
import pandas as pd # to store data as dataframes
import numpy as np # for numerical calculations such as histogramming
import uproot # to read .root files as dataframes
import matplotlib.pyplot as plt # for plotting
from matplotlib.lines import Line2D # for dashed line in legend
from matplotlib.ticker import AutoMinorLocator,MaxNLocator # for minor ticks and forcing integer tick labels
import infofile # local file containing cross-sections and sum of weights
#import ttZ_2l_Samples # local file containing samples to process
#import ttZ_2l_Cuts # local file containing cuts to implement
# ## Lumi, fraction, file path
#
# General definitions of fraction of data used, where to access the input files.
#
# The code will run quicker if you download the input files to your computer, rather than reading them through the web.
#
# 1. download by navigating to https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/exactly2lep/
# 2. Make a folder called exactly2lep inside notebooks-collection-opendata/13-TeV-examples/uproot_python/
# 3. Unzip the downloaded file in "exactly2lep"
# 4. Uncomment tuple_path = "exactly2lep/"
# 5. Comment out tuple_path = "https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/exactly2lep/"
#lumi = 0.5 # 0.5 fb-1 for data_A
#lumi = 1.9 # 1.9 fb-1 for data_B
#lumi = 2.9 # 2.9 fb-1 for data_C
#lumi = 4.7 # 4.7 fb-1 for data_D
lumi = 9.5 # 9.5 fb-1 for data_B+C+D
#lumi = 10 # 10 fb-1 for data_A+B+C+D
fraction = 0.9 # fraction of data to use, increase this for better results
MC_to_data_ratio = 0.1 # fraction of simulated data to use, increase this for better results
tuple_path = "/eos/project/a/atlas-outreach/projects/open-data/OpenDataTuples/renamedLargeRJets/exactly2lep/" # local
#tuple_path = "https://atlas-opendata.web.cern.ch/atlas-opendata/samples/2020/exactly2lep/" # web address
# ## Samples to process
#
# In this code we only process the background samples that have a significant contribution. You can add in the uncommented background samples later if you wish.
samples = {
'data': {
'list' : [
#'data_A', # period A from data16
'data_B', # period B from data16
'data_C', # period C from data16
'data_D' # period D from data16
]
},
r'$t\bar{t}Z$' : { # ttZ(->ee) and ttZ(->μμ) signal (associated production of a top-quark pair with one vector boson)
'list' : [
'ttee',
'ttmumu'
]
},
r'$t\bar{t}$' : { # ttbar semileptonic / all-leptonic
'list' : ['ttbar_lep']
},
'Z' : { # Z+jets (Events containing Z bosons with associated jets)
'list' : [
#'Zmumu_PTV0_70_CVetoBVeto', # Z->μμ + jets with 0 < pT(Z) < 70 GeV whilst vetoing c and b-jets
#'Zmumu_PTV0_70_CFilterBVeto', # Z->μμ + jets with 0 < pT(Z) < 70 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Zmumu_PTV0_70_BFilter', # Z->μμ + jets with 0 < pT(Z) < 70 GeV and a requirement for b-jets
#'Zmumu_PTV70_140_CVetoBVeto', # Z->μμ + jets with 70 < pT(Z) < 140 GeV whilst vetoing c and b-jets
#'Zmumu_PTV70_140_CFilterBVeto', # Z->μμ + jets with 70 < pT(Z) < 140 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Zmumu_PTV70_140_BFilter', # Z->μμ + jets with 70 < pT(Z) < 140 GeV and a requirement for b-jets
#'Zmumu_PTV140_280_CVetoBVeto', # Z->μμ + jets with 140 < pT(Z) < 280 GeV whilst vetoing c and b-jets
'Zmumu_PTV140_280_CFilterBVeto', # Z->μμ + jets with 140 < pT(Z) < 280 GeV and a requirement for c-jets, whilst vetoing b-jets
'Zmumu_PTV140_280_BFilter', # Z->μμ + jets with 140 < pT(Z) < 280 GeV and a requirement for b-jets
#'Zmumu_PTV280_500_CVetoBVeto', # Z->μμ + jets with 280 < pT(Z) < 500 GeV whilst vetoing c and b-jets
'Zmumu_PTV280_500_CFilterBVeto', # Z->μμ + jets with 280 < pT(Z) < 500 GeV and a requirement for c-jets, whilst vetoing b-jets
'Zmumu_PTV280_500_BFilter', # Z->μμ + jets with 280 < pT(Z) < 500 GeV and a requirement for b-jets
'Zmumu_PTV500_1000', # Z->μμ + jets, with 500 < pT(Z) < 1000 GeV
#'Zmumu_PTV1000_E_CMS', # Z->μμ + jets with 1000 GeV < pT(Z) < centre-of-mass energy
#'Zee_PTV0_70_CVetoBVeto', # Z->ee + jets with 0 < pT(Z) < 70 GeV whilst vetoing c and b-jets
#'Zee_PTV0_70_CFilterBVeto', # Z->ee + jets with 0 < pT(Z) < 70 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Zee_PTV0_70_BFilter', # Z->ee + jets with 0 < pT(Z) < 70 GeV and a requirement for b-jets
#'Zmumu_PTV70_140_CVetoBVeto', # Z->ee + jets with 70 < pT(Z) < 140 GeV whilst vetoing c and b-jets
#'Zee_PTV70_140_CFilterBVeto', # Z->ee + jets with 70 < pT(Z) < 140 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Zee_PTV70_140_BFilter', # Z->ee + jets with 70 < pT(Z) < 140 GeV and a requirement for b-jets
#'Zee_PTV140_280_CVetoBVeto', # Z->ee + jets with 140 < pT(Z) < 280 GeV whilst vetoing c and b-jets
'Zee_PTV140_280_CFilterBVeto', # Z->ee + jets with 140 < pT(Z) < 280 GeV and a requirement for c-jets, whilst vetoing b-jets
'Zee_PTV140_280_BFilter', # Z->ee + jets with 140 < pT(Z) < 280 GeV and a requirement for b-jets
#'Zee_PTV280_500_CVetoBVeto', # Z->μμ + jets with 280 < pT(Z) < 500 GeV whilst vetoing c and b-jets
'Zee_PTV280_500_CFilterBVeto', # Z->ee + jets with 280 < pT(Z) < 500 GeV and a requirement for c-jets, whilst vetoing b-jets
'Zee_PTV280_500_BFilter', # Z->ee + jets with 280 < pT(Z) < 500 GeV and a requirement for b-jets
'Zee_PTV500_1000', # Z->ee + jets with 500 < pT(Z) < 1000 GeV
#'Zee_PTV1000_E_CMS', # Z->ee + jets with 1000 GeV < pT(Z) < centre-of-mass energy
]
},
'Other' : { # background with at least 2 prompt leptons, other than Z+jets and ttbar
'list' : [
'ZqqZll', # Z(->qq)Z(->ll)
#'single_top_wtchan', # Wt
#'single_antitop_wtchan', # Wt
#'lllv', # W(->lv)Z(->ll)
#'ttW',
#'llll', # ZZ->llll
#'llvv', # ZZ->llvv and WW->lvlv
#'Ztautau_PTV0_70_CVetoBVeto', # Z->ττ + jets with 0 < pT(Z) < 70 GeV whilst vetoing c and b-jets
#'Ztautau_PTV0_70_CFilterBVeto', # Z->ττ + jets with 0 < pT(Z) < 70 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Ztautau_PTV0_70_BFilter', # Z->ττ + jets with 0 < pT(Z) < 70 GeV and a requirement for b-jets
#'Ztautau_PTV70_140_CVetoBVeto', # Z->μμ + jets with 70 < pT(Z) < 140 GeV whilst vetoing c and b-jets
#'Ztautau_PTV70_140_CFilterBVeto', # Z->ττ + jets with 70 < pT(Z) < 140 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Ztautau_PTV70_140_BFilter', # Z->ττ + jets with 70 < pT(Z) < 140 GeV and a requirement for b-jets
#'Ztautau_PTV140_280_CVetoBVeto', # Z->μμ + jets with 140 < pT(Z) < 280 GeV whilst vetoing c and b-jets
#'Ztautau_PTV140_280_CFilterBVeto', # Z->ττ + jets with 140 < pT(Z) < 280 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Ztautau_PTV140_280_BFilter', # Z->ττ + jets with 140 < pT(Z) < 280 GeV and a requirement for b-jets
#'Ztautau_PTV280_500_CVetoBVeto', # Z->μμ + jets with 280 < pT(Z) < 500 GeV whilst vetoing c and b-jets
#'Ztautau_PTV280_500_CFilterBVeto', # Z->ττ + jets with 280 < pT(Z) < 500 GeV and a requirement for c-jets, whilst vetoing b-jets
#'Ztautau_PTV280_500_BFilter', # Z->ττ + jets with 280 < pT(Z) < 500 GeV and a requirement for b-jets
#'Ztautau_PTV500_1000', # Z->ττ + jets with 500 < pT(Z) < 1000 GeV
#'Ztautau_PTV1000_E_CMS' # Z->ττ + jets with 1000 GeV < pT(Z) < centre-of-mass energy
]
},
}
# ## Get data from files
# define function to get data from files
#
# The datasets used in this code have already been filtered to include exactly 2 leptons per event, so that processing is quicker.
def get_data_from_files(samples):
data = {} # define empty dictionary to hold dataframes
for s in samples: # loop over samples
print('Processing '+s+' samples') # print which sample
frames = [] # define empty list to hold data
for val in samples[s]['list']: # loop over each file
if s == 'data': prefix = "Data/" # Data prefix
else: prefix = "MC/mc_"+str(infofile.infos[val]["DSID"])+"." # MC prefix
fileString = tuple_path+prefix+val+".exactly2lep.root" # file name to open
temp = read_file(fileString,val) # call the function read_file defined below
if 'data' in val: # processing data file
print('\t\tnumber of events ',len(temp)) # print total number of events for file
else: # processing Monte Carlo simulation file
print('\t\tsum of weights ',sum(temp['totalWeight'])) # print total weight for file
frames.append(temp) # append dataframe returned from read_file to list of dataframes
data[s] = pd.concat(frames) # dictionary entry is concatenated dataframes
return data # return dictionary of dataframes
# define function to calculate weight of MC event
#
# paper: "Simulated events were corrected so that the object
# identification, reconstruction and trigger efficiencies, energy scales and energy resolutions match those
# determined from data control samples."
def get_xsec_weight(sample):
info = infofile.infos[sample] # get infofile containing cross-sections, sums of weights, dataset IDs
return (lumi*1000*info["xsec"]/info["sumw"])/MC_to_data_ratio #*1000 to go from fb-1 to pb-1
def calc_weight(xsec_weight,mcWeight,scaleFactor_PILEUP,scaleFactor_ELE,
scaleFactor_MUON, scaleFactor_LepTRIGGER
#, scaleFactor_BTAG # uncomment this to get better Data vs MC match
):
return xsec_weight*mcWeight*scaleFactor_PILEUP*scaleFactor_ELE*scaleFactor_MUON*scaleFactor_LepTRIGGER#*scaleFactor_BTAG
# ## Find the good jets!
# define functions to find jets passing some minimum requirements
def find_good_jet_indices(jet_pt,jet_jvt,
#,jet_eta # uncomment this when you want to impose stricter requirements on jets
): # get indices of good jets
good_jets = [] # list to hold whether jets are good
for i in range(len(jet_pt)): # loop over jets
if jet_pt[i]>25000: # paper: "Jets are accepted if they fulfill the requirements pT > 25 GeV"
# paper: jets with pT < 60 GeV and |η| < 2.4 are required to satisfy pileup rejection criteria (JVT)
if jet_pt[i]<60000: # extra requirements for pt < 60 GeV and |η|<2.4
#if jet_pt[i]<60000 and abs(jet_eta[i])<2.4: # extra requirements for pt < 60 GeV and |η|<2.4
if jet_jvt[i]<0.59:
good_jets.append(0) # if jvt<0.59, this isn't a good jet
continue # move onto next jet
good_jets.append(1) # append good jet if gets here
continue # move onto next jet
good_jets.append(0) # if pt<25000, this isn't a good jet
string_ints = [str(i) for i in good_jets] # Convert each integer to a string
str_of_ints = "".join(string_ints) # Combine each string
return str_of_ints # return list of whether jets are good
# calculate number of good jets
def calc_goodjet_n(good_jets): # jet indices
return good_jets.count('1') # count number of 1 in good jets list
# selection on number of good jets
def select_good_jet_n(good_jets):
return good_jets.count('1')<5 # throw away if fewer than 5 good jets
# define functions to calculate number of b-jets
# calculate number of (good) b-jets
def calc_bjet_n(jet_MV2c10,good_jets): # MV2c10 scores and jet indices
bjet_n = 0 # start counter for number of b-jets
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
for i in good_jets_indices: # loop over good jets
if jet_MV2c10[i]>0.6459: # for 77% b-tag efficiency from https://cds.cern.ch/record/2160731/files/ATL-PHYS-PUB-2016-012.pdf Table 2
bjet_n+=1 # increment the counter for the number of b-jets
return bjet_n # return the number of b-jets
# selection on number of b-jets
def select_bjet_n(bjet_n):
# throw away if fewer than 1 b-jets
return bjet_n<1
# ## Find the good leptons!
# define function to calculate dilepton invariant mass (mll)
def calc_inv_mass_pair(pt_1,eta_1,phi_1,E_1,pt_2,eta_2,phi_2,E_2): # pt,eta,phi,energy of 2 objects
px_1 = pt_1*np.cos(phi_1) # x-momentum of object 1
py_1 = pt_1*np.sin(phi_1) # y-momentum of object 1
pz_1 = pt_1*np.sinh(eta_1) # z-momentum of object 1
px_2 = pt_2*np.cos(phi_2) # x-momentum of object 2
py_2 = pt_2*np.sin(phi_2) # y-momentum of object 2
pz_2 = pt_2*np.sinh(eta_2) # z-momentum of object 2
sumpx = px_1 + px_2 # x-momentum of combined system
sumpy = py_1 + py_2 # y-momentum of combined system
sumpz = pz_1 + pz_2 # z-momentum of combined system
sump = np.sqrt(sumpx**2 + sumpy**2 + sumpz**2) # total momentum of combined system
sumE = E_1 + E_2 # total energy of combined system
return np.sqrt(sumE**2 - sump**2)/1000 # /1000 to go from MeV to GeV
# calculate dilepton invariant mass
def calc_mll(lep_pt,lep_eta,lep_phi,lep_E): # lepton pt,eta,phi,energy
return calc_inv_mass_pair(lep_pt[0],lep_eta[0],lep_phi[0],lep_E[0],lep_pt[1],lep_eta[1],lep_phi[1],lep_E[1])
# define lepton selections for Opposite-sign dilepton analysis. Information is taken from Section 4. Object Reconstruction of the ATLAS published paper [Measurement of the ttZ and ttW cross sections in proton-proton collisions at sqrt{s} = 13 TeV with the ATLAS detector](https://journals.aps.org/prd/pdf/10.1103/PhysRevD.99.072009).
# selection on nth good lepton
def select_leptons(mll, lep_ptcone30, lep_pt, lep_type, lep_etcone20, lep_tracksigd0pvunbiased, lep_eta,
lep_charge
#,lep_z0 # uncomment this to apply stricter requirements
): # variables for good lepton
# paper: "invariant mass of the lepton pair is required to be in the Z boson mass window, |mll − mZ| < 10 GeV"
if (mll < 81.12) or (mll > 101.12): return True
# paper: "total sum of [...] transverse momenta in a surrounding cone [...] is required to be less than 6% of [...] pT"
if lep_ptcone30[0]>0.06*lep_pt[0] or lep_ptcone30[1]>0.06*lep_pt[1]: return True # bad lepton if ptcone>6%pt
# paper: "sum of [...] transverse energies [...] within a cone of size ∆Rη = 0.2 around any electron [...] is required to be less than 6% of [...] pT"
if lep_type[0]==11 and lep_etcone20[0]>0.06*lep_pt[0]: return True # bad electron if etcone>6%pt
if lep_type[1]==11 and lep_etcone20[1]>0.06*lep_pt[1]: return True # bad electron if etcone>6%pt
# paper: "significance of [...] d0 is required to satisfy |d0|/σ(d0) < 5 for electrons and |d0|/σ(d0) < 3 for muons"
if lep_type[0]==13 and lep_tracksigd0pvunbiased[0]>3: return True # bad muon if σ(d0)>3
if lep_type[1]==13 and lep_tracksigd0pvunbiased[1]>3: return True # bad muon if σ(d0)>3
if lep_tracksigd0pvunbiased[0]>5 or lep_tracksigd0pvunbiased[1]>5: return True # bad electron if σ(d0)>5
# paper Table 2: pT (leading lepton) > 30 GeV and pT (subleading lepton) > 15 GeV
if lep_pt[0]<30000 or lep_pt[1]<15000: return True # minimum pt requirments on leptons
# paper: "Muons are required to have |η| < 2.5"
if abs(lep_eta[0])>2.5 or abs(lep_eta[1])>2.5: return True # bad lepton if |η|>2.5
# paper: "Opposite-sign"
if lep_charge[0]+lep_charge[1]!=0: return True # throw away when charges don't add to 0
# paper: "longitudinal impact parameter [...], z0, is required to satisfy |z0 sinθ| < 0.5 mm"
#theta_i = 2*np.arctan(np.exp(-lep_eta[0])) # calculate theta angle
#if abs(lep_z0[0]*np.sin(theta_i))>0.5: return True # bad lepton if z0*sinθ > 0.5mm
#theta_i = 2*np.arctan(np.exp(-lep_eta[1])) # calculate theta angle
#if abs(lep_z0[1]*np.sin(theta_i))>0.5: return True # bad lepton if z0*sinθ > 0.5mm
return False # don't throw away event if gets here
# define function to return which channel. 121 is ee, 169 is mumu, 143 is emu.
# return number to represent which channel
def calc_channel(lep_type):
return lep_type[0]*lep_type[1]
# define functions to calculate invariant mass, ∆R and pT
def calc_inv_mass_triplet(pt_1,eta_1,phi_1,E_1,pt_2,eta_2,phi_2,E_2,pt_3,eta_3,phi_3,E_3): # pt,eta,phi,energy of 3 objects
px_1 = pt_1*np.cos(phi_1) # x-momentum of object 1
py_1 = pt_1*np.sin(phi_1) # y-momentum of object 1
pz_1 = pt_1*np.sinh(eta_1) # z-momentum of object 1
px_2 = pt_2*np.cos(phi_2) # x-momentum of object 2
py_2 = pt_2*np.sin(phi_2) # y-momentum of object 2
pz_2 = pt_2*np.sinh(eta_2) # z-momentum of object 2
px_3 = pt_3*np.cos(phi_3) # x-momentum of object 3
py_3 = pt_3*np.sin(phi_3) # y-momentum of object 3
pz_3 = pt_3*np.sinh(eta_3) # z-momentum of object 3
sumpx = px_1 + px_2 + px_3 # x-momentum of combined system
sumpy = py_1 + py_2 + py_3 # y-momentum of combined system
sumpz = pz_1 + pz_2 + pz_3 # z-momentum of combined system
sump = np.sqrt(sumpx**2 + sumpy**2 + sumpz**2) # total momentum of combined system
sumE = E_1 + E_2 + E_3 # total energy of combined system
return np.sqrt(sumE**2 - sump**2)/1000 # /1000 to go from MeV to GeV
def calc_delta_R(eta_1,phi_1,eta_2,phi_2): # eta,phi of 2 objects
delta_eta = eta_1-eta_2 # Δη between the 2 objects
delta_phi = phi_1-phi_2 # Δϕ between the 2 objects
if delta_phi >= np.pi: delta_phi -= 2*np.pi # use π periodicity to get number between -π and π
elif delta_phi < -np.pi: delta_phi += 2*np.pi # use π periodicity to get number between -π and π
return np.sqrt(delta_eta**2 + delta_phi**2) # return ∆R for this object
def calc_pT_triplet(pt_1,eta_1,phi_1,pt_2,eta_2,phi_2,pt_3,eta_3,phi_3): # pt,eta,phi of 3 objects
px_1 = pt_1*np.cos(phi_1) # x-momentum of object 1
py_1 = pt_1*np.sin(phi_1) # y-momentum of object 1
px_2 = pt_2*np.cos(phi_2) # x-momentum of object 2
py_2 = pt_2*np.sin(phi_2) # y-momentum of object 2
px_3 = pt_3*np.cos(phi_3) # x-momentum of object 3
py_3 = pt_3*np.sin(phi_3) # y-momentum of object 3
pz_3 = pt_3*np.sinh(eta_3) # z-momentum of object 3
sumpx = px_1 + px_2 + px_3 # x-momentum of combined system
sumpy = py_1 + py_2 + py_3 # y-momentum of combined system
return np.sqrt(sumpx**2 + sumpy**2)/1000 # /1000 to go from MeV to GeV
# define functions to calculate variables for 2b6j BDT
# functin to calculate pt of the lepton pair
def calc_ptll(lep_pt,lep_phi): # pt of dilepton system
px_0 = lep_pt[0]*np.cos(lep_phi[0]) # x-momentum of lep_0
py_0 = lep_pt[0]*np.sin(lep_phi[0]) # y-momentum of lep_0
px_1 = lep_pt[1]*np.cos(lep_phi[1]) # x-momentum of lep_1
py_1 = lep_pt[1]*np.sin(lep_phi[1]) # y-momentum of lep_1
sumpx = px_0 + px_1 # x-momentum of dilepton system
sumpy = py_0 + py_1 # y-momentum of dilepton system
return np.sqrt(sumpx**2 + sumpy**2)/1000 #/1000 to go from MeV to GeV
# function to calculate pt of ith jet
def calc_pti_jet(jet_pt,good_jets,index): # jet pt and index
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
if index<len(good_jets_indices) and good_jets_indices[index]<len(jet_pt):
return jet_pt[good_jets_indices[index]]/1000 # /1000 to go from MeV to GeV
else: return 0 # 0 if there's no jet at this index
# function to calculate ∆R between the two leptons
def calc_dRll(lep_eta, lep_phi): # Δη and Δϕ between leptons
return calc_delta_R(lep_eta[0], lep_phi[0], lep_eta[1], lep_phi[1]) # ∆R between the 2 leptons
# function to calculate number of jet pairs with mass within a window of 30 GeV around 85 GeV (~Z/W mass)
def calc_NJetPairsZMass(jet_pt,jet_eta,jet_phi,jet_E, # jet pt,eta,phi,energy
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
NJetPairsZMass = 0 # start counter for number of jet pairs within a window of 30 GeV around 85 GeV
for i in good_jets_indices[:8]: # loop over good jets
for j in good_jets_indices[i+1:8]: # loop over remaining good jets
# call function calc_inv_mass_pair defined above to get jet-pair invariant mass
Vmass = calc_inv_mass_pair(jet_pt[i],jet_eta[i],jet_phi[i],jet_E[i],
jet_pt[j],jet_eta[j],jet_phi[j],jet_E[j])
if abs(Vmass-85)<30: # within 30 GeV of 85 GeV
NJetPairsZMass += 1 # increment counter for number of jet pairs within a window of 30 GeV around 85 GeV
return NJetPairsZMass # return number of jet pairs within a window of 30 GeV around 85 GeV
# function to calculate number of top-quark candidates
def calc_Nmbjj_top(jet_pt,jet_eta,jet_phi,jet_E,jet_MV2c10, # pt,eta,phi,energy,MV2c10
good_jets): # good jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
Nmbjj_top = 0 # start counter for number of top-quark candidates
for i in good_jets_indices[:8]: # loop over good jets
for j in good_jets_indices[i+1:8]: # loop over remaining good jets
for k in good_jets_indices[j+1:8]: # loop over remaining good jets
# number of b-jets in this 3-jet system
n_bjets = int(jet_MV2c10[i]>0.6459)+int(jet_MV2c10[j]>0.6459)+int(jet_MV2c10[k]>0.6459)
if n_bjets==1: # only 1 b-jet for top quark candidate
if jet_MV2c10[i]>0.6459: # if i is the b-jet
b = i # assign index i to b
j1 = j # assign index j to j1
j2 = k # assing index k to j2
elif jet_MV2c10[j]>0.6459: # if j is is the b-jet
b = j # assign index j to b
j1 = i # assign index i to j1
j2 = k # assign index k to j2
else: # if k is the b-jet
b = k # assign index k to b
j1 = i # assign index i to j1
j2 = j # assign index j to j2
px_b = jet_pt[b]*np.cos(jet_phi[b]) # x-momentum of the b-jet
py_b = jet_pt[b]*np.sin(jet_phi[b]) # y-momentum of the b-jet
pz_b = jet_pt[b]*np.sinh(jet_eta[b]) # z-momentum of the b-jet
px_j1 = jet_pt[j1]*np.cos(jet_phi[j1]) # x-momentum of (non-b) jet 1
py_j1 = jet_pt[j1]*np.sin(jet_phi[j1]) # y-momentum of (non-b) jet 1
pz_j1 = jet_pt[j1]*np.sinh(jet_eta[j1]) # z-momentum of (non-b) jet 1
px_j2 = jet_pt[j2]*np.cos(jet_phi[j2]) # x-momentum of (non-b) jet 2
py_j2 = jet_pt[j2]*np.sin(jet_phi[j2]) # y-momentum of (non-b) jet 2
pz_j2 = jet_pt[j2]*np.sinh(jet_eta[j2]) # z-momentum of (non-b) jet 2
sumpx_t = px_b + px_j1 + px_j2 # x-momentum of 3-jet system
sumpy_t = py_b + py_j1 + py_j2 # y-momentum of 3-jet system
sumpz_t = pz_b + pz_j1 + pz_j2 # z-momentum of 3-jet system
sump_t = np.sqrt(sumpz_t**2 + sumpx_t**2 + sumpy_t**2) # total momentum of 3-jet system
sumE_t = jet_E[i] + jet_E[j] + jet_E[k] # total energy of 3-jet system
M_bjj = np.sqrt(sumE_t**2 - sump_t**2)/1000 # mass of 3-jet system
sumpx_W = px_j1 + px_j2 # x-momentum of W candidate
sumpy_W = py_j1 + py_j2 # y-momentum of W candidate
sumpz_W = pz_j1 + pz_j2 # z-momentum of W candidate
sump_W = np.sqrt(sumpz_W**2 + sumpx_W**2 + sumpy_W**2) # total momentum of W candidate
sumE_W = jet_E[j1] + jet_E[j2] # total energy of W candidate
M_W = np.sqrt(sumE_W**2 - sump_W**2)/1000 # mass of W candidate
if abs(M_bjj - 172.5)<15 and abs(M_W - 80)<15: # within 15 GeV of top and W mass windows
Nmbjj_top += 1 # increment counter for number of top-quark candidates
return Nmbjj_top # return number of top-quark candidates
# function to calculate mass of the combination between any two jets with the smallest ∆R
def calc_MjjMindR(jet_pt,jet_eta,jet_phi,jet_E, # jet pt,eta,phi,energy
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
MindR = 99 # dummy value for smallest ∆R
for jet_i in good_jets_indices[:8]: # loop over good jets
for jet_j in good_jets_indices[jet_i+1:8]: # loop over remaining good jets
dR = calc_delta_R(jet_eta[jet_i],jet_phi[jet_i],jet_eta[jet_j],jet_eta[jet_j]) # calculate ∆R
if dR < MindR: # if this ∆R is smaller than the smallest ∆R found previously
MindR = dR # set this ∆R to be the smallest ∆R
i1r = jet_i # i1r is the index of the 1st jet to be considered later
i2r = jet_j # i2r is the index of the 2nd jet to be considered later
# use function calc_inv_mass_pair defined above to get the jet-jet invariant mass
MjjMindR = calc_inv_mass_pair(jet_pt[i1r],jet_eta[i1r],jet_phi[i1r],jet_E[i1r],
jet_pt[i2r],jet_eta[i2r],jet_phi[i2r],jet_E[i2r])
return MjjMindR # return mass of the combination between any two jets with the smallest ∆R
# function to calculate mass of the two jets with the highest b-tag weight
def calc_MbbPtOrd(jet_pt,jet_eta,jet_phi,jet_E,jet_MV2c10, # jet pt,eta,phi,energy,MV2c10
good_jets):
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
MV2c10_scores = {} # dictionary to hold indices and MV2c10 scores
for i in good_jets_indices[:8]: # loop over good jets
MV2c10_scores[i] = jet_MV2c10[i] # dictionary entry of index and MV2c10 score
max_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with maximum MV2c10 score
del MV2c10_scores[max_MV2c10_index] # delete jet with maximum MV2c10 score from dictionary
second_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with second MV2c10 score
# use function calc_inv_mass_pair defined above to get mass of the two jets with the highest b-tag weight
return calc_inv_mass_pair(jet_pt[max_MV2c10_index],jet_eta[max_MV2c10_index],jet_phi[max_MV2c10_index],
jet_E[max_MV2c10_index],jet_pt[second_MV2c10_index],jet_eta[second_MV2c10_index],
jet_phi[second_MV2c10_index],jet_E[second_MV2c10_index])
# function to calculate jet centrality (scalar sum of pT divided by sum of E for all jets)
def calc_CentrJet(jet_pt,jet_E, # jet pt and energy
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
sum_pt = 0 # start counter for pt of all jets
sum_E = 0 # start counter for energy of all jets
for i in good_jets_indices[:8]: # loop over good jets
sum_pt += jet_pt[i] # add single jet pt to sum of jet pt
sum_E += jet_E[i] # add single jet energy to sum of jet energy
return sum_pt/sum_E # ratio of total pt to total energy
# function to calculate average ∆R for all jet pairs
def calc_dRjjave_jet(jet_eta,jet_phi, # jet eta,phi
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
delta_R = [] # list to hold ∆R between all jet pairs
for i in good_jets_indices[:8]: # loop over good jets
for j in good_jets_indices[i+1:8]: # loop over remaining good jets
delta_R.append(calc_delta_R(jet_eta[i],jet_phi[i],jet_eta[j],jet_phi[j])) # append ∆R for this jet pair to list
return sum(delta_R)/len(delta_R) # average ∆R of all jet pairs
# function to calculate maximum mass between a lepton and the tagged jet with the smallest ∆R
def calc_MaxMMindRlepb(lep_pt,lep_eta,lep_phi,lep_E,jet_pt,jet_eta,jet_phi,jet_E,jet_MV2c10, # pt,eta,phi,E,MV2c10
good_jets):
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
# what if the b-jet isn't one of the first 8 good jets? Use the jet within the first 8 with highes MV2c10 score
if len(good_jets)>8 and jet_MV2c10[good_jets_indices[0]]<0.6459 and jet_MV2c10[good_jets_indices[1]]<0.6459 and jet_MV2c10[good_jets_indices[2]]<0.6459 and jet_MV2c10[good_jets_indices[3]]<0.6459 and jet_MV2c10[good_jets_indices[4]]<0.6459 and jet_MV2c10[good_jets_indices[5]]<0.6459 and jet_MV2c10[good_jets_indices[6]]<0.6459 and jet_MV2c10[good_jets_indices[7]]<0.6459:
MV2c10_scores = {} # dictionary to hold indices and btagging scores
for i in good_jets_indices[:8]: # loop over good jets
MV2c10_scores[i] = jet_MV2c10[i] # dictionary entry of index and btagging score
ijetr = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with maximum btagging score
MaxMMindRlepb = 0 # dummy value for maximum mass between a lepton and the tagged jet with the smallest ∆R
for lep_i in [0,1]: # loop over leptons
MindRlepb = 99 # dummy value for smallest ∆R
for jet_j in good_jets_indices[:8]: # loop over good jets
if jet_MV2c10[jet_j]>0.6459: # for 77% b-tag efficiency from https://cds.cern.ch/record/2160731/files/ATL-PHYS-PUB-2016-012.pdf Table 2
dRlepb = calc_delta_R(lep_eta[lep_i],lep_phi[lep_i],jet_eta[jet_j],jet_phi[jet_j]) # get ∆R
if dRlepb < MindRlepb: # if this ∆R is smaller than the smallest ∆R found previously for this lep
MindRlepb = dRlepb # set this ∆R to be the minimum ∆R for this lepton
ijetr = jet_j # ijetr is the index of the jet to be considered later
# use function calc_inv_mass_pair to get the lep-b pair invariant mass
MMindRlepb = calc_inv_mass_pair(lep_pt[lep_i],lep_eta[lep_i],lep_phi[lep_i],lep_E[lep_i],
jet_pt[ijetr],jet_eta[ijetr],jet_phi[ijetr],jet_E[ijetr])
if MMindRlepb > MaxMMindRlepb: # if this lep-b pair mass is bigger than the biggest previous
MaxMMindRlepb = MMindRlepb # set this lep-b pair mass to be the maximum mass
return MaxMMindRlepb # return maximum mass between a lepton and the tagged jet with the smallest ∆R
# function to calculate First Fox-Wolfram moment
def calc_H1(lep_pt,lep_eta,lep_phi,lep_E,jet_pt,jet_eta,jet_phi,jet_E, # lepton and jet pt,eta,phi,energy
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
E_viss = 0 # start counter for visible energy in the event
H1 = 0 # start counter for H1
for lep_i in range(2): # loop over leptons
E_viss += lep_E[lep_i] # add energy of leptons to visible energy counter
for jet_i in range(len(jet_pt)): # loop over all jets (not just good jets)
E_viss += jet_E[jet_i] # add energy of jets to visible energy counter
for lep_i in range(2): # loop over leptons
px_i = lep_pt[lep_i]*np.cos(lep_phi[lep_i]) # x-momentum of lepton i
py_i = lep_pt[lep_i]*np.sin(lep_phi[lep_i]) # y-momentum of lepton i
pz_i = lep_pt[lep_i]*np.sinh(lep_eta[lep_i]) # z-momentum of lepton i
for lep_j in range(lep_i,2): # loop over leptons
px_j = lep_pt[lep_j]*np.cos(lep_phi[lep_j]) # x-momentum of lepton j
py_j = lep_pt[lep_j]*np.sin(lep_phi[lep_j]) # y-momentum of lepton j
pz_j = lep_pt[lep_j]*np.sinh(lep_eta[lep_j]) # z-momentum of lepton j
numerator = px_i*px_j + py_i*py_j + pz_i*pz_j # dot product of lepton momenta
H1 += numerator/(E_viss**2) # H1 calculation for this lepton pairing
for jet_i in good_jets_indices[:8]: # loop over good jets
px_i = jet_pt[jet_i]*np.cos(jet_phi[jet_i]) # x-momentum of jet i
py_i = jet_pt[jet_i]*np.sin(jet_phi[jet_i]) # y-momentum of jet i
pz_i = jet_pt[jet_i]*np.sinh(jet_eta[jet_i]) # z-momentum of jet i
for jet_j in good_jets_indices[jet_i:]: # loop over good jets
px_j = jet_pt[jet_j]*np.cos(jet_phi[jet_j]) # x-momentum of jet j
py_j = jet_pt[jet_j]*np.sin(jet_phi[jet_j]) # y-momentum of jet j
pz_j = jet_pt[jet_j]*np.sinh(jet_eta[jet_j]) # z-momentum of jet j
numerator = px_i*px_j + py_i*py_j + pz_i*pz_j # dot product of jet-pair momenta
H1 += numerator/(E_viss**2) # H1 calculation for this jet pairing
for lep_i in range(2): # loop over leptons
px_i = lep_pt[lep_i]*np.cos(lep_phi[lep_i]) # x-momentum of lepton
py_i = lep_pt[lep_i]*np.sin(lep_phi[lep_i]) # y-momentum of lepton
pz_i = lep_pt[lep_i]*np.sinh(lep_eta[lep_i]) # z-momentum of lepton
for jet_j in good_jets_indices[:8]: # loop over jets
px_j = jet_pt[jet_j]*np.cos(jet_phi[jet_j]) # x-momentum of jet
py_j = jet_pt[jet_j]*np.sin(jet_phi[jet_j]) # y-momentum of jet
pz_j = jet_pt[jet_j]*np.sinh(jet_eta[jet_j]) # z-momentum of jet
numerator = px_i*px_j + py_i*py_j + pz_i*pz_j # dot product of lepton-jet pair momenta
H1 += numerator/(E_viss**2) # H1 calcuation for this lepton-jet pairing
return H1 # return H1
# function to calculate sum of jet pT , up to 6 jets
def calc_HT_jet6(jet_pt, # jet pt
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
sum_pt = 0 # start counter for total jet pt
for i in good_jets_indices[:6]: # loop over good jets, up to 6 jets
sum_pt += jet_pt[i] # add individual jet pt to total jet pt
return sum_pt/1000 # /1000 to go from MeV to GeV
# function to calculate η of dilepton system
def calc_eta_ll(lep_pt,lep_eta,lep_phi): # lepton pt,eta,phi
px_0 = lep_pt[0]*np.cos(lep_phi[0]) # x-momentum of lep_0
py_0 = lep_pt[0]*np.sin(lep_phi[0]) # y-momentum of lep_0
pz_0 = lep_pt[0]*np.sinh(lep_eta[0]) # z-momentum of lep_0
px_1 = lep_pt[1]*np.cos(lep_phi[1]) # x-momentum of lep_1
py_1 = lep_pt[1]*np.sin(lep_phi[1]) # y-momentum of lep_1
pz_1 = lep_pt[1]*np.sinh(lep_eta[1]) # z-momentum of lep_1
sumpx = px_0 + px_1 # x-momentum of dilepton system
sumpy = py_0 + py_1 # y-momentum of dilepton system
sumpz = pz_0 + pz_1 # z-momentum of dilepton system
sump = np.sqrt(sumpz**2 + sumpx**2 + sumpy**2) # total momentum of dilepton system
return np.arctanh(sumpz/sump) # arctan between z-momentum and total momentum
# function to calculate sum of the two closest 2 jet invariant masses from from jjj1 and jjj2, divided by 2
def calc_MWavg(jet_pt,jet_eta,jet_phi,jet_E,jet_MV2c10, # jet pt,eta,phi,energy,MV2c10
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
if len(good_jets_indices)<6: return 0 # don't calculate this for 5j event
b1_i = 0 # index of first jet
dR1 = 99 # dummy value for min ∆R between 1st b-jet and another jet
for i in good_jets_indices[:8]: # loop over good jets
if i==b1_i: continue # if this jet is the 1st b-jet, move onto the next jet
delta_R = calc_delta_R(jet_eta[b1_i],jet_phi[b1_i],jet_eta[i],jet_phi[i]) # ∆R between the two jets
if delta_R < dR1: # if this ∆R is smaller than the smallest ∆R found previously for the 1st b-jet
dR1 = delta_R # set this ∆R as the smallest ∆R for the 1st b-jet
jj1_1_i = i # assign index jj1_1_i to the jet closest to the 1st b-jet
dR2 = 99 # dummy value for second smallest ∆R between the 1st b-jet and another jet
for i in good_jets_indices[:8]: # loop over good jets
if i in [b1_i,jj1_1_i]: continue # if this jet is the 1st b-jet or the jet closest to the 1st b-jet, move on
delta_R = calc_delta_R(jet_eta[b1_i],jet_phi[b1_i],jet_eta[i],jet_phi[i]) # ∆R between the two jets
if delta_R < dR2: # if this ∆R is smaller than the second smallest ∆R found previously for the 1st b-jet
dR2 = delta_R # set this ∆R as the second smallest ∆R for the 1st b-jet
jj1_2_i = i # assign index jj1_2_i to the jet second closest to the 1st b-jet
for i in good_jets_indices[:8]: # looking for the second "b-jet"
if i in [b1_i,jj1_1_i,jj1_2_i]: continue # if this jet is in the 3-jet system of the 1st b-jet, move on
b2_i = i # assign index i to the next highest pt jet
break # finish this for loop
dR1 = 99 # dummy value for min ∆R between 2nd b-jet and another jet
for i in good_jets_indices[:8]: # loop over good jets
if i in [b1_i,jj1_1_i,jj1_2_i,b2_i]: continue # if i is part of 1st b-jet system or is the 2nd b-jet, move on
delta_R = calc_delta_R(jet_eta[b2_i],jet_phi[b2_i],jet_eta[i],jet_phi[i]) # ∆R between the two jets
if delta_R < dR1: # if this ∆R is smaller than the smallest ∆R found previously for the 2nd b-jet
dR1 = delta_R # set this ∆R as the smallest ∆R for the 2nd b-jet
jj2_1_i = i # assign index jj2_1_i to the jet closest to the 2nd b-jet (if not in 1st b-jet system)
dR2 = 99 # dummy value for second smallest ∆R between the 2nd b-jet and another jet
for i in good_jets_indices[:8]: # loop over good jets
if i in [b1_i,jj1_1_i,jj1_2_i,b2_i,jj2_1_i]: continue # if this jet has been used previously, move on
delta_R = calc_delta_R(jet_eta[b2_i],jet_phi[b2_i],jet_eta[i],jet_phi[i]) # ∆R between the two jets
if delta_R < dR2: # if this ∆R is smaller than the second smallest ∆R found previously for the 2nd b-jet
dR2 = delta_R # set this ∆R as the second smallest ∆R for the 2nd b-jet
jj2_2_i = i # assign index jj2_2_i to the jet second closest to the 2nd b-jet
M1 = [] # list to hold masses of W candidates from 1st b-jet
# inv mass between 1st b-jet and the jet closest to it
M1.append(calc_inv_mass_pair(jet_pt[b1_i],jet_eta[b1_i],jet_phi[b1_i],jet_E[b1_i],
jet_pt[jj1_1_i],jet_eta[jj1_1_i],jet_phi[jj1_1_i],jet_E[jj1_1_i]))
# inv mass between 1st b-jet and the jet second closest to it
M1.append(calc_inv_mass_pair(jet_pt[b1_i],jet_eta[b1_i],jet_phi[b1_i],jet_E[b1_i],
jet_pt[jj1_2_i],jet_eta[jj1_2_i],jet_phi[jj1_2_i],jet_E[jj1_2_i]))
# inv mass between the two jets closest to the 1st b-jet
M1.append(calc_inv_mass_pair(jet_pt[jj1_1_i],jet_eta[jj1_1_i],jet_phi[jj1_1_i],jet_E[jj1_1_i],
jet_pt[jj1_2_i],jet_eta[jj1_2_i],jet_phi[jj1_2_i],jet_E[jj1_2_i]))
M2 = [] # list to hold masses of W candidates from 2nd b-jet
# inv mass between 2nd b-jet and the jet closest to it
M2.append(calc_inv_mass_pair(jet_pt[b2_i],jet_eta[b2_i],jet_phi[b2_i],jet_E[b2_i],
jet_pt[jj2_1_i],jet_eta[jj2_1_i],jet_phi[jj2_1_i],jet_E[jj2_1_i]))
# inv mass between 2nd b-jet and the jet second closest to it
M2.append(calc_inv_mass_pair(jet_pt[b2_i],jet_eta[b2_i],jet_phi[b2_i],jet_E[b2_i],
jet_pt[jj2_2_i],jet_eta[jj2_2_i],jet_phi[jj2_2_i],jet_E[jj2_2_i]))
# inv mass between the two jets closest to the 2nd b-jet
M2.append(calc_inv_mass_pair(jet_pt[jj2_1_i],jet_eta[jj2_1_i],jet_phi[jj2_1_i],jet_E[jj2_1_i],
jet_pt[jj2_2_i],jet_eta[jj2_2_i],jet_phi[jj2_2_i],jet_E[jj2_2_i]))
difM = 1000000 # dummy value for the difference between the masses of the 1st and 2nd W candidates
for i in range(3): # loop over [0,1,2]
for j in range(3): # loop over [0,1,2]
difM12 = abs(M1[i]-M2[j]) # get absolute difference between W candidate 1 and W candidate 2
if difM12<difM: # if this difference is smaller than the smallest difference found previously
difM=difM12 # set this difference as the smallest difference between the masses of W candidates
MW=(M1[i]+M2[j])/2 # mass average of W candidate 1 and W candidate 2
return MW # return average W candidate mass. divide by 1000 to go from MeV to GeV
# function to calculate ∆R between the two jets with the highest b-tagging weight in the event
def calc_dRbb(jet_eta,jet_phi,jet_MV2c10, # jet eta,phi,MV2c10
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
MV2c10_scores = {} # dictionary to hold indices and MV2c10 scores
for i in good_jets_indices[:8]: # loop over good jets
MV2c10_scores[i] = jet_MV2c10[i] # dictionary entry of index and MV2c10 score
max_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with maximum MV2c10 score
del MV2c10_scores[max_MV2c10_index] # delete jet with maximum MV2c10 score from dictionary
second_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with second MV2c10 score
return calc_delta_R(jet_eta[max_MV2c10_index],jet_phi[max_MV2c10_index],jet_eta[second_MV2c10_index],
jet_phi[second_MV2c10_index]) # ∆R between the 2 b-jets
# function to calculate pT of the first b-jet (ordered according to the pT)
def calc_pT1b(jet_pt,jet_MV2c10, # jet pt,MV2c10
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
for i in good_jets_indices[:8]: # loop over good jets
if jet_MV2c10[i]>0.6459: #77% WP to decide if it's a b-jet
return jet_pt[i]/1000 # /1000 to go from MeV to GeV
# what if the b-jet isn't one of the first 8 good jets? Use the jet within the first 8 with highes MV2c10 score
if len(good_jets)>8 and jet_MV2c10[good_jets_indices[0]]<0.6459 and jet_MV2c10[good_jets_indices[1]]<0.6459 and jet_MV2c10[good_jets_indices[2]]<0.6459 and jet_MV2c10[good_jets_indices[3]]<0.6459 and jet_MV2c10[good_jets_indices[4]]<0.6459 and jet_MV2c10[good_jets_indices[5]]<0.6459 and jet_MV2c10[good_jets_indices[6]]<0.6459 and jet_MV2c10[good_jets_indices[7]]<0.6459:
MV2c10_scores = {} # dictionary to hold indices and btagging scores
for i in good_jets_indices[:8]: # loop over good jets
MV2c10_scores[i] = jet_MV2c10[i] # dictionary entry of index and btagging score
max_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with maximum btagging score
return jet_pt[max_MV2c10_index]/1000 # /1000 to go from MeV to GeV
# define function to calculate variables for BDT in 2b5j or 1b6j signal regions
# function to calculate mass of the two untagged jets with the highest pT
def calc_MuuPtOrd(jet_pt,jet_eta,jet_phi,jet_E,jet_MV2c10, # jet pt,eta,phi,energy,MV2c10
good_jets): # jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
MV2c10_scores = {} # dictionary to hold indices and MV2c10 scores
for i in good_jets_indices: # loop over good jets
MV2c10_scores[i] = jet_MV2c10[i] # dictionary entry of index and MV2c10 score
max_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with maximum MV2c10 score
del MV2c10_scores[max_MV2c10_index] # delete jet with maximum MV2c10 score from dictionary
second_MV2c10_index = max(MV2c10_scores, key=MV2c10_scores.get) # get index of jet with second MV2c10 score
for i in good_jets_indices: # loop over good jets
if i==max_MV2c10_index or i==second_MV2c10_index: continue # if this jet is a b-jet, move onto next jet
for j in good_jets_indices[i+1:]: # loop over remaining good jets
if j==max_MV2c10_index or j==second_MV2c10_index: continue # if j is a b-jet, move onto next jet
# use function calc_inv_mass_pair defined above to get mass of jet pair
return calc_inv_mass_pair(jet_pt[i],jet_eta[i],jet_phi[i],jet_E[i],
jet_pt[j],jet_eta[j],jet_phi[j],jet_E[j])
# define function to calculate number of heavy-flavour jets
def calc_HF_n(jet_trueflav,good_jets): # jet true flavours and jet indices
good_jets_indices = [i for i,b in enumerate(good_jets) if b=='1'] # get the good jets
HF_n = 0 # start counter for number of heavy-flavour jets
for jet_i in good_jets_indices: # loop over good jets
if jet_trueflav[jet_i]==4 or jet_trueflav[jet_i]==5: # c=4, b=5
HF_n += 1 # increment counter for number of heavy-flavour jets
return HF_n # return number of heavy-flavour jets
# ## Can we process the data yet?!
# define function to read files
def read_file(path,sample):
print("\tProcessing: "+sample) # print which sample is being processed
data_all = pd.DataFrame() # define empty pandas DataFrame to hold all data for this sample
tree = uproot.open(path)["mini"] # open the tree called mini
XSection = tree.array('XSection')[0] # get cross-section from tree
SumWeights = tree.array('SumWeights')[0] # get Sum of Weights from tree
numevents = uproot.numentries(path, "mini") # number of events
if 'data' in sample: fraction_MC=fraction # process fraction*1 of measured data
else: fraction_MC=fraction*MC_to_data_ratio # process fraction*MC_to_data_ratio of simulated data
entrystop=numevents*fraction_MC # stop after fraction of events we want to process
branches = ['lep_pt','lep_eta','lep_phi','lep_E','lep_charge','lep_type','lep_ptcone30',
'lep_etcone20','lep_tracksigd0pvunbiased','met_et',
'jet_pt','jet_eta','jet_phi','jet_E','jet_jvt','jet_MV2c10'
#,'lep_z0' # uncomment this for stricter lepton requirements
]
if 'data' not in sample:
xsec_weight = get_xsec_weight(sample) # get cross-section weight
branches.extend(['mcWeight','scaleFactor_PILEUP','scaleFactor_ELE',
'scaleFactor_MUON','scaleFactor_LepTRIGGER'
#,'scaleFactor_BTAG' # uncomment this for better Data vs MC match
])
if 'Zmumu' in sample or 'Zee' in sample:
branches.extend(['jet_trueflav'])
for data in tree.iterate(branches,
outputtype=pd.DataFrame, # choose output type as pandas DataFrame
entrystop=entrystop): # process up to numevents*fraction
nIn = len(data.index) # number of events in this batch
#print('\t\t initial \t\t\t\t',nIn)
# get index of 0th good jet
data['good_jets'] = np.vectorize(find_good_jet_indices)(data.jet_pt, data.jet_jvt
#,data.jet_eta # uncomment this for stricter requirements
)
data['goodjet_n'] = np.vectorize(calc_goodjet_n)(data.good_jets)
# select on number of good jets
fail = data[ np.vectorize(select_good_jet_n)(data.good_jets) ].index # get events that fail the selection
data.drop(fail,inplace=True) # drop the events that have fewer than 5 jets
if len(data.index)==0: continue # move onto next batch if no events left
#print('\t\t at least 6 jets \t\t\t',len(data.index))
# calculate number of b-jets at 77% Working Point using function calc_bjet_n defined above
data['bjet_n'] = np.vectorize(calc_bjet_n)(data.jet_MV2c10, data.good_jets)
# select on number of b-jets
fail = data[ np.vectorize(select_bjet_n)(data.bjet_n) ].index # get events that fail this selection
data.drop(fail,inplace=True) # drop the events with fewer than 1 b-jets
if len(data.index)==0: continue # move onto next batch if no events left
#print('\t\t at least 2 b-jets \t\t\t',len(data.index))
# calculation of 2-lepton invariant mass
data['mll'] = np.vectorize(calc_mll)(data.lep_pt, data.lep_eta, data.lep_phi, data.lep_E)
# select on whether leptons are good
fail = data[ np.vectorize(select_leptons)(data.mll, data.lep_ptcone30, data.lep_pt, data.lep_type,
data.lep_etcone20, data.lep_tracksigd0pvunbiased,
data.lep_eta, data.lep_charge
#,data.lep_z0 # uncomment this for stricter requirements
) ].index # get events that fail this selection
data.drop(fail, inplace=True) # drop events where leptons aren't good
if len(data.index)==0: continue # move onto next batch if no events left
#print('\t\t leptons are good \t\t\t',len(data.index))
data['channel'] = np.vectorize(calc_channel)(data.lep_type) # ee, mm or em
data['ptll'] = np.vectorize(calc_ptll)(data.lep_pt, data.lep_phi) # pt of dilepton system
data['pt4_jet'] = np.vectorize(calc_pti_jet)(data.jet_pt, data.good_jets, 3) # pt of 4th jet
data['pt6_jet'] = np.vectorize(calc_pti_jet)(data.jet_pt, data.good_jets, 5) # pt of 6th jet
data['dRll'] = np.vectorize(calc_dRll)(data.lep_eta, data.lep_phi) # ∆R between the two leptons
# get number of jet pairs with mass within a window of 30 GeV around 85 GeV (~Z/W mass)
data['NJetPairsZMass'] = np.vectorize(calc_NJetPairsZMass)(data.jet_pt, data.jet_eta, data.jet_phi,
data.jet_E, data.good_jets)
# get mass of the combination between any two jets with the smallest ∆R
data['MjjMindR'] = np.vectorize(calc_MjjMindR)(data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
data.good_jets)
# get number of top-quark candidates
data['Nmbjj_top'] = np.vectorize(calc_Nmbjj_top)(data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
data.jet_MV2c10, data.good_jets)
# get mass of the two jets with the highest b-tag weight
data['MbbPtOrd'] = np.vectorize(calc_MbbPtOrd)(data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
data.jet_MV2c10, data.good_jets)
# get jet centrality (scalar sum of pT divided by sum of E for all jets)
data['CentrJet'] = np.vectorize(calc_CentrJet)(data.jet_pt, data.jet_E, data.good_jets)
# get average ∆R for all jet pairs
data['dRjjave_jet'] = np.vectorize(calc_dRjjave_jet)(data.jet_eta, data.jet_phi, data.good_jets)
# get maximum mass between a lepton and the tagged jet with the smallest ∆R
data['MaxMMindRlepb'] = np.vectorize(calc_MaxMMindRlepb)(data.lep_pt, data.lep_eta, data.lep_phi,
data.lep_E, data.jet_pt, data.jet_eta,
data.jet_phi, data.jet_E, data.jet_MV2c10,
data.good_jets)
# get sum of jet pT , up to 6 jets
data['HT_jet6'] = np.vectorize(calc_HT_jet6)(data.jet_pt, data.good_jets)
data['eta_ll'] = np.vectorize(calc_eta_ll)(data.lep_pt, data.lep_eta, data.lep_phi) # η of dilepton system
# get sum of the two closest 2 jet invariant masses from from jjj1 and jjj2, divided by 2
data['MWavg'] = np.vectorize(calc_MWavg)(data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
data.jet_MV2c10, data.good_jets)
# get ∆R between two jets with the highest b-tagging weight in the event
data['dRbb'] = np.vectorize(calc_dRbb)(data.jet_eta, data.jet_phi, data.jet_MV2c10, data.good_jets)
# get pT of the first b-jet (ordered according to the pT)
data['pT1b'] = np.vectorize(calc_pT1b)(data.jet_pt, data.jet_MV2c10, data.good_jets)
# get H1 (First Fox-Wolfram moment)
#data['H1'] = np.vectorize(calc_H1)(data.lep_pt, data.lep_eta, data.lep_phi, data.lep_E,
# data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
# data.good_jets)
#data['pt5_jet'] = np.vectorize(calc_pti_jet)(data.jet_pt, data.good_jets, 4) # pt of 5th jet
# get mass of the two untagged jets with the highest pT
#data['MuuPtOrd'] = np.vectorize(calc_MuuPtOrd)(data.jet_pt, data.jet_eta, data.jet_phi, data.jet_E,
# data.jet_MV2c10, data.good_jets)
if 'data' not in sample: # if MC
# calculate total weight of event from all individual weights
data['totalWeight'] = np.vectorize(calc_weight)(xsec_weight,data.mcWeight,data.scaleFactor_PILEUP,
data.scaleFactor_ELE,data.scaleFactor_MUON,
data.scaleFactor_LepTRIGGER
#,data.scaleFactor_BTAG
)
data.drop(['mcWeight','scaleFactor_PILEUP','scaleFactor_ELE',
'scaleFactor_MUON','scaleFactor_LepTRIGGER'
#,'scaleFactor_BTAG' # uncomment this for better Data vs MC match
], axis=1, inplace=True)
if 'Zmumu' in sample or 'Zee' in sample:
# calculate number of heavy-flavour jets at truth level
data['HF_n'] = np.vectorize(calc_HF_n)(data.jet_trueflav,data.good_jets)
data.drop(['jet_trueflav'], axis=1, inplace=True)
# drop variables that don't need to be saved to csv
data.drop(['good_jets','lep_pt','lep_eta','lep_phi','lep_E','lep_charge','lep_type','lep_ptcone30',
'lep_etcone20','lep_tracksigd0pvunbiased','met_et','jet_pt','jet_eta','jet_phi',
'jet_E','jet_jvt','jet_MV2c10'
#,'lep_z0'
],
axis=1, inplace=True)
nOut = len(data.index) # number of events passing selections in this batch
data_all = data_all.append(data) # append dataframe from this batch to the dataframe for the whole sample
return data_all # return dataframe containing events passing all selections
# This is where the processing happens
data_dict = get_data_from_files(samples) # process all files
# ## Samples to plot
#
# Define dictionary of samples and colours to use for plotting