From 1ebbc9ad53414bceb4a00ad25c22556dc1fe72e2 Mon Sep 17 00:00:00 2001
From: Lanqing Yuan <yuanlq@uchicago.edu>
Date: Wed, 27 Mar 2024 08:03:30 -0500
Subject: [PATCH] Rewritten matching events to enable event building efficiency
 study (#84)

* move to development

* put fv cut as an option now in load_peaks

* removed useless plugins

* deprecated old event matching

* annotation

* redefined matching

* fixed typo

* inner _touching_windows bug

* deprecated load_events

* draft for load events refurbished
---
 jobs/config.ini       |   2 +-
 saltax/match/match.py | 198 +++++++++++++++++++++++++++++-------------
 saltax/match/utils.py |  87 +++++++++++++++++--
 3 files changed, 221 insertions(+), 66 deletions(-)

diff --git a/jobs/config.ini b/jobs/config.ini
index 126e8cc..e417d03 100644
--- a/jobs/config.ini
+++ b/jobs/config.ini
@@ -3,7 +3,7 @@ max_num_submit = 200
 t_sleep = 1
 
 [job]
-container = xenonnt-2024.02.1.simg
+container = xenonnt-development.simg
 # SE bootstrapped
 #runids = 47876,51907,47860,51905,51904,51906,51909,51911,51913,51939,47865,51910,49079,49077,53139,49432,52997,48693,48699,48692,49080,48698,49433,49028,53153,53167
 # AmBe 
diff --git a/saltax/match/match.py b/saltax/match/match.py
index 395494f..a2a777e 100644
--- a/saltax/match/match.py
+++ b/saltax/match/match.py
@@ -3,7 +3,7 @@
 import strax
 
 
-def filter_out_missing_s1_s2(truth, match):
+def _filter_out_missing_s1_s2(truth, match):
     """
     Filter out simulated events that have no S1 or S2 or both
     :param truth: truth from wfsim
@@ -34,7 +34,7 @@ def filter_out_missing_s1_s2(truth, match):
     return truth[~bad_mask], match[~bad_mask]
 
 
-def filter_out_multiple_s1_s2(truth, match):
+def _filter_out_multiple_s1_s2(truth, match):
     """
     Filter out simulated events that have multiple S1 or S2 reconstructed from wfsim.
     :param truth: truth from wfsim
@@ -58,7 +58,7 @@ def filter_out_multiple_s1_s2(truth, match):
     return truth[~bad_mask], match[~bad_mask]
 
 
-def filter_out_not_found(truth, match, s1s2=1):
+def _filter_out_not_found(truth, match, s1s2=1):
     """
     Filter out simulated events whose S1 are not found by pema.
     :param truth: truth from wfsim
@@ -88,7 +88,7 @@ def filter_out_not_found(truth, match, s1s2=1):
     return truth[~bad_mask], match[~bad_mask]
 
 
-def pair_events_to_filtered_truth(truth, events):
+def _pair_events_to_filtered_truth(truth, events):
     """
     Pair events to filtered truth.
     :param truth: filtered truth
@@ -130,7 +130,7 @@ def pair_events_to_filtered_truth(truth, events):
     return matched_to
 
 
-def pair_events_to_matched_simu(matched_simu, events):
+def _pair_events_to_matched_simu(matched_simu, events):
     """
     Pair salted events to matched simulation events.
     :param matched_simu: simulation events already matched to truth
@@ -160,31 +160,7 @@ def pair_events_to_matched_simu(matched_simu, events):
     return matched_to
 
 
-def pair_peaks_to_matched_simu(matched_simu, peaks, safeguard=0):
-    """
-    Pair salted peaks to simulation peaks who have been matched to truth.
-    :param matched_simu: simulation peaks already matched to truth
-    :param peaks: peaks from saltax after reconstruction
-    :param safeguard: extension of time range to consider as matched, as a workaround for timing problem in wfsim s2
-    :return: ind_simu_matched_to_salt: the index of matched simulation peaks for each salted peak
-    :return: ind_salt_matched_to_simu: the index of matched salted peaks for each simulation peak
-    """
-    windows = strax.touching_windows(peaks, 
-                                     matched_simu, 
-                                     window=safeguard)
-    windows_length = windows[:,1] - windows[:,0]
-    simu_matched_to_salt_mask = windows_length == 1
-    print("Filter out %s percent of simulation due to multiple or no matched sprinkled \
-           peaks"%(np.sum(~simu_matched_to_salt_mask)/len(matched_simu)*100))
-
-    matched_simu = matched_simu[simu_matched_to_salt_mask]
-    ind_simu_matched_to_salt = np.where(simu_matched_to_salt_mask)[0]
-    ind_salt_matched_to_simu = windows[simu_matched_to_salt_mask][:,0]
-
-    return ind_simu_matched_to_salt, ind_salt_matched_to_simu
-
-
-def pair_salt_to_simu_events(truth, match, events_simu, events_salt):
+def _pair_salt_to_simu_events(truth, match, events_simu, events_salt):
     """
     Filter out bad simulation truth and then pair salted events to matched simulation events.
     :param truth: filtered truth
@@ -193,42 +169,23 @@ def pair_salt_to_simu_events(truth, match, events_simu, events_salt):
     :param events_salt: events from saltax after reconstruction
     :return: ind_salt_matched_to_simu, ind_simu_matched_to_truth, truth_filtered, match_filtered
     """
-    truth_filtered, match_filtered = filter_out_missing_s1_s2(truth, match)
-    truth_filtered, match_filtered = filter_out_multiple_s1_s2(truth_filtered, match_filtered)
-    truth_filtered, match_filtered = filter_out_not_found(truth_filtered, match_filtered) 
+    truth_filtered, match_filtered = _filter_out_missing_s1_s2(truth, match)
+    truth_filtered, match_filtered = _filter_out_multiple_s1_s2(truth_filtered, match_filtered)
+    truth_filtered, match_filtered = _filter_out_not_found(truth_filtered, match_filtered) 
 
-    ind_simu_matched_to_truth = pair_events_to_filtered_truth(truth_filtered, events_simu)
+    ind_simu_matched_to_truth = _pair_events_to_filtered_truth(truth_filtered, events_simu)
     events_simu_matched_to_truth = events_simu[ind_simu_matched_to_truth[ind_simu_matched_to_truth>=0]]
 
-    ind_salt_matched_to_simu = pair_events_to_matched_simu(events_simu_matched_to_truth, 
+    ind_salt_matched_to_simu = _pair_events_to_matched_simu(events_simu_matched_to_truth, 
                                                            events_salt)
     
     return ind_salt_matched_to_simu, ind_simu_matched_to_truth, truth_filtered, match_filtered
 
 
-def pair_salt_to_simu_peaks(truth, match, peaks_simu, peaks_salt):
-    """
-    Filter out bad simulation truth and then pair salted events to matched simulation events.
-    :param truth: filtered truth
-    :param match: match_acceptance_extended from pema
-    :param peaks_simu: peaks from wfsim
-    :param peaks_salt: peaks from saltax after reconstruction
-    :return: ind_salt_matched_to_simu, ind_simu_matched_to_truth
-    """
-    ind_simu_matched_to_truth = match['matched_to']
-    peaks_simu_matched_to_truth = peaks_simu[ind_simu_matched_to_truth]
-
-    (_ind_simu_matched_to_truth, 
-     ind_salt_matched_to_simu) = pair_peaks_to_matched_simu(peaks_simu_matched_to_truth, 
-                                                            peaks_salt)
-    ind_simu_matched_to_truth = ind_simu_matched_to_truth[_ind_simu_matched_to_truth]
-
-    return ind_salt_matched_to_simu, ind_simu_matched_to_truth
-
-
-def match_events(truth, match, events_simu, events_salt):
+def match_events_deprecated(truth, match, events_simu, events_salt):
     """
-    Match salted events to simulation events.
+    Match salted events to simulation events. This function has been deprecated because it cannot track
+    event building efficiency in a proper way.
     :param truth: truth from wfsim
     :param match: match_acceptance_extended from pema
     :param events_simu: event_info from wfsim
@@ -237,7 +194,7 @@ def match_events(truth, match, events_simu, events_salt):
     """
     ind_salt_matched_to_simu, \
         ind_simu_matched_to_truth, \
-            _, _ = pair_salt_to_simu_events(truth, match, events_simu, events_salt)
+            _, _ = _pair_salt_to_simu_events(truth, match, events_simu, events_salt)
 
     events_simu_matched_to_truth = events_simu[ind_simu_matched_to_truth[ind_simu_matched_to_truth>=0]]
     events_salt_matched_to_simu = events_salt[ind_salt_matched_to_simu[ind_salt_matched_to_simu>=0]]
@@ -245,6 +202,87 @@ def match_events(truth, match, events_simu, events_salt):
 
     return events_salt_matched_to_simu, events_simu_matched_to_salt
 
+
+def match_events(events_simu, events_salt):
+    """
+    Match salted events to simulation events based on event or S1-S2 timing, without checking 
+    simulation truth. 
+    The procedures are:
+        1. Filter out events_simu which is missing S1. Then the rest events_simu is like 'truth'
+           without ambience interference. Those events_simu are called events_simu_filtered.
+        2. Find indices of events_salt whose time range overlaps with events_simu_filtered: 
+           ind_salt_event_found. Those events_salt[ind_salt_event_found] are called 
+           events_salt_event_found.
+        3. Find indcies of events_simu_filtered whose time range overlaps with events_salt_event_found: 
+           ind_simu_event_found. Those events_simu_filtered[ind_simu_event_found] are called 
+           events_simu_event_found. 
+        4. Find indcies of events_salt whose S1 time range overlaps with events_simu_filtered's S1 time 
+           range: ind_salt_s1_found. Those events_salt[ind_salt_s1_found] are called events_salt_s1_found.
+        5. Find indcies of events_simu_filtered whose S1 time range overlaps with events_salt_s1_found:
+           ind_simu_s1_found. Those events_simu_filtered[ind_simu_s1_found] are called events_simu_s1_found.
+        6. Find indcies of events_salt whose S2 time range overlaps with events_simu_filtered's S2 time 
+           range: ind_salt_s2_found. Those events_salt[ind_salt_s2_found] are called events_salt_s2_found.
+        7. Find indcies of events_simu_filtered whose S2 time range overlaps with events_salt_s2_found:
+           ind_simu_s2_found. Those events_simu_filtered[ind_simu_s2_found] are called events_simu_s2_found.
+    :param events_simu: event_info from wfsim
+    :param events_salt: event_info from saltax
+    :return: events_simu_filtered, 
+             ind_salt_event_found, ind_simu_event_found, ind_simu_event_lost, ind_simu_event_split,
+             ind_salt_s1_found, ind_simu_s1_found, 
+             ind_salt_s2_found, ind_simu_s2_found
+    """
+    # Step 1.
+    # Filter out events_simu which is missing S1
+    events_simu_filtered = events_simu[events_simu['s1_time']>=0]
+
+    # Step 2 & 3.
+    # Find indices of events_salt whose time range overlaps with events_simu_filtered
+    event_touching_windows = strax.touching_windows(events_salt, events_simu_filtered)
+    event_windows_length = event_touching_windows[:,1] - event_touching_windows[:,0]
+    # If windows_length is not 1, the sprinkled event is either split or not found
+    mask_simu_event_found = event_windows_length == 1
+    mask_simu_event_lost  = event_windows_length == 0
+    mask_simu_event_split = event_windows_length >= 2
+    ind_simu_event_found = np.where(mask_simu_event_found)[0]
+    ind_simu_event_lost  = np.where(mask_simu_event_lost)[0]
+    ind_simu_event_split = np.where(mask_simu_event_split)[0]
+    # Find indcies of events_salt whose event time range overlaps with events_simu_filtered
+    ind_salt_event_found = event_touching_windows[mask_simu_event_found][:,0]
+
+    # Step 4 & 5.
+    # Assumed s1 times has been sorted! It should be a safe assumption because their events are sorted.
+    s1_touching_windows = strax.processing.general._touching_windows(
+        events_salt["s1_time"],
+        events_salt["s1_endtime"],
+        events_simu_filtered["s1_time"],
+        events_simu_filtered["s1_endtime"],
+    )
+    s1_windows_length = s1_touching_windows[:,1] - s1_touching_windows[:,0]
+    mask_simu_s1_found = s1_windows_length == 1
+    ind_simu_s1_found = np.where(mask_simu_s1_found)[0]
+    ind_salt_s1_found = s1_touching_windows[mask_simu_s1_found][:,0]
+
+    # Step 6 & 7.
+    # Assumed s2 times has been sorted! It should be a safe assumption because their events are sorted.
+    s2_touching_windows = strax.processing.general._touching_windows(
+        events_salt["s2_time"],
+        events_salt["s2_endtime"],
+        events_simu_filtered["s2_time"],
+        events_simu_filtered["s2_endtime"],
+    )
+    s2_windows_length = s2_touching_windows[:,1] - s2_touching_windows[:,0]
+    mask_simu_s2_found = s2_windows_length == 1
+    ind_simu_s2_found = np.where(mask_simu_s2_found)[0]
+    ind_salt_s2_found = s2_touching_windows[mask_simu_s2_found][:,0]
+
+    return (
+        events_simu_filtered,
+        ind_salt_event_found, ind_simu_event_found, ind_simu_event_lost, ind_simu_event_split,
+        ind_salt_s1_found, ind_simu_s1_found,
+        ind_salt_s2_found, ind_simu_s2_found
+    )
+
+
 def match_peaks(truth, match, peaks_simu, peaks_salt):
     """
     Match salted peaks to simulation peaks.
@@ -267,3 +305,47 @@ def match_peaks(truth, match, peaks_simu, peaks_salt):
     peaks_simu_matched_to_salt = peaks_simu_matched_to_truth[ind_salt_matched_to_simu>=0]
 
     return peaks_salt_matched_to_simu, peaks_simu_matched_to_salt
+
+
+def pair_salt_to_simu_peaks(truth, match, peaks_simu, peaks_salt):
+    """
+    Filter out bad simulation truth and then pair salted events to matched simulation events.
+    :param truth: filtered truth
+    :param match: match_acceptance_extended from pema
+    :param peaks_simu: peaks from wfsim
+    :param peaks_salt: peaks from saltax after reconstruction
+    :return: ind_salt_matched_to_simu, ind_simu_matched_to_truth
+    """
+    ind_simu_matched_to_truth = match['matched_to']
+    peaks_simu_matched_to_truth = peaks_simu[ind_simu_matched_to_truth]
+
+    (_ind_simu_matched_to_truth, 
+     ind_salt_matched_to_simu) = pair_peaks_to_matched_simu(peaks_simu_matched_to_truth, 
+                                                            peaks_salt)
+    ind_simu_matched_to_truth = ind_simu_matched_to_truth[_ind_simu_matched_to_truth]
+
+    return ind_salt_matched_to_simu, ind_simu_matched_to_truth
+
+
+def pair_peaks_to_matched_simu(matched_simu, peaks, safeguard=0):
+    """
+    Pair salted peaks to simulation peaks who have been matched to truth.
+    :param matched_simu: simulation peaks already matched to truth
+    :param peaks: peaks from saltax after reconstruction
+    :param safeguard: extension of time range to consider as matched, as a workaround for timing problem in wfsim s2
+    :return: ind_simu_matched_to_salt: the index of matched simulation peaks for each salted peak
+    :return: ind_salt_matched_to_simu: the index of matched salted peaks for each simulation peak
+    """
+    windows = strax.touching_windows(peaks, 
+                                     matched_simu, 
+                                     window=safeguard)
+    windows_length = windows[:,1] - windows[:,0]
+    simu_matched_to_salt_mask = windows_length == 1
+    print("Filter out %s percent of simulation due to multiple or no matched sprinkled \
+           peaks"%(np.sum(~simu_matched_to_salt_mask)/len(matched_simu)*100))
+
+    matched_simu = matched_simu[simu_matched_to_salt_mask]
+    ind_simu_matched_to_salt = np.where(simu_matched_to_salt_mask)[0]
+    ind_salt_matched_to_simu = windows[simu_matched_to_salt_mask][:,0]
+
+    return ind_simu_matched_to_salt, ind_salt_matched_to_simu
diff --git a/saltax/match/utils.py b/saltax/match/utils.py
index 1901119..6c39c5e 100644
--- a/saltax/match/utils.py
+++ b/saltax/match/utils.py
@@ -60,14 +60,16 @@
             'cut_cs2_area_fraction_top',])
 
 
-def load_peaks(runs, st_salt, st_simu, plugins=('peak_basics', 'peak_positions', 
-                                                'peak_shadow', 'peak_ambience',
-                                                'cut_se_peaks')):
+def load_peaks(runs, st_salt, st_simu, 
+               plugins=('peak_basics', 'peak_positions_mlp'), 
+               truth_fv_cut=True):
     """
     Load peaks from the runs and do basic filtering suggeted by saltax.match_peaks
     :param runs: list of runs.
     :param st_salt: saltax context for salt mode
     :param st_simu: saltax context for simu mode
+    :param plugins: plugins to be loaded, default to ('peak_basics', 'peak_positions_mlp')
+    :param truth_fv_cut: whether to apply truth FV cut, default to True
     :return: peaks_simu: peaks related plugins from simulated dataset
     :return: peaks_salt:  peaks related plugins from sprinkled dataset
     :return: truth: truth information in simulation
@@ -86,10 +88,14 @@ def load_peaks(runs, st_salt, st_simu, plugins=('peak_basics', 'peak_positions',
         match_i = st_simu.get_array(run, 'match_acceptance_extended', progress_bar=False)
 
         # TODO: Ugly hardcoding for FV cut, need to be fixed
-        peaks_salt_matched_to_simu_i, \
-            peaks_simu_matched_to_salt_i = saltax.match_peaks(truth_i[(truth_i['z']<-13)&(truth_i['z']>-145)&(truth_i['x']**2+truth_i['y']**2<64**2)], 
-                                                              match_i[(truth_i['z']<-13)&(truth_i['z']>-145)&(truth_i['x']**2+truth_i['y']**2<64**2)],
-                                                              peaks_simu_i, peaks_salt_i)    
+        if truth_fv_cut:
+            peaks_salt_matched_to_simu_i, \
+                peaks_simu_matched_to_salt_i = saltax.match_peaks(truth_i[(truth_i['z']<-13)&(truth_i['z']>-145)&(truth_i['x']**2+truth_i['y']**2<64**2)], 
+                                                                match_i[(truth_i['z']<-13)&(truth_i['z']>-145)&(truth_i['x']**2+truth_i['y']**2<64**2)],
+                                                                peaks_simu_i, peaks_salt_i)    
+        else:
+            peaks_salt_matched_to_simu_i, \
+                peaks_simu_matched_to_salt_i = saltax.match_peaks(truth_i, match_i, peaks_simu_i, peaks_salt_i)
 
         if i==0:
             peaks_simu = peaks_simu_i
@@ -112,6 +118,73 @@ def load_peaks(runs, st_salt, st_simu, plugins=('peak_basics', 'peak_positions',
 
 
 def load_events(runs, st_salt, st_simu, plugins=('event_info', 'cuts_basic')):
+    """
+    Load events from the runs and do basic filtering suggeted by saltax.match_events
+    :param runs: list of runs.
+    :param st_salt: saltax context for salt mode
+    :param st_simu: saltax context for simu mode
+    :return: events_simu: events from simulated dataset, filtered out those who miss S1
+    :return: events_salt: events from sprinkled dataset
+    :return inds_dict: dictionary of indices of events from sprinkled or filtered simulated dataset, 
+                       regarding matching events or s1 or s2
+    """
+    # Initialize the dictionary to store the indices
+    inds_dict = {
+        "ind_salt_event_found": np.array([], dtype=np.int32),
+        "ind_salt_s1_found": np.array([], dtype=np.int32),
+        "ind_salt_s2_found": np.array([], dtype=np.int32),
+        "ind_simu_event_found": np.array([], dtype=np.int32),
+        "ind_simu_s1_found": np.array([], dtype=np.int32),
+        "ind_simu_s2_found": np.array([], dtype=np.int32)
+    }
+
+    for i, run in enumerate(runs):
+        print("Loading run %s"%(run))
+        
+        # Load plugins for both salt and simu
+        events_simu_i = st_simu.get_array(run, plugins, progress_bar=False)
+        events_salt_i = st_salt.get_array(run, plugins, progress_bar=False)
+
+        # Get matching result
+        (
+            events_simu_filtered_i,
+            ind_salt_event_found_i, ind_simu_event_found_i, ind_simu_event_lost_i, ind_simu_event_split_i,
+            ind_salt_s1_found_i, ind_simu_s1_found_i,
+            ind_salt_s2_found_i, ind_simu_s2_found_i
+        ) = saltax.match_events(events_simu_i, events_salt_i)
+
+        # Load the indices into the dictionary
+        inds_dict["ind_salt_event_found"] = np.concatenate(
+            (inds_dict["ind_salt_event_found"], ind_salt_event_found_i)
+        )
+        inds_dict["ind_salt_s1_found"] = np.concatenate(
+            (inds_dict["ind_salt_s1_found"], ind_salt_s1_found_i)
+        )
+        inds_dict["ind_salt_s2_found"] = np.concatenate(
+            (inds_dict["ind_salt_s2_found"], ind_salt_s2_found_i)
+        )
+        inds_dict["ind_simu_event_found"] = np.concatenate(
+            (inds_dict["ind_simu_event_found"], ind_simu_event_found_i)
+        )
+        inds_dict["ind_simu_s1_found"] = np.concatenate(
+            (inds_dict["ind_simu_s1_found"], ind_simu_s1_found_i)
+        )
+        inds_dict["ind_simu_s2_found"] = np.concatenate(
+            (inds_dict["ind_simu_s2_found"], ind_simu_s2_found_i)
+        )
+
+        # Concatenate the events
+        if i==0:
+            events_simu = events_simu_filtered_i
+            events_salt = events_salt_i
+        else:
+            events_simu = np.concatenate((events_simu, events_simu_filtered_i))
+            events_salt = np.concatenate((events_salt, events_salt_i))
+    
+    return events_simu, events_salt, inds_dict
+
+
+def load_events_deprecated(runs, st_salt, st_simu, plugins=('event_info', 'cuts_basic')):
     """
     Load events from the runs and do basic filtering suggeted by saltax.match_events
     :param runs: list of runs.