# preprocessing.py
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
import gc
# Data setting
train_batch_id_first = 100
train_batch_id_last = 101
train_batch_ids = range(train_batch_id_first, train_batch_id_last + 1)
# Feature Settings
max_pulse_count = 96
n_features = 7 # time, charge, aux, x, y, z, rank
# Directories
home_dir = "/kaggle/input/icecube-neutrinos-in-deep-ice/"
train_format = home_dir + 'train/batch_{batch_id:d}.parquet'
point_picker_format = 'pp_mpc96_n7_batch_{batch_id:d}.npz'
# Sensor Geometry Data
sensor_geometry_df = pd.read_csv(home_dir + "sensor_geometry.csv")
# X, Y, Z coordinates
sensor_x = sensor_geometry_df.x
sensor_y = sensor_geometry_df.y
sensor_z = sensor_geometry_df.z
# Detector constants
c_const = 0.299792458 # speed of light [m/ns]
# Min / Max information
x_min = sensor_x.min()
x_max = sensor_x.max()
y_min = sensor_y.min()
y_max = sensor_y.max()
z_min = sensor_z.min()
z_max = sensor_z.max()
# Detector Valid Length
detector_length = np.sqrt((x_max - x_min)**2 + (y_max - y_min)**2 + (z_max - z_min)**2)
t_valid_length = detector_length / c_const
print(f"time valid length: {t_valid_length} ns")
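# Rough sanity check: the sensor array spans roughly a kilometre per axis, so
# the printed t_valid_length should be on the order of a few thousand ns.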
"""
## Single event reader function
- Pick-up important data points first
- Rank 3 (First)
- not aux, in valid time window
- Rank 2
- not aux, out of valid time window
- Rank 1
- aux, in valid time window
- Rank 0 (Last)
- aux, out of valid time window
- In each ranks, take pulses from highest charge
"""
# read a single event from batch_meta_df
def read_event(event_idx, batch_meta_df, max_pulse_count, batch_df, train=True):
    # read metadata
    batch_id, first_pulse_index, last_pulse_index = batch_meta_df.iloc[event_idx][
        ["batch_id", "first_pulse_index", "last_pulse_index"]].astype("int")
    # read the event's pulses
    event_feature = batch_df[first_pulse_index:last_pulse_index + 1]
    sensor_id = event_feature.sensor_id
    # merge features into a single structured array
    dtype = [("time", "float16"),
             ("charge", "float16"),
             ("auxiliary", "float16"),
             ("x", "float16"),
             ("y", "float16"),
             ("z", "float16"),
             ("rank", "short")]
    event_x = np.zeros(last_pulse_index - first_pulse_index + 1, dtype)
    event_x["time"] = event_feature.time.values - event_feature.time.min()
    event_x["charge"] = event_feature.charge.values
    event_x["auxiliary"] = event_feature.auxiliary.values
    event_x["x"] = sensor_geometry_df.x[sensor_id].values
    event_x["y"] = sensor_geometry_df.y[sensor_id].values
    event_x["z"] = sensor_geometry_df.z[sensor_id].values
    # for long events, pick up the most important pulses only
    if len(event_x) > max_pulse_count:
        # find the valid time window around the highest-charge pulse
        t_peak = event_x["time"][event_x["charge"].argmax()]
        t_valid_min = t_peak - t_valid_length
        t_valid_max = t_peak + t_valid_length
        t_valid = (event_x["time"] > t_valid_min) & (event_x["time"] < t_valid_max)
        # rank as described in the docstring above
        event_x["rank"] = 2 * (1 - event_x["auxiliary"]) + t_valid
        # sort by rank and charge (most important pulses go to the back)
        event_x = np.sort(event_x, order=["rank", "charge"])
        # pick up from the back
        event_x = event_x[-max_pulse_count:]
    # restore chronological order
    event_x = np.sort(event_x, order="time")
    # for train data, also return the target angles
    if train:
        azimuth, zenith = batch_meta_df.iloc[event_idx][["azimuth", "zenith"]].astype("float16")
        event_y = np.array([azimuth, zenith], dtype="float16")
    else:
        event_y = None
    return event_idx, len(event_x), event_x, event_y
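# Usage sketch (hypothetical, hence commented out): once one batch's
# batch_meta_df and batch_df are loaded as in the loop below, a single
# event can be inspected with
#   idx, pulse_count, event_x, event_y = read_event(
#       0, batch_meta_df, max_pulse_count, batch_df)
#   print(pulse_count, event_y)  # kept pulses and (azimuth, zenith) target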
# Read train metadata and build per-batch row ranges
train_meta_df = pd.read_parquet(home_dir + 'train_meta.parquet')
batch_counts = train_meta_df.batch_id.value_counts().sort_index()
batch_max_index = batch_counts.cumsum()
batch_max_index[train_meta_df.batch_id.min() - 1] = 0  # sentinel so the first batch has a left edge
batch_max_index = batch_max_index.sort_index()
def train_meta_df_splitter(batch_id):
    return train_meta_df.loc[batch_max_index[batch_id - 1]:batch_max_index[batch_id] - 1]
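# Example of the index bookkeeping (hypothetical counts): if batch 100 held
# 200000 events and batch 101 held 150000, batch_max_index would read
#   99 -> 0 (sentinel), 100 -> 200000, 101 -> 350000,
# and train_meta_df_splitter(101) would select rows 200000..349999, since
# .loc slicing is inclusive at both ends.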
for batch_id in train_batch_ids:
    print("Reading batch", batch_id)
    # get batch metadata and pulse data
    batch_meta_df = train_meta_df_splitter(batch_id)
    batch_df = pd.read_parquet(train_format.format(batch_id=batch_id))
    # allocate output arrays; auxiliary = -1 marks padding (no pulse)
    batch_x = np.zeros((len(batch_meta_df), max_pulse_count, n_features), dtype="float16")
    batch_y = np.zeros((len(batch_meta_df), 2), dtype="float16")
    batch_x[:, :, 2] = -1
    def read_event_local(event_idx):
        return read_event(event_idx, batch_meta_df, max_pulse_count, batch_df, train=True)
    # process events in parallel (imap keeps results ordered; tqdm shows progress)
    iterator = range(len(batch_meta_df))
    with multiprocessing.Pool() as pool:
        for event_idx, pulse_count, event_x, event_y in tqdm(
                pool.imap(read_event_local, iterator), total=len(iterator)):
            batch_x[event_idx, :pulse_count, 0] = event_x["time"]
            batch_x[event_idx, :pulse_count, 1] = event_x["charge"]
            batch_x[event_idx, :pulse_count, 2] = event_x["auxiliary"]
            batch_x[event_idx, :pulse_count, 3] = event_x["x"]
            batch_x[event_idx, :pulse_count, 4] = event_x["y"]
            batch_x[event_idx, :pulse_count, 5] = event_x["z"]
            batch_x[event_idx, :pulse_count, 6] = event_x["rank"]
            batch_y[event_idx] = event_y
    del batch_meta_df, batch_df
    gc.collect()
    # Save
    print("DONE! Saving...")
    np.savez(point_picker_format.format(batch_id=batch_id), x=batch_x, y=batch_y)
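# Loading sketch for the saved arrays (file name per point_picker_format):
#   data = np.load("pp_mpc96_n7_batch_100.npz")
#   batch_x, batch_y = data["x"], data["y"]
# batch_x has shape (n_events, max_pulse_count, n_features); padded slots are
# all-zero except auxiliary == -1, and batch_y holds (azimuth, zenith).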