-
Notifications
You must be signed in to change notification settings - Fork 2
/
amex-feature-engg.py
31 lines (23 loc) · 1.01 KB
/
amex-feature-engg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import cudf
from utils import generate_aggregate_features
def _customer_id_to_int64(ids):
    """Return the last 16 hex characters of each ID string parsed as int64."""
    return ids.str[-16:].str.hex_to_int().astype("int64")


# Load the raw rows and the label table.
amex_data = cudf.read_parquet("train.parquet")
targets = cudf.read_csv("train_labels.csv")

# Aggregating features: swap the string customer_ID for an integer key and
# convert S_2 with cudf.to_datetime before handing the frame to the helper.
amex_data["customer_ID"] = _customer_id_to_int64(amex_data["customer_ID"])
amex_data["S_2"] = cudf.to_datetime(amex_data["S_2"])
# NOTE(review): the index-on-index merge below presumes this helper returns a
# frame indexed by customer_ID -- confirm against utils.
amex_data = generate_aggregate_features(amex_data, "customer_ID")

# Converting customer_ID to an integer value and merging with the features.
targets["customer_ID"] = _customer_id_to_int64(targets["customer_ID"])
targets = targets.set_index("customer_ID")
amex_data = amex_data.merge(targets, left_index=True, right_index=True, how="left")
amex_data["target"] = amex_data["target"].astype("int8")

# cuDF merge does not preserve row order, so sort by the index again before
# turning it back into a column and discarding the ID.
amex_data = amex_data.sort_index().reset_index()
amex_data = amex_data.drop(columns=["customer_ID"])

print("Shape: ", amex_data.shape)
# customer_ID is already gone at this point, so only the trailing label column
# must be excluded from the count. The previous `columns[1:-1]` slice also
# skipped the first real feature and under-reported by one. (Like the original
# slice, this assumes `target` is the single, last non-feature column.)
feature_count = len(amex_data.columns) - 1
print(f"There are {feature_count} features!")
amex_data.to_parquet("amex_data.parquet", index=False)