-
Notifications
You must be signed in to change notification settings - Fork 0
/
credit-fraud-imbalancd.py
114 lines (92 loc) · 4.41 KB
/
credit-fraud-imbalancd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix
import itertools
# Load and preprocess the data
def load_and_preprocess_data(file_path):
    """Read the transactions CSV and replace 'Amount' and 'Time' with scaled values.

    Both columns are passed through a ``RobustScaler``; the raw columns are
    dropped and the scaled ones take over their names, ending up as the last
    two columns of the frame.

    Args:
        file_path: Location of the CSV file, handed directly to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with 'Amount' and 'Time' holding scaled values.

    Raises:
        ValueError: If any cell of the loaded data is null.
    """
    frame = pd.read_csv(file_path)

    # Stop early on incomplete data instead of letting NaNs reach the scaler.
    if frame.isnull().to_numpy().any():
        raise ValueError("Dataset contains null values")

    # The same scaler object is refit for each column, so every feature is
    # scaled with its own statistics.
    scaler = RobustScaler()
    amount_scaled = scaler.fit_transform(frame['Amount'].values.reshape(-1, 1))
    time_scaled = scaler.fit_transform(frame['Time'].values.reshape(-1, 1))

    return (
        frame
        .assign(scaled_amount=amount_scaled, scaled_time=time_scaled)
        .drop(columns=['Time', 'Amount'])
        .rename(columns={'scaled_amount': 'Amount', 'scaled_time': 'Time'})
    )
# Split the data into training and testing sets
def split_data(df, test_size=0.2, random_state=42, stratify=True):
    """Split the frame into train/test features and labels.

    Args:
        df: DataFrame with a 'Class' label column; every other column is
            treated as a feature.
        test_size: Size of the held-out test partition, forwarded to
            ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When True (the default) the split is stratified on 'Class'
            so the rare positive class is represented in both partitions in
            the same proportion. Pass False for a plain random split.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.

    Raises:
        KeyError: If ``df`` has no 'Class' column.
    """
    X = df.drop(columns='Class')
    y = df['Class']
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        # With a heavily imbalanced label an unstratified split can leave the
        # test set with a noticeably different fraud ratio than the train set.
        stratify=y if stratify else None,
    )
# Handle class imbalance with undersampling
def undersample_data(X_train, y_train):
    """Resample the training data with NearMiss (version 1) to address class imbalance.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by
        ``NearMiss.fit_resample``.
    """
    sampler = NearMiss(version=1)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled
# Handle class imbalance with SMOTE
def oversample_data(X_train, y_train, sampling_strategy='minority', random_state=42):
    """Resample the training data with SMOTE to address class imbalance.

    Args:
        X_train: Training features.
        y_train: Training labels.
        sampling_strategy: Forwarded to ``SMOTE``; defaults to 'minority',
            the value this function previously hard-coded.
        random_state: Seed forwarded to ``SMOTE``; defaults to 42, the value
            this function previously hard-coded.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by
        ``SMOTE.fit_resample``.
    """
    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state)
    return smote.fit_resample(X_train, y_train)
# Build and compile a neural network model
def build_nn_model(input_dim, learning_rate=0.001):
    """Build and compile a small fully connected binary classifier.

    The network is Dense(16, relu) -> Dense(8, relu) -> Dense(1, sigmoid),
    compiled with Adam, binary cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
        Dense(16, input_dim=input_dim, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        # `lr` is the deprecated spelling and is rejected by current Keras
        # releases; `learning_rate` is the supported keyword.
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
# Train the model
def train_model(model, X_train, y_train, batch_size=25, epochs=20, validation_split=0.2):
    """Fit ``model`` on the training data and return the fit result.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training features.
        y_train: Training labels.
        batch_size: Batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.
        validation_split: Validation fraction passed to ``fit``.

    Returns:
        Whatever ``model.fit`` returns (for Keras models, the training
        history). Previously this value was discarded, so callers that ignore
        the return value are unaffected.
    """
    return model.fit(
        X_train,
        y_train,
        validation_split=validation_split,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        verbose=2,
    )
# Evaluate model performance with a confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        classes: Tick labels, used for both axes.
        normalize: When True, each row is divided by its row sum and cells
            are printed with two decimals; otherwise cells are printed as
            integers.
        title: Title shown above the plot.
        cmap: Colormap passed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells above half of the largest value get white text, the rest black,
    # so the annotation contrasts with the cell colour.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format), horizontalalignment="center", color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
def _train_and_plot(X_train, y_train, X_test, y_test, title, batch_size=25):
    """Train a fresh network on one resampled set and plot its test confusion matrix."""
    model = build_nn_model(X_train.shape[1])
    train_model(model, X_train, y_train, batch_size=batch_size)

    probabilities = model.predict(X_test, batch_size=200, verbose=0)
    # Threshold the sigmoid output into flat integer labels so the confusion
    # matrix compares like with like (y_test holds integer classes).
    predicted = (probabilities > 0.5).astype(int).ravel()
    cm = confusion_matrix(y_test, predicted)

    # Open a new figure per matrix; otherwise successive plots are drawn on
    # top of each other in the same implicit figure.
    plt.figure()
    plot_confusion_matrix(cm, classes=['No Fraud', 'Fraud'], title=title)


def main(file_path):
    """Run the undersampling and SMOTE experiments and show both confusion matrices.

    The data is loaded and split once. A separate network is then trained on
    a NearMiss-undersampled training set and on a SMOTE-oversampled training
    set, and each is evaluated on the same untouched test set.

    Args:
        file_path: Path of the CSV dataset to load.
    """
    # Load and preprocess the data
    df = load_and_preprocess_data(file_path)

    # Split the data
    X_train, X_test, y_train, y_test = split_data(df)

    # Undersample, then build, train, and evaluate the undersampling model
    X_train_undersample, y_train_undersample = undersample_data(X_train, y_train)
    _train_and_plot(
        X_train_undersample,
        y_train_undersample,
        X_test,
        y_test,
        title='Confusion matrix (NearMiss undersampling)',
    )

    # Oversample, then build, train, and evaluate the oversampling model
    X_train_smote, y_train_smote = oversample_data(X_train, y_train)
    _train_and_plot(
        X_train_smote,
        y_train_smote,
        X_test,
        y_test,
        title='Confusion matrix (SMOTE oversampling)',
        batch_size=300,
    )

    plt.show()
if __name__ == "__main__":
    import sys

    # Take the dataset path from the first command-line argument when given;
    # otherwise fall back to the file name that used to be hard-coded, so
    # running the script with no arguments behaves as before.
    main(sys.argv[1] if len(sys.argv) > 1 else "creditcard.csv")