Skip to content

Commit

Permalink
TCI
Browse files Browse the repository at this point in the history
  • Loading branch information
TheColdIce committed Nov 27, 2023
1 parent ff7ac45 commit 9eb32f7
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 118 deletions.
8 changes: 4 additions & 4 deletions AMLsim/paramFiles/1bank/conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
"min_amount": 1,
"max_amount": 150000,
"mean_amount": 637,
"std_amount": 3000,
"std_amount": 200,
"mean_amount_sar": 2000,
"std_amount_sar": 3000,
"std_amount_sar": 200,
"prob_income": 0.0,
"mean_income": 0.0,
"std_income": 0.0,
Expand All @@ -19,8 +19,8 @@
"std_income_sar": 0.0,
"mean_outcome": 200.0,
"std_outcome": 500.0,
"mean_outcome_sar": 0.0,
"std_outcome_sar": 0.0,
"mean_outcome_sar": 200.0,
"std_outcome_sar": 500.0,
"mean_phone_change_frequency": 1460,
"std_phone_change_frequency": 365,
"mean_phone_change_frequency_sar": 365,
Expand Down
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,33 @@ This is the repsitory for all the code in the project.

## Currently containing

AMLSim
# AMLsim
AMLsim is a simulator for generating transaction networks used in anti-money laundering research. It is based on the simulator by IBM (TODO: add link) and is extended to utilize distributions and model behavioural features. In short, it has two parts: a Python part for generating the transaction network and a Java part for simulating the behaviour of the agents. The simulation is controlled by 6 parameter files:
* A JSON file, which defines the behaviours of accounts and some path variables used during the simulation.
* 5 CSV files, which define some initial conditions and together define the structure of the transaction network.

## Dependencies

### Alternative 1: Docker

1. Pull the image `thecoldice/amlsim:latest` from Docker Hub.

### Alternative 2: Manual

Dependencies: python3.7, java, maven

1. Clone the repo.
2. Move into the AMLsim folder.
3. Install the Python dependencies: `pip install -r requirements.txt` or `conda env create -f AMLamlsim.yml`
4. Install the Java dependencies: `mvn install:install-file -Dfile=jars/mason.20.jar -DgroupId=mason -DartifactId=mason -Dversion=20 -Dpackaging=jar -DgeneratePom=true`

## Setup

1. Create a folder for the outputs: `mkdir outputs`
2. (Only for manual) Create a temporary folder for storing Python output: `mkdir tmp`
3. Create a folder for the simulation parameters: `mkdir paramFiles`
4. In paramFiles, create a folder for a new simulation, e.g. `mkdir paramFiles/simulation1`
5. In the simulation folder, create these files: conf.json, accounts.csv, normalModels.csv, alertPatterns.csv, degree.csv and transactionTypes.csv

Transaction Network Explorer

Expand Down
Binary file added gnn/__pycache__/criterions.cpython-311.pyc
Binary file not shown.
Binary file modified gnn/__pycache__/data.cpython-311.pyc
Binary file not shown.
Binary file modified gnn/__pycache__/modules.cpython-311.pyc
Binary file not shown.
31 changes: 31 additions & 0 deletions gnn/criterions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import numpy as np
import torch
from torch.nn import functional as F

class ClassBalancedLoss(torch.nn.Module):
    """Binary cross-entropy re-weighted by the effective number of samples.

    Class ``c`` with ``n_c`` training samples receives the raw weight
    ``(1 - beta) / (1 - beta ** n_c)``. The raw weights are rescaled so that
    they sum to the number of classes, every sample is weighted by the weight
    of its true class, and the weighted binary cross-entropy against the
    one-hot labels is averaged over all elements.

    Args:
        beta: Base of the effective-number term.
        n_samples_per_classes: Number of training samples of every class,
            indexed by class id.
        loss_type: ``"sigmoid"`` applies the loss directly to the logits,
            ``"softmax"`` applies it to ``softmax(logits)``.

    Attributes:
        class_weights: Normalised per-class weights (float32). Stored as a
            non-persistent buffer so the module's ``state_dict`` stays empty.
    """

    def __init__(self, beta, n_samples_per_classes, loss_type):
        super().__init__()
        self.beta = beta
        counts = np.asarray(n_samples_per_classes, dtype=np.float64)
        self.effective_nums = 1.0 - np.power(beta, counts)
        self.n_classes = len(n_samples_per_classes)
        self.loss_type = loss_type

        # The weights depend only on the constructor arguments, so compute
        # them once here instead of on every forward pass.
        weights = np.zeros_like(self.effective_nums)
        regular = self.effective_nums != 0
        weights[regular] = (1.0 - beta) / self.effective_nums[regular]
        # A populated class can only have a zero effective number when
        # beta ** n rounds to 1; use the limit of the weight formula, 1 / n.
        limit = ~regular & (counts > 0)
        weights[limit] = 1.0 / counts[limit]
        # Classes without samples keep weight 0 rather than turning every
        # weight into NaN through a division by zero.
        total = weights.sum()
        if total > 0:
            weights = weights / total * self.n_classes
        self.register_buffer(
            "class_weights",
            torch.tensor(weights, dtype=torch.float32),
            persistent=False,
        )

    def forward(self, logits, labels):
        """Return the mean class-balanced loss.

        Args:
            logits: Raw model outputs with one column per class.
            labels: 1-D tensor of class ids, one per row of ``logits``.

        Returns:
            Scalar tensor holding the weighted loss.

        Raises:
            ValueError: If ``loss_type`` is neither "sigmoid" nor "softmax".
        """
        labels = labels.to(torch.int64)
        labels_one_hot = F.one_hot(labels, self.n_classes).float()
        # Look up each sample's class weight and share it across all columns.
        sample_weights = self.class_weights.to(logits.device)[labels]
        weights = sample_weights.unsqueeze(1).expand_as(labels_one_hot)
        if self.loss_type == "sigmoid":
            return F.binary_cross_entropy_with_logits(
                input=logits, target=labels_one_hot, weight=weights
            )
        if self.loss_type == "softmax":
            pred = logits.softmax(dim=1)
            return F.binary_cross_entropy(
                input=pred, target=labels_one_hot, weight=weights
            )
        raise ValueError("loss_type must be sigmoid or softmax")
4 changes: 2 additions & 2 deletions gnn/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def load_data(self, node_file, edge_file):
edge_index = torch.tensor(edges[['src', 'dst']].values, dtype=torch.long)
edge_index = edge_index.t().contiguous()
x = torch.tensor(nodes[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']].values, dtype=torch.float)
y = torch.tensor(nodes['y'].values, dtype=torch.float)
y = torch.nn.functional.one_hot(y.type(torch.long), num_classes=2).type(torch.float)
y = torch.tensor(nodes['y'].values, dtype=torch.long)
#y = torch.nn.functional.one_hot(y.type(torch.long), num_classes=2).type(torch.float)
data = Data(x=x, edge_index=edge_index, y=y)
return data

Expand Down
141 changes: 32 additions & 109 deletions gnn/main.py
Original file line number Diff line number Diff line change
@@ -1,105 +1,23 @@
import torch
import torch.optim as optim
import optuna
from optuna.trial import TrialState
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import networkx as nx
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix

from data import AmlsimDataset
from modules import GCN
from data import EllipticDataset, AmlsimDataset

def define_gcn(trial):
    """Build a GCN whose architecture is sampled from an Optuna trial.

    The trial proposes the number of layers (2-5), the hidden width
    (32-256, log scale) and the dropout probability (0.3-0.7). Input and
    output sizes are fixed to 165 and 2.
    """
    # Keep the suggestion order: it determines the sampler's draw sequence.
    depth = trial.suggest_int("n_layers", 2, 5)
    width = trial.suggest_int("hidden_dim", 32, 256, log=True)
    drop_prob = trial.suggest_float("dropout", 0.3, 0.7)
    model = GCN(165, width, 2, depth, drop_prob)
    return model

def objective_gcn(trial, data, train_indices, val_indices, device):
    """Optuna objective: train a GCN for 100 epochs and return validation F1.

    The trial chooses the model architecture (via ``define_gcn``), the
    optimizer, the learning rate and the decision threshold ``t`` applied to
    the class-1 output. After every epoch the validation F1 is reported to
    the trial so unpromising runs can be pruned. The final weights are saved
    to ``models/gcn-<trial number>.pth``.

    Args:
        trial: Optuna trial used to sample hyperparameters and report scores.
        data: Graph data object passed to the model; its ``y`` holds the
            node labels.
        train_indices: Indices of the nodes used for the training loss.
        val_indices: Indices of the nodes used for the validation F1.
        device: Torch device the model and data are moved to.

    Returns:
        Validation F1 score after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner stops the trial.
    """
    model = define_gcn(trial).to(device)

    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = torch.nn.BCELoss()
    t = trial.suggest_float("t", 0.2, 0.6)

    # Loop-invariant work: device transfer, one-hot targets, validation labels.
    data = data.to(device)
    targets = torch.nn.functional.one_hot(
        data.y.type(torch.long), num_classes=2
    ).type(torch.float)
    val_true = data.y.cpu()[val_indices]

    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[train_indices], targets[train_indices])
        loss.backward()
        optimizer.step()

        # Re-run the forward pass in eval mode so the validation predictions
        # come from the updated weights with dropout disabled, not from the
        # training-mode output above.
        model.eval()
        with torch.no_grad():
            val_scores = model(data)[:, 1]
        val_pred = (val_scores > t).type(torch.long).cpu()[val_indices]
        valf1 = f1_score(val_true, val_pred)
        trial.report(valf1, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    torch.save(model.state_dict(), "models/gcn-" + str(trial.number) + ".pth")
    return valf1

def eval_gcn(device):
    """Tune a GCN on the Elliptic dataset with Optuna and print test metrics.

    Runs a TPE study (at most 100 trials or 10000 seconds) that maximises the
    validation F1 returned by ``objective_gcn``, prints study statistics and
    the best trial, reloads the weights that trial saved, and evaluates them
    on the test split.

    Args:
        device: Torch device used for training and evaluation.
    """
    # load data
    elliptic_data = EllipticDataset("data/elliptic_bitcoin_dataset", val_size=0.15, test_size=0.15, seed=42)
    data, train_indices, val_indices, test_indices, _, _, _ = elliptic_data.get_data()

    # train and optimize hyperparameters
    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(
        lambda trial: objective_gcn(trial, data, train_indices, val_indices, device),
        n_trials=100,
        timeout=10000,
    )

    # result of hyperparameter optimization
    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
    print("Study statistics: ")
    print(" Number of finished trials: ", len(study.trials))
    print(" Number of pruned trials: ", len(pruned_trials))
    print(" Number of complete trials: ", len(complete_trials))

    # retrieve best trial from hyperparameter optimization
    print("Best trial:")
    trial = study.best_trial
    print(" Value: ", trial.value)
    print(" Params: ")
    for key, value in trial.params.items():
        print(" {}: {}".format(key, value))
    print("\t Trial number: ", trial.number)

    # reconstruct best trained model; map_location lets a checkpoint written
    # on one device be restored on another
    state_dict = torch.load("models/gcn-" + str(trial.number) + ".pth", map_location=device)
    model = GCN(165, trial.params["hidden_dim"], 2, trial.params["n_layers"], trial.params["dropout"])
    model.load_state_dict(state_dict)

    # evaluate best trained model using test set
    model.to(device)
    model.eval()
    data = data.to(device)
    with torch.no_grad():  # inference only, no autograd graph needed
        scores = model(data)[:, 1]
    # Threshold the class-1 output with the threshold tuned by the best trial.
    y_pred = (scores > trial.params["t"]).type(torch.long).cpu()[test_indices]
    y_true = data.y.cpu()[test_indices]
    f1 = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    print("test performance:")
    print(f"\t f1: {f1}")
    print(f"\t acc: {acc}")
    print(f"\t pre: {pre}")
    print(f"\t rec: {rec}")
from criterions import ClassBalancedLoss

def set_random_seed(seed: int = 1, *, deterministic: bool = True):
    """Seed every random number generator used by the training code.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices).

    Args:
        seed: Seed value shared by all generators.
        deterministic: When True (the default, matching the previous
            behaviour) cuDNN is additionally forced to use deterministic
            algorithms and its auto-tuner is switched off, so repeated runs
            are reproducible. Pass False to leave the cuDNN flags untouched.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device, so a separate torch.cuda.manual_seed call
    # for the current device is not needed.
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def train_gcn(device):
# data
Expand All @@ -108,6 +26,12 @@ def train_gcn(device):
traindata = traindata.to(device)
testdata = testdata.to(device)

# normalize features
mean = traindata.x.mean(dim=0, keepdim=True)
std = traindata.x.std(dim=0, keepdim=True)
traindata.x = (traindata.x - mean) / std
testdata.x = (testdata.x - mean) / std

# model
input_dim = 10
hidden_dim = 16
Expand All @@ -118,33 +42,32 @@ def train_gcn(device):
model.to(device)

# optimizer
lr = 0.01
lr = 0.1
optimizer = optim.Adam(model.parameters(), lr=lr)

# loss function
criterion = torch.nn.BCELoss()
beta = 0.99999999
n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')

for epoch in range(100):
for epoch in range(500):
model.train()
optimizer.zero_grad()
out = model(traindata)
loss = criterion(out, traindata.y)
loss.backward()
optimizer.step()

print(loss.item())

if epoch % 10 == 0:
model.eval()
with torch.no_grad():
out = model(testdata)
loss = criterion(out, testdata.y) # TODO: some out values are nan
print(f"epoch: {epoch}, loss: {loss}")
loss = criterion(out, testdata.y)
balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
print(f'epoch: {epoch}, loss: {loss:.4f}, balanced_accuracy: {balanced_accuracy:.4f}')

def main():
device = torch.device("cpu")
#eval_gcn(device)
train_gcn(device)
set_random_seed(42)
train_gcn(torch.device('cuda:0'))

if __name__ == "__main__":
main()
18 changes: 17 additions & 1 deletion gnn/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,20 @@ def forward(self, data, adj_t=None):
x = F.relu(x)
x = F.dropout(x, p=self.dropout, training=self.training)
out = self.softmax(x)
return out
return out

class GCN2(torch.nn.Module):
    """Two-layer graph convolutional network returning log-probabilities.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the hidden representation.
        output_dim: Number of output classes.
        dropout: Dropout probability applied after the first layer during
            training. Defaults to 0.5, the value previously used implicitly
            through ``F.dropout``'s default.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.dropout = dropout

    def forward(self, data):
        """Return per-node log-softmax scores.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
2 changes: 1 addition & 1 deletion gnn/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def cal_node_features(df:pd.DataFrame) -> pd.DataFrame:
sums = gb['amount'].sum()
means = gb['amount'].mean()
medians = gb['amount'].median()
stds = gb['amount'].std()
stds = gb['amount'].std().fillna(0.0)
maxs = gb['amount'].max()
mins = gb['amount'].min()
in_degrees = gb['amount'].apply(lambda x: (x>0).sum())
Expand Down

0 comments on commit 9eb32f7

Please sign in to comment.