TCI

aidotse · Nov 27, 2023 · 9eb32f7 · 9eb32f7
1 parent ff7ac45
commit 9eb32f7
Show file tree

Hide file tree

Showing 10 changed files with 114 additions and 118 deletions.
diff --git a/AMLsim/paramFiles/1bank/conf.json b/AMLsim/paramFiles/1bank/conf.json
@@ -8,9 +8,9 @@
     "min_amount": 1,
     "max_amount": 150000,
     "mean_amount": 637,
-    "std_amount": 3000,
+    "std_amount": 200,
     "mean_amount_sar": 2000,
-    "std_amount_sar": 3000,
+    "std_amount_sar": 200,
     "prob_income": 0.0,
     "mean_income": 0.0,
     "std_income": 0.0,
@@ -19,8 +19,8 @@
     "std_income_sar": 0.0,
     "mean_outcome": 200.0,
     "std_outcome": 500.0,
-    "mean_outcome_sar": 0.0,
-    "std_outcome_sar": 0.0,
+    "mean_outcome_sar": 200.0,
+    "std_outcome_sar": 500.0,
     "mean_phone_change_frequency": 1460,
     "std_phone_change_frequency": 365,
     "mean_phone_change_frequency_sar": 365,

diff --git a/README.md b/README.md
@@ -6,7 +6,33 @@ This is the repsitory for all the code in the project.
 
 ## Currently containing
 
-AMLSim
+# AMLsim
+AMLsim is a simulator for generating transaction networks used in anti-money laundering research. It is based on the simulator by IBM (TODO: add link) and is extended to utilize distributions and model behavioural features. In short, it has two parts: a Python part for generating the transaction network and a Java part for simulating the behaviour of the agents. The simulation is controlled by six parameter files:
+* A JSON file, which defines the behaviours of accounts and some path variables used during the simulation.
+* Five CSV files, which define some initial conditions and together define the structure of the transaction network.
+
+## Dependencies
+
+### Alternative 1: Docker
+
+1. Pull the image `thecoldice/amlsim:latest` from Docker Hub.
+
+### Alternative 2: Manual
+
+Dependencies: python3.7, java, maven
+
+1. Clone the repository.
+2. Move into the `AMLsim` folder.
+3. install python dependencies: `pip install -r requirements.txt` or `conda env create -f AMLamlsim.yml`
+4. install java dependencies: `mvn install:install-file -Dfile=jars/mason.20.jar -DgroupId=mason -DartifactId=mason -Dversion=20 -Dpackaging=jar -DgeneratePom=true`
+
+## Setup
+
+1. Create a folder for the outputs: `mkdir outputs`
+2. (Only for manual) Create a temporary folder for storing Python output: `mkdir tmp`
+3. Create a folder for the simulation parameters: `mkdir paramFiles`
+4. In paramFiles, create a folder for a new simulation, e.g. `mkdir paramFiles/simulation1`
+5. In the simulation folder, create these files: conf.json, accounts.csv, normalModels.csv, alertPatterns.csv, degree.csv and transactionTypes.csv
 
 Transaction Network Explorer
 

diff --git a/gnn/__pycache__/criterions.cpython-311.pyc b/gnn/__pycache__/criterions.cpython-311.pyc
diff --git a/gnn/__pycache__/data.cpython-311.pyc b/gnn/__pycache__/data.cpython-311.pyc
diff --git a/gnn/__pycache__/modules.cpython-311.pyc b/gnn/__pycache__/modules.cpython-311.pyc
diff --git a/gnn/criterions.py b/gnn/criterions.py
@@ -0,0 +1,31 @@
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+class ClassBalancedLoss(torch.nn.Module):
+    """Binary cross-entropy loss with samples re-weighted by class frequency.
+
+    Class ``c`` with ``n_c`` samples gets the raw weight
+    ``(1 - beta) / (1 - beta ** n_c)``; the weights are then rescaled so that
+    they sum to the number of classes. The weights depend only on the
+    constructor arguments, so they are computed once here rather than on
+    every forward pass.
+
+    Args:
+        beta: Base of the ``1 - beta ** n_c`` term.
+        n_samples_per_classes: Sample count per class, indexed by class id.
+        loss_type: ``"sigmoid"`` or ``"softmax"``; validated in ``forward``.
+    """
+
+    def __init__(self, beta, n_samples_per_classes, loss_type):
+        super(ClassBalancedLoss, self).__init__()
+        self.beta = beta
+        self.effective_nums = 1.0 - np.power(beta, n_samples_per_classes)
+        self.n_classes = len(n_samples_per_classes)
+        self.loss_type = loss_type
+
+        effective_nums = np.asarray(self.effective_nums, dtype=np.float64)
+        # A class without samples has effective number 0, which previously
+        # produced an infinite weight and NaN after normalisation. Provided
+        # the counts describe the labels later passed to forward(), such a
+        # class is never selected, so it simply gets weight 0.
+        non_empty = effective_nums > 0
+        weights = np.zeros_like(effective_nums)
+        weights[non_empty] = (1.0 - beta) / effective_nums[non_empty]
+        weights = weights / np.sum(weights) * self.n_classes
+        # Plain attribute, not a registered buffer, so state_dict() is unchanged.
+        self._class_weights = torch.tensor(weights).float()
+
+    def forward(self, logits, labels):
+        """Return the weighted loss (default mean reduction) for a batch.
+
+        Args:
+            logits: Raw scores with classes along dim 1.
+            labels: Class index of every sample; cast to int64 here.
+
+        Raises:
+            ValueError: If ``loss_type`` is neither ``"sigmoid"`` nor
+                ``"softmax"``.
+        """
+        labels = labels.to(torch.int64)
+        labels_one_hot = F.one_hot(labels, self.n_classes).float()
+        # Look up each sample's class weight and spread it over all class columns.
+        class_weights = self._class_weights.to(logits.device)
+        weights = class_weights[labels].unsqueeze(1).expand_as(labels_one_hot)
+        if self.loss_type == "sigmoid":
+            loss = F.binary_cross_entropy_with_logits(input=logits,target=labels_one_hot,weight=weights)
+        elif self.loss_type == "softmax":
+            pred = logits.softmax(dim=1)
+            loss = F.binary_cross_entropy(input=pred,target=labels_one_hot,weight=weights)
+        else:
+            raise ValueError("loss_type must be sigmoid or softmax")
+        return loss
diff --git a/gnn/data.py b/gnn/data.py
@@ -74,8 +74,8 @@ def load_data(self, node_file, edge_file):
         edge_index = torch.tensor(edges[['src', 'dst']].values, dtype=torch.long)
         edge_index = edge_index.t().contiguous()
         x = torch.tensor(nodes[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']].values, dtype=torch.float)
-        y = torch.tensor(nodes['y'].values, dtype=torch.float)
-        y = torch.nn.functional.one_hot(y.type(torch.long), num_classes=2).type(torch.float)
+        y = torch.tensor(nodes['y'].values, dtype=torch.long)
+        #y = torch.nn.functional.one_hot(y.type(torch.long), num_classes=2).type(torch.float)
         data = Data(x=x, edge_index=edge_index, y=y)
         return data
 

diff --git a/gnn/main.py b/gnn/main.py
@@ -1,105 +1,23 @@
 import torch
 import torch.optim as optim
-import optuna
-from optuna.trial import TrialState
-from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
-import networkx as nx
-import matplotlib.pyplot as plt
+import random
+import numpy as np
+from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix
 
+from data import AmlsimDataset
 from modules import GCN
-from data import EllipticDataset, AmlsimDataset
-
-def define_gcn(trial):
-    n_layers = trial.suggest_int("n_layers", 2, 5)
-    hidden_dim = trial.suggest_int("hidden_dim", 2**5, 2**8, log=True)
-    dropout = trial.suggest_float("dropout", 0.3, 0.7)
-    return GCN(165,hidden_dim,2,n_layers,dropout)
-
-def objective_gcn(trial, data, train_indices, val_indices, device):
-    model = define_gcn(trial).to(device)
-
-    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
-    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
-    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
-    criterion = torch.nn.BCELoss()
-    t = trial.suggest_float("t", 0.2, 0.6)
-    for epoch in range(100):
-
-        model.train()
-        data = data.to(device)
-        optimizer.zero_grad()
-        out = model(data)
-
-        tmp = torch.nn.functional.one_hot(data.y.type(torch.long)).type(torch.float)
-        loss = criterion(out[train_indices], tmp[train_indices])
-        y = out.detach()[:, 1]
-        y = (y > t).type(torch.long)
-        f1 = f1_score(data.y.cpu()[train_indices], y.cpu()[train_indices])
-
-        loss.backward()
-        optimizer.step()
-
-        model.eval()
-        with torch.no_grad():
-            valf1 = f1_score(data.y.cpu()[val_indices], y.cpu()[val_indices])
-            trial.report(valf1, epoch)
-
-        if trial.should_prune():
-            raise optuna.exceptions.TrialPruned()
-
-    torch.save(model.state_dict(), "models/gcn-" + str(trial.number) + ".pth")
-    return valf1
-
-def eval_gcn(device):
-    # load data
-    elliptic_data = EllipticDataset("data/elliptic_bitcoin_dataset", val_size=0.15, test_size=0.15, seed=42)
-    data, train_indices, val_indices, test_indices, train_labels, val_labels, test_labels = elliptic_data.get_data()
-
-    # train and optimize hyperparameters
-    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
-    study.optimize(
-        lambda trial: objective_gcn(trial, data, train_indices, val_indices, device), n_trials=100, timeout=10000,
-    )
-
-    # result of hyperparamter optimization
-    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
-    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
-    print("Study statistics: ")
-    print("  Number of finished trials: ", len(study.trials))
-    print("  Number of pruned trials: ", len(pruned_trials))
-    print("  Number of complete trials: ", len(complete_trials))
-
-    # retrieve best trial from hyperparameter optimization
-    print("Best trial:")
-    trial = study.best_trial
-    print("  Value: ", trial.value)
-    print("  Params: ")
-    for key, value in trial.params.items():
-        print("    {}: {}".format(key, value))
-    print("\t Trial number: ", trial.number)
-
-    # reconstruct best trained model
-    state_dict = torch.load("models/gcn-" + str(trial.number) + ".pth")
-    #files.download("gcn-" + str(trial.number) + ".pth")
-    model = GCN(165,trial.params["hidden_dim"],2,trial.params["n_layers"],trial.params["dropout"])
-    model.load_state_dict(state_dict)
-
-    # evaluate best trained model using test set
-    model.to(device)
-    model.eval()
-    out = model(data)
-    tmp = torch.nn.functional.one_hot(data.y.type(torch.long)).type(torch.float)
-    y = out.detach()[:, 1]
-    y = (y > trial.params["t"]).type(torch.long)
-    f1 = f1_score(data.y.cpu()[test_indices], y.cpu()[test_indices])
-    acc = accuracy_score(data.y.cpu()[test_indices], y.cpu()[test_indices])
-    pre = precision_score(data.y.cpu()[test_indices], y.cpu()[test_indices])
-    rec = recall_score(data.y.cpu()[test_indices], y.cpu()[test_indices])
-    print("test performance:")
-    print(f"\t f1: {f1}")
-    print(f"\t acc: {acc}")
-    print(f"\t pre: {pre}")
-    print(f"\t rec: {rec}")
+from criterions import ClassBalancedLoss
+
+def set_random_seed(seed:int=1, deterministic:bool=True):
+    """Seed the Python, NumPy and PyTorch random number generators.
+
+    Args:
+        seed: Seed passed to every generator.
+        deterministic: If true (the default, matching the previous
+            behaviour), also set ``cudnn.deterministic = True`` and
+            ``cudnn.benchmark = False``. Pass ``False`` to leave the cuDNN
+            flags untouched.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if deterministic:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
 
 def train_gcn(device):
     # data
@@ -108,6 +26,12 @@ def train_gcn(device):
     traindata = traindata.to(device)
     testdata = testdata.to(device)
 
+    # normalize features
+    mean = traindata.x.mean(dim=0, keepdim=True)
+    std = traindata.x.std(dim=0, keepdim=True)
+    traindata.x = (traindata.x - mean) / std
+    testdata.x = (testdata.x - mean) / std
+
     # model
     input_dim = 10
     hidden_dim = 16
@@ -118,33 +42,32 @@ def train_gcn(device):
     model.to(device)
 
     # optimizer
-    lr = 0.01
+    lr = 0.1
     optimizer = optim.Adam(model.parameters(), lr=lr)
 
     # loss function
-    criterion = torch.nn.BCELoss()
+    beta = 0.99999999
+    n_samples_per_classes = [(traindata.y == 0).sum().item(), (traindata.y == 1).sum().item()]
+    criterion = ClassBalancedLoss(beta=beta, n_samples_per_classes=n_samples_per_classes, loss_type='sigmoid')
 
-    for epoch in range(100):
+    for epoch in range(500):
         model.train()
         optimizer.zero_grad()
         out = model(traindata)
         loss = criterion(out, traindata.y)
         loss.backward()
         optimizer.step()
-
-        print(loss.item())
-
         if epoch % 10 == 0:
             model.eval()
             with torch.no_grad():
                 out = model(testdata)
-                loss = criterion(out, testdata.y) # TODO: some out values are nan 
-                print(f"epoch: {epoch}, loss: {loss}")
+                loss = criterion(out, testdata.y)
+                balanced_accuracy = balanced_accuracy_score(testdata.y.cpu().numpy(), out.cpu().numpy().argmax(axis=1))
+                print(f'epoch: {epoch}, loss: {loss:.4f}, balanced_accuracy: {balanced_accuracy:.4f}')
 
 def main():
-    device = torch.device("cpu")
-    #eval_gcn(device)
-    train_gcn(device)
+    set_random_seed(42)
+    # Use the first GPU when one is present; otherwise run on the CPU instead
+    # of failing on machines without CUDA.
+    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    train_gcn(device)
 
 if __name__ == "__main__":
     main()
diff --git a/gnn/modules.py b/gnn/modules.py
@@ -70,4 +70,20 @@ def forward(self, data, adj_t=None):
             x = F.relu(x)
             x = F.dropout(x, p=self.dropout, training=self.training)
         out = self.softmax(x)
-        return out
+        return out
+
+class GCN2(torch.nn.Module):
+    """Two stacked ``GCNConv`` layers with ReLU and dropout in between.
+
+    NOTE: ``forward`` returns log-softmax values over dim 1, not raw logits.
+
+    Args:
+        input_dim: Input size of the first convolution.
+        hidden_dim: Output size of the first and input size of the second
+            convolution.
+        output_dim: Output size of the second convolution.
+        dropout: Dropout probability applied after the ReLU. The default of
+            0.5 equals the ``F.dropout`` default that was used implicitly
+            before.
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
+        super().__init__()
+        self.conv1 = GCNConv(input_dim, hidden_dim)
+        self.conv2 = GCNConv(hidden_dim, output_dim)
+        self.dropout = dropout
+
+    def forward(self, data):
+        """Run both convolutions on ``data.x`` / ``data.edge_index``."""
+        x, edge_index = data.x, data.edge_index
+
+        x = self.conv1(x, edge_index)
+        x = F.relu(x)
+        # Dropout is only active while the module is in training mode.
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = self.conv2(x, edge_index)
+
+        return F.log_softmax(x, dim=1)
diff --git a/gnn/preprocessing.py b/gnn/preprocessing.py
@@ -51,7 +51,7 @@ def cal_node_features(df:pd.DataFrame) -> pd.DataFrame:
     sums = gb['amount'].sum()
     means = gb['amount'].mean()
     medians = gb['amount'].median()
-    stds = gb['amount'].std()
+    stds = gb['amount'].std().fillna(0.0)
     maxs = gb['amount'].max()
     mins = gb['amount'].min()
     in_degrees = gb['amount'].apply(lambda x: (x>0).sum())