Merge pull request #246 from Yc325/Machine_Learning_Minor_Update

Machine learning very small update
didymo · Oct 24, 2022 · 1f1673f · 1f1673f
2 parents c808b78 + b19754b
commit 1f1673f
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 11 deletions.
diff --git a/src/Model/batchprocessing/BatchProcessMachineLearning.py b/src/Model/batchprocessing/BatchProcessMachineLearning.py
@@ -45,8 +45,10 @@ def __init__(self, progress_callback, interrupt_flag,
         self.preprocessing = None
         self.run_ml = None
         self.X_train = None
+        self.X_train_for_confusion_matrix = None
         self.X_test = None
         self.y_train = None
+        self.y_train_for_confusion_matrix = None
         self.y_test = None
         self.params = None
         self.scaling = None
@@ -115,6 +117,8 @@ def preprocessing_for_ml(self):
                 self.preprocessing.scaling
             self.machine_learning_options['features'] =\
                 self.preprocessing.column_names
+            self.X_train_for_confusion_matrix = self.preprocessing.x_train_for_confusion_matrix
+            self.y_train_for_confusion_matrix = self.preprocessing.y_train_for_confusion_matrix
         self.run_model_accept =\
             self.preprocessing.permission
 
@@ -128,6 +132,8 @@ def run_model(self):
             self.run_ml = MlModeling(
                 self.X_train,
                 self.X_test,
+                self.X_train_for_confusion_matrix,
+                self.y_train_for_confusion_matrix,
                 self.y_train,
                 self.y_test,
                 self.preprocessing.target,

diff --git a/src/Model/batchprocessing/batchprocessingMachineLearning/MachineLearningTrainingStage.py b/src/Model/batchprocessing/batchprocessingMachineLearning/MachineLearningTrainingStage.py
@@ -30,6 +30,8 @@ class MlModeling():
     def __init__(self,
                  train_feature,
                  test_feature,
+                 train_feature_dataset_for_confusion_matrix,
+                 train_label_dataset_for_confusion_matrix,
                  train_label,
                  test_label,
                  target,
@@ -38,19 +40,23 @@ def __init__(self,
                  permission=None):
         self.train_feature = train_feature
         self.test_feature = test_feature
+        self.train_feature_dataset_for_confusion_matrix = train_feature_dataset_for_confusion_matrix
+        self.train_label_dataset_for_confusion_matrix = train_label_dataset_for_confusion_matrix
         self.train_label = train_label
         self.test_label = test_label
         self.target = target
         self.type_model = type_model
         self.tuning = tuning
         self.permission = permission
         self.confusion_matrix = None
+        self.train_dataset_confusion_matrix = None
         self.model = None
         self.score = None
         self.accuracy = {
             "accuracy": '',
             "model": ''
         }
+        self.model_names = ['RandomForestClassifier', 'MLPClassifier']
 
     """
     Class initializer function.
@@ -127,6 +133,7 @@ def calculate_balance(self):
         return balance
 
     def custom_confusion_matrix(self,
+                                test_label,
                                 predictions):
         """
         The function creates a confusion matrix
@@ -136,10 +143,10 @@ def custom_confusion_matrix(self,
         see here: https://towardsdatascience.com
                     /understanding-confusion-matrix-a9ad42dcfd62
         """
-        unique_label = np.unique([self.test_label,
+        unique_label = np.unique([test_label,
                                   predictions])
         cmtx = pd.DataFrame(
-            confusion_matrix(self.test_label,
+            confusion_matrix(test_label,
                              predictions,
                              labels=unique_label),
             index=['true:{:}'.format(x) for x in unique_label],
@@ -256,12 +263,14 @@ def classification_ml_tuned(self):
 
         if mlp_score > random_forest_score:
             self.confusion_matrix = self.custom_confusion_matrix(
+                self.test_label,
                 mlp_pred)
             self.score = mlp_score
             self.accuracy['accuracy'] = f'{self.score}'
             return mlp_model
 
         self.confusion_matrix = self.custom_confusion_matrix(
+            self.test_label,
             random_forest_pred)
         self.score = random_forest_score
         self.accuracy['accuracy'] = f'{self.score}'
@@ -308,12 +317,15 @@ def classification_ml(self):
         mlp_score = perfomance(mlp_pred)
 
         if mlp_score > random_forest_score:
-            self.confusion_matrix = self.custom_confusion_matrix(mlp_pred)
+            self.confusion_matrix = self.custom_confusion_matrix(
+                self.test_label,
+                mlp_pred)
             self.score = mlp_score
             self.accuracy['accuracy'] = f'{self.score}'
             return mlp_cla
 
         self.confusion_matrix = self.custom_confusion_matrix(
+            self.test_label,
             random_forest_pred)
         self.score = random_forest_score
         self.accuracy['accuracy'] = f'{self.score}'
@@ -523,15 +535,28 @@ def save_confusion_matrix(self, path):
         :param path: path where the file will be saved.
         """
         path += f'{self.target}_ML_RiskTable.txt'
-        headers = ['RISK TABLE', 'ML PERFOMANCE']
-        with open(path, 'w') as f:
-            print(f'{headers[0]}\n',
-                  file=f)
-            df_as_string = self.confusion_matrix.to_string(header=True,
+        headers = ['TRAIN DATASET RISK TABLE', 'TEST DATASET RISK TABLE', 'ML PERFOMANCE']
+        if type(self.model).__name__ in self.model_names:
+            with open(path, 'w') as f:
+                print(f'{headers[0]}\n',
+                    file=f)
+                df_as_string_train = self.train_dataset_confusion_matrix.to_string(
+                    header=True,
+                    index=True)
+                f.write(df_as_string_train)
+
+                print(f'\n\n{headers[1]}\n',
+                    file=f)
+                df_as_string_test = self.confusion_matrix.to_string(header=True,
                                                            index=True)
-            f.write(df_as_string)
-            print(f'\n\n{headers[1]}\n', file=f)
-            print(f'{self.score[0]}: {self.score[1]}', file=f)
+                f.write(df_as_string_test)
+                print(f'\n\n{headers[2]}\n', file=f)
+                print(f'{self.score[0]}: {self.score[1]}', file=f)
+        else:
+            with open(path, 'w') as f:
+                print(f'\n\n{headers[2]}\n', file=f)
+                print(f'Accuracy: {self.score}', file=f)
+
 
     def run_model(self):
         """
@@ -552,5 +577,12 @@ def run_model(self):
                 self.model = self.regression_ml_tuned()
             else:
                 self.model = self.regression_ml()
+        if type(self.model).__name__ in self.model_names:
+            train_predictions_for_confusion_matrix = self.model.predict(
+                self.train_feature_dataset_for_confusion_matrix)
+
+            self.train_dataset_confusion_matrix = self.custom_confusion_matrix(
+                self.train_label_dataset_for_confusion_matrix,
+                train_predictions_for_confusion_matrix)
 
         self.accuracy['model'] = type(self.model).__name__
diff --git a/src/Model/batchprocessing/batchprocessingMachineLearning/Preprocessing.py b/src/Model/batchprocessing/batchprocessingMachineLearning/Preprocessing.py
@@ -45,6 +45,8 @@ def __init__(self,
         self.missing_id = []
         self.permission = None
         self.permission_ids = None
+        self.x_train_for_confusion_matrix = None
+        self.y_train_for_confusion_matrix = None
 
         """
         Class initializer function.
@@ -472,6 +474,9 @@ def prepare_for_ml(self):
                 ("cat", OneHotEncoder(handle_unknown='ignore'), final_cat)
             ])
 
+            self.x_train_for_confusion_matrix = x_train.copy()
+            self.y_train_for_confusion_matrix = self.x_train_for_confusion_matrix[self.target]
+
             # Check whether the label is imbalanced; if so,
             # upsample the training set.
             if result[0]:
@@ -482,6 +487,7 @@ def prepare_for_ml(self):
             y_test = x_test[self.target]
             x_train = full_pipeline.fit_transform(x_train)
             x_test = full_pipeline.transform(x_test)
+            self.x_train_for_confusion_matrix = full_pipeline.transform(self.x_train_for_confusion_matrix)
             self.scaling = full_pipeline
 
             return x_train, x_test, y_train, y_test