git initiated

Merck · Sep 1, 2023 · 6a8cc19 · 6a8cc19
commit 6a8cc19
Show file tree

Hide file tree

Showing 21 changed files with 1,651 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/LICENSE b/LICENSE
diff --git a/LICENSES_THIRD_PARTY b/LICENSES_THIRD_PARTY
@@ -0,0 +1,20 @@
+--------------------------------------------------
+Third party dependencies listed by License type
+[Format: name (Python module) - URL]
+--------------------------------------------------
+
+OSI Approved (new BSD)
+* SciPy (scipy) -  https://github.com/scipy/scipy/blob/main/LICENSE.txt
+* scikit-learn (sklearn) - https://github.com/scikit-learn/scikit-learn/blob/main/COPYING
+
+BSD 3-Clause License
+* Numpy (numpy) - https://github.com/numpy/numpy/blob/main/LICENSE.txt
+* Pandas (pandas) - https://github.com/pandas-dev/pandas/blob/5aba6659e422e985683cfb46c07c3364a02b6e5b/AUTHORS.md
+* Dill (dill) - https://github.com/uqfoundation/dill/blob/master/LICENSE
+
+MIT License (MIT)
+* Keras (keras) - https://github.com/keras-team/keras/blob/dc698c5486117780b643eda0a2f60a8753625b8a/LICENSE
+* LightGBM (lightgbm) - https://github.com/microsoft/LightGBM/blob/master/LICENSE
+
+Apache Software License (Apache 2.0)
+* TensorFlow (tensorflow) - https://github.com/tensorflow/tensorflow/blob/6b6d843ccab78f9f91c3b98a43ca09ffecad4747/LICENSE
diff --git a/README.md b/README.md
@@ -0,0 +1,44 @@
+# Prediction Uncertainty for QSAR
+
+This package contains Python code to construct prediction intervals for QSAR regression.
+The implemented QSAR prediction models include: Random Forests, Fully-Connected Neural Networks, and Gradient Boosting.
+The methodology for developing prediction intervals accompanying the point predictors can be found in References [1] and [2].
+
+Authors: Yuting Xu, Andy Liaw, Robert P. Sheridan, and Vladimir Svetnik
+
+Affiliation: Merck & Co., Inc., Rahway, New Jersey 07065, United States
+
+Maintainer: yuting.xu@merck.com
+
+Last updated: 08/29/2023
+
+## Workflow
+
+<p align="center">
+  <img src="docs/readme_logo.jpg" alt="Logo_workflow" width="600">
+</p>
+
+## Usage
+
+The code is written in a functional style (plain functions, no classes) and requires no installation.
+Simply clone or download the repository to your local machine, and use the provided examples as a starting point to experiment with your own workflow.
+
+### Prerequisites
+
+* numpy
+* pandas
+* dill
+* scipy
+* scikit-learn
+* tensorflow
+* keras
+* lightgbm
+
+## Reference
+
+[1] Xu, Y., Liaw, A., Sheridan, R.P. and Svetnik, V., 2023. Development and Evaluation of Conformal Prediction Methods for QSAR. arXiv preprint arXiv:2304.00970.
+
+[2] Cortes-Ciriano, I.; Bender, A. Reliable prediction errors for deep neural networks using test-time dropout. Journal of chemical information and modeling 2019, 59, 3330–3339.
+
+## License
+This project is licensed under the GNU General Public License v3.0 - see the [LICENSE](LICENSE) file for details.
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,3 @@
+## Unzip the compressed example datasets in this folder
+
+tar -xvf exampleData.tar.xz
diff --git a/data/exampleData.tar.xz b/data/exampleData.tar.xz
diff --git a/docs/readme_logo.jpg b/docs/readme_logo.jpg
diff --git a/examples/run_DNN-dropout.py b/examples/run_DNN-dropout.py
@@ -0,0 +1,107 @@
+#    Copyright © 2023 Merck & Co., Inc., Rahway, NJ, USA and its affiliates. All rights reserved.
+#
+#    This file is part of the PUQSAR package, an open source software for computing the Prediction Uncertainty for QSAR.
+#
+#    Prediction Uncertainty for QSAR (PUQSAR) is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#!/usr/bin/env python
+
+# CUDA_VISIBLE_DEVICES=7 python run_DNN-dropout.py
+
+# Prerequisites
+import os
+import sys
+import time
+import glob
+import numpy as np
+import pandas as pd
+import dill
+sys.path.append("../puqsar")
+
+# Create folder to save results.
+# exist_ok=True replaces the racy "check os.path.exists, then os.makedirs" pattern.
+saveFolder = '../results/DNN-dropout'
+os.makedirs(saveFolder, exist_ok=True)
+
+# Load Example Data.
+# Paths are relative to the current working directory, so run this script from examples/.
+dat_train = pd.read_csv("../data/3A4_processed/dat_train.csv")
+dat_test = pd.read_csv("../data/3A4_processed/dat_test_rs.csv")
+
+# Preparing Data for DNN-dropout model training and testing.
+# The training data is split into a proper training set and a calibration set (p_cal = 0.2).
+from utils.preprocessing import *
+X_train, X_cal, y_train_norm, y_cal_norm, mu_tr, sd_tr, df_label_train, df_label_cal = preprocessing_DNN_default_train(dat_train, p_cal = 0.2, seed=111)
+X_test, df_label_test = preprocessing_DNN_default_test(dat_test, mu_tr, sd_tr)
+
+# Load functions for DNN dropout model
+from models.DNN_dropout import *
+
+# Hyperparameters for DNN structure and training
+p_batchSize = 0.05
+learn_rate = 0.001
+
+nn_pars = {'nodes' : [4000, 2000, 1000, 1000],
+           'dropout': [0.25, 0.25, 0.25, 0.1],
+           # max(1, ...) keeps the batch size valid when the training set is very small
+           # (round(n * p_batchSize) would otherwise be 0).
+           'batch_size' : max(1, min(128, round(X_train.shape[0]*p_batchSize))),
+           'learn_rate' : learn_rate,
+           'epochs' : 500,
+           'wt_decay' : 0.00005}
+dropouts = 100  # number of repeated prediction passes per sample
+
+
+def _predict_repeated(model, X, n_passes):
+    """Call model.predict(X) n_passes times and stack the first output column.
+
+    Returns an array of shape (X.shape[0], n_passes); column k holds the
+    predictions of pass k.
+    """
+    # NOTE(review): the spread across passes is only non-zero if the model keeps
+    # dropout active inside predict() -- confirm in models.DNN_dropout.
+    preds = np.zeros((X.shape[0], n_passes))
+    for k in range(n_passes):
+        preds[:, k] = model.predict(X)[:, 0]
+    return preds
+
+
+# Train a DNN-dropout model
+model = train_DNN_dropout(X_train, y_train_norm, X_cal, y_cal_norm, nn_pars)
+
+# Prediction on Calibration set
+pred_cal = _predict_repeated(model, X_cal, dropouts)
+
+# Mean over passes is the point prediction, standard deviation over passes is the raw
+# uncertainty score; both are mapped back with mu_tr / sd_tr returned by preprocessing.
+pred_cal_avg = np.mean(pred_cal, 1) * sd_tr + mu_tr
+pred_cal_sd = np.std(pred_cal, 1) * sd_tr
+df_pred_cal = pd.concat([df_label_cal.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_cal_avg, "Pred_UNC": pred_cal_sd})], axis=1, ignore_index = False, sort = False)
+
+# Calibration step
+#  Conformal algorithm options: CP_ACE, CP_expSD, CP_homo
+from calibrators.ICP import *
+nominal_level=0.8
+fun_PI = CP_ACE(df_pred_cal, nominal_level)
+
+# Save the model to .h5, prediction for calibration set (including raw uncertainty score) to .csv, and the calibration results to .pkl file
+model_path = os.path.join(saveFolder, 'model.h5')
+model.save(model_path)
+df_pred_cal.to_csv(os.path.join(saveFolder,"df_pred_cal.csv"), header=True, index=False)
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'wb') as file:
+    dill.dump([fun_PI,nominal_level,mu_tr,sd_tr], file)
+
+# Application on Test set and save results as CSV.
+# The string below is a non-executed snippet showing how to reload the saved artifacts
+# in a separate session.
+"""
+from tensorflow.keras.models import load_model
+model = load_model(os.path.join(saveFolder, 'model.h5'))
+
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'rb') as file:
+    fun_PI,nominal_level,mu_tr,sd_tr = dill.load(file)
+"""
+
+pred_test = _predict_repeated(model, X_test, dropouts)
+
+pred_test_avg = np.mean(pred_test, 1) * sd_tr + mu_tr
+pred_test_sd = np.std(pred_test, 1) * sd_tr
+
+df_pred_test = pd.concat([df_label_test.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_test_avg,"Pred_UNC": pred_test_sd})], axis=1, ignore_index = False, sort = False)
+# fun_PI is the callable returned by the calibration step; it is applied to the test predictions.
+df_pred_test = fun_PI(df_pred_test)
+df_pred_test.to_csv(os.path.join(saveFolder,"df_pred_test.csv"), header=True, index=False)
diff --git a/examples/run_DNN-multitask.py b/examples/run_DNN-multitask.py
@@ -0,0 +1,105 @@
+#    Copyright © 2023 Merck & Co., Inc., Rahway, NJ, USA and its affiliates. All rights reserved.
+#
+#    This file is part of the PUQSAR package, an open source software for computing the Prediction Uncertainty for QSAR.
+#
+#    Prediction Uncertainty for QSAR (PUQSAR) is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#!/usr/bin/env python
+
+# CUDA_VISIBLE_DEVICES=7 python run_DNN-multitask.py
+
+# Prerequisites
+import os
+import sys
+import time
+import glob
+import numpy as np
+import pandas as pd
+import dill
+sys.path.append("../puqsar")
+
+# Create folder to save results.
+# exist_ok=True replaces the racy "check os.path.exists, then os.makedirs" pattern.
+saveFolder = '../results/DNN-multitask'
+os.makedirs(saveFolder, exist_ok=True)
+
+# Load Example Data.
+# Paths are relative to the current working directory, so run this script from examples/.
+dat_train = pd.read_csv("../data/3A4_processed/dat_train.csv")
+dat_test = pd.read_csv("../data/3A4_processed/dat_test_rs.csv")
+
+# Hyperparameters for DNN-multitask outputs
+n_out = 50
+p_missing = 0.6
+
+# Preparing Data for DNN-multitask model training and testing.
+# n_out / p_missing are passed as variables (not repeated literals) so that the
+# preprocessing always agrees with the 'n_out' entry of nn_pars below.
+from utils.preprocessing import *
+X_train, X_cal, y_train_norm, y_cal_norm, mu_tr, sd_tr,min_value,df_label_train, df_label_cal = preprocessing_DNN_multitask_train(dat_train, p_cal = 0.2, n_out=n_out,p_missing=p_missing,seed=99)
+X_test, df_label_test = preprocessing_DNN_default_test(dat_test, mu_tr, sd_tr)
+
+# Load functions for DNN-multitask model, and specify Hyperparameters
+from models.DNN_multitask import *
+
+# Hyperparameters for DNN structure and training
+p_batchSize = 0.05
+learn_rate = 0.001
+
+nn_pars = {'nodes' : [4000, 2000, 1000, 1000],
+           'dropout': [0.25, 0.25, 0.25, 0.1],
+           # max(1, ...) keeps the batch size valid when the training set is very small
+           # (round(n * p_batchSize) would otherwise be 0).
+           'batch_size' : max(1, min(128,round(X_train.shape[0]*p_batchSize))),
+           'learn_rate' : learn_rate,
+           'epochs' : 500,
+           'n_out': n_out,
+           'min_value': min_value,
+           'wt_decay' : 0.00005}
+
+# Train a DNN-multitask model
+model = train_DNN_multitask(X_train, y_train_norm, X_cal, y_cal_norm, nn_pars)
+
+# Prediction on Calibration set.
+# Mean over the output columns is the point prediction, standard deviation over the
+# output columns is the raw uncertainty score; both are mapped back with mu_tr / sd_tr
+# returned by preprocessing.
+pred_cal_mat = model.predict(X_cal)
+pred_cal_avg = np.mean(pred_cal_mat, 1) * sd_tr + mu_tr
+pred_cal_sd = np.std(pred_cal_mat, 1) * sd_tr
+df_pred_cal = pd.concat([df_label_cal.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_cal_avg, "Pred_UNC": pred_cal_sd})], axis=1, ignore_index = False, sort = False)
+
+# Calibration step
+#  Conformal algorithm options: CP_ACE, CP_expSD, CP_homo
+from calibrators.ICP import *
+nominal_level=0.8
+fun_PI = CP_ACE(df_pred_cal, nominal_level)
+
+# Save the model to .h5, prediction for calibration set (including raw uncertainty score) to .csv, and the calibration results to .pkl file
+model_path = os.path.join(saveFolder, 'model.h5')
+model.save(model_path)
+df_pred_cal.to_csv(os.path.join(saveFolder,"df_pred_cal.csv"), header=True, index=False)
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'wb') as file:
+    dill.dump([fun_PI,nominal_level,mu_tr,sd_tr], file)
+
+# Application on Test set and save results as CSV.
+# The string below is a non-executed snippet showing how to reload the saved artifacts
+# in a separate session.
+"""
+from tensorflow.keras.models import load_model
+model = load_model(os.path.join(saveFolder, 'model.h5'))
+
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'rb') as file:
+    fun_PI,nominal_level,mu_tr,sd_tr = dill.load(file)
+"""
+
+pred_test_mat = model.predict(X_test)
+pred_test_avg = np.mean(pred_test_mat, 1) * sd_tr + mu_tr
+pred_test_sd = np.std(pred_test_mat, 1) * sd_tr
+df_pred_test = pd.concat([df_label_test.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_test_avg,"Pred_UNC": pred_test_sd})], axis=1, ignore_index = False, sort = False)
+# fun_PI is the callable returned by the calibration step; it is applied to the test predictions.
+df_pred_test = fun_PI(df_pred_test)
+df_pred_test.to_csv(os.path.join(saveFolder,"df_pred_test.csv"), header=True, index=False)
diff --git a/examples/run_LGB-tail.py b/examples/run_LGB-tail.py
@@ -0,0 +1,97 @@
+#    Copyright © 2023 Merck & Co., Inc., Rahway, NJ, USA and its affiliates. All rights reserved.
+#
+#    This file is part of the PUQSAR package, an open source software for computing the Prediction Uncertainty for QSAR.
+#
+#    Prediction Uncertainty for QSAR (PUQSAR) is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#!/usr/bin/env python
+
+# python run_LGB-tail.py
+
+# Prerequisites
+import os
+import sys
+import time
+import glob
+import numpy as np
+import pandas as pd
+sys.path.append("../puqsar")
+
+# Create folder to save results.
+# exist_ok=True replaces the racy "check os.path.exists, then os.makedirs" pattern.
+saveFolder = '../results/LGB-tail'
+os.makedirs(saveFolder, exist_ok=True)
+
+# Load Example Data.
+# Paths are relative to the current working directory, so run this script from examples/.
+dat_train = pd.read_csv("../data/3A4_processed/dat_train.csv")
+dat_test = pd.read_csv("../data/3A4_processed/dat_test_rs.csv")
+
+# Prepare Data for LGB-tail model training and testing.
+# The training data is split into a proper training set and a calibration set (p_cal = 0.2).
+from utils.preprocessing import *
+X_train, X_cal, y_train, y_cal, df_label_train, df_label_cal = preprocessing_default_train(dat_train, p_cal = 0.2, seed = 666)
+X_test, df_label_test = preprocessing_default_test(dat_test)
+
+# Wrap the training features as a sparse CSR matrix inside a LightGBM Dataset
+from scipy import sparse
+import lightgbm as lgb
+train_xy = lgb.Dataset(sparse.csr_matrix(X_train), label=y_train)
+
+# Train a LGB model
+param = {"num_leaves": 64,
+         "objective": "regression",
+         "metric": "mse",
+         "bagging_freq": 1,
+         "bagging_fraction": 0.7,
+         "feature_fraction": 0.7,
+         "learning_rate": 0.01,
+         "num_iterations": 1500,
+         "random_state": 1357,
+         "boosting_type": 'gbdt',
+         }
+model = lgb.train(param, train_xy)
+
+# Prediction on Calibration set.
+# lgb_tail_preds returns the point prediction and the raw uncertainty score (stored as Pred_UNC).
+from models.LGB_tail import *
+pred_cal, pred_cal_sd = lgb_tail_preds(model, X_cal, w=0.2)
+df_pred_cal = pd.concat([df_label_cal.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_cal, "Pred_UNC": pred_cal_sd})], axis=1, ignore_index = False, sort = False)
+
+# Calibration step
+#  Conformal algorithm options: CP_ACE, CP_expSD, CP_homo
+from calibrators.ICP import *
+nominal_level=0.8
+fun_PI = CP_ACE(df_pred_cal, nominal_level)
+
+# Save the model to .txt file, the calibration results to .pkl file, and the prediction for calibration set (including raw uncertainty score) to .csv
+import dill
+model.save_model(os.path.join(saveFolder, 'model.txt'))
+df_pred_cal.to_csv(os.path.join(saveFolder,"df_pred_cal.csv"), header=True, index=False)
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'wb') as file:
+    dill.dump([fun_PI,nominal_level], file)
+
+# Application on Test set.
+# The string below is a non-executed snippet showing how to reload the saved artifacts
+# in a separate session.
+"""
+model = lgb.Booster(model_file=os.path.join(saveFolder, 'model.txt'))
+
+with open(os.path.join(saveFolder, 'calibration.pkl'), 'rb') as file:
+    fun_PI,nominal_level = dill.load(file)
+"""
+
+pred_test, pred_test_sd = lgb_tail_preds(model, X_test, w=0.2)
+df_pred_test = pd.concat([df_label_test.reset_index(drop=True, inplace=False),
+                         pd.DataFrame({"Pred": pred_test,"Pred_UNC": pred_test_sd})], axis=1, ignore_index = False, sort = False)
+# fun_PI is the callable returned by the calibration step; it is applied to the test predictions.
+df_pred_test = fun_PI(df_pred_test)
+df_pred_test.to_csv(os.path.join(saveFolder,"df_pred_test.csv"), header=True, index=False)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		## Unzip the compressed example datasets in this folder

		tar -xvf exampleData.tar.xz