diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..e4c514d
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,46 @@
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+name: MLOps AWS
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install kubectl
+        uses: azure/setup-kubectl@v3
+        with:
+          version: 'v1.24.0' # default is latest stable
+        id: install
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-2
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Build, tag, and push the image to Amazon ECR
+        id: build-image
+        env:
+          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+          ECR_REPOSITORY: ${{ secrets.REPO_NAME }}
+          IMAGE_TAG: latest
+        run: |
+          # Build the Docker image and push it to ECR
+          docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
+          echo "Pushing image to ECR..."
+          docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
+          # ::set-output is disabled on current runners; use GITHUB_OUTPUT instead
+          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b666461
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.10-slim-buster
+
+RUN pip install --upgrade pip
+
+WORKDIR /app
+
+COPY . /app
+
+# Set permissions on the test and model directories in a single layer
+RUN chmod +x /app/tests && \
+    chmod +w /app/tests && \
+    chmod +x /app/prediction_model && \
+    chmod +w /app/prediction_model/trained_models && \
+    chmod +w /app/prediction_model/datasets
+
+ENV PYTHONPATH "${PYTHONPATH}:/app/prediction_model"
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN pip install --no-cache-dir "dvc[s3]"
+
+# Note: pulling data at build time requires AWS credentials to be available
+# to the docker build (e.g. via build secrets)
+RUN dvc pull
+
+RUN python /app/prediction_model/training_pipeline.py
+
+# Run the test suite once, writing the JUnit report in the same invocation
+RUN pytest -v --junitxml=/app/tests/test-results.xml /app/tests/test_prediction.py
+
+EXPOSE 8005
+
+ENTRYPOINT ["python"]
+
+CMD ["main.py"]
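# --- Usage sketch: a minimal smoke test for the image built above, assuming
# the container was started locally with `docker run -p 8005:8005 <image>`.
# The port comes from the EXPOSE/uvicorn settings in this PR; the `requests`
# package is assumed to be installed on the caller's side.
import requests

def smoke_test(base_url: str = "http://localhost:8005") -> None:
    # The root endpoint defined in main.py returns a welcome message
    response = requests.get(f"{base_url}/")
    response.raise_for_status()
    print(response.json())

if __name__ == "__main__":
    smoke_test()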
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..00ab622
--- /dev/null
+++ b/main.py
@@ -0,0 +1,139 @@
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import uvicorn
+import pandas as pd
+from fastapi.middleware.cors import CORSMiddleware
+from prediction_model.predict import generate_predictions, generate_predictions_batch
+from prediction_model.config import config
+import mlflow
+import io
+import boto3
+from datetime import datetime
+
+
+def upload_to_s3(file_content, filename):
+    """Upload the scored CSV to S3 under a date-partitioned key."""
+    s3 = boto3.client('s3')
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    if filename.endswith('.csv'):
+        filename = filename[:-4]
+
+    # Hyphens instead of colons so the key is friendly to downstream tooling
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    folder_path = f"{config.FOLDER}/{current_date}"
+    filename_with_datetime = f"{filename}_{current_datetime}.csv"
+    s3_key = f"{folder_path}/{filename_with_datetime}"
+
+    s3.put_object(Bucket=config.S3_BUCKET, Key=s3_key, Body=file_content)
+
+    return s3_key
+
+# mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+app = FastAPI(
+    title="Loan Prediction App using FastAPI - MLOps",
+    description="MLOps Demo",
+    version='1.0'
+)
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+
+class LoanPrediction(BaseModel):
+    Gender: str
+    Married: str
+    Dependents: str
+    Education: str
+    Self_Employed: str
+    ApplicantIncome: float
+    CoapplicantIncome: float
+    LoanAmount: float
+    Loan_Amount_Term: float
+    Credit_History: float
+    Property_Area: str
+
+
+@app.get("/")
+def index():
+    return {"message": "Welcome to the Loan Prediction App using API - CI/CD Jenkins"}
+
+@app.post("/prediction_api")
+def predict(loan_details: LoanPrediction):
+    data = loan_details.model_dump()
+    prediction = generate_predictions([data])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+@app.post("/prediction_ui")
+def predict_gui(Gender: str,
+                Married: str,
+                Dependents: str,
+                Education: str,
+                Self_Employed: str,
+                ApplicantIncome: float,
+                CoapplicantIncome: float,
+                LoanAmount: float,
+                Loan_Amount_Term: float,
+                Credit_History: float,
+                Property_Area: str):
+
+    input_data = [Gender, Married, Dependents, Education, Self_Employed,
+                  ApplicantIncome, CoapplicantIncome, LoanAmount,
+                  Loan_Amount_Term, Credit_History, Property_Area]
+
+    cols = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+    data_dict = dict(zip(cols, input_data))
+    prediction = generate_predictions([data_dict])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+
+@app.post("/batch_prediction")
+async def batch_predict(file: UploadFile = File(...)):
+    content = await file.read()
+    df = pd.read_csv(io.BytesIO(content), index_col=False)
+
+    # Ensure the CSV file contains the required features
+    required_columns = config.FEATURES
+    if not all(column in df.columns for column in required_columns):
+        return {"error": "CSV file does not contain the required columns."}
+
+    predictions = generate_predictions_batch(df)["prediction"]
+
+    df['Prediction'] = predictions
+    result = df.to_csv(index=False)
+
+    # Archive the scored file to S3 for later data-drift analysis
+    upload_to_s3(result.encode('utf-8'), file.filename)
+
+    return StreamingResponse(io.BytesIO(result.encode('utf-8')),
+                             media_type="text/csv",
+                             headers={"Content-Disposition": "attachment; filename=predictions.csv"})
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8005)
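# --- Usage sketch: calling the /prediction_api endpoint above with `requests`.
# The field values are illustrative sample data, not rows from the project's
# dataset; the host/port assume a local run of main.py.
import requests

payload = {
    "Gender": "Male",
    "Married": "Yes",
    "Dependents": "0",
    "Education": "Graduate",
    "Self_Employed": "No",
    "ApplicantIncome": 5000.0,
    "CoapplicantIncome": 1500.0,
    "LoanAmount": 120.0,
    "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0,
    "Property_Area": "Urban",
}

response = requests.post("http://localhost:8005/prediction_api", json=payload)
print(response.json())  # e.g. {"status": "Approved"}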
diff --git a/prediction_model/VERSION b/prediction_model/VERSION
new file mode 100644
index 0000000..3eefcb9
--- /dev/null
+++ b/prediction_model/VERSION
@@ -0,0 +1 @@
+1.0.0
diff --git a/prediction_model/__pycache__/pipeline.cpython-311.pyc b/prediction_model/__pycache__/pipeline.cpython-311.pyc
new file mode 100644
index 0000000..5df1b1d
Binary files /dev/null and b/prediction_model/__pycache__/pipeline.cpython-311.pyc differ
diff --git a/prediction_model/__pycache__/predict.cpython-311.pyc b/prediction_model/__pycache__/predict.cpython-311.pyc
new file mode 100644
index 0000000..fdcf347
Binary files /dev/null and b/prediction_model/__pycache__/predict.cpython-311.pyc differ
diff --git a/prediction_model/config/__pycache__/config.cpython-311.pyc b/prediction_model/config/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000..901097d
Binary files /dev/null and b/prediction_model/config/__pycache__/config.cpython-311.pyc differ
diff --git a/prediction_model/config/config.py b/prediction_model/config/config.py
new file mode 100644
index 0000000..f8f03ec
--- /dev/null
+++ b/prediction_model/config/config.py
@@ -0,0 +1,60 @@
+import pathlib
+import os
+
+
+current_directory = os.path.dirname(os.path.realpath(__file__))  # directory containing this script
+
+PACKAGE_ROOT = os.path.dirname(current_directory)  # parent directory, i.e. the package root
+
+# PACKAGE_ROOT = pathlib.Path(prediction_model.__file__).resolve().parent
+
+DATAPATH = os.path.join(PACKAGE_ROOT, "datasets")
+
+TRAIN_FILE = 'train.csv'
+TEST_FILE = 'test.csv'
+
+TARGET = 'Loan_Status'
+
+# Final features used in the model
+FEATURES = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+NUM_FEATURES = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
+
+CAT_FEATURES = ['Gender',
+                'Married',
+                'Dependents',
+                'Education',
+                'Self_Employed',
+                'Credit_History',
+                'Property_Area']
+
+# in our case the same as the categorical features
+FEATURES_TO_ENCODE = ['Gender',
+                      'Married',
+                      'Dependents',
+                      'Education',
+                      'Self_Employed',
+                      'Credit_History',
+                      'Property_Area']
+
+FEATURE_TO_MODIFY = ['ApplicantIncome']
+FEATURE_TO_ADD = 'CoapplicantIncome'
+
+DROP_FEATURES = ['CoapplicantIncome']
+
+LOG_FEATURES = ['ApplicantIncome', 'LoanAmount']  # numerical columns to log-transform
+
+S3_BUCKET = "loanprediction"
+
+FOLDER = "datadrift"
+
+TRACKING_URI = "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/"
+
+EXPERIMENT_NAME = "loan_prediction_model"
+
+# leading slash: appended to 'runs:/<run_id>' to form the model URI
+MODEL_NAME = "/Loanprediction-model"
diff --git a/prediction_model/pipeline.py b/prediction_model/pipeline.py
new file mode 100644
index 0000000..6dfeccb
--- /dev/null
+++ b/prediction_model/pipeline.py
@@ -0,0 +1,23 @@
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+
+from prediction_model.config import config
+import prediction_model.processing.preprocessing as pp
+
+
+preprocessing_pipeline = Pipeline(
+    [
+        ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,
+                                                 variable_to_add=config.FEATURE_TO_ADD)),
+        ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+        ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+        ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+        ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+        ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+        ('MinMaxScale', MinMaxScaler())
+    ]
+)
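# --- Usage sketch for the preprocessing pipeline defined above. Assumes
# train.csv exists under prediction_model/datasets and that the package is
# importable (PYTHONPATH is set in the Dockerfile in this PR).
import pandas as pd
from prediction_model.config import config
from prediction_model.pipeline import preprocessing_pipeline
from prediction_model.processing.data_handling import load_dataset

data = load_dataset(config.TRAIN_FILE)
X = data[config.FEATURES]

# fit_transform returns a NumPy array because MinMaxScaler is the last step
X_processed = preprocessing_pipeline.fit_transform(X)
print(X_processed.shape)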
diff --git a/prediction_model/predict.py b/prediction_model/predict.py
new file mode 100644
index 0000000..9619d93
--- /dev/null
+++ b/prediction_model/predict.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+import mlflow
+
+from prediction_model.config import config
+
+
+def load_best_model():
+    """Load the model logged by the experiment's best run (highest f1_score)."""
+    experiment = mlflow.get_experiment_by_name(config.EXPERIMENT_NAME)
+    # search_runs expects a list of experiment IDs
+    runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
+                                 order_by=['metrics.f1_score DESC'])
+    best_run_id = runs_df.iloc[0]['run_id']
+    best_model_uri = 'runs:/' + best_run_id + config.MODEL_NAME
+    return mlflow.sklearn.load_model(best_model_uri)
+
+
+def generate_predictions(data_input):
+    data = pd.DataFrame(data_input)
+    loan_prediction_model = load_best_model()
+    prediction = loan_prediction_model.predict(data)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
+
+
+def generate_predictions_batch(data_input):
+    loan_prediction_model = load_best_model()
+    prediction = loan_prediction_model.predict(data_input)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
new file mode 100644
index 0000000..4bd0661
Binary files /dev/null and b/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc differ
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
new file mode 100644
index 0000000..920605b
Binary files /dev/null and b/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc differ
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
new file mode 100644
index 0000000..2eba70a
Binary files /dev/null and b/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc differ
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
new file mode 100644
index 0000000..190bfef
Binary files /dev/null and b/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc differ
diff --git a/prediction_model/processing/data_handling.py b/prediction_model/processing/data_handling.py
new file mode 100644
index 0000000..74a1f1a
--- /dev/null
+++ b/prediction_model/processing/data_handling.py
@@ -0,0 +1,10 @@
+import os
+import pandas as pd
+from prediction_model.config import config
+
+# Load a dataset from the package's datasets directory
+def load_dataset(file_name):
+    filepath = os.path.join(config.DATAPATH, file_name)
+    _data = pd.read_csv(filepath)
+    return _data
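# --- Usage sketch: producing a single prediction through the helpers above.
# Assumes the MLflow tracking server at config.TRACKING_URI is reachable and
# already holds at least one finished run for the experiment.
import mlflow
from prediction_model.config import config
from prediction_model.processing.data_handling import load_dataset
from prediction_model.predict import generate_predictions

mlflow.set_tracking_uri(config.TRACKING_URI)

test_data = load_dataset(config.TEST_FILE)
sample = test_data[config.FEATURES][:1]
print(generate_predictions(sample))  # e.g. {'prediction': array(['Y'], ...)}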
diff --git a/prediction_model/processing/preprocessing.py b/prediction_model/processing/preprocessing.py
new file mode 100644
index 0000000..7128e9a
--- /dev/null
+++ b/prediction_model/processing/preprocessing.py
@@ -0,0 +1,94 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class MeanImputer(BaseEstimator, TransformerMixin):
+    """Impute missing numerical values with column means learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mean_dict = {}
+        for col in self.variables:
+            self.mean_dict[col] = X[col].mean()
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = X[col].fillna(self.mean_dict[col])
+        return X
+
+
+class ModeImputer(BaseEstimator, TransformerMixin):
+    """Impute missing categorical values with column modes learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mode_dict = {}
+        for col in self.variables:
+            self.mode_dict[col] = X[col].mode()[0]
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = X[col].fillna(self.mode_dict[col])
+        return X
+
+
+class DropColumns(BaseEstimator, TransformerMixin):
+    """Drop the given columns from the frame."""
+    def __init__(self, variables_to_drop=None):
+        self.variables_to_drop = variables_to_drop
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        X = X.drop(columns=self.variables_to_drop)
+        return X
+
+
+class DomainProcessing(BaseEstimator, TransformerMixin):
+    """Fold one column's value into another (e.g. total household income)."""
+    def __init__(self, variable_to_modify=None, variable_to_add=None):
+        self.variable_to_modify = variable_to_modify
+        self.variable_to_add = variable_to_add
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variable_to_modify:
+            X[feature] = X[feature] + X[self.variable_to_add]
+        return X
+
+
+class CustomLabelEncoder(BaseEstimator, TransformerMixin):
+    """Encode categories as integers ordered by ascending frequency.
+    Categories unseen at fit time map to NaN, so imputation must run first."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.label_dict = {}
+        for var in self.variables:
+            t = X[var].value_counts().sort_values(ascending=True).index
+            self.label_dict[var] = {k: i for i, k in enumerate(t, 0)}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variables:
+            X[feature] = X[feature].map(self.label_dict[feature])
+        return X
+
+
+class LogTransforms(BaseEstimator, TransformerMixin):
+    """Log-transform the given columns (assumes strictly positive values)."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = np.log(X[col])
+        return X
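# --- Toy illustration (made-up data) of how CustomLabelEncoder above assigns
# integer codes by ascending category frequency. Unseen categories map to NaN
# at transform time, which is why imputation runs earlier in the pipeline.
import pandas as pd
from prediction_model.processing.preprocessing import CustomLabelEncoder

df = pd.DataFrame({"Property_Area": ["Urban", "Rural", "Urban",
                                     "Semiurban", "Urban", "Semiurban"]})
encoder = CustomLabelEncoder(variables=["Property_Area"])
encoder.fit(df)
print(encoder.label_dict)
# {'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2}}
print(encoder.transform(df)["Property_Area"].tolist())
# [2, 0, 2, 1, 2, 1]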
diff --git a/prediction_model/training_pipeline.py b/prediction_model/training_pipeline.py
new file mode 100644
index 0000000..577962e
--- /dev/null
+++ b/prediction_model/training_pipeline.py
@@ -0,0 +1,114 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+import mlflow
+import mlflow.sklearn
+from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
+import xgboost as xgb
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+import prediction_model.processing.preprocessing as pp
+
+
+# mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+
+def get_data(input_file):
+    data = load_dataset(input_file)
+    x = data[config.FEATURES]
+    y = data[config.TARGET].map({'N': 0, 'Y': 1})
+    return x, y
+
+
+X, Y = get_data(config.TRAIN_FILE)
+
+X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
+
+
+# Define the hyperparameter search space
+search_space = {
+    'max_depth': hp.choice('max_depth', np.arange(3, 10, dtype=int)),
+    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
+    'n_estimators': hp.choice('n_estimators', np.arange(50, 300, 50, dtype=int)),
+    'subsample': hp.uniform('subsample', 0.5, 1.0),
+    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
+    'gamma': hp.uniform('gamma', 0, 5),
+    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
+    'reg_lambda': hp.uniform('reg_lambda', 0, 1)
+}
+
+
+def objective(params):
+    # Create an XGBoost classifier with the given hyperparameters.
+    # use_label_encoder was removed in XGBoost 2.x, and 'logloss' is the
+    # correct eval metric for this binary target.
+    clf = xgb.XGBClassifier(
+        max_depth=params['max_depth'],
+        learning_rate=params['learning_rate'],
+        n_estimators=params['n_estimators'],
+        subsample=params['subsample'],
+        colsample_bytree=params['colsample_bytree'],
+        gamma=params['gamma'],
+        reg_alpha=params['reg_alpha'],
+        reg_lambda=params['reg_lambda'],
+        eval_metric='logloss'
+    )
+
+    # Define the complete pipeline with preprocessing and model
+    classification_pipeline = Pipeline(
+        [
+            ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY, variable_to_add=config.FEATURE_TO_ADD)),
+            ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+            ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+            ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+            ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+            ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+            ('MinMaxScale', MinMaxScaler()),
+            ('XGBoostClassifier', clf)
+        ]
+    )
+
+    mlflow.xgboost.autolog()
+    mlflow.set_experiment(config.EXPERIMENT_NAME)
+    with mlflow.start_run(nested=True):
+        # Fit the pipeline
+        classification_pipeline.fit(X_train, y_train)
+
+        # Make predictions
+        y_pred = classification_pipeline.predict(X_test)
+
+        # Calculate metrics
+        f1 = f1_score(y_test, y_pred)
+        accuracy = accuracy_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred)
+
+        # Log metrics manually
+        mlflow.log_metrics({
+            'f1_score': f1,
+            'accuracy': accuracy,
+            'recall': recall,
+            'precision': precision
+        })
+
+        mlflow.sklearn.log_model(classification_pipeline, "Loanprediction-model")
+
+    return {'loss': 1 - f1, 'status': STATUS_OK}
+
+
+trials = Trials()
+
+best_indices = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=5, trials=trials)
+
+# fmin returns indices for hp.choice parameters; decode them to actual values
+best_params = space_eval(search_space, best_indices)
+print("Best hyperparameters:", best_params)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d4f90f2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+# Model building requirements
+numpy==1.24.3
+pandas==1.5.3
+joblib==1.2.0
+scikit-learn==1.2.2
+scipy==1.10.1
+pytest==7.4.2
+fastapi==0.103.0
+pydantic==2.4.2
+uvicorn==0.23.2
+gunicorn==21.2.0
+hyperopt==0.2.7
+mlflow==2.13.2
+xgboost==2.0.3
+python-multipart==0.0.9
+boto3  # imported by main.py for S3 uploads
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
new file mode 100644
index 0000000..9f21698
--- /dev/null
+++ b/tests/test_prediction.py
@@ -0,0 +1,29 @@
+import pytest
+import mlflow
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+from prediction_model.predict import generate_predictions
+
+# These tests check that:
+#   - the output from the predict script is not null
+#   - the output is of str data type
+#   - the output is 'Y' for the example row
+
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+@pytest.fixture
+def single_prediction():
+    test_dataset = load_dataset(config.TEST_FILE)
+    single_row = test_dataset[config.FEATURES][:1]
+    result = generate_predictions(single_row)
+    return result
+
+def test_single_pred_not_none(single_prediction):  # output is not None
+    assert single_prediction is not None
+
+def test_single_pred_str_type(single_prediction):  # data type is string
+    assert isinstance(single_prediction.get('prediction')[0], str)
+
+def test_single_pred_validate(single_prediction):  # the output is 'Y' for this row
+    assert single_prediction.get('prediction')[0] == 'Y'
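# --- Usage sketch: exercising the /batch_prediction endpoint end to end.
# Posts a local CSV (the path is illustrative) and saves the scored file the
# API streams back. Assumes the service from main.py is running on port 8005.
import requests

csv_path = "prediction_model/datasets/test.csv"  # illustrative path
with open(csv_path, "rb") as f:
    response = requests.post(
        "http://localhost:8005/batch_prediction",
        files={"file": ("test.csv", f, "text/csv")},
    )

response.raise_for_status()
with open("predictions.csv", "wb") as out:
    out.write(response.content)  # CSV with an added 'Prediction' column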