Skip to content

Commit

Permalink
mlops CI
Browse files Browse the repository at this point in the history
  • Loading branch information
Chandru-21 committed Jun 25, 2024
1 parent 1f406ee commit 9152031
Show file tree
Hide file tree
Showing 19 changed files with 618 additions and 0 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# CI workflow: builds the repository's Docker image and pushes it to Amazon ECR
# on every push or pull request that targets main.
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

name: MLOps AWS

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Install kubectl
        id: install
        uses: azure/setup-kubectl@v2.0
        with:
          version: 'v1.24.0' # default is latest stable

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-2

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Build, tag, and push the image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ secrets.REPO_NAME }}
          IMAGE_TAG: latest
        run: |
          # Build a docker container and push it to ECR
          docker build -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" .
          echo "Pushing image to ECR..."
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          # Expose the image reference as the step output "image". The
          # ::set-output workflow command is deprecated; write to $GITHUB_OUTPUT.
          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
43 changes: 43 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Image for the loan-prediction API. At build time it installs dependencies,
# pulls DVC-tracked files, runs the training pipeline script and the prediction
# tests; at run time it starts main.py.
FROM python:3.10-slim-buster

RUN pip install --upgrade pip

WORKDIR /app

COPY . /app

# Set permissions on the directories touched by the training and test steps
RUN chmod +x /app/tests

RUN chmod +w /app/tests

RUN chmod +x /app/prediction_model

RUN chmod +w /app/prediction_model/trained_models

RUN chmod +w /app/prediction_model/datasets

# key=value form; the space-separated "ENV key value" form is legacy syntax
ENV PYTHONPATH="${PYTHONPATH}:/app/prediction_model"

RUN pip install --no-cache-dir -r requirements.txt

# Quoted so the shell can never treat the brackets as a glob pattern
RUN pip install "dvc[s3]"

RUN dvc pull

RUN python /app/prediction_model/training_pipeline.py

# Single verbose run that also writes the JUnit report (the same test file
# used to be executed twice, once per option)
RUN pytest -v --junitxml=/app/tests/test-results.xml /app/tests/test_prediction.py

# Port that main.py passes to uvicorn
EXPOSE 8005

ENTRYPOINT ["python"]

CMD ["main.py"]


139 changes: 139 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from fastapi import FastAPI , File, UploadFile
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import uvicorn
import numpy as np
import pandas as pd
from fastapi.middleware.cors import CORSMiddleware
from prediction_model.predict import generate_predictions,generate_predictions_batch
from prediction_model.config import config
import mlflow
import io
import boto3
from datetime import datetime


def upload_to_s3(file_content, filename):
    """Upload ``file_content`` to S3 under a dated folder and return its key.

    The object key has the form
    ``<config.FOLDER>/<YYYY-MM-DD>/<name>_<YYYY-MM-DD_HH:MM:SS>.csv`` where
    ``<name>`` is ``filename`` with a trailing ``.csv`` removed.

    Args:
        file_content: Object body handed to ``put_object`` as ``Body``.
        filename: Original file name; a trailing ``.csv`` is stripped before
            the timestamp is appended.

    Returns:
        The key under which the object was stored in ``config.S3_BUCKET``.
    """
    s3 = boto3.client('s3')

    # Take a single timestamp so the date folder and the file-name stamp can
    # never disagree when the call straddles midnight.
    now = datetime.now()
    current_date = now.strftime("%Y-%m-%d")
    current_datetime = now.strftime("%Y-%m-%d_%H:%M:%S")

    if filename.endswith('.csv'):
        filename = filename[:-4]

    s3_key = f"{config.FOLDER}/{current_date}/{filename}_{current_datetime}.csv"

    s3.put_object(Bucket=config.S3_BUCKET, Key=s3_key, Body=file_content)

    return s3_key

# mlflow.set_tracking_uri("http://localhost:5000")

# Point MLflow at the tracking server named in the project configuration.
mlflow.set_tracking_uri(config.TRACKING_URI)

app = FastAPI(
    title="Loan Prediction App using FastAPI - MLOps",
    description="MLOps Demo",
    version='1.0',
)

# Origins handed to the CORS middleware; "*" is the wildcard entry.
origins = ["*"]

# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended outside of a demo deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class LoanPrediction(BaseModel):
    """Request body of the /prediction_api endpoint: one loan application."""

    # Field names match the entries of config.FEATURES; the dumped model is
    # passed to generate_predictions as a single record.
    Gender: str
    Married: str
    Dependents: str
    Education: str
    Self_Employed: str
    ApplicantIncome: float
    CoapplicantIncome: float
    LoanAmount: float
    Loan_Amount_Term: float
    Credit_History: float
    Property_Area: str


@app.get("/")
def index():
    """Return the welcome message served at the API root."""
    greeting = "Welcome to Loan Prediction App using API - CI CD Jenkins"
    return {"message": greeting}

@app.post("/prediction_api")
def predict(loan_details: LoanPrediction):
    """Score a single loan application supplied as a ``LoanPrediction`` body.

    Returns:
        ``{"status": "Approved"}`` when the predicted label is ``"Y"``,
        otherwise ``{"status": "Rejected"}``.
    """
    record = loan_details.model_dump()
    label = generate_predictions([record])["prediction"][0]
    return {"status": "Approved" if label == "Y" else "Rejected"}

@app.post("/prediction_ui")
def predict_gui(Gender: str,
                Married: str,
                Dependents: str,
                Education: str,
                Self_Employed: str,
                ApplicantIncome: float,
                CoapplicantIncome: float,
                LoanAmount: float,
                Loan_Amount_Term: float,
                Credit_History: float,
                Property_Area: str):
    """Score a single loan application supplied as individual parameters.

    The arguments are assembled into one record keyed by feature name and
    passed to ``generate_predictions``.

    Returns:
        ``{"status": "Approved"}`` when the predicted label is ``"Y"``,
        otherwise ``{"status": "Rejected"}``.
    """
    record = {
        'Gender': Gender,
        'Married': Married,
        'Dependents': Dependents,
        'Education': Education,
        'Self_Employed': Self_Employed,
        'ApplicantIncome': ApplicantIncome,
        'CoapplicantIncome': CoapplicantIncome,
        'LoanAmount': LoanAmount,
        'Loan_Amount_Term': Loan_Amount_Term,
        'Credit_History': Credit_History,
        'Property_Area': Property_Area,
    }

    label = generate_predictions([record])["prediction"][0]
    return {"status": "Approved" if label == "Y" else "Rejected"}


@app.post("/batch_prediction")
async def batch_predict(file: UploadFile = File(...)):
    """Score every row of an uploaded CSV and return it with predictions.

    The upload is parsed with pandas, checked for the columns listed in
    ``config.FEATURES``, scored with ``generate_predictions_batch`` and
    extended with a ``Prediction`` column. The resulting CSV is stored in S3
    via ``upload_to_s3`` and streamed back to the caller.

    Args:
        file: Uploaded CSV file.

    Returns:
        A ``StreamingResponse`` carrying the CSV as an attachment named
        ``predictions.csv``, or an ``{"error": ...}`` dict when a required
        column is missing.
    """
    content = await file.read()
    # The uploaded rows are deliberately not printed: they would end up in the
    # service's stdout on every request.
    df = pd.read_csv(io.BytesIO(content), index_col=False)

    # Ensure the CSV file contains the required features
    required_columns = config.FEATURES
    if not all(column in df.columns for column in required_columns):
        return {"error": "CSV file does not contain the required columns."}

    df['Prediction'] = generate_predictions_batch(df)["prediction"]

    # Encode once and reuse the same bytes for the S3 copy and the response.
    payload = df.to_csv(index=False).encode('utf-8')

    upload_to_s3(payload, file.filename)

    return StreamingResponse(
        io.BytesIO(payload),
        media_type="text/csv",
        headers={"Content-Disposition":"attachment; filename=predictions.csv"},
    )




# When executed as a script, serve the app with uvicorn on all interfaces, port 8005.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8005)
1 change: 1 addition & 0 deletions prediction_model/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.0.0
Binary file not shown.
Binary file not shown.
Binary file not shown.
60 changes: 60 additions & 0 deletions prediction_model/config/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pathlib
import os


# Directory that contains this configuration module.
current_directory = os.path.dirname(os.path.realpath(__file__))

# Parent of the configuration directory, used as the package root.
PACKAGE_ROOT = os.path.dirname(current_directory)

DATAPATH = os.path.join(PACKAGE_ROOT, "datasets")

TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

TARGET = 'Loan_Status'

# Final features used in the model
FEATURES = ['Gender', 'Married', 'Dependents', 'Education',
            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
            'Loan_Amount_Term', 'Credit_History', 'Property_Area']

NUM_FEATURES = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

CAT_FEATURES = ['Gender',
                'Married',
                'Dependents',
                'Education',
                'Self_Employed',
                'Credit_History',
                'Property_Area']

# In our case this is the same as the categorical features; derived as a copy
# so the two lists cannot drift apart, yet remain independent objects.
FEATURES_TO_ENCODE = list(CAT_FEATURES)

FEATURE_TO_MODIFY = ['ApplicantIncome']
FEATURE_TO_ADD = 'CoapplicantIncome'

DROP_FEATURES = ['CoapplicantIncome']

LOG_FEATURES = ['ApplicantIncome', 'LoanAmount']  # taking log of numerical columns

S3_BUCKET = "loanprediction"

FOLDER = "datadrift"

# The MLFLOW_TRACKING_URI environment variable overrides the built-in default,
# so a new tracking-server address does not require a code change.
TRACKING_URI = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/",
)

EXPERIMENT_NAME = "loan_prediction_model"

# Appended to "runs:/<run_id>" when building the model URI, hence the leading slash.
MODEL_NAME = "/Loanprediction-model"

23 changes: 23 additions & 0 deletions prediction_model/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from sklearn.pipeline import Pipeline
from prediction_model.config import config
import prediction_model.processing.preprocessing as pp
from sklearn.preprocessing import MinMaxScaler





# Preprocessing applied to the loan features. Step order matters: each
# transformer receives the output of the one before it.
preprocessing_pipeline = Pipeline(
    steps=[
        (
            'DomainProcessing',
            pp.DomainProcessing(
                variable_to_modify=config.FEATURE_TO_MODIFY,
                variable_to_add=config.FEATURE_TO_ADD,
            ),
        ),
        ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
        ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
        ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
        ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
        ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
        ('MinMaxScale', MinMaxScaler()),
    ]
)


44 changes: 44 additions & 0 deletions prediction_model/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import numpy as np
from prediction_model.config import config
import mlflow



def _load_best_model():
    """Load the sklearn model logged by the run with the highest ``f1_score``.

    Raises:
        LookupError: If the configured experiment does not exist or has no runs.
    """
    experiment = mlflow.get_experiment_by_name(config.EXPERIMENT_NAME)
    if experiment is None:
        raise LookupError(
            f"MLflow experiment {config.EXPERIMENT_NAME!r} was not found"
        )

    runs_df = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=['metrics.f1_score DESC'],
    )
    if runs_df.empty:
        raise LookupError(
            f"MLflow experiment {config.EXPERIMENT_NAME!r} has no runs to load a model from"
        )

    # Runs are sorted by f1_score descending, so the first row is the best run.
    best_run_id = runs_df.iloc[0]['run_id']
    # NOTE(review): the model is fetched on every call; consider caching if
    # request latency becomes a concern.
    return mlflow.sklearn.load_model('runs:/' + best_run_id + config.MODEL_NAME)


def generate_predictions(data_input):
    """Predict loan approval for records given as a list of dicts.

    Args:
        data_input: Data accepted by ``pd.DataFrame`` (the API passes a list
            holding one dict per application).

    Returns:
        ``{"prediction": array}`` where each entry is ``'Y'`` if the model
        predicted 1 and ``'N'`` otherwise.

    Raises:
        LookupError: If the configured experiment does not exist or has no runs.
    """
    return generate_predictions_batch(pd.DataFrame(data_input))


def generate_predictions_batch(data_input):
    """Predict loan approval for data that is already in model-ready form.

    Args:
        data_input: Input passed unchanged to the model's ``predict`` (the API
            passes a DataFrame read from the uploaded CSV).

    Returns:
        ``{"prediction": array}`` where each entry is ``'Y'`` if the model
        predicted 1 and ``'N'`` otherwise.

    Raises:
        LookupError: If the configured experiment does not exist or has no runs.
    """
    loan_prediction_model = _load_best_model()
    prediction = loan_prediction_model.predict(data_input)
    output = np.where(prediction == 1, 'Y', 'N')
    return {"prediction": output}





if __name__ == '__main__':
    # generate_predictions requires input data; calling it with no arguments
    # always raised TypeError. Score the configured test file as a smoke run.
    from prediction_model.processing.data_handling import load_dataset

    test_data = load_dataset(config.TEST_FILE)
    print(generate_predictions_batch(test_data[config.FEATURES]))
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
10 changes: 10 additions & 0 deletions prediction_model/processing/data_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import pandas as pd
from prediction_model.config import config

def load_dataset(file_name):
    """Read the CSV named ``file_name`` from ``config.DATAPATH`` into a DataFrame."""
    return pd.read_csv(os.path.join(config.DATAPATH, file_name))

Loading

0 comments on commit 9152031

Please sign in to comment.