mlops CI

Chandru-21 · Jun 25, 2024 · 9152031 · 9152031
1 parent 1f406ee
commit 9152031
Show file tree

Hide file tree

Showing 19 changed files with 618 additions and 0 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,46 @@
+# CI workflow: build the project's Docker image and push it to Amazon ECR.
+# Triggered by pushes to main and by pull requests that target main.
+# NOTE(review): the pull_request trigger also builds and pushes the image;
+# confirm that pushing from unmerged PRs is intended.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+name: MLOps AWS
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install kubectl
+      uses: azure/setup-kubectl@v2.0
+      with:
+        version: 'v1.24.0' # default is latest stable
+      id: install
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        aws-region: us-east-2
+
+    - name: Login to Amazon ECR
+      id: login-ecr
+      uses: aws-actions/amazon-ecr-login@v1
+
+    - name: Build, tag, and push the image to Amazon ECR
+      id: build-image
+      env:
+        ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+        ECR_REPOSITORY: ${{ secrets.REPO_NAME }}
+        IMAGE_TAG: latest
+      run: |
+        # Build a docker container and push it to ECR
+        docker build -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" .
+        echo "Pushing image to ECR..."
+        docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
+        # Expose the pushed image reference as the step output `image`.
+        # The `::set-output` workflow command is deprecated; outputs are
+        # written to the file named by $GITHUB_OUTPUT instead.
+        echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
+   
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,43 @@
+# Image for the loan-prediction API. The build installs dependencies, pulls
+# the DVC-tracked files, runs the training pipeline script and the prediction
+# tests, and the container then starts main.py.
+FROM python:3.10-slim-buster
+
+RUN pip install --upgrade pip
+
+WORKDIR /app
+
+COPY . /app
+
+# Set permissions: add execute/write bits on the directories used during the
+# build steps below.
+
+RUN chmod +x /app/tests
+
+RUN chmod +w /app/tests
+
+RUN chmod +x /app/prediction_model
+
+RUN chmod +w /app/prediction_model/trained_models
+
+RUN chmod +w /app/prediction_model/datasets
+
+
+# key=value form of ENV (the space-separated form is legacy syntax). The value
+# is unchanged.
+# NOTE(review): PYTHONPATH is presumably unset in the base image, so the value
+# starts with an empty entry - confirm whether the package imports rely on it.
+ENV PYTHONPATH="${PYTHONPATH}:/app/prediction_model"
+
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Quoted so the shell cannot expand the [s3] extra as a glob pattern.
+RUN pip install "dvc[s3]"
+
+RUN dvc pull
+
+RUN python /app/prediction_model/training_pipeline.py
+
+# One test run instead of two: verbose console output plus the JUnit XML
+# report written next to the tests.
+RUN pytest -v --junitxml=/app/tests/test-results.xml /app/tests/test_prediction.py
+
+EXPOSE 8005
+
+ENTRYPOINT ["python"]
+
+CMD ["main.py"]
+
+
diff --git a/main.py b/main.py
@@ -0,0 +1,139 @@
+from fastapi import FastAPI , File, UploadFile
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import uvicorn
+import numpy as np
+import pandas as pd 
+from fastapi.middleware.cors import CORSMiddleware
+from prediction_model.predict import generate_predictions,generate_predictions_batch
+from prediction_model.config import config  
+import mlflow
+import io
+import boto3
+from datetime import datetime
+
+
+def upload_to_s3(file_content, filename):
+    s3 = boto3.client('s3')
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    if filename.endswith('.csv'):
+        filename = filename[:-4]
+
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
+
+    folder_path = f"{config.FOLDER}/{current_date}"
+
+    filename_with_datetime = f"{filename}_{current_datetime}.csv"
+
+    s3_key = f"{folder_path}/{filename_with_datetime}"
+
+    response = s3.put_object(Bucket=config.S3_BUCKET, Key=s3_key, Body=file_content)
+
+    return s3_key 
+
+# Local alternative to the configured tracking server, kept for reference:
+# mlflow.set_tracking_uri("http://localhost:5000")
+
+# Point MLflow at the tracking server from config; this runs at import time.
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+# FastAPI application object; the route decorators below register on it.
+app = FastAPI(
+    title="Loan Prediction App using FastAPI - MLOps",
+    description = "MLOps Demo",
+    version='1.0'
+)
+
+# Origins allowed by the CORS middleware ("*" = any origin).
+origins=[
+    "*"
+]
+
+# NOTE(review): wildcard origins, methods and headers together with
+# allow_credentials=True is very permissive - confirm this is intended
+# outside a demo deployment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+
+class LoanPrediction(BaseModel):
+    """Request body for ``/prediction_api``: one loan application.
+
+    The field names match the entries of ``config.FEATURES``.
+    """
+
+    # Fields typed as str
+    Gender: str
+    Married: str
+    Dependents: str
+    Education: str
+    Self_Employed: str
+    # Fields typed as float
+    ApplicantIncome: float
+    CoapplicantIncome: float
+    LoanAmount: float
+    Loan_Amount_Term: float
+    Credit_History: float
+    # Trailing str field
+    Property_Area: str
+
+
+@app.get("/")
+def index():
+    return {"message":"Welcome to Loan Prediction App using API - CI CD Jenkins" }
+
+@app.post("/prediction_api")
+def predict(loan_details: LoanPrediction):
+    data = loan_details.model_dump()
+    prediction = generate_predictions([data])["prediction"][0]
+    if prediction == "Y":
+        pred = "Approved"
+    else:
+        pred = "Rejected"
+    return {"status":pred}
+
+@app.post("/prediction_ui")
+def predict_gui(Gender: str,
+    Married: str,
+    Dependents: str,
+    Education: str,
+    Self_Employed: str,
+    ApplicantIncome: float,
+    CoapplicantIncome: float,
+    LoanAmount: float,
+    Loan_Amount_Term: float,
+    Credit_History: float,
+    Property_Area: str):
+
+    input_data = [Gender, Married,Dependents, Education, Self_Employed,ApplicantIncome,
+     CoapplicantIncome,LoanAmount, Loan_Amount_Term,Credit_History, Property_Area  ]
+
+    cols = ['Gender', 'Married', 'Dependents', 'Education',
+       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+       'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+    data_dict = dict(zip(cols,input_data))
+    prediction = generate_predictions([data_dict])["prediction"][0]
+    if prediction == "Y":
+        pred = "Approved"
+    else:
+        pred = "Rejected"
+    return {"status":pred}
+
+
+@app.post("/batch_prediction")
+async def batch_predict(file: UploadFile = File(...)):
+
+    content = await file.read()
+    df = pd.read_csv(io.BytesIO(content),index_col=False)
+    print(df)
+
+    # Ensure the CSV file contains the required features
+    required_columns = config.FEATURES
+    if not all(column in df.columns for column in required_columns):
+        return {"error": "CSV file does not contain the required columns."}
+
+    predictions = generate_predictions_batch(df)["prediction"]
+
+    df['Prediction'] = predictions
+    result = df.to_csv(index=False)
+
+    s3_key = upload_to_s3(result.encode('utf-8'), file.filename)
+
+    return StreamingResponse(io.BytesIO(result.encode('utf-8')), media_type="text/csv", headers={"Content-Disposition":"attachment; filename=predictions.csv"})
+
+
+
+
+if __name__== "__main__":
+    uvicorn.run(app, host="0.0.0.0",port=8005)
diff --git a/prediction_model/VERSION b/prediction_model/VERSION
@@ -0,0 +1 @@
+1.0.0
diff --git a/prediction_model/__pycache__/pipeline.cpython-311.pyc b/prediction_model/__pycache__/pipeline.cpython-311.pyc
diff --git a/prediction_model/__pycache__/predict.cpython-311.pyc b/prediction_model/__pycache__/predict.cpython-311.pyc
diff --git a/prediction_model/config/__pycache__/config.cpython-311.pyc b/prediction_model/config/__pycache__/config.cpython-311.pyc
diff --git a/prediction_model/config/config.py b/prediction_model/config/config.py
@@ -0,0 +1,60 @@
+import pathlib
+import os
+
+
+# Central configuration for the prediction_model package: dataset paths,
+# feature lists consumed by the preprocessing pipeline, and S3 / MLflow
+# settings.
+current_directory = os.path.dirname(os.path.realpath(__file__)) #current directory of the script
+
+PACKAGE_ROOT = os.path.dirname(current_directory) #parent directory of current directory
+
+
+# Earlier pathlib-based way of deriving the package root, kept for reference:
+# PACKAGE_ROOT = pathlib.Path(prediction_model.__file__).resolve().parent
+
+# Directory that load_dataset() reads CSV files from.
+DATAPATH = os.path.join(PACKAGE_ROOT,"datasets")
+
+TRAIN_FILE = 'train.csv'
+TEST_FILE = 'test.csv'
+
+# Name of the target column.
+TARGET = 'Loan_Status'
+
+#Final features used in the model
+# Also the set of columns batch_predict requires in an uploaded CSV.
+FEATURES = ['Gender', 'Married', 'Dependents', 'Education',
+       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+       'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+# Passed to the pipeline's MeanImputer step. CoapplicantIncome is not listed
+# here; it appears in DROP_FEATURES below.
+NUM_FEATURES = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
+
+# Passed to the pipeline's ModeImputer step.
+CAT_FEATURES = ['Gender',
+ 'Married',
+ 'Dependents',
+ 'Education',
+ 'Self_Employed',
+ 'Credit_History',
+ 'Property_Area']
+
+# in our case it is same as Categorical features
+# Passed to the pipeline's CustomLabelEncoder step.
+FEATURES_TO_ENCODE = ['Gender',
+ 'Married',
+ 'Dependents',
+ 'Education',
+ 'Self_Employed',
+ 'Credit_History',
+ 'Property_Area']
+
+# Passed to the pipeline's DomainProcessing step as variable_to_modify and
+# variable_to_add respectively.
+FEATURE_TO_MODIFY = ['ApplicantIncome']
+FEATURE_TO_ADD = 'CoapplicantIncome'
+
+# Passed to the pipeline's DropColumns step.
+DROP_FEATURES = ['CoapplicantIncome']
+
+LOG_FEATURES = ['ApplicantIncome', 'LoanAmount'] # taking log of numerical columns
+
+# Bucket and top-level key prefix used by upload_to_s3() in main.py.
+S3_BUCKET = "loanprediction"
+
+FOLDER="datadrift"
+
+# MLflow tracking server set in main.py via mlflow.set_tracking_uri().
+# NOTE(review): hard-coded EC2 hostname - confirm it should not come from the
+# environment instead.
+TRACKING_URI="http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/"
+
+
+# Experiment whose runs predict.py searches for the best model.
+EXPERIMENT_NAME="loan_prediction_model"
+
+# Appended to 'runs:/<run_id>' in predict.py, hence the leading slash.
+MODEL_NAME="/Loanprediction-model"
+
diff --git a/prediction_model/pipeline.py b/prediction_model/pipeline.py
@@ -0,0 +1,23 @@
+from sklearn.pipeline import Pipeline
+from prediction_model.config import config
+import prediction_model.processing.preprocessing as pp 
+from sklearn.preprocessing import MinMaxScaler
+
+
+
+
+
+# Preprocessing pipeline assembled from the custom transformers in
+# prediction_model.processing.preprocessing plus sklearn's MinMaxScaler.
+# Steps run in the order listed and each takes its column list from config.
+# NOTE(review): the transformer implementations are not shown here;
+# presumably DomainProcessing folds FEATURE_TO_ADD into FEATURE_TO_MODIFY
+# before DropFeatures removes it - confirm against preprocessing.py.
+preprocessing_pipeline = Pipeline(
+    [
+        ('DomainProcessing',pp.DomainProcessing(variable_to_modify = config.FEATURE_TO_MODIFY,
+        variable_to_add = config.FEATURE_TO_ADD)),
+        ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+        ('ModeImputation',pp.ModeImputer(variables=config.CAT_FEATURES)),
+        ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+        ('LabelEncoder',pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+        ('LogTransform',pp.LogTransforms(variables=config.LOG_FEATURES)),
+        ('MinMaxScale', MinMaxScaler())
+    ]
+)
+
+
diff --git a/prediction_model/predict.py b/prediction_model/predict.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+from prediction_model.config import config  
+import mlflow
+
+
+
+def generate_predictions(data_input):
+    data = pd.DataFrame(data_input)
+    experiment_name = config.EXPERIMENT_NAME
+    experiment = mlflow.get_experiment_by_name(experiment_name)
+    experiment_id = experiment.experiment_id
+    runs_df=mlflow.search_runs(experiment_ids=experiment_id,order_by=['metrics.f1_score DESC'])
+    best_run=runs_df.iloc[0]
+    best_run_id=best_run['run_id']
+    best_model='runs:/' + best_run_id + config.MODEL_NAME
+    loan_prediction_model=mlflow.sklearn.load_model(best_model)
+    prediction=loan_prediction_model.predict(data)
+    output = np.where(prediction==1,'Y','N')
+    result = {"prediction":output}
+    return result
+
+
+def generate_predictions_batch(data_input):
+    # data = pd.DataFrame(data_input)
+    experiment_name = config.EXPERIMENT_NAME
+    experiment = mlflow.get_experiment_by_name(experiment_name)
+    experiment_id = experiment.experiment_id
+    runs_df=mlflow.search_runs(experiment_ids=experiment_id,order_by=['metrics.f1_score DESC'])
+    best_run=runs_df.iloc[0]
+    best_run_id=best_run['run_id']
+    best_model='runs:/' + best_run_id + config.MODEL_NAME
+    loan_prediction_model=mlflow.sklearn.load_model(best_model)
+    prediction=loan_prediction_model.predict(data_input)
+    output = np.where(prediction==1,'Y','N')
+    result = {"prediction":output}
+    return result
+
+
+
+
+
+# NOTE(review): generate_predictions requires a data_input argument, so this
+# bare call raises TypeError when the module is run as a script. Supply
+# sample input here or remove the guard.
+if __name__=='__main__':
+    generate_predictions()
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
diff --git a/prediction_model/processing/data_handling.py b/prediction_model/processing/data_handling.py
@@ -0,0 +1,10 @@
+import os
+import pandas as pd
+from prediction_model.config import config
+
+#Load the dataset
+def load_dataset(file_name):
+    filepath = os.path.join(config.DATAPATH,file_name)
+    _data = pd.read_csv(filepath)
+    return _data
+