From 91520311390bae9f1603b853e1cb2f8b75d3328a Mon Sep 17 00:00:00 2001
From: Chandramouli S <64595758+Chandru-21@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:22:45 +0530
Subject: [PATCH] MLOps CI

---
 .github/workflows/main.yml                    |  46 ++++++
 Dockerfile                                    |  43 ++++++
 main.py                                       | 139 ++++++++++++++++++
 prediction_model/VERSION                      |   1 +
 .../__pycache__/pipeline.cpython-311.pyc      | Bin 0 -> 1446 bytes
 .../__pycache__/predict.cpython-311.pyc       | Bin 0 -> 2386 bytes
 .../config/__pycache__/config.cpython-311.pyc | Bin 0 -> 1416 bytes
 prediction_model/config/config.py             |  60 ++++++++
 prediction_model/pipeline.py                  |  23 +++
 prediction_model/predict.py                   |  44 ++++++
 .../__pycache__/data_handling.cpython-311.pyc | Bin 0 -> 1688 bytes
 .../__pycache__/data_handling.cpython-39.pyc  | Bin 0 -> 1143 bytes
 .../__pycache__/preprocessing.cpython-311.pyc | Bin 0 -> 6420 bytes
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 0 -> 4394 bytes
 prediction_model/processing/data_handling.py  |  10 ++
 prediction_model/processing/preprocessing.py  |  94 ++++++++++++
 prediction_model/training_pipeline.py         | 114 ++++++++++++++
 requirements.txt                              |  15 ++
 tests/test_prediction.py                      |  29 ++++
 19 files changed, 618 insertions(+)
 create mode 100644 .github/workflows/main.yml
 create mode 100644 Dockerfile
 create mode 100644 main.py
 create mode 100644 prediction_model/VERSION
 create mode 100644 prediction_model/__pycache__/pipeline.cpython-311.pyc
 create mode 100644 prediction_model/__pycache__/predict.cpython-311.pyc
 create mode 100644 prediction_model/config/__pycache__/config.cpython-311.pyc
 create mode 100644 prediction_model/config/config.py
 create mode 100644 prediction_model/pipeline.py
 create mode 100644 prediction_model/predict.py
 create mode 100644 prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
 create mode 100644 prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
 create mode 100644 prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
 create mode 100644 prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
 create mode 100644 prediction_model/processing/data_handling.py
 create mode 100644 prediction_model/processing/preprocessing.py
 create mode 100644 prediction_model/training_pipeline.py
 create mode 100644 requirements.txt
 create mode 100644 tests/test_prediction.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..e4c514d
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,46 @@
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+name: MLOps AWS
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install kubectl
+      uses: azure/setup-kubectl@v2.0
+      with:
+        version: 'v1.24.0' # default is latest stable
+      id: install
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        aws-region: us-east-2
+
+    - name: Login to Amazon ECR
+      id: login-ecr
+      uses: aws-actions/amazon-ecr-login@v1
+
+    - name: Build, tag, and push the image to Amazon ECR
+      id: build-image
+      env:
+        ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+        ECR_REPOSITORY: ${{ secrets.REPO_NAME }}
+        IMAGE_TAG: latest
+      run: |
+        # Build the Docker image and push it to ECR
+        docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
+        echo "Pushing image to ECR..."
+        docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
+        # ::set-output is deprecated and disabled on current runners; write to $GITHUB_OUTPUT instead
+        echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
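Note: the workflow installs kubectl but never invokes it, so the image is pushed to ECR without a deploy step. A follow-up step might look like the sketch below; the cluster name `loan-prediction-cluster`, deployment/container names, and the manifest path `k8s/deployment.yaml` are assumptions, not part of this patch:

    - name: Deploy to EKS
      run: |
        # assumed cluster name and manifest path -- adjust to your setup
        aws eks update-kubeconfig --name loan-prediction-cluster --region us-east-2
        kubectl apply -f k8s/deployment.yaml
        kubectl set image deployment/loan-prediction app=${{ steps.build-image.outputs.image }}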
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b666461
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.10-slim-buster
+
+RUN pip install --upgrade pip
+
+WORKDIR /app
+
+COPY . /app
+
+# set permissions
+RUN chmod +x /app/tests
+RUN chmod +w /app/tests
+RUN chmod +x /app/prediction_model
+RUN chmod +w /app/prediction_model/trained_models
+RUN chmod +w /app/prediction_model/datasets
+
+ENV PYTHONPATH "${PYTHONPATH}:/app/prediction_model"
+
+RUN pip install --no-cache-dir -r requirements.txt
+# quote the extra so the shell never glob-expands the brackets
+RUN pip install --no-cache-dir "dvc[s3]"
+
+RUN dvc pull
+
+RUN python /app/prediction_model/training_pipeline.py
+
+RUN pytest -v /app/tests/test_prediction.py
+RUN pytest --junitxml=/app/tests/test-results.xml /app/tests/test_prediction.py
+
+EXPOSE 8005
+
+ENTRYPOINT ["python"]
+CMD ["main.py"]
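Note: `dvc pull`, the training run, and the tests all execute at image build time, so the build host needs AWS credentials and network access to the MLflow tracking server. One way to supply credentials without baking them into an image layer is a BuildKit secret mount (a sketch; the secret id `aws` is an assumption):

# syntax=docker/dockerfile:1   <- must be the first line of the Dockerfile
RUN --mount=type=secret,id=aws,target=/root/.aws/credentials dvc pull

built with, e.g., `docker build --secret id=aws,src=$HOME/.aws/credentials .`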
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..00ab622
--- /dev/null
+++ b/main.py
@@ -0,0 +1,139 @@
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import uvicorn
+import numpy as np
+import pandas as pd
+from fastapi.middleware.cors import CORSMiddleware
+from prediction_model.predict import generate_predictions, generate_predictions_batch
+from prediction_model.config import config
+import mlflow
+import io
+import boto3
+from datetime import datetime
+
+
+def upload_to_s3(file_content, filename):
+    """Upload prediction output to S3 under a date-partitioned folder."""
+    s3 = boto3.client('s3')
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    if filename.endswith('.csv'):
+        filename = filename[:-4]
+
+    # avoid ':' in S3 keys; it trips up some clients and presigned URLs
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    folder_path = f"{config.FOLDER}/{current_date}"
+    filename_with_datetime = f"{filename}_{current_datetime}.csv"
+    s3_key = f"{folder_path}/{filename_with_datetime}"
+
+    s3.put_object(Bucket=config.S3_BUCKET, Key=s3_key, Body=file_content)
+    return s3_key
+
+
+# mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+app = FastAPI(
+    title="Loan Prediction App using FastAPI - MLOps",
+    description="MLOps Demo",
+    version='1.0'
+)
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+
+
+class LoanPrediction(BaseModel):
+    Gender: str
+    Married: str
+    Dependents: str
+    Education: str
+    Self_Employed: str
+    ApplicantIncome: float
+    CoapplicantIncome: float
+    LoanAmount: float
+    Loan_Amount_Term: float
+    Credit_History: float
+    Property_Area: str
+
+
+@app.get("/")
+def index():
+    return {"message": "Welcome to the Loan Prediction App API"}
+
+
+@app.post("/prediction_api")
+def predict(loan_details: LoanPrediction):
+    data = loan_details.model_dump()
+    prediction = generate_predictions([data])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+
+@app.post("/prediction_ui")
+def predict_gui(Gender: str,
+                Married: str,
+                Dependents: str,
+                Education: str,
+                Self_Employed: str,
+                ApplicantIncome: float,
+                CoapplicantIncome: float,
+                LoanAmount: float,
+                Loan_Amount_Term: float,
+                Credit_History: float,
+                Property_Area: str):
+
+    input_data = [Gender, Married, Dependents, Education, Self_Employed,
+                  ApplicantIncome, CoapplicantIncome, LoanAmount,
+                  Loan_Amount_Term, Credit_History, Property_Area]
+
+    # same order as config.FEATURES
+    cols = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+    data_dict = dict(zip(cols, input_data))
+    prediction = generate_predictions([data_dict])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+
+@app.post("/batch_prediction")
+async def batch_predict(file: UploadFile = File(...)):
+    content = await file.read()
+    df = pd.read_csv(io.BytesIO(content), index_col=False)
+
+    # Ensure the CSV file contains the required features
+    required_columns = config.FEATURES
+    if not all(column in df.columns for column in required_columns):
+        return {"error": "CSV file does not contain the required columns."}
+
+    predictions = generate_predictions_batch(df)["prediction"]
+    df['Prediction'] = predictions
+    result = df.to_csv(index=False)
+
+    # archive the scored batch to S3 for later drift analysis
+    upload_to_s3(result.encode('utf-8'), file.filename)
+
+    return StreamingResponse(
+        io.BytesIO(result.encode('utf-8')),
+        media_type="text/csv",
+        headers={"Content-Disposition": "attachment; filename=predictions.csv"}
+    )
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8005)
\ No newline at end of file
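A quick smoke test of the /prediction_api endpoint above, once the container is serving on port 8005 (stdlib only; the field values are illustrative, not taken from the dataset):

import json
from urllib.request import Request, urlopen

payload = {
    "Gender": "Male", "Married": "Yes", "Dependents": "0",
    "Education": "Graduate", "Self_Employed": "No",
    "ApplicantIncome": 5000.0, "CoapplicantIncome": 1500.0,
    "LoanAmount": 128.0, "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0, "Property_Area": "Urban",
}
req = Request(
    "http://localhost:8005/prediction_api",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
print(json.load(urlopen(req)))  # e.g. {"status": "Approved"}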
diff --git a/prediction_model/VERSION b/prediction_model/VERSION
new file mode 100644
index 0000000..3eefcb9
--- /dev/null
+++ b/prediction_model/VERSION
@@ -0,0 +1 @@
+1.0.0

diff --git a/prediction_model/__pycache__/pipeline.cpython-311.pyc b/prediction_model/__pycache__/pipeline.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1446 bytes of compiled bytecode omitted)

diff --git a/prediction_model/__pycache__/predict.cpython-311.pyc b/prediction_model/__pycache__/predict.cpython-311.pyc
new file mode 100644
(GIT binary patch: 2386 bytes of compiled bytecode omitted)

diff --git a/prediction_model/config/__pycache__/config.cpython-311.pyc b/prediction_model/config/__pycache__/config.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1416 bytes of compiled bytecode omitted)

diff --git a/prediction_model/config/config.py b/prediction_model/config/config.py
new file mode 100644
index 0000000..f8f03ec
--- /dev/null
+++ b/prediction_model/config/config.py
@@ -0,0 +1,60 @@
+import pathlib
+import os
+
+current_directory = os.path.dirname(os.path.realpath(__file__))  # directory of this script
+PACKAGE_ROOT = os.path.dirname(current_directory)  # parent directory: the package root
+
+# PACKAGE_ROOT = pathlib.Path(prediction_model.__file__).resolve().parent
+
+DATAPATH = os.path.join(PACKAGE_ROOT, "datasets")
+
+TRAIN_FILE = 'train.csv'
+TEST_FILE = 'test.csv'
+
+TARGET = 'Loan_Status'
+
+# Final features used in the model
+FEATURES = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+NUM_FEATURES = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
+
+CAT_FEATURES = ['Gender',
+                'Married',
+                'Dependents',
+                'Education',
+                'Self_Employed',
+                'Credit_History',
+                'Property_Area']
+
+# in our case these are the same as the categorical features
+FEATURES_TO_ENCODE = ['Gender',
+                      'Married',
+                      'Dependents',
+                      'Education',
+                      'Self_Employed',
+                      'Credit_History',
+                      'Property_Area']
+
+FEATURE_TO_MODIFY = ['ApplicantIncome']
+FEATURE_TO_ADD = 'CoapplicantIncome'
+
+DROP_FEATURES = ['CoapplicantIncome']
+
+LOG_FEATURES = ['ApplicantIncome', 'LoanAmount']  # numerical columns to log-transform
+
+S3_BUCKET = "loanprediction"
+FOLDER = "datadrift"
+
+TRACKING_URI = "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/"
+
+EXPERIMENT_NAME = "loan_prediction_model"
+
+MODEL_NAME = "/Loanprediction-model"  # leading slash: appended to 'runs:/<run_id>' in predict.py
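Hard-coding the EC2 tracking URI ties every environment to a single server. A common pattern (a suggestion, not part of this patch) is to let an environment variable override the committed default:

import os

# falls back to the committed URI when MLFLOW_TRACKING_URI is unset
TRACKING_URI = os.getenv(
    "MLFLOW_TRACKING_URI",
    "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/",
)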
diff --git a/prediction_model/pipeline.py b/prediction_model/pipeline.py
new file mode 100644
index 0000000..6dfeccb
--- /dev/null
+++ b/prediction_model/pipeline.py
@@ -0,0 +1,23 @@
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+
+from prediction_model.config import config
+import prediction_model.processing.preprocessing as pp
+
+
+preprocessing_pipeline = Pipeline(
+    [
+        ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,
+                                                 variable_to_add=config.FEATURE_TO_ADD)),
+        ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+        ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+        ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+        ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+        ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+        ('MinMaxScale', MinMaxScaler())
+    ]
+)

diff --git a/prediction_model/predict.py b/prediction_model/predict.py
new file mode 100644
index 0000000..9619d93
--- /dev/null
+++ b/prediction_model/predict.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+import mlflow
+
+from prediction_model.config import config
+
+
+def load_best_model():
+    """Load the model from the best run (highest f1_score) of the experiment."""
+    experiment = mlflow.get_experiment_by_name(config.EXPERIMENT_NAME)
+    runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
+                                 order_by=['metrics.f1_score DESC'])
+    best_run_id = runs_df.iloc[0]['run_id']
+    best_model_uri = 'runs:/' + best_run_id + config.MODEL_NAME
+    return mlflow.sklearn.load_model(best_model_uri)
+
+
+def generate_predictions(data_input):
+    data = pd.DataFrame(data_input)
+    model = load_best_model()
+    prediction = model.predict(data)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
+
+
+def generate_predictions_batch(data_input):
+    model = load_best_model()
+    prediction = model.predict(data_input)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
+
+
+if __name__ == '__main__':
+    # quick smoke test against the first row of the test set
+    # (the original call passed no arguments, which would raise a TypeError)
+    from prediction_model.processing.data_handling import load_dataset
+    sample = load_dataset(config.TEST_FILE)[config.FEATURES][:1]
+    print(generate_predictions(sample))
\ No newline at end of file
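For reference, generate_predictions accepts anything pandas.DataFrame can ingest, e.g. a list of one dict per record (values illustrative):

from prediction_model.predict import generate_predictions

record = {
    "Gender": "Female", "Married": "No", "Dependents": "0",
    "Education": "Graduate", "Self_Employed": "No",
    "ApplicantIncome": 4500.0, "CoapplicantIncome": 0.0,
    "LoanAmount": 110.0, "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0, "Property_Area": "Semiurban",
}
result = generate_predictions([record])
print(result["prediction"][0])  # 'Y' or 'N'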
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1688 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
new file mode 100644
(GIT binary patch: 1143 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
new file mode 100644
(GIT binary patch: 6420 bytes of compiled bytecode omitted)
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
new file mode 100644
(GIT binary patch: 4394 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/data_handling.py b/prediction_model/processing/data_handling.py
new file mode 100644
index 0000000..74a1f1a
--- /dev/null
+++ b/prediction_model/processing/data_handling.py
@@ -0,0 +1,10 @@
+import os
+import pandas as pd
+from prediction_model.config import config
+
+
+def load_dataset(file_name):
+    """Load a CSV from the package's datasets directory."""
+    filepath = os.path.join(config.DATAPATH, file_name)
+    return pd.read_csv(filepath)

diff --git a/prediction_model/processing/preprocessing.py b/prediction_model/processing/preprocessing.py
new file mode 100644
index 0000000..7128e9a
--- /dev/null
+++ b/prediction_model/processing/preprocessing.py
@@ -0,0 +1,94 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from prediction_model.config import config
+
+
+class MeanImputer(BaseEstimator, TransformerMixin):
+    """Impute missing numeric values with column means learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mean_dict = {col: X[col].mean() for col in self.variables}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            # plain assignment instead of inplace=True, which warns on newer pandas
+            X[col] = X[col].fillna(self.mean_dict[col])
+        return X
+
+
+class ModeImputer(BaseEstimator, TransformerMixin):
+    """Impute missing categorical values with column modes learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mode_dict = {col: X[col].mode()[0] for col in self.variables}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = X[col].fillna(self.mode_dict[col])
+        return X
+
+
+class DropColumns(BaseEstimator, TransformerMixin):
+    """Drop the given columns from the frame."""
+    def __init__(self, variables_to_drop=None):
+        self.variables_to_drop = variables_to_drop
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        return X.drop(columns=self.variables_to_drop)
+
+
+class DomainProcessing(BaseEstimator, TransformerMixin):
+    """Fold CoapplicantIncome into ApplicantIncome (total household income)."""
+    def __init__(self, variable_to_modify=None, variable_to_add=None):
+        self.variable_to_modify = variable_to_modify
+        self.variable_to_add = variable_to_add
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variable_to_modify:
+            X[feature] = X[feature] + X[self.variable_to_add]
+        return X
+class CustomLabelEncoder(BaseEstimator, TransformerMixin):
+    """Encode categories as integers ordered by ascending frequency in the training data."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.label_dict = {}
+        for var in self.variables:
+            t = X[var].value_counts().sort_values(ascending=True).index
+            self.label_dict[var] = {k: i for i, k in enumerate(t, 0)}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variables:
+            X[feature] = X[feature].map(self.label_dict[feature])
+        return X
+
+
+# Try out log transformation
+class LogTransforms(BaseEstimator, TransformerMixin):
+    """Apply np.log to the given columns; assumes strictly positive values."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = np.log(X[col])
+        return X
\ No newline at end of file
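To make the frequency-ordered encoding concrete, a minimal illustration of CustomLabelEncoder on toy data (not from the repository):

import pandas as pd
from prediction_model.processing.preprocessing import CustomLabelEncoder

df = pd.DataFrame({"Property_Area": ["Urban", "Rural", "Urban",
                                     "Semiurban", "Urban", "Semiurban"]})
enc = CustomLabelEncoder(variables=["Property_Area"]).fit(df)
# rarest category gets 0, most frequent gets the highest code
print(enc.label_dict)   # {'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2}}
print(enc.transform(df)["Property_Area"].tolist())  # [2, 0, 2, 1, 2, 1]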
diff --git a/prediction_model/training_pipeline.py b/prediction_model/training_pipeline.py
new file mode 100644
index 0000000..577962e
--- /dev/null
+++ b/prediction_model/training_pipeline.py
@@ -0,0 +1,114 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+import mlflow
+import mlflow.sklearn
+from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
+import xgboost as xgb
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+import prediction_model.processing.preprocessing as pp
+
+
+# mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+
+def get_data(file_name):
+    data = load_dataset(file_name)
+    x = data[config.FEATURES]
+    y = data[config.TARGET].map({'N': 0, 'Y': 1})
+    return x, y
+
+
+X, Y = get_data(config.TRAIN_FILE)
+X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
+
+
+# Hyperopt search space for the XGBoost hyperparameters
+search_space = {
+    'max_depth': hp.choice('max_depth', np.arange(3, 10, dtype=int)),
+    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
+    'n_estimators': hp.choice('n_estimators', np.arange(50, 300, 50, dtype=int)),
+    'subsample': hp.uniform('subsample', 0.5, 1.0),
+    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
+    'gamma': hp.uniform('gamma', 0, 5),
+    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
+    'reg_lambda': hp.uniform('reg_lambda', 0, 1)
+}
+
+
+def objective(params):
+    # XGBoost classifier with the sampled hyperparameters.
+    # use_label_encoder was removed in XGBoost 2.0, so it is no longer passed;
+    # 'logloss' is the binary-classification metric ('mlogloss' is for multiclass).
+    clf = xgb.XGBClassifier(
+        max_depth=params['max_depth'],
+        learning_rate=params['learning_rate'],
+        n_estimators=params['n_estimators'],
+        subsample=params['subsample'],
+        colsample_bytree=params['colsample_bytree'],
+        gamma=params['gamma'],
+        reg_alpha=params['reg_alpha'],
+        reg_lambda=params['reg_lambda'],
+        eval_metric='logloss'
+    )
+
+    # Complete pipeline: preprocessing plus the model
+    classification_pipeline = Pipeline(
+        [
+            ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,
+                                                     variable_to_add=config.FEATURE_TO_ADD)),
+            ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+            ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+            ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+            ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+            ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+            ('MinMaxScale', MinMaxScaler()),
+            ('XGBoostClassifier', clf)
+        ]
+    )
+
+    mlflow.xgboost.autolog()
+    mlflow.set_experiment(config.EXPERIMENT_NAME)
+    with mlflow.start_run(nested=True):
+        # Fit the pipeline
+        classification_pipeline.fit(X_train, y_train)
+
+        # Make predictions on the held-out split
+        y_pred = classification_pipeline.predict(X_test)
+
+        # Calculate and log metrics
+        f1 = f1_score(y_test, y_pred)
+        accuracy = accuracy_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred)
+        mlflow.log_metrics({
+            'f1_score': f1,
+            'accuracy': accuracy,
+            'recall': recall,
+            'precision': precision
+        })
+
+        mlflow.sklearn.log_model(classification_pipeline, "Loanprediction-model")
+
+    # hyperopt minimizes, so return 1 - f1
+    return {'loss': 1 - f1, 'status': STATUS_OK}
+
+
+trials = Trials()
+best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=5, trials=trials)
+
+# fmin returns indices for hp.choice dimensions; space_eval maps them back to actual values
+print("Best hyperparameters:", space_eval(search_space, best_params))

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d4f90f2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+# Model building requirements
+numpy==1.24.3
+pandas==1.5.3
+joblib==1.2.0
+scikit-learn==1.2.2
+scipy==1.10.1
+pytest==7.4.2
+fastapi==0.103.0
+pydantic==2.4.2
+uvicorn==0.23.2
+gunicorn==21.2.0
+hyperopt==0.2.7
+mlflow==2.13.2
+xgboost==2.0.3
+python-multipart==0.0.9
\ No newline at end of file

diff --git a/tests/test_prediction.py b/tests/test_prediction.py
new file mode 100644
index 0000000..9f21698
--- /dev/null
+++ b/tests/test_prediction.py
@@ -0,0 +1,29 @@
+import pytest
+import mlflow
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+from prediction_model.predict import generate_predictions
+
+# These tests check that:
+#  - the output from the predict script is not null
+#  - the output is of str data type
+#  - the output is 'Y' for the first example row of the test set
+
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+
+@pytest.fixture
+def single_prediction():
+    test_dataset = load_dataset(config.TEST_FILE)
+    single_row = test_dataset[config.FEATURES][:1]
+    return generate_predictions(single_row)
+
+
+def test_single_pred_not_none(single_prediction):  # output is not none
+    assert single_prediction is not None
+
+
+def test_single_pred_str_type(single_prediction):  # data type is string
+    assert isinstance(single_prediction.get('prediction')[0], str)
+
+
+def test_single_pred_validate(single_prediction):  # check the output is Y
+    # data-dependent: the first row of test.csv is expected to be approved
+    assert single_prediction.get('prediction')[0] == 'Y'
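For local runs outside Docker (assuming the tracking server in config.TRACKING_URI is reachable and AWS credentials are configured), the same sequence the Dockerfile executes at build time is:

dvc pull
python prediction_model/training_pipeline.py
pytest -v tests/test_prediction.py
python main.py   # serves the FastAPI app on 0.0.0.0:8005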