From 91520311390bae9f1603b853e1cb2f8b75d3328a Mon Sep 17 00:00:00 2001
From: Chandramouli S <64595758+Chandru-21@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:22:45 +0530
Subject: [PATCH] MLOps CI

---
 .github/workflows/main.yml                    |  46 ++++++
 Dockerfile                                    |  43 ++++++
 main.py                                       | 139 ++++++++++++++++++
 prediction_model/VERSION                      |   1 +
 .../__pycache__/pipeline.cpython-311.pyc      | Bin 0 -> 1446 bytes
 .../__pycache__/predict.cpython-311.pyc       | Bin 0 -> 2386 bytes
 .../config/__pycache__/config.cpython-311.pyc | Bin 0 -> 1416 bytes
 prediction_model/config/config.py             |  60 ++++++++
 prediction_model/pipeline.py                  |  23 +++
 prediction_model/predict.py                   |  44 ++++++
 .../__pycache__/data_handling.cpython-311.pyc | Bin 0 -> 1688 bytes
 .../__pycache__/data_handling.cpython-39.pyc  | Bin 0 -> 1143 bytes
 .../__pycache__/preprocessing.cpython-311.pyc | Bin 0 -> 6420 bytes
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 0 -> 4394 bytes
 prediction_model/processing/data_handling.py  |  10 ++
 prediction_model/processing/preprocessing.py  |  94 ++++++++++++
 prediction_model/training_pipeline.py         | 114 ++++++++++++++
 requirements.txt                              |  15 ++
 tests/test_prediction.py                      |  29 ++++
 19 files changed, 618 insertions(+)
 create mode 100644 .github/workflows/main.yml
 create mode 100644 Dockerfile
 create mode 100644 main.py
 create mode 100644 prediction_model/VERSION
 create mode 100644 prediction_model/__pycache__/pipeline.cpython-311.pyc
 create mode 100644 prediction_model/__pycache__/predict.cpython-311.pyc
 create mode 100644 prediction_model/config/__pycache__/config.cpython-311.pyc
 create mode 100644 prediction_model/config/config.py
 create mode 100644 prediction_model/pipeline.py
 create mode 100644 prediction_model/predict.py
 create mode 100644 prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
 create mode 100644 prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
 create mode 100644 prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
 create mode 100644 prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
 create mode 100644 prediction_model/processing/data_handling.py
 create mode 100644 prediction_model/processing/preprocessing.py
 create mode 100644 prediction_model/training_pipeline.py
 create mode 100644 requirements.txt
 create mode 100644 tests/test_prediction.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..e4c514d
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,46 @@
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+name: MLOps AWS
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install kubectl
+      uses: azure/setup-kubectl@v2.0
+      with:
+        version: 'v1.24.0' # default is latest stable
+      id: install
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        aws-region: us-east-2
+
+    - name: Login to Amazon ECR
+      id: login-ecr
+      uses: aws-actions/amazon-ecr-login@v1
+
+    - name: Build, tag, and push the image to Amazon ECR
+      id: build-image
+      env:
+        ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+        ECR_REPOSITORY: ${{ secrets.REPO_NAME }}
+        IMAGE_TAG: latest
+      run: |
+        # Build the Docker image and push it to ECR
+        docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
+        echo "Pushing image to ECR..."
+        docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
+        # ::set-output is deprecated and disabled on current runners; write to $GITHUB_OUTPUT instead
+        echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"
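Note: the workflow installs kubectl but never invokes it, so the image is pushed to ECR without a deploy step. A follow-up step might look like the sketch below; the cluster name `loan-prediction-cluster`, deployment/container names, and the manifest path `k8s/deployment.yaml` are assumptions, not part of this patch:

    - name: Deploy to EKS
      run: |
        # assumed cluster name and manifest path -- adjust to your setup
        aws eks update-kubeconfig --name loan-prediction-cluster --region us-east-2
        kubectl apply -f k8s/deployment.yaml
        kubectl set image deployment/loan-prediction app=${{ steps.build-image.outputs.image }}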
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b666461
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.10-slim-buster
+
+RUN pip install --upgrade pip
+
+WORKDIR /app
+
+COPY . /app
+
+# set permissions
+RUN chmod +x /app/tests
+RUN chmod +w /app/tests
+RUN chmod +x /app/prediction_model
+RUN chmod +w /app/prediction_model/trained_models
+RUN chmod +w /app/prediction_model/datasets
+
+ENV PYTHONPATH "${PYTHONPATH}:/app/prediction_model"
+
+RUN pip install --no-cache-dir -r requirements.txt
+# quote the extra so the shell never glob-expands the brackets
+RUN pip install --no-cache-dir "dvc[s3]"
+
+RUN dvc pull
+
+RUN python /app/prediction_model/training_pipeline.py
+
+RUN pytest -v /app/tests/test_prediction.py
+RUN pytest --junitxml=/app/tests/test-results.xml /app/tests/test_prediction.py
+
+EXPOSE 8005
+
+ENTRYPOINT ["python"]
+CMD ["main.py"]
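Note: `dvc pull`, the training run, and the tests all execute at image build time, so the build host needs AWS credentials and network access to the MLflow tracking server. One way to supply credentials without baking them into an image layer is a BuildKit secret mount (a sketch; the secret id `aws` is an assumption):

# syntax=docker/dockerfile:1   <- must be the first line of the Dockerfile
RUN --mount=type=secret,id=aws,target=/root/.aws/credentials dvc pull

built with, e.g., `docker build --secret id=aws,src=$HOME/.aws/credentials .`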
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..00ab622
--- /dev/null
+++ b/main.py
@@ -0,0 +1,139 @@
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import uvicorn
+import numpy as np
+import pandas as pd
+from fastapi.middleware.cors import CORSMiddleware
+from prediction_model.predict import generate_predictions, generate_predictions_batch
+from prediction_model.config import config
+import mlflow
+import io
+import boto3
+from datetime import datetime
+
+
+def upload_to_s3(file_content, filename):
+    """Upload prediction output to S3 under a date-partitioned folder."""
+    s3 = boto3.client('s3')
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    if filename.endswith('.csv'):
+        filename = filename[:-4]
+
+    # avoid ':' in S3 keys; it trips up some clients and presigned URLs
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+    folder_path = f"{config.FOLDER}/{current_date}"
+    filename_with_datetime = f"{filename}_{current_datetime}.csv"
+    s3_key = f"{folder_path}/{filename_with_datetime}"
+
+    s3.put_object(Bucket=config.S3_BUCKET, Key=s3_key, Body=file_content)
+    return s3_key
+
+
+# mlflow.set_tracking_uri("http://localhost:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+app = FastAPI(
+    title="Loan Prediction App using FastAPI - MLOps",
+    description="MLOps Demo",
+    version='1.0'
+)
+
+origins = ["*"]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+
+
+class LoanPrediction(BaseModel):
+    Gender: str
+    Married: str
+    Dependents: str
+    Education: str
+    Self_Employed: str
+    ApplicantIncome: float
+    CoapplicantIncome: float
+    LoanAmount: float
+    Loan_Amount_Term: float
+    Credit_History: float
+    Property_Area: str
+
+
+@app.get("/")
+def index():
+    return {"message": "Welcome to the Loan Prediction App API"}
+
+
+@app.post("/prediction_api")
+def predict(loan_details: LoanPrediction):
+    data = loan_details.model_dump()
+    prediction = generate_predictions([data])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+
+@app.post("/prediction_ui")
+def predict_gui(Gender: str,
+                Married: str,
+                Dependents: str,
+                Education: str,
+                Self_Employed: str,
+                ApplicantIncome: float,
+                CoapplicantIncome: float,
+                LoanAmount: float,
+                Loan_Amount_Term: float,
+                Credit_History: float,
+                Property_Area: str):
+
+    input_data = [Gender, Married, Dependents, Education, Self_Employed,
+                  ApplicantIncome, CoapplicantIncome, LoanAmount,
+                  Loan_Amount_Term, Credit_History, Property_Area]
+
+    # same order as config.FEATURES
+    cols = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+    data_dict = dict(zip(cols, input_data))
+    prediction = generate_predictions([data_dict])["prediction"][0]
+    pred = "Approved" if prediction == "Y" else "Rejected"
+    return {"status": pred}
+
+
+@app.post("/batch_prediction")
+async def batch_predict(file: UploadFile = File(...)):
+    content = await file.read()
+    df = pd.read_csv(io.BytesIO(content), index_col=False)
+
+    # Ensure the CSV file contains the required features
+    required_columns = config.FEATURES
+    if not all(column in df.columns for column in required_columns):
+        return {"error": "CSV file does not contain the required columns."}
+
+    predictions = generate_predictions_batch(df)["prediction"]
+    df['Prediction'] = predictions
+    result = df.to_csv(index=False)
+
+    # archive the scored batch to S3 for later drift analysis
+    upload_to_s3(result.encode('utf-8'), file.filename)
+
+    return StreamingResponse(
+        io.BytesIO(result.encode('utf-8')),
+        media_type="text/csv",
+        headers={"Content-Disposition": "attachment; filename=predictions.csv"}
+    )
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8005)
\ No newline at end of file
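A quick smoke test of the /prediction_api endpoint above, once the container is serving on port 8005 (stdlib only; the field values are illustrative, not taken from the dataset):

import json
from urllib.request import Request, urlopen

payload = {
    "Gender": "Male", "Married": "Yes", "Dependents": "0",
    "Education": "Graduate", "Self_Employed": "No",
    "ApplicantIncome": 5000.0, "CoapplicantIncome": 1500.0,
    "LoanAmount": 128.0, "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0, "Property_Area": "Urban",
}
req = Request(
    "http://localhost:8005/prediction_api",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
print(json.load(urlopen(req)))  # e.g. {"status": "Approved"}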
diff --git a/prediction_model/VERSION b/prediction_model/VERSION
new file mode 100644
index 0000000..3eefcb9
--- /dev/null
+++ b/prediction_model/VERSION
@@ -0,0 +1 @@
+1.0.0

diff --git a/prediction_model/__pycache__/pipeline.cpython-311.pyc b/prediction_model/__pycache__/pipeline.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1446 bytes of compiled bytecode omitted)

diff --git a/prediction_model/__pycache__/predict.cpython-311.pyc b/prediction_model/__pycache__/predict.cpython-311.pyc
new file mode 100644
(GIT binary patch: 2386 bytes of compiled bytecode omitted)

diff --git a/prediction_model/config/__pycache__/config.cpython-311.pyc b/prediction_model/config/__pycache__/config.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1416 bytes of compiled bytecode omitted)

diff --git a/prediction_model/config/config.py b/prediction_model/config/config.py
new file mode 100644
index 0000000..f8f03ec
--- /dev/null
+++ b/prediction_model/config/config.py
@@ -0,0 +1,60 @@
+import pathlib
+import os
+
+current_directory = os.path.dirname(os.path.realpath(__file__))  # directory of this script
+PACKAGE_ROOT = os.path.dirname(current_directory)  # parent directory: the package root
+
+# PACKAGE_ROOT = pathlib.Path(prediction_model.__file__).resolve().parent
+
+DATAPATH = os.path.join(PACKAGE_ROOT, "datasets")
+
+TRAIN_FILE = 'train.csv'
+TEST_FILE = 'test.csv'
+
+TARGET = 'Loan_Status'
+
+# Final features used in the model
+FEATURES = ['Gender', 'Married', 'Dependents', 'Education',
+            'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
+            'Loan_Amount_Term', 'Credit_History', 'Property_Area']
+
+NUM_FEATURES = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
+
+CAT_FEATURES = ['Gender',
+                'Married',
+                'Dependents',
+                'Education',
+                'Self_Employed',
+                'Credit_History',
+                'Property_Area']
+
+# in our case these are the same as the categorical features
+FEATURES_TO_ENCODE = ['Gender',
+                      'Married',
+                      'Dependents',
+                      'Education',
+                      'Self_Employed',
+                      'Credit_History',
+                      'Property_Area']
+
+FEATURE_TO_MODIFY = ['ApplicantIncome']
+FEATURE_TO_ADD = 'CoapplicantIncome'
+
+DROP_FEATURES = ['CoapplicantIncome']
+
+LOG_FEATURES = ['ApplicantIncome', 'LoanAmount']  # numerical columns to log-transform
+
+S3_BUCKET = "loanprediction"
+FOLDER = "datadrift"
+
+TRACKING_URI = "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/"
+
+EXPERIMENT_NAME = "loan_prediction_model"
+
+MODEL_NAME = "/Loanprediction-model"  # leading slash: appended to 'runs:/<run_id>' in predict.py
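Hard-coding the EC2 tracking URI ties every environment to a single server. A common pattern (a suggestion, not part of this patch) is to let an environment variable override the committed default:

import os

# falls back to the committed URI when MLFLOW_TRACKING_URI is unset
TRACKING_URI = os.getenv(
    "MLFLOW_TRACKING_URI",
    "http://ec2-3-19-244-223.us-east-2.compute.amazonaws.com:5000/",
)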
diff --git a/prediction_model/pipeline.py b/prediction_model/pipeline.py
new file mode 100644
index 0000000..6dfeccb
--- /dev/null
+++ b/prediction_model/pipeline.py
@@ -0,0 +1,23 @@
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+
+from prediction_model.config import config
+import prediction_model.processing.preprocessing as pp
+
+
+preprocessing_pipeline = Pipeline(
+    [
+        ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,
+                                                 variable_to_add=config.FEATURE_TO_ADD)),
+        ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+        ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+        ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+        ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+        ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+        ('MinMaxScale', MinMaxScaler())
+    ]
+)

diff --git a/prediction_model/predict.py b/prediction_model/predict.py
new file mode 100644
index 0000000..9619d93
--- /dev/null
+++ b/prediction_model/predict.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+import mlflow
+
+from prediction_model.config import config
+
+
+def load_best_model():
+    """Load the model from the best run (highest f1_score) of the experiment."""
+    experiment = mlflow.get_experiment_by_name(config.EXPERIMENT_NAME)
+    runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
+                                 order_by=['metrics.f1_score DESC'])
+    best_run_id = runs_df.iloc[0]['run_id']
+    best_model_uri = 'runs:/' + best_run_id + config.MODEL_NAME
+    return mlflow.sklearn.load_model(best_model_uri)
+
+
+def generate_predictions(data_input):
+    data = pd.DataFrame(data_input)
+    model = load_best_model()
+    prediction = model.predict(data)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
+
+
+def generate_predictions_batch(data_input):
+    model = load_best_model()
+    prediction = model.predict(data_input)
+    output = np.where(prediction == 1, 'Y', 'N')
+    return {"prediction": output}
+
+
+if __name__ == '__main__':
+    # quick smoke test against the first row of the test set
+    # (the original call passed no arguments, which would raise a TypeError)
+    from prediction_model.processing.data_handling import load_dataset
+    sample = load_dataset(config.TEST_FILE)[config.FEATURES][:1]
+    print(generate_predictions(sample))
\ No newline at end of file
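For reference, generate_predictions accepts anything pandas.DataFrame can ingest, e.g. a list of one dict per record (values illustrative):

from prediction_model.predict import generate_predictions

record = {
    "Gender": "Female", "Married": "No", "Dependents": "0",
    "Education": "Graduate", "Self_Employed": "No",
    "ApplicantIncome": 4500.0, "CoapplicantIncome": 0.0,
    "LoanAmount": 110.0, "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0, "Property_Area": "Semiurban",
}
result = generate_predictions([record])
print(result["prediction"][0])  # 'Y' or 'N'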
diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-311.pyc
new file mode 100644
(GIT binary patch: 1688 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc b/prediction_model/processing/__pycache__/data_handling.cpython-39.pyc
new file mode 100644
(GIT binary patch: 1143 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-311.pyc
new file mode 100644
(GIT binary patch: 6420 bytes of compiled bytecode omitted)
diff --git a/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc b/prediction_model/processing/__pycache__/preprocessing.cpython-39.pyc
new file mode 100644
(GIT binary patch: 4394 bytes of compiled bytecode omitted)

diff --git a/prediction_model/processing/data_handling.py b/prediction_model/processing/data_handling.py
new file mode 100644
index 0000000..74a1f1a
--- /dev/null
+++ b/prediction_model/processing/data_handling.py
@@ -0,0 +1,10 @@
+import os
+import pandas as pd
+from prediction_model.config import config
+
+
+def load_dataset(file_name):
+    """Load a CSV from the package's datasets directory."""
+    filepath = os.path.join(config.DATAPATH, file_name)
+    return pd.read_csv(filepath)

diff --git a/prediction_model/processing/preprocessing.py b/prediction_model/processing/preprocessing.py
new file mode 100644
index 0000000..7128e9a
--- /dev/null
+++ b/prediction_model/processing/preprocessing.py
@@ -0,0 +1,94 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from prediction_model.config import config
+
+
+class MeanImputer(BaseEstimator, TransformerMixin):
+    """Impute missing numeric values with column means learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mean_dict = {col: X[col].mean() for col in self.variables}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            # plain assignment instead of inplace=True, which warns on newer pandas
+            X[col] = X[col].fillna(self.mean_dict[col])
+        return X
+
+
+class ModeImputer(BaseEstimator, TransformerMixin):
+    """Impute missing categorical values with column modes learned at fit time."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.mode_dict = {col: X[col].mode()[0] for col in self.variables}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = X[col].fillna(self.mode_dict[col])
+        return X
+
+
+class DropColumns(BaseEstimator, TransformerMixin):
+    """Drop the given columns from the frame."""
+    def __init__(self, variables_to_drop=None):
+        self.variables_to_drop = variables_to_drop
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        return X.drop(columns=self.variables_to_drop)
+
+
+class DomainProcessing(BaseEstimator, TransformerMixin):
+    """Fold CoapplicantIncome into ApplicantIncome (total household income)."""
+    def __init__(self, variable_to_modify=None, variable_to_add=None):
+        self.variable_to_modify = variable_to_modify
+        self.variable_to_add = variable_to_add
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variable_to_modify:
+            X[feature] = X[feature] + X[self.variable_to_add]
+        return X
+class CustomLabelEncoder(BaseEstimator, TransformerMixin):
+    """Encode categories as integers ordered by ascending frequency in the training data."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        self.label_dict = {}
+        for var in self.variables:
+            t = X[var].value_counts().sort_values(ascending=True).index
+            self.label_dict[var] = {k: i for i, k in enumerate(t, 0)}
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for feature in self.variables:
+            X[feature] = X[feature].map(self.label_dict[feature])
+        return X
+
+
+# Try out log transformation
+class LogTransforms(BaseEstimator, TransformerMixin):
+    """Apply np.log to the given columns; assumes strictly positive values."""
+    def __init__(self, variables=None):
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = X.copy()
+        for col in self.variables:
+            X[col] = np.log(X[col])
+        return X
\ No newline at end of file
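To make the frequency-ordered encoding concrete, a minimal illustration of CustomLabelEncoder on toy data (not from the repository):

import pandas as pd
from prediction_model.processing.preprocessing import CustomLabelEncoder

df = pd.DataFrame({"Property_Area": ["Urban", "Rural", "Urban",
                                     "Semiurban", "Urban", "Semiurban"]})
enc = CustomLabelEncoder(variables=["Property_Area"]).fit(df)
# rarest category gets 0, most frequent gets the highest code
print(enc.label_dict)   # {'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2}}
print(enc.transform(df)["Property_Area"].tolist())  # [2, 0, 2, 1, 2, 1]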
diff --git a/prediction_model/training_pipeline.py b/prediction_model/training_pipeline.py
new file mode 100644
index 0000000..577962e
--- /dev/null
+++ b/prediction_model/training_pipeline.py
@@ -0,0 +1,114 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+import mlflow
+import mlflow.sklearn
+from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
+import xgboost as xgb
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+import prediction_model.processing.preprocessing as pp
+
+
+# mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+
+def get_data(file_name):
+    data = load_dataset(file_name)
+    x = data[config.FEATURES]
+    y = data[config.TARGET].map({'N': 0, 'Y': 1})
+    return x, y
+
+
+X, Y = get_data(config.TRAIN_FILE)
+X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
+
+
+# Hyperopt search space for the XGBoost hyperparameters
+search_space = {
+    'max_depth': hp.choice('max_depth', np.arange(3, 10, dtype=int)),
+    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
+    'n_estimators': hp.choice('n_estimators', np.arange(50, 300, 50, dtype=int)),
+    'subsample': hp.uniform('subsample', 0.5, 1.0),
+    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
+    'gamma': hp.uniform('gamma', 0, 5),
+    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
+    'reg_lambda': hp.uniform('reg_lambda', 0, 1)
+}
+
+
+def objective(params):
+    # XGBoost classifier with the sampled hyperparameters.
+    # use_label_encoder was removed in XGBoost 2.0, so it is no longer passed;
+    # 'logloss' is the binary-classification metric ('mlogloss' is for multiclass).
+    clf = xgb.XGBClassifier(
+        max_depth=params['max_depth'],
+        learning_rate=params['learning_rate'],
+        n_estimators=params['n_estimators'],
+        subsample=params['subsample'],
+        colsample_bytree=params['colsample_bytree'],
+        gamma=params['gamma'],
+        reg_alpha=params['reg_alpha'],
+        reg_lambda=params['reg_lambda'],
+        eval_metric='logloss'
+    )
+
+    # Complete pipeline: preprocessing plus the model
+    classification_pipeline = Pipeline(
+        [
+            ('DomainProcessing', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,
+                                                     variable_to_add=config.FEATURE_TO_ADD)),
+            ('MeanImputation', pp.MeanImputer(variables=config.NUM_FEATURES)),
+            ('ModeImputation', pp.ModeImputer(variables=config.CAT_FEATURES)),
+            ('DropFeatures', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
+            ('LabelEncoder', pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
+            ('LogTransform', pp.LogTransforms(variables=config.LOG_FEATURES)),
+            ('MinMaxScale', MinMaxScaler()),
+            ('XGBoostClassifier', clf)
+        ]
+    )
+
+    mlflow.xgboost.autolog()
+    mlflow.set_experiment(config.EXPERIMENT_NAME)
+    with mlflow.start_run(nested=True):
+        # Fit the pipeline
+        classification_pipeline.fit(X_train, y_train)
+
+        # Make predictions on the held-out split
+        y_pred = classification_pipeline.predict(X_test)
+
+        # Calculate and log metrics
+        f1 = f1_score(y_test, y_pred)
+        accuracy = accuracy_score(y_test, y_pred)
+        recall = recall_score(y_test, y_pred)
+        precision = precision_score(y_test, y_pred)
+        mlflow.log_metrics({
+            'f1_score': f1,
+            'accuracy': accuracy,
+            'recall': recall,
+            'precision': precision
+        })
+
+        mlflow.sklearn.log_model(classification_pipeline, "Loanprediction-model")
+
+    # hyperopt minimizes, so return 1 - f1
+    return {'loss': 1 - f1, 'status': STATUS_OK}
+
+
+trials = Trials()
+best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=5, trials=trials)
+
+# fmin returns indices for hp.choice dimensions; space_eval maps them back to actual values
+print("Best hyperparameters:", space_eval(search_space, best_params))

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d4f90f2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+# Model building requirements
+numpy==1.24.3
+pandas==1.5.3
+joblib==1.2.0
+scikit-learn==1.2.2
+scipy==1.10.1
+pytest==7.4.2
+fastapi==0.103.0
+pydantic==2.4.2
+uvicorn==0.23.2
+gunicorn==21.2.0
+hyperopt==0.2.7
+mlflow==2.13.2
+xgboost==2.0.3
+python-multipart==0.0.9
\ No newline at end of file

diff --git a/tests/test_prediction.py b/tests/test_prediction.py
new file mode 100644
index 0000000..9f21698
--- /dev/null
+++ b/tests/test_prediction.py
@@ -0,0 +1,29 @@
+import pytest
+import mlflow
+
+from prediction_model.config import config
+from prediction_model.processing.data_handling import load_dataset
+from prediction_model.predict import generate_predictions
+
+# These tests check that:
+#  - the output from the predict script is not null
+#  - the output is of str data type
+#  - the output is 'Y' for the first example row of the test set
+
+mlflow.set_tracking_uri(config.TRACKING_URI)
+
+
+@pytest.fixture
+def single_prediction():
+    test_dataset = load_dataset(config.TEST_FILE)
+    single_row = test_dataset[config.FEATURES][:1]
+    return generate_predictions(single_row)
+
+
+def test_single_pred_not_none(single_prediction):  # output is not none
+    assert single_prediction is not None
+
+
+def test_single_pred_str_type(single_prediction):  # data type is string
+    assert isinstance(single_prediction.get('prediction')[0], str)
+
+
+def test_single_pred_validate(single_prediction):  # check the output is Y
+    # data-dependent: the first row of test.csv is expected to be approved
+    assert single_prediction.get('prediction')[0] == 'Y'
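For local runs outside Docker (assuming the tracking server in config.TRACKING_URI is reachable and AWS credentials are configured), the same sequence the Dockerfile executes at build time is:

dvc pull
python prediction_model/training_pipeline.py
pytest -v tests/test_prediction.py
python main.py   # serves the FastAPI app on 0.0.0.0:8005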