Skip to content

Commit

Permalink
bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
AshishShukla-1992 committed Jul 23, 2024
1 parent bf09ffc commit be489b7
Show file tree
Hide file tree
Showing 6 changed files with 30,221 additions and 193 deletions.
Binary file modified __pycache__/test_model.cpython-38-pytest-8.2.2.pyc
Binary file not shown.
30,163 changes: 30,163 additions & 0 deletions data/cleaned_data.csv

Large diffs are not rendered by default.

Binary file modified ml/__pycache__/clean_data.cpython-38.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion ml/clean_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ def cleaned_data():
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Save the cleaned DataFrame to a new CSV file
df.to_csv('data/cleaned_census_income.csv', index=False)
df.to_csv('data/cleaned_data.csv', index=False)

return df
229 changes: 47 additions & 182 deletions test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -12,170 +12,9 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>workclass</th>\n",
" <th>fnlgt</th>\n",
" <th>education</th>\n",
" <th>education-num</th>\n",
" <th>marital-status</th>\n",
" <th>occupation</th>\n",
" <th>relationship</th>\n",
" <th>race</th>\n",
" <th>sex</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>native-country</th>\n",
" <th>salary</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>39</td>\n",
" <td>State-gov</td>\n",
" <td>77516</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Never-married</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Not-in-family</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>2174</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>50</td>\n",
" <td>Self-emp-not-inc</td>\n",
" <td>83311</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Exec-managerial</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38</td>\n",
" <td>Private</td>\n",
" <td>215646</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Divorced</td>\n",
" <td>Handlers-cleaners</td>\n",
" <td>Not-in-family</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>53</td>\n",
" <td>Private</td>\n",
" <td>234721</td>\n",
" <td>11th</td>\n",
" <td>7</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Handlers-cleaners</td>\n",
" <td>Husband</td>\n",
" <td>Black</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>28</td>\n",
" <td>Private</td>\n",
" <td>338409</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Prof-specialty</td>\n",
" <td>Wife</td>\n",
" <td>Black</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>Cuba</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age workclass fnlgt education education-num \\\n",
"0 39 State-gov 77516 Bachelors 13 \n",
"1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
"2 38 Private 215646 HS-grad 9 \n",
"3 53 Private 234721 11th 7 \n",
"4 28 Private 338409 Bachelors 13 \n",
"\n",
" marital-status occupation relationship race sex \\\n",
"0 Never-married Adm-clerical Not-in-family White Male \n",
"1 Married-civ-spouse Exec-managerial Husband White Male \n",
"2 Divorced Handlers-cleaners Not-in-family White Male \n",
"3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
"4 Married-civ-spouse Prof-specialty Wife Black Female \n",
"\n",
" capital-gain capital-loss hours-per-week native-country salary \n",
"0 2174 0 40 United-States <=50K \n",
"1 0 0 13 United-States <=50K \n",
"2 0 0 40 United-States <=50K \n",
"3 0 0 40 United-States <=50K \n",
"4 0 0 40 Cuba <=50K "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Load the data\n",
"df = pd.read_csv('data/census.csv', sep=',\\s', engine='python')\n",
Expand All @@ -186,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -196,7 +35,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -206,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -216,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -226,24 +65,31 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'ml'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_selection\u001b[39;00m \u001b[39mimport\u001b[39;00m train_test_split\n\u001b[0;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mclean_data\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdata\u001b[39;00m \u001b[39mimport\u001b[39;00m process_data\n\u001b[1;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel\u001b[39;00m \u001b[39mimport\u001b[39;00m train_model, inference,compute_model_metrics,process_slices\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'ml'"
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[31mERROR: Could not find a version that satisfies the requirement ml (from versions: none)\u001b[0m\u001b[31m\n",
"\u001b[0m\u001b[31mERROR: No matching distribution found for ml\u001b[0m\u001b[31m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install ml"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import ml.clean_data\n",
"from ml import clean_data\n",
"from ml.data import process_data\n",
"from ml.model import train_model, inference,compute_model_metrics,process_slices\n",
"import logging\n",
Expand All @@ -252,9 +98,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/t0142f5/miniconda3/envs/udacity_ml_ops_3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"['model/trained_model.joblib']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# log config \n",
"logging.basicConfig(filename='logs/log',level=logging.INFO,filemode='w')\n",
Expand Down Expand Up @@ -289,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -310,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down
20 changes: 10 additions & 10 deletions test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sklearn.ensemble import RandomForestClassifier
import joblib
from ml.data import process_data
from ml.clean_data import cleaned_data
from ml.model import train_model

cat_features = [
Expand All @@ -21,16 +22,15 @@
]


@pytest.fixture(name='cleaned_data')
def cleaned_data():
@pytest.fixture(name='data')
def data():
"""
This is a fixture for loading cleaned data and will be used by other tests.
Yields:
pd.DataFrame : Cleaned data
"""
yield pd.read_csv('data/cleaned_census_income.csv')

yield cleaned_data()

def test_model():
"""
Expand All @@ -40,26 +40,26 @@ def test_model():
assert isinstance(model, RandomForestClassifier)


def test_cleaned_data(cleaned_data):
def test_data(data):
"""
Test case to check if the cleaned data is loaded properly
Args:
cleaned_data (pd.Dataframe): Cleaned Data from the fixture
data (pd.DataFrame): Cleaned Data from the fixture
"""
assert cleaned_data.shape[0] > 0 and cleaned_data.shape[1] > 0
assert data.shape[0] > 0 and data.shape[1] > 0


def test_ml_training(cleaned_data):
def test_ml_training(data):
"""
Test case to check that, once the cleaned data is loaded,
the model is trained properly.
Args:
cleaned_data (pd.Dataframe): Cleaned Data from the fixture
data (pd.DataFrame): Cleaned Data from the fixture
"""
X_train, y_train, encoder, lb = process_data(
cleaned_data, categorical_features=cat_features, label="salary", training=True)
data, categorical_features=cat_features, label="salary", training=True)
model = train_model(X_train, y_train)
assert model is not None
assert encoder is not None
Expand Down

0 comments on commit be489b7

Please sign in to comment.