Bug fix: save cleaned data to data/cleaned_data.csv and build the test fixture via cleaned_data()

AshishShukla-1992 · Jul 23, 2024 · be489b7 · be489b7
1 parent bf09ffc
commit be489b7
Show file tree

Hide file tree

Showing 6 changed files with 30,221 additions and 193 deletions.
diff --git a/__pycache__/test_model.cpython-38-pytest-8.2.2.pyc b/__pycache__/test_model.cpython-38-pytest-8.2.2.pyc
diff --git a/data/cleaned_data.csv b/data/cleaned_data.csv
diff --git a/ml/__pycache__/clean_data.cpython-38.pyc b/ml/__pycache__/clean_data.cpython-38.pyc
diff --git a/ml/clean_data.py b/ml/clean_data.py
@@ -19,6 +19,6 @@ def cleaned_data():
  df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
 
  # Save the cleaned DataFrame to a new CSV file
- df.to_csv('data/cleaned_census_income.csv', index=False)
+ df.to_csv('data/cleaned_data.csv', index=False)
 
  return df
diff --git a/test.ipynb b/test.ipynb
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 22,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -12,170 +12,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>age</th>\n",
- " <th>workclass</th>\n",
- " <th>fnlgt</th>\n",
- " <th>education</th>\n",
- " <th>education-num</th>\n",
- " <th>marital-status</th>\n",
- " <th>occupation</th>\n",
- " <th>relationship</th>\n",
- " <th>race</th>\n",
- " <th>sex</th>\n",
- " <th>capital-gain</th>\n",
- " <th>capital-loss</th>\n",
- " <th>hours-per-week</th>\n",
- " <th>native-country</th>\n",
- " <th>salary</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>39</td>\n",
- " <td>State-gov</td>\n",
- " <td>77516</td>\n",
- " <td>Bachelors</td>\n",
- " <td>13</td>\n",
- " <td>Never-married</td>\n",
- " <td>Adm-clerical</td>\n",
- " <td>Not-in-family</td>\n",
- " <td>White</td>\n",
- " <td>Male</td>\n",
- " <td>2174</td>\n",
- " <td>0</td>\n",
- " <td>40</td>\n",
- " <td>United-States</td>\n",
- " <td>&lt;=50K</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>50</td>\n",
- " <td>Self-emp-not-inc</td>\n",
- " <td>83311</td>\n",
- " <td>Bachelors</td>\n",
- " <td>13</td>\n",
- " <td>Married-civ-spouse</td>\n",
- " <td>Exec-managerial</td>\n",
- " <td>Husband</td>\n",
- " <td>White</td>\n",
- " <td>Male</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>13</td>\n",
- " <td>United-States</td>\n",
- " <td>&lt;=50K</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>38</td>\n",
- " <td>Private</td>\n",
- " <td>215646</td>\n",
- " <td>HS-grad</td>\n",
- " <td>9</td>\n",
- " <td>Divorced</td>\n",
- " <td>Handlers-cleaners</td>\n",
- " <td>Not-in-family</td>\n",
- " <td>White</td>\n",
- " <td>Male</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>40</td>\n",
- " <td>United-States</td>\n",
- " <td>&lt;=50K</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>53</td>\n",
- " <td>Private</td>\n",
- " <td>234721</td>\n",
- " <td>11th</td>\n",
- " <td>7</td>\n",
- " <td>Married-civ-spouse</td>\n",
- " <td>Handlers-cleaners</td>\n",
- " <td>Husband</td>\n",
- " <td>Black</td>\n",
- " <td>Male</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>40</td>\n",
- " <td>United-States</td>\n",
- " <td>&lt;=50K</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>28</td>\n",
- " <td>Private</td>\n",
- " <td>338409</td>\n",
- " <td>Bachelors</td>\n",
- " <td>13</td>\n",
- " <td>Married-civ-spouse</td>\n",
- " <td>Prof-specialty</td>\n",
- " <td>Wife</td>\n",
- " <td>Black</td>\n",
- " <td>Female</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>40</td>\n",
- " <td>Cuba</td>\n",
- " <td>&lt;=50K</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " age workclass fnlgt education education-num \\\n",
- "0 39 State-gov 77516 Bachelors 13 \n",
- "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
- "2 38 Private 215646 HS-grad 9 \n",
- "3 53 Private 234721 11th 7 \n",
- "4 28 Private 338409 Bachelors 13 \n",
- "\n",
- " marital-status occupation relationship race sex \\\n",
- "0 Never-married Adm-clerical Not-in-family White Male \n",
- "1 Married-civ-spouse Exec-managerial Husband White Male \n",
- "2 Divorced Handlers-cleaners Not-in-family White Male \n",
- "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
- "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
- "\n",
- " capital-gain capital-loss hours-per-week native-country salary \n",
- "0 2174 0 40 United-States <=50K \n",
- "1 0 0 13 United-States <=50K \n",
- "2 0 0 40 United-States <=50K \n",
- "3 0 0 40 United-States <=50K \n",
- "4 0 0 40 Cuba <=50K "
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
  "source": [
  "# Load the data\n",
  "df = pd.read_csv('data/census.csv', sep=',\\s', engine='python')\n",
@@ -186,7 +25,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -196,7 +35,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 25,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -206,7 +45,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -216,7 +55,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -226,24 +65,31 @@
  },
  {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
  "metadata": {},
  "outputs": [
  {
- "ename": "ModuleNotFoundError",
- "evalue": "No module named 'ml'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_selection\u001b[39;00m \u001b[39mimport\u001b[39;00m train_test_split\n\u001b[0;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mclean_data\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdata\u001b[39;00m \u001b[39mimport\u001b[39;00m process_data\n\u001b[1;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mml\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel\u001b[39;00m \u001b[39mimport\u001b[39;00m train_model, inference,compute_model_metrics,process_slices\n",
- "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'ml'"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31mERROR: Could not find a version that satisfies the requirement ml (from versions: none)\u001b[0m\u001b[31m\n",
+ "\u001b[0m\u001b[31mERROR: No matching distribution found for ml\u001b[0m\u001b[31m\n",
+ "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
  ]
  }
  ],
+ "source": [
+ "%pip install ml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
  "source": [
  "from sklearn.model_selection import train_test_split\n",
- "import ml.clean_data\n",
+ "from ml import clean_data\n",
  "from ml.data import process_data\n",
  "from ml.model import train_model, inference,compute_model_metrics,process_slices\n",
  "import logging\n",
@@ -252,9 +98,28 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/t0142f5/miniconda3/envs/udacity_ml_ops_3/lib/python3.8/site-packages/sklearn/preprocessing/_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['model/trained_model.joblib']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "# log config \n",
  "logging.basicConfig(filename='logs/log',level=logging.INFO,filemode='w')\n",
@@ -289,7 +154,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -310,7 +175,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
  "metadata": {},
  "outputs": [],
  "source": [

diff --git a/test_model.py b/test_model.py
@@ -7,6 +7,7 @@
 from sklearn.ensemble import RandomForestClassifier
 import joblib
 from ml.data import process_data
+from ml.clean_data import cleaned_data
 from ml.model import train_model
 
 cat_features = [
@@ -21,16 +22,15 @@
 ]
 
 
-@pytest.fixture(name='cleaned_data')
-def cleaned_data():
+@pytest.fixture(name='data')
+def data():
  """
  This is a fixture for loading cleaned data and will be used by other tests.
 
  Yields:
  pd.DataFrame : Cleaned data
  """
- yield pd.read_csv('data/cleaned_census_income.csv')
-
+ yield cleaned_data()
 
 def test_model():
  """
@@ -40,26 +40,26 @@ def test_model():
  assert isinstance(model, RandomForestClassifier)
 
 
-def test_cleaned_data(cleaned_data):
+def test_data(data):
  """
  Test case to check if the cleaned data is loaded properly
 
  Args:
- cleaned_data (pd.Dataframe): Cleaned Data from the fixture
+ data (pd.DataFrame): Cleaned Data from the fixture
  """
- assert cleaned_data.shape[0] > 0 and cleaned_data.shape[1] > 0
+ assert data.shape[0] > 0 and data.shape[1] > 0
 
 
-def test_ml_training(cleaned_data):
+def test_ml_training(data):
  """
  Test case to check that, once the cleaned data is loaded,
  the model is trained properly.
 
  Args:
- cleaned_data (pd.Dataframe): Cleaned Data from the fixture
+ data (pd.DataFrame): Cleaned Data from the fixture
  """
  X_train, y_train, encoder, lb = process_data(
- cleaned_data, categorical_features=cat_features, label="salary", training=True)
+ data, categorical_features=cat_features, label="salary", training=True)
  model = train_model(X_train, y_train)
  assert model is not None
  assert encoder is not None