diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..9bfdd4a1d Binary files /dev/null and b/.DS_Store differ diff --git a/Data Analytics Salary Prediction/Images/avdslr_peryear_loc b/Data Analytics Salary Prediction/Images/avdslr_peryear_loc deleted file mode 100644 index 241e5a26b..000000000 Binary files a/Data Analytics Salary Prediction/Images/avdslr_peryear_loc and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/avgslr_each_job.png b/Data Analytics Salary Prediction/Images/avgslr_each_job.png deleted file mode 100644 index 4899150d1..000000000 Binary files a/Data Analytics Salary Prediction/Images/avgslr_each_job.png and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/avgslr_location.png b/Data Analytics Salary Prediction/Images/avgslr_location.png deleted file mode 100644 index 728dadc1e..000000000 Binary files a/Data Analytics Salary Prediction/Images/avgslr_location.png and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/avgslr_top_10_company.png b/Data Analytics Salary Prediction/Images/avgslr_top_10_company.png deleted file mode 100644 index e5885f1bf..000000000 Binary files a/Data Analytics Salary Prediction/Images/avgslr_top_10_company.png and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/top_6_high_pay_company b/Data Analytics Salary Prediction/Images/top_6_high_pay_company deleted file mode 100644 index 8c538d475..000000000 Binary files a/Data Analytics Salary Prediction/Images/top_6_high_pay_company and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/top_6_jobs.png b/Data Analytics Salary Prediction/Images/top_6_jobs.png deleted file mode 100644 index 590ca4676..000000000 Binary files a/Data Analytics Salary Prediction/Images/top_6_jobs.png and /dev/null differ diff --git a/Data Analytics Salary Prediction/Images/top_6_less_pay_company b/Data Analytics Salary Prediction/Images/top_6_less_pay_company deleted file mode 
100644 index 048c25076..000000000 Binary files a/Data Analytics Salary Prediction/Images/top_6_less_pay_company and /dev/null differ diff --git a/DataAnalyticsSalaryPrediction/.DS_Store b/DataAnalyticsSalaryPrediction/.DS_Store new file mode 100644 index 000000000..be42c59d9 Binary files /dev/null and b/DataAnalyticsSalaryPrediction/.DS_Store differ diff --git a/DataAnalyticsSalaryPrediction/.gitignore b/DataAnalyticsSalaryPrediction/.gitignore new file mode 100644 index 000000000..b694934fb --- /dev/null +++ b/DataAnalyticsSalaryPrediction/.gitignore @@ -0,0 +1 @@ +.venv \ No newline at end of file diff --git a/Data Analytics Salary Prediction/Dataset/README.md b/DataAnalyticsSalaryPrediction/Dataset/README.md similarity index 100% rename from Data Analytics Salary Prediction/Dataset/README.md rename to DataAnalyticsSalaryPrediction/Dataset/README.md diff --git a/Data Analytics Salary Prediction/Dataset/Salary Dataset.csv b/DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv similarity index 100% rename from Data Analytics Salary Prediction/Dataset/Salary Dataset.csv rename to DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv diff --git a/DataAnalyticsSalaryPrediction/Images/2024-08-03 00-49-28.mkv b/DataAnalyticsSalaryPrediction/Images/2024-08-03 00-49-28.mkv new file mode 100644 index 000000000..46457f8a0 Binary files /dev/null and b/DataAnalyticsSalaryPrediction/Images/2024-08-03 00-49-28.mkv differ diff --git "a/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 12.46.57\342\200\257AM.png" "b/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 12.46.57\342\200\257AM.png" new file mode 100644 index 000000000..98082ca73 Binary files /dev/null and "b/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 12.46.57\342\200\257AM.png" differ diff --git "a/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 12.47.27\342\200\257AM.png" "b/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 
12.47.27\342\200\257AM.png" new file mode 100644 index 000000000..e2684307b Binary files /dev/null and "b/DataAnalyticsSalaryPrediction/Images/Screenshot 2024-08-03 at 12.47.27\342\200\257AM.png" differ diff --git a/Data Analytics Salary Prediction/Model/Data Analytics Salary Prediction.ipynb b/DataAnalyticsSalaryPrediction/Model/.ipynb_checkpoints/Data Analytics Salary Prediction.ipynb similarity index 100% rename from Data Analytics Salary Prediction/Model/Data Analytics Salary Prediction.ipynb rename to DataAnalyticsSalaryPrediction/Model/.ipynb_checkpoints/Data Analytics Salary Prediction.ipynb diff --git a/Data Analytics Salary Prediction/Model/README.md b/DataAnalyticsSalaryPrediction/Model/README.md similarity index 100% rename from Data Analytics Salary Prediction/Model/README.md rename to DataAnalyticsSalaryPrediction/Model/README.md diff --git a/DataAnalyticsSalaryPrediction/Model/encoder.pkl b/DataAnalyticsSalaryPrediction/Model/encoder.pkl new file mode 100644 index 000000000..ac01f4e71 Binary files /dev/null and b/DataAnalyticsSalaryPrediction/Model/encoder.pkl differ diff --git a/DataAnalyticsSalaryPrediction/Model/model.ipynb b/DataAnalyticsSalaryPrediction/Model/model.ipynb new file mode 100644 index 000000000..1d9f15c5b --- /dev/null +++ b/DataAnalyticsSalaryPrediction/Model/model.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.metrics import confusion_matrix, accuracy_score\n", + "from imblearn.over_sampling import SMOTE\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Dropout, Input\n", + "from scikeras.wrappers import KerasClassifier\n", + 
"import os\n", + "\n", + "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", + "os.environ['TF_ENABLE_ONEDNN_OPTS']='0'\n", + "df = pd.read_csv('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv')\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def convert_salary(salary):\n", + " salary = salary.replace('₹', '').replace('$', '').replace(',', '')\n", + " salary = salary.replace('₹', '').replace('£', '').replace(',', '')\n", + " salary = salary.replace('₹', '').replace('AFN', '').replace(',', '')\n", + " if '/mo' in salary:\n", + " return float(salary.replace('/mo', '')) * 12\n", + " elif '/hr' in salary:\n", + " return float(salary.replace('/hr', '')) * 40 * 52\n", + " else:\n", + " return float(salary.replace('/yr', ''))\n", + "\n", + "df['Salary'] = df['Salary'].apply(convert_salary)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Cannot cast object dtype to int64", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/arrays/categorical.py:591\u001b[0m, in \u001b[0;36mCategorical.astype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 591\u001b[0m new_cats \u001b[38;5;241m=\u001b[39m \u001b[43mnew_cats\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 592\u001b[0m fill_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcategories\u001b[38;5;241m.\u001b[39m_na_value\n", + "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'more than 0'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m salary_bins \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m500000\u001b[39m, \u001b[38;5;241m1000000\u001b[39m, \u001b[38;5;241m1500000\u001b[39m, \u001b[38;5;241m2000000\u001b[39m, np\u001b[38;5;241m.\u001b[39minf]\n\u001b[1;32m 2\u001b[0m salary_labels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmore than 0\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmore than 500k\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmore than 1000k\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmore than 1500k\u001b[39m\u001b[38;5;124m'\u001b[39m , \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmore than 2000k\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;66;03m# Assign a label to each bin\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSalaryBin\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcut\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSalary\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mbins\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msalary_bins\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msalary_labels\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m categorical_cols \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCompany Name\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mJob Title\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mLocation\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 6\u001b[0m X \u001b[38;5;241m=\u001b[39m df[categorical_cols]\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m 6639\u001b[0m ]\n\u001b[1;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[0;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[1;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[1;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 435\u001b[0m 
\u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[0;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[1;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[1;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[0;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m 
values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[0;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[1;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[0;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[1;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. 
_astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:179\u001b[0m, in \u001b[0;36mastype_array\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(values, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# i.e. ExtensionArray\u001b[39;00m\n\u001b[0;32m--> 179\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m values \u001b[38;5;241m=\u001b[39m _astype_nansafe(values, dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n", + "File \u001b[0;32m~/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/venv/lib/python3.11/site-packages/pandas/core/arrays/categorical.py:602\u001b[0m, in \u001b[0;36mCategorical.astype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 598\u001b[0m \u001b[38;5;167;01mTypeError\u001b[39;00m, \u001b[38;5;66;03m# downstream error msg for CategoricalIndex is misleading\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;167;01mValueError\u001b[39;00m,\n\u001b[1;32m 600\u001b[0m ):\n\u001b[1;32m 
601\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot cast \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcategories\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m dtype to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 602\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[1;32m 604\u001b[0m result \u001b[38;5;241m=\u001b[39m take_nd(\n\u001b[1;32m 605\u001b[0m new_cats, ensure_platform_int(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_codes), fill_value\u001b[38;5;241m=\u001b[39mfill_value\n\u001b[1;32m 606\u001b[0m )\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "\u001b[0;31mValueError\u001b[0m: Cannot cast object dtype to int64" + ] + } + ], + "source": [ + "salary_bins = [0, 500000, 1000000, 1500000, 2000000, np.inf]\n", + "salary_labels = [0, 1, 2, 3, 4] # Assign a label to each bin\n", + "df['SalaryBin'] = pd.cut(df['Salary'], bins=salary_bins, labels=salary_labels).astype(int)\n", + "\n", + "categorical_cols = ['Company Name', 'Job Title', 'Location']\n", + "X = df[categorical_cols]\n", + "y = df['SalaryBin']\n", + "\n", + "encoder = OneHotEncoder(sparse_output=False)\n", + "X_encoded = encoder.fit_transform(X)\n", + "\n", + "scaler = StandardScaler()\n", + "X_encoded_scaled = scaler.fit_transform(X_encoded)\n", + "\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled, y_resampled = smote.fit_resample(X_encoded_scaled, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def create_model(optimizer='adam'):\n", + " model = Sequential()\n", + " model.add(Input(shape=(X_resampled.shape[1],)))\n", + " model.add(Dense(128, activation='relu', 
kernel_regularizer=tf.keras.regularizers.l2(0.01)))\n", + " model.add(Dropout(0.5))\n", + " model.add(Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))\n", + " model.add(Dropout(0.5))\n", + " model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))\n", + " model.add(Dense(5, activation='softmax'))\n", + " model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "model = KerasClassifier(model=create_model, verbose=0)\n", + "param_grid = {\n", + " 'optimizer': ['adam', 'rmsprop'],\n", + " 'epochs': [50, 100],\n", + " 'batch_size': [64, 128]\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best: 0.600000 using {'batch_size': 128, 'epochs': 100, 'optimizer': 'adam'}\n", + "Accuracy: 61.22% (+/- 8.57%)\n", + "Confusion Matrix:\n", + "[[195 92 62 13 8]\n", + " [111 134 131 22 8]\n", + " [ 33 38 282 12 12]\n", + " [ 7 13 12 350 5]\n", + " [ 4 11 6 11 333]]\n", + "Accuracy: 0.68\n" + ] + } + ], + "source": [ + "grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=4, cv=3, error_score='raise')\n", + "grid_result = grid.fit(X_resampled, y_resampled)\n", + "\n", + "print(\"Best: %f using %s\" % (grid_result.best_score_, grid_result.best_params_))\n", + "best_model = grid_result.best_estimator_\n", + "\n", + "# Evaluate the best model with cross-validation\n", + "scores = cross_val_score(best_model, X_resampled, y_resampled, cv=5)\n", + "print(\"Accuracy: %.2f%% (+/- %.2f%%)\" % (scores.mean() * 100, scores.std() * 100))\n", + "\n", + "# Split the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)\n", + 
"history = best_model.fit(X_train, y_train)\n", + "\n", + "# Use predict_proba if available, otherwise use predict\n", + "try:\n", + " y_pred_prob = best_model.predict_proba(X_test)\n", + " y_pred = np.argmax(y_pred_prob, axis=1)\n", + "except AttributeError:\n", + " y_pred = best_model.predict(X_test)\n", + "\n", + "# Compute confusion matrix and accuracy\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "\n", + "print(\"Confusion Matrix:\")\n", + "print(cm)\n", + "print(f\"Accuracy: {accuracy:.2f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "model_filename = 'model1'\n", + "with open(model_filename, 'wb') as file:\n", + " pickle.dump(best_model, file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "import pickle\n", + "\n", + "# Load your data\n", + "df = pd.read_csv('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv')\n", + "\n", + "def convert_salary(salary):\n", + " salary = salary.replace('₹', '').replace('$', '').replace(',', '')\n", + " salary = salary.replace('₹', '').replace('£', '').replace(',', '')\n", + " salary = salary.replace('₹', '').replace('AFN', '').replace(',', '')\n", + " if '/mo' in salary:\n", + " return float(salary.replace('/mo', '')) * 12\n", + " elif '/hr' in salary:\n", + " return float(salary.replace('/hr', '')) * 40 * 52\n", + " else:\n", + " return float(salary.replace('/yr', ''))\n", + "\n", + "df['Salary'] = df['Salary'].apply(convert_salary)\n", + "\n", + "salary_bins = [0, 500000, 1000000, 1500000, 2000000, np.inf]\n", + "salary_labels = [0, 1, 2, 3, 4]\n", + 
"df['SalaryBin'] = pd.cut(df['Salary'], bins=salary_bins, labels=salary_labels).astype(int)\n", + "\n", + "categorical_cols = ['Company Name', 'Job Title', 'Location']\n", + "X = df[categorical_cols]\n", + "y = df['SalaryBin']\n", + "\n", + "# Create and fit the encoder\n", + "encoder = OneHotEncoder(sparse_output=False)\n", + "X_encoded = encoder.fit_transform(X)\n", + "\n", + "# Create and fit the scaler\n", + "scaler = StandardScaler()\n", + "X_encoded_scaled = scaler.fit_transform(X_encoded)\n", + "\n", + "# Save the encoder\n", + "with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/encoder.pkl', 'wb') as file:\n", + " pickle.dump(encoder, file)\n", + "\n", + "# Save the scaler\n", + "with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/scaler.pkl', 'wb') as file:\n", + " pickle.dump(scaler, file)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/DataAnalyticsSalaryPrediction/Model/model1 b/DataAnalyticsSalaryPrediction/Model/model1 new file mode 100644 index 000000000..1f90a7d2d Binary files /dev/null and b/DataAnalyticsSalaryPrediction/Model/model1 differ diff --git a/DataAnalyticsSalaryPrediction/Model/requirements.txt b/DataAnalyticsSalaryPrediction/Model/requirements.txt new file mode 100644 index 000000000..492a43a99 --- /dev/null +++ b/DataAnalyticsSalaryPrediction/Model/requirements.txt @@ -0,0 +1,6 @@ +pandas +numpy +scikit-learn +tensorflow +matplotlib +keras diff --git a/DataAnalyticsSalaryPrediction/Model/scaler.pkl b/DataAnalyticsSalaryPrediction/Model/scaler.pkl new file mode 100644 index 000000000..7eb3a21e3 
Binary files /dev/null and b/DataAnalyticsSalaryPrediction/Model/scaler.pkl differ diff --git a/DataAnalyticsSalaryPrediction/readme.md b/DataAnalyticsSalaryPrediction/readme.md new file mode 100644 index 000000000..635fa9a87 --- /dev/null +++ b/DataAnalyticsSalaryPrediction/readme.md @@ -0,0 +1,20 @@ +## Data Analysis Salary Prediction - Web Interface + +### Goal 🎯 + +The main goal of this project is to provide an easy-to-use web interface for predicting the salary of various job posts across India based on user input parameters. This tool aims to make salary prediction accessible to non-technical users by integrating a machine learning model with a user-friendly Flask web application. + +### Model(s) used for the Web App 🧮 + +The backend part of the web app uses a pre-trained machine learning model (`../Model`) serialized with `pickle`. The model was trained on the project's salary dataset and uses the company name, job title, and location to predict which of five annual salary ranges a job post falls into. + +### Video Demonstration 🎥 + + + +### Signature ✒️ + +Developed by [Akshaykumar](https://github.com/MRMORNINGSTAR2233) + +- [GitHub](https://github.com/MRMORNINGSTAR2233) +- [LinkedIn](https://www.linkedin.com/in/akshay-kumar-hegde/) diff --git a/Data Analytics Salary Prediction/requirements.txt b/DataAnalyticsSalaryPrediction/requirements.txt similarity index 100% rename from Data Analytics Salary Prediction/requirements.txt rename to DataAnalyticsSalaryPrediction/requirements.txt diff --git a/DataAnalyticsSalaryPrediction/web/app.py b/DataAnalyticsSalaryPrediction/web/app.py new file mode 100644 index 000000000..18f9c8945 --- /dev/null +++ b/DataAnalyticsSalaryPrediction/web/app.py @@ -0,0 +1,56 @@ +import os +import pickle +import numpy as np +from flask import Flask, request, jsonify, render_template +from sklearn.preprocessing import OneHotEncoder, StandardScaler +import pandas as pd +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Dropout, Input +
+ +app = Flask(__name__) + +def create_model(optimizer='adam'): + model = Sequential() + model.add(Input(shape=(X_resampled.shape[1],))) # Assuming X_resampled.shape[1] is known + model.add(Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))) + model.add(Dropout(0.5)) + model.add(Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))) + model.add(Dropout(0.5)) + model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))) + model.add(Dense(5, activation='softmax')) + model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy']) + return model + +# Load the trained model +with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/model1', 'rb') as file: + model = pickle.load(file) + +# Load the encoder and scaler +with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/encoder.pkl', 'rb') as file: + encoder = pickle.load(file) + +with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/scaler.pkl', 'rb') as file: + scaler = pickle.load(file) + +# Preprocessing function +def preprocess_input(data): + df = pd.DataFrame(data, index=[0]) + df_encoded = encoder.transform(df) + df_scaled = scaler.transform(df_encoded) + return df_scaled + +@app.route('/') +def home(): + return render_template('index.html') + +@app.route('/predict', methods=['POST']) +def predict(): + data = request.get_json() + input_data = preprocess_input(data) + prediction = model.predict(input_data) + return jsonify({'prediction': int(prediction[0])}) + +if __name__ == '__main__': + app.run(debug=True) diff --git a/DataAnalyticsSalaryPrediction/web/templates/index.html b/DataAnalyticsSalaryPrediction/web/templates/index.html new file mode 100644 index 000000000..2629f826e --- /dev/null +++ b/DataAnalyticsSalaryPrediction/web/templates/index.html @@ -0,0 +1,133 @@ + + + + + + + Salary Prediction + + + + +
+

Salary Prediction

+
+ + + + + + + + + + +
+
+
+ + + + \ No newline at end of file