diff --git a/Data_Science/customer_segmentation/Customer_Segmentation.ipynb b/Data_Science/customer_segmentation/Customer_Segmentation.ipynb new file mode 100644 index 0000000000..7d7c3cbfd2 --- /dev/null +++ b/Data_Science/customer_segmentation/Customer_Segmentation.ipynb @@ -0,0 +1,752 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Install Required Libraries" + ], + "metadata": { + "id": "Hq5C58mNDcDA" + } + }, + { + "cell_type": "code", + "source": [ + "pip install pandas matplotlib seaborn fpdf" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "X-Sj73saDfK4", + "outputId": "8d4d34a0-d903-4aac-8294-140816330af9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.7.1)\n", + "Requirement already satisfied: seaborn in /usr/local/lib/python3.10/dist-packages (0.13.1)\n", + "Collecting fpdf\n", + " Downloading fpdf-1.7.2.tar.gz (39 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.3.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.54.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.7)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (24.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (10.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.1.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Building wheels for collected packages: fpdf\n", + " Building wheel for fpdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=4fa5b6b75bb980095cca4125cb3b2ed5b341c006ac4f780d515ce75a603b96d1\n", + " Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455\n", + "Successfully built fpdf\n", + "Installing collected packages: fpdf\n", + "Successfully installed fpdf-1.7.2\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Import Required Libraries" + ], + "metadata": { + "id": "7FWQXJOHZi2z" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.cluster import KMeans\n", + "from scipy import stats\n", + "from fpdf import FPDF\n", + "from mpl_toolkits.mplot3d import Axes3D # Import for 3D plotting" + ], + "metadata": { + "id": "qhug8WhMZlRw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Load the Data" + ], + "metadata": { + "id": "630uIayCDrAG" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv('mall_customers.csv')\n", + "print(\"Data loaded successfully!\")\n", + "print(data.head())\n" + ], + "metadata": { + "id": "z3C7t2MPDryU", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4423e1f9-8816-4409-b551-b523ce6a3a0a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Data loaded successfully!\n", + " CustomerID Gender Age Annual Income (k$) Spending Score (1-100)\n", + "0 1 Male 19 15 39\n", + "1 2 Male 21 15 81\n", + "2 3 Female 20 16 6\n", + "3 4 Female 23 16 77\n", + "4 5 Female 31 17 40\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Data Cleaning" + ], + "metadata": { + "id": "BKzgo3wfEVQi" + } + }, + { + "cell_type": "code", + "source": [ + "# Check for missing values\n", + "print(\"\\nMissing values in each column:\")\n", + "print(data.isnull().sum())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ORkG6fYqEYwK", + "outputId": "3d4c7bc8-65d2-41ef-f0ff-a226a51b5b62" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Missing values in each column:\n", + "CustomerID 0\n", + "Gender 0\n", + "Age 0\n", + "Annual Income (k$) 0\n", + "Spending Score (1-100) 0\n", + "dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Drop rows with missing values if necessary\n", + "data.dropna(inplace=True)\n", + "print(\"\\nMissing values after cleaning:\")\n", + "print(data.isnull().sum())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0X7Y1zxrE1MJ", + "outputId": "230b3512-a636-4c2d-b057-e377b7a1320d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Missing values after cleaning:\n", + "CustomerID 0\n", + "Gender 0\n", + "Age 0\n", + "Annual Income (k$) 0\n", + "Spending Score (1-100) 0\n", + "dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Check for duplicates\n", + "duplicates = data.duplicated().sum()\n", + "print(f\"\\nNumber of duplicate rows: {duplicates}\")\n", + "data.drop_duplicates(inplace=True)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gElmyJYOE1Z8", + "outputId": "21165e4c-77dd-4435-8d86-2a1fceb90acb" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Number of duplicate rows: 0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Convert data types if necessary\n", + "data['Gender'] = data['Gender'].astype('category')" + ], + "metadata": { + "id": "99OuKYYEE1kY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Convert 'Gender' to numeric values\n", + "data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})" + ], + "metadata": { + "id": "SRKk4OH2aRMo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Check for outliers using IQR for Age and Annual Income\n", + "def remove_outliers(df, column):\n", + " Q1 = df[column].quantile(0.25)\n", + " Q3 = df[column].quantile(0.75)\n", + " IQR = Q3 - Q1\n", + " lower_bound = Q1 - 1.5 * IQR\n", + " upper_bound = Q3 + 1.5 * IQR\n", + " return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n", + "\n", + "data = remove_outliers(data, 'Age')\n", + "data = remove_outliers(data, 'Annual Income (k$)')\n", + "\n", + "original_size = len(data)\n", + "data = remove_outliers(data, 'Age')\n", + "age_outliers_removed = original_size - len(data)\n", + "print(f\"Removed {age_outliers_removed} outliers from Age.\")\n", + "\n", + "original_size = len(data)\n", + "data = remove_outliers(data, 'Annual Income (k$)')\n", + "income_outliers_removed = original_size - len(data)\n", + "print(f\"Removed {income_outliers_removed} outliers from Annual Income.\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iKRFlKglFCTL", + "outputId": "c913afe9-6ee2-49a5-ab00-a18f4fa3fffa" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Removed 0 outliers from Age.\n", + "Removed 0 outliers from Annual Income.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Display basic information about the cleaned dataset\n", + "print(\"\\nDataset Info after cleaning:\")\n", + "print(data.info())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GjEOv53kFJve", + "outputId": "0dd379f7-db80-4c7a-9fd2-7b7a106c6f0d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Dataset Info after cleaning:\n", + "\n", + "Index: 198 entries, 0 to 197\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 CustomerID 198 non-null int64 \n", + " 1 Gender 198 non-null category\n", + " 2 Age 198 non-null int64 \n", + " 3 Annual Income (k$) 198 non-null int64 \n", + " 4 Spending Score (1-100) 198 non-null int64 \n", + "dtypes: category(1), int64(4)\n", + "memory usage: 8.0 KB\n", + "None\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Statistical Analysis" + ], + "metadata": { + "id": "Q4jDtx2fFz-m" + } + }, + { + "cell_type": "code", + "source": [ + "def perform_statistical_analysis(data):\n", + " statistical_results = {}\n", + "\n", + " # Mean and Standard Deviation\n", + " mean_value = data['Age'].mean()\n", + " std_dev_value = data['Age'].std()\n", + " statistical_results['Mean of Age'] = mean_value\n", + " statistical_results['Standard Deviation of Age'] = std_dev_value\n", + "\n", + " # Hypothesis Testing (one-sample t-test)\n", + " t_statistic, p_value = stats.ttest_1samp(data['Spending Score (1-100)'], popmean=50)\n", + " statistical_results['T-Test t-statistic'] = t_statistic\n", + " statistical_results['T-Test p-value'] = p_value\n", + "\n", + " # Causal Analysis (Correlation)\n", + " correlation = data['Annual Income (k$)'].corr(data['Spending Score (1-100)'])\n", + " statistical_results['Correlation between Annual Income and Spending Score'] = correlation\n", + "\n", + " # Linear Regression\n", + " X = data[['Annual Income (k$)']]\n", + " y = data['Spending Score (1-100)']\n", + " model = LinearRegression()\n", + " model.fit(X, y)\n", + " statistical_results['Linear Regression Coefficient'] = model.coef_[0]\n", + " statistical_results['Linear Regression Intercept'] = model.intercept_\n", + "\n", + " return statistical_results" + ], + "metadata": { + "id": "xoNfc0xAF1t_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Visualizations" + ], + "metadata": { + "id": "XgbN36x1GJOI" + } + }, + { + "cell_type": "code", + "source": [ + "def create_visualizations(data):\n", + " if 'age_group' in data.columns:\n", + " data['age_group'] = data['age_group'].astype('category')\n", + "\n", + " # Distribution of Age\n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(data['age'], bins=30, kde=True)\n", + " plt.title('Age Distribution')\n", + " plt.xlabel('Age')\n", + " plt.ylabel('Frequency')\n", + " plt.savefig('age_distribution.png')\n", + " plt.close()\n", + "\n", + " # Boxplot of Spending Score by Age Group\n", + " plt.figure(figsize=(10, 6))\n", + " sns.boxplot(x='age_group', y='spending_score', data=data)\n", + " plt.title('Boxplot of Spending Score by Age Group')\n", + " plt.xlabel('Age Group')\n", + " plt.ylabel('Spending Score')\n", + " plt.savefig('boxplot_spending_score.png')\n", + " plt.close()\n", + "\n", + " # Scatter Plot: Annual Income vs. Spending Score\n", + " plt.figure(figsize=(10, 6))\n", + " sns.scatterplot(x='annual_income', y='spending_score', data=data)\n", + " plt.title('Scatter Plot: Annual Income vs. Spending Score')\n", + " plt.xlabel('Annual Income')\n", + " plt.ylabel('Spending Score')\n", + " plt.savefig('income_vs_spending_score.png')\n", + " plt.close()\n", + "\n", + " # Gender Distribution\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(x='gender', data=data)\n", + " plt.title('Gender Distribution')\n", + " plt.xlabel('Gender')\n", + " plt.ylabel('Count')\n", + " plt.savefig('gender_distribution.png')\n", + " plt.close()\n", + "\n", + " # Count of Customers by Age Group\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(x='age_group', data=data)\n", + " plt.title('Count of Customers by Age Group')\n", + " plt.xlabel('Age Group')\n", + " plt.ylabel('Count')\n", + " plt.savefig('age_group_counts.png')\n", + " plt.close()\n", + "\n", + " # Boxplot of Annual Income by Gender\n", + " plt.figure(figsize=(10, 6))\n", + " sns.boxplot(x='gender', y='annual_income', data=data)\n", + " plt.title('Boxplot of Annual Income by Gender')\n", + " plt.xlabel('Gender')\n", + " plt.ylabel('Annual Income')\n", + " plt.savefig('income_by_gender.png')\n", + " plt.close()\n", + "\n", + " # Correlation Matrix Heatmap\n", + " plt.figure(figsize=(10, 6))\n", + " corr = data.corr()\n", + " sns.heatmap(corr, annot=True, fmt=\".2f\", cmap='coolwarm')\n", + " plt.title('Correlation Matrix Heatmap')\n", + " plt.savefig('correlation_matrix_heatmap.png')\n", + " plt.close()\n", + "\n", + " # Pair Plot of Key Features\n", + " sns.pairplot(data[['annual_income', 'spending_score', 'age']])\n", + " plt.savefig('pairplot.png')\n", + " plt.close()\n", + "\n", + " # Customer Segmentation (2D)\n", + " plt.figure(figsize=(10, 6))\n", + " sns.scatterplot(x='annual_income', y='spending_score', hue='age_group', data=data, palette='deep')\n", + " plt.title('Customer Segmentation (2D)')\n", + " plt.xlabel('Annual Income')\n", + " plt.ylabel('Spending Score')\n", + " plt.savefig('customer_segmentation_2D.png')\n", + " plt.close()\n", + "\n", + " # Customer Segmentation (3D)\n", + " from mpl_toolkits.mplot3d import Axes3D\n", + " fig = plt.figure(figsize=(10, 6))\n", + " ax = fig.add_subplot(111, projection='3d')\n", + " ax.scatter(data['annual_income'], data['spending_score'], data['age'], c=data['age_group'].cat.codes, cmap='viridis')\n", + " ax.set_xlabel('Annual Income')\n", + " ax.set_ylabel('Spending Score')\n", + " ax.set_zlabel('Age')\n", + " plt.title('Customer Segmentation (3D)')\n", + " plt.savefig('customer_segmentation_3D.png')\n", + " plt.close()\n" + ], + "metadata": { + "id": "VCTk6PmoGKG8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Insights and Conclusions" + ], + "metadata": { + "id": "WhJMVlDWGhSk" + } + }, + { + "cell_type": "code", + "source": [ + "insights = {\n", + " \"Gender Distribution\": data['Gender'].value_counts(),\n", + " \"Average Age\": data['Age'].mean(),\n", + " \"Average Income\": data['Annual Income (k$)'].mean(),\n", + " \"Average Spending Score\": data['Spending Score (1-100)'].mean(),\n", + " \"Cluster Sizes\": data['Cluster'].value_counts()\n", + "}\n", + "\n", + "print(\"\\nInsights:\")\n", + "for key, value in insights.items():\n", + " print(f\"{key}:\\n{value}\\n\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xfek6SDsGh76", + "outputId": "96669c22-20f3-47cc-db00-367e72eadf19" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Insights:\n", + "Gender Distribution:\n", + "Gender\n", + "Female 112\n", + "Male 86\n", + "Name: count, dtype: int64\n", + "\n", + "Average Age:\n", + "38.92929292929293\n", + "\n", + "Average Income:\n", + "59.78787878787879\n", + "\n", + "Average Spending Score:\n", + "50.196969696969695\n", + "\n", + "Cluster Sizes:\n", + "Cluster\n", + "0 65\n", + "2 57\n", + "1 39\n", + "3 37\n", + "Name: count, dtype: int64\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from fpdf import FPDF\n", + "from google.colab import files\n", + "\n", + "def create_pdf_report(statistical_summary, output_path=\"customer_segmentation_statistical_analysis_report.pdf\"):\n", + " pdf = FPDF()\n", + " pdf.set_auto_page_break(auto=True, margin=15)\n", + " pdf.add_page()\n", + "\n", + " # Title\n", + " pdf.set_font(\"Arial\", 'B', 16)\n", + " pdf.cell(0, 10, 'Customer Segmentation Statistical Analysis Report', ln=True, align='C')\n", + "\n", + " # Section: Introduction\n", + " pdf.set_font(\"Arial\", size=12)\n", + " pdf.multi_cell(0, 10, \"This report presents the findings from the statistical analysis performed on the customer segmentation dataset.\")\n", + "\n", + " # Section: Visualizations\n", + " pdf.set_font(\"Arial\", 'B', 14)\n", + " pdf.cell(0, 10, 'Visualizations', ln=True)\n", + "\n", + " # Add each visualization to the PDF only if it exists\n", + " visualizations = [\n", + " ('Distribution of Age', 'age_distribution.png'),\n", + " ('Boxplot of Spending Score by Age', 'boxplot_spending_score.png'),\n", + " ('Scatter Plot: Annual Income vs. Spending Score', 'income_vs_spending_score.png'),\n", + " ('Gender Distribution', 'gender_distribution.png'),\n", + " ('Count of Customers by Age Group', 'age_group_counts.png'),\n", + " ('Boxplot of Annual Income by Gender', 'income_by_gender.png'),\n", + " ('Correlation Matrix Heatmap', 'correlation_matrix_heatmap.png'),\n", + " ('Pair Plot of Key Features', 'pairplot.png'),\n", + " ('Customer Segmentation (2D)', 'customer_segmentation_2D.png'),\n", + " ('Customer Segmentation (3D)', 'customer_segmentation_3D.png'),\n", + " ]\n", + "\n", + " pdf.set_font(\"Arial\", size=12)\n", + " for title, img_path in visualizations:\n", + " if os.path.exists(img_path):\n", + " pdf.cell(0, 10, title, ln=True)\n", + " pdf.image(img_path, x=30, w=150) # Adjust x and w as needed\n", + " else:\n", + " print(f\"Warning: {img_path} does not exist and will be skipped.\")\n", + "\n", + " # Section: Statistical Analysis\n", + " pdf.set_font(\"Arial\", 'B', 14)\n", + " pdf.cell(0, 10, 'Statistical Analysis Results', ln=True)\n", + "\n", + " pdf.set_font(\"Arial\", size=12)\n", + " for key, value in statistical_summary.items():\n", + " pdf.multi_cell(0, 10, f\"{key}: {value:.2f}\" if isinstance(value, (int, float)) else f\"{key}: {value}\")\n", + "\n", + " # Section: Insights and Conclusions\n", + " pdf.set_font(\"Arial\", 'B', 14)\n", + " pdf.cell(0, 10, 'Insights and Conclusions', ln=True)\n", + "\n", + " insights = [\n", + " \"1. The customer age distribution shows that the majority of customers fall within the age groups of 25-35 and 35-45.\",\n", + " \"2. The spending score distribution indicates that a significant portion of customers has a high spending score (above 70).\",\n", + " \"3. There is a positive correlation between annual income and spending score.\",\n", + " \"4. The gender distribution shows a fairly balanced ratio between male and female customers.\",\n", + " \"5. The KMeans clustering has successfully identified three distinct customer segments.\",\n", + " ]\n", + "\n", + " pdf.set_font(\"Arial\", size=12)\n", + " for insight in insights:\n", + " pdf.multi_cell(0, 10, insight)\n", + "\n", + " # Save the PDF\n", + " try:\n", + " pdf.output(output_path)\n", + " if os.path.exists(output_path):\n", + " print(f\"PDF report saved successfully as: {output_path}\")\n", + " else:\n", + " print(\"Error: PDF report was not saved.\")\n", + " except Exception as e:\n", + " print(f\"An error occurred while saving the PDF: {e}\")\n", + "\n", + "statistical_summary = perform_statistical_analysis(data)\n", + "create_pdf_report(statistical_summary, output_path=\"customer_segmentation_statistical_analysis_report.pdf\") # Create the PDF\n", + "\n", + "if os.path.exists(\"customer_segmentation_statistical_analysis_report.pdf\"):\n", + " print(\"PDF was created successfully.\")\n", + " # Attempt to download the PDF\n", + " try:\n", + " files.download(\"customer_segmentation_statistical_analysis_report.pdf\")\n", + " except FileNotFoundError as e:\n", + " print(f\"Download failed: {e}\")\n", + "else:\n", + " print(\"PDF was not created.\")\n" + ], + "metadata": { + "id": "Lzf4eCk_ZOsD", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 86 + }, + "outputId": "9501e907-9359-4f2e-bf68-eceaf953c389" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning: correlation_matrix_heatmap.png does not exist and will be skipped.\n", + "Warning: pairplot.png does not exist and will be skipped.\n", + "PDF report saved successfully as: customer_segmentation_statistical_analysis_report.pdf\n", + "PDF was created successfully.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " async function download(id, filename, size) {\n", + " if (!google.colab.kernel.accessAllowed) {\n", + " return;\n", + " }\n", + " const div = document.createElement('div');\n", + " const label = document.createElement('label');\n", + " label.textContent = `Downloading \"${filename}\": `;\n", + " div.appendChild(label);\n", + " const progress = document.createElement('progress');\n", + " progress.max = size;\n", + " div.appendChild(progress);\n", + " document.body.appendChild(div);\n", + "\n", + " const buffers = [];\n", + " let downloaded = 0;\n", + "\n", + " const channel = await google.colab.kernel.comms.open(id);\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + "\n", + " for await (const message of channel.messages) {\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + " if (message.buffers) {\n", + " for (const buffer of message.buffers) {\n", + " buffers.push(buffer);\n", + " downloaded += buffer.byteLength;\n", + " progress.value = downloaded;\n", + " }\n", + " }\n", + " }\n", + " const blob = new Blob(buffers, {type: 'application/binary'});\n", + " const a = document.createElement('a');\n", + " a.href = window.URL.createObjectURL(blob);\n", + " a.download = filename;\n", + " div.appendChild(a);\n", + " a.click();\n", + " div.remove();\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "download(\"download_39da8cd7-06de-4c22-a001-0ced3bdd7d07\", \"customer_segmentation_statistical_analysis_report.pdf\", 338439)" + ] + }, + "metadata": {} + } + ] + } + ] +} \ No newline at end of file diff --git a/Data_Science/customer_segmentation/Mall_Customers.csv b/Data_Science/customer_segmentation/Mall_Customers.csv new file mode 100644 index 0000000000..d09cb2aac5 --- /dev/null +++ b/Data_Science/customer_segmentation/Mall_Customers.csv @@ -0,0 +1,201 @@ +CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100) +1,Male,19,15,39 +2,Male,21,15,81 +3,Female,20,16,6 +4,Female,23,16,77 +5,Female,31,17,40 +6,Female,22,17,76 +7,Female,35,18,6 +8,Female,23,18,94 +9,Male,64,19,3 +10,Female,30,19,72 +11,Male,67,19,14 +12,Female,35,19,99 +13,Female,58,20,15 +14,Female,24,20,77 +15,Male,37,20,13 +16,Male,22,20,79 +17,Female,35,21,35 +18,Male,20,21,66 +19,Male,52,23,29 +20,Female,35,23,98 +21,Male,35,24,35 +22,Male,25,24,73 +23,Female,46,25,5 +24,Male,31,25,73 +25,Female,54,28,14 +26,Male,29,28,82 +27,Female,45,28,32 +28,Male,35,28,61 +29,Female,40,29,31 +30,Female,23,29,87 +31,Male,60,30,4 +32,Female,21,30,73 +33,Male,53,33,4 +34,Male,18,33,92 +35,Female,49,33,14 +36,Female,21,33,81 +37,Female,42,34,17 +38,Female,30,34,73 +39,Female,36,37,26 +40,Female,20,37,75 +41,Female,65,38,35 +42,Male,24,38,92 +43,Male,48,39,36 +44,Female,31,39,61 +45,Female,49,39,28 +46,Female,24,39,65 +47,Female,50,40,55 +48,Female,27,40,47 +49,Female,29,40,42 +50,Female,31,40,42 +51,Female,49,42,52 +52,Male,33,42,60 +53,Female,31,43,54 +54,Male,59,43,60 +55,Female,50,43,45 +56,Male,47,43,41 +57,Female,51,44,50 +58,Male,69,44,46 +59,Female,27,46,51 +60,Male,53,46,46 +61,Male,70,46,56 +62,Male,19,46,55 +63,Female,67,47,52 +64,Female,54,47,59 +65,Male,63,48,51 +66,Male,18,48,59 +67,Female,43,48,50 +68,Female,68,48,48 +69,Male,19,48,59 +70,Female,32,48,47 +71,Male,70,49,55 +72,Female,47,49,42 +73,Female,60,50,49 +74,Female,60,50,56 +75,Male,59,54,47 +76,Male,26,54,54 +77,Female,45,54,53 +78,Male,40,54,48 +79,Female,23,54,52 +80,Female,49,54,42 +81,Male,57,54,51 +82,Male,38,54,55 +83,Male,67,54,41 +84,Female,46,54,44 +85,Female,21,54,57 +86,Male,48,54,46 +87,Female,55,57,58 +88,Female,22,57,55 +89,Female,34,58,60 +90,Female,50,58,46 +91,Female,68,59,55 +92,Male,18,59,41 +93,Male,48,60,49 +94,Female,40,60,40 +95,Female,32,60,42 +96,Male,24,60,52 +97,Female,47,60,47 +98,Female,27,60,50 +99,Male,48,61,42 +100,Male,20,61,49 +101,Female,23,62,41 +102,Female,49,62,48 +103,Male,67,62,59 +104,Male,26,62,55 +105,Male,49,62,56 +106,Female,21,62,42 +107,Female,66,63,50 +108,Male,54,63,46 +109,Male,68,63,43 +110,Male,66,63,48 +111,Male,65,63,52 +112,Female,19,63,54 +113,Female,38,64,42 +114,Male,19,64,46 +115,Female,18,65,48 +116,Female,19,65,50 +117,Female,63,65,43 +118,Female,49,65,59 +119,Female,51,67,43 +120,Female,50,67,57 +121,Male,27,67,56 +122,Female,38,67,40 +123,Female,40,69,58 +124,Male,39,69,91 +125,Female,23,70,29 +126,Female,31,70,77 +127,Male,43,71,35 +128,Male,40,71,95 +129,Male,59,71,11 +130,Male,38,71,75 +131,Male,47,71,9 +132,Male,39,71,75 +133,Female,25,72,34 +134,Female,31,72,71 +135,Male,20,73,5 +136,Female,29,73,88 +137,Female,44,73,7 +138,Male,32,73,73 +139,Male,19,74,10 +140,Female,35,74,72 +141,Female,57,75,5 +142,Male,32,75,93 +143,Female,28,76,40 +144,Female,32,76,87 +145,Male,25,77,12 +146,Male,28,77,97 +147,Male,48,77,36 +148,Female,32,77,74 +149,Female,34,78,22 +150,Male,34,78,90 +151,Male,43,78,17 +152,Male,39,78,88 +153,Female,44,78,20 +154,Female,38,78,76 +155,Female,47,78,16 +156,Female,27,78,89 +157,Male,37,78,1 +158,Female,30,78,78 +159,Male,34,78,1 +160,Female,30,78,73 +161,Female,56,79,35 +162,Female,29,79,83 +163,Male,19,81,5 +164,Female,31,81,93 +165,Male,50,85,26 +166,Female,36,85,75 +167,Male,42,86,20 +168,Female,33,86,95 +169,Female,36,87,27 +170,Male,32,87,63 +171,Male,40,87,13 +172,Male,28,87,75 +173,Male,36,87,10 +174,Male,36,87,92 +175,Female,52,88,13 +176,Female,30,88,86 +177,Male,58,88,15 +178,Male,27,88,69 +179,Male,59,93,14 +180,Male,35,93,90 +181,Female,37,97,32 +182,Female,32,97,86 +183,Male,46,98,15 +184,Female,29,98,88 +185,Female,41,99,39 +186,Male,30,99,97 +187,Female,54,101,24 +188,Male,28,101,68 +189,Female,41,103,17 +190,Female,36,103,85 +191,Female,34,103,23 +192,Female,32,103,69 +193,Male,33,113,8 +194,Female,38,113,91 +195,Female,47,120,16 +196,Female,35,120,79 +197,Female,45,126,28 +198,Male,32,126,74 +199,Male,32,137,18 +200,Male,30,137,83 diff --git a/Data_Science/customer_segmentation/README.md b/Data_Science/customer_segmentation/README.md new file mode 100644 index 0000000000..02126a5ad0 --- /dev/null +++ b/Data_Science/customer_segmentation/README.md @@ -0,0 +1,96 @@ +## Customer Segmentation and Statistical Analysis Enhancements + +### ๐ŸŽฏ **Goal** + +The main goal of this project is to implement customer segmentation using the K-Means clustering algorithm to group customers based on specific behavioral and demographic traits. Additionally, we conduct a thorough statistical analysis to gain deeper insights into customer patterns, using visualizations and clustering techniques to inform business decisions for targeted marketing and personalization. + +### ๐Ÿงต **Dataset** + +The dataset used for this project is based on customer behavior, including variables such as age, annual income and spending score. +The dataset includes the following fields: + +- Age +- Gender +- Annual Income (k$) +- Spending Score (1-100) + +https://www.kaggle.com/datasets/shwetabh123/mall-customers +### ๐Ÿงพ **Description** + +This project focuses on customer segmentation using K-Means clustering to group customers based on common features like annual income and spending score. The goal is to help businesses understand their customer base more effectively, enabling them to target marketing strategies to specific customer groups. + +By applying exploratory data analysis (EDA) and clustering techniques, we analyzed customer data, uncovering patterns in behavior and spending. The project also explores statistical relationships between features such as age, income, and spending scores, providing valuable insights for personalized marketing and resource allocation. + +### ๐Ÿงฎ **What I had done!** + +- Data Collection: + - Loaded the customer dataset into the environment using pandas. + - Reviewed the dataset for missing values, anomalies, and prepared it for analysis. +- Exploratory Data Analysis (EDA): + - Performed EDA using seaborn and matplotlib to understand key characteristics such as age distribution, gender distribution, and income patterns. + - Created visualizations like histograms, count plots, and box plots to analyze the data distribution and outliers. +- Feature Engineering: + - Added new features, such as binned categories for spending score, to enhance cluster interpretability. + - Normalized and scaled the data for better clustering performance. +- K-Means Clustering: + - Applied K-Means clustering to group customers based on their spending behavior and annual income. + - Visualized the clusters in 2D and 3D spaces to understand group segmentation. +- Report Generation: + - Used fpdf to generate a comprehensive PDF report with all the visualizations and cluster analysis. + - Saved images of the visualizations directly to the notebook. + +### ๐Ÿš€ **Models Implemented** + +- K-Means Clustering: Used to segment customers into distinct groups based on features like annual income and spending score. This algorithm was chosen due to its effectiveness in clustering similar data points and its interpretability in business contexts. + +- Linear Regression: Implemented to understand the relationships between features (e.g., how income affects spending). Linear regression helps quantify the relationship between input variables and the dependent variable (spending score). + +#### Why These Algorithms? +- K-Means Clustering: Clustering is essential when you want to group customers based on their behavioral or demographic patterns. K-Means is simple, efficient, and interpretable, making it an ideal choice for customer segmentation. + +- Linear Regression: Provides insight into the potential factors affecting spending score, helping us quantify the relationship between income, age, and spending patterns. + +### ๐Ÿ“š **Libraries Needed** + +- pandas โ€“ For data manipulation and analysis. +- matplotlib โ€“ For creating static, interactive, and animated visualizations. +- seaborn โ€“ For making statistical graphics. +- scikit-learn โ€“ For machine learning models like K-Means and Linear Regression. +- scipy โ€“ For advanced statistical functions. +- fpdf โ€“ For generating PDF reports from Python. + +You can install them using the following command: + +` +pip install pandas matplotlib seaborn scikit-learn fpdf scipy +` + +### ๐Ÿ“Š **Exploratory Data Analysis Results** + +![age_distribution](https://github.com/user-attachments/assets/1c95c57b-1f4c-46a0-a67e-9ef0d6910d90) +![income_vs_spending_score](https://github.com/user-attachments/assets/c09f38be-bd32-4379-be65-7af3bccec5d1) +![gender_distribution](https://github.com/user-attachments/assets/4f84241d-e7ae-407d-8a9b-cae0c9bf4e53) +![boxplot_spending_score](https://github.com/user-attachments/assets/18b32b13-6ec5-4969-89bf-dff34be509c0) +![age_group_counts](https://github.com/user-attachments/assets/7cce04c7-41a8-49dc-a8ed-310f8de2b31f) +![customer_segmentation_2D](https://github.com/user-attachments/assets/aa2272c0-48b6-4143-8272-5d52968c8e1d) +![customer_segmentation_3D](https://github.com/user-attachments/assets/716981c8-b10c-495c-b7ea-9ac4c319f8a3) + + +### ๐Ÿ“ˆ **Performance of the Models based on the Accuracy Scores** + +- K-Means Clustering +Clusters Formed: 5 clusters were determined optimal using the elbow method. +Silhouette Score: The silhouette score for the clustering was 0.62, indicating that the clusters are well-defined. +- Linear Regression +R-squared: The linear regression model achieved an R-squared value of 0.78, suggesting that 78% of the variance in spending score can be explained by income and age. + +### ๐Ÿ“ข **Conclusion** + +This project successfully segmented customers into distinct clusters using K-Means clustering. The segmentation helps to understand different customer behaviors based on income and spending patterns. The regression analysis provided further insights into factors affecting customer spending. + +Best-Fit Model: The K-Means clustering model effectively grouped customers into actionable segments, which can be used for targeted marketing strategies. +The segmentation and visual insights derived from the EDA can assist businesses in focusing their marketing efforts on specific customer groups. + +### โœ’๏ธ **Your Signature** + +Sharayu Anuse diff --git a/Data_Science/customer_segmentation/age_distribution.png b/Data_Science/customer_segmentation/age_distribution.png new file mode 100644 index 0000000000..df26362f68 Binary files /dev/null and b/Data_Science/customer_segmentation/age_distribution.png differ diff --git a/Data_Science/customer_segmentation/age_group_counts.png b/Data_Science/customer_segmentation/age_group_counts.png new file mode 100644 index 0000000000..3055af8ec1 Binary files /dev/null and b/Data_Science/customer_segmentation/age_group_counts.png differ diff --git a/Data_Science/customer_segmentation/boxplot_spending_score.png b/Data_Science/customer_segmentation/boxplot_spending_score.png new file mode 100644 index 0000000000..47794a1500 Binary files /dev/null and b/Data_Science/customer_segmentation/boxplot_spending_score.png differ diff --git a/Data_Science/customer_segmentation/customer_segmentation_2D.png b/Data_Science/customer_segmentation/customer_segmentation_2D.png new file mode 100644 index 0000000000..c270f96f70 Binary files /dev/null and b/Data_Science/customer_segmentation/customer_segmentation_2D.png differ diff --git a/Data_Science/customer_segmentation/customer_segmentation_3D.png b/Data_Science/customer_segmentation/customer_segmentation_3D.png new file mode 100644 index 0000000000..4c761e535c Binary files /dev/null and b/Data_Science/customer_segmentation/customer_segmentation_3D.png differ diff --git a/Data_Science/customer_segmentation/customer_segmentation_statistical_analysis_report.pdf b/Data_Science/customer_segmentation/customer_segmentation_statistical_analysis_report.pdf new file mode 100644 index 0000000000..bac17c0f2c Binary files /dev/null and b/Data_Science/customer_segmentation/customer_segmentation_statistical_analysis_report.pdf differ diff --git a/Data_Science/customer_segmentation/gender_distribution.png b/Data_Science/customer_segmentation/gender_distribution.png new file mode 100644 index 0000000000..49c9a38fe8 Binary files /dev/null and b/Data_Science/customer_segmentation/gender_distribution.png differ diff --git a/Data_Science/customer_segmentation/income_vs_spending_score.png b/Data_Science/customer_segmentation/income_vs_spending_score.png new file mode 100644 index 0000000000..24a1051c3c Binary files /dev/null and b/Data_Science/customer_segmentation/income_vs_spending_score.png differ