From a32455009ed9f10fa3b0803c5ca30c14f23d4a7b Mon Sep 17 00:00:00 2001
From: anuragithub
Date: Sun, 31 May 2020 00:04:55 +0530
Subject: [PATCH] Added dataframe support and correlation plot

---
 README.md              | 40 +++++++++++++++++++-
 pycorrcat/pycorrcat.py | 83 +++++++++++++++++++++++++++++++++++++++++-
 requirements.txt       |  5 ++-
 setup.py               |  2 +-
 4 files changed, 126 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 53d07ea..ddde04d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # PyCorr
-Simple package to calculate correlation between variables. Currently provides correlation between nominal variables. Correlation between mixed variables would be added soon.
\ No newline at end of file
+A simple library to calculate correlation between variables. Currently provides correlation between nominal variables.
+
+Based on statistical measures such as Cramér's V and Tschuprow's T, it lets you gauge the correlation between categorical variables. The ability to plot the correlation as a heatmap is also provided.
+
+## Usage example
+
+
+## Development setup
+
+## Release History
+
+* 0.1.3
+    * CHANGE: Remove `setDefaultXYZ()`
+    * ADD: Ability to pass dataframe to get correlation matrix
+    * ADD: Ability to plot the correlation in form of heatmap
+* 0.1.2
+    * Added as first release
+* 0.1.1
+    * Test release
+
+## Meta
+
+Anurag Kumar Mishra – [https://github.com/anuragithub](https://github.com/anuragithub) – anuragkm25@outlook.com
+
+Distributed under the GNU license. See ``LICENSE`` for more information.
+
+[https://github.com/MavericksDS/pycorr](https://github.com/MavericksDS/pycorr)
+
+
+## Contributing
+
+1. Fork it (<https://github.com/MavericksDS/pycorr/fork>)
+2. Create your feature branch (`git checkout -b feature/fooBar`)
+3. Commit your changes (`git commit -am 'Add some fooBar'`)
+4. Push to the branch (`git push origin feature/fooBar`)
+5. 
Create a new Pull Request
+
+
+[pip-url]: https://pypi.org/project/pycorr/
diff --git a/pycorrcat/pycorrcat.py b/pycorrcat/pycorrcat.py
index b69dd10..32ed9df 100644
--- a/pycorrcat/pycorrcat.py
+++ b/pycorrcat/pycorrcat.py
@@ -2,6 +2,9 @@
 import pandas as pd
 import scipy.stats as stats
 import numpy as np
+from typing import List
+import matplotlib as matplotlib
+import seaborn as sns
 
 def fillna(object):
     if isinstance(object, pd.Series):
@@ -62,4 +65,82 @@ def corr(x,
         return corr_coeff
     except:
         warnings.warn("Error calculating Cramer's V",RuntimeWarning)
-        return corr_coeff
\ No newline at end of file
+        return corr_coeff
+
+
+def corr_matrix(data: pd.DataFrame,
+                columns: List,
+                bias_correction: bool=True,
+                Tschuprow: bool=False) -> pd.DataFrame:
+    """
+    Calculates correlation for all the columns provided and returns a pandas-like correlation matrix.
+    The two measures supported are:
+        1. Cramer's V ( default )
+        2. Tschuprow's T
+
+    Parameters:
+    -----------
+    data : pandas DataFrame
+        A pandas DataFrame containing the categorical columns
+    columns : list
+        A list of categorical columns; names not present in data are ignored
+    bias_correction : Boolean, default = True
+        Passed through to corr() for every pair of columns
+    Tschuprow : Boolean, default = False
+        For choosing Tschuprow as measure
+    Returns:
+    --------
+    pandas dataframe object similar to pandas.DataFrame.corr()
+    Raises:
+    -------
+    ValueError
+        If columns is empty or none of its entries is a column of data
+    """
+    # 'columns' must be non-empty and name at least one column of 'data'
+    if len(columns) == 0 or not set(data.columns).intersection(columns):
+        raise ValueError("Check the columns list provided")
+
+    target_data = data.filter(columns)
+    cols = target_data.columns.values
+    matrix = np.zeros((len(cols), len(cols)))
+    for x, i in enumerate(cols):
+        for y, j in enumerate(cols):
+            matrix[x, y] = corr(target_data[i], target_data[j],
+                                bias_correction=bias_correction,
+                                Tschuprow=Tschuprow)
+
+    return pd.DataFrame(data=matrix, index=cols, columns=cols)
+
+def plot_corr(data: pd.DataFrame,
+              columns: List,
+              diagonal: bool = False,
+              bias_correction: bool=True,
+              Tschuprow: bool=False
+              ) -> matplotlib.axes.Axes:
+    """
+    Plots correlation matrix for all the columns provided and returns Matplotlib axes.
+    The two measures supported are:
+        1. Cramer's V ( default )
+        2. Tschuprow's T
+
+    Parameters:
+    -----------
+    data : pandas DataFrame
+        A pandas DataFrame containing the categorical columns
+    columns : list
+        A list of categorical columns
+    diagonal : Boolean, default = False
+        When true, masks the upper triangle (including the diagonal) of the heatmap
+    bias_correction : Boolean, default = True
+        Passed through to corr_matrix()
+    Tschuprow : Boolean, default = False
+        For choosing Tschuprow as measure
+    Returns:
+    --------
+    ax : matplotlib Axes
+        Axes object with the heatmap.
+    """
+    matrix = corr_matrix(data, columns, bias_correction=bias_correction, Tschuprow=Tschuprow)
+    # Mask by position, not by value: np.triu(matrix) would leave zero-valued cells unmasked
+    mask = np.triu(np.ones(matrix.shape, dtype=bool)) if diagonal else None
+    return sns.heatmap(matrix, annot=True, mask=mask)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9c7e740..62d452a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,12 @@
 bleach==3.1.5
 certifi==2020.4.5.1
 chardet==3.0.4
+cycler==0.10.0
 docutils==0.16
 idna==2.9
 keyring==21.2.1
+kiwisolver==1.2.0
+matplotlib==3.2.1
 numpy==1.18.4
 packaging==20.3
 pandas==1.0.3
@@ -17,9 +20,9 @@ readme-renderer==26.0
 requests==2.23.0
 requests-toolbelt==0.9.1
 scipy==1.4.1
+seaborn==0.10.1
 six==1.14.0
 tqdm==4.46.0
 twine==3.1.1
 urllib3==1.25.9
 webencodings==0.5.1
-wheel==0.33.1
diff --git a/setup.py b/setup.py
index f8f25de..9bec936 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='pycorr',
-    version='0.1.2',
+    version='0.1.3',
     description='Python package for calculating correlation amongst categorical variables',
     long_description_content_type='text/markdown',
     long_description=long_description,