-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_analysis_functions.py
241 lines (173 loc) · 9.56 KB
/
data_analysis_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
''' This file contains all the functions that are used in the main file.
Keeping them here reduces clutter in the main file and isolates the core functionality of the application in a separate file.
'''
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px
# Function to load the csv data to a dataframe
def load_data(file):
    """Read CSV content from ``file`` into a pandas DataFrame.

    Args:
        file: Anything ``pandas.read_csv`` accepts as its first argument
            (a path or a file-like object).

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    dataframe = pd.read_csv(file)
    return dataframe
# Function to find categorical and numerical columns/variables in dataset
def categorical_numerical(df, max_categories=30):
    """Split the columns of ``df`` into numerical and categorical groups.

    A column is reported as numerical only when its dtype is numeric
    (booleans excluded) and it holds more than ``max_categories`` distinct
    values, where missing values count as one distinct value. Every other
    column is reported as categorical, including string, datetime and
    category columns regardless of how many distinct values they hold.

    Args:
        df: DataFrame whose columns are classified.
        max_categories: Largest number of distinct values for which a
            numeric column is still treated as categorical. Defaults to 30.

    Returns:
        Tuple ``(num_columns, cat_columns)`` of lists of column labels, each
        in the order the columns appear in ``df``. Labels are returned
        exactly as they appear in ``df.columns`` so that every returned
        label can be used to index ``df``.
    """
    num_columns, cat_columns = [], []
    for col in df.columns:
        series = df[col]
        is_numeric = (
            pd.api.types.is_numeric_dtype(series.dtype)
            and not pd.api.types.is_bool_dtype(series.dtype)
        )
        # The distinct-value count is only needed for numeric columns, so the
        # short-circuit skips it for everything else.
        if is_numeric and series.nunique(dropna=False) > max_categories:
            num_columns.append(col)
        else:
            cat_columns.append(col)
    return num_columns, cat_columns
# Function to display dataset overview
def display_dataset_overview(df, cat_columns, num_columns):
    """Render a preview of ``df`` followed by basic dataset facts.

    Shows the first rows of the frame (the row count is chosen with a
    slider that defaults to at most 20), then the number of rows, columns
    and duplicated rows, and the categorical / numerical column lists.

    Args:
        df: DataFrame to summarise.
        cat_columns: Labels of the categorical columns.
        num_columns: Labels of the numerical columns.
    """
    total_rows = len(df)
    if total_rows > 1:
        display_rows = st.slider("Display Rows", 1, total_rows, min(total_rows, 20))
    else:
        # With fewer than two rows there is nothing to choose, and for an
        # empty frame the slider bounds (min 1, max 0) would be invalid.
        display_rows = total_rows
    st.write(df.head(display_rows))

    st.subheader("2. Dataset Overview")
    st.write(f"**Rows:** {df.shape[0]}")
    st.write(f"**Columns:** {df.shape[1]}")
    # Rows removed by drop_duplicates() are exactly those flagged by duplicated().
    st.write(f"**Duplicates:** {int(df.duplicated().sum())}")
    st.write(f"**Categorical Columns:** {len(cat_columns)}")
    st.write(cat_columns)
    st.write(f"**Numerical Columns:** {len(num_columns)}")
    st.write(num_columns)
# Function to find the missing values in the dataset
def display_missing_values(df):
    """Render a per-column summary of missing values in ``df``.

    The summary lists, for every column with at least one missing value,
    the missing count and its percentage of ``len(df)``, ordered by count
    from highest to lowest. When no column has missing values an info
    message is shown instead.

    Args:
        df: DataFrame to inspect.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': (null_counts / len(df)) * 100,
    })
    has_missing = summary['Missing Count'] > 0
    summary = summary[has_missing].sort_values(by='Missing Count', ascending=False)

    if summary.empty:
        st.info("No Missing Value present in the Dataset")
        return

    st.write("Missing Data Summary:")
    st.write(summary)
# Function to display basic statistics and visualizations about the dataset
def display_statistics_visualization(df, cat_columns, num_columns):
    """Render summary statistics and per-category bar charts for ``df``.

    Numerical columns are summarised with ``DataFrame.describe``. For the
    categorical columns the user picks how many to preselect and which to
    show; each selected column gets a bar chart of its value counts and the
    same counts as a two-column ``Value`` / ``Count`` table.

    Args:
        df: DataFrame to summarise.
        cat_columns: Labels of the categorical columns.
        num_columns: Labels of the numerical columns.
    """
    st.write("Summary Statistics for Numerical Columns")
    if len(num_columns) != 0:
        st.write(df[num_columns].describe())
    else:
        st.info("The dataset does not have any numerical columns")

    st.write("Statistics for Categorical Columns")
    if len(cat_columns) == 0:
        st.info("The dataset does not have any categorical columns")
        return

    num_cat_columns = st.number_input("Select the number of categorical columns to visualize:", min_value=1, max_value=len(cat_columns))
    selected_cat_columns = st.multiselect("Select the Categorical Columns for bar chart", cat_columns, cat_columns[:num_cat_columns])
    for column in selected_cat_columns:
        st.write(f"**{column}**")
        # Computed once and reused for both the chart and the table.
        value_counts = df[column].value_counts()
        st.bar_chart(value_counts)
        # display the value count in tabular format
        st.write(f"Value Count for {column}")
        # Naming the index and the values explicitly avoids a clash between
        # the column's own label and the names reset_index() would generate.
        value_counts_table = value_counts.rename_axis('Value').reset_index(name='Count')
        st.write(value_counts_table)
# Function to display the datatypes
def display_data_types(df):
    """Render a one-column table listing the dtype of every column in ``df``.

    Args:
        df: DataFrame whose column dtypes are displayed.
    """
    dtype_table = df.dtypes.to_frame(name='Data Type')
    st.write(dtype_table)
# Function to search for a particular column or particular datatype in the dataset
def search_column(df):
    """Render ``df`` restricted by a column-name search and a dtype filter.

    The search text is matched case-insensitively as a literal substring of
    each column label. The dtype filter offers ``'All'`` plus every distinct
    dtype present in ``df``. Both filters are optional and combine.

    Args:
        df: DataFrame whose columns are searched and filtered.
    """
    search_query = st.text_input("Search for a column:")
    selected_data_type = st.selectbox("Filter by Data Type:", ['All'] + df.dtypes.unique().tolist())

    # Neither filter mutates its input (each returns a new frame), so no
    # defensive copy of the full DataFrame is needed.
    filtered_df = df

    # Filter by search query
    if search_query:
        # regex=False: user text such as "(" or "[" is a literal substring,
        # not a pattern. astype(str) lets non-string labels be searched too.
        name_matches = filtered_df.columns.astype(str).str.contains(search_query, case=False, regex=False)
        filtered_df = filtered_df.loc[:, name_matches]

    # Filter by data type. The other options are dtype objects, so only a
    # plain string can be the 'All' sentinel; checking the type first avoids
    # comparing a dtype object against an arbitrary string.
    show_all_types = isinstance(selected_data_type, str) and selected_data_type == 'All'
    if not show_all_types:
        filtered_df = filtered_df.select_dtypes(include=[selected_data_type])

    # Display the filtered DataFrame
    st.write(filtered_df)
## FUNCTIONS FOR TAB2: Data Exploration and Visualization
def display_individual_feature_distribution(df, num_columns):
    """Render summary statistics and a distribution plot for one feature.

    The user picks a numerical feature and a plot type. Count, missing
    count, mean, standard deviation, minimum and maximum of the feature are
    written out, followed by the chosen Plotly chart. When ``num_columns``
    is empty only an info message is shown.

    Args:
        df: DataFrame holding the data.
        num_columns: Labels of the numerical columns offered for selection.
    """
    st.subheader("Analyze Individual Feature Distribution")
    st.markdown("Here, you can explore individual numerical features, visualize their distributions, and analyze relationships between features.")
    if len(num_columns) == 0:
        st.info("The dataset does not have any numerical columns")
        return

    st.write("#### Understanding Numerical Features")
    feature = st.selectbox(label="Select Numerical Feature", options=num_columns, index=0)

    # Describe only the selected column instead of the whole frame.
    feature_stats = df[feature].describe()

    # Display summary statistics
    null_count = df[feature].isnull().sum()
    st.write("Count: ", feature_stats['count'])
    st.write("Missing Count: ", null_count)
    st.write("Mean: ", feature_stats['mean'])
    st.write("Standard Deviation: ", feature_stats['std'])
    st.write("Minimum: ", feature_stats['min'])
    st.write("Maximum: ", feature_stats['max'])

    # create plots for distribution
    st.subheader("Distribution Plots")
    plot_type = st.selectbox(label="Select Plot Type", options=['Histogram', 'Scatter Plot', 'Density Plot', 'Box Plot'])
    if plot_type == 'Histogram':
        fig = px.histogram(df, x=feature, title=f'Histogram of {feature}')
    elif plot_type == 'Scatter Plot':
        # Plot values against row position; a feature plotted against itself
        # is always a straight diagonal and shows nothing about the data.
        fig = px.scatter(df, x=df.index, y=feature, title=f'Scatter plot of {feature}')
    elif plot_type == 'Density Plot':
        # density_contour is a two-variable plot and needs both x and y; for
        # a single feature a violin shows the estimated density instead.
        fig = px.violin(df, x=feature, title=f'Density plot of {feature}')
    elif plot_type == 'Box Plot':
        fig = px.box(df, y=feature, title=f'Box plot of {feature}')
    st.plotly_chart(fig, use_container_width=True)
def display_scatter_plot_of_two_numeric_features(df, num_columns):
    """Render a scatter plot of two user-selected numerical features.

    The X axis defaults to the first entry of ``num_columns`` and the Y axis
    to the second entry when there is one, otherwise to the first. When
    ``num_columns`` is empty only an info message is shown.

    Args:
        df: DataFrame holding the data.
        num_columns: Labels of the numerical columns offered for selection.
    """
    if len(num_columns) == 0:
        st.info("The dataset does not have any numerical columns")
        return

    x_feature = st.selectbox(label="Select X-Axis Feature", options=num_columns, index=0)
    # A default index of 1 is out of range when only one column is offered.
    default_y_index = 1 if len(num_columns) > 1 else 0
    y_feature = st.selectbox(label="Select Y-Axis Feature", options=num_columns, index=default_y_index)
    scatter_fig = px.scatter(df, x=x_feature, y=y_feature, title=f'Scatter Plot: {x_feature} vs {y_feature}')
    st.plotly_chart(scatter_fig, use_container_width=True)
def categorical_variable_analysis(df, cat_columns):
    """Render a chart or frequency table for one categorical feature.

    The user picks a categorical feature and one of: bar chart, pie chart,
    stacked bar chart (coloured by a second categorical feature) or a plain
    frequency count. When ``cat_columns`` is empty only an info message is
    shown.

    Args:
        df: DataFrame holding the data.
        cat_columns: Labels of the categorical columns offered for selection.
    """
    # Without this guard the selectbox has no options and the code below
    # would run with no feature selected.
    if len(cat_columns) == 0:
        st.info("The dataset does not have any categorical columns")
        return

    categorical_feature = st.selectbox(label="Select Categorical Feature", options=cat_columns)
    categorical_plot_type = st.selectbox(label="Select Plot Type", options=["Bar Chart", "Pie Chart", "Stacked Bar Chart", "Frequency Count"])

    # The frequency count is a table, not a Plotly figure, so handle it first.
    if categorical_plot_type == "Frequency Count":
        cat_value_counts = df[categorical_feature].value_counts()
        st.write(f"Frequency Count for {categorical_feature}: ")
        st.write(cat_value_counts)
        return

    fig = None
    if categorical_plot_type == "Bar Chart":
        fig = px.bar(df, x=categorical_feature, title=f"Bar Chart of {categorical_feature}")
    elif categorical_plot_type == "Pie Chart":
        fig = px.pie(df, names=categorical_feature, title=f"Pie Chart of {categorical_feature}")
    elif categorical_plot_type == "Stacked Bar Chart":
        st.write("Select a second categorical feature for stacking")
        second_categorical_feature = st.selectbox(label="Select Second Categorical Feature", options=cat_columns)
        fig = px.bar(df, x=categorical_feature, color=second_categorical_feature, title=f"Stacked Bar Chart of {categorical_feature} by {second_categorical_feature}")

    if fig is not None:
        st.plotly_chart(fig, use_container_width=True)
def feature_exploration_numerical_variables(df, num_columns):
    """Render multi-feature relationship plots for selected numeric columns.

    The user selects features (the first two of ``num_columns`` by default).
    With at least two selected, three buttons generate a Plotly scatter plot
    matrix, a seaborn pair plot and a correlation heatmap. With fewer than
    two a warning is shown instead.

    Args:
        df: DataFrame holding the data.
        num_columns: Labels of the numerical columns offered for selection.
    """
    selected_features = st.multiselect("Select Features for Exploration:", num_columns, default=num_columns[:2], key="feature_exploration")
    if len(selected_features) < 2:
        st.warning("Please select at least two numerical features for exploration.")
        return

    st.subheader("Explore Relationships Between Features")

    # Scatter Plot Matrix
    if st.button("Generate Scatter Plot Matrix"):
        scatter_matrix_fig = px.scatter_matrix(df, dimensions=selected_features, title="Scatter Plot Matrix")
        st.plotly_chart(scatter_matrix_fig, use_container_width=True)

    # Pair Plot
    if st.button("Generate Pair Plot"):
        pair_grid = sns.pairplot(df[selected_features])
        grid_figure = getattr(pair_grid, "figure", None)
        if grid_figure is None:
            # NOTE(review): older seaborn releases appear to expose the
            # figure only as `.fig` - confirm against the pinned version.
            grid_figure = pair_grid.fig
        st.pyplot(grid_figure)
        # pyplot keeps every figure it creates until it is closed; close it
        # so figures do not pile up across reruns.
        plt.close(grid_figure)

    # Correlation Heatmap
    if st.button("Generate Correlation Heatmap"):
        correlation_matrix = df[selected_features].corr()
        # Draw on an explicit figure/axes rather than pyplot's global
        # "current figure", and pass that figure (not the plt module) on.
        heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=heatmap_ax)
        heatmap_ax.set_title("Correlation Heatmap")
        st.pyplot(heatmap_fig)
        plt.close(heatmap_fig)
def categorical_numerical_variable_analysis(df, cat_columns, num_columns):
    """Render the mean of a numerical feature per category as a bar chart.

    The user picks one categorical and one numerical feature; the data is
    grouped by the categorical feature and the mean of the numerical feature
    in each group is plotted. When either column list is empty only an info
    message is shown.

    Args:
        df: DataFrame holding the data.
        cat_columns: Labels of the categorical columns offered for selection.
        num_columns: Labels of the numerical columns offered for selection.
    """
    # Both selections are required; with an empty option list the selectbox
    # yields no feature and the group-by below cannot run.
    if len(cat_columns) == 0:
        st.info("The dataset does not have any categorical columns")
        return
    if len(num_columns) == 0:
        st.info("The dataset does not have any numerical columns")
        return

    categorical_feature_1 = st.selectbox(label="Categorical Feature", options=cat_columns)
    numerical_feature_1 = st.selectbox(label="Numerical Feature", options=num_columns)

    # Group by the selected categorical column and calculate the mean of the numerical column
    group_data = df.groupby(categorical_feature_1)[numerical_feature_1].mean().reset_index()

    st.subheader("Relationship between Categorical and Numerical Variables")
    st.write(f"Mean {numerical_feature_1} by {categorical_feature_1}")

    # Create a bar chart
    fig = px.bar(group_data, x=categorical_feature_1, y=numerical_feature_1, title=f"{numerical_feature_1} by {categorical_feature_1}")
    st.plotly_chart(fig, use_container_width=True)