-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrangle_mall.py
76 lines (65 loc) · 2.61 KB
/
wrangle_mall.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# wrangle_mall
# regular imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import env
# from our acquire.py:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Build the connection URL string for a MySQL database using the pymysql driver.

    db is the database name; user, host and password default to the values
    found in the local env module.

    NOTE(review): the credentials are interpolated verbatim. A user name or
    password containing characters such as '@' or '/' would presumably need
    URL-encoding before being placed in the URL -- confirm against env.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'
def get_mallcustomer_data():
    '''
    Read every row of the customers table from the mall_customers database
    and return it as a dataframe indexed by customer_id.
    '''
    query = 'SELECT * FROM customers;'
    url = get_connection('mall_customers')
    customers = pd.read_sql(query, url)
    return customers.set_index('customer_id')
# Acquire our dataframe.
# NOTE: this is a module-level statement, so the query inside
# get_mallcustomer_data() runs as soon as this file is executed or imported.
df = get_mallcustomer_data()
def _render_table(frame):
    '''Return frame as a markdown table, or as plain text when tabulate is unavailable.'''
    try:
        return frame.to_markdown()
    except ImportError:
        # DataFrame.to_markdown relies on the optional 'tabulate' package.
        return frame.to_string()


def _nulls_by_col(df):
    '''Return, for each column of df, the number and percentage of missing rows.'''
    missing = df.isnull()
    return pd.DataFrame({
        'num_rows_missing': missing.sum(),
        'percent_rows_missing': missing.mean() * 100,
    })


def _nulls_by_row(df):
    '''Return how many rows of df share each per-row count of missing columns.'''
    missing_per_row = df.isnull().sum(axis=1)
    summary = (
        missing_per_row.value_counts()
        .sort_index()
        .rename_axis('num_cols_missing')
        .reset_index(name='num_rows')
    )
    # max(..., 1) avoids dividing by zero for a dataframe without columns.
    n_cols = max(len(df.columns), 1)
    summary['percent_cols_missing'] = summary['num_cols_missing'] / n_cols * 100
    return summary


def summarize(df):
    '''
    summarize will take in a single argument (a pandas dataframe)
    and output to console various statistics on said dataframe, including:
    # .head()
    # .info()
    # .describe()
    # value_counts()
    # observation of nulls in the dataframe (by column and by row)

    Nothing is returned; everything is printed.
    '''
    divider = '====================================================='
    print(divider)
    print('Dataframe head: ')
    print(_render_table(df.head(3)))
    print(divider)
    print('Dataframe info: ')
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would only add a stray "None" line.
    df.info()
    print(divider)
    print('Dataframe Description: ')
    print(_render_table(df.describe()))
    print(divider)
    print('DataFrame value counts: ')
    for col in df.columns:
        if df[col].dtype == 'O':
            # object (categorical) columns: count each distinct value
            print(df[col].value_counts())
        else:
            # non-object columns: count observations in 10 equal-width bins
            print(df[col].value_counts(bins=10, sort=False))
    print(divider)
    print('nulls in dataframe by column: ')
    print(_nulls_by_col(df))
    print(divider)
    print('nulls in dataframe by row: ')
    print(_nulls_by_row(df))
    print(divider)
def remove_columns(df, cols_to_remove):
    '''
    Return a new dataframe equal to df minus the columns listed in
    cols_to_remove; df itself is left untouched.
    '''
    return df.drop(labels=cols_to_remove, axis=1)
def handle_missing_values(df, prop_required_columns=0.5, prop_required_row=0.75):
    '''
    Drop the columns, then the rows, that do not hold enough non-null values.

    prop_required_columns: proportion of rows in which a column must be
        non-null for the column to be kept.
    prop_required_row: proportion of the remaining columns in which a row
        must be non-null for the row to be kept.

    Returns a new dataframe; the dataframe passed in is not modified.
    '''
    # Minimum number of non-null entries a column needs to survive.
    col_threshold = int(round(prop_required_columns * len(df.index), 0))
    df = df.dropna(axis=1, thresh=col_threshold)
    # The row threshold is based on prop_required_row and on the columns
    # that are left after the column drop above.
    row_threshold = int(round(prop_required_row * len(df.columns), 0))
    df = df.dropna(axis=0, thresh=row_threshold)
    return df
# combining everything in a cleaning function:
def data_prep(df, cols_to_remove=None, prop_required_column=0.5, prop_required_row=0.75):
    '''
    Run the cleaning steps in order: remove the requested columns with
    remove_columns, then drop sparse columns and rows with
    handle_missing_values.

    cols_to_remove: columns to drop up front; None (the default) removes none.
    prop_required_column, prop_required_row: passed positionally to
        handle_missing_values.

    Returns the cleaned dataframe.
    '''
    # A None sentinel replaces the shared mutable list default.
    if cols_to_remove is None:
        cols_to_remove = []
    df = remove_columns(df, cols_to_remove)
    df = handle_missing_values(df, prop_required_column, prop_required_row)
    return df