-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunivariate_analysis.py
82 lines (66 loc) · 4.07 KB
/
univariate_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from variables import *
from data_menegement import primary_data_management
import seaborn
import matplotlib.pyplot as plt
def output_distribution(data_set, variable, sample_volume):
    """Print how the rows of ``data_set`` are spread over ``variable``.

    Two tables are written to stdout: the number of rows in each group of
    ``variable``, then the same counts expressed as a percentage of
    ``sample_volume``.

    Args:
        data_set: Object providing ``groupby(variable).size()``
            (presumably a pandas DataFrame - confirm against the caller).
        variable: Column label to group by.
        sample_volume: Denominator used for the percentage table.
    """
    separator = "======================================================================================================"
    print(separator)

    # Absolute number of rows per category.
    print(f"The {variable} variable distribution")
    group_sizes = data_set.groupby(variable).size()
    print(group_sizes)

    # Share of each category, computed before its heading is printed so a
    # failing division does not leave a dangling heading behind.
    percentages = group_sizes * 100 / sample_volume
    print(f"The percentage {variable} variable distribution")
    print(percentages)
def histogram(variable, label, data_set):
    """Display a seaborn count plot of a categorical variable.

    ``variable`` is passed as the ``y`` argument of ``seaborn.countplot``,
    so the categories are placed on the y axis; ``label`` becomes the y-axis
    caption. The figure is shown immediately via ``plt.show()``.

    Args:
        variable: Column name in ``data_set`` to count.
        label: Text used for the y-axis label.
        data_set: Data passed to ``seaborn.countplot`` as ``data``.
    """
    seaborn.set()  # apply seaborn's default styling before drawing
    seaborn.countplot(data=data_set, y=variable)
    plt.title(f"The number of weather events by {variable}")
    plt.ylabel(label)
    plt.show()
def description(variable, data_set):
    """Print summary statistics and every mode of one column.

    Output goes to stdout: a heading, the result of ``describe()`` for the
    column, and the full result of ``mode()``. Other single statistics such
    as ``.mean()`` could be printed the same way instead of ``.describe()``.

    Args:
        variable: Column label looked up in ``data_set``.
        data_set: Object indexable by ``variable`` whose result provides
            ``describe()`` and ``mode()`` (presumably a pandas DataFrame -
            confirm against the caller).
    """
    print(f"Describe {variable}")
    column = data_set[variable]
    print(column.describe())
    # mode() lists all tied modes starting at position 0. The previous
    # ``[1:]`` slice dropped the first one, so a column with a single mode
    # printed an empty Series.
    print(f"The modes are {column.mode()!s}")
def univariate_analysis(data_set, cat_explanatory, q_explanatory, cat_response, q_response):
    """Run the per-variable report for every variable in the four mappings.

    The four mappings are merged (later ones win on duplicate keys) and each
    key is processed in turn: ``description`` is always called; variables
    present in ``q_response`` or ``q_explanatory`` get a box plot and a
    histogram, all others get the textual distribution and a count plot.

    Args:
        data_set: Data to analyse; must support ``len()`` and ``[variable]``.
        cat_explanatory: Mapping of categorical explanatory variable -> label.
        q_explanatory: Mapping of quantitative explanatory variable -> label.
        cat_response: Mapping of categorical response variable -> label.
        q_response: Mapping of quantitative response variable -> label.
    """
    print("===========================================================================================================")
    sample_volume = len(data_set)
    print('The sample volume is %i' % sample_volume)

    all_variables = {**cat_explanatory, **q_explanatory, **cat_response, **q_response}
    for variable, label in all_variables.items():
        print('=======================================================================================================')
        description(variable, data_set)  # key figures for every variable

        if variable not in q_response and variable not in q_explanatory:
            # Categorical variable: textual distribution plus a count plot.
            output_distribution(data_set, variable, sample_volume)
            histogram(variable, label, data_set)
            continue

        # Quantitative variable: box plot first, then histogram, each shown
        # in its own figure.
        for draw, title in (
            (plt.boxplot, f"The {variable} box plot"),
            (plt.hist, f"The {variable} histogram"),
        ):
            draw(x=data_set[variable])
            plt.title(title)
            plt.show()
if __name__ == "__main__":
    # Load the four variable mappings used by the research.
    cat_explanatory = retrieve_cat_explanatory_vars()
    cat_response = retrieve_cat_response_variables()
    q_explanatory = retrieve_quantitative_explanatory_vars()
    q_response = retrieve_q_response_variables()

    # First pass: primary data management with the categorical responses
    # (more details in variables.py or in the report.pdf), followed by the
    # univariate analysis with no quantitative response variables.
    # A fresh merged dict and fresh copies are passed on every call so the
    # callees never share a mapping with this script.
    data = primary_data_management(
        {**cat_explanatory, **q_explanatory},
        cat_response.copy(),
    )
    univariate_analysis(
        data,
        cat_explanatory.copy(),
        q_explanatory.copy(),
        cat_response.copy(),
        {},
    )

    # Second pass: restrict to rows where property damage is positive and use
    # the quantitative responses instead. The filter is handed over as text;
    # how it is evaluated is decided inside primary_data_management.
    damaged_only = '(data_set["damage_property"] > 0)'
    data = primary_data_management(
        {**cat_explanatory, **q_explanatory},
        q_response.copy(),
        damaged_only,
    )
    univariate_analysis(
        data,
        cat_explanatory.copy(),
        q_explanatory.copy(),
        {},
        q_response.copy(),
    )