-
Notifications
You must be signed in to change notification settings - Fork 0
/
Final Project Code.qmd
175 lines (120 loc) · 4.02 KB
/
Final Project Code.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
---
title: "Untitled"
format: html
---
```{python}
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import skew, kurtosis
# Load the real-estate sales records from the CSV export.
df = pd.read_csv('Real_Estate_Sales_2001-2020_GL.csv')

# Keep only the rows whose 'Non Use Code' field is populated.
df = df.loc[df['Non Use Code'].notna()]

# Number of remaining sales per List Year, ordered chronologically.
freq = df['List Year'].value_counts()
freq = freq.sort_index()

# Shape of the per-year count distribution.
skewness = skew(freq)
kurt = kurtosis(freq)

# Bar chart of the yearly counts.
freq.plot.bar()
plt.title('Amount of Sales per List Year (Filtered by Non Use Code)')
plt.xlabel('List Year')
plt.ylabel('Frequency')

print(f"Skewness: {skewness:.2f}")
print(f"Kurtosis: {kurt:.2f}")
```
```{python}
import pandas as pd
from scipy.stats import kruskal
# Median Sales Ratio per List Year (descriptive summary only; the test below
# works on the raw observations).
grouped = df.groupby('List Year')['Sales Ratio'].median()

# Kruskal-Wallis H-test: do the Sales Ratio distributions differ between
# List Years?  The test ranks the individual observations, so every year must
# contribute its raw values.  Collapsing each year to its median first would
# leave a single value per group and make the test degenerate.
sales_ratio_by_year = [
    ratios.dropna()  # NaN would otherwise propagate into the statistic
    for _, ratios in df.groupby('List Year')['Sales Ratio']
]
# Years without any usable observation cannot take part in the test.
sales_ratio_by_year = [ratios for ratios in sales_ratio_by_year if len(ratios) > 0]

stat, p = kruskal(*sales_ratio_by_year)

# Print results
print('Kruskal-Wallis test:')
print('H-statistic = {:.3f}'.format(stat))
print('p-value = {:.3f}'.format(p))
if p < 0.05:
    print('Medians of List Years are statistically different')
else:
    print('Not enough evidence to conclude that the medians of List Years are statistically different')
```
```{python}
# Years to analyze: 2001 through 2020 inclusive.
years = list(range(2001, 2021))

# Cast "List Year" to integer so it compares equal to the entries of `years`.
df["List Year"] = df["List Year"].astype(int)

# Profit is defined here as the sale amount minus the assessed value.
df['Profit'] = df['Sale Amount'] - df['Assessed Value']

# Collect one row of Profit statistics per year and build the DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration, so the rows are gathered in a plain list instead.
rows = []
for year in years:
    profit_stats = df.loc[df["List Year"] == year, "Profit"].describe()
    rows.append({
        "List Year": year,
        "Count": int(profit_stats["count"]),
        "Mean": profit_stats["mean"],
        "Std Dev": profit_stats["std"],
        "Median": profit_stats["50%"],
    })

results_df = pd.DataFrame(
    rows, columns=["List Year", "Count", "Mean", "Std Dev", "Median"]
)

# Print the per-year observation counts
print(results_df['Count'])
# Uncomment to export the full table as LaTeX:
#print(results_df.to_latex(index=False))
```
```{python}
# Preview the first rows of the working DataFrame.
df.head()
```
```{python}
from scipy.stats import shapiro
from scipy.stats import kstest
# Extract the column to test.  Missing values are dropped first: a NaN in the
# input makes shapiro() return NaN, and `NaN > 0.05` is False, which would
# silently report "not normal" regardless of the data.
data = df['Sales Ratio'].dropna()

# Perform the Shapiro-Wilk normality test.
# NOTE(review): the SciPy documentation warns that the p-value may be
# inaccurate for samples larger than 5000 - confirm the sample size here.
stat, p = shapiro(data)

# Print the results
print('Shapiro-Wilk test statistic:', stat)
print('p-value:', p)
if p > 0.05:
    print('Data is likely normally distributed')
else:
    print('Data is not likely normally distributed')
```
```{python}
import pandas as pd
from scipy.stats import kruskal
# Run the Kruskal-Wallis test on two samples: the 'List Year' column and the
# 'Count' column of results_df.
# NOTE(review): this compares the year labels themselves against the per-year
# counts, which looks unintended - confirm which groups should be compared
# before relying on the result.
stat, p = kruskal(results_df['List Year'], results_df['Count'])

# Print results
print('Kruskal-Wallis test:')
print('H-statistic = {:.3f}'.format(stat))
print('p-value = {:.3f}'.format(p))
if p < 0.05:
    # Typo in the original message ("significiant") corrected.
    print('Data is likely statistically significant')
else:
    print('Data is not likely statistically significant')
```
```{python}
import pandas as pd
from plotnine import *
# Build a Q-Q plot of the 'Sales Ratio' column with plotnine: the sample
# aesthetic feeds stat_qq(), and ggtitle() adds the heading.
(
    ggplot(df, aes(sample='Sales Ratio'))
    + stat_qq()
    + ggtitle("Q-Q Plot for Sales Ratio")
)
```
```{python}
import seaborn as sns
# Box plot of the Sales Ratio distribution for each Residential Type.
YearDayPlot = sns.boxplot(x='Residential Type', y="Sales Ratio", data=df)
# The title now describes what is plotted; the previous text referred to
# durations, boroughs and weekdays, none of which appear in this chart.
YearDayPlot.set(title="Boxplot of Sales Ratio by Residential Type", ylabel="Sales Ratio")
```
```{python}
import pandas as pd
from scikit_posthocs import posthoc_dunn
# Post hoc Dunn's test: pairwise p-values between List Years, using the
# 'Count' column of results_df as the measured value.
# NOTE(review): results_df appears to hold a single Count per List Year, so
# every group has one observation - confirm this is the intended input.
dunn_results = posthoc_dunn(results_df, val_col='Count', group_col='List Year')

# Hide the non-significant pairs (p > 0.05) instead of overwriting them with
# 0: a p-value of 0 would be annotated as "0.00" and read as the *most*
# significant result, the opposite of what those cells mean.
not_significant = dunn_results > 0.05

# Heatmap of the remaining (significant) pairwise p-values.
sns.heatmap(
    dunn_results,
    mask=not_significant,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    annot_kws={'fontsize': 5},
)
```