#Project: Zillow Tracker
#Code: 5 Evaluation
#Author: Scott Onestak
#Last Executed: 7/10/2022
#Packages
library(tidyverse)
library(stringr)
library(h2o)
library(fredr)
#Data
theData = read.csv('Data/cleanedFinalDataset.csv',header=T,stringsAsFactors=F)
redfin = read.csv('Data/redfin_data_for_asset_appreciation.csv',header=T,stringsAsFactors=F)
#Pull in Interest Rate Data from FRED
fredr_set_key(Sys.getenv("FRED_API_KEY")) #store the API key in an environment variable rather than hard-coding it
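#MORTGAGE30US is Freddie Mac's weekly 30-year fixed-rate mortgage average (Primary Mortgage Market Survey)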
fred_30yr = fredr(series_id = "MORTGAGE30US",
                  observation_start = as.Date("2019-01-01",format="%Y-%m-%d"),
                  frequency = "w") %>% select(date,value) %>% rename(rate=value)
fred_start = min(fred_30yr$date)
fred_end = Sys.Date()
rate = NA
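#The loop below forward-fills the weekly series to a daily one: each day carries
#the most recently observed weekly rate until the next observation appears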
for(i in seq(from=fred_start,to=fred_end,by=1)){
  if(i %in% fred_30yr$date){
    rate = unlist(fred_30yr %>% filter(date == i) %>% select(rate))
  }
  if(i == fred_start){
    mortgage_rates = as.data.frame(cbind(i,rate))
  } else {
    mortgage_rates = rbind(mortgage_rates,
                           as.data.frame(cbind(i,rate)))
  }
}
colnames(mortgage_rates) = c("date","rate")
mortgage_rates$date = as.Date(mortgage_rates$date,origin = "1970-01-01")
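#A vectorized sketch of the same forward fill (assumes tidyr, attached with tidyverse),
#which would also keep the Date class and skip the as.Date(origin=) round-trip:
#  mortgage_rates = data.frame(date = seq(fred_start, fred_end, by = 1)) %>%
#    left_join(fred_30yr, by = "date") %>%
#    tidyr::fill(rate, .direction = "down")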
#Create a lagged average rate: closing typically happens 30-45 days after an offer is accepted,
#and a buyer would often lock in the interest rate around the time of the offer
mortgage_rates$avg_rate = NA
mortgage_cutoff = fred_start + 45
for(i in seq(from=1,to=dim(mortgage_rates)[1],by=1)){
  if(mortgage_rates[i,"date"] >= mortgage_cutoff){
    mortgage_rates[i,"avg_rate"] = round(mean(unlist(mortgage_rates %>%
                                                       filter(date >= mortgage_rates[i,"date"]-45 &
                                                                date <= mortgage_rates[i,"date"]-30) %>%
                                                       select(rate))),2)
  }
}
mortgage_rates = mortgage_rates %>% filter(!is.na(avg_rate))
colnames(mortgage_rates)[1] = "soldDate"
#Create field to determine which month to join to
theData$join_date = substr(theData$soldDate,1,7)
redfin$join_date = substr(redfin$Time,1,7)
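#join_date is "YYYY-MM", so each sale joins to the Redfin median price for its sale month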
#Modify fields for use
theData$soldDate = as.Date(theData$soldDate)
theData$listDate = as.Date(theData$listDate)
redfin$Time = as.Date(redfin$Time)
#Filter data to >= 2020
theData = theData %>% filter(substr(join_date,1,4) %in% c("2020","2021","2022"))
#Remap months too new to appear in the Redfin data to its latest available month
theMissing = setdiff(unique(theData$join_date),unique(redfin$join_date))
changeto = redfin[dim(redfin)[1],"join_date"]
theData$join_date = ifelse(theData$join_date %in% theMissing,changeto,theData$join_date)
#Calculate Asset Appreciation to Now
price_now = redfin[dim(redfin)[1],"Median_Price"]
redfin$appreciation = ((price_now - redfin$Median_Price) / redfin$Median_Price) + 1
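#Example: if the sale month's median was $300k and the latest median is $330k,
#appreciation = 1.10, so a $250k sale is restated as $275k (soldPriceAdj) below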
#Join Appreciation and Adjust Sold Price
theData = theData %>%
  left_join(.,redfin %>% select(join_date,appreciation),by="join_date") %>%
  left_join(.,mortgage_rates %>% select(soldDate,avg_rate),by="soldDate") %>%
  mutate(soldPriceAdj = soldPrice * appreciation)
#Filter out suburbs with little volume
theSuburbs = theData %>% group_by(suburb) %>% summarise(count=n())
suburbsKeep = unlist(theSuburbs %>% filter(count > 20) %>% select(suburb))
theData = theData %>% filter(suburb %in% suburbsKeep)
theData$suburb = as.factor(theData$suburb)
#Tax assessed value to sold price ratio
taxAssessed = theData %>%
  filter(!is.na(soldPrice) & !is.na(taxAssessedValue)) %>%
  mutate(taxAssessedDiff = (taxAssessedValue - soldPrice)/soldPrice + 1) %>%
  group_by(suburb) %>%
  summarise(medTaxAssessedDiff = median(taxAssessedDiff,na.rm=T),
            meanTaxAssessedDiff = mean(taxAssessedDiff,na.rm=T))
write.csv(taxAssessed,"Data/taxAssessedToSoldPriceAdj.csv",row.names = FALSE)
#Try to pull out the most important variables in predicting price
h2o.init(max_mem_size = "16g")
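#h2o.init starts (or attaches to) a local H2O cluster; max_mem_size caps its Java heap at 16 GB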
y = "soldPriceAdj"
x = setdiff(colnames(theData),c(y,"Address","Street","City","State","ZipCode","zpid","url","soldPrice","area",
                                "latitude","longitude","soldDate","Type","description","cooling","heating",
                                "parkingLocation","flooring","daysToSale","sellDiff","sellDiffPrct","pricePerSquareFoot",
                                "join_date","appreciation","listPrice","listDate","taxAssessedValue","roof","style","avg_rate"))
H2O_theData = as.h2o(theData)
#Random Forest
#R^2 only 42%. Not good. Try other model types to see if they fit the data better.
rf = h2o.randomForest(y=y,
                      x=x,
                      training_frame=H2O_theData,
                      model_id = "rf",
                      nfolds=10,
                      seed=412)
rf_varimp_curr = as.data.frame(h2o.varimp(rf))
h2o.saveModel(rf,path="Models/",force=T)
rf_r2 = h2o.r2(rf)
#Gradient Boosting Machine
#R^2 90%. A much better fit, and the 10-fold cross-validation makes overfitting less of a concern.
#Best model. Use the GBM to understand influences on pricing.
gbm = h2o.gbm(y=y,
              x=x,
              training_frame=H2O_theData,
              model_id = "gbm",
              nfolds=10,
              ntrees=250,
              seed=412)
gbm_varimp_curr = as.data.frame(h2o.varimp(gbm))
h2o.saveModel(gbm,path="Models/",force=T)
gbm_r2 = h2o.r2(gbm)
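#Optional check (not part of the original run): partial dependence for the top GBM
#variable could be inspected before shutdown, e.g.
#  h2o.partialPlot(gbm, H2O_theData, cols = gbm_varimp_curr$variable[1])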
h2o.shutdown(prompt=F)
#Write out variable importance results
#The GBM variable importance list is very similar to the RF for the most important variables, but it seems to
#be more in line with my expectation of importance. For example, hasCooling is much higher on the GBM varimp
#list than it is on the RF varimp list.
write.csv(rf_varimp_curr,"Results/randomForestVariableImportance.csv",row.names=F)
write.csv(gbm_varimp_curr,"Results/GBMVariableImportance.csv",row.names=F)
#Build linear model - the top variables account for only 54% of the variability
#Suburb is probably split too finely for the linear model but is an important variable overall
#Some signs don't make sense logically. Beds should logically be positive, not negative.
#Conclusion: not really a linear relationship. A nonparametric model is needed. GBM is the best option.
linear_model = lm(soldPriceAdj ~ suburb + avg_rate + baths + area + walk_score + beds,
                  data=theData)
summary(linear_model)