-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1 Pull Sold Housing Data.R
141 lines (116 loc) · 5.25 KB
/
1 Pull Sold Housing Data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#Project: Zillow Tracker
#Code: 1 Pull Sold Housing Data
#Author: Scott Onestak
#Last Executed: 8/10/2022
#Packages
library(rvest)
library(dplyr)
library(tidyr)
library(httr)
library(stringr)
library(jsonlite)
options(scipen=999)
#read in previous
theListDedup_prev = read.csv("Data/theListDedup.csv",header=T,stringsAsFactors=F)
#Set the zip codes to pull
zipcode_search = c(15206,15208,15217,15218,15221,15232)
numberoftiles = 40 #This is the number of tiles of listings that show up on Zillow
#Run through results to get webpages for the sold houses
theList = NA
listStarted = FALSE
for(i in seq(from=1,to=length(zipcode_search),by=1)){
#Create building blocks of the url build to search
cat(zipcode_search[i],"\n",sep="")
str1 = paste("https://www.zillow.com/pittsburgh-pa-",zipcode_search[i],"/sold/",sep="")
#check site status
check = GET(str1)
theCheck = unlist(http_status(check)$reason)
#if the website exists, good to continue
if(theCheck == "OK"){
#Check url exists and how many pages will need to be built to find housing data
holder = read_html(str1)
count = holder %>% html_nodes('.result-count') %>% html_text() %>% str_remove(' results') %>% str_remove(",") %>% as.numeric()
page_count = ceiling(min(count,1000)/numberoftiles)
cat("\nPage Count: ",page_count,"\n",sep="")
#Check to make sure strings are not empty... if not, build string to pull housing data and get home urls
if(!is.na(str1)){
theURL = NA
for(j in seq(from=1,to=page_count,by=1)){
cat("Page: ",j,"\n",sep="")
theURL = paste(str1,j,"_p/",sep="")
gotoStr = NA
#check site status
check2 = GET(theURL)
theCheck2 = unlist(http_status(check2)$reason)
#if the website exists, good to continue
if(theCheck2 == "OK"){
soldURLs = read_html(theURL) %>%
html_nodes("script") %>%
html_text()
Sys.sleep(5)
#Find the right script to use
for(k in seq(from=1,to=length(soldURLs),by=1)){
if(str_detect(soldURLs[k],"zpid") & str_detect(soldURLs[k],"<!--")){
gotoStr = k
}
}
if(!is.na(gotoStr)){
temp = fromJSON(str_replace_all(str_replace_all(soldURLs[gotoStr],"<!--",""),"-->",""))$cat1$searchResults$listResults
zpids = temp$zpid
hasImages = temp$hasImage
urls = temp$detailUrl
statuses = temp$statusType
soldPrices = temp$unformattedPrice
addresses = temp$address
streets = temp$addressStreet
cities = temp$addressCity
states = temp$addressState
zipcodes = temp$addressZipcode
beds = temp$beds
baths = temp$baths
areas = temp$area
latitudes = temp$latLong$latitude
longitudes = temp$latLong$longitude
isZillowOwned = temp$isZillowOwned
soldDates = trimws(str_replace_all(toupper(temp$variableData$text),"SOLD",""))
homeTypes = temp$hdpData$homeInfo$homeType
lotAreas = temp$hdpData$homeInfo$lotAreaValue
lotAreaUnits = temp$hdpData$homeInfo$lotAreaUnit
taxAssessedValues = temp$hdpData$homeInfo$taxAssessedValue
temp_results = as.data.frame(cbind(addresses,streets,cities,states,zipcodes,zpids,urls,hasImages,
statuses, soldPrices,beds,baths,areas,latitudes,longitudes,
isZillowOwned,soldDates, homeTypes,lotAreas,lotAreaUnits,
taxAssessedValues))
}
#Stack Results
if(listStarted == FALSE){
theList = temp_results
listStarted = TRUE
} else {
theList = rbind(theList,temp_results)
}
} else {
paste("Error: ",theURL,"\n",sep="")
}
}
}
} else {
paste("Error: ",url_build[i],"\n",sep="")
}
}
#Rename columns
colnames(theList) = c("Address","Street","City","State","ZipCode","zpid","url","hasImage","status","soldPrice",
"beds","baths","area","latitude","longitude","isZillowOwned","soldDate","Type","lotArea",
"lotAreaUnits","taxAssessedValue")
theList$soldDate = as.Date(theList$soldDate,format="%m/%d/%Y")
#Dedup the list... in case of multiple sells, only the most recent one will show but will show as multiple in list
theListDeduped = unique(theList) %>% filter(!Type %in% c("APARTMENT","MANUFACTURED","LOT","CONDO"))
theListDeduped$soldDate = as.character(theListDeduped$soldDate)
theListDeduped$zpid = as.numeric(theListDeduped$zpid)
old = theListDedup_prev %>% select(zpid,soldDate)
new = theListDeduped %>% select(zpid,soldDate)
newlistings = setdiff(new,old)
theListingsAppend = theListDeduped %>% inner_join(.,newlistings,by=c("zpid","soldDate"))
#Append new listings and write out
theListDedup = rbind(theListDedup_prev,theListingsAppend)
write.csv(theListDedup,"Data/theListDedup.csv",row.names = FALSE)