#!/usr/bin/env python3
# Author: Jasper Hawks
#
# ApScrape (the ArgumentScraper): handles
# scraping Apartments.com for listings
# and exporting them to a spreadsheet.
import argparse  # Eventually everything should be handled with command-line arguments
import re  # re gives us regular expressions for cleaning scraped text
import requests  # requests lets us fetch pages from Apartments.com
from bs4 import BeautifulSoup  # Beautiful Soup scrapes data out of the fetched HTML
from xlwt import Workbook  # The Workbook class of xlwt writes our results to a spreadsheet
tr = 0      # Title row counter
ar = 0      # Address row counter
pr = 0      # Price row counter
br = 0      # Beds row counter
availr = 0  # Availability row counter
amsr = 0    # Amenities row counter (declared but not yet written to)
sqr = 0     # Square-feet row counter
linkr = 0   # Link row counter
# Instantiate a parser so that we can use positional arguments on the command line
parser = argparse.ArgumentParser(description="Scrapes Apartments.com for apartment listings, then exports the results to an Excel file.")
# Add arguments to the parser: the argument name, nargs (which allows the value to be optional), the type, then the help text.
parser.add_argument("Region", metavar="Region", type=str, help="The name of the region to search for apartments in, with words separated by hyphens. The format is region, city, and state abbreviation, for example Downtown-Norfolk-Norfolk-VA. You can also search by city alone, for example Virginia-Beach-VA.")
parser.add_argument("--minB", nargs="?", type=int, help="Minimum number of beds", default=-1)
parser.add_argument("--maxB", nargs="?", type=int, help="Maximum number of beds", default=-1)
parser.add_argument("--minP", nargs="?", type=int, help="Minimum price", default=-1)
parser.add_argument("--maxP", nargs="?", type=int, help="Maximum price", default=-1)
# args stores all of the parsed arguments so that we can use them throughout the program
args = parser.parse_args()
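
# A sample invocation (the region and numbers here are purely illustrative):
#   ./ApScrape Virginia-Beach-VA --minB 1 --maxB 2 --minP 800 --maxP 1500
# This would search Virginia Beach, VA for 1-2 bed listings between $800 and $1500.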
# The anatomy of an Apartments.com URL is as follows:
# base url/region/bedrooms-price/page number
# For example:
# https://www.apartments.com/san-francisco-ca/3-bedrooms-1225-to-1700/2
def main():  # Function that starts the rest of the program and sets things up
    minBeds = 0  # Variable for min bedrooms
    maxBeds = 0  # Variable for max bedrooms
    minPrice = 0  # Variable for min price
    maxPrice = 0  # Variable for max price
    minBedVal = False  # Tracks whether we have a usable minBeds value
    maxBedVal = False  # Tracks whether we have a usable maxBeds value
    minPriceVal = False  # Tracks whether we have a usable minPrice value
    maxPriceVal = False  # Tracks whether we have a usable maxPrice value
    wb = Workbook()  # Instantiate the workbook
    apts = wb.add_sheet("Apartments")  # Instantiate the sheet that we will write data to
    # These two lines spoof our user agent, since Apartments.com filters requests by user agent
    userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36'
    headers = {"User-Agent": userAgent}
    region = args.Region  # Get the region from the user
    minBeds = args.minB
    if minBeds <= 0:  # A non-positive value means the flag was not supplied
        minBeds = ""
    maxBeds = args.maxB
    if maxBeds <= 0:
        maxBeds = ""
    # Only validate the range when both bounds are ints; unset bounds are empty strings
    if isinstance(minBeds, int) and isinstance(maxBeds, int) and minBeds > maxBeds:
        print("Invalid bed range: the minimum exceeds the maximum")
        exit()
    minPrice = args.minP
    if minPrice <= 0:
        minPrice = ""
    maxPrice = args.maxP
    if maxPrice <= 0:
        maxPrice = ""
    if isinstance(minPrice, int) and isinstance(maxPrice, int) and minPrice > maxPrice:
        print("Invalid price range: the minimum exceeds the maximum")
        exit()
    # This portion of the code converts the inputs into data
    # that we can insert into the URL
    if isinstance(minPrice, int):  # If the price is still an int
        minPriceVal = True  # Then mark the minimum price as set
    if isinstance(maxPrice, int):
        maxPriceVal = True
    # These statements format the prices the way that Apartments.com formats them
    if minPriceVal and maxPriceVal:
        # If we have a full range, format it this way
        price = str(minPrice) + "-to-" + str(maxPrice)
    elif minPriceVal and not maxPriceVal:
        # Just the minimum price: format it this way
        price = "over-" + str(minPrice)
    elif not minPriceVal and maxPriceVal:
        # Just the maximum price: format it this way
        price = "under-" + str(maxPrice)
    else:
        # Otherwise leave it blank
        price = ""
    # Repeat this with beds
    if isinstance(minBeds, int):
        minBedVal = True
    if isinstance(maxBeds, int):
        maxBedVal = True
    if minBedVal and maxBedVal:
        beds = str(minBeds) + "-to-" + str(maxBeds) + "-bedrooms-"
    elif minBedVal and not maxBedVal:
        beds = "min-" + str(minBeds) + "-bedrooms-"
    elif not minBedVal and maxBedVal:
        beds = "max-" + str(maxBeds) + "-bedrooms-"
    else:
        beds = ""
    # getPages returns an int that acts as the range for the for loop
    for i in range(getPages(region, headers, beds, price)):
        # Once we know how many pages we have to scrape,
        # we use the page number as part of the URL to
        # move between pages
        scrapeSave(headers, apts, wb, region, i + 1, price, beds)
        print("Page " + str(i + 1) + ": Done")
    # Place the headers on top of the columns
    apts.write(0, 0, "Apartment Complex")
    apts.write(0, 1, "Address")
    apts.write(0, 2, "Price")
    apts.write(0, 3, "Beds")
    apts.write(0, 4, "Availability")
    apts.write(0, 5, "Square Feet")
    apts.write(0, 6, "URL")
    apts.write(0, 8, "Thanks for using Apartment Scraper. Check out more of my projects on my website: https://jasperhawks.netlify.app/")
    wb.save(region + " Apartments.xls")  # Save the workbook as the region name + " Apartments.xls"
def scrapeSave(headers, apts, wb, region, page, p, beds):  # Function that scrapes the majority of the content; p is the formatted price segment
    global tr, ar, pr, br, linkr, availr  # Row counters shared across pages
    # Append the arguments to the URL
    r = requests.get("https://www.apartments.com/" + region + "/" + beds + p + "/" + str(page) + "/", headers=headers).text
    soup = BeautifulSoup(r, "html.parser")
    for titles in soup.find_all(class_="js-placardTitle title"):  # Find all elements with this class
        tr += 1  # Increment the title row counter
        apts.write(tr, 0, titles.text)  # Write the data to the spreadsheet
    # The same pattern is repeated for all of the items below
    for addresses in soup.find_all(class_="property-address js-url"):
        ar += 1
        apts.write(ar, 1, addresses.text)
    for prices in soup.find_all(class_="price-range"):
        pr += 1
        apts.write(pr, 2, prices.text)
    # The loop variable is named bed so it does not shadow the beds parameter
    for bed in soup.find_all(class_="bed-range"):
        br += 1
        apts.write(br, 3, bed.text)
    for links in soup.find_all(lambda tag: tag.name == 'a' and tag.get('class') == ['property-link']):
        linkr += 1
        apts.write(linkr, 6, links['href'])
        moreinfo(links['href'], apts, wb, headers)
    for avail in soup.find_all(class_="availability"):
        if re.search("unavailable", avail.text):
            continue  # Skip listings marked unavailable rather than abandoning the rest of the loop
        availr += 1
        apts.write(availr, 4, avail.text)
def moreinfo(link, apts, wb, h):  # This function gets the square footage
    global sqr  # Square-footage row counter shared across listings
    r = requests.get(link, headers=h).text  # Follow the listing link we found previously
    soup = BeautifulSoup(r, "html.parser")
    c = 0  # Initialize a counter variable
    for sq in soup.find_all(class_="rentInfoDetail"):  # Find the four boxes with different data
        if re.search("sq", sq.text):  # If the box has "sq" in its text
            sqr += 1  # Increment the square-footage row counter
            sqclean = re.sub(r"\n\d\b.*\n", "", sq.text)  # Remove the title text (a raw string keeps \b a word boundary, not a backspace)
            apts.write(sqr, 5, sqclean)  # Write the rest to the cell
        else:  # Otherwise
            c += 1  # Increment the counter variable
            if c == 4:  # Since there are four boxes, if we went through all four without a match
                sqr += 1  # Increment the square-footage row counter by one
                apts.write(sqr, 5, "Square footage not listed")  # Note the missing data in the cell
                c = 0  # Reset the counter variable
def getPages(reg, head, b, p):  # Function to get the number of pages we have to scrape
    reg = re.sub(r"\s+", "-", reg)  # Substitute hyphens for whitespace (a raw string keeps the regex escapes intact)
    r = requests.get("https://www.apartments.com/" + reg + "/" + b + p, headers=head).text  # Append the region, beds, and price to the URL
    soup = BeautifulSoup(r, "html.parser")
    try:
        num = soup.find(class_="pageRange").text  # Try to find the page-range element
    except AttributeError:  # find() returned None, so there are no extra pages to scrape
        return 1  # Return one for the single first page
    # The page range is formatted "Page 1 of X", so strip everything before the final number
    num = re.sub(r"^Page\s+\d+\s+of\s+", "", num)  # e.g. "Page 1 of 3" becomes "3"
    return int(num)  # Return the page count as an int
if __name__ == "__main__":
main()