-
Notifications
You must be signed in to change notification settings - Fork 0
/
renthop-crawler.py
59 lines (54 loc) · 2.22 KB
/
renthop-crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import re
# Module-level accumulators filled by crawl() as pages are scraped:
#   B      - neighborhood names
#   C      - listing prices
#   D      - interleaved bed/bath values (bed, bath, bed, bath, ...)
#   D_bed  - beds only   (even indices of D)
#   D_bath - baths only  (odd indices of D)
B, C, D, D_bed, D_bath = [], [], [], [], []
# Define crawl function and crawl through url for max number of pages
def crawl(url, maxpage):
    """Scrape RentHop listing pages 1 through maxpage-1 and write data.csv.

    Parameters:
        url: base listing URL ending in "?page=" -- the page number is
            appended directly to this string.
        maxpage: exclusive upper bound on the page number to fetch
            (pages 1 .. maxpage-1 are crawled, matching the original
            `while count < maxpage` loop).

    Side effects: appends scraped values to the module-level lists
    B (neighborhoods), C (prices) and D (bed/bath values), then writes
    the combined table to "data.csv" in the working directory.
    """
    for page in range(1, maxpage):
        print("Crawling: " + url + str(page))
        req = urllib.request.Request(url + str(page), headers={'User-Agent' : "Magic Browser"})
        # Context manager closes the HTTP connection deterministically
        # (the original leaked one open connection per page).
        with urllib.request.urlopen(req) as con:
            soup = BeautifulSoup(con, "html.parser")
        # Find the main div and extract correct neighborhood data
        main_div = soup.find("div", {"class": "col-12 col-lg-8 pr-0 pr-lg-5 pr-xl-0"})
        if main_div is None:
            # Layout changed or empty page -- skip rather than crash with
            # AttributeError on main_div.findAll.
            print("No listing container found on page " + str(page) + ", skipping")
            continue
        # Neighborhood names live in divs with ids like "listing-<id>";
        # append them to column B.
        # NOTE(review): .strip() assumes each child is a NavigableString --
        # a nested Tag child would raise; confirm against live markup.
        for listing in main_div.findAll("div", {"id": re.compile("listing-.*")}):
            for item in listing.contents:
                B.append(item.strip())
        # Price values: table cells whose id starts with "listing"; column C.
        for row in main_div.findAll("tr"):
            for cell in row.findAll("td", {"id": re.compile("listing.*")}):
                for item in cell.contents:
                    C.append(item.strip())
        # Bed and bath counts share one bold span style; values land in D
        # interleaved as bed, bath, bed, bath, ...
        for cell in main_div.findAll("td", {"class": "font-size-11"}):
            for span in cell.findAll("span", {"style": "font-weight: bold; color: #444444;"}):
                for item in span.contents:
                    D.append(item.strip())
    # Split the interleaved bed/bath list ONCE, after all pages are crawled
    # (the original re-sliced the whole of D on every iteration), and write
    # the CSV a single time at the end.
    D_bed = D[0::2]
    D_bath = D[1::2]
    df = pd.DataFrame(B, columns=["Neighborhood"])
    df["Price"] = C
    df["Number of Beds"] = D_bed
    df["Number of Baths"] = D_bath
    df.to_csv("data.csv")
crawl("https://www.renthop.com/boston-ma/apartments-for-rent?page=",150)