-
Notifications
You must be signed in to change notification settings - Fork 0
/
yellowpages_spider_byURL.py
91 lines (74 loc) · 3.1 KB
/
yellowpages_spider_byURL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import scrapy
from scrapy.crawler import CrawlerProcess
from random import randrange
# Seed search-result pages to crawl.
# Fix: the original search term was misspelled ("resturants").
URLs = [
    "https://www.yellowpages.com/search?search_terms=restaurants&geo_location_terms=Minneapolis%2C+MN",
]
class Yellowpages(scrapy.Spider):
    """Scrape business listings from yellowpages.com search results.

    For each seed URL in the module-level ``URLs`` list, follows every
    business detail link on the results page (and paginates via the
    "next" link), then yields one item dict per business with name,
    phone, email, website, zip code and address.  Items are exported to
    ``Data.csv`` via the feed settings below.
    """

    name = "yellowpages"

    # NOTE(review): FEED_FORMAT/FEED_URI are deprecated in Scrapy >= 2.1
    # in favour of the FEEDS setting; kept as-is for compatibility with
    # the Scrapy version this was written against.
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'Data.csv',
    }

    # Shared browser-like headers — previously this dict was duplicated
    # verbatim in three places.
    HEADERS = {
        'USER-AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    }

    def start_requests(self):
        """Issue one request per seed URL in the module-level URLs list."""
        for url in URLs:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                headers=self.HEADERS,
            )

    def parse(self, response):
        """Parse a search-results page.

        Follows each business-name link to its detail page (parse2) and,
        if a "next page" link exists, follows it back into parse.
        """
        results = response.css("div.search-results.organic > div")
        for href in results.css("a.business-name::attr(href)").getall():
            yield scrapy.Request(
                # urljoin is safer than string concatenation and also
                # handles already-absolute hrefs correctly.
                url=response.urljoin(href),
                callback=self.parse2,
                dont_filter=True,
                headers=self.HEADERS,
            )
        next_page = response.css("li > a.next::attr(href)").get()
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
                dont_filter=True,
                headers=self.HEADERS,
            )

    def parse2(self, response):
        """Parse a single business detail page and yield one item dict."""
        name = response.css("h1.business-name::text").get()
        phone = response.css("a.phone > strong::text").get()
        address1 = response.css("span.address > span::text").get()
        address2 = response.css("span.address::text").get()
        # The zip code is taken as the last whitespace-separated token of
        # the city/state/zip fragment (address2) — assumes US-style
        # ", City, ST 55401" formatting; empty string when absent.
        zipCode = address2.split(" ")[-1].strip() if address2 else ""
        # Only a complete street + city/state/zip pair yields an address.
        address = address1 + ", " + address2 if address1 and address2 else "N/A"
        website = response.css("a.website-link::attr(href)").get() or "N/A"
        email = response.css("a.email-business::attr(href)").get() or "N/A"
        email = email.replace("mailto:", "")  # strip the mailto: scheme prefix
        yield {
            "Name": name,
            "Phone": phone,
            "Email": email,
            "Website": website,
            "Zip Code": zipCode,
            "Address": address,
        }
if __name__ == "__main__":
    # Entry-point guard: start the crawl only when this file is executed
    # directly, not as a side effect of importing the module.
    process = CrawlerProcess()
    process.crawl(Yellowpages)
    process.start()  # blocks until the crawl finishes