-
Notifications
You must be signed in to change notification settings - Fork 11
/
leboncoin.py
92 lines (81 loc) · 3.8 KB
/
leboncoin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from __future__ import unicode_literals
# -*- coding: utf-8 -*-
from scrapy.http.request import Request
from location.models import Offer, Source, OfferCategory
from datetime import datetime
from location.spiders.offer_spider import offerSpider
import urlparse
import re
class leboncoinSpider(offerSpider):
    """Spider that walks leboncoin.fr listing pages and stores each ad as an Offer.

    A listing page is handled by ``parse_next_page``: every ad found on it is
    created or refreshed in the database, a request for the ad's own page is
    queued (handled by ``parse_one_annonce``), and finally the following
    listing page is queued. Pagination ends on the first page with no ads.
    """

    name = "leboncoin"
    # Not referenced in this class; presumably read by the base spider or a
    # pipeline -- TODO confirm.
    max_price = 800
    start_urls = ()
    # Evaluated once, when the class body runs (i.e. at import time). Raises
    # IndexError if no Source row is named 'leboncoin'.
    source_id = Source.objects.filter(name='leboncoin')[0].id
    # Category name (also the OfferCategory.name looked up in __init__)
    # -> first listing page to crawl.
    map_category_to_url = {
        'location': 'https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?f=a&th=1&mre=&sqs=1&ret=2',
        'colocation': 'https://www.leboncoin.fr/colocations/offres/ile_de_france/?th=1&location=Paris&parrot=0',
        'telephone': 'https://www.leboncoin.fr/telephonie/offres/ile_de_france/?th=1&q=note%204&parrot=0&ps=8&pe=12',
    }

    # ``o`` query parameter holding the listing page number.
    _PAGE_PARAM_RE = re.compile(r'([?&]o=)(\d+)')
    # Everything that is not a digit (used to reduce the surface text to digits).
    _NON_DIGITS_RE = re.compile(r'\D+')
    # Any HTML tag (non-greedy, single line).
    _HTML_TAG_RE = re.compile(r'<.*?>')

    def __init__(self, category="location"):
        """Select the start URL and the OfferCategory matching *category*.

        Args:
            category: a key of ``map_category_to_url``.

        Raises:
            KeyError: *category* is not a key of ``map_category_to_url``.
            IndexError: no OfferCategory row is named *category*.
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this spider is subclassed.
        super(leboncoinSpider, self).__init__()
        self.start_urls = (self.map_category_to_url[category],)
        self.offer_category_id = OfferCategory.objects.filter(name=category)[0].id

    @staticmethod
    def _first_match(selector, query, strip=True):
        """Return the first value matched by *query* under *selector*, or None."""
        values = selector.xpath(query).extract()
        if not values:
            return None
        return values[0].strip() if strip else values[0]

    @classmethod
    def _next_page_url(cls, url):
        """Return *url* pointing at the following listing page.

        A URL without a numeric ``o`` parameter is treated as page 1, so
        ``o=2`` is appended; otherwise the existing number is incremented in
        place, wherever the parameter sits in the query string.
        """
        match = cls._PAGE_PARAM_RE.search(url)
        if match is None:
            separator = '&' if '?' in url else '?'
            return url + separator + 'o=2'
        following = int(match.group(2)) + 1
        return url[:match.start(2)] + str(following) + url[match.end(2):]

    def parse_next_page(self, response):
        """Store every ad of a listing page, then queue the following page.

        Args:
            response: the response for a listing page.

        Yields:
            One Request per usable ad (callback ``parse_one_annonce``, the
            saved Offer in ``meta['offer']``), then one Request for the next
            listing page. Nothing is yielded for a page without ads.
        """
        ads = response.xpath('.//li[@itemtype="http://schema.org/Offer"]')
        if not ads:
            # A page without ads is past the end of the results: stop
            # paginating and let the crawl finish on its own instead of
            # calling exit() from inside a callback.
            return

        for ad in ads:
            html_id = self._first_match(
                ad, './/div[@class="saveAd"]/@data-savead-id', strip=False)
            link = self._first_match(ad, './/a/@href', strip=False)
            if html_id is None or link is None:
                # Without an id the ad cannot be de-duplicated, without a
                # link it cannot be followed; skip it rather than abort the
                # whole page.
                continue

            # Single lookup: reuse the stored offer or start a new one.
            offer = Offer.objects.filter(html_id=html_id).first()
            if offer is None:
                offer = Offer()
                offer.first_crawl_date = datetime.now()

            offer.html_id = html_id
            offer.source_id = self.source_id
            offer.offer_category_id = self.offer_category_id
            # The extracted href has 'http:' prefixed, as the original code did.
            offer.url = 'http:' + link

            title = self._first_match(
                ad, './/section[@class="item_infos"]/h2/text()')
            if title is not None:
                offer.title = title

            # The price is read from one of two places; an ad may have none.
            price = self._first_match(ad, './/div[@class="price"]/text()')
            if price is None:
                price = self._first_match(
                    ad, './/h3[@class="item_price"]/@content')
            if price is not None:
                offer.price = price

            address = self._first_match(
                ad, './/p[@itemtype="http://schema.org/Place"]/text()')
            if address is not None:
                offer.address = address

            offer.last_crawl_date = datetime.now()
            offer.save()
            yield Request(offer.url, callback=self.parse_one_annonce,
                          meta={'offer': offer})

        yield Request(self._next_page_url(response.url),
                      callback=self.parse_next_page)

    def parse_one_annonce(self, response):
        """Complete the Offer of a single ad page with its area and description.

        The Offer to update is whatever the base class's ``parse_one_annonce``
        returns (presumably the one passed in ``response.meta['offer']`` --
        confirm against ``offerSpider``). Fields missing from the page are
        left unchanged; the offer is saved in every case.

        Args:
            response: the response for an individual ad page.
        """
        offer = super(leboncoinSpider, self).parse_one_annonce(response)

        surface = response.xpath(
            '//span[text()="Surface"]/following::span/text()').extract()
        if surface:
            # Keep only the digits of the surface text.
            offer.area = self._NON_DIGITS_RE.sub('', surface[0])

        description = response.xpath(
            '//div/p[@itemprop="description"]').extract()
        if description:
            # The node is extracted as HTML; drop the tags, keep the text.
            offer.description = self._HTML_TAG_RE.sub('', description[0])

        offer.save()