scraping_fin.py
import csv
import time
import re
import requests
from bs4 import BeautifulSoup
from ural.utils import safe_urlsplit
from ural import normalize_url, is_url, ensure_protocol, urls_from_html, urls_from_text
from urllib.parse import urljoin, urlsplit
from tqdm import tqdm
import os
import sys
import pathlib
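# Layer-by-layer crawler: each depth level is written to a pair of directories
# whose names end with the depth number (e.g. ..._cou0, ..._cou1), one holding
# the fetched html pages and the other the csv files of urls found in them.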
class Crawler:
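    """Breadth-first crawler that writes each depth level ("layer") to its own
    directories: one .html file per fetched page and one .csv file listing the
    urls found in that page, which seed the next layer."""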
    def __init__(self, url, writing_dir_html, writing_dir_urls, n, actif=True):
        self.url = url                            # current url to fetch
        self.writing_dir_html = writing_dir_html  # directory for this layer's .html files
        self.writing_dir_urls = writing_dir_urls  # directory for this layer's .csv url files
        self.reading_dir_url = ""                 # directory of the previous layer's .csv files
        self.current_depth = 0
        self.max_depth = n
        self.actif = actif                        # becomes False once max_depth is exceeded
    def get_content(self):
        """Return the prettified html of self.url and the set of all urls found in it."""
        try:
            req = requests.get(self.url)
        except requests.RequestException:
            # request failed: return an empty page and no urls so callers still get a tuple
            return "", set()
        # locally, relative to the csv file we are in?
        # politeness delay between requests
        time.sleep(0.20)
        html = BeautifulSoup(req.content, "lxml").prettify()
        # --- extract urls
        urls_in = set()
        for url in urls_from_text(html):  # reminder: generator (lazy iterator)
            url = urlsplit(url).geturl()
            if is_url(url):
                urls_in.add(url)  # a set already removes duplicates
        return html, urls_in
    # count the number of url csv files in a layer
    def file_number(self):
        # pathlib keeps this portable across operating systems
        count = 0
        for path in pathlib.Path(self.reading_dir_url).iterdir():
            if path.is_file():
                count += 1
        return count
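    # Process one layer: fetch every url listed in the previous layer's csv files,
    # write the resulting html and url csv files for the current layer, then
    # advance the reading/writing directories to the next depth.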
    def state_update_andwrite(self):
        if self.current_depth == self.max_depth + 1:
            self.actif = False
            return False
        # initialisation: layer 0 is the seed url itself
        if self.current_depth == 0:
            couche = self.get_content()  # fetch layer 0 as (html, urls)
            self.current_depth += 1
            with open("{}/0.html".format(self.writing_dir_html), 'w') as html_file:
                # write html file 0
                html_file.write(couche[0])
            # replace the trailing depth digit of the html directory name with the new depth
            dossier = self.writing_dir_html.split("/")
            chemin = dossier.pop()
            self.writing_dir_html = "/".join(dossier) + "/" + chemin[:-1] + str(self.current_depth)
            with open("{}/0.csv".format(self.writing_dir_urls), "w") as next_url_file:
                # write the urls contained in html file 0
                writer = csv.DictWriter(next_url_file, fieldnames=["FROM_HTML", "URL"])
                writer.writeheader()
                for url in couche[1]:
                    # every url of layer 0 comes from the single file 0.html
                    writer.writerow({"FROM_HTML": "0.html", "URL": url})
            # the reading directory becomes the one of the layer just processed
            self.reading_dir_url = self.writing_dir_urls
            # switch the writing directory to the next layer
            dossier2 = self.writing_dir_urls.split("/")
            chemin2 = dossier2.pop()
            self.writing_dir_urls = "/".join(dossier2) + "/" + chemin2[:-1] + str(self.current_depth)
            self.actif = True
            return True
        # inheritance: build the next layer from every csv file of the previous one
        nombre_fichiers_urlcsv = self.file_number()
        compteur = 0  # single counter so files from different source csvs do not overwrite each other
        for x in range(nombre_fichiers_urlcsv):
            with open("{}/{}.csv".format(self.reading_dir_url, x)) as url_file:
                file_content = csv.DictReader(url_file)
                for row in file_content:
                    if not isinstance(row["URL"], str) or not is_url(row["URL"]):
                        continue
                    self.url = row["URL"]
                    couche_suivante = self.get_content()
                    with open("{}/{}.html".format(self.writing_dir_html, compteur), 'w') as html_file:
                        html_file.write(couche_suivante[0])
                    with open("{}/{}.csv".format(self.writing_dir_urls, compteur), "w") as next_url_file:
                        writer = csv.DictWriter(next_url_file, fieldnames=["FROM_HTML", "URL"])
                        writer.writeheader()
                        for url in couche_suivante[1]:
                            # these urls were extracted from the html file just written
                            writer.writerow({"FROM_HTML": "{}.html".format(compteur), "URL": url})
                    compteur += 1
        # switch the reading directory to the layer just written
        self.reading_dir_url = self.writing_dir_urls
        self.current_depth += 1
        # move both writing directories to the next depth
        dossier = self.writing_dir_html.split("/")
        chemin = dossier.pop()
        self.writing_dir_html = "/".join(dossier) + "/" + chemin[:-1] + str(self.current_depth)
        dossier2 = self.writing_dir_urls.split("/")
        chemin2 = dossier2.pop()
        self.writing_dir_urls = "/".join(dossier2) + "/" + chemin2[:-1] + str(self.current_depth)
        self.actif = True
        return True
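# Example run: crawl https://www.echojs.com/ down to depth 2, writing layer 0
# into the two "cou0" directories (later layers go into cou1, cou2, ...).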
crawl = Crawler("https://www.echojs.com/","/home/ptl7123/Bureau/test_scraping/code_html_cou0","/home/ptl7123/Bureau/test_scraping/url_cou0",2)
while crawl.actif:
crawl.state_update_andwrite()