-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_scraper.txt
60 lines (45 loc) · 1.7 KB
/
pdf_scraper.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
import openpyxl
import os,sys
# Load the workbook before changing directory: its relative path is resolved
# against the directory the script was started from.
wb = openpyxl.load_workbook('ICMA-Sustainable-Bonds-Database-151022.xlsx')
# Raw string so the backslashes of the Windows path are taken literally
# ("\I" and "\G" are invalid escape sequences in a normal string literal).
# Files opened with relative names after this point land in this directory.
os.chdir(r"S:\IFI Applications\Green_Bong_Makerfest")
def download_pdf(sheet, *, timeout=60):
    """Download the files hyperlinked from one worksheet of ``wb``.

    Every row from row 3 through the last used row is inspected. When the
    cell in column 6 carries a hyperlink whose target does not contain
    ``"../.."``, the target is fetched over HTTP and, on status 200, written
    to the current working directory as
    ``<column 2 value>_<sheet>_<basename of the target>``.

    Progress messages and a final list of failed entries are printed to
    stdout. A failure on one entry never aborts the remaining downloads.

    Args:
        sheet: Name of the worksheet in the module-level workbook ``wb``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so one
            unresponsive server cannot stall the whole batch.

    Returns:
        None.
    """
    ws = wb[sheet]
    last_row = ws.max_row

    # Debug aid: show the first candidate link. Guarded because a sheet
    # without a hyperlink in that cell would otherwise raise and abort the run.
    first_hyperlink = ws.cell(row=3, column=6).hyperlink
    if first_hyperlink is not None:
        print(first_hyperlink.target)

    # Collect (name, link, country) for every row with a usable link.
    # NOTE(review): rows 1-2 are presumably headers -- confirm against the file.
    list_link = []
    # max_row is the index of the last used row, so the range must include it.
    for bond in range(3, last_row + 1):
        hyperlink = ws.cell(row=bond, column=6).hyperlink
        link = hyperlink.target if hyperlink is not None else None
        # Skip cells without a link target and relative "../.." links.
        if not isinstance(link, str) or "../.." in link:
            continue
        name = ws.cell(row=bond, column=1).value
        country = ws.cell(row=bond, column=2).value
        list_link.append((name, link, country))

    errors = []
    for i, entry in enumerate(list_link, start=1):
        _name, url, country = entry
        print("Downloading file: ", i)
        try:
            # The context manager releases the connection even when the body
            # is never read (non-200 responses).
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print('Failed to download file')
                    # Record HTTP failures too, so the summary is complete.
                    errors.append(entry)
                    continue
                pdf_file_name = os.path.basename(url)
                # A non-string country (e.g. an empty cell) raises TypeError
                # here and is reported as a failed entry below.
                target_name = country + "_" + sheet + "_" + pdf_file_name
                with open(target_name, 'wb') as file:
                    file.write(response.content)
            print('File downloaded successfully')
        except (requests.RequestException, OSError, TypeError) as exc:
            print("file {} had a problem".format(i))
            print(entry)
            print("reason: {}".format(exc))
            errors.append(entry)

    print("All PDF files downloaded")
    print("Except following errors:")
    print(errors)
    print("total errors: {}".format(len(errors)))
# Run the downloader once for every worksheet name in the workbook.
for sheet_name in wb.sheetnames:
    download_pdf(sheet_name)