This repository has been archived by the owner on Feb 15, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
aber_past_paper_retriever.py
235 lines (176 loc) · 8.34 KB
/
aber_past_paper_retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""Aberystwyth University Automatic Past Paper Retriever.
This module provides an automated solution for retrieving past papers from the
Aberystwyth University Past Papers page for a given module code.
If the user is authorised to access past papers with their provided
credentials, papers from all available years for a given module are downloaded
into their own directory.
Requirements:
requests
lxml.html
Usage:
Use the CLI or GUI interface, both of which utilise this class.
"""
import os
import requests
import lxml.html
import re
# Scheme and host that every site-relative link scraped from a page is
# prefixed with.
WEBSITE_BASE_URL = 'https://www.aber.ac.uk'
# Landing page of the past paper section of the website.
DEPARTMENT_LISTING_URL = '{}/en/past-papers/'.format(WEBSITE_BASE_URL)
class PaperRetriever:
    """Represents a Past Paper Retriever from the Aber Uni Website.

    Configure an instance through the ``set_*`` methods and then call
    ``retrieve``. Papers are written relative to the current working
    directory, so call ``move_into_module_folder`` first to have them land in
    ``<destination_folder>/<module_code>``.

    Attributes:
        auth_header (HTTPBasicAuth Header): stores user credential header
        department_url (str): URL for the department
        graduate_level (str): the graduate level of the user
        destination_folder (str): a path for the output folder for PDF's found
        module_code (str): the validated module code
    """

    # Location of the links on both the department page and semester pages.
    _LINK_XPATH = '/html/body/div[2]/main/div/article/div//a/@href'

    def __init__(self):
        """Initialises every attribute to None until its setter is called."""
        self.auth_header = None
        self.department_url = None
        self.graduate_level = None
        self.destination_folder = None
        self.module_code = None

    def set_auth_header(self, username, password):
        """Sets the authentication header attribute.

        Args:
            username (str): the username
            password (str): the password
        """
        self.auth_header = requests.auth.HTTPBasicAuth(username, password)

    def set_department_url(self, department_url):
        """Sets the department URL attribute.

        A URL ending in '/smba/' (the Business School intermediary graduate
        level page) is extended with 'ugrad/' when the graduate level is
        'Undergraduate' and with 'postg/' otherwise.

        Args:
            department_url (str): the new value for the department URL attribute

        Raises:
            ValueError: if the URL ends in '/smba/' and no graduate level has
                been set yet.
        """
        if department_url.endswith('/smba/'):
            if self.graduate_level is None:
                raise ValueError(
                    'Graduate level must be set before a Business School URL')
            if self.graduate_level == 'Undergraduate':
                department_url += 'ugrad/'
            else:
                department_url += 'postg/'
        self.department_url = department_url

    def set_module_code(self, module_code):
        """Sets the module code attribute.

        Args:
            module_code (str): the new value for the module code attribute;
                2-3 upper case letters followed by 4-5 digits

        Raises:
            ValueError: if the module code does not have that format.
        """
        # fullmatch (unlike match with '$') also rejects a trailing newline
        if re.fullmatch(r'[A-Z]{2,3}[0-9]{4,5}', module_code) is None:
            raise ValueError('Invalid Module Code Provided')
        self.module_code = module_code

    def set_destination_folder(self, destination_folder):
        """Sets the destination folder path attribute.

        Args:
            destination_folder (str): the new value for the destination path
        """
        self.destination_folder = destination_folder

    def set_graduate_level(self, graduate_level):
        """Sets the user graduate level attribute.

        Args:
            graduate_level (str): the new value for the graduate level attribute
        """
        self.graduate_level = graduate_level

    def move_into_module_folder(self):
        """Moves into the module folder, creating it if it does not exist.

        Changes the process working directory to
        ``<destination_folder>/<module_code>``.

        Raises:
            OSError: if a directory cannot be entered or created.
        """
        os.chdir(self.destination_folder)
        # an already existing folder is fine; any other failure is reported
        os.makedirs(self.module_code, exist_ok=True)
        os.chdir(self.module_code)

    def move_out_of_module_folder(self):
        """Moves out of the module folder to its parent directory."""
        os.chdir('..')

    def is_existing_file(self, file_url):
        """Checks if a file exists at the given URL.

        Sends an authenticated HEAD request and looks only at the
        Content-Length response header; the status code is not inspected.

        Args:
            file_url (str): the url to retrieve the file from

        Returns:
            bool: True if the response carries a non-empty Content-Length
                header, False otherwise.
        """
        head_response = requests.head(file_url, auth=self.auth_header)
        # a missing header is treated like an empty one instead of raising
        return head_response.headers.get('Content-Length', '') != ''

    def get_paper(self, pdf_url):
        """Gets a local copy of the paper from the given URL.

        The file is written to the current working directory and is named
        after the last two path segments of the URL joined by '-'.

        Args:
            pdf_url (str): the url to retrieve the file from

        Raises:
            ValueError: if no file is found at the URL.
            requests.HTTPError: if the download response is not successful.
        """
        if not self.is_existing_file(pdf_url):
            raise ValueError('Failed to find PDF at URL: ' + pdf_url)

        pdf_response = requests.get(pdf_url, auth=self.auth_header)
        pdf_response.raise_for_status()

        # combine the semester ID and the PDF name
        local_pdf_path = '-'.join(pdf_url.split('/')[-2:])
        with open(local_pdf_path, 'wb') as local_file:
            local_file.write(pdf_response.content)

        print('Retrieved', pdf_url)
        print('Local PDF path', local_pdf_path)

    def _get_page_links(self, page_url):
        """Gets all content links from a page as absolute URLs.

        Args:
            page_url (str): the url of the page to read links from

        Returns:
            list (str): every href found, prefixed with the website base URL
        """
        page_response = requests.get(page_url, stream=True)
        try:
            # have the raw stream decode any content encoding before parsing
            page_response.raw.decode_content = True
            page_tree = lxml.html.parse(page_response.raw)
            hrefs = page_tree.xpath(self._LINK_XPATH)
        finally:
            # streamed responses keep their connection until closed
            page_response.close()
        return [WEBSITE_BASE_URL + href for href in hrefs]

    def get_semester_page_links(self):
        """Gets all semester page links from a department's exam paper page.

        Returns:
            list (str): a list of URLs leading to all semester exam pages
        """
        return self._get_page_links(self.department_url)

    def find_module_paper_url(self, paper_urls):
        """Searches for a module code within a list of past paper file URLs.

        Args:
            paper_urls (list str): a list of URLs leading to all exam paper files

        Returns:
            str containing the first URL that includes the module code,
            None if no URL does
        """
        return next((url for url in paper_urls if self.module_code in url), None)

    def get_module_paper_for_semester(self, paper_urls):
        """Gets exam paper file (if exists) from a list of exam paper file links.

        Args:
            paper_urls (list str): a list of URLs leading to all exam paper files
        """
        paper_url = self.find_module_paper_url(paper_urls)
        if paper_url is not None:
            self.get_paper(paper_url)

    def get_all_module_papers(self, semester_page_links):
        """Gets all exam papers for a given module for all semesters.

        Args:
            semester_page_links (list str): a list of URLs leading to semesters
        """
        for semester_url in semester_page_links:
            paper_links = self.get_paper_links_for_semester(semester_url)
            self.get_module_paper_for_semester(paper_links)

    def get_paper_links_for_semester(self, semester_url):
        """Gets all exam paper file links from a semester's exam paper page.

        Args:
            semester_url (str): the url for the semester's exam paper page

        Returns:
            list (str): a list of URLs leading to all exam paper files
        """
        return self._get_page_links(semester_url)

    def retrieve(self):
        """Retrieves all module exam papers for all years.

        Returns:
            str: a status message for the user
        """
        print('Retrieving from', self.department_url, 'for module', self.module_code)
        semester_page_links = self.get_semester_page_links()
        self.get_all_module_papers(semester_page_links)
        return 'Retrieved any module papers found - check the destination folder.'