-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
94 lines (78 loc) · 3.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from atexit import register
from lxml.html import fromstring
from collections import defaultdict
from json import dump
import pendulum
# Use German names wherever pendulum formats dates for this script.
pendulum.set_locale('de')

# Work out the current semester code: four-digit year + "S" (summer) or "W" (winter).
now = pendulum.now()
year, month = now.year, now.month
if 2 <= month <= 7:
    # February through July is mapped to the summer semester of this year.
    SEM = f"{year}S"
elif month == 1:
    # January is mapped to the winter semester labelled with the previous year.
    SEM = f"{year-1}W"
else:
    # August through December is mapped to this year's winter semester.
    SEM = f"{year}W"

# The current semester plus the same kind of semester one year earlier
# (e.g. "2024W" and "2023W").
# NOTE(review): presumably the fallback for courses that run once a year - confirm.
semesters = [SEM, str(int(SEM[:4]) - 1) + SEM[-1]]
def collapse_whitespace(s):
    """Return *s* with every run of whitespace replaced by a single space.

    Leading and trailing whitespace is dropped as well, because the string
    is split on whitespace and re-joined.
    """
    words = s.split()
    return ' '.join(words)
# Scraped result: subject name -> list of course records.
data = defaultdict(list)

# Start Chrome with the chromedriver path returned by webdriver_manager.
# NOTE(review): the driver path is passed positionally; newer Selenium
# releases expect it via a Service object - confirm against the pinned version.
driver = Chrome(ChromeDriverManager().install())
# driver.set_window_rect(0, 0, 1280, 1024)

# Make sure the browser is shut down when the interpreter exits.
register(driver.quit)

# Implicit wait of 10 seconds for element lookups, plus an explicit
# 10-second wait object for condition polling.
driver.implicitly_wait(10)
wait = WebDriverWait(driver, 10)

# Initial page load.
# NOTE(review): the scraping loop loads this same URL again for every
# semester, so this call looks redundant - presumably it warms up the
# session; confirm before removing.
driver.get("https://tiss.tuwien.ac.at/curriculum/public/curriculum.xhtml?dswid=5025&dsrid=300&key=64576")
# Scrape the curriculum table once per semester and collect the courses
# grouped by subject in `data` (subject name -> list of course dicts).
for semester in semesters:
    # Reload the page so each semester is selected from a fresh view.
    driver.get("https://tiss.tuwien.ac.at/curriculum/public/curriculum.xhtml?dswid=5025&dsrid=300&key=64576")
    try:
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4.3; this form also works on older releases.
        Select(driver.find_element(By.ID, "j_id_2d:semesterSelect")).select_by_value(semester)
    except NoSuchElementException:
        # The dropdown or the requested semester option is not available:
        # skip this semester instead of aborting the whole run.
        continue

    # The table is filled in asynchronously; wait until a course that is
    # expected to be listed shows up before reading the markup.
    table_locator = (By.CSS_SELECTOR, "tbody.ui-datatable-data")
    wait.until(EC.text_to_be_present_in_element(table_locator, "Ausgewählte Kapitel der Wahrscheinlichkeitstheorie (AKWTH)"))
    html = driver.find_element(*table_locator).get_attribute("innerHTML")
    soup = fromstring(html)

    # Subject heading that the following course rows belong to.
    current_subject = ""
    for row in soup.cssselect("tr"):
        cells = row.cssselect("td")
        # A div without a class attribute yields None; treat it as "no classes"
        # so the membership tests below cannot raise TypeError.
        classes = cells[0].cssselect("div")[0].get("class") or ""

        # Level-1 rows with a bold label are subject headings.
        if "nodeTable-level-1" in classes and cells[0].cssselect("div > span.bold"):
            current_subject = cells[0].text_content().strip()
            print(f"\n{current_subject}")
            print("-"*128)
            continue

        # Only level-4 rows describe individual courses.
        if "nodeTable-level-4" not in classes:
            continue

        links = row.cssselect("div.courseTitle > a")
        if not links:
            raise ValueError("No link for", cells[0].text_content())

        # First cell reads "<course number> <kind> <semester> <title...>".
        # collapse_whitespace already normalises newlines and outer whitespace.
        cnum, kind, sem, title = collapse_whitespace(cells[0].text_content()).split(maxsplit=3)
        if "Distance Learning" in title:
            # Drop the "Distance Learning" marker and anything after it.
            title = title.split("Distance Learning")[0].strip()
        if sem not in semesters:
            continue

        link = links[0].get("href")
        # Explicit check instead of `assert`: assertions vanish under -O, and
        # indexing an empty or missing href would raise an unrelated error.
        if not link or not link.startswith('/'):
            raise ValueError("Unexpected course link", link)
        print(cnum, " ", kind, title, link)

        # Keep only the first occurrence of a course number per subject, so
        # the entry from the earlier-listed semester wins.
        entries = data[current_subject]
        if all(entry['cnum'] != cnum for entry in entries):
            entries.append({
                'cnum': cnum,
                'kind': kind,
                'sem': sem,
                'title': title,
                'link': link,
                'ects': cells[3].text_content()
            })
# Refuse to overwrite the published file with an empty result. This is an
# explicit check rather than `assert`, because assertions are removed when
# Python runs with -O.
if not data:
    raise RuntimeError("No courses were scraped; refusing to write public/data.json")

# Write the scraped courses, the semesters they cover and a human-readable
# timestamp of this run. json.dump escapes non-ASCII characters by default,
# so the explicit encoding only pins down what was already being written.
with open("public/data.json", "w", encoding="utf-8") as f:
    dump({
        'data': data,
        'sem': semesters,
        'updated': pendulum.now().format("dddd, DD. MMMM YYYY HH:mm:ss")
    }, f, sort_keys=True, indent=4)