-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
109 lines (84 loc) · 3.15 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
# --- Fetch the Google News results page with selenium -----------------------
# Options for the Chrome window opened by selenium.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# URL of the news search whose headlines are scraped.
url = "https://news.google.com/search?q=kpop"
# Seconds to wait after every scroll step so newly revealed content can load.
scroll_pause_time = 1

# Create a webdriver with the desired options.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Give the page two seconds to finish its initial load.
    time.sleep(2)

    # The screen height is the step size used to track how far we have scrolled.
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1
    while True:
        # Scroll down to i screen-heights from the top; the offset is handed to
        # the script as an argument instead of being formatted into the string.
        driver.execute_script(
            "window.scrollTo(0, arguments[0]);", screen_height * i)
        i += 1
        # Pause for the scroll to complete.
        time.sleep(scroll_pause_time)
        # Re-read the total height after each scroll, since it can change
        # once more content has been loaded.
        scroll_height = driver.execute_script(
            "return document.body.scrollHeight;")
        # Stop once the next scroll target lies beyond the end of the page.
        if screen_height * i > scroll_height:
            break

    # Grab the fully scrolled page's source code.
    page_source = driver.page_source
finally:
    # Always close the browser, even when loading or scrolling raised.
    driver.quit()

# Parse the captured HTML; the browser is no longer needed for this step.
soup = BeautifulSoup(page_source, 'html5lib')
# --- Count how often each group is mentioned in the headlines ---------------
def _mentions(group, title):
    """Return True if *title* contains *group* as written or upper-cased."""
    return group in title or group.upper() in title


# Read the group names, one per line. Blank lines are skipped: an empty
# string is a substring of every title and would be counted for every headline.
with open('groups.txt', 'r', encoding='utf-8') as text_file:
    groups = [line for line in text_file.read().splitlines() if line.strip()]

# Extract every headline's text once so both passes below share the same list.
titles = []
for heading in soup.find_all('h3', class_='ipQwMb ekueJc RD0gLb'):
    link = heading.find('a', class_='DY5T1d RZIKme')
    # Skip headings that lack the expected link instead of failing on None.
    if link is not None:
        titles.append(link.text)

# Map each group to the number of headlines that mention it. Groups are
# inserted in the order they are first matched.
groups_found = dict()
for title in titles:
    for group in groups:
        if _mentions(group, title):
            groups_found[group] = groups_found.get(group, 0) + 1

# Write the summary file: each matched group, followed by the headlines that
# mention it (prefixed with "-- "), followed by an empty line. UTF-8 is used
# explicitly so non-ASCII headlines do not depend on the locale's encoding.
with open("kpopsumrev.txt", "w", encoding="utf-8") as fileOut:
    for group in groups_found:
        fileOut.write(group + "\n")
        for title in titles:
            if _mentions(group, title):
                fileOut.write("-- " + title + "\n")
        fileOut.write("\n")
# --- Plot the mention counts as a bar chart ---------------------------------
# Rank the (group, count) pairs from most to least mentioned.
ranked_pairs = sorted(
    groups_found.items(), key=lambda pair: pair[1], reverse=True)
groups_sorted = dict(ranked_pairs)

# Split the ranked pairs into bar labels and bar heights.
names = [group for group, _ in ranked_pairs]
values = [count for _, count in ranked_pairs]
bar_positions = range(len(groups_sorted))

# One pink bar per group, labelled with the group's name.
plt.bar(bar_positions, values, tick_label=names, color='pink')
plt.title("Kpop Group Relevance in Google News")
plt.ylabel("Number of Times Appeared")
plt.xlabel("Groups")
# Rotate the group names so they are drawn vertically.
plt.xticks(bar_positions, groups_sorted, rotation=90)
# Adjust the layout so the names aren't cut off.
plt.tight_layout()
# Save a copy of the chart, then display it.
plt.savefig('results.png', dpi=400)
plt.show()