giantsArticles.py
import tweepy
import requests
import json
from bs4 import BeautifulSoup
from lxml import etree
from urllib.parse import urljoin

API_KEYS_FILE = "api_keys.txt"
DB_FILE = "history.json"

# Some sites serve different markup (or block the request outright) when no
# referer is set, so pretend we arrived from a Google search result.
HEADER = {
    'referer': 'https://www.google.com/'
}
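# get_api() below reads four lines from API_KEYS_FILE, so the file is assumed
# to hold one credential per line in this order (a layout inferred from how
# get_api() indexes the lines, not stated anywhere else):
#   line 1: consumer key
#   line 2: consumer secret
#   line 3: access token
#   line 4: access token secret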
class Article:
    """A scraped article: its headline and URL."""
    title: str
    link: str

    def __init__(self, t, l):
        self.title = t
        self.link = l
class Website:
    """A source site plus the XPaths used to locate its newest article."""
    name: str
    link: str
    title_xpath: str
    link_xpath: str

    def __init__(self, n, l, atx, alx):
        self.name = n
        self.link = l
        self.title_xpath = atx
        self.link_xpath = alx
# Each entry points at the top story on a site's Giants page. The XPaths are
# tied to each site's current layout and need updating whenever that changes.
sites = [
    Website("The Athletic", "https://theathletic.com/team/sf-giants", '//*[@id="body-container"]/div[3]/div/div/div[1]/div/div/a/div/span/div[2]/div[1]/div/h4/span', '//*[@id="body-container"]/div[3]/div/div/div[1]/div/div/a'),
    # Website("SF Chronicle", "https://www.sfchronicle.com/sports/giants", '//*[@id="__next"]/main/div[4]/div/div/div/div[1]/div/div[2]/h2/a', '//*[@id="__next"]/main/div[4]/div/div/div/div[1]/div/div[2]/h2/a'),  # CAPTCHA protected
    Website("NBCS Bay Area", "https://www.nbcsports.com/bayarea/giants", '//*[@id="main"]/div[2]/div[2]/div[1]/div[2]/div[2]/a', '//*[@id="main"]/div[2]/div[2]/div[1]/div[2]/div[2]/a'),
    # Website("KNBR", "https://www.knbr.com/giantsnews", '//*[@id="wp--skip-link--target"]/div[1]/div[1]/ul/li[1]/div/div[2]/h2/a', '//*[@id="wp--skip-link--target"]/div[1]/div[1]/ul/li[1]/div/div[2]/h2/a'),  # Not working
    Website("Around The Foghorn", "https://aroundthefoghorn.com", '//*[@id="mm-root"]/main/div[2]/section[1]/div/article/div/a', '//*[@id="mm-root"]/main/div[2]/section[1]/div/article/div/a'),
    Website("The Mercury News", "https://www.mercurynews.com/sports/mlb/san-francisco-giants/", '//*[@id="main"]/section/article/div/header/h3/a/span', '//*[@id="main"]/section/article/figure/div/a')
]
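# Adding a source is a matter of appending another Website entry with the
# page URL and the two XPaths, e.g. (a hypothetical entry, not a real feed):
#   Website("Example Blog", "https://example.com/giants",
#           '//article[1]//h2/a', '//article[1]//h2/a')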
def get_latest_article(website: Website) -> Article:
    """Scrape a site's landing page and return its top article, or None."""
    home = requests.get(website.link, headers=HEADER)
    home.raise_for_status()  # fail loudly instead of parsing an error page
    content = BeautifulSoup(home.content, 'html.parser')
    parsed_content = etree.HTML(str(content))

    # The headline text is sometimes nested several elements deep, so walk
    # down the first-child chain collecting text (or title attributes).
    title = ""
    element = parsed_content.xpath(website.title_xpath)
    while len(element) > 0:
        if element[0].text is not None:
            title += element[0].text
        elif element[0].get('title') is not None:
            title += element[0].get('title')
        element = element[0].getchildren()

    link = ""
    element = parsed_content.xpath(website.link_xpath)
    if len(element) > 0:
        article_link = element[0].get('href')
        if article_link:
            # hrefs are often relative, so resolve against the page URL
            link = urljoin(website.link, article_link)

    if title == "" or link == "":
        return None
    return Article(title.strip(), link.strip())
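# A quick way to check a single scraper by hand when an XPath breaks (a
# sketch to paste into a REPL, not part of the script):
#   article = get_latest_article(sites[0])
#   if article:
#       print(article.title, "->", article.link)
#   else:
#       print("scrape failed")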
def already_posted(site: str, article: Article, history: dict) -> bool:
    """Return True if this article was already posted; record it otherwise."""
    if site not in history:
        history[site] = article.link
        return False
    if history[site] == article.link:
        return True
    history[site] = article.link
    return False
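# The history is a flat {site name: last posted link} map, so DB_FILE ends up
# looking something like this (illustrative values, not real output):
#   {"The Athletic": "https://theathletic.com/...",
#    "NBCS Bay Area": "https://www.nbcsports.com/bayarea/giants/..."}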
def read_history() -> dict:
    """Load the posted-article history, falling back to an empty dict."""
    try:
        with open(DB_FILE, 'r') as file:
            history = json.load(file)
    except (OSError, json.JSONDecodeError):
        # Missing or corrupt history file: start fresh
        history = {}
    return history
def write_history(history: dict):
    """Persist the posted-article history to disk."""
    with open(DB_FILE, 'w') as file:
        json.dump(history, file)
    print("Successfully saved post history!")
def get_api():
    """Build a tweepy client from the keys stored in API_KEYS_FILE."""
    with open(API_KEYS_FILE, "r") as api_keys:
        keys = [x.strip() for x in api_keys.readlines()]
    # OAuth 1.0a user-context credentials, which posting on X requires
    client = tweepy.Client(
        consumer_key=keys[0],
        consumer_secret=keys[1],
        access_token=keys[2],
        access_token_secret=keys[3]
    )
    return client
def post(article, api):
    """Tweet the article's title followed by its link."""
    print("Posting link to", article.link)
    text = article.title + "\n\n" + article.link
    api.create_tweet(text=text)
def main():
    history = read_history()
    x_api = get_api()
    for site in sites:
        print("\nProcessing", site.name)
        try:
            article = get_latest_article(site)
            if article is None:
                continue
            if already_posted(site.name, article, history):
                print("Duplicate article for", site.name, "--", article.link)
                continue
            post(article, x_api)
        except Exception as e:
            print("Encountered an error while processing", site.name, ":", type(e).__name__, "-", e)
    # Save even if some sites failed, so successful posts aren't repeated
    write_history(history)
    return 0
if __name__ == "__main__":
    main()
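# The script posts at most one new article per site per run, so it is meant
# to be run repeatedly. One way to schedule it (an assumed setup; the path is
# a placeholder): a cron entry running it every 15 minutes:
#   */15 * * * * cd /path/to/repo && python3 giantsArticles.py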