# tasks.py
"""Extracts all the paper metadata."""
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from urllib import request
import pandas as pd
import twint
import nest_asyncio
import datetime
import logging
import json
import time
import os
def get_char_sequence(string: str) -> str:
"""Remove non-alpha characters from a string and convert it to lowercase."""
return "".join(filter(str.isalpha, string.lower()))


def get_semantic_scholar_id(paper_title: str, driver: webdriver.Firefox) -> str:
    """Searches Semantic Scholar to see if the title exists."""
    search = driver.find_elements_by_class_name("form-input")[0]
    search.clear()
    search.send_keys(paper_title)
    search.send_keys(Keys.RETURN)
    try:
        WebDriverWait(driver, 3).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//a[@data-selenium-selector='title-link']")
            )
        )
    except TimeoutException:
        logging.warning(f"Semantic Scholar timeout with {paper_title}!")
    candidate_char_sequences = []
    papers = driver.find_elements_by_xpath("//a[@data-selenium-selector='title-link']")
    for paper in papers[:3]:
        soup = BeautifulSoup(paper.get_attribute("innerHTML"), features="lxml")
        candidate_title = soup.get_text()
        # determine if the papers have the same title
        char_sequence = get_char_sequence(candidate_title)
        candidate_char_sequences.append(char_sequence)
    # check for duplicate titles
    if len(candidate_char_sequences) != len(set(candidate_char_sequences)):
        raise Exception(f"More than 1 paper with the title: {paper_title}!")
    paper_seq = get_char_sequence(paper_title)
    for i, c_seq in enumerate(candidate_char_sequences):
        if c_seq == paper_seq:
            paper_url = papers[i].get_attribute("href")
            paper_id = paper_url[paper_url.rfind("/") + 1 :]
            return paper_id
    raise Exception(f"Paper not found: {paper_title}!")


def get_abstract(pdf_url: str) -> str:
    """Get the abstract from the CVPR pdf_url."""
    html_url = pdf_url.replace("/papers/", "/html/").replace(".pdf", ".html")
    r = requests.get(html_url)
    soup = BeautifulSoup(r.text, features="lxml")
    abstract = soup.select("div#abstract")[0].get_text().strip()
    return abstract
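
# Hedged usage sketch for get_abstract: the function only rewrites the open-access
# "/papers/...pdf" path to its "/html/...html" counterpart and scrapes div#abstract.
# The URL below is a hypothetical placeholder, not taken from this repository:
#   get_abstract("https://openaccess.thecvf.com/content/CVPR2021/papers/Example_CVPR_2021_paper.pdf")
#   # requests .../content/CVPR2021/html/Example_CVPR_2021_paper.html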


def get_paper_data(
    cvpr_papers_url: str = "https://openaccess.thecvf.com/CVPR2021?day=all",
    fetch_semantic_scholar_id: bool = True,
    fetch_abstract: bool = True,
    start_at_paper: int = 0,
    sleep_increment: int = 100,
):
    """Get the metadata for each paper.

    The abstracts are on a different page than the titles and authors,
    so fetch_abstract specifies whether to request that extra page. Similarly,
    the Semantic Scholar IDs must be fetched from a separate page.
    start_at_paper is used when continuing after an interruption.
    Semantic Scholar hits rate limits roughly every 100 papers, hence the
    sleep_increment: every sleep_increment papers the scraper sleeps for 3 minutes.
    """
    if fetch_semantic_scholar_id:
        driver = webdriver.Firefox(executable_path="./geckodriver.exe")
        driver.get("https://semanticscholar.org")
    r = requests.get(cvpr_papers_url)
    soup = BeautifulSoup(r.text, features="lxml")
    titles = soup.find_all("dt", class_="ptitle")[start_at_paper:]
    for i, title_tag in enumerate(titles):
        if fetch_semantic_scholar_id and i % sleep_increment == 0 and i != 0:
            # Semantic Scholar starts rejecting requests after roughly 100 searches,
            # so pause periodically to stay under the rate limit.
            print(f"Starting 3 minute sleep at {datetime.datetime.now().time()}")
            time.sleep(3 * 60)
        print(f"Starting {i + 1}/{len(titles)}")
        paper_title = title_tag.get_text()
        paper = dict(
            arXiv="", title=paper_title, pdf="", authors=[], abstract="", s2id=""
        )
        if fetch_semantic_scholar_id:
            try:
                paper["s2id"] = get_semantic_scholar_id(
                    paper_title=paper_title, driver=driver
                )
            except Exception as e:
                logging.error(str(e))
        author_tags = title_tag.find_next("dd")
        authors = []
        for author_tag in author_tags.find_all("a"):
            authors.append(author_tag.get_text())
        paper["authors"] = authors
        paper["arXiv"] = None
        link_tags = author_tags.find_next("dd")
        for link_tag in link_tags.find_all("a"):
            tag = link_tag.get_text()
            if tag == "arXiv":
                paper["arXiv"] = link_tag["href"]
            elif tag == "pdf":
                pdf_url: str = link_tag["href"]
                paper["pdf"] = pdf_url
                if fetch_abstract:
                    paper["abstract"] = get_abstract(pdf_url=pdf_url)
        if not paper["pdf"]:
            logging.warning(f"No PDF found for {paper_title}!")
            break
        paper_id = paper["pdf"][
            paper["pdf"].rfind("/") + 1 : -len("_CVPR_2021_paper.pdf")
        ]
        path = f"paper-data/{paper_id}.json"
        # prevents accidentally overwriting additional data from the paper
        if os.path.exists(path):
            with open(path, "r") as f:
                existing_data = json.load(f)
            existing_data.update(paper)
            paper = existing_data
        with open(path, "w") as f:
            f.write(json.dumps(paper, indent=2))
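
# A minimal usage sketch, assuming geckodriver.exe sits next to this file and the
# paper-data/ directory already exists (assumptions, not stated in the file itself):
#   get_paper_data(fetch_semantic_scholar_id=False)  # skip the Selenium / Semantic Scholar step
#   get_paper_data(start_at_paper=250)               # resume after an interruption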


def add_2021_paper_poster_sessions():
    """Add the poster-session day to each paper's JSON file in paper-data/."""
    dates = dict(
        Monday="https://openaccess.thecvf.com/CVPR2021?day=2021-06-21",
        Tuesday="https://openaccess.thecvf.com/CVPR2021?day=2021-06-22",
        Wednesday="https://openaccess.thecvf.com/CVPR2021?day=2021-06-23",
        Thursday="https://openaccess.thecvf.com/CVPR2021?day=2021-06-24",
        Friday="https://openaccess.thecvf.com/CVPR2021?day=2021-06-25",
    )
    for session_day, date_url in dates.items():
        r = requests.get(date_url)
        soup = BeautifulSoup(r.text, features="lxml")
        titles = soup.find_all("dt", class_="ptitle")
        for i, title in enumerate(titles):
            print(f"Session: {session_day}, Starting {i}/{len(titles)}")
            pdf_link = title.find_next("a")["href"]
            paper_id = pdf_link[pdf_link.rfind("/") + 1 : -len("_CVPR_2021_paper.html")]
            data_file = f"paper-data/{paper_id}.json"
            with open(data_file, "r") as f:
                data = json.load(f)
            data["posterSession"] = session_day
            with open(data_file, "w") as f:
                f.write(json.dumps(data, indent=2))


def query_twitter(search: str, output_path: str) -> None:
    """Save a csv file containing all of the tweets for a search query."""
    c = twint.Config()
    c.Search = search
    c.Stats = True
    c.Store_csv = True
    c.Output = output_path
    twint.run.Search(c)


def add_to_query(query: str, to_add: str, in_quotes: bool = True) -> str:
    """Append an (optionally quoted) term to a search query, joined with OR."""
    if in_quotes:
        to_add = f'"{to_add}"'
    query = to_add if not query else f"{query} OR {to_add}"
    return query
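
# A small sketch of how the Twitter search query is assembled by these helpers
# (the arXiv id below is a placeholder, not a real paper):
#   q = add_to_query("", "arxiv.org/abs/0000.00000")
#   q = add_to_query(q, "Some Paper Title")
#   # q == '"arxiv.org/abs/0000.00000" OR "Some Paper Title"'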


def get_twitter_data(start_at_paper: int = 0):
    """Download all the twitter data locally."""
    data_dir = "paper-data"
    paper_data_paths = [
        paper for paper in os.listdir(data_dir) if paper.endswith(".json")
    ]
    # these titles match too many tweets that are not specific to the paper
    skip_titles = set(["Learning by Watching"])
    # make it possible to debug in an interactive environment
    nest_asyncio.apply()
    for i, paper in enumerate(paper_data_paths[start_at_paper:]):
        print(f"Starting {i}/{len(paper_data_paths[start_at_paper:])}")
        with open(os.path.join(data_dir, paper), "r") as f:
            paper_data = json.load(f)
        query = ""
        if paper_data["arXiv"] is not None:
            # drop the scheme so both http and https links match
            query = add_to_query(query, paper_data["arXiv"][len("http://") :])
        if paper_data["title"] not in skip_titles:
            query = add_to_query(query, paper_data["title"])
        if query:
            query_twitter(
                search=query, output_path=f"twitter/{paper[: paper.rfind('.json')]}.csv"
            )


def parse_twitter_data():
    """Update the paper-data directory with the data from Twitter."""
    data_dir = "twitter"
    twitter_data_paths = [
        paper for paper in os.listdir(data_dir) if paper.endswith(".csv")
    ]
    for i, path in enumerate(twitter_data_paths):
        print(f"Starting {i}/{len(twitter_data_paths)}")
        df = pd.read_csv(f"{data_dir}/{path}")
        retweets = int(df["retweets_count"].sum())
        likes = int(df["likes_count"].sum())
        replies = int(df["replies_count"].sum())
        # update the paper data
        paper_data_path = f"paper-data/{path[:path.rfind('.csv')]}.json"
        with open(paper_data_path, "r") as f:
            data = json.load(f)
        data["twitter"] = dict(retweets=retweets, likes=likes, replies=replies)
        with open(paper_data_path, "w") as f:
            f.write(json.dumps(data, indent=2))


def update_citation_data(start_at_paper: int = 0):
    """Update the citation count for each paper that has a Semantic Scholar ID."""
    paths = [paper for paper in os.listdir("paper-data") if paper.endswith(".json")]
    skip_titles = set(["Meta Pseudo Labels"])
    for i, path in enumerate(paths[start_at_paper:]):
        print(f"Starting {i}/{len(paths[start_at_paper:])}")
        full_path = f"paper-data/{path}"
        with open(full_path, "r") as f:
            paper = json.load(f)
        if not paper["s2id"]:
            continue
        # some papers' citation data differs between the Semantic Scholar site and
        # the API; this seems like a bug in the API, so skip them
        if paper["title"] in skip_titles:
            continue
        url = f"https://api.semanticscholar.org/v1/paper/{paper['s2id']}"
        with request.urlopen(url) as f:
            s2_data = json.loads(f.read())
        paper["citations"] = len(s2_data["citations"])
        with open(full_path, "w") as f:
            f.write(json.dumps(paper, indent=2))


def update_tweet_ids(df, file_id):
    """Store the tweet ids, sorted by like count, in the paper's JSON file."""
    df.sort_values("likes_count", ascending=False, inplace=True)
    tweet_ids = df["id"].values.tolist()
    # store the ids as strings to avoid precision issues in GraphQL
    tweet_ids = [str(uuid) for uuid in tweet_ids]
    data_file = "paper-data/" + file_id + ".json"
    with open(data_file, "r") as f:
        data = json.load(f)
    data["twitter"]["ids"] = tweet_ids
    with open(data_file, "w") as f:
        f.write(json.dumps(data, indent=2))


def add_tweet_ids_to_json():
    """Assumes twitter/ is already full of the scraped .csv files of tweets."""
    for tweets_file in os.listdir("twitter"):
        df = pd.read_csv(f"twitter/{tweets_file}")
        update_tweet_ids(df, tweets_file[: -len(".csv")])


def add_manual_data():
    """Scrape stats for manually listed tweets and merge them into the paper data."""
    driver = webdriver.Firefox(executable_path="./geckodriver.exe")
    with open("manual-data.json", "r") as f:
        manual_data = json.load(f)
    for paper_id, extras in manual_data.items():
        df = pd.read_csv(f"twitter/{paper_id}.csv")
        for user, tweet_id in extras:
            driver.get(f"https://twitter.com/{user}/status/{tweet_id}")
            try:
                WebDriverWait(driver, 3).until(
                    EC.visibility_of_element_located(
                        (
                            By.CSS_SELECTOR,
                            ".css-901oao.css-16my406.r-poiln3.r-b88u0q.r-bcqeeo.r-d3hbe1.r-qvutc0",
                        )
                    )
                )
            except TimeoutException:
                logging.warning(f"Twitter timeout with {user}/{tweet_id}")
            likes_count = 0
            retweets_count = 0
            replies_count = 0
            soup = BeautifulSoup(driver.page_source, features="lxml")
            stats = soup.select(
                ".css-901oao.css-16my406.r-poiln3.r-b88u0q.r-bcqeeo.r-d3hbe1.r-qvutc0"
            )
            for stat in stats:
                n = int(stat.get_text().replace(",", ""))
                stat_type = stat.parent()[1].find_next("span").get_text()
                if stat_type == "Retweets":
                    retweets_count = n
                elif stat_type == "Likes":
                    likes_count = n
                elif stat_type == "Quote Tweets":
                    replies_count = n
            df = df.append(
                {
                    "id": tweet_id,
                    "likes_count": likes_count,
                    "retweets_count": retweets_count,
                    "replies_count": replies_count,
                },
                ignore_index=True,
            )
        df.sort_values("likes_count", ascending=False, inplace=True)
        tweet_ids = df["id"].values.tolist()
        # store the ids as strings to avoid precision issues in GraphQL
        tweet_ids = [str(uuid) for uuid in tweet_ids]
        data_file = "paper-data/" + paper_id + ".json"
        with open(data_file, "r") as f:
            data = json.load(f)
        data["twitter"]["ids"] = tweet_ids
        data["twitter"]["replies"] = int(df["replies_count"].sum())
        data["twitter"]["retweets"] = int(df["retweets_count"].sum())
        data["twitter"]["likes"] = int(df["likes_count"].sum())
        with open(data_file, "w") as f:
            f.write(json.dumps(data, indent=2))
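

# A hedged sketch of the overall pipeline order, inferred from the functions above
# rather than stated anywhere in this file; paths ("./geckodriver.exe", paper-data/,
# twitter/, manual-data.json) are assumed to exist as the functions expect.
if __name__ == "__main__":
    get_paper_data()                  # titles, authors, links, abstracts, Semantic Scholar ids
    add_2021_paper_poster_sessions()  # attach the poster-session day to each paper
    get_twitter_data()                # scrape tweets per paper into twitter/*.csv
    parse_twitter_data()              # aggregate likes/retweets/replies into paper-data/
    add_tweet_ids_to_json()           # store the sorted tweet ids for each paper
    add_manual_data()                 # patch in manually collected tweets
    update_citation_data()            # refresh citation counts from the Semantic Scholar API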