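"""Helper functions for the citation-age analysis.

Reads per-field paper statistics from data/byfield/{field}_paper_to_stats.csv,
computes average citation ages within and across fields, ranks papers by the
citations they receive in later epochs, and exports CSV and LaTeX summaries to
the data/ and output/ directories.
"""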
import glob
import os
import ast # To safely evaluate the string representation of the lists
import itertools
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kendalltau, spearmanr
Path("data").mkdir(exist_ok=True)
Path("data/byfield").mkdir(exist_ok=True)
Path("output").mkdir(exist_ok=True)
Path("figures").mkdir(exist_ok=True)
RANDOM_SEED = 42
def compute_average_citation_age_from_field_to_other_fields(
    field: str = "NLP",
):
    """
    Computes the average citation age from a specified field to other fields.

    Args:
        field (str, optional): The field for which to compute the citation age. Defaults to "NLP".

    Returns:
        None

    Example:
        ```python
        compute_average_citation_age_from_field_to_other_fields("NLP")
        ```"""
    # Read data/byfield/{field}_paper_to_stats.csv
    df = pd.read_csv(f"data/byfield/{field}_paper_to_stats.csv")
    # Drop na for outgoing_citation_ages and outgoing_fields
    df = df.dropna(subset=["outgoing_citation_ages", "outgoing_fields"])
    # Convert the string representations of lists to actual lists
    df["outgoing_citation_ages"] = df["outgoing_citation_ages"].apply(ast.literal_eval)
    df["outgoing_fields"] = df["outgoing_fields"].apply(ast.literal_eval)
    # Explode outgoing_citation_ages
    df = df.explode("outgoing_citation_ages")
    # Explode outgoing_fields twice
    df = df.explode("outgoing_fields")  # This explodes the outer list
    df = df.explode("outgoing_fields")  # This explodes the inner list
    # Compute the average citation age from the field to each other field
    average_citation_age = df.groupby("outgoing_fields")[
        "outgoing_citation_ages"
    ].mean()
    # Export to data/byfield/{field}_average_citation_age_to_field.csv
    average_citation_age.to_csv(
        f"data/byfield/{field}_average_citation_age_to_field.csv"
    )
    print(average_citation_age.head())

def compute_citation_age_from_own_field_and_to_other_fields(
    fields: list = [
        "NLP",
        "ML",
        "Psychology",
        "Sociology",
        "Linguistics",
        "Mathematics",
        "Computer science",
    ],
):
    """
    Computes the average citation age from a field to itself and to other fields.

    Args:
        fields (list, optional): A list of fields for which to compute the citation age. Defaults to ["NLP", "ML", "Psychology", "Sociology", "Linguistics", "Mathematics", "Computer science"].

    Returns:
        None

    Example:
        ```python
        compute_citation_age_from_own_field_and_to_other_fields(["NLP", "ML"])
        ```
    """
    for field in fields:
        # Read data/byfield/{field}_paper_to_stats.csv
        df = pd.read_csv(f"data/byfield/{field}_paper_to_stats.csv")
        # Drop na for outgoing_citation_ages and outgoing_fields
        df = df.dropna(subset=["outgoing_citation_ages", "outgoing_fields"])
        # Convert the string representations of lists to actual lists
        df["outgoing_citation_ages"] = df["outgoing_citation_ages"].apply(
            ast.literal_eval
        )
        df["outgoing_fields"] = df["outgoing_fields"].apply(ast.literal_eval)
        # Explode outgoing_citation_ages
        df = df.explode("outgoing_citation_ages")
        # Explode outgoing_fields twice
        df = df.explode("outgoing_fields")  # This explodes the outer list
        df = df.explode("outgoing_fields")  # This explodes the inner list
        # NLP and ML papers are labelled "Computer science" in the field data,
        # so self-citations for those fields are matched against that concept
        self_field = "Computer science" if field in ["NLP", "ML"] else field
        self_mask = df["outgoing_fields"] == self_field
        # Compute the average citation age from the field to itself per year
        average_citation_age_self = (
            df[self_mask]
            .groupby("year")["outgoing_citation_ages"]
            .mean()
            .rename("average_citation_age_to_self")
        )
        # Compute the average citation age from the field to all other fields per year
        # (the complement of the self mask, so self-citations are excluded)
        average_citation_age_other = (
            df[~self_mask]
            .groupby("year")["outgoing_citation_ages"]
            .mean()
            .rename("average_citation_age_to_other_fields")
        )
        # Combine results into one DataFrame
        result = pd.concat(
            [average_citation_age_self, average_citation_age_other], axis=1
        )
        # Export to data/byfield/{field}_average_citation_age_to_self_and_others.csv
        result.reset_index().to_csv(
            f"data/byfield/{field}_average_citation_age_to_self_and_others.csv",
            index=False,
        )

def _sum_citations_in_epoch(citations_list, epoch_start, epoch_end):
"""
Calculates the sum of the "cited_by_count" values for citations within a specified epoch.
Args:
citations_list (list): A list of dictionaries representing citations, each containing a "cited_by_count" and "year" key.
epoch_start (int): The start year of the epoch (inclusive).
epoch_end (int): The end year of the epoch (exclusive).
Returns:
int: The sum of the "cited_by_count" values for citations within the specified epoch.
Example:
```python
citations = [
{"cited_by_count": 10, "year": 2010},
{"cited_by_count": 5, "year": 2015},
{"cited_by_count": 8, "year": 2020},
{"cited_by_count": 3, "year": 2025},
]
epoch_start = 2015
epoch_end = 2025
sum_citations = _sum_citations_in_epoch(citations, epoch_start, epoch_end)
print(sum_citations) # Output: 11
```"""
return sum(
d["cited_by_count"]
for d in citations_list
if epoch_start <= d["year"] < epoch_end
)
def compute_rankings(field):
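    """
    Ranks papers from each publishing epoch by the citations they received in each later epoch.

    Args:
        field (str): The field for which rankings are computed, e.g. "NLP".

    Returns:
        pd.DataFrame: One column per publishing/citing epoch combination, containing paper_ids
        sorted from most to least cited in that citing epoch. Also written to
        output/{field}_rankings.csv.

    Example:
        ```python
        rankings_df = compute_rankings("NLP")
        ```"""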
    # Load the data (only the first 10,000 papers are considered)
    df = pd.read_csv(f"data/byfield/{field}_paper_to_stats.csv")[:10000]
    # Define time periods for analysis
    epochs = [(1980, 1990), (1990, 2000), (2000, 2010), (2010, 2015), (2015, 2020)]
    # Initialize a DataFrame to store the rankings with all unique paper_ids
    rankings_df = pd.DataFrame()
    # Iterate through each combination of publishing and citing epochs
    for i, publish_epoch in enumerate(epochs[:-1]):
        for cite_epoch in epochs[i + 1 :]:
            # Column name for the current combination
            column_name = (
                f"papers_from_{publish_epoch}_ranked_by_citations_from_{cite_epoch}"
            )
            # Filter papers published in the publishing epoch
            filtered_papers = df[
                df["year"].between(publish_epoch[0], publish_epoch[1])
            ].copy()
            # Initialize an empty list to store the citation counts for each paper
            citation_counts = []
            # Iterate through each row in the filtered_papers DataFrame
            for index, row in filtered_papers.iterrows():
                # Initialize the citation count for the current paper
                current_citation_count = 0
                # Convert the string representation of the list to an actual list
                counts_by_year = ast.literal_eval(row['counts_by_year'])
                # Iterate through each count dictionary in the counts_by_year list
                for count in counts_by_year:
                    # Check if the year of the citation is within the citing epoch
                    if cite_epoch[0] <= count['year'] < cite_epoch[1]:
                        # Add the cited_by_count to the current paper's citation count
                        current_citation_count += int(count['cited_by_count'])
                # Append the calculated citation count for the current paper to the list
                citation_counts.append(current_citation_count)
            # Add the calculated citation counts as a new column to the filtered_papers DataFrame
            filtered_papers[column_name + "_count"] = citation_counts
            print(filtered_papers[column_name + "_count"].isna().sum())
            print(filtered_papers[column_name + "_count"].count())
            # Fill filtered papers in column name with 0s if NaN
            # filtered_papers[column_name + "_count"] = filtered_papers[
            #     column_name + "_count"
            # ].fillna(0)
            # Sort the papers by the citation counts
            filtered_papers = filtered_papers.sort_values(
                by=column_name + "_count", ascending=False
            )
            # Rename the paper_id column to the column_name
            filtered_papers = filtered_papers.rename(columns={"paper_id": column_name})
            # Add the paper_ids to the rankings DataFrame
            rankings_df = pd.concat(
                [rankings_df, filtered_papers[column_name].reset_index(drop=True)],
                axis=1,
            )
    # Save the rankings to a CSV file
    rankings_df.to_csv(f"output/{field}_rankings.csv", index=False)
    return rankings_df

def compute_ranking_correlations(field: str = "NLP", percentile=0.1):
"""Computes Spearman, Kendall Tau, and RBO ranking correlations for a given field and percentile.
Args:
field (str): The field for which rankings are computed. Defaults to "NLP".
percentile (float): The percentile of papers to consider. Defaults to 0.5.
Returns:
dict: A dictionary containing three pandas DataFrames for Spearman, Kendall Tau, and RBO correlations.
"""
# Load the csv
df = pd.read_csv(f"output/{field}_rankings.csv")
# Take first n % of the papers
df = df.iloc[: int(len(df) * percentile)]
# Drop na
df = df.dropna()
print(df.head())
# Convert the URLs to rankings
for column in df.columns:
df[column] = df[column].rank().astype(int)
# Define the epochs
base_epochs = ["(1980, 1990)", "(1990, 2000)", "(2000, 2010)"]
citing_epochs_pairs = [
("(1990, 2000)", "(2000, 2010)"),
("(2000, 2010)", "(2010, 2015)"),
("(2010, 2015)", "(2015, 2020)"),
]
# Initialize correlation tables
spearman_table = pd.DataFrame(
index=base_epochs,
columns=[f"{ep1} <> {ep2}" for ep1, ep2 in citing_epochs_pairs],
)
kendall_table = spearman_table.copy()
# Loop through the base epochs and citing epoch pairs to compute the correlations
for base_epoch in base_epochs:
for ep1, ep2 in citing_epochs_pairs:
col1 = f"papers_from_{base_epoch}_ranked_by_citations_from_{ep1}"
col2 = f"papers_from_{base_epoch}_ranked_by_citations_from_{ep2}"
# Check if both columns exist in df
if col1 in df.columns and col2 in df.columns:
# Compute Spearman and Kendall correlations
spearman_corr, _ = spearmanr(df[col1], df[col2])
kendall_corr, _ = kendalltau(df[col1], df[col2])
# Update the correlation tables
spearman_table.at[base_epoch, f"{ep1} <> {ep2}"] = spearman_corr
kendall_table.at[base_epoch, f"{ep1} <> {ep2}"] = kendall_corr
# Replace NaN with a dash
spearman_table = spearman_table.fillna("-")
kendall_table = kendall_table.fillna("-")
# Save tables to LaTeX files
for table, name in zip([spearman_table, kendall_table], ["spearman", "kendall"]):
with open(f"output/{field}_{name}_correlation_table.tex", "w") as file:
file.write(table.to_latex())
# Return the correlation tables
return {"Spearman": spearman_table, "Kendall": kendall_table}
def compute_average_citation_age():
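    """
    Aggregates the average outgoing citation age per year for every field found in data/byfield/.

    Reads each *_paper_to_stats.csv file, averages avg_outgoing_citation_age by year,
    and writes the combined result to data/citation_ages_by_year_and_concept.csv.

    Returns:
        None
    """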
    # Pattern to match all relevant CSV files
    pattern = os.path.join("data", "byfield", '*_paper_to_stats.csv')
    # List to hold data from all matching files
    aggregated_data = []
    # Find all files matching the pattern
    for filepath in glob.glob(pattern):
        # Extract field name from the filename
        filename = os.path.basename(filepath)
        field = filename.replace('_paper_to_stats.csv', '')
        # Read the CSV file into a DataFrame
        df = pd.read_csv(filepath)
        # Compute the average citation age per year
        avg_citation_age_by_year = df.groupby('year')['avg_outgoing_citation_age'].mean().reset_index()
        # Add the field name to the DataFrame
        avg_citation_age_by_year['field'] = field
        # Append the results to the aggregated data list
        aggregated_data.append(avg_citation_age_by_year)
    # Concatenate all DataFrames in the list
    final_df = pd.concat(aggregated_data, ignore_index=True)
    # Reorder the columns
    final_df = final_df[['field', 'year', 'avg_outgoing_citation_age']]
    # Output the DataFrame to a CSV file
    output_path = os.path.join('data', 'citation_ages_by_year_and_concept.csv')
    final_df.to_csv(output_path, index=False)
    print(f"Output saved to {output_path}")

def compute_volume_age_correlation():
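    """
    Computes Pearson correlations between publication volume and average outgoing citation age per field.

    Expects data/works_by_year_and_concept.csv and data/citation_ages_by_year_and_concept.csv.

    Returns:
        dict: Maps each field to a dict of correlations, overall and per time range
        ('N/A' when a time range has no data).
    """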
    # Load the datasets
    volume_df = pd.read_csv('data/works_by_year_and_concept.csv')
    citation_df = pd.read_csv('data/citation_ages_by_year_and_concept.csv')
    # Merge the two DataFrames on 'field' and 'year'
    merged_df = pd.merge(volume_df, citation_df, on=['field', 'year'])
    # Define the time ranges
    time_ranges = [(1980, 1990), (1990, 2000), (2000, 2010), (2010, 2020)]
    # Initialize a dictionary to store correlation results
    correlations = {}
    # Compute overall and time range-specific correlations
    for field, group_df in merged_df.groupby('field'):
        correlations[field] = {}
        # Overall correlation
        overall_corr = group_df[['count', 'avg_outgoing_citation_age']].corr(method='pearson').iloc[0, 1]
        correlations[field]['Overall'] = overall_corr
        # Time range-specific correlations
        for start_year, end_year in time_ranges:
            range_df = group_df[(group_df['year'] >= start_year) & (group_df['year'] <= end_year)]
            if not range_df.empty:
                corr = range_df[['count', 'avg_outgoing_citation_age']].corr(method='pearson').iloc[0, 1]
                correlations[field][f'{start_year}-{end_year}'] = corr
            else:
                correlations[field][f'{start_year}-{end_year}'] = 'N/A'  # In case there's no data in the time range
    return correlations

def generate_latex_table(correlations):
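    """
    Renders the correlation results from compute_volume_age_correlation as a LaTeX table.

    Args:
        correlations (dict): Nested dict mapping fields to their correlation values.

    Writes the table to output/volume_age_correlation_table.tex.
    """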
    # Convert the nested dictionary into a list of dictionaries for easier DataFrame creation
    data = []
    for field, time_ranges in correlations.items():
        row = {'Field': field}
        row.update(time_ranges)
        data.append(row)
    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    # Set the 'Field' column as the DataFrame index
    df.set_index('Field', inplace=True)
    # Generate the LaTeX table
    latex_table = df.to_latex(float_format="{:0.2f}".format, escape=False)
    # Write to a .tex file
    with open('output/volume_age_correlation_table.tex', 'w') as f:
        f.write(latex_table)

if __name__ == "__main__":
    compute_average_citation_age()
    correlations = compute_volume_age_correlation()
    # Display the correlations
    for field, data in correlations.items():
        print(f'Field: {field}')
        for time_range, correlation in data.items():
            print(f' {time_range}: {correlation}')
    generate_latex_table(correlations)
    # print("Start")
    # compute_rankings("NLP")
    # correlation_tables = compute_ranking_correlations("NLP", 0.5)
    # print("End")