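"""Build a word-to-index vocabulary from paper abstracts.

Loads abstracts and PDF URLs from Feather files under --model_dir, optionally
drops arXiv / ACL Anthology papers and papers older than --year, maps every
word occurring in the abstracts to an integer index, rewrites the abstracts as
space-separated index strings, and saves the vocabulary plus the filtered
tables back to --model_dir.
"""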
import argparse
import datetime
import gzip
import pickle
import pickletools
from pathlib import Path

import pandas as pd


def _save_object(name: str | Path, obj: object) -> None:
    """Pickle obj, optimize the pickle stream, and write it gzip-compressed."""
    with gzip.open(f'{name}.pkl.gz', 'wb') as f:
        # pickletools.optimize drops unused PUT opcodes, shrinking the stream
        # before compression compared to a plain pickle.dump(obj, f, ...).
        pickled = pickle.dumps(obj)
        optimized_pickled = pickletools.optimize(pickled)
        f.write(optimized_pickled)
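

# A minimal counterpart for reading the file back (hypothetical; not part of the
# original script), assuming the same '<name>.pkl.gz' naming as _save_object.
def _load_object(name: str | Path) -> object:
    with gzip.open(f'{name}.pkl.gz', 'rb') as f:
        return pickle.load(f)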


def _load_abstracts(filename: str) -> pd.DataFrame:
    """Load the abstracts table from a CSV, Feather, or JSON file."""
    extensions = Path(filename).suffixes
    if '.csv' in extensions:
        abstracts: pd.DataFrame = pd.read_csv(filename, sep='|')
    elif '.feather' in extensions:
        abstracts = pd.read_feather(filename)
    elif '.json' in extensions:
        abstracts = pd.read_json(filename)
    else:
        # Without this, `abstracts` would be unbound for unknown extensions.
        raise ValueError(f'Unsupported abstracts file: {filename}')

    def remove_quotes(text: str) -> str:
        # Strip one pair of surrounding quotes left over from the export.
        if (text.startswith('"') and text.endswith('"')) or (text.startswith("'") and text.endswith("'")):
            return text[1:-1].strip()
        return text

    abstracts['abstract'] = abstracts['abstract'].apply(remove_quotes)
    return abstracts
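
# Example on hypothetical input: remove_quotes('"A study of X."') returns
# 'A study of X.', while text without surrounding quotes is returned unchanged.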


def _create_abstracts_dict(abstracts: pd.DataFrame) -> tuple[list[str], dict[str, int]]:
    """Build the vocabulary: an index-to-word list and its inverse mapping."""
    abstract_words = {w for _, abstract in abstracts.abstract.items() for w in abstract.split()}
    idx_to_word = list(abstract_words)
    # Sort by length first, then lexicographically, so the indexing is deterministic.
    idx_to_word.sort(key=lambda item: (len(item), item))
    word_to_idx = {w: i for i, w in enumerate(idx_to_word)}
    return idx_to_word, word_to_idx
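
# Worked example on hypothetical data: a single abstract 'deep learning models'
# yields idx_to_word == ['deep', 'models', 'learning'] (by length, then
# alphabetically) and word_to_idx == {'deep': 0, 'models': 1, 'learning': 2}.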


def _convert_abstracts_to_indices(abstracts: pd.DataFrame, word_to_idx: dict[str, int]) -> pd.DataFrame:
    """Replace every word in each abstract with its vocabulary index."""

    def words_to_indices(text: str) -> str:
        return ' '.join(str(word_to_idx[w]) for w in text.split())

    abstracts['abstract'] = abstracts['abstract'].apply(words_to_indices)
    return abstracts
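
# Continuing the hypothetical example above, the abstract 'deep learning models'
# becomes the string '0 2 1'.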


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str, default='model_data',
                        help='directory for saving the model')
    parser.add_argument('-i', '--ignore_arxiv_papers', action='store_true',
                        help='ignore papers from arXiv and papers without a conference name '
                             'when building the paper vectors')
    parser.add_argument('-y', '--year', type=int, default=datetime.date.today().year - 6,
                        help='only keep papers from this year and later')
    args = parser.parse_args()

    abstracts = _load_abstracts(f'{args.model_dir}/abstracts.feather')
    paper_urls = pd.read_feather(f'{args.model_dir}/pdfs_urls.feather').dropna()

    if args.ignore_arxiv_papers:
        print('Filtering arXiv papers')
        print(f'\tPapers before: {len(abstracts):n}')
        indices = abstracts[abstracts['conference'].isin({'arxiv', 'none', 'None'})].index
        abstracts.drop(indices, inplace=True)
        indices = paper_urls[paper_urls['conference'].isin({'arxiv', 'none', 'None'})].index
        paper_urls.drop(indices, inplace=True)
        # Also drop papers whose 'conference' field is an ACL Anthology ID such as
        # 'W18-5604' or 'C18-1211'; these entries do not carry a proper conference name.
        indices = abstracts[abstracts.conference.str.contains(r'\w\d{2}-\d{4}')].index
        abstracts.drop(indices, inplace=True)
        abstracts.reset_index(drop=True, inplace=True)
        indices = paper_urls[paper_urls.conference.str.contains(r'\w\d{2}-\d{4}')].index
        paper_urls.drop(indices, inplace=True)
        paper_urls.reset_index(drop=True, inplace=True)
        print(f'\tPapers after: {len(abstracts):n}')

    print(f'Filtering papers before {args.year}')
    print(f'\tPapers before: {len(abstracts):n}')
    indices = abstracts[abstracts['year'] < args.year].index
    abstracts.drop(indices, inplace=True)
    abstracts.reset_index(drop=True, inplace=True)
    paper_urls['year'] = paper_urls['year'].astype('int')
    indices = paper_urls[paper_urls['year'] < args.year].index
    paper_urls.drop(indices, inplace=True)
    paper_urls.reset_index(drop=True, inplace=True)
    print(f'\tPapers after: {len(abstracts):n}')

    idx_to_word, word_to_idx = _create_abstracts_dict(abstracts)
    abstracts = _convert_abstracts_to_indices(abstracts, word_to_idx)
    _save_object(f'{args.model_dir}/abstracts_idx_to_word', idx_to_word)
    abstracts.to_feather(f'{args.model_dir}/abstracts_mod.feather', compression='zstd')
    paper_urls.to_feather(f'{args.model_dir}/pdfs_urls_mod.feather', compression='zstd')
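
# Typical invocation (paths are the defaults wired in above):
#   python abstracts_to_dict.py --model_dir model_data --ignore_arxiv_papers
# Expects <model_dir>/abstracts.feather and <model_dir>/pdfs_urls.feather to exist,
# and writes <model_dir>/abstracts_idx_to_word.pkl.gz, abstracts_mod.feather, and
# pdfs_urls_mod.feather.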