"""
Example of how one would download & process a single batch of S2ORC to filter to specific field of study.
Can be useful for those who can't store the full dataset onto disk easily.
Please adapt this to your own field of study.
Creates directory structure:
|-- metadata/
|-- raw/
|-- metadata_0.jsonl.gz << input; deleted after processed
|-- medicine/
|-- metadata_0.jsonl << output
|-- pdf_parses/
|-- raw/
|-- pdf_parses_0.jsonl.gz << input; deleted after processed
|-- medicine/
|-- pdf_parses_0.jsonl << output
"""
import os
import subprocess
import gzip
import io
import json
from tqdm import tqdm
from collections import defaultdict
import re
import glob

# TODO: update with right info
FIELD_OF_STUDY = 'Computer Science'
FOLDER_NAME = 'computer_science'
URLS_EXPIRES = '20220715'

METADATA_INPUT_DIR = 'metadata/raw/'
METADATA_OUTPUT_DIR = f'metadata/{FOLDER_NAME}/'
PDF_PARSES_INPUT_DIR = 'pdf_parses/raw/'
PDF_PARSES_OUTPUT_DIR = f'pdf_parses/{FOLDER_NAME}/'

METADATA_FILE_LIST = [os.path.basename(x) for x in glob.glob(METADATA_OUTPUT_DIR + "*.jsonl.gz")]
PDF_PARSES_FILE_LIST = [os.path.basename(x) for x in glob.glob(PDF_PARSES_OUTPUT_DIR + "*.jsonl.gz")]
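# used by already_downloaded() below to skip shards whose filtered
# output files are already present in the output directories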


# process single batch
def process_batch(batch: dict):
    # this downloads both the metadata & full text files for a particular shard
    cmd = ["wget", "-O", batch['input_metadata_path'], batch['input_metadata_url']]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
    cmd = ["wget", "-O", batch['input_pdf_parses_path'], batch['input_pdf_parses_url']]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)

    # first, let's filter metadata JSONL to only papers with a particular field of study.
    # we also want to remember which paper IDs to keep, so that we can get their full text later.
    paper_ids_to_keep = set()
    with gzip.open(batch['input_metadata_path'], 'rb') as gz, open(batch['output_metadata_path'], 'wb') as f_out:
        f = io.BufferedReader(gz)
        for line in tqdm(f.readlines()):
            metadata_dict = json.loads(line)
            paper_id = metadata_dict['paper_id']
            mag_field_of_study = metadata_dict['mag_field_of_study']
            if mag_field_of_study and FIELD_OF_STUDY in mag_field_of_study:  # TODO: <<< change this to your filter
                paper_ids_to_keep.add(paper_id)
                f_out.write(line)

    # now, we get those papers' full text
    with gzip.open(batch['input_pdf_parses_path'], 'rb') as gz, open(batch['output_pdf_parses_path'], 'wb') as f_out:
        f = io.BufferedReader(gz)
        for line in tqdm(f.readlines()):
            metadata_dict = json.loads(line)
            paper_id = metadata_dict['paper_id']
            if paper_id in paper_ids_to_keep:
                f_out.write(line)

    # now delete the raw files to clear up space for other shards
    os.remove(batch['input_metadata_path'])
    os.remove(batch['input_pdf_parses_path'])


def already_downloaded(download_links):
    metadata_filename = os.path.basename(download_links['metadata'].split('?')[0])
    pdf_parses_filename = os.path.basename(download_links['pdf_parses'].split('?')[0])
    return metadata_filename in METADATA_FILE_LIST and pdf_parses_filename in PDF_PARSES_FILE_LIST


if __name__ == '__main__':
    os.makedirs(METADATA_INPUT_DIR, exist_ok=True)
    os.makedirs(METADATA_OUTPUT_DIR, exist_ok=True)
    os.makedirs(PDF_PARSES_INPUT_DIR, exist_ok=True)
    os.makedirs(PDF_PARSES_OUTPUT_DIR, exist_ok=True)

    # TODO: make sure to put the links we sent to you here
    # there are 100 shards with IDs 0 to 99. make sure these are paired correctly.
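    # each relevant line of the shell script looks like
    #   wget -O 20200705v1/full/metadata/metadata_<shard>.jsonl.gz '<download URL>'
    # (and likewise for pdf_parses); the regexes below extract the shard number and URL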
    download_linkss_dict = defaultdict(lambda: {"metadata": None, "pdf_parses": None})
    with open(f'dl_s2orc_20200705v1_full_urls_expires_{URLS_EXPIRES}.sh', 'r', encoding='utf-8') as f:
        for line in f:
            if metadata_match := re.search(r"^wget -O 20200705v1/full/metadata/metadata_(?P<shard_number>\d+)\.jsonl\.gz '(?P<link>.+)'$", line):
                download_linkss_dict[metadata_match.group("shard_number")]["metadata"] = metadata_match.group("link")
            elif pdf_match := re.search(r"^wget -O 20200705v1/full/pdf_parses/pdf_parses_(?P<shard_number>\d+)\.jsonl\.gz '(?P<link>.+)'$", line):
                download_linkss_dict[pdf_match.group("shard_number")]["pdf_parses"] = pdf_match.group("link")
    download_linkss = download_linkss_dict.values()

    # turn these into batches of work
    # TODO: feel free to come up with your own naming convention for 'input_{metadata|pdf_parses}_path'
    batches = [{
        'input_metadata_url': download_links['metadata'],
        'input_metadata_path': os.path.join(METADATA_INPUT_DIR,
                                            os.path.basename(download_links['metadata'].split('?')[0])),
        'output_metadata_path': os.path.join(METADATA_OUTPUT_DIR,
                                             os.path.basename(download_links['metadata'].split('?')[0])),
        'input_pdf_parses_url': download_links['pdf_parses'],
        'input_pdf_parses_path': os.path.join(PDF_PARSES_INPUT_DIR,
                                              os.path.basename(download_links['pdf_parses'].split('?')[0])),
        'output_pdf_parses_path': os.path.join(PDF_PARSES_OUTPUT_DIR,
                                               os.path.basename(download_links['pdf_parses'].split('?')[0])),
    } for download_links in download_linkss
        if not already_downloaded(download_links)]
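    # shards are processed one at a time; process_batch deletes each shard's raw
    # .gz files once they have been filtered, so only one shard's raw files are
    # on disk at a time (plus the filtered outputs)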
    for batch in batches:
        process_batch(batch=batch)