get_data.py
import requests
import os
import json
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
import argparse
from utils import read_pmc, read_pm
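# This script builds an "available" subset of the PubMedCite inductive graph:
# it downloads PMC source articles and their PubMed references via the NCBI
# E-utilities, parses the downloaded HTML, keeps only articles that have an
# abstract, an introduction, and fully available references, and saves the
# filtered graph as JSON.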
def download_one_pmc_article(pmc_id, save_path):
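    """Fetch one PMC article from the NCBI efetch endpoint and save the raw
    response as {save_path}/{pmc_id}.html. Returns 1 on success, 0 otherwise."""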
    # Build the PMC article URL, e.g.
    # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=5436877
    article_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={pmc_id}"
    # Send the GET request
    response = requests.get(article_url)
    # Check the response
    if response.status_code == 200:
        # Save the article content to a file; it could also be processed further here
        article_content = response.text
        with open(f"{save_path}/{pmc_id}.html", "w", encoding="utf-8") as file:
            file.write(article_content)
        print(f"Article {pmc_id} downloaded successfully.")
        return 1
    else:
        print(f"Error occurred while downloading article {pmc_id}.")
        return 0
def download_one_pm_article(pm_id, save_path):
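    """Fetch one PubMed record from the NCBI efetch endpoint and save the raw
    response as {save_path}/{pm_id}.html. Returns 1 on success, 0 otherwise."""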
    # Build the PubMed article URL, e.g.
    # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=5436877
    article_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pm_id}"
    # Send the GET request
    response = requests.get(article_url)
    # Check the response
    if response.status_code == 200:
        # Save the article content to a file; it could also be processed further here
        article_content = response.text
        with open(f"{save_path}/{pm_id}.html", "w", encoding="utf-8") as file:
            file.write(article_content)
        print(f"Article {pm_id} downloaded successfully.")
        return 1
    else:
        print(f"Error occurred while downloading article {pm_id}.")
        return 0
def download_pmc_article(data, mode, num):
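    """Download the first `num` articles listed in data['PMCid'] into
    ./data/{mode}/pmc/, retrying each download until it succeeds."""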
    ###################################
    # Download PMC articles
    ###################################
    save_path = f"./data/{mode}/pmc/"
    if not os.path.exists(save_path):
        # Create the folder if it does not exist
        os.makedirs(save_path)
    PMC_num = 0
    for key in tqdm(data['PMCid'], total=num, desc="Downloading articles"):
        pmc_id = data['PMCid'][key][3:]  # Strip the "PMC" prefix
        # Retry until the download succeeds (e.g. after a transient failure or rate limiting)
        while not download_one_pmc_article(pmc_id, save_path):
            time.sleep(1)
        PMC_num += 1
        if PMC_num >= num:
            break
    print("Number of PMC articles downloaded:", PMC_num)
def download_pm_article(data, mode, num):
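    """Download the unique PubMed references of the first `num` articles in
    data['references'] into ./data/{mode}/pm/, retrying each download until it succeeds."""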
    ###################################
    # Download PubMed articles
    ###################################
    # Use a set to collect the unique reference IDs
    unique_elements = set()
    # Walk the reference lists of the first `num` articles and add their IDs to the set
    PMC_num = 0
    for key in data['references']:
        unique_elements.update(data['references'][key])
        PMC_num += 1
        if PMC_num >= num:
            break
    save_path = f"./data/{mode}/pm/"
    if not os.path.exists(save_path):
        # Create the folder if it does not exist
        os.makedirs(save_path)
    pm_num = 0
    for pm_id in tqdm(unique_elements, total=len(unique_elements), desc="Downloading articles"):
        # Retry until the download succeeds (e.g. after a transient failure or rate limiting)
        while not download_one_pm_article(pm_id, save_path):
            time.sleep(1)
        pm_num += 1
    print("Number of PubMed articles downloaded:", pm_num)
def process_raw_data(mode, selected_data, selected_data_reference):
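    """Parse each downloaded article and its references with read_pmc/read_pm and
    write the extracted title/abstract/introduction/sections out as JSON files."""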
    save_folder = f'./data/available_induc_{mode}/'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_pmc_folder = f'./data/available_induc_{mode}/pmc/'
    if not os.path.exists(save_pmc_folder):
        os.makedirs(save_pmc_folder)
    save_pm_folder = f'./data/available_induc_{mode}/pm/'
    if not os.path.exists(save_pm_folder):
        os.makedirs(save_pm_folder)
    # Iterate over every article
    for index, data in tqdm(enumerate(selected_data), total=len(selected_data)):
        # Read the source article and its reference data
        # Source article
        origin_path = f'./data/{mode}/pmc/' + data + '.html'
        article_title_text, abstract_text, introduction_text, sec_dict = read_pmc(origin_path)
        if article_title_text == "":
            print("article_title_text error:", data)
        elif abstract_text == "":
            print("abstract_text error:", data)
        elif introduction_text == "":
            print("introduction_text error:", data)
        else:
            article_dict = {'article_title_text': article_title_text, 'abstract_text': abstract_text,
                            'introduction_text': introduction_text, 'sec_dict': sec_dict}
            json_file_path = save_pmc_folder + data + '.json'
            with open(json_file_path, 'w') as json_file:
                json.dump(article_dict, json_file)
        # References
        reference_title_abstract_dict = {}
        for pm_id in selected_data_reference[index]:
            reference_path = f'./data/{mode}/pm/' + pm_id + '.html'
            reference_title_text, reference_abstract_text = read_pm(reference_path)
            reference_title_abstract_dict[reference_title_text] = reference_abstract_text
        json_file_path = save_pm_folder + data + '.json'
        with open(json_file_path, 'w') as json_file:
            json.dump(reference_title_abstract_dict, json_file)
def get_available_pmc_data(mode, selected_data):
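    """Return the PMC IDs (and their indices in selected_data) of downloaded
    articles that contain both an introduction section and an abstract."""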
    available_num = 0
    available_pmc_list = []
    available_pmc_idx = []
    for index, data in enumerate(selected_data):
        path = f'./data/{mode}/pmc/' + data + '.html'
        # Open and read the HTML file
        with open(path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        # Parse the HTML content with Beautiful Soup
        soup = BeautifulSoup(html_content, "html.parser")
        # Find all <sec> tags
        sec_tag = soup.find_all("sec")
        # If any <sec> tags were found
        if sec_tag:
            # Look for a <title> tag inside each section
            for tag in sec_tag:
                title_tag = tag.find("title")
                if title_tag:
                    if title_tag.text in ['Introduction', 'INTRODUCTION', '1. Introduction', '1 INTRODUCTION', 'introduction']:
                        # Find the <abstract> tag
                        abstract_tag = soup.find("abstract")
                        # An article is usable only if it has both an introduction and an abstract
                        if abstract_tag:
                            available_num += 1
                            available_pmc_list.append(data)
                            available_pmc_idx.append(index)
                        else:
                            print("Abstract tag not found in the HTML content.")
        else:
            print("No <sec> tag found in the HTML content.")
            print("wrong data: ", data)
    print("Number of available articles:", available_num)
    return available_pmc_list, available_pmc_idx
def get_available_pm_data(mode, available_pmc_idx, selected_data_reference):
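    """Return the PubMed IDs, among the references of the available PMC articles,
    whose downloaded records contain both a title and an abstract."""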
    available_pm_list = []
    available_pm_num = 0
    for reference_list_idx in available_pmc_idx:
        for i in selected_data_reference[reference_list_idx]:
            if i not in available_pm_list:
                # Open and read the HTML file
                path = f'./data/{mode}/pm/' + i + '.html'
                with open(path, 'r', encoding='utf-8') as file:
                    html_content = file.read()
                # Parse the HTML content with Beautiful Soup
                soup = BeautifulSoup(html_content, "html.parser")
                # Find the <articletitle> tag
                title_tag = soup.find("articletitle")
                if title_tag:
                    # Find the <abstract> tag
                    abstract_tag = soup.find("abstract")
                    # A reference is usable only if it has both a title and an abstract
                    if abstract_tag:
                        available_pm_list.append(i)
                        available_pm_num += 1
                    else:
                        print("Abstract tag not found in the HTML content.")
    print("Number of available PubMed articles:", available_pm_num)
    return available_pm_list
def generate_available_dict(selected_data, selected_data_reference, available_pmc_list, available_pm_list, required_num):
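    """Assemble up to `required_num` available articles whose references are all
    available into a {'PMCid': [...], 'references': [...]} dict."""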
    available_graph_dict = {}
    available_graph_dict['PMCid'] = []
    available_graph_dict['references'] = []
    data_num = 0
    for index, data in enumerate(selected_data):
        if data in available_pmc_list:
            reference_list = selected_data_reference[index]
            if reference_list == []:
                # No references: the article is usable on its own
                available_graph_dict['PMCid'].append(data)
                available_graph_dict['references'].append([])
                data_num += 1
            else:
                # Keep the article only if every one of its references is available
                no_flag = 0
                for i in reference_list:
                    if i not in available_pm_list:
                        no_flag += 1
                if no_flag == 0:
                    available_graph_dict['PMCid'].append(data)
                    available_graph_dict['references'].append(reference_list)
                    data_num += 1
        if data_num >= required_num:
            break
    print(len(available_graph_dict['references']))
    return available_graph_dict
if __name__ == "__main__":
    # Download `required_num` train and `required_num` test articles (100 each by default)
    parser = argparse.ArgumentParser()
    parser.add_argument('--required_num', type=int, default=100, help='Number of articles required')
    args = parser.parse_args()
    required_num = args.required_num
    # About half of the downloaded articles are not usable, so download twice as many
    download_num = 2 * required_num
    modes = ['test', 'train']
    for mode in modes:
        # Open the JSON graph file and load its data
        with open(f'./PubMedCite/induc_graph/{mode}_graph.json', 'r') as json_file:
            data = json.load(json_file)
        print('Downloading pmc article')
        download_pmc_article(data, mode, download_num)
        print('Downloading pm article')
        download_pm_article(data, mode, download_num)
        data_ids = list(range(download_num))
        selected_data = [data['PMCid'][f'{i}'][3:] for i in data_ids]
        selected_data_reference = [data['references'][f'{i}'] for i in data_ids]
        print('Processing raw data')
        process_raw_data(mode, selected_data, selected_data_reference)
        print('Get available pmc data')
        available_pmc_list, available_pmc_idx = get_available_pmc_data(mode, selected_data)
        print('Get available pm data')
        available_pm_list = get_available_pm_data(mode, available_pmc_idx, selected_data_reference)
        print('Generating available dict')
        available_graph_dict = generate_available_dict(selected_data, selected_data_reference, available_pmc_list, available_pm_list, required_num)
        save_path = f'./data/available_induc_{mode}_graph.json'
        # Save the filtered graph as a JSON file
        with open(save_path, 'w') as json_file:
            json.dump(available_graph_dict, json_file)
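
# Example invocation (a sketch; assumes the PubMedCite graph files exist
# under ./PubMedCite/induc_graph/ and that utils.py provides read_pmc/read_pm):
#   python get_data.py --required_num 100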