-
Notifications
You must be signed in to change notification settings - Fork 1
/
bioasq_to_squad.py
127 lines (110 loc) · 3.33 KB
/
bioasq_to_squad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Converts BioASQ format to SQuAD format.
"""
import argparse
import json
import logging
from multiprocessing.pool import ThreadPool
from pathlib import Path
from xml.etree import ElementTree
import requests
def generate_questions(fname):
    """
    Generates questions from a JSON file in the BioASQ dataset.

    Args:
        fname: Path of a JSON file whose top-level object has a 'questions'
            list.

    Yields:
        Each element of the 'questions' list, in file order.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, so non-ASCII question text is read the same way everywhere.
    with open(fname, 'r', encoding='utf-8') as f:
        data = json.load(f)
    yield from data['questions']
def query_pubmed(docid):
    """
    Queries PubMed API for XML metadata associated to a given document.

    Args:
        docid: Document identifier, sent as the 'id' query parameter.

    Returns:
        The root element parsed from the XML response body.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    endpoint = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    payload = {
        'db': 'pubmed',
        'id': docid,
        'retmode': 'xml'
    }
    # Bound the wait so an unresponsive server cannot stall the whole run.
    response = requests.get(endpoint, params=payload, timeout=30)
    try:
        return ElementTree.fromstring(response.text)
    except ElementTree.ParseError:
        # Record the offending body for debugging, then surface the real
        # parse error instead of failing later on an unbound variable.
        logging.error(
            'Could not parse PubMed response for document %s: %s',
            docid,
            response.text
        )
        raise
def render_section(abstract_section):
    """
    Renders a section of the abstract as text. Potentially including the
    section title.

    Args:
        abstract_section: XML element holding one section of the abstract.

    Returns:
        The section's text, prefixed with '<Label>: ' when the element has a
        'Label' attribute.
    """
    # WARNING: This adds labels to each section if available. This appears to
    # be what was done to (most of) the abstracts in BioASQ. However,
    # sometimes this strategy prevents matches.
    #
    # itertext() also collects text nested inside child elements (inline
    # markup such as <i> or <sub>), which `.text` alone would cut off or
    # report as None.
    text = ''.join(abstract_section.itertext())
    label = abstract_section.get('Label')
    if label is None:
        return text
    return f'{label}: {text}'
def extract_context(xml_metadata):
    """
    Extracts the title and abstract from PubMed XML metadata.

    Args:
        xml_metadata: Root XML element to search for 'ArticleTitle' and
            'AbstractText' elements.

    Returns:
        A string of the form 'TITLE: <title> ABSTRACT: <abstract>', where the
        abstract is the space-joined rendering of every 'AbstractText'
        element. The title is empty when no 'ArticleTitle' element exists.
    """
    title_element = xml_metadata.find('.//ArticleTitle')
    if title_element is None:
        # No title element in the record: fall back to an empty title rather
        # than failing with an AttributeError.
        title = ''
    else:
        # itertext() keeps text nested inside inline markup within the title.
        title = ''.join(title_element.itertext())
    abstract_sections = xml_metadata.findall('.//AbstractText')
    abstract = ' '.join(render_section(x) for x in abstract_sections)
    return f'TITLE: {title} ABSTRACT: {abstract}'
def squadify(question):
    """
    Converts a BioASQ-formatted instance to a SQuAD-formatted instance.

    Args:
        question: Dict with 'id' and 'body' keys and, optionally, a
            'snippets' list whose items have 'document' and 'text' keys.

    Returns:
        A dict with the question id under 'title' and, under 'paragraphs',
        one entry per snippet whose text occurs verbatim in the context
        built for its document. Snippets that cannot be located are skipped.
    """
    squad_instance = {
        'title': question['id'],
        'paragraphs': []
    }
    # Several snippets often cite the same document; fetch each one once.
    contexts = {}
    # A question without a 'snippets' key yields no paragraphs.
    for snippet in question.get('snippets', []):
        # The document id is the last path component of the document URL.
        docid = snippet['document'].split('/')[-1]
        if docid not in contexts:
            contexts[docid] = extract_context(query_pubmed(docid))
        context = contexts[docid]
        answer_text = snippet['text']
        answer_start = context.find(answer_text)
        # TODO: Analyze remaining failure modes.
        if answer_start == -1:
            continue
        paragraph = {
            'id': docid,
            'context': context,
            'question': question['body'],
            'qas': [{
                "answer_start": answer_start,
                "text": answer_text
            }]
        }
        squad_instance['paragraphs'].append(paragraph)
    return squad_instance
def convert(source, dest):
    """
    Converts a BioASQ-formatted dataset to a SQuAD-formatted dataset.

    Args:
        source: Path of the input BioASQ JSON file.
        dest: Path of the output file; overwritten if it already exists.
    """
    # Use the function's own parameters rather than the script-level `args`
    # namespace, so the function also works when imported from other code.
    questions = generate_questions(source)
    output = {'data': [squadify(x) for x in questions]}
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of the platform's locale default.
    with open(dest, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False)
if __name__ == '__main__':
    # Command-line entry point: both options are required file paths.
    parser = argparse.ArgumentParser()
    option_specs = (
        ('-i', '--input', 'input BioASQ file'),
        ('-o', '--output', 'output file'),
    )
    for short_flag, long_flag, description in option_specs:
        parser.add_argument(
            short_flag,
            long_flag,
            type=Path,
            required=True,
            help=description
        )
    # The parsed namespace stays bound to the module-level name `args`.
    args = parser.parse_args()
    convert(args.input, args.output)