-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGPT3Extractor.py
259 lines (237 loc) · 9.86 KB
/
GPT3Extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
"GPT3 Extractor class"
import json
import re
from typing import List, Set, Tuple
import openai
import spacy
from spacy_help_functions import create_entity_pairs
from lib.utils import (
ENTITIES_OF_INTEREST,
PROMPT_AIDS,
PRONOUNS_AND_CONJUNCTIONS,
RELATIONS,
SEED_PROMPTS,
SEED_SENTENCES,
SUBJ_OBJ_REQUIRED_ENTITIES,
)
class gpt3Extractor:
    """
    Extract (subject, object) relation tuples from text using spaCy and GPT-3.

    spaCy splits the text into sentences and tags named entities. Only
    sentences that contain an entity pair of the types required by the target
    relation are sent to GPT-3, whose JSON answer is parsed and validated.
    Accepted tuples accumulate in ``self.relations`` across calls.
    """

    # Subject/object strings that GPT-3 emits when it found nothing useful.
    _PLACEHOLDER_VALUES = ("", "None", "n/a", "N/A")

    def __init__(self, r, openai_key, model="en_core_web_sm"):
        """
        Initialize a gpt3Extractor object
        Parameters:
            r: the relation to extract; used as the key into the lookup
                tables imported from lib.utils
            openai_key: the key to use for the OpenAI API
            model: the spaCy model to use
        """
        self.openai_key = openai_key
        # The key is stored on the openai module itself, not on this instance.
        openai.api_key = self.openai_key
        self.nlp = spacy.load(model)
        self.r = r
        # Set of (subject, object) tuples; never reset between calls.
        self.relations = set()

    def get_relations(self, text: str) -> List[Tuple[str, str]]:
        """
        Exposed function to take in text and return the extracted relations
        Parameters:
            text: the text to extract relations from
        Returns:
            relations: an empty list when nothing has been extracted so far,
                otherwise the accumulated set of (subject, object) tuples
                (the same object as self.relations)
        """
        print(" Annotating the webpage using spacy...")
        doc = self.nlp(text)
        num_sents = len(list(doc.sents))
        print(
            f" Extracted {num_sents} sentences. Processing each sentence one by one to check for presence of right pair of named entity types; if so, will run the second pipeline ..."
        )
        # extract_candidate_pairs already queries GPT-3 sentence by sentence
        # and stores the results, so no further API call is needed here.
        relations = self.extract_candidate_pairs(doc)
        if not relations:
            return []
        return self.relations

    def extract_candidate_pairs(self, doc) -> Set[Tuple[str, str]]:
        """
        Run the full extraction pipeline over every sentence of a document
        Parameters:
            doc: the spaCy document to process
        Returns:
            relations: the accumulated set of (subj, obj) tuples
                (self.relations), including tuples from earlier documents
        """
        sentences = list(doc.sents)
        num_sents = len(sentences)
        extracted_sentences = 0
        extracted_annotations = 0
        for i, sentence in enumerate(sentences):
            if i % 5 == 0 and i != 0:
                print(f" Processed {i} / {num_sents} sentences")
            # Create entity pairs
            sentence_entity_pairs = create_entity_pairs(
                sentence, ENTITIES_OF_INTEREST[self.r]
            )
            # Only spend a GPT-3 call on sentences that contain at least one
            # subject/object pair of the right entity types.
            if not self.filter_candidates_exist(sentence_entity_pairs):
                continue
            relation = self.extract_entity_relations(sentence)
            output = self.parse_gpt_output(relation)
            # If GPT-3 returns invalid relation, move on to next sentence
            if not output:
                continue
            output_tuple = (output["subj"], output["obj"])
            if output_tuple in self.relations:
                # If duplicate, print output and move on
                self.print_output_relation(sentence, output, duplicate=True)
                continue
            # If not a duplicate, add to set, print output
            self.relations.add(output_tuple)
            extracted_annotations += 1
            extracted_sentences += 1
            self.print_output_relation(sentence, output, duplicate=False)
        print(
            f"Extracted annotations for {extracted_sentences} out of total {num_sents} sentences"
        )
        print(
            f"Relations extracted from this website: {extracted_annotations} (Overall: {len(self.relations)})"
        )
        return self.relations

    def print_output_relation(self, sentence, output, duplicate):
        """
        Print a human-readable report for one extracted relation
        Parameters:
            sentence: the sentence the relation was extracted from
            output: dict with at least the keys "subj" and "obj"
            duplicate: True if the relation was already in the result set
        """
        print(" === Extracted Relation ===")
        print(f" Sentence: {sentence}")
        print(f" Subject: {output['subj']} ; Object: {output['obj']} ;")
        if duplicate:
            print(" Duplicate. Ignoring this.")
        else:
            print(" Adding to set of extracted relations")
        print(" ==========")

    def filter_candidates_exist(self, sentence_entity_pairs: List) -> bool:
        """
        Check whether a sentence has an entity pair of the right types
        Parameters:
            sentence_entity_pairs: a list of entity pairs; each pair is
                indexed as (tokens, e1, e2) and the entity type is read from
                index 1 of e1 / e2
        Returns:
            bool: if at least 1 viable candidate pair exists, return True. Else, False.
        """
        # Each pair is tried in both directions, because either entity may be
        # the subject (e.g. only Person:Organization qualifies for "Work_For").
        for ep in sentence_entity_pairs:
            required = SUBJ_OBJ_REQUIRED_ENTITIES[self.r]
            for subj, obj in ((ep[1], ep[2]), (ep[2], ep[1])):
                if subj[1] in required["SUBJ"] and obj[1] in required["OBJ"]:
                    return True
        print(" No potential relations found in this sentence...")
        return False

    def parse_gpt_output(self, output_str: str):
        """
        Parse and validate the output of GPT-3
        Parameters:
            output_str: the output of GPT-3, string '{"PERSON": "John Doe", "ORGANIZATION": "Google", "RELATION": "Work_For"}'
        Returns:
            resultant_relation: the extracted relation as a dict
                with format:
                {
                    "subj": <subject>,
                    "obj": <object>,
                    "relation": <relation>
                }
                None if the output is not valid JSON, lacks an expected key,
                names a different relation, has a blank/placeholder or
                non-string subject or object, or has a subject containing a
                pronoun/conjunction.
        Raises:
            None
        """
        try:
            output = json.loads(output_str)
            required = SUBJ_OBJ_REQUIRED_ENTITIES[self.r]
            subj = output[required["SUBJ"][0].strip()]
            obj = output[required["OBJ"][0].strip()]
            relation = output["RELATION"]
            expected_relation = RELATIONS[self.r]
        except (ValueError, KeyError, TypeError, IndexError):
            # ValueError: not JSON; TypeError: not a JSON object (or not a
            # string at all); KeyError: an expected field is missing.
            print(f"Error parsing GPT-3 output: {output_str}")
            return None

        # The checks below are ordinary rejections, not parse errors, so they
        # return quietly instead of reporting "Error parsing".
        if relation != expected_relation:
            return None
        # A JSON null (or any other non-string) is not a usable entity name.
        if not isinstance(subj, str) or not isinstance(obj, str):
            return None
        if subj in self._PLACEHOLDER_VALUES or obj in self._PLACEHOLDER_VALUES:
            return None
        # NOTE(review): this is a substring match, so a short entry such as
        # "he" would also reject names that merely contain it - confirm
        # against the contents of PRONOUNS_AND_CONJUNCTIONS.
        subj_lower = subj.lower()
        if any(p in subj_lower for p in PRONOUNS_AND_CONJUNCTIONS):
            return None
        return {"subj": subj, "obj": obj, "relation": relation}

    def extract_entity_relations(self, sentence):
        """
        Ask GPT-3 for the relation expressed in a single sentence
        Parameters:
            sentence: the sentence to extract a relation from
        Returns:
            relation: the raw text completion returned by GPT-3
        """
        prompt = self.construct_prompt(sentence)
        return self.gpt3_complete(prompt)

    def gpt3_complete(self, prompt):
        """
        Use GPT-3 to complete a prompt
        Parameters:
            prompt: the prompt to complete
        Returns:
            completion: the text of the first choice in the response
        """
        # Errors raised by the OpenAI client are not caught here.
        completion = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=100,
            temperature=0.2,
            top_p=1,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        return completion["choices"][0]["text"]

    def construct_prompt(self, sentence):
        """
        Construct a prompt for GPT-3 to complete.
        Parameters:
            sentence: the sentence to extract a relation from
        Returns:
            prompt: a string to be passed to GPT-3
        """
        seed = f"In a given sentence, find relations where {PROMPT_AIDS[self.r]}"
        example = f"Example Input: '{SEED_SENTENCES[self.r]}' Example Output: {SEED_PROMPTS[self.r]}."
        query = f"Input: {sentence} Output:"
        # NOTE(review): the three parts are joined with no separator; confirm
        # the lib.utils constants supply any whitespace the prompt needs.
        return seed + example + query