-
Notifications
You must be signed in to change notification settings - Fork 3
/
CLOCQInterfaceClient.py
225 lines (200 loc) · 7.97 KB
/
CLOCQInterfaceClient.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import json
import pickle
import random
import re
import time
import requests
class CLOCQInterfaceClient:
    """
    Thin HTTP client for a CLOCQ server.

    Every public lookup method POSTs a JSON payload to one endpoint of the
    server and returns the decoded JSON response (`connectivity_check`
    returns a float instead).
    """

    # Compiled once per class; still reachable as `self.ENTITY_PATTERN` /
    # `self.PRED_PATTERN` on every instance.
    ENTITY_PATTERN = re.compile(r"^Q[0-9]+$")
    PRED_PATTERN = re.compile(r"^P[0-9]+$")

    def __init__(self, host="http://localhost", port="7777"):
        """
        Create a client for the CLOCQ server at `host`:`port`.

        `port` may be given as a string or as an integer. A port of 443 is
        treated as the default HTTPS port and is not appended to the URL.
        """
        self.host = host
        self.port = port
        # One shared session so that consecutive requests reuse it.
        self.req = requests.Session()

    def get_label(self, kb_item):
        """
        Retrieves a single label for the given KB item.
        E.g. "France national association football team" for "Q47774".

        Note: The n-triples Wikidata dump stores multiple labels (not aliases) for the same item.
        Here, we return the first KB label which is not exactly the KB item id (i.e. "Q47774").
        Shown as "Label" in Wikidata.
        """
        return self._post_json("/item_to_label", {"item": kb_item})

    def get_labels(self, kb_item):
        """
        Retrieves the list of labels for the given KB item.
        E.g. ["France national association football team", "France national team"] for "Q47774".

        Note: The n-triples Wikidata dump stores multiple labels (not aliases) for the same item.
        Here, we return the full list of KB labels stored in the n-triples dump.
        Shown as "Label" in Wikidata.
        """
        return self._post_json("/item_to_labels", {"item": kb_item})

    def get_aliases(self, kb_item):
        """
        Retrieves the aliases for the given KB item.
        E.g. "France" for "Q47774".
        Shown as "Also known as" in Wikidata.
        """
        return self._post_json("/item_to_aliases", {"item": kb_item})

    def get_description(self, kb_item):
        """
        Retrieves the description for the given KB item.
        The descriptions can be seen on top of Wikidata pages.
        E.g. "men's national association football team representing France" for "Q47774".
        Shown as "Description" in Wikidata.
        """
        return self._post_json("/item_to_description", {"item": kb_item})

    def get_types(self, kb_item):
        """
        Retrieves the types for the given KB item.
        Returns list of items with keys: {"id", "label"}.
        E.g. [{"id": "Q6979593", "label": "national association football team"}] for "Q47774".
        """
        return self._post_json("/item_to_types", {"item": kb_item})

    def get_type(self, kb_item):
        """
        Retrieves the most frequent type for the given KB item.
        Returns one item with keys: {"id", "label"}.
        E.g. {"id": "Q6979593", "label": "national association football team"} for "Q47774".
        """
        return self._post_json("/item_to_type", {"item": kb_item})

    def get_frequency(self, kb_item):
        """
        A list of two frequency numbers for the given KB item:
        - number of facts with the item occurring as subject
        - number of facts with the item occurring as object/qualifier-object.
        """
        return self._post_json("/frequency", {"item": kb_item})

    def get_neighborhood(self, kb_item, p=1000, include_labels=True, include_type=False):
        """
        Returns a list of facts including the item (the 1-hop neighborhood).
        Each fact is a n-tuple, with subject, predicate, object and qualifier information.

        `p`, `include_labels` and `include_type` are forwarded to the server unchanged.
        """
        params = {
            "item": kb_item,
            "p": p,
            "include_labels": include_labels,
            "include_type": include_type,
        }
        return self._post_json("/neighborhood", params)

    def get_neighborhood_two_hop(self, kb_item, p=1000, include_labels=True, include_type=False):
        """
        Returns a list of facts in the 2-hop neighborhood of the item.
        Each fact is a n-tuple, with subject, predicate, object and qualifier information.

        `p`, `include_labels` and `include_type` are forwarded to the server unchanged.
        """
        params = {
            "item": kb_item,
            "p": p,
            "include_labels": include_labels,
            "include_type": include_type,
        }
        return self._post_json("/two_hop_neighborhood", params)

    def connect(self, kb_item1, kb_item2):
        """
        Returns a list of paths between item1 and item2. Each path is given by either 1 fact
        (1-hop connection) or 2 facts (2-hop connections).
        """
        return self._post_json("/connect", {"item1": kb_item1, "item2": kb_item2})

    def connectivity_check(self, kb_item1, kb_item2):
        """
        Returns the distance of the two items in the graph, given a fact-based definition.
        Returns 1 if the items are within 1 hop of each other,
        Returns 0.5 if the items are within 2 hops of each other,
        and returns 0 otherwise.
        """
        res = self._req("/connectivity_check", {"item1": kb_item1, "item2": kb_item2})
        # The raw response body is parsed as a number directly (no JSON decoding).
        return float(res.content)

    def relation_linking(self, question, parameters=None, top_ranked=True):
        """
        Run relation linking on the given question.
        This method follows the approach submitted to the SMART 2022 task.
        For implementing the linking method, the standard CLOCQ algorithm is used.

        If `parameters` is omitted (or None), an empty dict is sent.
        The output is a set of linkings: a list of dicts, with the mention and relation.
        """
        params = {
            "question": question,
            # Fresh dict per call instead of a shared mutable default.
            "parameters": {} if parameters is None else parameters,
            "top_ranked": top_ranked,
        }
        return self._post_json("/relation_linking", params, linking_path=True)

    def entity_linking(self, question, parameters=None, k="AUTO"):
        """
        Run entity linking on the given question.
        This method follows the approach submitted to the SMART 2022 task.
        For implementing the linking method, the standard CLOCQ algorithm is used.

        k can be given as part of the parameters dict, or separately.
        If both are given, the value in the parameters dict is used.
        If `parameters` is omitted (or None), an empty dict is sent.
        The output is a set of linkings: a list of dicts, with the mention and entity.
        """
        params = {
            "question": question,
            # Fresh dict per call instead of a shared mutable default.
            "parameters": {} if parameters is None else parameters,
            "k": k,
        }
        return self._post_json("/entity_linking", params, linking_path=True)

    def get_search_space(self, question, parameters=None, include_labels=True, include_type=False):
        """
        Extract a question-specific context for the given question using the CLOCQ algorithm.
        Returns k (context tuple, context graph)-pairs for the given questions,
        i.e. a mapping of question words to KB items and a question-relevant KG subset.

        In case the parameters dict is empty (or omitted), the default CLOCQ parameters are used.
        """
        params = {
            "question": question,
            # Fresh dict per call instead of a shared mutable default.
            "parameters": {} if parameters is None else parameters,
            "include_labels": include_labels,
            "include_type": include_type,
        }
        return self._post_json("/search_space", params)

    def is_wikidata_entity(self, string):
        """
        Check whether the given string can be a Wikidata entity ("Q" followed by digits).
        """
        # fullmatch, because `$` with match() would also accept a trailing newline.
        return self.ENTITY_PATTERN.fullmatch(string) is not None

    def is_wikidata_predicate(self, string):
        """
        Check whether the given string can be a Wikidata predicate ("P" followed by digits).
        """
        # fullmatch, because `$` with match() would also accept a trailing newline.
        return self.PRED_PATTERN.fullmatch(string) is not None

    def _build_url(self, action, linking_path=False):
        """
        Build the full URL for the endpoint `action` (e.g. "/item_to_label").
        """
        # linking has a different backend (wrapper around native CLOCQ API)
        if linking_path:
            # NOTE(review): every occurrence of "api" in the host is replaced and the
            # port is not appended on this path -- confirm this is intended for hosts
            # other than the public ".../api" endpoint.
            return self.host.replace("api", "linking_api") + action
        # Normalise the port so that both 443 and "443" (or 7777 and "7777") work.
        port = str(self.port)
        if port == "443":
            return self.host + action
        return self.host + ":" + port + action

    def _req(self, action, json, linking_path=False):
        """
        POST the payload `json` to the endpoint `action` and return the raw response.

        The parameter name `json` shadows the `json` module inside this method only;
        it is kept for compatibility with existing keyword callers.
        """
        return self.req.post(self._build_url(action, linking_path), json=json)

    def _post_json(self, action, params, linking_path=False):
        """
        POST `params` to the endpoint `action` and return the JSON-decoded response body.
        """
        res = self._req(action, params, linking_path=linking_path)
        return json.loads(res.content.decode("utf-8"))
"""
MAIN
"""
if __name__ == "__main__":
    # Smoke test when run as a script: look up the label of item "Q5"
    # on the public CLOCQ endpoint and print it.
    client = CLOCQInterfaceClient(host="https://clocq.mpi-inf.mpg.de/api", port="443")
    print(client.get_label("Q5"))