-
Notifications
You must be signed in to change notification settings - Fork 1
/
elastic_utils.py
134 lines (122 loc) · 4.11 KB
/
elastic_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import requests
import json
import os
from tqdm import tqdm
from hazm import *
class ElasticUtils():
    """Thin client for an Elasticsearch index (``/elasticbot``) holding Persian
    dialogue pairs.

    On construction it creates the index (with a Persian analyzer) if it does
    not exist yet, and seeds it from the bundled ``persian_corpus.json`` on
    first run.  ``respond`` retrieves the best-matching dialogue for an input
    sentence; ``learn`` indexes a user-taught pair.
    """

    def __init__(self, server_ip, server_port):
        """Connect to Elasticsearch at ``server_ip:server_port`` and prepare the index.

        Args:
            server_ip: host name or IP of the Elasticsearch node.
            server_port: HTTP port of the Elasticsearch node.
        """
        self.server_ip = server_ip
        self.server_port = server_port
        self.base_path = 'http://{}:{}/elasticbot'.format(server_ip, server_port)
        # Create the normalizer before anything else runs, so every later
        # method (including seeding) can rely on it being present.
        self.normalizer = Normalizer()
        self.first_time = self.__setup_index()
        if self.first_time:
            self.__add_corpus()

    def __normalize(self, text):
        """Return *text* normalized with hazm and re-joined from word tokens."""
        return ' '.join(word_tokenize(self.normalizer.normalize(text)))

    def __add_corpus(self):
        """Load ``persian_corpus.json`` (next to this file) and index every entry."""
        corpus_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'persian_corpus.json')
        # Explicit utf-8: the corpus is Persian text and the platform default
        # encoding (e.g. cp1252 on Windows) would corrupt or reject it.
        with open(corpus_path, 'r', encoding='utf-8') as fh:
            corpus = json.load(fh)
        for entry in corpus:
            # Keep a verbatim copy of the input for exact (keyword) matching.
            entry['exact_input'] = entry['input']
        print('Sending dialogues to elasticsearch ...')
        for corp in tqdm(corpus):
            requests.post(self.base_path + "/dialogue", json=corp)

    def __setup_index(self):
        """Create the index with a Persian analyzer and the ``dialogue`` mapping.

        Returns:
            True if the index was newly created (caller should seed the corpus),
            False if it already existed, None on any other error.
        """
        body = {
            "settings": {
                "analysis": {
                    "char_filter": {
                        "zero_width_spaces": {
                            "type": "mapping",
                            # Map the zero-width non-joiner (common in Persian
                            # orthography) to a plain space before tokenizing.
                            "mappings": [ "\\u200C=> "]
                        }
                    },
                    "filter": {
                        "persian_stop": {
                            "type": "stop",
                            "stopwords": "_persian_"
                        }
                    },
                    "analyzer": {
                        "rebuilt_persian": {
                            "tokenizer": "standard",
                            "char_filter": [ "zero_width_spaces" ],
                            "filter": [
                                "lowercase",
                                "decimal_digit",
                                "arabic_normalization",
                                "persian_normalization",
                                "persian_stop"
                            ]
                        }
                    }
                }
            },
            "mappings": {
                "dialogue": {
                    "properties": {
                        "input": {
                            "type": "text"
                        },
                        "output": {
                            "type": "text"
                        },
                        # keyword fields: matched exactly, never analyzed.
                        "exact_input": {
                            "type": "keyword"
                        },
                        "tag": {
                            "type": "keyword"
                        }
                    }
                }
            }
        }
        r = requests.put(self.base_path, json=body)
        if 'resource_already_exists_exception' in r.text:
            print('Successfully set up the index')
            return False
        elif '"acknowledged":true' in r.text:
            print('Successfully set up the index')
            return True
        else:
            print('Encountered an error while setting up index: {}'.format(r.text))
            return None

    def respond(self, input_text, size=2):
        """Return the best-matching dialogue document for *input_text*.

        Args:
            input_text: raw user sentence; normalized before searching.
            size: number of hits to request from Elasticsearch.

        Returns:
            The ``_source`` dict of the top hit, or None when nothing matched.
        """
        input_text = self.__normalize(input_text)
        r = requests.post(self.base_path + '/dialogue/_search', json={
            "query": {
                "bool": {
                    "should": [
                        {
                            # Exact keyword match is strongly boosted so a
                            # verbatim known input always wins.
                            "term" : {
                                "exact_input": {
                                    "value": input_text,
                                    "boost": 100
                                }
                            }
                        },
                        {
                            "multi_match" : {
                                "query" : input_text,
                                "fields": ["input^3", "output"],
                                # "fuzziness": 2
                            }
                        }
                    ]
                }
            },
            "size": size
        })
        # Parse the response once instead of re-running json.loads per field.
        hits_info = r.json()['hits']
        if hits_info['max_score']:
            return hits_info['hits'][0]['_source']
        else:
            return None

    def learn(self, user_teach):
        """Index a user-taught dialogue pair tagged ``user_teach``.

        Args:
            user_teach: dict with raw ``'input'`` and ``'output'`` strings.
        """
        # Work on a copy so the caller's dict is not mutated as a side effect.
        doc = dict(user_teach)
        doc['input'] = self.__normalize(doc['input'])
        doc['output'] = [self.__normalize(doc['output'])]
        doc['tag'] = 'user_teach'
        doc['exact_input'] = doc['input']
        requests.post(self.base_path + "/dialogue", json=doc)