# user_input.py
###############################################
#                                             #
#  download the stopwords so the code below   #
#  does not raise an error                    #
#                                             #
###############################################
import os
import pickle

import nltk
import numpy as np
import tflearn
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.python.framework import ops

nltk.download('stopwords')  # fetch the stop word lists once; stopwords.words('arabic') needs them
#########################
# #
# Functions #
# #
#########################
def preprocessing_test_data(user_string):
    """Tokenize, normalize, and stem an Arabic question string."""
    tokens_array = TreebankWordTokenizer().tokenize(user_string)
    stemmer = ISRIStemmer()
    stop_words = stopwords.words('arabic')
    preprocessing_result_question = list()
    for word in tokens_array:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stop_words:            # exclude stop words from further processing
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = stemmer.norm(word, num=2)  # normalize the initial hamza to bare alif
            preprocessing_result_question.append(word)
    return preprocessing_result_question
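# A minimal usage sketch (the sample string below is a made-up illustration,
# not from the project's data): stop words are dropped and each remaining
# token comes back as a normalized ISRI stem, e.g.
#
#     preprocessing_test_data('ما هي مواعيد التسجيل في الجامعة؟')
#     # -> a short list of stems (the exact output depends on the ISRI rules
#     #    and on NLTK's Arabic stop word list)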
################################
#     user input features      #
################################
def user_features(preprocessed_data):
    """Build a TF-IDF feature vector aligned with the training feature space."""
    # Feature names saved at training time (357 features).
    with open('Question_features_names.tmp', 'rb') as dic:
        features_names = pickle.load(dic)
    # Rebuild the question as a single space-separated string.
    QU = ' '.join(preprocessed_data)
    QU_list = [QU]
    # Note: fitting on a single document gives every term the same idf weight,
    # so the values are effectively normalized term frequencies.
    QU_vectorizer = TfidfVectorizer()
    UQuestion_features = QU_vectorizer.fit_transform(QU_list)
    user_feature_names = QU_vectorizer.get_feature_names()
    # Map each term of the user question onto the training feature order;
    # terms unseen during training are dropped, missing ones stay zero.
    QU_features = np.zeros((1, 357))
    for j in range(357):
        for k in range(len(user_feature_names)):
            if user_feature_names[k] == features_names[j]:
                QU_features[0, j] = UQuestion_features[0, k]
    print(QU_features.shape)
    return QU_features
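# Sketch of how 'Question_features_names.tmp' is presumably produced at
# training time (hypothetical; the training script is not part of this file,
# and 'training_questions' is an assumed list of preprocessed question strings):
#
#     train_vectorizer = TfidfVectorizer()
#     train_vectorizer.fit(training_questions)  # yields the 357 feature names used above
#     with open('Question_features_names.tmp', 'wb') as f:
#         pickle.dump(train_vectorizer.get_feature_names(), f)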
def process_user_input(user_string):
    """Preprocess a user question, featurize it, and predict its answer class."""
    preprocessed_string = preprocessing_test_data(user_string)
    features = user_features(preprocessed_string)
    # Rebuild the same network architecture that was used for training.
    ops.reset_default_graph()
    net = tflearn.input_data(shape=[None, 357], name='input')
    net = tflearn.fully_connected(net, 256, activation='relu')
    net = tflearn.fully_connected(net, 128, activation='relu')
    net = tflearn.fully_connected(net, 64, activation='relu')
    output = tflearn.fully_connected(net, 277, activation='softmax')  # 277 answer classes
    output = tflearn.regression(output, optimizer='adam', learning_rate=0.01,
                                loss='categorical_crossentropy', name='output')
    # define model
    model = tflearn.DNN(output)
    # Load the trained weights; predicting with an untrained model is meaningless.
    if os.path.exists('model.tfl.meta'):
        model.load('model.tfl')
    else:
        raise FileNotFoundError("model.tfl does not exist; train the model first")
    prediction = model.predict(features)
    predictions_list = []
    for i in range(len(prediction)):
        # Class labels are 1-based, so shift the argmax index by one.
        predictions_list.append(np.argmax(prediction[i]) + 1)
    with open('classes.tmp', 'rb') as dic:
        classes = pickle.load(dic)
    return classes[predictions_list[0]]
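# Minimal driver sketch, assuming 'Question_features_names.tmp', 'classes.tmp',
# and the trained 'model.tfl' checkpoint sit next to this script; the sample
# question below is a hypothetical placeholder, not project data.
if __name__ == '__main__':
    sample_question = 'ما هي مواعيد التسجيل؟'  # hypothetical Arabic user question
    print(process_user_input(sample_question))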