-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMovie_Rating_Predictor.py
85 lines (58 loc) · 2.14 KB
/
Movie_Rating_Predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
# Number of samples kept from each split. Reviews and labels for a split must
# be cut at the same length, so each cap is defined exactly once.
# NOTE(review): the caps are presumably there to keep the dense matrices
# produced by .toarray() below small -- confirm before raising them.
N_TRAIN = 500
N_TEST = 50

# Opening cleaned reviews
with open('clean_imdb_trainX.txt', encoding='utf8') as f:
    reviews = f.readlines()[:N_TRAIN]

# Creating vectorized 2d array
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# The vocabulary is learned from the training reviews only.
x_train = cv.fit_transform(reviews).toarray()

# Adding ratings to vectorized corpus
with open('imdb_trainY.txt', encoding='utf8') as f1:
    # Slice before converting: only the labels that are actually used get
    # parsed, so a blank or malformed line past the cap cannot abort the run.
    ratings = [int(line) for line in f1.readlines()[:N_TRAIN]]
y_train = np.array(ratings)

# Getting x_test and y_test
with open('clean_imdb_testX.txt', encoding='utf8') as f3:
    test_reviews = f3.readlines()[:N_TEST]
# transform (not fit_transform): reuse the vocabulary fitted on the training
# reviews so the test columns line up with x_train.
x_test = cv.transform(test_reviews).toarray()

with open('imdb_testY.txt', encoding='utf8') as f4:
    test_ratings = [int(line) for line in f4.readlines()[:N_TEST]]
y_test = np.array(test_ratings)
# Prior Probability
def prior_probab(x_train, y_train, label_value):
    """Return the fraction of training rows labelled ``label_value``.

    Args:
        x_train: Training feature array; only its row count (``shape[0]``)
            is used.
        y_train: Labels, one per row of ``x_train`` (NumPy array or any
            array-like).
        label_value: The class label whose prior probability is wanted.

    Returns:
        ``count(y_train == label_value) / x_train.shape[0]`` as a float, or
        ``0.0`` when ``x_train`` has no rows.
    """
    total_rows = x_train.shape[0]
    if total_rows == 0:
        # No training data: report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    # Count on the label vector directly rather than materialising a copy of
    # every matching feature row just to read its length.
    label_count = np.count_nonzero(np.asarray(y_train) == label_value)
    return label_count / float(total_rows)
# NOTE(review): the return value of this call is discarded, so it has no
# visible effect when the file runs as a script -- presumably a leftover
# interactive sanity check; confirm before removing.
prior_probab(x_train, y_train, 1)
# Conditional Probability
def cond_probab(x_train, y_train, feature_index, feature_value, label_value):
    """Estimate P(feature == feature_value | label == label_value) by counting.

    Args:
        x_train: 2-D training feature array (rows are samples, columns are
            features).
        y_train: Labels, one per row of ``x_train`` (NumPy array or any
            array-like).
        feature_index: Column of ``x_train`` to inspect.
        feature_value: Value the feature must equal.
        label_value: Class label to condition on.

    Returns:
        The fraction of rows labelled ``label_value`` whose column
        ``feature_index`` equals ``feature_value``, as a float. Returns
        ``0.0`` when no training row carries ``label_value``.
    """
    label_mask = np.asarray(y_train) == label_value
    label_count = np.count_nonzero(label_mask)
    if label_count == 0:
        # Unseen class: nothing to condition on, and avoids dividing by zero.
        return 0.0
    # Pull out only the one column for the matching rows instead of copying
    # the whole class sub-matrix on every call.
    feature_column = x_train[label_mask, feature_index]
    match_count = np.count_nonzero(feature_column == feature_value)
    # Divide by the class size, not the full training-set size: dividing by
    # the total row count yields the joint P(feature, label), which would make
    # the prior count twice once the caller multiplies by it.
    return match_count / float(label_count)
# Calculating Classes
# Distinct labels present in the training data; every prediction is one of these.
classes = np.unique(y_train)

# Calculating y_pred
# For each test row, pick the class with the highest score
#     log P(class) + sum over features of log P(feature = value | class).
# Scores are accumulated in log space because multiplying one probability per
# vocabulary column can underflow float64 to 0.0 for every class, which would
# make argmax always return the first class.
y_pred = []
with np.errstate(divide="ignore"):  # log(0) -> -inf is the intended result
    # Priors do not depend on the test row, so compute them once per class.
    log_priors = np.log([prior_probab(x_train, y_train, jx) for jx in classes])
    for ix in range(x_test.shape[0]):
        post_prob = []
        for class_pos, jx in enumerate(classes):
            # kx is the feature *index* and feature_value the observed count.
            # Iterating the row directly would hand the counts themselves to
            # cond_probab as column indices.
            conds = [
                cond_probab(x_train, y_train, kx, feature_value, jx)
                for kx, feature_value in enumerate(x_test[ix])
            ]
            log_likelihood = np.sum(np.log(conds))
            post_prob.append(log_priors[class_pos] + log_likelihood)
        post_prob = np.array(post_prob)
        pred_label = classes[post_prob.argmax()]
        y_pred.append(pred_label)
# Confusion Matrix
from sklearn.metrics import confusion_matrix
# Entry [i, j] counts samples whose true label is class i and whose predicted
# label is class j (per the scikit-learn documentation).
cm = confusion_matrix(y_test, y_pred)
# Correct predictions lie on the diagonal. Summing the whole diagonal instead
# of only cm[0,0] + cm[1,1] gives the same result for two classes, stays
# correct when more than two rating values occur, and does not raise
# IndexError when only one class is present.
accuracy = cm.trace() / cm.sum()