-
Notifications
You must be signed in to change notification settings - Fork 1
/
tsneplots.py
152 lines (120 loc) · 4.71 KB
/
tsneplots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# coding: utf-8
from gensim.models import Doc2Vec
from collections import defaultdict
import numpy as np
import random, gensim, logging
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA
from bokeh.plotting import figure, show, ColumnDataSource, output_notebook
from bokeh.models import HoverTool
from collections import namedtuple
from emotionslist import root_emotions_categorised, emotion_categories
import cPickle as pickle
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers
# (which can happen inside a notebook), so the level is also set directly.
logging.root.level = logging.INFO

# Author's note: Bokeh output is shown inline in an IPython notebook (or the
# figure can be saved as HTML); it has no pop-up display like matplotlib.
output_notebook()


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file.

    NOTE: unpickling can execute arbitrary code. Only load files that were
    produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


model = gensim.models.Doc2Vec.load('/path/to/vector/model')
x_train = _load_pickle('/path/to/training/sentences')
y_train = _load_pickle('/path/to/training/emotions')
# Despite its name this object is iterated as a sequence of dicts, one per
# emotion category (presumably in the same order as root_emotions_categorised
# - TODO confirm against the code that writes the pickle).
derived_emotions_categorised_dict = _load_pickle('/path/to/derived/emotions')

# list(d) instead of d.keys(): on Python 3 keys() returns a view, which
# cannot be concatenated to a list below.
derived_emotions_categorised_list = [list(category) for category in derived_emotions_categorised_dict]

# Root emotions followed by derived emotions, one combined list per category.
concatenated_emotions_list = [
    root_emotions_categorised[idx] + derived_emotions_categorised_list[idx]
    for idx in range(len(emotion_categories))
]
def getWordVecs(model, corpus, category, size):
    """Build one feature vector per sentence by summing its word vectors.

    A sentence containing any word that ``model`` cannot look up (the lookup
    raises ``KeyError``) is skipped entirely, together with its label. A
    sentence with no words at all is kept and yields a zero vector.

    Args:
        model: mapping-like object where ``model[word]`` returns an array
            holding ``size`` values (the trained vector model in this script).
        corpus: iterable of sentences, each an iterable of words.
        category: sequence of labels; ``category[i]`` belongs to the i-th
            sentence of ``corpus``.
        size: dimensionality of the word vectors.

    Returns:
        A ``result`` namedtuple ``(vecs, emos)``: ``vecs`` is the list of
        summed vectors and ``emos`` the list of labels of the retained
        sentences, in corpus order.
    """
    Result = namedtuple('result', 'vecs emos')
    vecs = []
    emos = []
    for i, words in enumerate(corpus):
        sentence_vec = np.zeros(size)
        for word in words:
            try:
                sentence_vec += model[word].reshape(size)
            except KeyError:
                # Out-of-vocabulary word: abandon this sentence.
                break
        else:
            # Reached only when the inner loop finished without a break,
            # i.e. every word of the sentence was found in the model.
            vecs.append(sentence_vec)
            emos.append(category[i])
    return Result(vecs=vecs, emos=emos)
def getDocVecs(model, corpus, emotions, size):
    """Infer one document vector per sentence with ``model.infer_vector``.

    Args:
        model: object exposing ``infer_vector(words)`` (the gensim Doc2Vec
            model in this script).
        corpus: iterable of sentences, each passed as-is to ``infer_vector``.
        emotions: labels of the sentences; returned unchanged.
        size: number of values in each inferred vector; every vector is
            flattened to shape ``(size,)``.

    Returns:
        A ``result`` namedtuple ``(vecs, emos)``: ``vecs`` is the list of
        inferred vectors in corpus order, ``emos`` is ``emotions``.
    """
    Result = namedtuple('result', 'vecs emos')
    vecs = [np.array(model.infer_vector(doc)).reshape(size) for doc in corpus]
    # Return an instance: assigning attributes on the namedtuple class itself
    # would leave the data as class-level state instead of a real record.
    return Result(vecs=vecs, emos=emotions)
# Summed-word-vector features and labels for the training sentences; 400 is
# the vector size handed to getWordVecs.
model_features = getWordVecs(model, x_train, y_train, 400)

# In case of document vectors (model_cbow is not defined in this script;
# substitute the model loaded above):
#     sentence_vecs = getDocVecs(model_cbow, x_train, y_train, 400)

# Tally how many retained training instances carry each emotion label.
emotion_count = defaultdict(int)
for emotion in model_features.emos:
    emotion_count[emotion] = emotion_count[emotion] + 1
def sample_data(trained_model, limit):
    """Randomly sample at most ``limit`` vectors from every emotion category.

    Capping each category reduces the computational cost of the
    dimensionality reduction that is run on the sampled vectors afterwards.
    The per-category sizes are derived from ``trained_model`` itself, so the
    two returned lists always have the same length.

    Args:
        trained_model: object with parallel sequences ``vecs`` (feature
            vectors) and ``emos`` (the emotion label of each vector), such as
            the value returned by ``getWordVecs`` or ``getDocVecs``.
        limit: maximum number of vectors to keep per emotion; values below
            zero are treated as zero.

    Returns:
        A ``sampled_data`` namedtuple ``(vectors, emotions)`` where
        ``emotions[i]`` is the label of ``vectors[i]``. Vectors of the same
        emotion are adjacent; their order within a category is random.
    """
    SampledData = namedtuple('sampled_data', 'vectors,emotions')

    # Group the vectors by label in a single pass instead of rescanning the
    # whole data set once per emotion.
    vectors_by_emotion = defaultdict(list)
    for vec, emotion in zip(trained_model.vecs, trained_model.emos):
        vectors_by_emotion[emotion].append(vec)

    sampled_vectors = []
    sampled_emotions = []
    for emotion, candidates in vectors_by_emotion.items():
        keep = max(0, min(limit, len(candidates)))
        # The first `keep` entries of a random permutation are a uniform
        # sample without replacement of the candidate indices.
        chosen = np.random.permutation(len(candidates))[:keep]
        sampled_vectors.extend(candidates[i] for i in chosen)
        sampled_emotions.extend([emotion] * keep)
    return SampledData(vectors=sampled_vectors, emotions=sampled_emotions)
# Upper bound on the number of vectors sampled per emotion.
limit = 5000
# sample_data reads the .vecs / .emos attributes, which the extracted
# features (model_features) provide; the vector model object itself was
# being passed here before.
sampled_data = sample_data(model_features, limit)
# Give the extra 'none' label a colour so the lookup below cannot miss it.
emotion_categories.update({'none':'gray'})
# One colour per sampled vector, looked up from its emotion label.
c_val = [emotion_categories[emotion] for emotion in sampled_data.emotions]
def tsne_plot(data):
    """Show a Bokeh scatter plot of two-dimensional points.

    Besides ``data``, the function reads two module-level names: ``c_val``
    (used as the colour column) and ``sampled_data.emotions`` (stored as the
    ``legend`` column of the data source).

    Args:
        data: iterable of points; element 0 of each point is used as the x
            coordinate and element 1 as the y coordinate.
    """
    # Walk the input once, then split the pairs into the two axis columns.
    pairs = [(point[0], point[1]) for point in data]
    xs = [pair[0] for pair in pairs]
    ys = [pair[1] for pair in pairs]

    columns = dict(x=xs, y=ys, z=c_val, legend=sampled_data.emotions)
    source = ColumnDataSource(data=columns)

    tool_names = "pan,wheel_zoom,box_zoom,reset,hover,save"
    fig = figure(title="Tweets dataset visualised under TSNE", tools=tool_names)
    fig.scatter('x', 'y', color='z', source=source)

    # Apply the "follow_mouse" point policy to the figure's hover tool(s).
    hover_tools = fig.select(dict(type=HoverTool))
    hover_tools.point_policy = "follow_mouse"
    show(fig)
# PCA: project the sampled vectors onto two components. The projection is
# only computed here; nothing in this part of the script plots it.
pca_reducer = PCA(n_components=2)
pca_vecs = pca_reducer.fit_transform(sampled_data.vectors)

# Before computing t-SNE, the high-dimensional vectors are first reduced to
# a reasonable dimension (60 components) using TruncatedSVD.
svd_reducer = TruncatedSVD(n_components=60)
svd_vecs = svd_reducer.fit_transform(sampled_data.vectors)

tsne_reducer = TSNE(n_components=2, method='barnes_hut', verbose=10)
tsne_vecs = tsne_reducer.fit_transform(svd_vecs)
tsne_plot(tsne_vecs)