-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
54 lines (42 loc) · 1.35 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

from cluster import K_Means
from utils import import_data, preprocess
from vect_utils import c_distance, simple_tokenizer
# --- Load the corpus and build a TF-IDF document-term matrix ---
filepath = 'data.txt'
data_as_lines, data_as_words = import_data(filepath)
# NOTE(review): assumes preprocess() yields one text per document — confirm in utils.
# Renamed from `l`: single-letter `l` is ambiguous (PEP 8 / E741).
docs = preprocess(data_as_words)
vectorizer = TfidfVectorizer(
    use_idf=True, tokenizer=simple_tokenizer,
    max_features=100,  # keep only the 100 highest-scoring vocabulary terms
    stop_words='english')
# X: sparse (n_documents x n_terms) TF-IDF matrix
X = vectorizer.fit_transform(docs)
# --- Reduce dimensionality with PCA and inspect two components visually ---
pca = PCA(n_components=50)
# toarray() yields a plain ndarray; the original todense() returns np.matrix,
# which recent scikit-learn / NumPy versions reject.
reduced = pca.fit_transform(X.toarray())
# After transposing, t[i] is the i-th principal component across all documents.
t = reduced.transpose()
print(len(t[0]))  # number of documents
# Scatter plot of components 3 and 4.
plt.scatter(t[3], t[4])
# plt.xlim(-0.08,0)
plt.show()
# Alternative 3-D view of the first three components:
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(t[0], t[1], t[2])
# plt.show()
# --- Cluster the documents and report the top terms per centroid ---
number_of_clusters = 5

km = KMeans(n_clusters=number_of_clusters)
# Use the shared constant instead of a repeated literal 5 so both
# implementations always run with the same k.
km_own = K_Means(k=number_of_clusters)
# toarray() instead of todense(): np.matrix is rejected by modern NumPy/sklearn.
km_own.fit(X.toarray())
print(km_own.get_clusters())

km.fit(X)
# For each cluster, term indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): newer scikit-learn renames this to get_feature_names_out();
# kept as-is for compatibility with older versions — confirm installed version.
terms = vectorizer.get_feature_names()
centroids = []
for i in range(number_of_clusters):
    # The 7 highest-weighted terms serve as a human-readable cluster summary.
    top_words = [terms[ind] for ind in order_centroids[i, :7]]
    centroids.append(' '.join(top_words))
    print("Cluster {}: {}".format(i, ' '.join(top_words)))