-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
117 lines (96 loc) · 4.17 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import torch
import time
# from pykeops.torch import LazyTensor
#from pykeops.numpy import LazyTensor
def kmeans_l1(x_bow, K, n_iter=20):
    ''' K-means with L1 distance on row-normalised bag-of-words vectors.

    Every row of ``x_bow`` is divided by its row sum, ``K`` distinct rows are
    drawn with ``np.random.choice`` as the initial centres, and ``n_iter``
    assign / re-average rounds are run. A cluster that ends up empty is
    re-seeded with one randomly drawn row. Seed ``np.random`` beforehand for
    reproducible results.

    Args:
        x_bow - 2-D tensor with one row per sample
        K - number of clusters, 1 <= K <= number of rows
        n_iter - number of assign / update rounds (default 20)
    Returns:
        (K, D) tensor of cluster centres on the same device as ``x_bow``,
        ordered from the most to the least populated cluster according to
        the last assignment step.
    Raises:
        ValueError - if K is outside [1, N] or n_iter < 1
    '''
    N, _ = x_bow.shape
    if not 1 <= K <= N:
        raise ValueError(
            'K must satisfy 1 <= K <= N (got K={}, N={})'.format(K, N))
    if n_iter < 1:
        raise ValueError('n_iter must be >= 1 (got {})'.format(n_iter))

    row_totals = torch.sum(x_bow, dim=1, keepdim=True)
    # An all-zero row would give 0/0 = NaN and poison every centre it touches;
    # dividing by 1 instead keeps it as a zero vector.
    safe_totals = torch.where(
        row_totals == 0, torch.ones_like(row_totals), row_totals)
    x_norm = x_bow / safe_totals

    seed_rows = torch.from_numpy(
        np.random.choice(N, K, replace=False)).long().to(x_norm.device)
    centers = x_norm[seed_rows]  # advanced indexing -> independent copy

    for _ in range(n_iter):
        # Assignment: compare centres with the *normalised* rows (the centres
        # live in that space), all rows at once, on whatever device the data
        # is on.
        assign = torch.cdist(x_norm, centers, p=1).argmin(dim=1)

        # Update: mean of the members, or a random row if the cluster is empty.
        for k in range(K):
            members = assign == k
            if members.any():
                centers[k] = x_norm[members].mean(dim=0)
            else:
                centers[k] = x_norm[int(np.random.choice(N, 1)[0])]

    # Rank clusters by population, largest first.
    sizes = torch.bincount(assign, minlength=K)
    order = torch.argsort(sizes, descending=True)
    return centers[order]
# NOTE: the string literal below is disabled code (two KeOps-based K-means
# variants, one for torch tensors and one for numpy arrays) kept for reference
# only; it is never executed. Re-enabling it would require one of the
# commented-out `LazyTensor` imports near the top of this file, and it
# references `torchtype` and `dtype`, which are not defined in the visible
# part of this file.
'''
def Kmeans_keops(x, K=10, Niter=10, verbose=True):
N, D = x.shape # Number of samples, dimension of the ambient space
# K-means loop:
# - x is the point cloud,
# - cl is the vector of class labels
# - c is the cloud of cluster centroids
start = time.time()
c = x[:K, :].clone() # Simplistic random initialization
x_i = LazyTensor(x[:, None, :]) # (Npoints, 1, D)
for i in range(Niter):
c_j = LazyTensor(c[None, :, :]) # (1, Nclusters, D)
D_ij = ((x_i - c_j) ** 2).sum(-1) # (Npoints, Nclusters) symbolic matrix of squared distances
cl = D_ij.argmin(dim=1).long().view(-1) # Points -> Nearest cluster
Ncl = torch.bincount(cl).type(torchtype[dtype]) # Class weights
for d in range(D): # Compute the cluster centroids with torch.bincount:
c[:, d] = torch.bincount(cl, weights=x[:, d]) / Ncl
end = time.time()
if verbose:
print("K-means example with {:,} points in dimension {:,}, K = {:,}:".format(N, D, K))
print('Timing for {} iterations: {:.5f}s = {} x {:.5f}s\n'.format(
Niter, end - start, Niter, (end-start) / Niter))
return cl, c
def KMeans_keops_numpy(x, K=10, Niter=10, verbose=True):
N, D = x.shape # Number of samples, dimension of the ambient space
# K-means loop:
# - x is the point cloud,
# - cl is the vector of class labels
# - c is the cloud of cluster centroids
start = time.time()
c = np.copy(x[:K, :]) # Simplistic random initialization
x_i = LazyTensor(x[:, None, :]) # (Npoints, 1, D)
for i in range(Niter):
c_j = LazyTensor(c[None, :, :]) # (1, Nclusters, D)
D_ij = ((x_i - c_j) ** 2).sum(-1) # (Npoints, Nclusters) symbolic matrix of squared distances
cl = D_ij.argmin(axis=1).astype(int).reshape(N) # Points -> Nearest cluster
Ncl = np.bincount(cl).astype(dtype) # Class weights
for d in range(D): # Compute the cluster centroids with np.bincount:
c[:, d] = np.bincount(cl, weights=x[:, d]) / Ncl
end = time.time()
if verbose:
print("K-means example with {:,} points in dimension {:,}, K = {:,}:".format(N, D, K))
print('Timing for {} iterations: {:.5f}s = {} x {:.5f}s\n'.format(
Niter, end - start, Niter, (end - start) / Niter))
return cl, c
'''
def createTensors(bow, vocab, allow_pickle=False):
    ''' Creates tensors and bag of words format for files to save time

    Loads per-document index / count arrays and the vocabulary, builds a dense
    (documents x vocabulary) count matrix, and writes four files into the
    current working directory: 'x_cnt_all.pt' (object array of per-document
    count tensors), 'x_idx_all.pt' (the index arrays as loaded),
    'M_all.pt' (list with the number of index entries of each document) and
    'x_bow_all.pt' (the dense matrix). Tensors are placed on 'cuda:0' when
    CUDA is available, otherwise on the CPU.

    Args:
        bow - path to the numpy files (.npz archive whose first array holds
              the per-document index arrays and whose second array holds the
              matching count arrays)
        vocab - path to vocab file (.npz archive; its first array is the
                vocabulary, only its length is used)
        allow_pickle - forwarded to np.load. Ragged (object-dtype) arrays can
                only be read with allow_pickle=True; enable it only for
                files you trust, since unpickling can execute arbitrary code.
    Raises:
        ValueError - if the index and count arrays differ in length, or if a
                     document's index entry has no ``size`` attribute (the
                     offending entry and its position are the exception args)
    '''
    t_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # np.load on an .npz keeps the archive open until closed, so read what is
    # needed inside a context manager and release the handle right away.
    with np.load(bow, allow_pickle=allow_pickle) as bow_archive:
        x_idx = bow_archive[bow_archive.files[0]]  # mutation index matrix
        x_cnt = bow_archive[bow_archive.files[1]]  # mutation count matrix
    with np.load(vocab, allow_pickle=allow_pickle) as vocab_archive:
        the_vocab = vocab_archive[vocab_archive.files[0]]

    n_docs = len(x_idx)
    if len(x_cnt) != n_docs:
        raise ValueError(
            'index and count arrays differ in length: {} vs {}'.format(
                n_docs, len(x_cnt)))

    x_bow = torch.zeros(n_docs, the_vocab.shape[0], device=t_device)
    # Separate object array for the tensors instead of overwriting the loaded
    # data in place; the saved container type stays a numpy object array.
    cnt_tensors = np.empty(n_docs, dtype=object)
    M = []  # M is a list of document unique word counts.
    for n, (doc_idx, doc_cnt) in enumerate(zip(x_idx, x_cnt)):
        cnt_tensors[n] = torch.from_numpy(doc_cnt).float().to(t_device)
        try:
            M.append(doc_idx.size)
        except AttributeError as err:
            raise ValueError(doc_idx, n) from err
        x_bow[n, doc_idx.tolist()] = cnt_tensors[n]

    torch.save(cnt_tensors, 'x_cnt_all.pt')
    torch.save(x_idx, 'x_idx_all.pt')
    torch.save(M, 'M_all.pt')
    torch.save(x_bow, 'x_bow_all.pt')