fmmdmwu_nyoom.py
import math
import numpy as np
from tqdm import trange
from datastructures.WeightedTree import WeightedTree
import datastructures.BallTree as BallTree
from algorithms.rounding import rand_round
import algorithms.utils as algsU
import datasets.utils as datsU
import algorithms.coreset as CORESET
def mult_weight_upd(gen, gamma, N, k, features, colors, c_tree: WeightedTree, kis, epsilon, percent_theoretical_limit=1.0):
    """
    uses the multiplicative weight update (MWU) method to generate a fractional
    solution for the LP; the caller rounds it to an integer solution afterwards
    :param gen: RNG generator (drives the randomized feasibility-check schedule)
    :param gamma: the minimum distance to optimize for
    :param N: the number of elements in the dataset
    :param k: total number of points selected
    :param features: dataset's features
    :param colors: matching colors
    :param c_tree: the ParGeo C++ tree on the features (passed in so we can re-use it across problem instances)
    :param kis: the color -> count mapping
    :param epsilon: allowed error value
    :param percent_theoretical_limit: fraction of the theoretical maximum number of iterations to run (default 1.0)
    :return: an N x 1 vector X of the solution, or None if infeasible
    :return: the "removed time" spent encoding and decoding queries
    """
"""
Things to try
- softmax
- sampling based on weights
- counting approximation approach (ask professor for writeup)
- random ideas?
"""
    def getNextSolutionCheckWait():
        # random gap (in iterations) between direct feasibility checks of X
        return gen.integers(9, 29)
    nextSolutionCheckWait = getNextSolutionCheckWait()
    assert k > 0
# time spent translating queries
translation_time = 0
scaled_eps = epsilon / (1.0 + (epsilon / 4.0))
    # mu parameterizes the error/iteration bound
    mu = k - 1
    h = np.full((N, 1), 1.0 / N, dtype=np.double)  # per-point weights
    X = np.zeros((N, 1))  # output (accumulated fractional solution)
    # theoretical iteration count: T = (8 * mu / eps^2) * ln(N)
    T = ((8 * mu) / (scaled_eps ** 2)) * math.log(N)
    # scale by the fraction of the theoretical limit requested
    T *= percent_theoretical_limit
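    # (hypothetical sizing example: with mu = 9, scaled_eps = 0.6 and
    #  N = 10_000, T = (72 / 0.36) * ln(10_000) ≈ 200 * 9.21 ≈ 1842 iterations,
    #  before the percent_theoretical_limit scaling)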
for t in trange(math.ceil(T), desc='MWU Loop', disable=False):
S = np.empty((0, features.shape[1])) # points we select this round
W = 0 # current weight sum
        # query the tree for the total h-weight within gamma / 2 of every point
timer = algsU.Stopwatch("Query")
inner_time, w_sums = c_tree.run_query(gamma / 2.0, h)
_, outer_time = timer.stop()
translation_time += (outer_time - inner_time)
# compute minimums per color
# TODO pre-compute as much of this loop as possible
for color in kis.keys():
            # indices of this color's points, used to map per-color minima
            # back to positions in the full dataset
            color_sums_ind = (color == colors).nonzero()[0]  # nonzero() returns a per-axis tuple
            # get the kis[color] points with the smallest weight sums;
            # argpartition leaves them (unordered) in the first kis[color] slots
            color_sums = w_sums[color_sums_ind]
            partition = np.argpartition(color_sums, kis[color] - 1)
            arg_mins = partition[:kis[color]]
            min_indices = color_sums_ind[arg_mins]
            # add 1 to each X[i] selected as a minimum
            X[min_indices] += 1
            # add the points we've selected this round to S
            S = np.append(S, features[min_indices], axis=0)
            # add their weight to W
            W += np.sum(w_sums[min_indices])
        if W >= 1:
            # infeasible for this gamma (the caller owns c_tree, so no cleanup here)
            return None, translation_time
        # for each point, count the selected points (S) within gamma / 2,
        # then build the loss vector M from those counts
        M = np.zeros_like(h)
        Z = BallTree.create(S)
        Cs = BallTree.get_counts_in_range(Z, features, gamma / 2.0)
        for i, c in enumerate(Cs):
            M[i] = (1.0 / mu) * (1 - c)
        # update the weights h
h = h * (np.ones_like(M) - ((scaled_eps / 4.0) * M))
h /= np.sum(h)
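        # MWU reweighting intuition: the LP constraint checked below is "total
        # selected mass within gamma / 2 of any point is at most 1". An
        # over-covered point (c > 1) has M < 0, so its weight grows and the
        # per-color minima steer away from it; an uncovered point (c = 0)
        # loses weight.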
# TODO: check rate of change of X and h (euclidean distance) or l-inf
# check directly if X is a feasible solution
if t > 100 and nextSolutionCheckWait == 0:
# reset the wait to a new time
nextSolutionCheckWait = getNextSolutionCheckWait()
            timer = algsU.Stopwatch("Query")
            # capture this query's inner time (the value from the query at the
            # top of the loop would be stale here)
            inner_time, X_weights = c_tree.run_query(gamma / 2.0, (X / (t + 1)))
            _, outer_time = timer.stop()
            translation_time += (outer_time - inner_time)
if not np.any(X_weights > 1 + epsilon):
break
else:
nextSolutionCheckWait -= 1
    # average the accumulated solution; t is always bound here unless we ran
    # for 0 iterations, which would be an error
    X = X / (t + 1)
return X, translation_time
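# A minimal standalone call sketch (hypothetical data and parameter values;
# the WeightedTree construct/delete calls mirror their use in epsilon_falloff
# below):
#
#   gen = np.random.default_rng(0)
#   feats = gen.random((100, 2))
#   cols = np.array(['a'] * 50 + ['b'] * 50)
#   tree = WeightedTree(feats.shape[1])
#   tree.construct_tree(feats)
#   X, trans_time = mult_weight_upd(gen, 0.5, len(feats), 3, feats, cols,
#                                   tree, {'a': 2, 'b': 1}, 0.5)
#   tree.delete_tree()
#   # X is None when gamma = 0.5 is infeasible for this instance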
def epsilon_falloff(gen, features, colors, kis, gamma_upper, mwu_epsilon, falloff_epsilon, return_unadjusted, percent_theoretical_limit=1.0):
"""
starts at a high bound (given by the corset estimate) and repeatedly falls off by 1-epsilon
:param gen: RNG generator
:param features: the data set
:param colors: color labels for the data set
:param kis: map of colors to requested counts
:param gamma_upper: the starting value for gamma
:param mwu_epsilon: epsilon for the MWU method (static error)
:param falloff_epsilon: epsilon for the falloff system (fraction to reduce by each cycle)
:param return_unadjusted: whether to also return the "real" time
:param percent_theoretical_limit: Percentage of the theoretical maximum number of iterations to run (default 1.0)
:return:
"""
# translation time to subtract from the total time
translation_time = 0
N = len(features)
k = sum(kis.values())
timer = algsU.Stopwatch("Falloff")
gamma = gamma_upper
    # build the ParGeo tree once so it can be reused across all gamma values
dim = features.shape[1]
pargeo_tree = WeightedTree(dim)
pargeo_tree.construct_tree(features)
X, cur_trans_time = mult_weight_upd(gen, gamma, N, k, features, colors, pargeo_tree, kis, mwu_epsilon, percent_theoretical_limit)
translation_time += cur_trans_time
while X is None:
gamma = gamma * (1 - falloff_epsilon)
X, cur_trans_time = mult_weight_upd(gen, gamma, N, k, features, colors, pargeo_tree, kis, mwu_epsilon, percent_theoretical_limit)
translation_time += cur_trans_time
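    # gamma decays geometrically, so the feasible gamma we stop at is within
    # a (1 - falloff_epsilon) factor of the last infeasible value tested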
# "clean up" our tree
pargeo_tree.delete_tree()
timer.split("Randomized Rounding")
    # flatten X: rand_round expects a 1-D array rather than an (N, 1) column vector
S = rand_round(gen, gamma / 2.0, X.flatten(), features, colors, kis)
_, total_time = timer.stop()
adjusted_time = total_time - translation_time
    # build up stats to return
    solution = features[S]
diversity = algsU.compute_maxmin_diversity(solution)
if return_unadjusted:
return S, diversity, adjusted_time, total_time
else:
return S, diversity, adjusted_time
if __name__ == '__main__':
gen = np.random.default_rng()
"""
# Testing field
# File fields
allFields = [
"x",
"y",
"color",
]
# fields we care about for parsing
color_field = ['color']
feature_fields = {'x', 'y'}
# variables for running LP bin-search
# keys are appended using underscores
kis = {
'blue': 2,
'red': 1,
}
k = sum(kis.values())
"""
# File fields
allFields = [
"age",
"workclass",
"fnlwgt", # what on earth is this one?
"education",
"education-num",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"capital-gain",
"capital-loss",
"hours-per-week",
"native-country",
"yearly-income",
]
# fields we care about for parsing
color_field = ['race', 'sex']
feature_fields = {'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'}
colors, features = datsU.read_CSV("./datasets/ads/adult.data", allFields, color_field, '_', feature_fields)
assert (len(colors) == len(features))
# get the colors
color_names = np.unique(colors)
# "normalize" features
# Should happen before coreset construction
means = features.mean(axis=0)
devs = features.std(axis=0)
features = (features - means) / devs
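    # (illustrative: each column is shifted to mean 0 and scaled to unit
    #  variance, so a value one standard deviation above its column mean
    #  becomes 1.0)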
# testing!
results = []
for k in range(10, 21, 10):
# build KIs
kis = algsU.buildKisMap(colors, k, 0)
adj_k = sum(kis.values())
# compute coreset
coreset_size = 10 * k
d = len(feature_fields)
m = len(color_names)
coreset = CORESET.Coreset_FMM(gen, features, colors, adj_k, m, d, coreset_size)
core_features, core_colors = coreset.compute()
gamma_upper = coreset.compute_gamma_upper_bound()
# actually run the model
print(f'running mwu for {k}')
selected, div, adj_time, time = epsilon_falloff(
gen=gen,
features=core_features,
colors=core_colors,
kis=kis,
gamma_upper=gamma_upper,
mwu_epsilon=0.75,
falloff_epsilon=0.1,
return_unadjusted=True,
percent_theoretical_limit=0.4,
)
print(f'Finished! (time={time}) (adjusted={adj_time})')
results.append((adj_k, selected, div, adj_time))
print('\n\nFINAL RESULTS:')
print('k\tselected\tdiversity\ttime')
for k, selected, div, time in results:
print(f'{k},\t{len(selected)},\t{div},\t{time},')
"""
N = len(features)
gamma = 5
k = 3
kis = {'blue': 2, 'red': 1}
X = mult_weight_upd(5, N, k, features, colors, kis, 0.5)
if X is None:
print('None!')
else:
print(X)
"""