-
Notifications
You must be signed in to change notification settings - Fork 0
/
fmmd_lp.py
421 lines (324 loc) · 12.5 KB
/
fmmd_lp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
# lpsolve - simple lp solving based approach to the problem
import math
from collections import defaultdict
import gurobipy as gp
from gurobipy import GRB
import sys
import numpy as np
from tqdm import tqdm
import typing as t
import numpy.typing as npt
import time
import datastructures.KDTree2 as algo
from datastructures.WeightedTree import WeightedTree
import datastructures.BallTree as BallTree
from algorithms.rounding import rand_round
import algorithms.coreset as CORESET
import algorithms.utils as algsU
import datasets.utils as datsU
def solve_lp(dataStruct, kis, m: gp.Model, gamma: np.float64, variables: npt.NDArray[gp.MVar], colors: npt.NDArray[t.AnyStr],
             features: npt.NDArray[np.float64]) -> bool:
    """
    solve_lp

    builds and solves the LP for the given gamma, returning true if the model is feasible

    variables, colors, and features should all have the same length (features can be multi-dimensional?)
    :param dataStruct: spatial index over features (KDTree2) used for radius queries
    :param kis: map of color -> minimum number of points required of that color
    :param m: the initialized model (named, really; added for compatibility with moving pieces outside of this to reduce re-initalization)
    :param gamma: current minimum distance between points
    :param variables: np array of model variables
    :param colors: np array of "colors" of each point
    :param features: np array of data points per model
    :return: True when the model is feasible for this gamma, False when infeasible
    """
    # reset model, and remove constraints if they exist
    m.reset()
    m.remove(m.getConstrs())

    # set up parameters since reset resets it
    m.Params.SolutionLimit = 1
    m.Params.OutputFlag = 0

    N = len(features)

    """
    THIS DOES NOT WORK
    # if we are going to a big dataset, use memory reducing settings
    if N >= 10000:
        m.Params.PreSparsify = 2
        m.Params.Method = 1
    """

    # we could probably build up the color constraints once and repeatedly add them
    # this builds up the lhs of the constraints in exprs
    # (defaultdict(gp.LinExpr) gives each new color an empty linear expression)
    exprs = defaultdict(gp.LinExpr)
    for i, color in tqdm(enumerate(colors), desc="Building color constraints", unit=" elems", total=N, disable=True):
        exprs[color.item()].addTerms(1.0, variables[i].item())
    # freeze into a normal dictionary (error if we use the wrong key)
    exprs.default_factory = None

    # we need at least ki of each color so build the final constraints here
    for key in kis.keys():
        m.addConstr(exprs[key] >= kis[key])

    # we need at most one point in the ball
    # This is slow
    # build a constraint for every point
    for v, p in tqdm(zip(variables, features), desc="Building ball constraints", unit=" elems", total=N, disable=True):
        # np.float64 replaces np.float_, which was removed in NumPy 2.0
        indices = algo.get_ind_range(dataStruct, np.float64(gamma / 2.0), p)

        # add constraint that all the variables need to add up
        other_vars = variables[indices]
        count = other_vars.shape[0]

        # a bit of workarounds to get from MVar to var here
        in_rad = gp.LinExpr([1.0] * count, other_vars.tolist())
        m.addConstr(in_rad <= 1)

    m.update()
    m.optimize()

    # if we're feasible, return true otherwise return false
    # model is passed back automatically
    if m.status == GRB.INFEASIBLE or m.status == GRB.INF_OR_UNBD:
        #print(f'Model for {gamma} is infeasible')
        return False
    elif m.status == GRB.OPTIMAL or m.status == GRB.SOLUTION_LIMIT:
        #print(f'Model for {gamma} is feasible')
        return True
    else:
        print(f'\n\n\n***ERROR: Model returned status code {m.status}***')
        print(f'Exiting')
        # sys.exit instead of the interactive-only exit() builtin
        sys.exit(-1)
# NOTE: DO NOT USE; HAS BUILT IN CORESET
def bin_lpsolve(gen, features, colors, k, epsilon, a, coreset_size=None):
    """
    Binary-search LP solve with a built-in coreset step.

    Normalizes the features, reduces them to a coreset, binary-searches over
    multiples of epsilon for the largest feasible gamma, then randomly rounds
    the LP solution into a selected set of points.

    :param gen: a random number generator
    :param features: the points in the dataset
    :param colors: corresponding color labels for each point
    :param k: total number of points to select
    :param epsilon: granularity of the gamma binary search
    :param a: fraction passed to buildKisMap when building per-color minimums
    :param coreset_size: size of the coreset to compute (defaults to 50 * k)
    :return: (diversity, total_time) of the solved selection
    """
    # default matches the sizing used by the __main__ driver
    # (previously this read the undefined module name `coreset_size`)
    if coreset_size is None:
        coreset_size = 50 * k

    # all colors made by combining values in color_fields
    color_names = np.unique(colors)

    timer = algsU.Stopwatch("Normalization")

    # "normalize" features
    # Should happen before coreset construction
    means = features.mean(axis=0)
    devs = features.std(axis=0)
    features = (features - means) / devs

    timer.split("Coreset")

    # dimensionality comes straight from the data
    # (previously len(feature_fields), an undefined name inside this function)
    d = features.shape[1]
    num_colors = len(color_names)
    features, colors = CORESET.Coreset_FMM(gen, features, colors, k, num_colors, d, coreset_size).compute()
    N = len(features)

    timer.split("Building Kis")
    kis = algsU.buildKisMap(colors, k, a)
    # keys are appended using underscores
    # TODO: group formulas => max(1, (1-a) * k n_c / n)
    # NOTE: DO THIS AFTER CORESET
    #   n = number of input points in color c
    #   (k * n_c / n) <- is all we need!

    timer.split("Tree Creation")
    # create data structure
    data_struct = algo.create(features)

    timer.split("High Bound Search")
    # first we need to find the high value
    print('Solving for high bound')
    high = 20  # I'm assuming it's a BIT larger than 0.01
    gamma = high * epsilon

    m = gp.Model(f"feasibility model")  # workaround for OOM error?
    m.Params.SolutionLimit = 1
    m.Params.OutputFlag = 0
    m.Params.LogFile = ""
    m.Params.LogToConsole = 0
    variables = m.addMVar(N, name="x", vtype=GRB.CONTINUOUS)

    # np.float64 replaces np.float_, which was removed in NumPy 2.0
    feasible = solve_lp(data_struct, kis, m, np.float64(gamma), variables, colors, features)
    # keep doubling until we find an infeasible gamma
    while feasible:
        high *= 2
        gamma = high * epsilon
        feasible = solve_lp(data_struct, kis, m, np.float64(gamma), variables, colors, features)

    # binary search over multiples of epsilon
    timer.split("Binary Search")
    low = 1
    assert (low < high)
    multiple = math.ceil((low + high) / 2.0)
    while low < high:
        # solve model once for current gamma
        gamma = multiple * epsilon
        feasible = solve_lp(data_struct, kis, m, np.float64(gamma), variables, colors, features)

        # if it's feasible, we have to search for larger multiples
        # if it's not, we want smaller multiples
        if feasible:
            # our high will be the first failure
            low = multiple
        else:
            high = multiple - 1

        print(low, high)
        multiple = math.ceil((low + high) / 2.0)

    # walk down until we land on a feasible multiple
    gamma = multiple * epsilon
    while not solve_lp(data_struct, kis, m, np.float64(gamma), variables, colors, features):
        multiple -= 1
        gamma = multiple * epsilon

    print()
    if m.status == GRB.INFEASIBLE or m.status == GRB.INF_OR_UNBD:
        print(f'Model for {gamma} is infeasible')
        sys.exit(-1)
    elif m.status == GRB.OPTIMAL or m.status == GRB.SOLUTION_LIMIT:
        print(f'Model for {gamma} is feasible')
    else:
        print(f'\n\n\n***ERROR: Model returned status code {m.status}***')
        print(f'Exiting')
        sys.exit(-1)

    timer.split("Get Results")
    # get results of the LP (lp_vars: avoid shadowing the vars() builtin)
    lp_vars = m.getVars()
    X = np.array(m.getAttr("X", lp_vars))
    names = np.array(m.getAttr("VarName", lp_vars))
    assert (len(X) == N)

    count = 0
    print(f'Nonzero variables:')
    for x, n in zip(X, names):
        if x != 0:
            # print(f"{n} = {x}")
            count += 1
    print(f'Total number: {count}')
    print()

    timer.split("Randomized Rounding")
    # do we want all points included or just the ones in S?
    S = rand_round(gen, gamma / 2, X, features, colors, kis)

    """
    print(f'Final Solution (len = {len(S)}):')
    print(S)
    print('Points:')
    res = list(zip(features[S], colors[S]))
    for r in res:
        print(r[0], r[1])
    """

    print('Solution Stats:')
    for color in kis.keys():
        print(f'{color}: {sum(colors[S] == color)}')
    print()

    # sanity check: no two selected points may be closer than gamma / 2
    tree = algo.create(features[S])
    for i in S:
        p = features[i]
        dists, indices = algo.get_ind(tree, 2, p)
        if dists[0][1] < gamma / 2.0:
            print('ERROR: invalid distance between points')
            print(dists)
            print(features[i])
            print(features[S][indices[0]][1])
            print()

    # time!
    res, total = timer.stop()
    print(f'Timings! ({total} total seconds)')
    for name, delta in res:
        print(f'{name + ":":>40} {delta}')

    # compute diversity value of solution
    solution = features[S]
    diversity = algsU.compute_maxmin_diversity(solution)
    print(f'Solved diversity is {diversity}')
    return diversity, total
def epsilon_falloff(gen, features, colors, upper_gamma, kis, epsilon, deltas=False):
    """
    starts at a high bound from the coreset and repeatedly falls off by (1 - epsilon)
    until the LP becomes feasible, then randomly rounds the solution into a selection

    :param gen: a random number generator
    :param features: the points in the dataset
    :param colors: corresponding color labels for each point
    :param upper_gamma: initial (upper-bound) estimate of gamma to fall off from
    :param kis: the map of points to select per color
    :param epsilon: falloff value; gamma shrinks by a factor of (1 - epsilon) each step
    :param deltas: whether to return delta between requested and selected
    :return: (S, diversity, total_time), or (S, diversity, delta_vals, total_time) when deltas is True
    """
    N = len(features)

    timer = algsU.Stopwatch("Tree Creation")
    # create data structure
    data_struct = algo.create(features)

    timer.split("Exponential falloff")
    # build model
    m = gp.Model("feasibility model")  # workaround for OOM error?
    m.Params.SolutionLimit = 1
    m.Params.OutputFlag = 0
    m.Params.LogFile = ""
    m.Params.LogToConsole = 0

    gamma = upper_gamma
    variables = m.addMVar(N, name="X", vtype=GRB.CONTINUOUS)

    # fall off until solving
    # (np.float64 replaces np.float_, which was removed in NumPy 2.0)
    while not solve_lp(data_struct, kis, m, np.float64(gamma), variables, colors, features):
        gamma = gamma * (1 - epsilon)

    timer.split("Randomized Rounding")
    # get results of the LP (lp_vars: avoid shadowing the vars() builtin)
    lp_vars = m.getVars()
    X = np.array(m.getAttr("X", lp_vars))
    # names = np.array(m.getAttr("VarName", lp_vars))
    assert (len(X) == N)

    # do we want all points included or just the ones in S?
    S = rand_round(gen, gamma / 2.0, X, features, colors, kis)

    _, total_time = timer.stop()

    # compute diversity value of solution
    solution = features[S]
    diversity = algsU.compute_maxmin_diversity(solution)

    if deltas:
        delta_vals = algsU.check_returned_kis(colors, kis, S)
        return S, diversity, delta_vals, total_time
    else:
        return S, diversity, total_time
if __name__ == '__main__':
    # default_rng lives in np.random (np.default_rng raises AttributeError)
    gen = np.random.default_rng()

    # setup
    # File fields
    allFields = [
        "age",
        "workclass",
        "fnlwgt",  # what on earth is this one?
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "yearly-income",
    ]
    # fields we care about for parsing
    color_field = ['race', 'sex']
    feature_fields = {'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'education-num'}

    # variables for running LP bin-search
    # coreset params
    # Set the size of the coreset
    colors, features = datsU.read_CSV("./datasets/ads/adult.data", allFields, color_field, '_', feature_fields)
    assert (len(colors) == len(features))

    # all colors made by combining values in color_fields
    color_names = np.unique(colors)

    # "normalize" features ONCE, before the loop
    # Should happen before coreset construction
    # (previously done every iteration, re-normalizing already-normalized data)
    means = features.mean(axis=0)
    devs = features.std(axis=0)
    features = (features - means) / devs

    # loop-invariant coreset parameters
    d = len(feature_fields)
    m = len(color_names)

    # run some tests!
    results = []
    # first for the proper 100
    for k in range(10, 201, 5):
        # compute coreset of size
        coreset_size = 50 * k

        coreset = CORESET.Coreset_FMM(gen, features, colors, k, m, d, coreset_size)
        # can't overwrite our original dataset
        core_features, core_colors = coreset.compute()

        # starting estimate
        upper_gamma = coreset.compute_gamma_upper_bound()

        # ki generation
        kis = algsU.buildKisMap(colors, k, 0.1)

        # run LP
        print(f'Running LP for {k}')
        # `elapsed` instead of `time`, which shadowed the imported time module
        selected, div, deltas, elapsed = epsilon_falloff(
            gen=gen,
            features=core_features,
            colors=core_colors,
            upper_gamma=upper_gamma,
            kis=kis,
            epsilon=.1,
            deltas=True,
        )
        print(f'Finished in {elapsed}! (diversity of {div})')
        results.append((k, selected, div, deltas, elapsed))

    # TODO: double check how many of each color compared to K map
    # TODO: run another algorithm to compare diversity

    print('\n\nFINAL RESULTS:')
    print('k\tselected\tdiversity\ttime', end='\n')
    # for color in deltas.keys():
    #     print(f'{color}', end='\t')
    # print('', end='\n')
    for k, selected, div, deltas, elapsed in results:
        print(f'{k}\t{selected}\t{div}\t{elapsed}', end='\t')
        for delta in deltas.values():
            print(f'{delta}', end='\t')
        print('', end='\n')