#!/usr/bin/env python3
"""
The implementation of the Mechanism class.
Created from the mechanism.py file of the original repository:
https://github.com/usnistgov/PrivacyEngCollabSpace/tree/master/tools
de-identification/Differential-Privacy-Synthetic-Data-Challenge-Algorithms/
rmckenna
-----
Nampoina Andriamilanto <tompo.andri@gmail.com>
"""
import json
import random
from pathlib import Path
from typing import Any, Optional

import mbi
import pandas as pd
from loguru import logger
from mbi import Dataset, Domain

# The attributes that are discretized
# Format: index => (lower bound included, higher bound excluded)
DISCRETIZED_ATTRIBUTES = {
'age': {0: (17, 26), 1: (26, 62), 2: (62, 100)},
'fnlwgt': {0: (10000, 100000), 1: (100000, 200000), 2: (200000, 300000),
3: (300000, 400000), 4: (400000, 500000), 5: (500000, 1500000)},
'capital-gain': {0: (0, 5000), 1: (5000, 10000), 2: (10000, 20000),
3: (20000, 100000)},
    'capital-loss': {0: (0, 1000), 1: (1000, 2000), 2: (2000, 3000),
                     3: (3000, 4000), 4: (4000, 5000)},
'hours-per-week': {0: (0, 25), 1: (25, 50), 2: (50, 75),
3: (75, 100)}
}
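# For example (illustrative): an 'age' value of 30 falls into bin 1 because
# 26 <= 30 < 62, and a 'capital-gain' value of 7000 falls into bin 1 because
# 5000 <= 7000 < 10000.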


class Mechanism:
    """The Mechanism class.

    This class is a template for a mechanism with all the boilerplate code
    already implemented. Subclasses should implement three functions: setup,
    measure, and postprocess.

    measure is the only function that is allowed to look at the data, and it
    must be privacy-vetted. All other code need not be checked with as much
    scrutiny.
    """

    def __init__(self, dataset_path: Path, domain_info_path: Path):
        """Initialize the mechanism.

        Args:
            dataset_path: The path to the dataset file (in csv format).
            domain_info_path: The path to the domain information file.
        """
# Set the dataset path
self.dataset_path = dataset_path
logger.info(f'Set the dataset path: {self.dataset_path}')
# Load the domain information
# Format: attribute => list[attr. values]
        with open(domain_info_path, 'r') as domain_file:
            self.domain_info = json.load(domain_file)
logger.info(f'Loaded the domain from {domain_info_path}')
logger.debug(self.domain_info)
# Infer the domain sizes (used for the domain object)
# Format: attribute => size of the domain of the attribute
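        # For example (illustrative): if domain_info maps 'sex' to the list
        # ['Male', 'Female'], then domain_sizes['sex'] will be 2.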
domain_sizes = {}
for attribute in self.domain_info:
domain_sizes[attribute] = len(self.domain_info[attribute])
self.domain = Domain.fromdict(domain_sizes)
logger.debug(f'The domain size of each attribute: {self.domain}')
# This will hold the initial order of the columns and is set in
# load_data()
self.column_order = None
        # The synthetic dataset. Note that depending on the processing
        # stage, this attribute can be an mbi.Dataset or a pd.DataFrame
        self.synthetic = None
# The privacy parameters that will be set in run()
self.epsilon = None
self.delta = None
self.save_path = None

    def setup(self):
        """Do any setup needed to run the algorithm here."""

    def load_data(self, dataset_path: Optional[Path] = None) -> mbi.Dataset:
        """Load the data and discretize the integer/float attributes.

        Args:
            dataset_path: The path to the dataset file (in csv format).
                Defaults to the dataset path given at initialization.

        Returns:
            The loaded dataset (of type mbi.Dataset).
        """
# Load the dataset
if dataset_path is None:
dataset_path = self.dataset_path
dataset = pd.read_csv(dataset_path)
logger.info(f'Loaded {len(dataset)} rows from {dataset_path}')
        # Back up the order of the columns (without the unwanted attributes)
self.column_order = dataset.columns
logger.debug(f'The initial column order: {self.column_order}')
# Build the set of each attribute type
attributes = set(self.domain_info.keys())
continuous_attributes = set(DISCRETIZED_ATTRIBUTES.keys())
categorical_attributes = attributes.difference(continuous_attributes)
logger.info(f'We have {len(attributes)} attributes, divided into '
f'{len(continuous_attributes)} continuous attributes and '
f'{len(categorical_attributes)} categorical attributes')
logger.debug(f'Continuous attributes: {continuous_attributes}')
logger.debug(f'Categorical attributes: {categorical_attributes}')
# Map the values of the categorical attributes to their indices.
# Format of mapping: value => index
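        # For example (illustrative): if the domain of 'sex' is
        # ['Male', 'Female'], the mapping is {'Male': 0, 'Female': 1}.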
for attribute in categorical_attributes:
values = self.domain_info[attribute]
mapping = dict(zip(values, range(len(values))))
dataset[attribute] = dataset[attribute].map(mapping)
# For each continuous attribute, create the mapping into the bins
for attribute, bin_id__bins in DISCRETIZED_ATTRIBUTES.items():
            def cont_attr_mapper(value: Any) -> int:
                """Map the value of a continuous attribute to its bin index.

                Args:
                    value: The continuous value to map to a bin index.

                Returns:
                    The index of the bin in which the value falls.
                """
for bin_id, bin_range in bin_id__bins.items():
lower_bound, higher_bound_ex = bin_range
if lower_bound <= value < higher_bound_ex:
return bin_id
raise ValueError(f'The value {value} of {attribute} does not '
'fall within a bin!')
dataset[attribute] = dataset[attribute].map(cont_attr_mapper)
return Dataset(dataset, self.domain)

    def measure(self):
        """Load the data and measure things about it.

        Save the measurements taken, but do not save the data. This is the
        only function that needs to be vetted for privacy.
        """

    def postprocess(self):
        """Postprocess step.

        Post-process the measurements taken into a synthetic dataset over
        discrete attributes.
        """

    def transform_domain(self):
        """Map the synthetic dataset back to the initial domain.

        Convert the synthetic discrete data back to the original domain and
        restore the initial column order. The resulting pd.DataFrame is then
        stored in self.synthetic.
        """
# Get the synthetic dataset
synthetic_dataset = self.synthetic.df
# Build the set of each attribute type
attributes = set(self.domain_info.keys())
continuous_attributes = set(DISCRETIZED_ATTRIBUTES.keys())
categorical_attributes = attributes.difference(continuous_attributes)
# Map back the values of the categorical attributes from their indices.
# Format of mapping: index => value
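        # For example (illustrative): {0: 'Male', 1: 'Female'}, the inverse
        # of the mapping applied in load_data().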
for attribute in categorical_attributes:
values = self.domain_info[attribute]
mapping = dict(zip(range(len(values)), values))
synthetic_dataset[attribute] = synthetic_dataset[attribute].map(
mapping)
        # For each continuous attribute, map the bin indices back to values
        for attribute, bin_id__bins in DISCRETIZED_ATTRIBUTES.items():
            def cont_attr_mapper(bin_id: int) -> Any:
                """Map the bin index of a continuous attribute to a value.

                Args:
                    bin_id: The index of the bin to draw a value from.

                Returns:
                    A random continuous value that falls within the bin.
                """
                lower_bound, higher_bound_excld = bin_id__bins[bin_id]
                return random.randrange(lower_bound, higher_bound_excld)
synthetic_dataset[attribute] = synthetic_dataset[attribute].map(
cont_attr_mapper)
# Set back to the right column order
self.synthetic = synthetic_dataset[self.column_order]

    def run(self, epsilon: float, save_path: Path,
            delta: float = 2.2820610e-12) -> pd.DataFrame:
        """Run the mechanism at the given privacy level.

        Args:
            epsilon: The privacy budget.
            save_path: The location where to save the synthetic data.
            delta: The privacy parameter delta.

        Returns:
            The synthetic data in the same format as the original data.
        """
self.epsilon = epsilon
self.delta = delta
self.save_path = save_path
self.setup()
self.measure()
self.postprocess()
self.transform_domain()
self.synthetic.to_csv(save_path, index=False)
return self.synthetic
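

if __name__ == '__main__':
    # Minimal usage sketch (an illustrative addition, not part of the
    # original mechanism). The toy subclass below keeps the discretized
    # data as its "synthetic" dataset just to exercise the pipeline end to
    # end: it provides NO differential privacy, and a real subclass must
    # add calibrated noise in measure(). The file names 'adult.csv',
    # 'domain.json', and 'synthetic.csv' are hypothetical placeholders.
    class IdentityMechanism(Mechanism):
        """Toy subclass: no noise, no privacy; for demonstration only."""

        def measure(self):
            # Store the loaded, discretized dataset directly (no noise!)
            self.synthetic = self.load_data()

    mechanism = IdentityMechanism(Path('adult.csv'), Path('domain.json'))
    synthetic_df = mechanism.run(epsilon=1.0,
                                 save_path=Path('synthetic.csv'))
    print(synthetic_df.head())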