Skip to content

Commit

Permalink
Merge pull request #345 from SMILELab-FL/feature-datareport-siqi
Browse files Browse the repository at this point in the history
Feature datareport siqi
  • Loading branch information
AgentDS authored Dec 7, 2023
2 parents c0fea85 + e315584 commit 0ecd419
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 97 deletions.
82 changes: 82 additions & 0 deletions fedlab/utils/dataset/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import numpy as np
import pandas as pd
import warnings
from collections import Counter


def split_indices(num_cumsum, rand_perm):
Expand Down Expand Up @@ -517,3 +518,84 @@ def random_slicing(dataset, num_clients):
np.random.choice(all_idxs, num_items, replace=False))
all_idxs = list(set(all_idxs) - set(dict_users[i]))
return dict_users


def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
    """Generate data partition report for clients in ``data_indices``.

    Build a per-client table of class sample counts according to ``data_indices``. Each row
    holds the client id, the number of samples of every class owned by that client, and the
    total number of samples of that client. The report can be printed on screen and/or written
    to a file as comma-separated values which can be read by :func:`pandas.read_csv` or
    :func:`csv.reader`.

    Args:
        targets (list or numpy.ndarray): Targets for all data samples, with each element is in range of ``0`` to ``class_num-1``.
        data_indices (dict): Dict of ``client_id: [data indices]``.
        class_num (int, optional): Total number of classes. If set to ``None``, then ``class_num = max(targets) + 1``.
        verbose (bool, optional): Whether print data partition report in screen. Default as ``True``.
        file (str, optional): Output file name of data partition report. If ``None``, then no output in file. Default as ``None``.

    Returns:
        pd.DataFrame: One row per client, sorted by client id in ascending order, with columns
        ``cid``, ``class-0`` ... ``class-{class_num-1}`` and ``TotalAmount``. The columns are
        present even when ``data_indices`` is empty.

    Examples:
        First generate synthetic data labels and data partition to obtain ``data_indices``
        (``{ client_id: sample indices}``):

        >>> sample_num = 15
        >>> class_num = 4
        >>> clients_num = 3
        >>> num_per_client = int(sample_num/clients_num)
        >>> labels = np.random.randint(class_num, size=sample_num)  # generate 15 labels, each label is 0 to 3
        >>> rand_per = np.random.permutation(sample_num)
        >>> # partition synthetic data into 3 clients
        >>> data_indices = {0: rand_per[0:num_per_client],
        ...                 1: rand_per[num_per_client:num_per_client*2],
        ...                 2: rand_per[num_per_client*2:num_per_client*3]}

        Check ``data_indices`` may look like:

        >>> data_indices
        {0: array([ 4,  1, 14,  8,  5]),
         1: array([ 0, 13, 12,  3,  2]),
         2: array([10,  9,  7,  6, 11])}

        Now generate partition report for each client and each class:

        >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
        Class sample statistics:
           cid  class-0  class-1  class-2  class-3  TotalAmount
        0    0        3        2        0        0            5
        1    1        1        1        1        2            5
        2    2        3        1        1        0            5
    """
    # Fancy indexing below needs an ndarray; ``asarray`` avoids a copy when one is passed in.
    targets = np.asarray(targets)

    if not class_num:
        # A falsy ``class_num`` (``None`` or ``0``) means "infer it from the labels".
        # ``int(...)`` keeps ``range`` working even when the labels are stored as floats.
        class_num = int(targets.max()) + 1 if targets.size else 0

    class_columns = [f'class-{cls}' for cls in range(class_num)]

    stats_rows = []
    for client_id in sorted(data_indices):  # sort client id in ascending order
        indices = data_indices[client_id]
        # { cls1: num1, cls2: num2, ... }; a Counter yields 0 for classes the client lacks
        client_target_cnt = Counter(targets[indices])
        cur_client_stat = {'cid': client_id}
        for cls, column in enumerate(class_columns):
            cur_client_stat[column] = client_target_cnt[cls]
        cur_client_stat['TotalAmount'] = len(indices)  # total number of samples of current client
        stats_rows.append(cur_client_stat)

    # Passing the columns explicitly keeps the schema stable when there are no clients.
    stats_df = pd.DataFrame(stats_rows, columns=['cid', *class_columns, 'TotalAmount'])
    if file is not None:
        stats_df.to_csv(file, header=True, index=False)
    if verbose:
        print("Class sample statistics:")
        print(stats_df)

    return stats_df
2 changes: 2 additions & 0 deletions fedlab/utils/dataset/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def __init__(self, targets, num_clients,
self.client_dict = self._perform_partition()
# get sample number count for each client
self.client_sample_count = F.samples_num_count(self.client_dict, self.num_clients)
self.stats_report = F.partition_report(targets, self.client_dict, class_num=self.num_classes, verbose=False)

def _perform_partition(self):
if self.balance is None:
Expand Down Expand Up @@ -265,6 +266,7 @@ def __init__(self, targets, num_clients,
self.client_dict = self._perform_partition()
# get sample number count for each client
self.client_sample_count = F.samples_num_count(self.client_dict, self.num_clients)
self.stats_report = F.partition_report(targets, self.client_dict, class_num=self.num_classes, verbose=False)

def _perform_partition(self):
if self.partition == "noniid-#label":
Expand Down
194 changes: 97 additions & 97 deletions fedlab/utils/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,100 +147,100 @@ def get_best_gpu():
return torch.device("cuda:%d" % (best_device_index))


def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
    """Generate data partition report for clients in ``data_indices``.

    Generate data partition report for each client according to ``data_indices``, including
    ratio of each class and dataset size in current client. Report can be printed in screen or into
    file. The output format is comma-separated values which can be read by :func:`pandas.read_csv`
    or :func:`csv.reader`.

    If ``verbose`` is ``False`` and ``file`` is ``None`` there is nowhere to send the report, so
    a notice is printed and the function returns without computing anything.

    Args:
        targets (list or numpy.ndarray): Targets for all data samples, with each element is in range of ``0`` to ``class_num-1``.
        data_indices (dict): Dict of ``client_id: [data indices]``.
        class_num (int, optional): Total number of classes. If set to ``None``, then ``class_num = max(targets) + 1``.
        verbose (bool, optional): Whether print data partition report in screen. Default as ``True``.
        file (str, optional): Output file name of data partition report. If ``None``, then no output in file. Default as ``None``.

    Returns:
        None. The report is only printed and/or written to ``file``.

    Examples:
        First generate synthetic data labels and data partition to obtain ``data_indices``
        (``{ client_id: sample indices}``):

        >>> sample_num = 15
        >>> class_num = 4
        >>> clients_num = 3
        >>> num_per_client = int(sample_num/clients_num)
        >>> labels = np.random.randint(class_num, size=sample_num)  # generate 15 labels, each label is 0 to 3
        >>> rand_per = np.random.permutation(sample_num)
        >>> # partition synthetic data into 3 clients
        >>> data_indices = {0: rand_per[0:num_per_client],
        ...                 1: rand_per[num_per_client:num_per_client*2],
        ...                 2: rand_per[num_per_client*2:num_per_client*3]}

        Check ``data_indices`` may look like:

        >>> data_indices
        {0: array([8, 6, 5, 7, 2]),
         1: array([ 3, 10, 14,  4,  1]),
         2: array([13,  9, 12, 11,  0])}

        Now generate partition report for each client and each class (the client id is
        right-aligned in a field of width 3):

        >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
        Class frequencies:
        client,class0,class1,class2,class3,Amount
        Client   0,0.200,0.00,0.200,0.600,5
        Client   1,0.400,0.200,0.200,0.200,5
        Client   2,0.00,0.400,0.400,0.200,5
    """
    if not verbose and file is None:
        print("No partition report generated")
        return

    if not isinstance(targets, np.ndarray):
        targets = np.array(targets)

    if not class_num:
        class_num = max(targets) + 1

    header_line = "Class frequencies:"
    col_name = "client," + ",".join([f"class{i}" for i in range(class_num)]) + ",Amount"

    if verbose:
        print(header_line)
        print(col_name)
    # Lines are always collected; they are only written out when ``file`` is given.
    reports = [header_line, col_name]

    for client_id in sorted(data_indices):  # sort client id in ascending order
        indices = data_indices[client_id]
        client_sample_num = len(indices)  # total number of samples of current client
        client_target_cnt = Counter(targets[indices])  # { cls1: num1, cls2: num2, ... }

        # The membership test also guards the division: a client without samples has an
        # empty counter, so ``client_sample_num == 0`` is never used as a divisor.
        ratios = [
            f"{client_target_cnt[cls] / client_sample_num:.3f}"
            if cls in client_target_cnt
            else "0.00"
            for cls in range(class_num)
        ]
        report_line = f"Client {client_id:3d}," + ",".join(ratios) + f",{client_sample_num}"

        if verbose:
            print(report_line)
        reports.append(report_line)

    if file is not None:
        # Context manager closes the handle even when the write fails.
        with open(file, "w") as fh:
            fh.write("\n".join(reports))
# def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
# """Generate data partition report for clients in ``data_indices``.

# Generate data partition report for each client according to ``data_indices``, including
# ratio of each class and dataset size in current client. Report can be printed in screen or into
# file. The output format is comma-separated values which can be read by :func:`pandas.read_csv`
# or :func:`csv.reader`.

# Args:
# targets (list or numpy.ndarray): Targets for all data samples, with each element is in range of ``0`` to ``class_num-1``.
# data_indices (dict): Dict of ``client_id: [data indices]``.
# class_num (int, optional): Total number of classes. If set to ``None``, then ``class_num = max(targets) + 1``.
# verbose (bool, optional): Whether print data partition report in screen. Default as ``True``.
# file (str, optional): Output file name of data partition report. If ``None``, then no output in file. Default as ``None``.

# Examples:
# First generate synthetic data labels and data partition to obtain ``data_indices``
# (``{ client_id: sample indices}``):

# >>> sample_num = 15
# >>> class_num = 4
# >>> clients_num = 3
# >>> num_per_client = int(sample_num/clients_num)
# >>> labels = np.random.randint(class_num, size=sample_num) # generate 15 labels, each label is 0 to 3
# >>> rand_per = np.random.permutation(sample_num)
# >>> # partition synthetic data into 3 clients
# >>> data_indices = {0: rand_per[0:num_per_client],
# ... 1: rand_per[num_per_client:num_per_client*2],
# ... 2: rand_per[num_per_client*2:num_per_client*3]}

# Check ``data_indices`` may look like:

# >>> data_indices
# {0: array([8, 6, 5, 7, 2]),
# 1: array([ 3, 10, 14, 4, 1]),
# 2: array([13, 9, 12, 11, 0])}

# Now generate partition report for each client and each class:

# >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
# Class frequencies:
# client,class0,class1,class2,class3,Amount
# Client 0,0.200,0.00,0.200,0.600,5
# Client 1,0.400,0.200,0.200,0.200,5
# Client 2,0.00,0.400,0.400,0.200,5

# """
# if not verbose and file is None:
# print("No partition report generated")
# return

# if not isinstance(targets, np.ndarray):
# targets = np.array(targets)

# if not class_num:
# class_num = max(targets) + 1

# sorted_cid = sorted(data_indices.keys()) # sort client id in ascending order

# header_line = "Class frequencies:"
# col_name = "client," + ",".join([f"class{i}" for i in range(class_num)]) + ",Amount"

# if verbose:
# print(header_line)
# print(col_name)
# if file is not None:
# reports = [header_line, col_name]
# else:
# reports = None

# for client_id in sorted_cid:
# indices = data_indices[client_id]
# client_targets = targets[indices]
# client_sample_num = len(indices) # total number of samples of current client
# client_target_cnt = Counter(client_targets) # { cls1: num1, cls2: num2, ... }

# report_line = (
# f"Client {client_id:3d},"
# + ",".join(
# [
# f"{client_target_cnt[cls] / client_sample_num:.3f}"
# if cls in client_target_cnt
# else "0.00"
# for cls in range(class_num)
# ]
# )
# + f",{client_sample_num}"
# )
# if verbose:
# print(report_line)
# if file is not None:
# reports.append(report_line)

# if file is not None:
# fh = open(file, "w")
# fh.write("\n".join(reports))
# fh.close()

0 comments on commit 0ecd419

Please sign in to comment.