Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature datareport siqi #345

Merged
merged 2 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions fedlab/utils/dataset/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import numpy as np
import pandas as pd
import warnings
from collections import Counter


def split_indices(num_cumsum, rand_perm):
Expand Down Expand Up @@ -517,3 +518,84 @@ def random_slicing(dataset, num_clients):
np.random.choice(all_idxs, num_items, replace=False))
all_idxs = list(set(all_idxs) - set(dict_users[i]))
return dict_users


def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
    """Generate a per-client class-count report for a data partition.

    For every client in ``data_indices`` the report lists how many samples of
    each class the client holds (absolute counts, not ratios) and the total
    number of samples assigned to that client. Rows are ordered by ascending
    client id. The report is returned as a :class:`pandas.DataFrame`; it can
    additionally be printed to the screen and/or written to ``file`` as CSV
    (readable by :func:`pandas.read_csv` or :func:`csv.reader`).

    Args:
        targets (list or numpy.ndarray): Targets for all data samples, with each element in the range ``0`` to ``class_num-1``.
        data_indices (dict): Dict of ``client_id: [data indices]``.
        class_num (int, optional): Total number of classes. If ``None`` (or ``0``), then ``class_num = max(targets) + 1``; empty ``targets`` give ``class_num = 0``.
        verbose (bool, optional): Whether to print the data partition report to the screen. Default as ``True``.
        file (str, optional): Output file name of the data partition report. If ``None``, nothing is written. Default as ``None``.

    Returns:
        pd.DataFrame: One row per client with columns ``cid``, ``class-0`` ...
        ``class-{class_num-1}`` and ``TotalAmount``.

    Examples:
        First generate synthetic data labels and data partition to obtain ``data_indices``
        (``{ client_id: sample indices}``):

        >>> sample_num = 15
        >>> class_num = 4
        >>> clients_num = 3
        >>> num_per_client = int(sample_num/clients_num)
        >>> labels = np.random.randint(class_num, size=sample_num)  # generate 15 labels, each label is 0 to 3
        >>> rand_per = np.random.permutation(sample_num)
        >>> # partition synthetic data into 3 clients
        >>> data_indices = {0: rand_per[0:num_per_client],
        ...                 1: rand_per[num_per_client:num_per_client*2],
        ...                 2: rand_per[num_per_client*2:num_per_client*3]}

        Check ``data_indices`` may look like:

        >>> data_indices
        {0: array([ 4,  1, 14,  8,  5]),
         1: array([ 0, 13, 12,  3,  2]),
         2: array([10,  9,  7,  6, 11])}

        Now generate partition report for each client and each class:

        >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
        Class sample statistics:
           cid  class-0  class-1  class-2  class-3  TotalAmount
        0    0        3        2        0        0            5
        1    1        1        1        1        2            5
        2    2        3        1        1        0            5

    """
    if not isinstance(targets, np.ndarray):
        targets = np.array(targets)

    if not class_num:
        # Infer the class count from the labels. Empty targets have no maximum,
        # so report zero classes instead of raising ValueError.
        class_num = int(targets.max()) + 1 if targets.size else 0

    class_columns = [f'class-{cls}' for cls in range(class_num)]

    stats_rows = []
    for client_id in sorted(data_indices):  # ascending client id
        indices = data_indices[client_id]
        # { cls1: num1, cls2: num2, ... }; Counter yields 0 for absent classes
        client_target_cnt = Counter(targets[indices].tolist())
        cur_client_stat = {'cid': client_id}
        for cls, column in enumerate(class_columns):
            cur_client_stat[column] = client_target_cnt[cls]
        cur_client_stat['TotalAmount'] = len(indices)  # samples held by this client
        stats_rows.append(cur_client_stat)

    # Explicit columns keep the schema stable even when there are no clients.
    stats_df = pd.DataFrame(stats_rows, columns=['cid', *class_columns, 'TotalAmount'])
    if file is not None:
        stats_df.to_csv(file, header=True, index=False)
    if verbose:
        print("Class sample statistics:")
        print(stats_df)

    return stats_df
2 changes: 2 additions & 0 deletions fedlab/utils/dataset/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def __init__(self, targets, num_clients,
self.client_dict = self._perform_partition()
# get sample number count for each client
self.client_sample_count = F.samples_num_count(self.client_dict, self.num_clients)
self.stats_report = F.partition_report(targets, self.client_dict, class_num=self.num_classes, verbose=False)

def _perform_partition(self):
if self.balance is None:
Expand Down Expand Up @@ -265,6 +266,7 @@ def __init__(self, targets, num_clients,
self.client_dict = self._perform_partition()
# get sample number count for each client
self.client_sample_count = F.samples_num_count(self.client_dict, self.num_clients)
self.stats_report = F.partition_report(targets, self.client_dict, class_num=self.num_classes, verbose=False)

def _perform_partition(self):
if self.partition == "noniid-#label":
Expand Down
194 changes: 97 additions & 97 deletions fedlab/utils/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,100 +147,100 @@ def get_best_gpu():
return torch.device("cuda:%d" % (best_device_index))


def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
    """Generate data partition report for clients in ``data_indices``.

    Generate a data partition report for each client according to ``data_indices``,
    including the ratio of each class and the dataset size of that client. The report
    can be printed to the screen and/or written to a file. After the leading title
    line (``Class frequencies:``) the output is comma-separated values.

    If ``verbose`` is ``False`` and ``file`` is ``None`` there is nothing to emit:
    a notice is printed and the function returns immediately.

    Args:
        targets (list or numpy.ndarray): Targets for all data samples, with each element in the range ``0`` to ``class_num-1``.
        data_indices (dict): Dict of ``client_id: [data indices]``.
        class_num (int, optional): Total number of classes. If set to ``None``, then ``class_num = max(targets) + 1``.
        verbose (bool, optional): Whether to print the data partition report to the screen. Default as ``True``.
        file (str, optional): Output file name of the data partition report. If ``None``, nothing is written. Default as ``None``.

    Returns:
        None

    Examples:
        First generate synthetic data labels and data partition to obtain ``data_indices``
        (``{ client_id: sample indices}``):

        >>> sample_num = 15
        >>> class_num = 4
        >>> clients_num = 3
        >>> num_per_client = int(sample_num/clients_num)
        >>> labels = np.random.randint(class_num, size=sample_num)  # generate 15 labels, each label is 0 to 3
        >>> rand_per = np.random.permutation(sample_num)
        >>> # partition synthetic data into 3 clients
        >>> data_indices = {0: rand_per[0:num_per_client],
        ...                 1: rand_per[num_per_client:num_per_client*2],
        ...                 2: rand_per[num_per_client*2:num_per_client*3]}

        Check ``data_indices`` may look like:

        >>> data_indices
        {0: array([8, 6, 5, 7, 2]),
         1: array([ 3, 10, 14,  4,  1]),
         2: array([13,  9, 12, 11,  0])}

        Now generate partition report for each client and each class (the client id
        is right-aligned in a field of width 3):

        >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
        Class frequencies:
        client,class0,class1,class2,class3,Amount
        Client   0,0.200,0.00,0.200,0.600,5
        Client   1,0.400,0.200,0.200,0.200,5
        Client   2,0.00,0.400,0.400,0.200,5

    """
    if not verbose and file is None:
        print("No partition report generated")
        return

    if not isinstance(targets, np.ndarray):
        targets = np.array(targets)

    if not class_num:
        class_num = max(targets) + 1

    header_line = "Class frequencies:"
    col_name = "client," + ",".join([f"class{i}" for i in range(class_num)]) + ",Amount"

    # Build every line once, then route the lines to stdout and/or the file.
    reports = [header_line, col_name]
    for client_id in sorted(data_indices):  # ascending client id
        indices = data_indices[client_id]
        client_sample_num = len(indices)  # total number of samples of current client
        client_target_cnt = Counter(targets[indices])  # { cls1: num1, cls2: num2, ... }

        # Classes absent from this client are written as the literal "0.00";
        # this also avoids dividing by zero for a client with no samples.
        ratios = [
            f"{client_target_cnt[cls] / client_sample_num:.3f}"
            if cls in client_target_cnt
            else "0.00"
            for cls in range(class_num)
        ]
        reports.append(
            f"Client {client_id:3d}," + ",".join(ratios) + f",{client_sample_num}"
        )

    if verbose:
        for line in reports:
            print(line)

    if file is not None:
        # Context manager guarantees the handle is closed even if write() raises.
        with open(file, "w") as fh:
            fh.write("\n".join(reports))
# def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
# """Generate data partition report for clients in ``data_indices``.

# Generate data partition report for each client according to ``data_indices``, including
# ratio of each class and dataset size in current client. Report can be printed in screen or into
# file. The output format is comma-separated values which can be read by :func:`pandas.read_csv`
# or :func:`csv.reader`.

# Args:
# targets (list or numpy.ndarray): Targets for all data samples, with each element is in range of ``0`` to ``class_num-1``.
# data_indices (dict): Dict of ``client_id: [data indices]``.
# class_num (int, optional): Total number of classes. If set to ``None``, then ``class_num = max(targets) + 1``.
# verbose (bool, optional): Whether print data partition report in screen. Default as ``True``.
# file (str, optional): Output file name of data partition report. If ``None``, then no output in file. Default as ``None``.

# Examples:
# First generate synthetic data labels and data partition to obtain ``data_indices``
# (``{ client_id: sample indices}``):

# >>> sample_num = 15
# >>> class_num = 4
# >>> clients_num = 3
# >>> num_per_client = int(sample_num/clients_num)
# >>> labels = np.random.randint(class_num, size=sample_num) # generate 15 labels, each label is 0 to 3
# >>> rand_per = np.random.permutation(sample_num)
# >>> # partition synthetic data into 3 clients
# >>> data_indices = {0: rand_per[0:num_per_client],
# ... 1: rand_per[num_per_client:num_per_client*2],
# ... 2: rand_per[num_per_client*2:num_per_client*3]}

# Check ``data_indices`` may look like:

# >>> data_indices
# {0: array([8, 6, 5, 7, 2]),
# 1: array([ 3, 10, 14, 4, 1]),
# 2: array([13, 9, 12, 11, 0])}

# Now generate partition report for each client and each class:

# >>> partition_report(labels, data_indices, class_num=class_num, verbose=True, file=None)
# Class frequencies:
# client,class0,class1,class2,class3,Amount
# Client 0,0.200,0.00,0.200,0.600,5
# Client 1,0.400,0.200,0.200,0.200,5
# Client 2,0.00,0.400,0.400,0.200,5

# """
# if not verbose and file is None:
# print("No partition report generated")
# return

# if not isinstance(targets, np.ndarray):
# targets = np.array(targets)

# if not class_num:
# class_num = max(targets) + 1

# sorted_cid = sorted(data_indices.keys()) # sort client id in ascending order

# header_line = "Class frequencies:"
# col_name = "client," + ",".join([f"class{i}" for i in range(class_num)]) + ",Amount"

# if verbose:
# print(header_line)
# print(col_name)
# if file is not None:
# reports = [header_line, col_name]
# else:
# reports = None

# for client_id in sorted_cid:
# indices = data_indices[client_id]
# client_targets = targets[indices]
# client_sample_num = len(indices) # total number of samples of current client
# client_target_cnt = Counter(client_targets) # { cls1: num1, cls2: num2, ... }

# report_line = (
# f"Client {client_id:3d},"
# + ",".join(
# [
# f"{client_target_cnt[cls] / client_sample_num:.3f}"
# if cls in client_target_cnt
# else "0.00"
# for cls in range(class_num)
# ]
# )
# + f",{client_sample_num}"
# )
# if verbose:
# print(report_line)
# if file is not None:
# reports.append(report_line)

# if file is not None:
# fh = open(file, "w")
# fh.write("\n".join(reports))
# fh.close()
Loading