diff --git a/datasets/mnist/MNIST/processed/test.pt b/datasets/mnist/MNIST/processed/test.pt
new file mode 100644
index 00000000..ec6bd381
Binary files /dev/null and b/datasets/mnist/MNIST/processed/test.pt differ
diff --git a/datasets/mnist/MNIST/processed/training.pt b/datasets/mnist/MNIST/processed/training.pt
new file mode 100644
index 00000000..bcd9bff3
Binary files /dev/null and b/datasets/mnist/MNIST/processed/training.pt differ
diff --git a/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte b/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte
new file mode 100644
index 00000000..1170b2ca
Binary files /dev/null and b/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte differ
diff --git a/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte.gz b/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte.gz
new file mode 100644
index 00000000..5ace8ea9
Binary files /dev/null and b/datasets/mnist/MNIST/raw/t10k-images-idx3-ubyte.gz differ
diff --git a/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte b/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte
new file mode 100644
index 00000000..d1c3a970
Binary files /dev/null and b/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte differ
diff --git a/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz b/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz
new file mode 100644
index 00000000..a7e14154
Binary files /dev/null and b/datasets/mnist/MNIST/raw/t10k-labels-idx1-ubyte.gz differ
diff --git a/datasets/mnist/MNIST/raw/train-images-idx3-ubyte b/datasets/mnist/MNIST/raw/train-images-idx3-ubyte
new file mode 100644
index 00000000..bbce2765
Binary files /dev/null and b/datasets/mnist/MNIST/raw/train-images-idx3-ubyte differ
diff --git a/datasets/mnist/MNIST/raw/train-images-idx3-ubyte.gz b/datasets/mnist/MNIST/raw/train-images-idx3-ubyte.gz
new file mode 100644
index 00000000..b50e4b6b
Binary files /dev/null and b/datasets/mnist/MNIST/raw/train-images-idx3-ubyte.gz differ
diff --git a/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte b/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte
new file mode 100644
index 00000000..d6b4c5db
Binary files /dev/null and b/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte differ
diff --git a/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte.gz b/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte.gz
new file mode 100644
index 00000000..707a576b
Binary files /dev/null and b/datasets/mnist/MNIST/raw/train-labels-idx1-ubyte.gz differ
diff --git a/fedlab/contrib/algorithm/basic_server.py b/fedlab/contrib/algorithm/basic_server.py
index cca7ba08..44e39e11 100644
--- a/fedlab/contrib/algorithm/basic_server.py
+++ b/fedlab/contrib/algorithm/basic_server.py
@@ -13,11 +13,13 @@
 # limitations under the License.
 
 import torch
+from torch import nn
 import random
 from copy import deepcopy
 
 from typing import List
 from ...utils import Logger, Aggregators, SerializationTool
+from ...utils.functional import evaluate
 from ...core.server.handler import ServerHandler
 from ..client_sampler.base_sampler import FedSampler
 from ..client_sampler.uniform_sampler import RandomSampler
@@ -36,7 +38,7 @@ class SyncServerHandler(ServerHandler):
     Args:
         model (torch.nn.Module): model trained by federated learning.
         global_round (int): stop condition. Shut down FL system when global round is reached.
-        num_clients (int): number of clients in FL. Default: 0 (initialized external). 
+        num_clients (int): number of clients in FL. Default: 0 (initialized external).
         sample_ratio (float): the result of ``sample_ratio * num_clients`` is the number of clients for every FL round.
         cuda (bool): use GPUs or not. Default: ``False``.
         device (str, optional): assign model/data to the given GPUs. E.g., 'device:0' or 'device:0,1'. Defaults to None. If device is None and cuda is True, FedLab will set the gpu with the largest memory as default.
@@ -149,6 +151,19 @@ def load(self, payload: List[torch.Tensor]) -> bool:
         else:
             return False
 
+    def setup_dataset(self, dataset) -> None:
+        self.dataset = dataset
+
+    def evaluate(self):
+        self._model.eval()
+        test_loader = self.dataset.get_dataloader(type="test", batch_size=128)
+        loss_, acc_ = evaluate(self._model, nn.CrossEntropyLoss(), test_loader)
+        self._LOGGER.info(
+            f"Round [{self.round - 1}/{self.global_round}] test performance on server: \t Loss: {loss_:.5f} \t Acc: {100*acc_:.3f}%"
+        )
+
+        return loss_, acc_
+
 
 class AsyncServerHandler(ServerHandler):
     """Asynchronous Parameter Server Handler
diff --git a/fedlab/contrib/dataset/pathological_mnist.py b/fedlab/contrib/dataset/pathological_mnist.py
index 3c942413..c8e0d982 100644
--- a/fedlab/contrib/dataset/pathological_mnist.py
+++ b/fedlab/contrib/dataset/pathological_mnist.py
@@ -26,14 +26,17 @@
 class PathologicalMNIST(FedDataset):
     """The partition stratigy in FedAvg. See http://proceedings.mlr.press/v54/mcmahan17a?ref=https://githubhelp.com
 
-        Args:
-            root (str): Path to download raw dataset.
-            path (str): Path to save partitioned subdataset.
-            num_clients (int): Number of clients.
-            shards (int, optional): Sort the dataset by the label, and uniformly partition them into shards. Then 
-            download (bool, optional): Download. Defaults to True.
-        """
-    def __init__(self, root, path, num_clients=100, shards=200, download=True, preprocess=False) -> None:
+    Args:
+        root (str): Path to download raw dataset.
+        path (str): Path to save partitioned subdataset.
+        num_clients (int): Number of clients.
+        shards (int, optional): Sort the dataset by the label, and uniformly partition them into shards. Then
+        download (bool, optional): Download. Defaults to True.
+    """
+
+    def __init__(
+        self, root, path, num_clients=100, shards=200, download=True, preprocess=False
+    ) -> None:
         self.root = os.path.expanduser(root)
         self.path = path
         self.num_clients = num_clients
@@ -48,15 +51,19 @@ def preprocess(self, download=True):
 
         if os.path.exists(self.path) is not True:
             os.mkdir(self.path)
-        
+
         if os.path.exists(os.path.join(self.path, "train")) is not True:
             os.mkdir(os.path.join(self.path, "train"))
             os.mkdir(os.path.join(self.path, "var"))
             os.mkdir(os.path.join(self.path, "test"))
-            
+
         # train
-        mnist = torchvision.datasets.MNIST(self.root, train=True, download=self.download,
-                                           transform=transforms.ToTensor())
+        mnist = torchvision.datasets.MNIST(
+            self.root,
+            train=True,
+            download=self.download,
+            transform=transforms.ToTensor(),
+        )
         data_indices = noniid_slicing(mnist, self.num_clients, self.shards)
 
         samples, labels = [], []
@@ -70,9 +77,25 @@ def preprocess(self, download=True):
                 data.append(x)
                 label.append(y)
             dataset = BaseDataset(data, label)
-            torch.save(dataset, os.path.join(self.path, "train", "data{}.pkl".format(id)))
-
-    def get_dataset(self, id, type="train"):
+            torch.save(
+                dataset, os.path.join(self.path, "train", "data{}.pkl".format(id))
+            )
+
+        # test
+        mnist_test = torchvision.datasets.MNIST(
+            self.root,
+            train=False,
+            download=self.download,
+            transform=transforms.ToTensor(),
+        )
+        test_samples, test_labels = [], []
+        for x, y in mnist_test:
+            test_samples.append(x)
+            test_labels.append(y)
+        test_dataset = BaseDataset(test_samples, test_labels)
+        torch.save(test_dataset, os.path.join(self.path, "test", "test.pkl"))
+
+    def get_dataset(self, id=None, type="train"):
         """Load subdataset for client with client ID ``cid`` from local file.
 
         Args:
@@ -82,10 +105,13 @@ def get_dataset(self, id, type="train"):
         Returns:
             Dataset
         """
-        dataset = torch.load(os.path.join(self.path, type, "data{}.pkl".format(id)))
+        if type == "train":
+            dataset = torch.load(os.path.join(self.path, type, "data{}.pkl".format(id)))
+        else:
+            dataset = torch.load(os.path.join(self.path, "test", "test.pkl"))
         return dataset
 
-    def get_dataloader(self, id, batch_size=None, type="train"):
+    def get_dataloader(self, id=None, batch_size=None, type="train"):
         """Return dataload for client with client ID ``cid``.
 
         Args:
diff --git a/fedlab/core/standalone.py b/fedlab/core/standalone.py
index c6e9a625..1f884cd6 100644
--- a/fedlab/core/standalone.py
+++ b/fedlab/core/standalone.py
@@ -15,6 +15,7 @@
 from .client.trainer import SerialClientTrainer
 from .server.handler import ServerHandler
 
+
 class StandalonePipeline(object):
     def __init__(self, handler: ServerHandler, trainer: SerialClientTrainer):
         """Perform standalone simulation process.
@@ -48,4 +49,4 @@ def main(self):
             # self.handler.evaluate()
 
     def evaluate(self):
-        print("This is a example implementation. Please read the source code at fedlab.core.standalone.")
+        loss_, acc_ = self.handler.evaluate()
diff --git a/fedlab/utils/functional.py b/fedlab/utils/functional.py
index c389725d..0d0f3a30 100644
--- a/fedlab/utils/functional.py
+++ b/fedlab/utils/functional.py
@@ -44,14 +44,14 @@ def reset(self):
 
     def update(self, val, n=1):
         self.val = val
-        self.sum += val
+        self.sum += val * n
         self.count += n
         self.avg = self.sum / self.count
 
 
 def evaluate(model, criterion, test_loader):
     """Evaluate classify task model accuracy.
-    
+
     Returns:
         (loss.sum, acc.avg)
     """
@@ -62,6 +62,7 @@ def evaluate(model, criterion, test_loader):
     acc_ = AverageMeter()
     with torch.no_grad():
         for inputs, labels in test_loader:
+            batch_size = len(labels)
             inputs = inputs.to(gpu)
             labels = labels.to(gpu)
 
@@ -69,10 +70,10 @@ def evaluate(model, criterion, test_loader):
             loss = criterion(outputs, labels)
 
             _, predicted = torch.max(outputs, 1)
-            loss_.update(loss.item())
-            acc_.update(torch.sum(predicted.eq(labels)).item(), len(labels))
+            loss_.update(loss.item(), batch_size)
+            acc_.update(torch.sum(predicted.eq(labels)).item() / batch_size, batch_size)
 
-    return loss_.sum, acc_.avg
+    return loss_.avg, acc_.avg
 
 
 def read_config_from_json(json_file: str, user_name: str):
@@ -114,8 +115,12 @@ def read_config_from_json(json_file: str, user_name: str):
     with open(json_file) as f:
         config = json.load(f)
     config_info = config[user_name]
-    return (config_info["ip"], config_info["port"], config_info["world_size"],
-            config_info["rank"])
+    return (
+        config_info["ip"],
+        config_info["port"],
+        config_info["world_size"],
+        config_info["rank"],
+    )
 
 
 def get_best_gpu():
@@ -126,7 +131,7 @@ def get_best_gpu():
 
     if "CUDA_VISIBLE_DEVICES" in os.environ.keys() is not None:
         cuda_devices = [
-            int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(',')
+            int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
         ]
     else:
         cuda_devices = range(deviceCount)
@@ -142,11 +147,7 @@ def get_best_gpu():
     return torch.device("cuda:%d" % (best_device_index))
 
 
-def partition_report(targets,
-                     data_indices,
-                     class_num=None,
-                     verbose=True,
-                     file=None):
+def partition_report(targets, data_indices, class_num=None, verbose=True, file=None):
     """Generate data partition report for clients in ``data_indices``.
 
     Generate data partition report for each client according to ``data_indices``, including
@@ -203,12 +204,10 @@ def partition_report(targets,
     if not class_num:
         class_num = max(targets) + 1
 
-    sorted_cid = sorted(
-        data_indices.keys())  # sort client id in ascending order
+    sorted_cid = sorted(data_indices.keys())  # sort client id in ascending order
 
     header_line = "Class frequencies:"
-    col_name = "client," + ','.join([f"class{i}"
-                                     for i in range(class_num)]) + ",Amount"
+    col_name = "client," + ",".join([f"class{i}" for i in range(class_num)]) + ",Amount"
 
     if verbose:
         print(header_line)
@@ -221,16 +220,21 @@ def partition_report(targets,
     for client_id in sorted_cid:
         indices = data_indices[client_id]
         client_targets = targets[indices]
-        client_sample_num = len(
-            indices)  # total number of samples of current client
-        client_target_cnt = Counter(
-            client_targets)  # { cls1: num1, cls2: num2, ... }
-
-        report_line = f"Client {client_id:3d}," + \
-                      ','.join([
-                          f"{client_target_cnt[cls] / client_sample_num:.3f}" if cls in client_target_cnt else "0.00"
-                          for cls in range(class_num)]) + \
-                      f",{client_sample_num}"
+        client_sample_num = len(indices)  # total number of samples of current client
+        client_target_cnt = Counter(client_targets)  # { cls1: num1, cls2: num2, ... }
+
+        report_line = (
+            f"Client {client_id:3d},"
+            + ",".join(
+                [
+                    f"{client_target_cnt[cls] / client_sample_num:.3f}"
+                    if cls in client_target_cnt
+                    else "0.00"
+                    for cls in range(class_num)
+                ]
+            )
+            + f",{client_sample_num}"
+        )
         if verbose:
             print(report_line)
         if file is not None:
@@ -240,4 +244,3 @@ def partition_report(targets,
         fh = open(file, "w")
         fh.write("\n".join(reports))
         fh.close()
-