From e3e6df5a8f997c40b52c48a6d0beb69501e6613c Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Mon, 19 Feb 2024 21:51:54 +0800 Subject: [PATCH 01/35] update code for llm-2.0 Signed-off-by: mgqa34 --- python/fate_llm/model_zoo/pellm/bloom.py | 2 +- python/fate_llm/model_zoo/pellm/chatglm.py | 10 +- python/fate_llm/model_zoo/pellm/gpt2.py | 3 +- python/fate_llm/model_zoo/pellm/llama.py | 9 +- .../pellm/parameter_efficient_llm.py | 45 +- python/fate_llm/runner/__init__.py | 0 python/fate_llm/runner/homo_seq2seq_runner.py | 244 +++++++ python/fate_llm/trainer/__init__.py | 0 python/fate_llm/trainer/fedipr_trainer.py | 594 ------------------ .../trainer/offsite_tuning_trainer.py | 317 ---------- python/fate_llm/trainer/seq2seq_trainer.py | 166 +++++ 11 files changed, 436 insertions(+), 954 deletions(-) create mode 100644 python/fate_llm/runner/__init__.py create mode 100644 python/fate_llm/runner/homo_seq2seq_runner.py create mode 100644 python/fate_llm/trainer/__init__.py delete mode 100644 python/fate_llm/trainer/fedipr_trainer.py delete mode 100644 python/fate_llm/trainer/offsite_tuning_trainer.py create mode 100644 python/fate_llm/trainer/seq2seq_trainer.py diff --git a/python/fate_llm/model_zoo/pellm/bloom.py b/python/fate_llm/model_zoo/pellm/bloom.py index ae48925..eafbee9 100644 --- a/python/fate_llm/model_zoo/pellm/bloom.py +++ b/python/fate_llm/model_zoo/pellm/bloom.py @@ -18,7 +18,7 @@ from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM -class BloomForCausalLM(PELLM): +class Bloom(PELLM): config_class = BloomConfig model_loader = BloomForCausalLM diff --git a/python/fate_llm/model_zoo/pellm/chatglm.py b/python/fate_llm/model_zoo/pellm/chatglm.py index 98e0c7d..d49c5be 100644 --- a/python/fate_llm/model_zoo/pellm/chatglm.py +++ b/python/fate_llm/model_zoo/pellm/chatglm.py @@ -17,20 +17,18 @@ from transformers import AutoConfig -class ChatGLMForConditionalGeneration(PELLM): +class ChatGLM(PELLM): enable_save_pretrained = True def __init__(self, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, - fp16: bool = True, pre_seq_len: int = None, prefix_projection: bool = False) -> None: self.pre_seq_len = pre_seq_len self.prefix_projection = prefix_projection - self.fp16 = fp16 super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, @@ -44,15 +42,13 @@ def init_config(self): def init_base_lm(self): super( - ChatGLMForConditionalGeneration, + ChatGLM, self).init_base_lm( trust_remote_code=True) - if self.fp16: - self._pe_lm.half() def add_peft(self): if self.pre_seq_len: self._pe_lm.half() self._pe_lm.transformer.prefix_encoder.float() else: - super(ChatGLMForConditionalGeneration, self).add_peft() + super(ChatGLM, self).add_peft() diff --git a/python/fate_llm/model_zoo/pellm/gpt2.py b/python/fate_llm/model_zoo/pellm/gpt2.py index aceca10..085bffd 100644 --- a/python/fate_llm/model_zoo/pellm/gpt2.py +++ b/python/fate_llm/model_zoo/pellm/gpt2.py @@ -22,7 +22,8 @@ class GPT2(PELLM): config_class = GPT2Config model_loader = GPT2ForSequenceClassification - def __init__(self, config: dict = None, + def __init__(self, + config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, diff --git a/python/fate_llm/model_zoo/pellm/llama.py b/python/fate_llm/model_zoo/pellm/llama.py index d23474a..a2c53c3 100644 --- a/python/fate_llm/model_zoo/pellm/llama.py +++ b/python/fate_llm/model_zoo/pellm/llama.py @@ -19,17 +19,15 @@ from transformers import LlamaForCausalLM -class LLAMAForCausalLM(PELLM): +class LLaMa(PELLM): 
config_class = LlamaConfig enable_save_pretrained = True def __init__(self, pretrained_path: str = None, peft_type: str = None, - peft_config: dict = None, - fp16: bool = True) -> None: + peft_config: dict = None) -> None: - self.fp16 = fp16 super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, peft_config=peft_config) @@ -44,9 +42,6 @@ def init_base_lm(self): raise ValueError( 'config_path to pretrained model folder cannot be None') - if self.fp16: - self._pe_lm.half() - def check_config(self, pretrain_path): config = AutoConfig.from_pretrained(pretrain_path) assert isinstance( diff --git a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py index 540ec0f..a753e36 100644 --- a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py +++ b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py @@ -19,7 +19,11 @@ from transformers import AutoConfig from transformers import AutoModel from transformers.configuration_utils import PretrainedConfig -from federatedml.util import LOGGER +import logging +import yaml + + +logger = logging.getLogger(__name__) AVAILABLE_PEFT_CONFIG = list( @@ -32,10 +36,10 @@ class PELLM(torch.nn.Module): config_class: PretrainedConfig = None - enable_save_pretrained: bool = True model_loader = None - def __init__(self, config: dict = None, + def __init__(self, + config: dict = None, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, @@ -88,46 +92,33 @@ def init_base_lm(self, **kwargs): 'config_path to pretrained model folder cannot be None') def add_peft(self): - assert self.peft_type in AVAILABLE_PEFT_CONFIG, 'peft name {} not in availabe config {}'.format( + assert self.peft_type in AVAILABLE_PEFT_CONFIG, 'peft name {} not in available config {}'.format( self.peft_type, AVAILABLE_PEFT_CONFIG) if self.peft_config is None: peft_config = getattr(peft, self.peft_type)() - else: + elif isinstance(self.peft_config, dict): peft_config = getattr(peft, self.peft_type)(**self.peft_config) + elif isinstance(self.peft_config, str): + peft_config = yaml.safe_load(self.peft_config) + else: + raise ValueError(f"Can not parse peft_config of {type(self.peft_config)}") self._pe_lm = peft.get_peft_model(self._pe_lm, peft_config) def model_summary(self): - try: + if hasattr(self._pe_lm, "print_trainable_parameters"): summary = self._pe_lm.print_trainable_parameters() + logger.debug(f'PELLM model summary: \n{summary}') - LOGGER.debug('PELLM model summary: \n{}'.format(summary)) - except BaseException: - pass - - def _get_trainable_parameters(self): - trainable = [] - for n, p in self._pe_lm.named_parameters(): - if p.requires_grad: - trainable.append(p) - return trainable - - def forward(self, tokenized_data: dict): + def forward(self, **tokenized_data): return self._pe_lm(**tokenized_data) - def save_pretrained(self, path): - if not self.enable_save_pretrained: - raise ValueError( - "To save trainable parameters only, set enable_save_pretrained=True in your model") - - from pathlib import Path - + def save_pretrained(self, output_path): state_dict = { k: p.to("cpu") for k, p in self._pe_lm.named_parameters() if p.requires_grad} - Path.mkdir(Path(path), exist_ok=True) - torch.save(state_dict, Path(path).joinpath("adapter_model.bin")) + torch.save(state_dict, output_path) class AutoPELLM(PELLM): diff --git a/python/fate_llm/runner/__init__.py b/python/fate_llm/runner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/python/fate_llm/runner/homo_seq2seq_runner.py b/python/fate_llm/runner/homo_seq2seq_runner.py
new file mode 100644
index 0000000..ab0bdc1
--- /dev/null
+++ b/python/fate_llm/runner/homo_seq2seq_runner.py
@@ -0,0 +1,244 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from fate.components.components.nn.nn_runner import (
+    NNRunner,
+    load_model_dict_from_path,
+    dir_warning,
+    loader_load_from_conf,
+    run_dataset_func,
+)
+from fate.components.components.nn.runner.homo_default_runner import DefaultRunner
+from fate.ml.nn.homo.fedavg import FedAVGArguments
+from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer
+from typing import Dict
+from fate.components.components.nn.loader import Loader
+import torch.nn as nn
+import torch.optim as optim
+from fate.ml.nn.trainer.trainer_base import FedArguments, HomoTrainerServer
+from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments, HomoSeq2SeqTrainerClient
+from typing import Union, Type, Callable, Optional
+from transformers.trainer_utils import get_last_checkpoint
+from typing import Literal
+import logging
+from fate.arch.dataframe import DataFrame
+from transformers.modeling_utils import PreTrainedModel, unwrap_model
+
+
+logger = logging.getLogger(__name__)
+
+
+SUPPORTED_ALGO = ["fedavg"]
+
+
+def _check_instances(
+    trainer: Union[Type[HomoSeq2SeqTrainerClient], Type[HomoTrainerServer]] = None,
+    fed_args: FedArguments = None,
+    model: nn.Module = None,
+    optimizer: optim.Optimizer = None,
+    train_args: Seq2SeqTrainingArguments = None,
+    data_collator: Callable = None,
+) -> None:
+    if trainer is not None and not (
+        issubclass(type(trainer), HomoSeq2SeqTrainerClient) or issubclass(type(trainer), HomoTrainerServer)
+    ):
+        raise TypeError(
+            f"SetupReturn Error: trainer must be a subclass of either "
+            f"HomoSeq2SeqTrainerClient or HomoTrainerServer but got {type(trainer)}"
+        )
+
+    if fed_args is not None and not isinstance(fed_args, FedArguments):
+        raise TypeError(f"SetupReturn Error: fed_args must be an instance of FedArguments but got {type(fed_args)}")
+
+    if model is not None and not issubclass(type(model), nn.Module):
+        raise TypeError(f"SetupReturn Error: model must be a subclass of torch.nn.Module but got {type(model)}")
+
+    if optimizer is not None and not issubclass(type(optimizer), optim.Optimizer):
+        raise TypeError(
+            f"SetupReturn Error: optimizer must be a subclass of torch.optim.Optimizer but got {type(optimizer)}"
+        )
+
+    if train_args is not None and not isinstance(train_args, Seq2SeqTrainingArguments):
+        raise TypeError(
+            f"SetupReturn Error: train_args must be an instance of Seq2SeqTrainingArguments "
+            f"but got {type(train_args)}"
+        )
+
+    if data_collator is not None and not callable(data_collator):
+        raise TypeError(f"SetupReturn Error: data_collator must be callable but got {type(data_collator)}")
+
+
+class Seq2SeqRunner(DefaultRunner):
+    def __init__(
+        self,
+        algo: str = "fedavg",
+        model_conf: Optional[Dict] = 
None,
+        dataset_conf: Optional[Dict] = None,
+        optimizer_conf: Optional[Dict] = None,
+        training_args_conf: Optional[Dict] = None,
+        fed_args_conf: Optional[Dict] = None,
+        data_collator_conf: Optional[Dict] = None,
+        tokenizer_conf: Optional[Dict] = None,
+        task_type: Literal["causal_lm", "others"] = "causal_lm",
+        local_mode: bool = False,
+        save_trainable_weights_only: bool = False,
+    ) -> None:
+        super(NNRunner, self).__init__()
+        self.algo = algo
+        self.model_conf = model_conf
+        self.dataset_conf = dataset_conf
+        self.optimizer_conf = optimizer_conf
+        self.training_args_conf = training_args_conf
+        self.fed_args_conf = fed_args_conf
+        self.data_collator_conf = data_collator_conf
+        self.local_mode = local_mode
+        self.tokenizer_conf = tokenizer_conf
+        self.task_type = task_type
+        self.save_trainable_weights_only = save_trainable_weights_only
+
+        # check param
+        if self.algo not in SUPPORTED_ALGO:
+            raise ValueError(f"algo should be one of {SUPPORTED_ALGO}")
+        if self.task_type not in ["causal_lm", "others"]:
+            raise ValueError("task_type should be one of [causal_lm, others]")
+        assert isinstance(self.local_mode, bool), "local_mode should be bool"
+
+        # setup var
+        self.trainer = None
+        self.training_args = None
+
+    def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"):
+        if stage == "predict":
+            self.local_mode = True
+
+        if self.algo == "fedavg":
+            client_class: Seq2SeqFedAVGClient = Seq2SeqFedAVGClient
+        else:
+            raise ValueError(f"algo {self.algo} not supported")
+
+        ctx = self.get_context()
+        model = loader_load_from_conf(self.model_conf)
+        if model is None:
+            raise ValueError(f"model is None, cannot load model from conf {self.model_conf}")
+
+        if output_dir is None:
+            output_dir = "./"
+
+        resume_path = None
+        if saved_model is not None:
+            model_dict = load_model_dict_from_path(saved_model)
+            model.load_state_dict(model_dict)
+            logger.info(f"loading model dict from {saved_model} to model done")
+            if get_last_checkpoint(saved_model) is not None:
+                resume_path = saved_model
+                logger.info(f"checkpoint detected, resume_path set to {resume_path}")
+        # load optimizer
+        if self.optimizer_conf:
+            optimizer_loader = Loader.from_dict(self.optimizer_conf)
+            optimizer_ = optimizer_loader.load_item()
+            optimizer_params = optimizer_loader.kwargs
+            optimizer = optimizer_(model.parameters(), **optimizer_params)
+        else:
+            optimizer = None
+        # load collator func
+        data_collator = loader_load_from_conf(self.data_collator_conf)
+        # load tokenizer if import conf provided
+        tokenizer = loader_load_from_conf(self.tokenizer_conf)
+        # args
+        dir_warning(self.training_args_conf)
+        training_args = Seq2SeqTrainingArguments(**self.training_args_conf)
+        self.training_args = training_args
+        # reset to default, saving to arbitrary path is not allowed in
+        # DefaultRunner
+        training_args.output_dir = output_dir
+        training_args.resume_from_checkpoint = resume_path  # resume path
+        fed_args = FedAVGArguments(**self.fed_args_conf)
+
+        # prepare trainer
+        trainer = client_class(
+            ctx=ctx,
+            model=model,
+            optimizer=optimizer,
+            training_args=training_args,
+            fed_args=fed_args,
+            data_collator=data_collator,
+            tokenizer=tokenizer,
+            train_set=train_set,
+            val_set=validate_set,
+            local_mode=self.local_mode,
+            save_trainable_weights_only=self.save_trainable_weights_only,
+        )
+
+        _check_instances(
+            trainer=trainer,
+            model=model,
+            optimizer=optimizer,
+            train_args=training_args,
+            fed_args=fed_args,
+            data_collator=data_collator,
+        )
+        return trainer
+
+    def 
server_setup(self, stage="train"): + if stage == "predict": + self.local_mode = True + if self.algo == "fedavg": + server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer + else: + raise ValueError(f"algo {self.algo} not supported") + ctx = self.get_context() + trainer = server_class(ctx=ctx, local_mode=self.local_mode) + _check_instances(trainer) + return trainer + + def predict(self, test_data: Union[str, DataFrame], saved_model_path: str = None) -> Union[DataFrame, None]: + if self.is_client(): + test_set = self._prepare_data(test_data, "test_data") + if self.trainer is not None: + trainer = self.trainer + logger.info("trainer found, skip setting up") + else: + trainer = self.client_setup(saved_model=saved_model_path, stage="predict") + + classes = run_dataset_func(test_set, "get_classes") + match_ids = run_dataset_func(test_set, "get_match_ids") + sample_ids = run_dataset_func(test_set, "get_sample_ids") + match_id_name = run_dataset_func(test_set, "get_match_id_name") + sample_id_name = run_dataset_func(test_set, "get_sample_id_name") + + if not self.training_args.predict_with_generate: + return + + pred_rs = trainer.predict(test_set) + + if self.training_args and self.training_args.deepspeed and self.training_args.local_rank != 0: + return + + rs_df = self.get_nn_output_dataframe( + self.get_context(), + pred_rs.predictions, + pred_rs.label_ids if hasattr(pred_rs, "label_ids") else None, + match_ids, + sample_ids, + match_id_name=match_id_name, + sample_id_name=sample_id_name, + dataframe_format="dist_df", + task_type=self.task_type, + classes=classes, + ) + return rs_df + else: + # server not predict + return diff --git a/python/fate_llm/trainer/__init__.py b/python/fate_llm/trainer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/fate_llm/trainer/fedipr_trainer.py b/python/fate_llm/trainer/fedipr_trainer.py deleted file mode 100644 index bc0f919..0000000 --- a/python/fate_llm/trainer/fedipr_trainer.py +++ /dev/null @@ -1,594 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import torch as t -import tqdm -import numpy as np -import torch -from typing import Literal -from federatedml.nn.homo.trainer.fedavg_trainer import FedAVGTrainer -from federatedml.nn.backend.utils import distributed_util -from torch.utils.data import DataLoader, DistributedSampler -import torch.distributed as dist -from fate_llm.dataset.watermark import WaterMarkImageDataset, WaterMarkDataset -from federatedml.util import LOGGER -from fate_llm.model_zoo.ipr.sign_block import generate_signature, is_sign_block -from fate_llm.model_zoo.ipr.sign_block import SignatureBlock -from sklearn.metrics import accuracy_score -from federatedml.nn.dataset.base import Dataset -from federatedml.util import consts - - -def get_sign_blocks(model: torch.nn.Module): - - record_sign_block = {} - for name, m in model.named_modules(): - if is_sign_block(m): - record_sign_block[name] = m - - return record_sign_block - - -def get_keys(sign_block_dict: dict, num_bits: int): - - key_pairs = {} - param_len = [] - sum_allocated_bits = 0 - # Iterate through each layer and compute the flattened parameter lengths - for k, v in sign_block_dict.items(): - param_len.append(len(v.embeded_param.flatten())) - total_param_len = sum(param_len) - - alloc_bits = {} - - for i, (k, v) in enumerate(sign_block_dict.items()): - allocated_bits = int((param_len[i] / total_param_len) * num_bits) - alloc_bits[k] = allocated_bits - sum_allocated_bits += allocated_bits - - rest_bits = num_bits - sum_allocated_bits - if rest_bits > 0: - alloc_bits[k] += rest_bits - - for k, v in sign_block_dict.items(): - key_pairs[k] = generate_signature(v, alloc_bits[k]) - - return key_pairs - - -""" -Verify Tools -""" - - -def to_cuda(var, device=0): - if hasattr(var, 'cuda'): - return var.cuda(device) - elif isinstance(var, tuple) or isinstance(var, list): - ret = tuple(to_cuda(i) for i in var) - return ret - elif isinstance(var, dict): - for k in var: - if hasattr(var[k], 'cuda'): - var[k] = var[k].cuda(device) - return var - else: - return var - - -def _verify_sign_blocks(sign_blocks, keys, cuda=False, device=None): - - signature_correct_count = 0 - total_bit = 0 - for name, block in sign_blocks.items(): - block: SignatureBlock = block - W, signature = keys[name] - if cuda: - W = to_cuda(W, device=device) - signature = to_cuda(signature, device=device) - extract_bits = block.extract_sign(W) - total_bit += len(extract_bits) - signature_correct_count += (extract_bits == - signature).sum().detach().cpu().item() - - sign_acc = signature_correct_count / total_bit - return sign_acc - - -def _suggest_sign_bit(param_num, client_num): - max_signbit = param_num // client_num - max_signbit -= 1 # not to exceed - if max_signbit <= 0: - raise ValueError( - 'not able to add feature based watermark, param_num is {}, client num is {}, computed max bit is {} <=0'.format( - param_num, client_num, max_signbit)) - return max_signbit - - -def compute_sign_bit(model, client_num): - total_param_num = 0 - blocks = get_sign_blocks(model) - for k, v in blocks.items(): - total_param_num += v.embeded_param_num() - if total_param_num == 0: - return 0 - return _suggest_sign_bit(total_param_num, client_num) - - -def verify_feature_based_signature(model, keys): - - model = model.cpu() - sign_blocks = get_sign_blocks(model) - return _verify_sign_blocks(sign_blocks, keys, cuda=False) - - -class FedIPRTrainer(FedAVGTrainer): - - def __init__(self, - epochs=10, - noraml_dataset_batch_size=32, - watermark_dataset_batch_size=2, - early_stop=None, - tol=0.0001, - secure_aggregate=True, - 
weighted_aggregation=True, - aggregate_every_n_epoch=None, - cuda=None, - pin_memory=True, - shuffle=True, - data_loader_worker=0, - validation_freqs=None, - checkpoint_save_freqs=None, - task_type='auto', - save_to_local_dir=False, - collate_fn=None, - collate_fn_params=None, - alpha=0.01, - verify_freqs=1, - backdoor_verify_method: Literal['accuracy', - 'loss'] = 'accuracy'): - - super().__init__( - epochs, - noraml_dataset_batch_size, - early_stop, - tol, - secure_aggregate, - weighted_aggregation, - aggregate_every_n_epoch, - cuda, - pin_memory, - shuffle, - data_loader_worker, - validation_freqs, - checkpoint_save_freqs, - task_type, - save_to_local_dir, - collate_fn, - collate_fn_params) - - self.normal_train_set = None - self.watermark_set = None - self.data_loader = None - self.normal_dataset_batch_size = noraml_dataset_batch_size - self.watermark_dataset_batch_size = watermark_dataset_batch_size - self.alpha = alpha - self.verify_freqs = verify_freqs - self.backdoor_verify_method = backdoor_verify_method - self._sign_keys = None - self._sign_blocks = None - self._client_num = None - self._sign_bits = None - - assert self.alpha > 0, 'alpha must be greater than 0' - assert self.verify_freqs > 0 and isinstance( - self.verify_freqs, int), 'verify_freqs must be greater than 0' - assert self.backdoor_verify_method in [ - 'accuracy', 'loss'], 'backdoor_verify_method must be accuracy or loss' - - def local_mode(self): - self.fed_mode = False - self._client_num = 1 - - def _handle_dataset(self, train_set, collate_fn): - - if not distributed_util.is_distributed() or distributed_util.get_num_workers() <= 1: - return DataLoader( - train_set, - batch_size=self.batch_size, - pin_memory=self.pin_memory, - shuffle=self.shuffle, - num_workers=self.data_loader_worker, - collate_fn=collate_fn - ) - else: - train_sampler = DistributedSampler( - train_set, - num_replicas=dist.get_world_size(), - rank=dist.get_rank() - ) - return DataLoader( - train_set, - batch_size=self.batch_size, - pin_memory=self.pin_memory, - num_workers=self.data_loader_worker, - collate_fn=collate_fn, - sampler=train_sampler - ) - - def _get_train_data_loader(self, train_set): - - collate_fn = self._get_collate_fn(train_set) - - if isinstance(train_set, WaterMarkDataset): - LOGGER.info( - 'detect watermark dataset, split watermark dataset and normal dataset') - normal_train_set = train_set.get_normal_dataset() - watermark_set = train_set.get_watermark_dataset() - if normal_train_set is None: - raise ValueError( - 'normal dataset must not be None in FedIPR algo') - train_dataloder = self._handle_dataset( - normal_train_set, collate_fn) - - if watermark_set is not None: - watermark_dataloader = self._handle_dataset( - watermark_set, collate_fn) - else: - watermark_dataloader = None - self.normal_train_set = normal_train_set - self.watermark_set = watermark_set - dataloaders = { - 'train': train_dataloder, - 'watermark': watermark_dataloader} - return dataloaders - else: - LOGGER.info('detect non-watermark dataset') - train_dataloder = self._handle_dataset(train_set, collate_fn) - dataloaders = {'train': train_dataloder, 'watermark': None} - return dataloaders - - def _get_device(self): - if self.cuda is not None or self._enable_deepspeed: - device = self.cuda_main_device if self.cuda_main_device is not None else self.model.device - return device - else: - return None - - def verify(self, sign_blocks: dict, keys: dict): - - return _verify_sign_blocks( - sign_blocks, - keys, - self.cuda is not None, - self._get_device()) - - def 
get_loss_from_pred(self, loss, pred, batch_label): - - if not loss and hasattr(pred, "loss"): - batch_loss = pred.loss - - elif loss is not None: - if batch_label is None: - raise ValueError( - "When loss is set, please provide label to calculate loss" - ) - if not isinstance(pred, torch.Tensor) and hasattr(pred, "logits"): - pred = pred.logits - batch_loss = loss(pred, batch_label) - else: - raise ValueError( - 'Trainer requires a loss function, but got None, please specify loss function in the' - ' job configuration') - - return batch_loss - - def _get_keys(self, sign_blocks): - - if self._sign_keys is None: - self._sign_keys = get_keys(sign_blocks, self._sign_bits) - return self._sign_keys - - def _get_sign_blocks(self): - if self._sign_blocks is None: - sign_blocks = get_sign_blocks(self.model) - self._sign_blocks = sign_blocks - - return self._sign_blocks - - def train( - self, - train_set: Dataset, - validate_set: Dataset = None, - optimizer=None, - loss=None, - extra_dict={}): - - if 'keys' in extra_dict: - self._sign_keys = extra_dict['keys'] - self._sign_bits = extra_dict['num_bits'] - else: - LOGGER.info('computing feature based sign bits') - if self._client_num is None and self.party_id_list is not None: - self._client_num = len(self.party_id_list) - self._sign_bits = compute_sign_bit(self.model, self._client_num) - - LOGGER.info( - 'client num {}, party id list {}'.format( - self._client_num, - self.party_id_list)) - LOGGER.info( - 'will assign {} bits for feature based watermark'.format( - self._sign_bits)) - return super().train(train_set, validate_set, optimizer, loss, extra_dict) - - def train_an_epoch( - self, - epoch_idx, - model, - train_set, - optimizer, - loss_func): - - epoch_loss = 0.0 - batch_idx = 0 - acc_num = 0 - - sign_blocks = self._get_sign_blocks() - keys = self._get_keys(sign_blocks) - - dl, watermark_dl = self.data_loader['train'], self.data_loader['watermark'] - if isinstance(dl, DistributedSampler): - dl.sampler.set_epoch(epoch_idx) - if isinstance(watermark_dl, DistributedSampler): - watermark_dl.sampler.set_epoch(epoch_idx) - - if not self.fed_mode: - trainset_iterator = tqdm.tqdm(dl) - else: - trainset_iterator = dl - batch_label = None - - # collect watermark data and mix them into the training data - watermark_collect = [] - if watermark_dl is not None: - for watermark_batch in watermark_dl: - watermark_collect.append(watermark_batch) - - total_batch_len = len(dl) - LOGGER.info('total batch len is {}'.format(total_batch_len)) - - for _batch_iter in trainset_iterator: - - _batch_iter = self._decode(_batch_iter) - - if isinstance(_batch_iter, list) or isinstance(_batch_iter, tuple): - batch_data, batch_label = _batch_iter - else: - batch_data = _batch_iter - - if watermark_dl is not None: - # Mix the backdoor sample into the training data - wm_batch_idx = int(batch_idx % len(watermark_collect)) - wm_batch = watermark_collect[wm_batch_idx] - if isinstance(wm_batch, list): - wm_batch_data, wm_batch_label = wm_batch - batch_data = torch.cat([batch_data, wm_batch_data], dim=0) - batch_label = torch.cat( - [batch_label, wm_batch_label], dim=0) - else: - wm_batch_data = wm_batch - batch_data = torch.cat([batch_data, wm_batch_data], dim=0) - - if self.cuda is not None or self._enable_deepspeed: - device = self.cuda_main_device if self.cuda_main_device is not None else self.model.device - batch_data = self.to_cuda(batch_data, device) - if batch_label is not None: - batch_label = self.to_cuda(batch_label, device) - - if not self._enable_deepspeed: - 
optimizer.zero_grad() - else: - model.zero_grad() - - pred = model(batch_data) - - sign_loss = 0 - # Get the sign loss of model - for name, block in sign_blocks.items(): - - block: SignatureBlock = block - W, signature = keys[name] - if self.cuda is not None: - device = self._get_device() - W = self.to_cuda(W, device) - signature = self.to_cuda(signature, device) - sign_loss += self.alpha * block.sign_loss(W, signature) - - batch_loss = self.get_loss_from_pred(loss_func, pred, batch_label) - batch_loss += sign_loss - - if not self._enable_deepspeed: - - batch_loss.backward() - optimizer.step() - batch_loss_np = np.array( - batch_loss.detach().tolist()) if self.cuda is None else np.array( - batch_loss.cpu().detach().tolist()) - - if acc_num + self.batch_size > len(train_set): - batch_len = len(train_set) - acc_num - else: - batch_len = self.batch_size - - epoch_loss += batch_loss_np * batch_len - else: - batch_loss = model.backward(batch_loss) - batch_loss_np = np.array(batch_loss.cpu().detach().tolist()) - model.step() - batch_loss_np = self._sync_loss( - batch_loss_np * self._get_batch_size(batch_data)) - if distributed_util.is_rank_0(): - epoch_loss += batch_loss_np - - batch_idx += 1 - - if self.fed_mode: - if total_batch_len > 100: - if batch_idx % (total_batch_len // 100) == 0: - percentage = (batch_idx / total_batch_len) * 100 - LOGGER.debug(f"Training progress of epoch {epoch_idx}: {percentage:.1f}%") - else: - LOGGER.debug("Training epoch {}:batch {}".format(epoch_idx, batch_idx)) - - epoch_loss = epoch_loss / len(train_set) - - # verify the sign of model during training - if epoch_idx % self.verify_freqs == 0: - # verify feature-based signature - sign_acc = self.verify(sign_blocks, keys) - LOGGER.info(f"epoch {epoch_idx} sign accuracy: {sign_acc}") - # verify backdoor signature - if self.watermark_set is not None: - _, pred, label = self._predict(self.watermark_set) - pred = pred.detach().cpu() - label = label.detach().cpu() - if self.backdoor_verify_method == 'accuracy': - if not isinstance( - pred, torch.Tensor) and hasattr( - pred, "logits"): - pred = pred.logits - pred = pred.numpy().reshape((len(label), -1)) - label = label.numpy() - pred_label = np.argmax(pred, axis=1) - metric = accuracy_score( - pred_label.flatten(), label.flatten()) - else: - metric = self.get_loss_from_pred(loss_func, pred, label) - - LOGGER.info( - f"epoch {epoch_idx} backdoor {self.backdoor_verify_method}: {metric}") - - return epoch_loss - - def _predict(self, dataset: Dataset): - pred_result = [] - - # switch eval mode - dataset.eval() - model = self._select_model() - model.eval() - - if not dataset.has_sample_ids(): - dataset.init_sid_and_getfunc(prefix=dataset.get_type()) - - labels = [] - with torch.no_grad(): - for _batch_iter in DataLoader( - dataset, self.batch_size - ): - if isinstance(_batch_iter, list): - batch_data, batch_label = _batch_iter - else: - batch_label = _batch_iter.pop("labels") - batch_data = _batch_iter - - if self.cuda is not None or self._enable_deepspeed: - device = self.cuda_main_device if self.cuda_main_device is not None else self.model.device - batch_data = self.to_cuda(batch_data, device) - - pred = model(batch_data) - - if not isinstance( - pred, torch.Tensor) and hasattr( - pred, "logits"): - pred = pred.logits - - pred_result.append(pred) - labels.append(batch_label) - - ret_rs = torch.concat(pred_result, axis=0) - ret_label = torch.concat(labels, axis=0) - - # switch back to train mode - dataset.train() - model.train() - - return dataset.get_sample_ids(), ret_rs, 
ret_label - - def predict(self, dataset: Dataset): - - if self.task_type in [consts.CAUSAL_LM, consts.SEQ_2_SEQ_LM]: - LOGGER.warning( - f"Not support prediction of task_types={[consts.CAUSAL_LM, consts.SEQ_2_SEQ_LM]}") - return - - if distributed_util.is_distributed() and not distributed_util.is_rank_0(): - return - - if isinstance(dataset, WaterMarkDataset): - normal_train_set = dataset.get_normal_dataset() - if normal_train_set is None: - raise ValueError( - 'normal train set is None in FedIPR algo predict function') - else: - normal_train_set = normal_train_set - - ids, ret_rs, ret_label = self._predict(normal_train_set) - - if self.fed_mode: - return self.format_predict_result( - ids, ret_rs, ret_label, task_type=self.task_type) - else: - return ret_rs, ret_label - - def save( - self, - model=None, - epoch_idx=-1, - optimizer=None, - converge_status=False, - loss_history=None, - best_epoch=-1, - extra_data={}): - - extra_data = {'keys': self._sign_keys, 'num_bits': self._sign_bits} - super().save( - model, - epoch_idx, - optimizer, - converge_status, - loss_history, - best_epoch, - extra_data) - - def local_save(self, - model=None, - epoch_idx=-1, - optimizer=None, - converge_status=False, - loss_history=None, - best_epoch=-1, - extra_data={}): - - extra_data = {'keys': self._sign_keys, 'num_bits': self._sign_bits} - super().local_save( - model, - epoch_idx, - optimizer, - converge_status, - loss_history, - best_epoch, - extra_data) diff --git a/python/fate_llm/trainer/offsite_tuning_trainer.py b/python/fate_llm/trainer/offsite_tuning_trainer.py deleted file mode 100644 index c48bc30..0000000 --- a/python/fate_llm/trainer/offsite_tuning_trainer.py +++ /dev/null @@ -1,317 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import torch as t -from federatedml.nn.homo.trainer.fedavg_trainer import FedAVGTrainer -from federatedml.framework.homo.aggregator.secure_aggregator import SecureAggregatorClient as SecureAggClient -from federatedml.framework.homo.aggregator.secure_aggregator import SecureAggregatorServer as SecureAggServer -from federatedml.util import LOGGER -from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel -from federatedml.util import consts -from federatedml.nn.backend.utils import deepspeed_util -from federatedml.nn.backend.utils import distributed_util -import torch.distributed as dist -from federatedml.optim.convergence import converge_func_factory - - - -def count_parameters(model: t.nn.Module): - return sum(p.numel() for p in model.parameters()) - - -def count_trainable_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -class OffsiteTuningTrainer(FedAVGTrainer): - - def __init__(self, epochs=10, batch_size=512, # training parameter - early_stop=None, tol=0.0001, # early stop parameters - secure_aggregate=False, weighted_aggregation=True, aggregate_every_n_epoch=None, # federation, offsite tuning need to aggregate large model, default is False - cuda=None, - pin_memory=True, shuffle=True, data_loader_worker=0, # GPU & dataloader - validation_freqs=None, # validation configuration - checkpoint_save_freqs=None, # checkpoint configuration - task_type='auto', # task type - save_to_local_dir=False, # save model to local path - collate_fn=None, - collate_fn_params=None, - need_aggregate=False - ): - - super().__init__( - epochs=epochs, - batch_size=batch_size, - early_stop=early_stop, - tol=tol, - secure_aggregate=secure_aggregate, - weighted_aggregation=weighted_aggregation, - aggregate_every_n_epoch=aggregate_every_n_epoch, - cuda=cuda, - pin_memory=pin_memory, - shuffle=shuffle, - data_loader_worker=data_loader_worker, - validation_freqs=validation_freqs, - checkpoint_save_freqs=checkpoint_save_freqs, - task_type=task_type, - save_to_local_dir=save_to_local_dir, - collate_fn=collate_fn, - collate_fn_params=collate_fn_params) - - self.need_aggregate = need_aggregate - self.model_transvar = None - - - def _send_submodel_weights(self, state_dict, send_func, suffix='start'): - from fate_arch.session import computing_session as session - emulator = state_dict['emulator'] - adapter_top = state_dict['adapter_top'] - adapter_bottom = state_dict['adapter_bottom'] - tb1 = session.parallelize([(key, value) for key, value in emulator.items()], include_key=True, partition=4) - tb2 = session.parallelize([(key, value) for key, value in adapter_top.items()], include_key=True, partition=4) - tb3 = session.parallelize([(key, value) for key, value in adapter_bottom.items()], include_key=True, partition=4) - state_dict.pop('emulator', None) - state_dict.pop('adapter_top', None) - state_dict.pop('adapter_bottom', None) - tb4 = session.parallelize([(key, value) for key, value in state_dict.items()], include_key=True, partition=4) - send_func( - tb1, - suffix='emulator_'+suffix) - send_func( - tb2, - suffix='adapter_top_'+suffix) - send_func( - tb3, - suffix='adapter_bottom_'+suffix) - send_func( - tb4, - suffix='other_param_'+suffix) - - - def _get_submodel_weights(self, get_func, suffix='start'): - - tb1 = get_func(suffix='emulator_'+suffix)[0] - tb2 = get_func(suffix='adapter_top_'+suffix)[0] - tb3 = get_func(suffix='adapter_bottom_'+suffix)[0] - tb4 = get_func(suffix='other_param_'+suffix)[0] - - 
got_state_dict = {} - got_state_dict['emulator'] = dict(tb1.collect()) - got_state_dict['adapter_top'] = dict(tb2.collect()) - got_state_dict['adapter_bottom'] = dict(tb3.collect()) - other_param = dict(tb4.collect()) - got_state_dict.update(other_param) - - return got_state_dict - - - def on_loop_begin_client(self): - - unwarp_model = self.unwrap_model(self.model) - if not isinstance(unwarp_model, OffsiteTuningSubModel): - raise ValueError( - 'Client must provide a model subclassing "OffsiteTuningSubModel" in the offsite-tuning trainer, but got {}'.format( - type( - unwarp_model))) - - model: OffsiteTuningSubModel = unwarp_model - - if self.fed_mode: - - if (distributed_util.is_distributed() and distributed_util.is_rank_0()) or (not distributed_util.is_distributed()): - # receive parameters from model provider and load emulator, adapter - ret = self._get_submodel_weights(self.model_transvar.server_to_client.get, suffix='start') - LOGGER.info('loaded weights keys are {}'.format(ret.keys())) - # client_agg: SecureAggregatorClient = self.client_agg - # param = client_agg.get('sub_model_parameter') - model.load_submodel_weights(ret) - - if distributed_util.is_distributed(): - self._share_model(sync_trainable_only=False) - # reinitalize deepspeed - deepspeed_util.init_deepspeed_env(self._ds_config) - model = self.unwrap_model(self.model) - self._model, self._optimizer = deepspeed_util.deepspeed_init(model, self._ds_config) - if deepspeed_util.is_zero3(self._ds_config): - self._model.train() - - LOGGER.info( - 'adapter parameters num: {}'.format( - count_parameters( - model.get_adapter_top()) + - count_parameters( - model.get_adapter_bottom()))) - LOGGER.info( - 'trainable parameters num {}'.format( - count_trainable_parameters(model))) - - def on_loop_begin_server(self): - - if self.model is None: - raise ValueError( - 'Server must provide a main model in the offsite-tuning trainer, got None model, \ - please set server_init to True and provide the model config') - - unwrap_model = self.unwrap_model(self.model) - if not isinstance(unwrap_model, OffsiteTuningMainModel): - raise ValueError( - 'Server must provide a model subclassing "OffsiteTuningMainModel" in the offsite-tuning trainer, but got {}'.format( - type( - unwrap_model))) - - model: OffsiteTuningMainModel = unwrap_model - sub_model_state_dict = model.get_submodel_weights() - self._send_submodel_weights(sub_model_state_dict, self.model_transvar.server_to_client.remote, suffix='start') - # server_agg: SecureAggregatorServer = self.server_agg - # server_agg.broadcast( - # sub_model_state_dict, - # suffix='sub_model_parameter') - - LOGGER.info( - 'adapter parameters num: {}'.format( - count_parameters( - model.get_adapter_top()) + - count_parameters( - model.get_adapter_bottom()))) - LOGGER.info( - 'emulator parameters num: {}'.format( - count_parameters( - model.get_emulator()))) - - def on_loop_end_client(self): - - if self.fed_mode: - if (distributed_util.is_distributed() and distributed_util.is_rank_0()) or (not distributed_util.is_distributed()): - model: OffsiteTuningSubModel = self.unwrap_model(self.model) - sub_model_state_dict = model.get_submodel_weights() - # client_agg = self.client_agg - # client_agg.send( - # sub_model_state_dict, - # suffix='final_sub_model_parameter') - self._send_submodel_weights(sub_model_state_dict, self.model_transvar.client_to_server.remote, suffix='end') - - def on_loop_end_server(self): - - model: OffsiteTuningMainModel = self.model - ret_state_dict = 
self._get_submodel_weights(self.model_transvar.client_to_server.get, suffix='end') - model.load_submodel_weights(ret_state_dict) - # server_agg = self.server_agg - # sub_model_state_dict = server_agg.collect( - # suffix='final_sub_model_parameter')[0] - # model.load_submodel_weights(sub_model_state_dict) - - - def _client_sends_data(self, epoch_idx, epoch_loss, cur_agg_round): - if self.need_aggregate: - return super()._client_sends_data(epoch_idx, epoch_loss, cur_agg_round) - else: - return False - - def _server_aggregates_data( - self, - epoch_idx, - check_converge, - converge_func): - if self.need_aggregate: - return super()._server_aggregates_data(epoch_idx, check_converge, converge_func) - else: - return False - - def _init_aggregator(self, train_set): - # compute round to aggregate - cur_agg_round = 0 - if self.aggregate_every_n_epoch is not None: - aggregate_round = self.epochs // self.aggregate_every_n_epoch - else: - aggregate_round = self.epochs - - # initialize fed avg client - if self.fed_mode: - if self.weighted_aggregation: - sample_num = len(train_set) - else: - sample_num = 1.0 - - if not distributed_util.is_distributed() or distributed_util.is_rank_0(): - if len(self.party_id_list) == 1: # guest only: - clients = (consts.GUEST, ) - else: - clients = (consts.GUEST, consts.HOST) - client_agg = SecureAggClient( - self.secure_aggregate, - aggregate_weight=sample_num, - communicate_match_suffix=self.comm_suffix, - clients=clients, - lm_aggregate=True - ) - # init model transvar - from federatedml.framework.homo.blocks import CommunicatorTransVar - self.model_transvar = CommunicatorTransVar(clients=clients, prefix='model', disable_gc=True) - else: - client_agg = None - else: - client_agg = None - - return client_agg, aggregate_round - - def server_aggregate_procedure(self, extra_data={}): - - # converge status - check_converge = False - converge_func = None - if self.early_stop: - check_converge = True - converge_func = converge_func_factory( - self.early_stop, self.tol).is_converge - LOGGER.info( - 'check early stop, converge func is {}'.format(converge_func)) - - LOGGER.info('server running aggregate procedure') - if len(self.party_id_list) == 1: # guest only: - clients = (consts.GUEST, ) - else: - clients = (consts.GUEST, consts.HOST) - - self.server_agg = SecureAggServer( - self.secure_aggregate, - communicate_match_suffix=self.comm_suffix, - clients=clients - ) - from federatedml.framework.homo.blocks import CommunicatorTransVar - self.model_transvar = CommunicatorTransVar(clients=clients, prefix='model', disable_gc=True) - - self.on_loop_begin_server() - # aggregate and broadcast models - for i in range(self.epochs): - - need_stop = self._server_aggregates_data( - i, check_converge, converge_func) - if need_stop: - break - - self.on_loop_end_server() - LOGGER.info('server aggregation process done') - if self._model is not None: - if self.save_to_local_dir: - self.local_save( - model=self.model, - epoch_idx=i, - converge_status=need_stop) - else: - self.save( - model=self.model, - epoch_idx=i, - converge_status=need_stop) - LOGGER.info('sever side model saved') diff --git a/python/fate_llm/trainer/seq2seq_trainer.py b/python/fate_llm/trainer/seq2seq_trainer.py new file mode 100644 index 0000000..7046cd3 --- /dev/null +++ b/python/fate_llm/trainer/seq2seq_trainer.py @@ -0,0 +1,166 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from transformers import Seq2SeqTrainingArguments as _hf_Seq2SeqTrainingArguments, Seq2SeqTrainer +from dataclasses import dataclass, field +from typing import Optional +from fate.ml.nn.trainer.trainer_base import HomoTrainerMixin, FedArguments, get_ith_checkpoint +import os +import torch +from torch import nn +from typing import Any, Dict, List, Callable +from enum import Enum +from fate.arch import Context +from torch.optim import Optimizer +from torch.utils.data import DataLoader, Dataset +from transformers import PreTrainedTokenizer +from transformers import Trainer, EvalPrediction +from transformers.trainer_utils import has_length +from torch.utils.data import _utils +from transformers.trainer_callback import TrainerCallback +from typing import Optional +from dataclasses import dataclass, field +from transformers.modeling_utils import unwrap_model + + +TRAINABLE_WEIGHTS_NAME = "adapter_model.bin" + + +@dataclass +class _S2STrainingArguments(_hf_Seq2SeqTrainingArguments): + # in fate-2.0, we will control the output dir when using pipeline + output_dir: str = field(default="./") + disable_tqdm: bool = field(default=True) + save_strategy: str = field(default="no") + logging_strategy: str = field(default="epoch") + logging_steps: int = field(default=1) + evaluation_strategy: str = field(default="no") + logging_dir: str = field(default=None) + checkpoint_idx: int = field(default=None) + # by default, we use constant learning rate, the same as FATE-1.X + lr_scheduler_type: str = field(default="constant") + log_level: str = field(default="info") + deepspeed: Optional[str] = field(default=None) + save_safetensors: bool = field(default=False) + use_cpu: bool = field(default=True) + + def __post_init__(self): + self.push_to_hub = False + self.hub_model_id = None + self.hub_strategy = "every_save" + self.hub_token = None + self.hub_private_repo = False + self.push_to_hub_model_id = None + self.push_to_hub_organization = None + self.push_to_hub_token = None + + super().__post_init__() + + +@dataclass +class Seq2SeqTrainingArguments(_S2STrainingArguments): + # To simplify the to dict result(to_dict only return non-default args) + + def to_dict(self): + # Call the superclass's to_dict method + all_args = super().to_dict() + # Get a dict with default values for all fields + default_args = _S2STrainingArguments().to_dict() + # Filter out args that are equal to their default values + set_args = {name: value for name, value in all_args.items() if value != default_args.get(name)} + return set_args + + +class HomoSeq2SeqTrainerClient(Seq2SeqTrainer, HomoTrainerMixin): + + def __init__( + self, + ctx: Context, + model: nn.Module, + training_args: Seq2SeqTrainingArguments, + fed_args: FedArguments, + train_set: Dataset, + val_set: Dataset = None, + optimizer: torch.optim.Optimizer = None, + data_collator: Callable = None, + scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + callbacks: 
Optional[List[TrainerCallback]] = [], + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + local_mode: bool = False, + save_trainable_weights_only: bool = False, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ): + # in case you forget to set evaluation_strategy + if val_set is not None and training_args.evaluation_strategy == "no": + training_args.evaluation_strategy = "epoch" + + HomoTrainerMixin.__init__( + self, + ctx=ctx, + model=model, + optimizer=optimizer, + training_args=training_args, + fed_args=fed_args, + train_set=train_set, + val_set=val_set, + scheduler=scheduler, + callbacks=callbacks, + compute_metrics=compute_metrics, + local_mode=local_mode, + save_trainable_weights_only=save_trainable_weights_only, + ) + + # concat checkpoint path if checkpoint idx is set + if self._args.checkpoint_idx is not None: + checkpoint_path = self._args.resume_from_checkpoint + if checkpoint_path is not None and os.path.exists(checkpoint_path): + checkpoint_folder = get_ith_checkpoint(checkpoint_path, self._args.checkpoint_idx) + self._args.resume_from_checkpoint = os.path.join(checkpoint_path, checkpoint_folder) + + Trainer.__init__( + self, + model=model, + args=self._args, + train_dataset=train_set, + eval_dataset=val_set, + data_collator=data_collator, + optimizers=(optimizer, scheduler), + tokenizer=tokenizer, + compute_metrics=self._compute_metrics_warp_func, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + self._add_fate_callback(self.callback_handler) + + def _save( + self, + output_dir: Optional[str] = None, + state_dict=None + ): + if not self._save_trainable_weights_only: + return super()._save(output_dir, state_dict) + else: + model = unwrap_model(self.model) + + if hasattr(model, "save_pretrained"): + model.save_pretrained(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) + else: + state_dict = { + k: p.to("cpu") for k, + p in model.named_parameters() if p.requires_grad + } + + torch.save(state_dict, os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) From bc1a310b68f5179efc6243979a3b4804fa017d2e Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Mon, 19 Feb 2024 22:33:22 +0800 Subject: [PATCH 02/35] update dataset for llm-2.0 Signed-off-by: mgqa34 --- .../dataset/data_collator/__init__.py | 28 +++ python/fate_llm/dataset/glm_tokenizer.py | 97 --------- ...{prompt_tokenizer.py => prompt_dataset.py} | 70 ++++--- python/fate_llm/dataset/qa_dataset.py | 193 ------------------ .../{nlp_tokenizer.py => seq_cls_dataset.py} | 8 +- .../fate_llm/dataset/tokenizers/__init__.py | 44 ++++ python/fate_llm/dataset/watermark.py | 134 ------------ 7 files changed, 122 insertions(+), 452 deletions(-) create mode 100644 python/fate_llm/dataset/data_collator/__init__.py delete mode 100644 python/fate_llm/dataset/glm_tokenizer.py rename python/fate_llm/dataset/{prompt_tokenizer.py => prompt_dataset.py} (57%) delete mode 100644 python/fate_llm/dataset/qa_dataset.py rename python/fate_llm/dataset/{nlp_tokenizer.py => seq_cls_dataset.py} (95%) create mode 100644 python/fate_llm/dataset/tokenizers/__init__.py delete mode 100644 python/fate_llm/dataset/watermark.py diff --git a/python/fate_llm/dataset/data_collator/__init__.py b/python/fate_llm/dataset/data_collator/__init__.py new file mode 100644 index 0000000..3d7398c --- /dev/null +++ b/python/fate_llm/dataset/data_collator/__init__.py @@ -0,0 +1,28 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from transformers.data import data_collator
+from ..tokenizers import get_prompt_tokenizer
+
+
+def get_data_collator(data_collator_name, tokenizer_name_or_path=None, pad_token=None, padding_side="left", **kwargs):
+    if not hasattr(data_collator, data_collator_name):
+        support_collator_list = list(filter(lambda module_name: "collator" in module_name.lower(), dir(data_collator)))
+        raise ValueError(f"data_collator's name={data_collator_name} is not in the supported list={support_collator_list}")
+
+    tokenizer = get_prompt_tokenizer(tokenizer_name_or_path=tokenizer_name_or_path,
+                                     pad_token=pad_token)
+
+    return getattr(data_collator, data_collator_name)(tokenizer, **kwargs)
diff --git a/python/fate_llm/dataset/glm_tokenizer.py b/python/fate_llm/dataset/glm_tokenizer.py
deleted file mode 100644
index 8a7ddf9..0000000
--- a/python/fate_llm/dataset/glm_tokenizer.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#
-# Copyright 2019 The FATE Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -from federatedml.nn.dataset.base import Dataset -import pandas as pd -from transformers import AutoTokenizer - - -PROMPT_TEMPLATE = "{prompt}" - - -class GLMTokenizerDataset(Dataset): - def __init__(self, truncation=True, text_max_length=256, - tokenizer_name_or_path=None, - padding=True, padding_side="right", pad_token=None, - trust_remote_code=True, - prompt_template=None, - prompt_column="content", - response_column="summary", - version=1 - ): - - super(GLMTokenizerDataset, self).__init__() - self.label = None - self.tokenizer = None - self.padding = padding - self.truncation = truncation - self.max_length = text_max_length - self.tokenizer_name_or_path = tokenizer_name_or_path - self.tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer_name_or_path, trust_remote_code=trust_remote_code) - self.tokenizer.padding_side = padding_side - if pad_token is not None: - self.tokenizer.add_special_tokens({'pad_token': pad_token}) - - self._version = version - - self.prompt_template = prompt_template if prompt_template else PROMPT_TEMPLATE - self.prompt_column = prompt_column - self.response_column = response_column - self._data = None - - def load(self, file_path): - df = pd.read_json(file_path, lines=True) - self._data = df.apply(self._process_data, axis=1) - - def _process_data(self, line): - _prompt = line[self.prompt_column] - _response = line[self.response_column] - - prompt = self.prompt_template.format_map(dict(prompt=_prompt)) - prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False) - target_ids = self.tokenizer.encode(_response, add_special_tokens=False) - - if len(prompt_ids) > self.max_length - 1: - prompt_ids = prompt_ids[: self.max_length - 1] - if len(target_ids) > self.max_length - 2: - target_ids = target_ids[: self.max_length - 2] - - input_ids = self.tokenizer.build_inputs_with_special_tokens( - prompt_ids, target_ids) - - if self._version == 1: - seq_length = input_ids.index(self.tokenizer.bos_token_id) - else: - seq_length = len(prompt_ids) - - labels = [-100] * seq_length + input_ids[seq_length:] - - return { - "input_ids": input_ids, - "labels": labels, - } - - def get_vocab_size(self): - return self.tokenizer.vocab_size - - def __getitem__(self, item): - return self._data[item] - - def __len__(self): - return len(self._data) - - def __repr__(self): - return self.tokenizer.__repr__() diff --git a/python/fate_llm/dataset/prompt_tokenizer.py b/python/fate_llm/dataset/prompt_dataset.py similarity index 57% rename from python/fate_llm/dataset/prompt_tokenizer.py rename to python/fate_llm/dataset/prompt_dataset.py index 1aff07b..1d4be24 100644 --- a/python/fate_llm/dataset/prompt_tokenizer.py +++ b/python/fate_llm/dataset/prompt_dataset.py @@ -13,18 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import copy import pandas as pd -from transformers import AutoTokenizer -from federatedml.nn.dataset.base import Dataset +from fate.ml.nn.dataset.base import Dataset +from .tokenizers import get_prompt_tokenizer PROMPT_TEMPLATE = "{prompt}" -class PromptTokenizerDataset(Dataset): - def __init__(self, text_max_length=256, +class PromptDataset(Dataset): + def __init__(self, + text_max_length=256, tokenizer_name_or_path=None, - padding=False, padding_side='left', + trust_remote_code=False, + padding=False, + padding_side='left', + pad_token=None, pad_token_id=0, bos_token_id=1, eos_token_id=2, @@ -35,21 +40,23 @@ def __init__(self, text_max_length=256, response_column="summary", ): - super(PromptTokenizerDataset, self).__init__() + super(PromptDataset, self).__init__() self.tokenizer = None + self.tokenizer_name_or_path = tokenizer_name_or_path self.padding = padding self.add_special_tokens = add_special_tokens self.max_length = text_max_length - self.tokenizer_name_or_path = tokenizer_name_or_path - self.tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer_name_or_path, add_eos_token=add_eos_token) - if pad_token_id is not None: - self.tokenizer.pad_token_id = pad_token_id - if bos_token_id is not None: - self.tokenizer.bos_token_id = bos_token_id - if eos_token_id is not None: - self.tokenizer.eos_token_id = eos_token_id - self.tokenizer.padding_side = padding_side + + self.tokenizer = get_prompt_tokenizer( + tokenizer_name_or_path=tokenizer_name_or_path, + trust_remote_code=trust_remote_code, + pad_token=pad_token, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + padding_side=padding_side, + add_eos_token=add_eos_token, + ) self.prompt_template = prompt_template if prompt_template else PROMPT_TEMPLATE self.prompt_column = prompt_column @@ -74,15 +81,30 @@ def _process_data(self, line): add_special_tokens=self.add_special_tokens, padding=self.padding) - if len(prompt_ids) > self.max_length - 2: - prompt_ids = prompt_ids[: self.max_length - 2] - if len(target_ids) > self.max_length - 2: - target_ids = target_ids[: self.max_length - 2] + if "chatglm" in self.tokenizer_name_or_path.lower(): + if len(prompt_ids) > self.max_length - 1: + prompt_ids = prompt_ids[: self.max_length - 1] + if len(target_ids) > self.max_length - 2: + target_ids = target_ids[: self.max_length - 2] + + input_ids = self.tokenizer.build_inputs_with_special_tokens( + prompt_ids, target_ids) + + if "chatglm2" in self.tokenizer_name_or_path: + seq_length = input_ids.index(self.tokenizer.bos_token_id) + else: + seq_length = len(prompt_ids) + else: + if len(prompt_ids) > self.max_length - 2: + prompt_ids = prompt_ids[: self.max_length - 2] + if len(target_ids) > self.max_length - 1: + target_ids = target_ids[: self.max_length - 1] + + input_ids = self.tokenizer.build_inputs_with_special_tokens( + prompt_ids, target_ids) - input_ids = self.tokenizer.build_inputs_with_special_tokens( - prompt_ids, target_ids) + seq_length = len(prompt_ids) + 2 - seq_length = len(prompt_ids) + 2 labels = [-100] * seq_length + input_ids[seq_length:] return { @@ -94,7 +116,7 @@ def get_vocab_size(self): return self.tokenizer.vocab_size def __getitem__(self, item): - return self._data[item] + return copy.deepcopy(self._data[item]) def __len__(self): return len(self._data) diff --git a/python/fate_llm/dataset/qa_dataset.py b/python/fate_llm/dataset/qa_dataset.py deleted file mode 100644 index 017c241..0000000 --- a/python/fate_llm/dataset/qa_dataset.py +++ /dev/null @@ -1,193 +0,0 @@ -# -# Copyright 
2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from datasets import load_from_disk, load_dataset -from transformers import AutoTokenizer -import torch as t -import os -from federatedml.nn.dataset.base import Dataset - - -""" -These Data pre-processing templates are from https://github.com/mit-han-lab/offsite-tuning -""" - -class PIQA: - def __init__(self): - self._template = "Question: {}\nAnswer:" - - def get_context(self, examples): - ctx = examples['goal'] - return [self._template.format(c) for c in ctx] - - def get_target(self, examples): - if -1 in examples["label"]: # test set - return [""] * len(examples["label"]) - else: - gt_tuples = [("sol{}".format(label + 1), idx) - for idx, label in enumerate(examples['label'])] - return [examples[k][i] for k, i in gt_tuples] - -class SciQ: - def __init__(self): - self._template = "{}\nQuestion: {}\nAnswer:" - - def get_context(self, examples): - sources = examples['support'] - queries = examples['question'] - return [self._template.format(s, q) for s, q in zip(sources, queries)] - - def get_target(self, examples): - return examples['correct_answer'] - - -class OpenBookQA: - def get_context(self, examples): - return examples['question_stem'] - - def get_target(self, examples): - choices = examples['choices'] - answers = examples['answerKey'] - targets = [] - for choice, answer in zip(choices, answers): - answer = ord(answer.strip()) - ord('A') - targets.append(choice['text'][answer]) - return targets - - -task_dict = { - "piqa": PIQA(), - "sciq": SciQ(), - "openbookqa": OpenBookQA() -} - - -def tokenize_qa_dataset(dataset_name, tokenizer, save_path, seq_max_len=1000): - - max_len = seq_max_len - assert dataset_name in ['piqa', 'sciq', 'openbookqa'], "dataset name must be one of ['piqa', 'sciq', 'openbookqa']" - raw_datasets = load_dataset(dataset_name) - task = task_dict[dataset_name] - - column_names = raw_datasets["train"].column_names - - def tokenize_function(examples): - context = task.get_context(examples) - target = task.get_target(examples) - - context = tokenizer(context) - target = tokenizer(target) - - # if context is ending with special token, remove it - if len(context['input_ids'][0]) > 0 and context['input_ids'][0][-1] in tokenizer.all_special_ids: - context['input_ids'] = [i[:-1] for i in context['input_ids']] - context['attention_mask'] = [a[:-1] - for a in context['attention_mask']] - - # if target is starting with special token, remove it - if len(target['input_ids'][0]) > 0 and target['input_ids'][0][0] in tokenizer.all_special_ids: - target['input_ids'] = [i[1:] for i in target['input_ids']] - target['attention_mask'] = [a[1:] - for a in target['attention_mask']] - - out = {} - out['input_ids'] = [i1 + i2 for i1, - i2 in zip(context['input_ids'], target['input_ids'])] - out['attention_mask'] = [a1 + a2 for a1, - a2 in zip(context['attention_mask'], target['attention_mask'])] - - # set -100 for context tokens - out["labels"] = [ - [-100] * len(i1) + i2 for i1, i2 in 
zip(context['input_ids'], target['input_ids'])] - - return out - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=32, - remove_columns=column_names, - load_from_cache_file=True, - desc="Running tokenizer on dataset", - ) - - # pad all instances in lm_datasets to the max length of the dataset - max_length = -1 - for v in tokenized_datasets.values(): - for x in v: - max_length = max(max_length, len(x['input_ids'])) - - # pad to the multiple of 8 - max_length = (max_length // 8 + 1) * 8 - - block_size = max_len - max_length = min(max_length, block_size) - - def pad_function(examples): - examples["input_ids"] = [i + [tokenizer.pad_token_id] * - (max_length - len(i)) for i in examples["input_ids"]] - examples["attention_mask"] = [[1] * len(i) + [0] * - (max_length - len(i)) for i in examples["attention_mask"]] - examples["labels"] = [i + [-100] * - (max_length - len(i)) for i in examples["labels"]] - # truncate to max_length - examples["input_ids"] = [i[:max_length] for i in examples["input_ids"]] - examples["attention_mask"] = [a[:max_length] - for a in examples["attention_mask"]] - examples["labels"] = [l[:max_length] for l in examples["labels"]] - return examples - - - tokenized_datasets = tokenized_datasets.map( - pad_function, - batched=True, - num_proc=32, - load_from_cache_file=True, - desc=f"Padding dataset to max length {max_length}", - ) - - tokenized_datasets.save_to_disk(save_path) - return tokenized_datasets - - -class QaDataset(Dataset): - - def __init__(self, tokenizer_name_or_path, select_num=None, start_idx=None): - self.select_num = select_num - self.start_idx = start_idx - self.ds = None - if 'llama' in tokenizer_name_or_path: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token="", bos_token="", eos_token="", add_eos_token=True) - self.tokenizer.pad_token = self.tokenizer.eos_token - else: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) - if 'gpt2' in tokenizer_name_or_path: - self.tokenizer.pad_token = self.tokenizer.eos_token - - - def load(self, path): - loaded = load_from_disk(path) - self.ds = loaded['train'] - if self.select_num is not None: - if self.start_idx is not None: - self.ds = self.ds.select(range(self.start_idx, min(len(self.ds), self.start_idx + self.select_num))) - else: - self.ds = self.ds.select(range(self.select_num)) - - def __len__(self): - return len(self.ds) - - def __getitem__(self, idx): - return self.ds[idx] diff --git a/python/fate_llm/dataset/nlp_tokenizer.py b/python/fate_llm/dataset/seq_cls_dataset.py similarity index 95% rename from python/fate_llm/dataset/nlp_tokenizer.py rename to python/fate_llm/dataset/seq_cls_dataset.py index c506088..e86d6f2 100644 --- a/python/fate_llm/dataset/nlp_tokenizer.py +++ b/python/fate_llm/dataset/seq_cls_dataset.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from federatedml.nn.dataset.base import Dataset +from fate.ml.nn.dataset.base import Dataset import pandas as pd import torch as t from transformers import AutoTokenizer @@ -24,7 +24,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" -class TokenizerDataset(Dataset): +class SeqCLSDataset(Dataset): """ A Dataset for some basic NLP Tasks, this dataset will automatically transform raw text into word indices using AutoTokenizer from transformers library, @@ -53,7 +53,7 @@ def __init__( pad_token=None, return_input_ids=True): - super(TokenizerDataset, self).__init__() + super(SeqCLSDataset, self).__init__() self.text = None self.word_idx = None self.label = None @@ -119,4 +119,4 @@ def __len__(self): return len(self.text) def __repr__(self): - return self.tokenizer.__repr__() + return self.tokenizer.__repr__() \ No newline at end of file diff --git a/python/fate_llm/dataset/tokenizers/__init__.py b/python/fate_llm/dataset/tokenizers/__init__.py new file mode 100644 index 0000000..1e47421 --- /dev/null +++ b/python/fate_llm/dataset/tokenizers/__init__.py @@ -0,0 +1,44 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from transformers import AutoTokenizer + + +def get_prompt_tokenizer( + tokenizer_name_or_path, + trust_remote_code=False, + padding_side="left", + pad_token=None, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_eos_token=True, +): + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + trust_remote_code=trust_remote_code, + add_eos_token=add_eos_token + ) + tokenizer.padding_side = padding_side + if pad_token is not None: + tokenizer.add_special_tokens({'pad_token': pad_token}) + if pad_token_id is not None: + tokenizer.pad_token_id = pad_token_id + if bos_token_id is not None: + tokenizer.bos_token_id = bos_token_id + if eos_token_id is not None: + tokenizer.eos_token_id = eos_token_id + + return tokenizer diff --git a/python/fate_llm/dataset/watermark.py b/python/fate_llm/dataset/watermark.py deleted file mode 100644 index 3e38959..0000000 --- a/python/fate_llm/dataset/watermark.py +++ /dev/null @@ -1,134 +0,0 @@ -import os -import numpy as np -import pandas as pd -from federatedml.nn.dataset.base import Dataset -from federatedml.util import LOGGER -from federatedml.nn.dataset.image import ImageDataset - - -class WaterMarkDataset(Dataset): - - def __init__(self): - super().__init__() - self.normal_dataset = None - self.watermark_dataset = None - - def load(self, path): - raise NotImplementedError() - - def get_normal_dataset(self): - return self.normal_dataset - - def get_watermark_dataset(self): - return self.watermark_dataset - - -class WaterMarkImageDataset(WaterMarkDataset): - - """ - A basic WaterMark Dataset built on pytorch ImageFolder - This Dataset is used for Fed-IPR algorithm, see: https://arxiv.org/abs/2109.13236 for details - It will contain two part: A normal dataset and a watermark dataset - When training, the FedIPR Trainer will retrieve the normal dataset and watermark 
dataset from it - Given a path to image folder, WaterMarkImageDataset will load images from this folder, by default, - folder named 'normal' will be treated as normal dataset, folder named 'watermark' will be treated as watermark dataset - You can adjust this behavior by setting normal_folder_name and watermark_folder_name in the parameters - - Parameters: - ---------- - normal_folder_name: str, default is 'normal', the folder name of normal dataset - watermark_folder_name: str, default is 'watermark', the folder name of watermark dataset - """ - - def __init__( - self, - normal_folder_name='normal', - watermark_folder_name='watermark', - center_crop=False, - center_crop_shape=None, - generate_id_from_file_name=True, - file_suffix='.jpg', - float64=False, - label_dtype='long'): - - super(WaterMarkImageDataset, self).__init__() - self.normal_folder_name = normal_folder_name - self.watermark_folder_name = watermark_folder_name - - self.normal_dataset = None - self.watermark_dataset = None - - self.center_crop = center_crop - self.size = center_crop_shape - self.generate_id_from_file_name = generate_id_from_file_name - self.file_suffix = file_suffix - self.float64 = float64 - self.label_type = label_dtype - - def __getitem__(self, item): - - if item < 0: - item = len(self) + item - if item < 0: - raise IndexError('index out of range') - - if item < len(self.normal_dataset): - return ('normal', self.normal_dataset[item]) - else: - return ('watermark', - self.watermark_dataset[item - len(self.normal_dataset)]) - - def __len__(self): - len_ = 0 - if self.normal_dataset is not None: - len_ += len(self.normal_dataset) - if self.watermark_dataset is not None: - len_ += len(self.watermark_dataset) - return len_ - - def load(self, file_path): - - # normal dataset path - normal_path = os.path.join(file_path, self.normal_folder_name) - # watermark dataset path - watermark_path = os.path.join(file_path, self.watermark_folder_name) - - # load normal dataset - self.normal_dataset = ImageDataset( - center_crop=self.center_crop, - center_crop_shape=self.size, - generate_id_from_file_name=self.generate_id_from_file_name, - file_suffix=self.file_suffix, - float64=self.float64, - label_dtype=self.label_type - ) - if os.path.exists(normal_path): - self.normal_dataset.load(normal_path) - else: - self.normal_dataset = None - LOGGER.info( - f'normal dataset not found in {normal_path}, will not load normal dataset') - # load watermark dataset - self.watermark_dataset = ImageDataset( - center_crop=self.center_crop, - center_crop_shape=self.size, - generate_id_from_file_name=self.generate_id_from_file_name, - file_suffix=self.file_suffix, - float64=self.float64, - label_dtype=self.label_type - ) - if os.path.exists(watermark_path): - self.watermark_dataset.load(watermark_path) - else: - self.watermark_dataset = None - LOGGER.info( - f'watermark dataset not found in {watermark_path}, will not load watermark dataset') - - def get_normal_dataset(self): - return self.normal_dataset - - def get_watermark_dataset(self): - return self.watermark_dataset - - def get_classes(self): - return self.normal_dataset.get_classes() From 8d25ba8e6068ce63bacfba447cf1bb377a17fe0f Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Tue, 20 Feb 2024 10:54:34 +0800 Subject: [PATCH 03/35] fix chatglm2 detection Signed-off-by: mgqa34 --- python/fate_llm/dataset/prompt_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fate_llm/dataset/prompt_dataset.py b/python/fate_llm/dataset/prompt_dataset.py index 1d4be24..ec127d7 
100644 --- a/python/fate_llm/dataset/prompt_dataset.py +++ b/python/fate_llm/dataset/prompt_dataset.py @@ -90,7 +90,7 @@ def _process_data(self, line): input_ids = self.tokenizer.build_inputs_with_special_tokens( prompt_ids, target_ids) - if "chatglm2" in self.tokenizer_name_or_path: + if "chatglm2" in self.tokenizer_name_or_path.lower(): seq_length = input_ids.index(self.tokenizer.bos_token_id) else: seq_length = len(prompt_ids) From 6231f45d95358d0591b29cc91d097beeb757af39 Mon Sep 17 00:00:00 2001 From: cwj Date: Tue, 20 Feb 2024 17:10:07 +0800 Subject: [PATCH 04/35] Add offsite-tuning contents Signed-off-by: weijingchen Signed-off-by: cwj --- python/fate_llm/dataset/qa_dataset.py | 195 ++++++++++++++ python/fate_llm/homo/fedavg.py | 101 ++++++++ python/fate_llm/homo/offsite_tuning.py | 149 +++++++++++ python/fate_llm/homo/test/test_gpt2.py | 5 + python/fate_llm/homo/test/test_loader.py | 17 ++ python/fate_llm/homo/test/test_ot.py | 135 ++++++++++ .../model_zoo/offsite_tuning/bloom.py | 0 .../fate_llm/model_zoo/offsite_tuning/gpt2.py | 242 +++++++++++++++++ .../fate_llm/runner/offsite_tuning_runner.py | 244 ++++++++++++++++++ python/fate_llm/trainer/seq2seq_trainer.py | 44 +--- 10 files changed, 1091 insertions(+), 41 deletions(-) create mode 100644 python/fate_llm/dataset/qa_dataset.py create mode 100644 python/fate_llm/homo/fedavg.py create mode 100644 python/fate_llm/homo/offsite_tuning.py create mode 100644 python/fate_llm/homo/test/test_gpt2.py create mode 100644 python/fate_llm/homo/test/test_loader.py create mode 100644 python/fate_llm/homo/test/test_ot.py create mode 100644 python/fate_llm/model_zoo/offsite_tuning/bloom.py create mode 100644 python/fate_llm/model_zoo/offsite_tuning/gpt2.py create mode 100644 python/fate_llm/runner/offsite_tuning_runner.py diff --git a/python/fate_llm/dataset/qa_dataset.py b/python/fate_llm/dataset/qa_dataset.py new file mode 100644 index 0000000..020072c --- /dev/null +++ b/python/fate_llm/dataset/qa_dataset.py @@ -0,0 +1,195 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
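+# NOTE: this module is meant to be used in two steps (a usage sketch; paths are illustrative):
+#   1) pre-tokenize and cache a QA benchmark once:
+#        tokenize_qa_dataset('sciq', tokenizer, './sciq/', seq_max_len=1000)
+#   2) load the cached split for training:
+#        ds = QaDataset(tokenizer_name_or_path='gpt2'); ds.load('./sciq/')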
+# +from datasets import load_from_disk, load_dataset +from transformers import AutoTokenizer +import torch as t +import os +from torch.utils.data import Dataset + + +""" +These Data pre-processing templates are from https://github.com/mit-han-lab/offsite-tuning +""" + +class PIQA: + def __init__(self): + self._template = "Question: {}\nAnswer:" + + def get_context(self, examples): + ctx = examples['goal'] + return [self._template.format(c) for c in ctx] + + def get_target(self, examples): + if -1 in examples["label"]: # test set + return [""] * len(examples["label"]) + else: + gt_tuples = [("sol{}".format(label + 1), idx) + for idx, label in enumerate(examples['label'])] + return [examples[k][i] for k, i in gt_tuples] + + +class SciQ: + def __init__(self): + self._template = "{}\nQuestion: {}\nAnswer:" + + def get_context(self, examples): + sources = examples['support'] + queries = examples['question'] + return [self._template.format(s, q) for s, q in zip(sources, queries)] + + def get_target(self, examples): + return examples['correct_answer'] + + +class OpenBookQA: + def get_context(self, examples): + return examples['question_stem'] + + def get_target(self, examples): + choices = examples['choices'] + answers = examples['answerKey'] + targets = [] + for choice, answer in zip(choices, answers): + answer = ord(answer.strip()) - ord('A') + targets.append(choice['text'][answer]) + return targets + + +task_dict = { + "piqa": PIQA(), + "sciq": SciQ(), + "openbookqa": OpenBookQA() +} + + +def tokenize_qa_dataset(dataset_name, tokenizer, save_path, seq_max_len=1000): + + max_len = seq_max_len + assert dataset_name in ['piqa', 'sciq', 'openbookqa'], "dataset name must be one of ['piqa', 'sciq', 'openbookqa']" + raw_datasets = load_dataset(dataset_name) + task = task_dict[dataset_name] + + column_names = raw_datasets["train"].column_names + + def tokenize_function(examples): + context = task.get_context(examples) + target = task.get_target(examples) + + context = tokenizer(context) + target = tokenizer(target) + + # if context is ending with special token, remove it + if len(context['input_ids'][0]) > 0 and context['input_ids'][0][-1] in tokenizer.all_special_ids: + context['input_ids'] = [i[:-1] for i in context['input_ids']] + context['attention_mask'] = [a[:-1] + for a in context['attention_mask']] + + # if target is starting with special token, remove it + if len(target['input_ids'][0]) > 0 and target['input_ids'][0][0] in tokenizer.all_special_ids: + target['input_ids'] = [i[1:] for i in target['input_ids']] + target['attention_mask'] = [a[1:] + for a in target['attention_mask']] + + out = {} + out['input_ids'] = [i1 + i2 for i1, + i2 in zip(context['input_ids'], target['input_ids'])] + out['attention_mask'] = [a1 + a2 for a1, + a2 in zip(context['attention_mask'], target['attention_mask'])] + + # set -100 for context tokens + out["labels"] = [ + [-100] * len(i1) + i2 for i1, i2 in zip(context['input_ids'], target['input_ids'])] + + return out + + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=32, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on dataset", + ) + + # pad all instances in lm_datasets to the max length of the dataset + max_length = -1 + for v in tokenized_datasets.values(): + for x in v: + max_length = max(max_length, len(x['input_ids'])) + + # pad to the multiple of 8 + max_length = (max_length // 8 + 1) * 8 + + block_size = max_len + max_length = min(max_length, block_size) + + def 
pad_function(examples):
+        examples["input_ids"] = [i + [tokenizer.pad_token_id] *
+                                 (max_length - len(i)) for i in examples["input_ids"]]
+        examples["attention_mask"] = [[1] * len(i) + [0] *
+                                      (max_length - len(i)) for i in examples["attention_mask"]]
+        examples["labels"] = [i + [-100] *
+                              (max_length - len(i)) for i in examples["labels"]]
+        # truncate to max_length
+        examples["input_ids"] = [i[:max_length] for i in examples["input_ids"]]
+        examples["attention_mask"] = [a[:max_length]
+                                      for a in examples["attention_mask"]]
+        examples["labels"] = [l[:max_length] for l in examples["labels"]]
+        return examples
+
+
+    tokenized_datasets = tokenized_datasets.map(
+        pad_function,
+        batched=True,
+        num_proc=32,
+        load_from_cache_file=True,
+        desc=f"Padding dataset to max length {max_length}",
+    )
+
+    tokenized_datasets.save_to_disk(save_path)
+    return tokenized_datasets
+
+
+class QaDataset(Dataset):
+
+    def __init__(self, tokenizer_name_or_path, select_num=None, start_idx=None):
+        self.select_num = select_num
+        self.start_idx = start_idx
+        self.ds = None
+        if 'llama' in tokenizer_name_or_path:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token="<unk>", bos_token="<s>", eos_token="</s>", add_eos_token=True)
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+            if 'gpt2' in tokenizer_name_or_path:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+
+    def load(self, path):
+        loaded = load_from_disk(path)
+        self.ds = loaded['train']
+        if self.select_num is not None:
+            if self.start_idx is not None:
+                self.ds = self.ds.select(range(self.start_idx, min(len(self.ds), self.start_idx + self.select_num)))
+            else:
+                self.ds = self.ds.select(range(self.select_num))
+
+    def __len__(self):
+        return len(self.ds)
+
+    def __getitem__(self, idx):
+        return self.ds[idx]
+
\ No newline at end of file
diff --git a/python/fate_llm/homo/fedavg.py b/python/fate_llm/homo/fedavg.py
new file mode 100644
index 0000000..406b244
--- /dev/null
+++ b/python/fate_llm/homo/fedavg.py
@@ -0,0 +1,101 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
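+# Seq2SeqFedAVGClient below binds the Hugging Face Seq2Seq trainer to a FedAVG aggregator.
+# A minimal client-side call is expected to look roughly like this (argument values are
+# illustrative; the names come from the constructor defined in this module):
+#
+#     trainer = Seq2SeqFedAVGClient(ctx=ctx, model=model,
+#                                   training_args=Seq2SeqTrainingArguments(per_device_train_batch_size=1,
+#                                                                          num_train_epochs=1),
+#                                   fed_args=FedAVGArguments(), train_set=train_set,
+#                                   data_collator=data_collator)
+#     trainer.train()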
+import torch +from fate.ml.nn.homo.fedavg import FedAVGServer, FedAVGArguments, FedArguments +from fate.arch import Context +from fate_llm.trainer.seq2seq_trainer import HomoSeq2SeqTrainerClient, Seq2SeqTrainingArguments +from fate.ml.aggregator import AggregatorClientWrapper +import logging +from typing import List, Optional, Tuple, Callable, Dict +from fate.arch import Context +from torch.optim import Optimizer +from torch.utils.data import Dataset +from torch.optim.lr_scheduler import _LRScheduler +from transformers.trainer_callback import TrainerCallback +from torch import nn +from torch.utils.data import DataLoader +from transformers import TrainerState, TrainerControl, PreTrainedTokenizer, EvalPrediction + + +logger = logging.getLogger(__name__) + + +Seq2SeqFedAVGServer = FedAVGServer + + +class Seq2SeqFedAVGClient(HomoSeq2SeqTrainerClient): + + def __init__( + self, + ctx: Context, + model: nn.Module, + training_args: Seq2SeqTrainingArguments, + fed_args: FedArguments, + train_set: Dataset, + val_set: Dataset = None, + optimizer: torch.optim.Optimizer = None, + scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None, + data_collator: Callable = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + callbacks: Optional[List[TrainerCallback]] = [], + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + local_mode: bool = False + ): + # in case you forget to set evaluation_strategy + if val_set is not None and training_args.evaluation_strategy == "no": + training_args.evaluation_strategy = "epoch" + + HomoSeq2SeqTrainerClient.__init__( + self, + ctx, + model, + training_args, + fed_args, + train_set, + val_set, + optimizer, + data_collator, + scheduler, + tokenizer, + callbacks, + compute_metrics, + local_mode + ) + + + def init_aggregator(self, ctx: Context, fed_args: FedArguments): + aggregate_type = "weighted_mean" + aggregator_name = "fedavg" + aggregator = fed_args.aggregator + return AggregatorClientWrapper( + ctx, aggregate_type, aggregator_name, aggregator, sample_num=len(self.train_dataset), args=self._args + ) + + def on_federation( + self, + ctx: Context, + aggregator: AggregatorClientWrapper, + fed_args: FedArguments, + args: Seq2SeqTrainingArguments, + model: Optional[nn.Module] = None, + optimizer: Optional[Optimizer] = None, + scheduler: Optional[_LRScheduler] = None, + dataloader: Optional[Tuple[DataLoader]] = None, + control: Optional[TrainerControl] = None, + state: Optional[TrainerState] = None, + **kwargs, + ): + aggregator.model_aggregation(ctx, model) + diff --git a/python/fate_llm/homo/offsite_tuning.py b/python/fate_llm/homo/offsite_tuning.py new file mode 100644 index 0000000..a4b927f --- /dev/null +++ b/python/fate_llm/homo/offsite_tuning.py @@ -0,0 +1,149 @@ +from torch.nn.modules import Module +from fate.ml.aggregator.base import Aggregator +from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer, Seq2SeqTrainingArguments +from fate.ml.nn.trainer.trainer_base import FedArguments, TrainingArguments +from dataclasses import dataclass +from typing import List, Optional, Callable, Tuple +from fate.arch import Context +from torch.optim import Optimizer +from torch.utils.data import DataLoader, Dataset +from torch.optim.lr_scheduler import _LRScheduler +from transformers.trainer_callback import TrainerCallback +from torch.nn import Module +from transformers import TrainerState, TrainerControl, PreTrainedTokenizer +from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningBaseModel +import 
logging
+
+
+logger = logging.getLogger(__name__)
+
+
+class OffsiteTuningTrainerClient(Seq2SeqFedAVGClient):
+
+    def __init__(
+        self,
+        ctx: Context,
+        model: OffsiteTuningBaseModel,
+        training_args: Seq2SeqTrainingArguments,
+        fed_args: FedArguments,
+        train_set: Dataset,
+        val_set: Dataset = None,
+        optimizer: Optimizer = None,
+        scheduler: _LRScheduler = None,
+        data_collator: Callable = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        callbacks: List[TrainerCallback] = [],
+        compute_metrics: Callable = None,
+        aggregate_model: bool = False,
+    ):
+        assert isinstance(model, OffsiteTuningBaseModel), "model must be the subclass of OffsiteTuningBaseModel"
+        if aggregate_model == False and fed_args is None:
+            fed_args = FedArguments()
+        elif fed_args is None:
+            raise ValueError("fed_args must be provided when aggregate_model is True")
+
+        local_mode = True if not aggregate_model else False
+
+        super().__init__(
+            ctx,
+            model,
+            training_args,
+            fed_args,
+            train_set,
+            val_set,
+            optimizer,
+            scheduler,
+            data_collator,
+            tokenizer,
+            callbacks,
+            compute_metrics,
+            local_mode
+        )
+        self._aggregate_model = aggregate_model
+
+    def on_train_begin(self, ctx: Context, aggregator: Aggregator, fed_args: FedArguments,
+                       args: TrainingArguments, model: Module = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None,
+                       dataloader: Tuple[DataLoader] = None, control: TrainerControl = None,
+                       state: TrainerState = None, **kwargs):
+
+        logger.info('receiving weights from server')
+        parameters_to_get = ctx.arbiter.get('sub_model_para')
+        model.load_submodel_weights(parameters_to_get)
+        logger.info('received submodel weights from the server')
+
+    def on_federation(
+        self,
+        ctx: Context,
+        aggregator,
+        fed_args: FedArguments,
+        args: TrainingArguments,
+        model: Optional[OffsiteTuningBaseModel] = None,
+        optimizer: Optional[Optimizer] = None,
+        scheduler: Optional[_LRScheduler] = None,
+        dataloader: Optional[Tuple[DataLoader]] = None,
+        control: Optional[TrainerControl] = None,
+        state: Optional[TrainerState] = None,
+        **kwargs,
+    ):
+        if self._aggregate_model:
+            aggregator.model_aggregation(ctx, model)
+
+
+    def on_train_end(self, ctx: Context, aggregator: Aggregator, fed_args: FedArguments,
+                     args: TrainingArguments, model: OffsiteTuningBaseModel = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None,
+                     dataloader: Tuple[DataLoader] = None, control: TrainerControl = None,
+                     state: TrainerState = None, **kwargs):
+        logger.info('sending trained submodel weights to the server')
+        return_weights = model.get_submodel_weights()
+        ctx.arbiter.put('trained_sub_model_para', return_weights)
+        logger.info('weights sent back to the server')
+
+    def init_aggregator(self, ctx: Context, fed_args: FedArguments):
+        if self._aggregate_model:
+            return super().init_aggregator(ctx, fed_args)
+        else:
+            return None
+
+
+class OffsiteTuningTrainerServer(Seq2SeqFedAVGServer):
+
+    def __init__(self, ctx: Context, model: OffsiteTuningBaseModel, aggregate_model=False) -> None:
+        self._aggregate_model = aggregate_model
+        super().__init__(ctx, local_mode=False)
+        assert isinstance(model, OffsiteTuningBaseModel), "model must be the subclass of OffsiteTuningBaseModel"
+        self.model = model
+
+    def on_train_begin(self, ctx: Context, aggregator: Aggregator):
+        logger.info('sending weights to clients')
+        parameters_to_send = self.model.get_submodel_weights()
+        ctx.guest.put('sub_model_para', parameters_to_send)
+        if any(p.role=='host' for p in ctx.parties):
+            ctx.hosts.put('sub_model_para', parameters_to_send)
+
+    def on_train_end(self, ctx: Context, aggregator: Aggregator):
+        parameters_to_get = ctx.guest.get('trained_sub_model_para')
+        self.model.load_submodel_weights(parameters_to_get)
+        logger.info('received trained submodel weights from the client')
+
+    def on_federation(self, ctx: Context, aggregator, agg_iter_idx: int):
+        if self._aggregate_model:
+            aggregator.model_aggregation(ctx)
+        else:
+            logger.info('skip aggregation')
+
+    def init_aggregator(self, ctx):
+        if self._aggregate_model:
+            return super().init_aggregator(ctx)
+        else:
+            return None
+
+    def train(self):
+
+        if self._aggregate_model:
+            super().train()
+        else:
+            # do nothing but send the submodel weights to the client
+            # and then collect the trained submodel weights from the client
+            self.on_init_end(self.ctx, aggregator=self.aggregator)
+            self.on_train_begin(self.ctx, aggregator=self.aggregator)
+            self.on_train_end(self.ctx, aggregator=self.aggregator)
diff --git a/python/fate_llm/homo/test/test_gpt2.py b/python/fate_llm/homo/test/test_gpt2.py
new file mode 100644
index 0000000..8bce7bf
--- /dev/null
+++ b/python/fate_llm/homo/test/test_gpt2.py
@@ -0,0 +1,5 @@
+from fate_llm.model_zoo.offsite_tuning.gpt2 import GPT2LMHeadMainModel, GPT2LMHeadSubModel
+
+main_model = GPT2LMHeadMainModel('gpt2', emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)
+sub_model = GPT2LMHeadSubModel('gpt2', emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)
+
diff --git a/python/fate_llm/homo/test/test_loader.py b/python/fate_llm/homo/test/test_loader.py
new file mode 100644
index 0000000..7adc6cf
--- /dev/null
+++ b/python/fate_llm/homo/test/test_loader.py
@@ -0,0 +1,17 @@
+from fate.components.components.nn.loader import ModelLoader
+from fate.components.components.nn.torch.base import Sequential, load_seq
+from fate.components.components.nn.torch import nn
+from fate.ml.nn.trainer.trainer_base import TrainingArguments
+from transformers import Seq2SeqTrainingArguments
+
+
+loader = ModelLoader('multi_model', 'Multi')
+
+b = Sequential(
+    nn.Linear(10, 10),
+    nn.Sigmoid()
+)
+
+a = Sequential(
+    ModelLoader('multi_model', 'Multi')
+)
\ No newline at end of file
diff --git a/python/fate_llm/homo/test/test_ot.py b/python/fate_llm/homo/test/test_ot.py
new file mode 100644
index 0000000..dec36b9
--- /dev/null
+++ b/python/fate_llm/homo/test/test_ot.py
@@ -0,0 +1,135 @@
+#
+# Copyright 2019 The FATE Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
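+# Standalone smoke test for offsite-tuning. It is meant to be launched once per party,
+# e.g. (an assumed invocation; party ids are hard-coded below):
+#     python test_ot.py guest & python test_ot.py host & python test_ot.py arbiter
+# The './sciq/' directory is expected to contain a dataset produced beforehand with
+# fate_llm.dataset.qa_dataset.tokenize_qa_dataset('sciq', tokenizer, './sciq/').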
+ + +import sys +from datetime import datetime +from fate_llm.homo.offsite_tuning import OffsiteTuningTrainerClient, OffsiteTuningTrainerServer, TrainingArguments, FedArguments +from fate_llm.model_zoo.offsite_tuning.gpt2 import GPT2LMHeadMainModel, GPT2LMHeadSubModel +from transformers import DataCollatorForSeq2Seq + + +def get_current_datetime_str(): + return datetime.now().strftime("%Y-%m-%d-%H-%M") + + +guest = ("guest", "10000") +arbiter = ("arbiter", "9999") +host = ("host", "9998") +name = get_current_datetime_str() + + +def create_ctx(local, context_name): + from fate.arch import Context + from fate.arch.computing.backends.standalone import CSession + from fate.arch.federation.backends.standalone import StandaloneFederation + import logging + + # prepare log + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + # init fate context + computing = CSession(data_dir="./cession_dir") + return Context(computing=computing, federation=StandaloneFederation(computing, context_name, local, [guest, arbiter, host])) + + +if __name__ == "__main__": + + party = sys.argv[1] + import torch as t + from fate_llm.dataset.qa_dataset import QaDataset + + def set_seed(seed): + t.manual_seed(seed) + if t.cuda.is_available(): + t.cuda.manual_seed_all(seed) + t.backends.cudnn.deterministic = True + t.backends.cudnn.benchmark = False + + set_seed(42) + + if party == "guest" or party == "host": + from fate_llm.dataset.qa_dataset import tokenize_qa_dataset + from transformers import AutoTokenizer, AutoModel + tokenizer_name_or_path = 'gpt2' + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + + if party == "guest": + ctx = create_ctx(guest, get_current_datetime_str()) + elif party == "host": + ctx = create_ctx(host, get_current_datetime_str()) + + if 'llama' in tokenizer_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token="", bos_token="", eos_token="", add_eos_token=True) + tokenizer.pad_token = tokenizer.eos_token + else: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + if 'gpt2' in tokenizer_name_or_path: + tokenizer.pad_token = tokenizer.eos_token + + ds = QaDataset(tokenizer_name_or_path=tokenizer_name_or_path, select_num=100) + ds.load('./sciq/') + + + train_args = TrainingArguments( + per_device_train_batch_size=1, + learning_rate=5e-5, + disable_tqdm=False, + num_train_epochs=2, + logging_steps=10, + logging_strategy='steps' + ) + + model = GPT2LMHeadSubModel( + model_name_or_path=tokenizer_name_or_path, + emulator_layer_num=2, + adapter_top_layer_num=2, + adapter_bottom_layer_num=2 + ) + + trainer = OffsiteTuningTrainerClient( + ctx=ctx, + model=model, + training_args=train_args, + train_set=ds, + fed_args=FedArguments(), + data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer), + aggregate_model=True + ) + print('start training') + trainer.train() + + elif party == "arbiter": + ctx = create_ctx(arbiter, get_current_datetime_str()) + + model = GPT2LMHeadMainModel( + model_name_or_path='gpt2', + emulator_layer_num=2, + adapter_top_layer_num=2, + adapter_bottom_layer_num=2 + ) + + trainer = OffsiteTuningTrainerServer( + ctx=ctx, + model=model, + aggregate_model=True + ) + print('start training') + trainer.train() diff --git 
a/python/fate_llm/model_zoo/offsite_tuning/bloom.py b/python/fate_llm/model_zoo/offsite_tuning/bloom.py new file mode 100644 index 0000000..e69de29 diff --git a/python/fate_llm/model_zoo/offsite_tuning/gpt2.py b/python/fate_llm/model_zoo/offsite_tuning/gpt2.py new file mode 100644 index 0000000..f49492f --- /dev/null +++ b/python/fate_llm/model_zoo/offsite_tuning/gpt2.py @@ -0,0 +1,242 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array +from transformers import GPT2LMHeadModel, GPT2Config +import torch +from typing import Optional, Tuple + + +class GPT2LMHeadMainModel(OffsiteTuningMainModel): + + def __init__( + self, + model_name_or_path, + emulator_layer_num: int, + adapter_top_layer_num: int = 2, + adapter_bottom_layer_num: int = 2): + + self.model_name_or_path = model_name_or_path + super().__init__( + emulator_layer_num, + adapter_top_layer_num, + adapter_bottom_layer_num) + + def get_base_model(self): + return GPT2LMHeadModel.from_pretrained(self.model_name_or_path) + + def get_model_transformer_blocks(self, model: GPT2LMHeadModel): + return model.transformer.h + + def forward(self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None,): + + return self.model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + def get_additional_param_state_dict(self): + # get parameter of additional parameter + model = self.model + param_dict = { + 'wte': model.transformer.wte, + 'wpe': model.transformer.wpe, + 'last_ln_f': model.transformer.ln_f + } + + addition_weights = self.get_numpy_state_dict(param_dict) + + wte = addition_weights.pop('wte') + wte_dict = split_numpy_array(wte, 10, 'wte') + wpe = addition_weights.pop('wpe') + wpe_dict = split_numpy_array(wpe, 10, 'wpe') + addition_weights.update(wte_dict) + 
addition_weights.update(wpe_dict) + return addition_weights + + def load_additional_param_state_dict(self, submodel_weights: dict): + # load additional weights: + model = self.model + param_dict = { + 'wte': model.transformer.wte, + 'wpe': model.transformer.wpe, + 'last_ln_f': model.transformer.ln_f + } + + new_submodel_weight = {} + new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] + wte_dict, wpe_dict = {}, {} + for k, v in submodel_weights.items(): + if 'wte' in k: + wte_dict[k] = v + if 'wpe' in k: + wpe_dict[k] = v + wte = recover_numpy_array(wte_dict, 'wte') + wpe = recover_numpy_array(wpe_dict, 'wpe') + new_submodel_weight['wte'] = wte + new_submodel_weight['wpe'] = wpe + + self.load_numpy_state_dict(param_dict, new_submodel_weight) + + +class GPT2LMHeadSubModel(OffsiteTuningSubModel): + + def __init__( + self, + model_name_or_path, + emulator_layer_num: int, + adapter_top_layer_num: int = 2, + adapter_bottom_layer_num: int = 2, + fp16_mix_precision=False, + partial_weight_decay=None): + + self.model_name_or_path = model_name_or_path + self.emulator_layer_num = emulator_layer_num + self.adapter_top_layer_num = adapter_top_layer_num + self.adapter_bottom_layer_num = adapter_bottom_layer_num + super().__init__( + emulator_layer_num, + adapter_top_layer_num, + adapter_bottom_layer_num, + fp16_mix_precision) + self.partial_weight_decay = partial_weight_decay + + def get_base_model(self): + total_layer_num = self.emulator_layer_num + \ + self.adapter_top_layer_num + self.adapter_bottom_layer_num + config = GPT2Config.from_pretrained(self.model_name_or_path) + config.num_hidden_layers = total_layer_num + # initialize a model without pretrained weights + return GPT2LMHeadModel(config) + + def get_model_transformer_blocks(self, model: GPT2LMHeadModel): + return model.transformer.h + + def get_additional_param_state_dict(self): + # get parameter of additional parameter + model = self.model + param_dict = { + 'wte': model.transformer.wte, + 'wpe': model.transformer.wpe, + 'last_ln_f': model.transformer.ln_f + } + + addition_weights = self.get_numpy_state_dict(param_dict) + + wte = addition_weights.pop('wte') + wte_dict = split_numpy_array(wte, 10, 'wte') + wpe = addition_weights.pop('wpe') + wpe_dict = split_numpy_array(wpe, 10, 'wpe') + addition_weights.update(wte_dict) + addition_weights.update(wpe_dict) + return addition_weights + + def load_additional_param_state_dict(self, submodel_weights: dict): + # load additional weights: + model = self.model + param_dict = { + 'wte': model.transformer.wte, + 'wpe': model.transformer.wpe, + 'last_ln_f': model.transformer.ln_f + } + + new_submodel_weight = {} + new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] + wte_dict, wpe_dict = {}, {} + for k, v in submodel_weights.items(): + if 'wte' in k: + wte_dict[k] = v + if 'wpe' in k: + wpe_dict[k] = v + wte = recover_numpy_array(wte_dict, 'wte') + wpe = recover_numpy_array(wpe_dict, 'wpe') + new_submodel_weight['wte'] = wte + new_submodel_weight['wpe'] = wpe + + self.load_numpy_state_dict(param_dict, new_submodel_weight) + + def forward(self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + 
encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None,): + + return self.model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + def parameters(self, recurse=True): + if self.partial_weight_decay is None: + return super().parameters(recurse) + elif isinstance(self.partial_weight_decay, float): + no_decay = ["bias", "layer_norm.weight"] + return [ + { + "params": [ + p for n, p in self.named_parameters() if not any( + nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { + "params": [ + p for n, p in self.named_parameters() if any( + nd in n for nd in no_decay)], "weight_decay": 0.0}] + else: + raise ValueError( + f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") \ No newline at end of file diff --git a/python/fate_llm/runner/offsite_tuning_runner.py b/python/fate_llm/runner/offsite_tuning_runner.py new file mode 100644 index 0000000..ab0bdc1 --- /dev/null +++ b/python/fate_llm/runner/offsite_tuning_runner.py @@ -0,0 +1,244 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
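+# The runner below wires the federated Seq2Seq trainer into the FATE component framework:
+# everything is driven by plain-dict configs resolved through Loader. A minimal client-side
+# configuration might look like the following (values are illustrative, not defaults):
+#
+#     runner = Seq2SeqRunner(
+#         algo="fedavg",
+#         model_conf={...},        # Loader dict pointing at a fate_llm model_zoo model
+#         dataset_conf={...},      # Loader dict pointing at a fate_llm dataset
+#         training_args_conf={"num_train_epochs": 1, "per_device_train_batch_size": 1},
+#         fed_args_conf={},
+#         task_type="causal_lm",
+#     )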
+ +from fate.components.components.nn.nn_runner import ( + NNRunner, + load_model_dict_from_path, + dir_warning, + loader_load_from_conf, + run_dataset_func, +) +from fate.components.components.nn.runner.homo_default_runner import DefaultRunner +from fate.ml.nn.homo.fedavg import FedAVGArguments +from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer +from typing import Dict +from fate.components.components.nn.loader import Loader +import torch.nn as nn +import torch.optim as optim +from fate.ml.nn.trainer.trainer_base import FedArguments, HomoTrainerServer +from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments, HomoSeq2SeqTrainerClient +from typing import Union, Type, Callable, Optional +from transformers.trainer_utils import get_last_checkpoint +from typing import Literal +import logging +from fate.arch.dataframe import DataFrame +from transformers.modeling_utils import PreTrainedModel, unwrap_model + + +logger = logging.getLogger(__name__) + + +SUPPORTED_ALGO = ["fedavg"] + + +def _check_instances( + trainer: Union[Type[HomoSeq2SeqTrainerClient], Type[HomoTrainerServer]] = None, + fed_args: FedArguments = None, + model: nn.Module = None, + optimizer: optim.Optimizer = None, + train_args: Seq2SeqTrainingArguments = None, + data_collator: Callable = None, +) -> None: + if trainer is not None and not ( + issubclass(type(trainer), HomoSeq2SeqTrainerClient) or issubclass(type(trainer), HomoTrainerServer) + ): + raise TypeError( + f"SetupReturn Error: trainer must be a subclass of either " + f"HomoSeq2SeqTrainerClient or HomoSeq2SeqTrainerClient but got {type(trainer)}" + ) + + if fed_args is not None and not isinstance(fed_args, FedArguments): + raise TypeError(f"SetupReturn Error: fed_args must be an instance of FedArguments but got {type(fed_args)}") + + if model is not None and not issubclass(type(model), nn.Module): + raise TypeError(f"SetupReturn Error: model must be a subclass of torch.nn.Module but got {type(model)}") + + if optimizer is not None and not issubclass(type(optimizer), optim.Optimizer): + raise TypeError( + f"SetupReturn Error: optimizer must be a subclass of torch.optim.Optimizer but got {type(optimizer)}" + ) + + if train_args is not None and not isinstance(train_args, Seq2SeqTrainingArguments): + raise TypeError( + f"SetupReturn Error: train_args must be an instance of Seq2SeqTrainingArguments " + f"but got {type(train_args)}" + ) + + if data_collator is not None and not callable(data_collator): + raise TypeError(f"SetupReturn Error: data_collator must be callable but got {type(data_collator)}") + + +class Seq2SeqRunner(DefaultRunner): + def __init__( + self, + algo: str = "fedavg", + model_conf: Optional[Dict] = None, + dataset_conf: Optional[Dict] = None, + optimizer_conf: Optional[Dict] = None, + training_args_conf: Optional[Dict] = None, + fed_args_conf: Optional[Dict] = None, + data_collator_conf: Optional[Dict] = None, + tokenizer_conf: Optional[Dict] = None, + task_type: Literal["causal_lm", "other"] = "causal_lm", + local_mode: bool = False, + save_trainable_weights_only: bool = False, + ) -> None: + super(NNRunner, self).__init__() + self.algo = algo + self.model_conf = model_conf + self.dataset_conf = dataset_conf + self.optimizer_conf = optimizer_conf + self.training_args_conf = training_args_conf + self.fed_args_conf = fed_args_conf + self.data_collator_conf = data_collator_conf + self.local_mode = local_mode + self.tokenizer_conf = tokenizer_conf + self.task_type = task_type + self.save_trainable_weights_only = 
save_trainable_weights_only + + # check param + if self.algo not in SUPPORTED_ALGO: + raise ValueError(f"algo should be one of {SUPPORTED_ALGO}") + if self.task_type not in ["causal_lm", "others"]: + raise ValueError("task_type should be one of [binary, multi, regression, others]") + assert isinstance(self.local_mode, bool), "local should be bool" + + # setup var + self.trainer = None + self.training_args = None + + def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): + if stage == "predict": + self.local_mode = True + + if self.algo == "fedavg": + client_class: Seq2SeqFedAVGClient = Seq2SeqFedAVGClient + else: + raise ValueError(f"algo {self.algo} not supported") + + ctx = self.get_context() + model = loader_load_from_conf(self.model_conf) + if model is None: + raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") + + if output_dir is None: + output_dir = "./" + + resume_path = None + if saved_model is not None: + model_dict = load_model_dict_from_path(saved_model) + model.load_state_dict(model_dict) + logger.info(f"loading model dict from {saved_model} to model done") + if get_last_checkpoint(saved_model) is not None: + resume_path = saved_model + logger.info(f"checkpoint detected, resume_path set to {resume_path}") + # load optimizer + if self.optimizer_conf: + optimizer_loader = Loader.from_dict(self.optimizer_conf) + optimizer_ = optimizer_loader.load_item() + optimizer_params = optimizer_loader.kwargs + optimizer = optimizer_(model.parameters(), **optimizer_params) + else: + optimizer = None + # load collator func + data_collator = loader_load_from_conf(self.data_collator_conf) + # load tokenizer if import conf provided + tokenizer = loader_load_from_conf(self.tokenizer_conf) + # args + dir_warning(self.training_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + self.training_args = training_args + # reset to default, saving to arbitrary path is not allowed in + # DefaultRunner + training_args.output_dir = output_dir + training_args.resume_from_checkpoint = resume_path # resume path + fed_args = FedAVGArguments(**self.fed_args_conf) + + # prepare trainer + trainer = client_class( + ctx=ctx, + model=model, + optimizer=optimizer, + training_args=training_args, + fed_args=fed_args, + data_collator=data_collator, + tokenizer=tokenizer, + train_set=train_set, + val_set=validate_set, + local_mode=self.local_mode, + save_trainable_weights_only=self.save_trainable_weights_only, + ) + + _check_instances( + trainer=trainer, + model=model, + optimizer=optimizer, + train_args=training_args, + fed_args=fed_args, + data_collator=data_collator, + ) + return trainer + + def server_setup(self, stage="train"): + if stage == "predict": + self.local_mode = True + if self.algo == "fedavg": + server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer + else: + raise ValueError(f"algo {self.algo} not supported") + ctx = self.get_context() + trainer = server_class(ctx=ctx, local_mode=self.local_mode) + _check_instances(trainer) + return trainer + + def predict(self, test_data: Union[str, DataFrame], saved_model_path: str = None) -> Union[DataFrame, None]: + if self.is_client(): + test_set = self._prepare_data(test_data, "test_data") + if self.trainer is not None: + trainer = self.trainer + logger.info("trainer found, skip setting up") + else: + trainer = self.client_setup(saved_model=saved_model_path, stage="predict") + + classes = run_dataset_func(test_set, "get_classes") + match_ids = 
run_dataset_func(test_set, "get_match_ids") + sample_ids = run_dataset_func(test_set, "get_sample_ids") + match_id_name = run_dataset_func(test_set, "get_match_id_name") + sample_id_name = run_dataset_func(test_set, "get_sample_id_name") + + if not self.training_args.predict_with_generate: + return + + pred_rs = trainer.predict(test_set) + + if self.training_args and self.training_args.deepspeed and self.training_args.local_rank != 0: + return + + rs_df = self.get_nn_output_dataframe( + self.get_context(), + pred_rs.predictions, + pred_rs.label_ids if hasattr(pred_rs, "label_ids") else None, + match_ids, + sample_ids, + match_id_name=match_id_name, + sample_id_name=sample_id_name, + dataframe_format="dist_df", + task_type=self.task_type, + classes=classes, + ) + return rs_df + else: + # server not predict + return diff --git a/python/fate_llm/trainer/seq2seq_trainer.py b/python/fate_llm/trainer/seq2seq_trainer.py index 7046cd3..cd7ef2d 100644 --- a/python/fate_llm/trainer/seq2seq_trainer.py +++ b/python/fate_llm/trainer/seq2seq_trainer.py @@ -1,18 +1,3 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# from transformers import Seq2SeqTrainingArguments as _hf_Seq2SeqTrainingArguments, Seq2SeqTrainer from dataclasses import dataclass, field from typing import Optional @@ -32,11 +17,8 @@ from transformers.trainer_callback import TrainerCallback from typing import Optional from dataclasses import dataclass, field -from transformers.modeling_utils import unwrap_model -TRAINABLE_WEIGHTS_NAME = "adapter_model.bin" - @dataclass class _S2STrainingArguments(_hf_Seq2SeqTrainingArguments): @@ -100,8 +82,7 @@ def __init__( callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, local_mode: bool = False, - save_trainable_weights_only: bool = False, - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None ): # in case you forget to set evaluation_strategy if val_set is not None and training_args.evaluation_strategy == "no": @@ -120,7 +101,6 @@ def __init__( callbacks=callbacks, compute_metrics=compute_metrics, local_mode=local_mode, - save_trainable_weights_only=save_trainable_weights_only, ) # concat checkpoint path if checkpoint idx is set @@ -144,23 +124,5 @@ def __init__( ) self._add_fate_callback(self.callback_handler) - - def _save( - self, - output_dir: Optional[str] = None, - state_dict=None - ): - if not self._save_trainable_weights_only: - return super()._save(output_dir, state_dict) - else: - model = unwrap_model(self.model) - - if hasattr(model, "save_pretrained"): - model.save_pretrained(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) - else: - state_dict = { - k: p.to("cpu") for k, - p in model.named_parameters() if p.requires_grad - } - - torch.save(state_dict, os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) + + From 
dad2cde34c12355d7d4b5f3bdca7232822468d7e Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Tue, 20 Feb 2024 17:10:28 +0800 Subject: [PATCH 05/35] delete unused code Signed-off-by: mgqa34 --- python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py index a753e36..73c580e 100644 --- a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py +++ b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py @@ -20,7 +20,6 @@ from transformers import AutoModel from transformers.configuration_utils import PretrainedConfig import logging -import yaml logger = logging.getLogger(__name__) @@ -99,8 +98,6 @@ def add_peft(self): peft_config = getattr(peft, self.peft_type)() elif isinstance(self.peft_config, dict): peft_config = getattr(peft, self.peft_type)(**self.peft_config) - elif isinstance(self.peft_config, str): - peft_config = yaml.safe_load(self.peft_config) else: raise ValueError(f"Can not parse peft_config of {type(self.peft_config)}") From 127c42732c88406e1927baf4e2b58428f0bcbfe9 Mon Sep 17 00:00:00 2001 From: cwj Date: Wed, 21 Feb 2024 20:23:04 +0800 Subject: [PATCH 06/35] Add offsite-tuning support Signed-off-by: weijingchen Signed-off-by: cwj --- .../fate_llm/cust_func/cust_data_collator.py | 8 ++++ python/fate_llm/dataset/qa_dataset.py | 2 +- python/fate_llm/homo/fedavg.py | 8 +++- python/fate_llm/homo/offsite_tuning.py | 20 ++++++++- python/fate_llm/trainer/seq2seq_trainer.py | 44 +++++++++++++++++-- 5 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 python/fate_llm/cust_func/cust_data_collator.py diff --git a/python/fate_llm/cust_func/cust_data_collator.py b/python/fate_llm/cust_func/cust_data_collator.py new file mode 100644 index 0000000..1e2b685 --- /dev/null +++ b/python/fate_llm/cust_func/cust_data_collator.py @@ -0,0 +1,8 @@ +from transformers import AutoTokenizer + + +def get_seq2seq_tokenizer(model_path): + from transformers import DataCollatorForSeq2Seq + tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer.pad_token = tokenizer.eos_token + return DataCollatorForSeq2Seq(tokenizer=tokenizer) diff --git a/python/fate_llm/dataset/qa_dataset.py b/python/fate_llm/dataset/qa_dataset.py index 020072c..c1399a3 100644 --- a/python/fate_llm/dataset/qa_dataset.py +++ b/python/fate_llm/dataset/qa_dataset.py @@ -17,7 +17,7 @@ from transformers import AutoTokenizer import torch as t import os -from torch.utils.data import Dataset +from fate.ml.nn.dataset.base import Dataset """ diff --git a/python/fate_llm/homo/fedavg.py b/python/fate_llm/homo/fedavg.py index 406b244..82e0190 100644 --- a/python/fate_llm/homo/fedavg.py +++ b/python/fate_llm/homo/fedavg.py @@ -51,7 +51,9 @@ def __init__( tokenizer: Optional[PreTrainedTokenizer] = None, callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - local_mode: bool = False + local_mode: bool = False, + save_trainable_weights_only: bool = False, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): # in case you forget to set evaluation_strategy if val_set is not None and training_args.evaluation_strategy == "no": @@ -71,7 +73,9 @@ def __init__( tokenizer, callbacks, compute_metrics, - local_mode + local_mode, + save_trainable_weights_only, + preprocess_logits_for_metrics ) diff --git a/python/fate_llm/homo/offsite_tuning.py 
b/python/fate_llm/homo/offsite_tuning.py index a4b927f..1fa3ae3 100644 --- a/python/fate_llm/homo/offsite_tuning.py +++ b/python/fate_llm/homo/offsite_tuning.py @@ -13,6 +13,7 @@ from transformers import TrainerState, TrainerControl, PreTrainedTokenizer from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningBaseModel import logging +import torch logger = logging.getLogger(__name__) @@ -35,6 +36,8 @@ def __init__( callbacks: List[TrainerCallback] = [], compute_metrics: Callable = None, aggregate_model: bool = False, + save_trainable_weights_only: bool = False, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): assert isinstance(model, OffsiteTuningBaseModel), "model must be the subclass of OffsiteTuningBaseModel" if aggregate_model == False and fed_args is None: @@ -42,7 +45,7 @@ def __init__( elif fed_args is None: raise ValueError("fed_args must be provided when aggregate_model is True") - local_mode = True if not aggregate_model else False + local_mode = True if not aggregate_model else False super().__init__( ctx, @@ -57,7 +60,9 @@ def __init__( tokenizer, callbacks, compute_metrics, - local_mode + local_mode, + save_trainable_weights_only, + preprocess_logits_for_metrics ) self._aggregate_model = aggregate_model @@ -147,3 +152,14 @@ def train(self): self.on_init_end(self.ctx, aggregator=self.aggregator) self.on_train_begin(self.ctx, aggregator=self.aggregator) self.on_train_end(self.ctx, aggregator=self.aggregator) + + def save_model( + self, + output_dir: Optional[str] = None, + state_dict=None + ): + import torch + import os + if not os.path.exists(output_dir): + os.makedirs(output_dir) + torch.save(self.model.state_dict(), output_dir + '/pytorch_model.bin') diff --git a/python/fate_llm/trainer/seq2seq_trainer.py b/python/fate_llm/trainer/seq2seq_trainer.py index cd7ef2d..7046cd3 100644 --- a/python/fate_llm/trainer/seq2seq_trainer.py +++ b/python/fate_llm/trainer/seq2seq_trainer.py @@ -1,3 +1,18 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# from transformers import Seq2SeqTrainingArguments as _hf_Seq2SeqTrainingArguments, Seq2SeqTrainer from dataclasses import dataclass, field from typing import Optional @@ -17,8 +32,11 @@ from transformers.trainer_callback import TrainerCallback from typing import Optional from dataclasses import dataclass, field +from transformers.modeling_utils import unwrap_model +TRAINABLE_WEIGHTS_NAME = "adapter_model.bin" + @dataclass class _S2STrainingArguments(_hf_Seq2SeqTrainingArguments): @@ -82,7 +100,8 @@ def __init__( callbacks: Optional[List[TrainerCallback]] = [], compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, local_mode: bool = False, - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None + save_trainable_weights_only: bool = False, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): # in case you forget to set evaluation_strategy if val_set is not None and training_args.evaluation_strategy == "no": @@ -101,6 +120,7 @@ def __init__( callbacks=callbacks, compute_metrics=compute_metrics, local_mode=local_mode, + save_trainable_weights_only=save_trainable_weights_only, ) # concat checkpoint path if checkpoint idx is set @@ -124,5 +144,23 @@ def __init__( ) self._add_fate_callback(self.callback_handler) - - + + def _save( + self, + output_dir: Optional[str] = None, + state_dict=None + ): + if not self._save_trainable_weights_only: + return super()._save(output_dir, state_dict) + else: + model = unwrap_model(self.model) + + if hasattr(model, "save_pretrained"): + model.save_pretrained(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) + else: + state_dict = { + k: p.to("cpu") for k, + p in model.named_parameters() if p.requires_grad + } + + torch.save(state_dict, os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) From 8733232b1ff1114b35a23a18114b89c6cf770c9b Mon Sep 17 00:00:00 2001 From: cwj Date: Thu, 22 Feb 2024 15:43:37 +0800 Subject: [PATCH 07/35] Update modelzoo Signed-off-by: weijingchen Signed-off-by: cwj --- .../model_zoo/offsite_tuning/bloom.py | 177 +++++++++++++++++ .../model_zoo/offsite_tuning/bloom_ot.py | 181 ----------------- .../model_zoo/offsite_tuning/gpt2_ot.py | 184 ------------------ .../offsite_tuning/{llama_ot.py => llama.py} | 0 4 files changed, 177 insertions(+), 365 deletions(-) delete mode 100644 python/fate_llm/model_zoo/offsite_tuning/bloom_ot.py delete mode 100644 python/fate_llm/model_zoo/offsite_tuning/gpt2_ot.py rename python/fate_llm/model_zoo/offsite_tuning/{llama_ot.py => llama.py} (100%) diff --git a/python/fate_llm/model_zoo/offsite_tuning/bloom.py b/python/fate_llm/model_zoo/offsite_tuning/bloom.py index e69de29..d68f094 100644 --- a/python/fate_llm/model_zoo/offsite_tuning/bloom.py +++ b/python/fate_llm/model_zoo/offsite_tuning/bloom.py @@ -0,0 +1,177 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array +from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomModel, BloomConfig +from torch import nn +import torch + + +class BloomMainModel(OffsiteTuningMainModel): + + def __init__( + self, + model_name_or_path, + emulator_layer_num: int, + adapter_top_layer_num: int = 2, + adapter_bottom_layer_num: int = 2): + + self.model_name_or_path = model_name_or_path + super().__init__( + emulator_layer_num, + adapter_top_layer_num, + adapter_bottom_layer_num) + + def get_base_model(self): + return BloomForCausalLM.from_pretrained(self.model_name_or_path) + + def get_model_transformer_blocks(self, model: BloomForCausalLM): + return model.transformer.h + + def forward(self, x): + return self.model(**x) + + def get_additional_param_state_dict(self): + # get parameter of additional parameter + model = self.model + param_dict = { + 'wte': model.transformer.word_embeddings, + 'word_ln': model.transformer.word_embeddings_layernorm, + 'last_ln_f': model.transformer.ln_f + } + + addition_weights = self.get_numpy_state_dict(param_dict) + + wte = addition_weights.pop('wte') + wte_dict = split_numpy_array(wte, 25, 'wte') + addition_weights.update(wte_dict) + return addition_weights + + def load_additional_param_state_dict(self, submodel_weights: dict): + # load additional weights: + model = self.model + param_dict = { + 'wte': model.transformer.word_embeddings, + 'word_ln': model.transformer.word_embeddings_layernorm, + 'last_ln_f': model.transformer.ln_f + } + + new_submodel_weight = {} + new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] + new_submodel_weight['word_ln'] = submodel_weights['word_ln'] + wte_dict = {} + for k, v in submodel_weights.items(): + if 'wte' in k: + wte_dict[k] = v + wte = recover_numpy_array(wte_dict, 'wte') + new_submodel_weight['wte'] = wte + self.load_numpy_state_dict(param_dict, new_submodel_weight) + + def forward(self, x): + return self.model(**x) + + +class BloomSubModel(OffsiteTuningSubModel): + + def __init__( + self, + model_name_or_path, + emulator_layer_num: int, + adapter_top_layer_num: int = 2, + adapter_bottom_layer_num: int = 2, + fp16_mix_precision=False, + partial_weight_decay=None): + + self.model_name_or_path = model_name_or_path + self.emulator_layer_num = emulator_layer_num + self.adapter_top_layer_num = adapter_top_layer_num + self.adapter_bottom_layer_num = adapter_bottom_layer_num + super().__init__( + emulator_layer_num, + adapter_top_layer_num, + adapter_bottom_layer_num, + fp16_mix_precision) + self.partial_weight_decay = partial_weight_decay + + def get_base_model(self): + total_layer_num = self.emulator_layer_num + \ + self.adapter_top_layer_num + self.adapter_bottom_layer_num + config = BloomConfig.from_pretrained(self.model_name_or_path) + config.num_hidden_layers = total_layer_num + # initialize a model without pretrained weights + return BloomForCausalLM(config) + + def get_model_transformer_blocks(self, model: BloomForCausalLM): + return model.transformer.h + + def forward(self, x): + return self.model(**x) + + def get_additional_param_state_dict(self): + # get parameter of additional parameter + model = self.model + param_dict = { + 'wte': model.transformer.word_embeddings, + 'word_ln': model.transformer.word_embeddings_layernorm, + 'last_ln_f': model.transformer.ln_f + } + + addition_weights = 
self.get_numpy_state_dict(param_dict) + + wte = addition_weights.pop('wte') + wte_dict = split_numpy_array(wte, 25, 'wte') + addition_weights.update(wte_dict) + return addition_weights + + def load_additional_param_state_dict(self, submodel_weights: dict): + # load additional weights: + model = self.model + param_dict = { + 'wte': model.transformer.word_embeddings, + 'word_ln': model.transformer.word_embeddings_layernorm, + 'last_ln_f': model.transformer.ln_f + } + + new_submodel_weight = {} + new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] + new_submodel_weight['word_ln'] = submodel_weights['word_ln'] + wte_dict = {} + for k, v in submodel_weights.items(): + if 'wte' in k: + wte_dict[k] = v + wte = recover_numpy_array(wte_dict, 'wte') + new_submodel_weight['wte'] = wte + self.load_numpy_state_dict(param_dict, new_submodel_weight) + + def forward(self, x): + return self.model(**x) + + def parameters(self, recurse=True): + if self.partial_weight_decay is None: + return super().parameters(recurse) + elif isinstance(self.partial_weight_decay, float): + no_decay = ["bias", "layer_norm.weight"] + return [ + { + "params": [ + p for n, p in self.named_parameters() if not any( + nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { + "params": [ + p for n, p in self.named_parameters() if any( + nd in n for nd in no_decay)], "weight_decay": 0.0}] + else: + raise ValueError( + f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") + diff --git a/python/fate_llm/model_zoo/offsite_tuning/bloom_ot.py b/python/fate_llm/model_zoo/offsite_tuning/bloom_ot.py deleted file mode 100644 index 2fd97cf..0000000 --- a/python/fate_llm/model_zoo/offsite_tuning/bloom_ot.py +++ /dev/null @@ -1,181 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array -from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomModel, BloomConfig -from torch import nn -import torch - - -class BloomMainModel(OffsiteTuningMainModel): - - def __init__( - self, - model_name_or_path, - emulator_layer_num: int, - adapter_top_layer_num: int = 2, - adapter_bottom_layer_num: int = 2): - - self.model_name_or_path = model_name_or_path - super().__init__( - emulator_layer_num, - adapter_top_layer_num, - adapter_bottom_layer_num) - - def get_base_model(self): - return BloomForCausalLM.from_pretrained(self.model_name_or_path) - - def get_model_transformer_blocks(self, model: BloomForCausalLM): - return model.transformer.h - - def forward(self, x): - return self.model(**x) - - def get_additional_param_state_dict(self): - # get parameter of additional parameter - model = self.model - param_dict = { - 'wte': model.transformer.word_embeddings, - 'word_ln': model.transformer.word_embeddings_layernorm, - 'last_ln_f': model.transformer.ln_f - } - - addition_weights = self.get_numpy_state_dict(param_dict) - - wte = addition_weights.pop('wte') - wte_dict = split_numpy_array(wte, 25, 'wte') - addition_weights.update(wte_dict) - return addition_weights - - def load_additional_param_state_dict(self, submodel_weights: dict): - # load additional weights: - model = self.model - param_dict = { - 'wte': model.transformer.word_embeddings, - 'word_ln': model.transformer.word_embeddings_layernorm, - 'last_ln_f': model.transformer.ln_f - } - - new_submodel_weight = {} - new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] - new_submodel_weight['word_ln'] = submodel_weights['word_ln'] - wte_dict = {} - for k, v in submodel_weights.items(): - if 'wte' in k: - wte_dict[k] = v - wte = recover_numpy_array(wte_dict, 'wte') - new_submodel_weight['wte'] = wte - self.load_numpy_state_dict(param_dict, new_submodel_weight) - - def forward(self, x): - return self.model(**x) - - -class BloomSubModel(OffsiteTuningSubModel): - - def __init__( - self, - model_name_or_path, - emulator_layer_num: int, - adapter_top_layer_num: int = 2, - adapter_bottom_layer_num: int = 2, - fp16_mix_precision=False, - partial_weight_decay=None): - - self.model_name_or_path = model_name_or_path - self.emulator_layer_num = emulator_layer_num - self.adapter_top_layer_num = adapter_top_layer_num - self.adapter_bottom_layer_num = adapter_bottom_layer_num - super().__init__( - emulator_layer_num, - adapter_top_layer_num, - adapter_bottom_layer_num, - fp16_mix_precision) - self.partial_weight_decay = partial_weight_decay - - # import torch as t - # state_dict = t.load('/data/projects/fate/cwj/shortcut_bloom.pkl') - # self.load_state_dict(state_dict) - - def get_base_model(self): - total_layer_num = self.emulator_layer_num + \ - self.adapter_top_layer_num + self.adapter_bottom_layer_num - config = BloomConfig.from_pretrained(self.model_name_or_path) - config.num_hidden_layers = total_layer_num - # initialize a model without pretrained weights - return BloomForCausalLM(config) - - def get_model_transformer_blocks(self, model: BloomForCausalLM): - return model.transformer.h - - def forward(self, x): - return self.model(**x) - - def get_additional_param_state_dict(self): - # get parameter of additional parameter - model = self.model - param_dict = { - 'wte': model.transformer.word_embeddings, - 'word_ln': 
model.transformer.word_embeddings_layernorm, - 'last_ln_f': model.transformer.ln_f - } - - addition_weights = self.get_numpy_state_dict(param_dict) - - wte = addition_weights.pop('wte') - wte_dict = split_numpy_array(wte, 25, 'wte') - addition_weights.update(wte_dict) - return addition_weights - - def load_additional_param_state_dict(self, submodel_weights: dict): - # load additional weights: - model = self.model - param_dict = { - 'wte': model.transformer.word_embeddings, - 'word_ln': model.transformer.word_embeddings_layernorm, - 'last_ln_f': model.transformer.ln_f - } - - new_submodel_weight = {} - new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] - new_submodel_weight['word_ln'] = submodel_weights['word_ln'] - wte_dict = {} - for k, v in submodel_weights.items(): - if 'wte' in k: - wte_dict[k] = v - wte = recover_numpy_array(wte_dict, 'wte') - new_submodel_weight['wte'] = wte - self.load_numpy_state_dict(param_dict, new_submodel_weight) - - def forward(self, x): - return self.model(**x) - - def parameters(self, recurse=True): - if self.partial_weight_decay is None: - return super().parameters(recurse) - elif isinstance(self.partial_weight_decay, float): - no_decay = ["bias", "layer_norm.weight"] - return [ - { - "params": [ - p for n, p in self.named_parameters() if not any( - nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { - "params": [ - p for n, p in self.named_parameters() if any( - nd in n for nd in no_decay)], "weight_decay": 0.0}] - else: - raise ValueError( - f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") - diff --git a/python/fate_llm/model_zoo/offsite_tuning/gpt2_ot.py b/python/fate_llm/model_zoo/offsite_tuning/gpt2_ot.py deleted file mode 100644 index c122d43..0000000 --- a/python/fate_llm/model_zoo/offsite_tuning/gpt2_ot.py +++ /dev/null @@ -1,184 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningSubModel, OffsiteTuningMainModel, get_dropout_emulator_and_adapters, split_numpy_array, recover_numpy_array -from transformers import GPT2LMHeadModel, GPT2Config -from torch import nn -import torch -import torch as t - - -class GPT2LMHeadMainModel(OffsiteTuningMainModel): - - def __init__( - self, - model_name_or_path, - emulator_layer_num: int, - adapter_top_layer_num: int = 2, - adapter_bottom_layer_num: int = 2): - - self.model_name_or_path = model_name_or_path - super().__init__( - emulator_layer_num, - adapter_top_layer_num, - adapter_bottom_layer_num) - - def get_base_model(self): - return GPT2LMHeadModel.from_pretrained(self.model_name_or_path) - - def get_model_transformer_blocks(self, model: GPT2LMHeadModel): - return model.transformer.h - - def forward(self, x): - return self.model(**x) - - def get_additional_param_state_dict(self): - # get parameter of additional parameter - model = self.model - param_dict = { - 'wte': model.transformer.wte, - 'wpe': model.transformer.wpe, - 'last_ln_f': model.transformer.ln_f - } - - addition_weights = self.get_numpy_state_dict(param_dict) - - wte = addition_weights.pop('wte') - wte_dict = split_numpy_array(wte, 10, 'wte') - wpe = addition_weights.pop('wpe') - wpe_dict = split_numpy_array(wpe, 10, 'wpe') - addition_weights.update(wte_dict) - addition_weights.update(wpe_dict) - return addition_weights - - def load_additional_param_state_dict(self, submodel_weights: dict): - # load additional weights: - model = self.model - param_dict = { - 'wte': model.transformer.wte, - 'wpe': model.transformer.wpe, - 'last_ln_f': model.transformer.ln_f - } - - new_submodel_weight = {} - new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] - wte_dict, wpe_dict = {}, {} - for k, v in submodel_weights.items(): - if 'wte' in k: - wte_dict[k] = v - if 'wpe' in k: - wpe_dict[k] = v - wte = recover_numpy_array(wte_dict, 'wte') - wpe = recover_numpy_array(wpe_dict, 'wpe') - new_submodel_weight['wte'] = wte - new_submodel_weight['wpe'] = wpe - - self.load_numpy_state_dict(param_dict, new_submodel_weight) - -class GPT2LMHeadSubModel(OffsiteTuningSubModel): - - def __init__( - self, - model_name_or_path, - emulator_layer_num: int, - adapter_top_layer_num: int = 2, - adapter_bottom_layer_num: int = 2, - fp16_mix_precision=False, - partial_weight_decay=None): - - self.model_name_or_path = model_name_or_path - self.emulator_layer_num = emulator_layer_num - self.adapter_top_layer_num = adapter_top_layer_num - self.adapter_bottom_layer_num = adapter_bottom_layer_num - super().__init__( - emulator_layer_num, - adapter_top_layer_num, - adapter_bottom_layer_num, - fp16_mix_precision) - self.partial_weight_decay = partial_weight_decay - - def get_base_model(self): - total_layer_num = self.emulator_layer_num + \ - self.adapter_top_layer_num + self.adapter_bottom_layer_num - config = GPT2Config.from_pretrained(self.model_name_or_path) - config.num_hidden_layers = total_layer_num - # initialize a model without pretrained weights - return GPT2LMHeadModel(config) - - def get_model_transformer_blocks(self, model: GPT2LMHeadModel): - return model.transformer.h - - def get_additional_param_state_dict(self): - # get parameter of additional parameter - model = self.model - param_dict = { - 'wte': model.transformer.wte, - 'wpe': model.transformer.wpe, - 'last_ln_f': model.transformer.ln_f - } - - addition_weights = self.get_numpy_state_dict(param_dict) - - wte = addition_weights.pop('wte') - 
wte_dict = split_numpy_array(wte, 10, 'wte') - wpe = addition_weights.pop('wpe') - wpe_dict = split_numpy_array(wpe, 10, 'wpe') - addition_weights.update(wte_dict) - addition_weights.update(wpe_dict) - return addition_weights - - def load_additional_param_state_dict(self, submodel_weights: dict): - # load additional weights: - model = self.model - param_dict = { - 'wte': model.transformer.wte, - 'wpe': model.transformer.wpe, - 'last_ln_f': model.transformer.ln_f - } - - new_submodel_weight = {} - new_submodel_weight['last_ln_f'] = submodel_weights['last_ln_f'] - wte_dict, wpe_dict = {}, {} - for k, v in submodel_weights.items(): - if 'wte' in k: - wte_dict[k] = v - if 'wpe' in k: - wpe_dict[k] = v - wte = recover_numpy_array(wte_dict, 'wte') - wpe = recover_numpy_array(wpe_dict, 'wpe') - new_submodel_weight['wte'] = wte - new_submodel_weight['wpe'] = wpe - - self.load_numpy_state_dict(param_dict, new_submodel_weight) - - def forward(self, x): - return self.model(**x) - - def parameters(self, recurse=True): - if self.partial_weight_decay is None: - return super().parameters(recurse) - elif isinstance(self.partial_weight_decay, float): - no_decay = ["bias", "layer_norm.weight"] - return [ - { - "params": [ - p for n, p in self.named_parameters() if not any( - nd in n for nd in no_decay)], "weight_decay": self.partial_weight_decay}, { - "params": [ - p for n, p in self.named_parameters() if any( - nd in n for nd in no_decay)], "weight_decay": 0.0}] - else: - raise ValueError( - f"partial_weight_decay should be None or float, but got {self.partial_weight_decay}") diff --git a/python/fate_llm/model_zoo/offsite_tuning/llama_ot.py b/python/fate_llm/model_zoo/offsite_tuning/llama.py similarity index 100% rename from python/fate_llm/model_zoo/offsite_tuning/llama_ot.py rename to python/fate_llm/model_zoo/offsite_tuning/llama.py From 32ae458e0e96d6d2e1e17675bac0bdbc969a1c49 Mon Sep 17 00:00:00 2001 From: sagewe Date: Thu, 22 Feb 2024 17:17:54 +0800 Subject: [PATCH 08/35] feat: implement fedkseed (#47) Signed-off-by: sagewe --- python/fate_llm/fedkseed/__init__.py | 0 python/fate_llm/fedkseed/args.py | 31 ++++ python/fate_llm/fedkseed/fedkseed.py | 156 ++++++++++++++++++ python/fate_llm/fedkseed/optimizer.py | 191 ++++++++++++++++++++++ python/fate_llm/fedkseed/pytorch_utils.py | 51 ++++++ python/fate_llm/fedkseed/trainer.py | 131 +++++++++++++++ python/fate_llm/fedkseed/zo_utils.py | 68 ++++++++ 7 files changed, 628 insertions(+) create mode 100644 python/fate_llm/fedkseed/__init__.py create mode 100644 python/fate_llm/fedkseed/args.py create mode 100644 python/fate_llm/fedkseed/fedkseed.py create mode 100644 python/fate_llm/fedkseed/optimizer.py create mode 100644 python/fate_llm/fedkseed/pytorch_utils.py create mode 100644 python/fate_llm/fedkseed/trainer.py create mode 100644 python/fate_llm/fedkseed/zo_utils.py diff --git a/python/fate_llm/fedkseed/__init__.py b/python/fate_llm/fedkseed/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/fate_llm/fedkseed/args.py b/python/fate_llm/fedkseed/args.py new file mode 100644 index 0000000..e5d3eb9 --- /dev/null +++ b/python/fate_llm/fedkseed/args.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass, field + +from transformers import TrainingArguments +from transformers.utils import add_start_docstrings + + +@dataclass +@add_start_docstrings(TrainingArguments.__doc__) +class KSeedTrainingArguments(TrainingArguments): + """ + TrainingArguments is the subset of the arguments we use in our example scripts, they are the arguments 
that + + Parameters: + optim: optional, default is KSeedZO + The optimizer to use. + eps: optional, default is 0.0005 + Epsilon value for KSeedZerothOrderOptimizer. + grad_clip: optional, default is -100.0 + Gradient clip value for KSeedZerothOrderOptimizer. + """ + + zo_optim: bool = field( + default=True, + metadata={"help": "Whether to use KSeedZerothOrderOptimizer. This suppress `optim` argument when True."}, + ) + k: int = field( + default=4096, + metadata={"help": "The number of seed candidates to use. This suppress `seed_candidates` argument when > 1."}, + ) + eps: float = field(default=0.0005, metadata={"help": "Epsilon value for KSeedZerothOrderOptimizer."}) + grad_clip: float = field(default=-100.0, metadata={"help": "Gradient clip value for KSeedZerothOrderOptimizer."}) diff --git a/python/fate_llm/fedkseed/fedkseed.py b/python/fate_llm/fedkseed/fedkseed.py new file mode 100644 index 0000000..49bfe5d --- /dev/null +++ b/python/fate_llm/fedkseed/fedkseed.py @@ -0,0 +1,156 @@ +import copy +import logging +from dataclasses import dataclass, field +from typing import List, Mapping + +import torch +from fate.arch.context import Context + +from fate_llm.fedkseed.args import KSeedTrainingArguments +from fate_llm.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay +from fate_llm.fedkseed.trainer import KSeedZOExtendedTrainer +from fate_llm.fedkseed.zo_utils import probability_from_amps, directional_derivative_step, get_even_seed_probabilities + +logger = logging.getLogger(__name__) + + +class Trainer: + def __init__( + self, ctx: Context, seed_candidates: torch.LongTensor, args, data_collator, train_dataset, eval_dataset + ): + self.ctx = ctx + self.args = args + self.data_collator = data_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + + self.seed_candidates = seed_candidates + self.k = len(seed_candidates) + self.clients = ctx.hosts + self.model = None + + def load_model(self): + raise NotImplementedError + + def train(self): + direction_derivative_history = {seed.item(): [self.args.grad_initial] for seed in self.seed_candidates} + direction_derivative_sum = None + seed_probabilities = None + for aggregation_iter, sub_ctx in self.ctx.ctxs_range(self.args.num_aggregations): + # step1: re-calculate sample probabilities for each seed + if seed_probabilities is None: + seed_probabilities = get_even_seed_probabilities(self.k) + else: + seed_probabilities = probability_from_amps( + [direction_derivative_history[seed.item()] for seed in self.seed_candidates], + self.args.bias_loss_clip, + ) + + # step2(rpc): remote call to the clients to get the directional derivative history + # proposal + for client in sub_ctx.hosts: + client.put( + "train_once", + ( + False, + { + "seed_candidates": self.seed_candidates, + "seed_probabilities": seed_probabilities, + "direction_derivative_sum": direction_derivative_sum, + }, + ), + ) + + if direction_derivative_sum is None: + direction_derivative_sum = {seed.item(): 0.0 for seed in self.seed_candidates} + # wait for reply and update the directional derivative history + for client in sub_ctx.hosts: + client_directional_derivative_history = client.get("direction_derivative_history") + for seed, history in client_directional_derivative_history.items(): + # torch.LongTensor -> int + seed = int(seed) + if seed not in direction_derivative_history: + direction_derivative_history[seed] = [] + direction_derivative_history[seed].extend(history) + direction_derivative_sum[seed] += sum(history) + + # step3: 
evaluate to get stopping condition if necessary + if self.should_stop(): + break + + def should_stop(self): + return False + + def evaluate(self): + pass + + +class ClientTrainer: + def __init__(self, ctx: Context, model, args, train_dataset, eval_dataset, data_collator, tokenizer): + self.ctx = ctx + self.args = args + self.data_collator = data_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.tokenizer = tokenizer + + self.weight_decay = args.weight_decay + self.model_0 = model + + def serve_loop(self): + for i, sub_ctx in self.ctx.ctxs_range(self.args.num_aggregations): + # step1: wait for the server to send the seed candidates and probabilities or exit signal + logger.info(f"training loop started: {i}") + should_exit, kwargs = sub_ctx.arbiter.get("train_once") + seed_candidates = kwargs["seed_candidates"] + seed_probabilities = kwargs["seed_probabilities"] + direction_derivative_sum = kwargs["direction_derivative_sum"] + logger.info( + f"should_exit: {should_exit}, seed_candidates: {seed_candidates}, seed_probabilities: {seed_probabilities}" + ) + if should_exit: + break + + # step2: start the training loop + direction_derivative_history = self.train_once( + seed_candidates, seed_probabilities, direction_derivative_sum + ) + + # step3: send the directional derivative history to the server + sub_ctx.arbiter.put("direction_derivative_history", direction_derivative_history) + + def train_once(self, seed_candidates, seed_probabilities, direction_derivative_sum) -> Mapping[int, List[float]]: + # build model + model = copy.deepcopy(self.model_0) + model.to(self.args.device) + if direction_derivative_sum is not None: + param_groups = get_optimizer_parameters_grouped_with_decay(model, self.weight_decay) + for seed, grad in direction_derivative_sum.items(): + if grad != 0.0: + directional_derivative_step( + param_groups, seed, grad, lr=self.args.learning_rate, weight_decay=self.args.weight_decay + ) + + # train + trainer = KSeedZOExtendedTrainer( + model=model, + args=self.args, + tokenizer=self.tokenizer, + data_collator=self.data_collator, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + ) + trainer.configure_seed_candidates(seed_candidates, seed_probabilities) + trainer.train() + logger.info(f"evaluate: {trainer.evaluate()}") + # get directional derivative history + return trainer.get_directional_derivative_history() + + +@dataclass +class FedKSeedTrainingArguments(KSeedTrainingArguments): + num_aggregations: int = field(default=10, metadata={"help": "The number of aggregations to perform."}) + bias_loss_clip: float = field(default=1000.0, metadata={"help": "The bias loss clip value."}) + grad_initial: float = field( + default=0.0, metadata={"help": "The initial value for the directional derivative history."} + ) diff --git a/python/fate_llm/fedkseed/optimizer.py b/python/fate_llm/fedkseed/optimizer.py new file mode 100644 index 0000000..bad789f --- /dev/null +++ b/python/fate_llm/fedkseed/optimizer.py @@ -0,0 +1,191 @@ +import math +from typing import Mapping, Optional, Callable, Tuple, List + +import torch +from torch.optim import Optimizer + +from fate_llm.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay +from fate_llm.fedkseed.zo_utils import directional_derivative_step + + +class RandomWalkOptimizer(Optimizer): + """ + Random Walk Optimizer + + This optimizer performs a `random` walk update for the parameters of the model. 
+ """ + + def __init__(self, params, lr, weight_decay, grad_clip, defaults=None): + self.lr = lr + self.weight_decay = weight_decay + self.grad_clip = grad_clip + if defaults is None: + defaults = dict(lr=lr, weight_decay=weight_decay) + else: + defaults = dict(defaults) + defaults.update(lr=lr, weight_decay=weight_decay) + super(RandomWalkOptimizer, self).__init__(params, defaults) + + @classmethod + def from_model(cls, model, lr, weight_decay, grad_clip, **kwargs): + optimizer_grouped_parameters = get_optimizer_parameters_grouped_with_decay(model, weight_decay) + kwargs["lr"] = lr + kwargs["weight_decay"] = weight_decay + kwargs["grad_clip"] = grad_clip + return cls(optimizer_grouped_parameters, **kwargs) + + def directional_derivative_step( + self, directional_derivative_seed: int, directional_derivative_value: torch.FloatTensor + ) -> torch.FloatTensor: + """ + perform a step update for the parameters of the model + along the random direction z with the learning rate lr and the step size grad_projected_value + """ + + if self.grad_clip > 0.0: + if abs(directional_derivative_value) > self.grad_clip: + return torch.FloatTensor([torch.nan]) + directional_derivative_step(self.param_groups, directional_derivative_seed, directional_derivative_value) + return directional_derivative_value + + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + raise NotImplementedError( + "use random_step instead of step for RandomWalkOptimizer \ + since we need pass the `seed` and `grad_projected_value`" + ) + + +class ZerothOrderOptimizer(RandomWalkOptimizer): + def __init__(self, params, lr, eps, weight_decay, grad_clip): + self.eps = eps + defaults = dict(eps=eps) + super(ZerothOrderOptimizer, self).__init__(params, lr, weight_decay, grad_clip, defaults) + + def zeroth_order_step( + self, directional_derivative_seed: int, closure: Callable[[], torch.FloatTensor] + ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """ + perform a step update for the parameters of the model along the + random direction z generated by the `directional_derivative_seed` + with the learning rate lr + and the step size of calculated namely `directional_derivative_value` + + Input: + - directional_derivative_seed: the seed for generating the random direction z + - closure (callable, optional): A closure that reevaluates the model and returns the loss. 
+ + Output: + - directional_derivative_value: the gradient projected value + - loss_right: the loss of the model with the perturbed parameters x + eps * z + - loss_left: the loss of the model with the perturbed parameters x - eps * z + """ + + # x -> x + eps * z + self.random_perturb_parameters(directional_derivative_seed, scaling_factor=1.0) + loss_right = closure() + + # x + eps * z -> x - eps * z + self.random_perturb_parameters(directional_derivative_seed, scaling_factor=-2.0) + loss_left = closure() + + # x - eps * z -> x + self.random_perturb_parameters(directional_derivative_seed, scaling_factor=1.0) + + if torch.isnan(loss_right): + return loss_right, loss_right, loss_left + if torch.isnan(loss_left): + return loss_left, loss_right, loss_left + + # ∇f(x) · z = D_z f(x) ≈ (f(x + eps * z) - f(x - eps * z)) / (2 * eps) + directional_derivative_value = (loss_right - loss_left) / (2 * self.eps) + # perform update for the random direction z * grad_projected_value + directional_derivative_value = self.directional_derivative_step( + directional_derivative_seed, directional_derivative_value + ) + + return directional_derivative_value, loss_right, loss_left + + def random_perturb_parameters(self, directional_derivative_seed: int, scaling_factor: float): + """ + Perturb the parameters with random direction z generated by the directional_derivative_seed + + for each parameter theta, the update is theta = theta + scaling_factor * z * eps + + Input: + - seed: the seed for generating the random direction z + - scaling_factor: the scaling factor for the random direction z + + Output: + - None + """ + torch.manual_seed(directional_derivative_seed) + for param_group in self.param_groups: + eps = param_group["eps"] + for param in param_group["params"]: + if param.requires_grad: + z = torch.normal( + mean=0, std=1, size=param.data.size(), device=param.data.device, dtype=param.data.dtype + ) + param.data = param.data + scaling_factor * eps * z + + +class KSeedZerothOrderOptimizer(ZerothOrderOptimizer): + def __init__( + self, + params, + seed_candidates: torch.LongTensor, + seed_probabilities: torch.FloatTensor, + lr, + eps, + weight_decay, + grad_clip, + ): + self.seed_candidate = seed_candidates + self.seed_probabilities = seed_probabilities + self.directional_derivative_history: Mapping[int, List[float]] = {seed.item(): [] for seed in seed_candidates} + self.sample_random_generator = torch.Generator() + super(KSeedZerothOrderOptimizer, self).__init__(params, lr, eps, weight_decay, grad_clip) + + def sample(self) -> int: + sampled = torch.multinomial( + input=self.seed_probabilities, + num_samples=1, + generator=self.sample_random_generator, + )[0].item() + return self.seed_candidate[sampled].item() + + def step(self, closure: Callable[[], torch.FloatTensor] = None) -> torch.FloatTensor: + if closure is None: + # closure is required for the zeroth_order_step, but we + # don't raise an error here to maintain compatibility with + # the third-party tools that use the `step` method without + # providing the closure in training loop, e.g., HuggingFace Transformers + return torch.FloatTensor([torch.nan]) + return self.kseed_zeroth_order_step(closure) + + def kseed_zeroth_order_step(self, closure: Callable[[], torch.FloatTensor]) -> torch.FloatTensor: + """ + Performs a single optimization step. + + 1. Sample a random seed for sampling z + 2. Perturb the parameters with the random direction(-z * eps, z * eps) for evaluating the model on the batch, and compute the loss(loss1, loss2) + 3. 
Compute the directional derivative value: grad_projected_value = (loss_right - loss_left) / (2 * eps) + 4. Perform the directional derivative step update for the parameters of the model along the random direction z with the learning rate lr and the step size grad_projected_value + + + Input: + - closure (callable, optional): A closure that reevaluates the model and returns the loss. + """ + if closure is None: + raise ValueError("closure must not be None") + + # sample the random seed for sampling z for perturbing parameters. + seed = self.sample() + directional_derivative_value, loss_right, loss_left = self.zeroth_order_step(seed, closure) + if math.isnan(directional_derivative_value): + return directional_derivative_value + + # record the directional_derivative_value for the seed + self.directional_derivative_history[seed].append(directional_derivative_value.item()) + + return loss_right # TODO: return loss_left or loss_right or average of both? diff --git a/python/fate_llm/fedkseed/pytorch_utils.py b/python/fate_llm/fedkseed/pytorch_utils.py new file mode 100644 index 0000000..b9129b4 --- /dev/null +++ b/python/fate_llm/fedkseed/pytorch_utils.py @@ -0,0 +1,51 @@ +from typing import List + +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.trainer_pt_utils import get_parameter_names + + +def get_decay_parameter_names(model) -> List[str]: + """ + Get all parameter names that weight decay will be applied to + + Note that some models implement their own layernorm instead of calling nn.LayerNorm, weight decay could still + apply to those modules since this function only filter out instance of nn.LayerNorm + + NOTE: This function is copied from transformers + # Copyright 2020-present the HuggingFace Inc. team. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ """ + decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + return decay_parameters + + +def get_optimizer_parameters_grouped_with_decay(model, weight_decay: float) -> List[dict]: + """ + Get the parameters grouped by whether they should have weight decay applied + """ + decay_parameters = get_decay_parameter_names(model) + params_no_decay = [] + params_decay = [] + for n, p in model.named_parameters(): + if p.requires_grad: + if n in decay_parameters: + params_decay.append(p) + else: + params_no_decay.append(p) + grouped_parameters_with_decay = [ + {"params": params_no_decay, "weight_decay": 0.0}, + {"params": params_decay, "weight_decay": weight_decay}, + ] + return grouped_parameters_with_decay diff --git a/python/fate_llm/fedkseed/trainer.py b/python/fate_llm/fedkseed/trainer.py new file mode 100644 index 0000000..d7872ff --- /dev/null +++ b/python/fate_llm/fedkseed/trainer.py @@ -0,0 +1,131 @@ +import logging +from typing import Dict, Union, Any, Tuple +from typing import Optional, List, Callable + +import torch +from torch import nn +from torch.utils.data import Dataset +from transformers import PreTrainedModel, PreTrainedTokenizerBase, EvalPrediction, DataCollator +from transformers import Trainer, TrainingArguments +from transformers.optimization import get_scheduler, SchedulerType +from transformers.trainer_callback import TrainerCallback + +from fate_llm.fedkseed.args import KSeedTrainingArguments +from fate_llm.fedkseed.optimizer import KSeedZerothOrderOptimizer +from fate_llm.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay + +logger = logging.getLogger(__name__) + + +class KSeedZOExtendedTrainer(Trainer): + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + args: Union["KSeedTrainingArguments", TrainingArguments] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ): + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + self.args = args + self._kseed_optimizer = None + + self._seed_candidates = None + self._seed_probabilities = None + + def configure_seed_candidates(self, seed_candidates: torch.LongTensor, seed_probabilities: torch.FloatTensor): + self._seed_candidates = seed_candidates + self._seed_probabilities = seed_probabilities + + def get_directional_derivative_history(self): + """ + hook to get the directional derivative history + """ + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + if self._kseed_optimizer is None: + raise ValueError("KSeedZerothOrderOptimizer is not configured") + return self._kseed_optimizer.directional_derivative_history + else: + raise 
ValueError("KSeedZerothOrderOptimizer is not configured") + + @staticmethod + def k_seed_zo_mode(args): + return hasattr(args, "zo_optim") and args.zo_optim + + def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + """ + hook to do the step with KSeedZerothOrderOptimizer + """ + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + if self._kseed_optimizer is None: + raise ValueError("KSeedZerothOrderOptimizer is not configured") + + model.eval() + inputs = self._prepare_inputs(inputs) + + with self.compute_loss_context_manager(): + # zeroth order optimization needs forward pass twice in an optimization step, + # so we need to wrap the forward pass in a closure + def closure() -> torch.FloatTensor: + with torch.no_grad(): + return self.compute_loss(model, inputs, return_outputs=False).detach() + + # we don't use step() method of KSeedZerothOrderOptimizer here + # because `Trainer` wraps the optimizer that is subclass of `torch.optim.Optimizer` and + # returns nothing from the step method + with torch.no_grad(): + loss = self._kseed_optimizer.kseed_zeroth_order_step(closure=closure) + return loss.detach() + else: + return super().training_step(model, inputs) + + def create_optimizer_and_scheduler(self, num_training_steps: int): + """ + hook to add KSeedZerothOrderOptimizer + """ + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + + if self._seed_candidates is None or self._seed_probabilities is None: + raise ValueError("Seed candidates and probabilities are not configured.") + + optimizer_grouped_parameters = get_optimizer_parameters_grouped_with_decay( + self.model, self.args.weight_decay + ) + self.optimizer = KSeedZerothOrderOptimizer( + optimizer_grouped_parameters, + seed_candidates=self._seed_candidates, + seed_probabilities=self._seed_probabilities, + lr=self.args.learning_rate, + eps=self.args.eps, + weight_decay=self.args.weight_decay, + grad_clip=self.args.grad_clip, + ) + # we need to keep the reference to the original optimizer to use it in training_step + self._kseed_optimizer = self.optimizer + # if we use learning rate scheduler, we may need to preserve all updates instead of the aggregated one + self.lr_scheduler = get_scheduler( + name=SchedulerType.CONSTANT, + optimizer=self.optimizer, + num_warmup_steps=self.args.warmup_steps, + num_training_steps=num_training_steps, + ) + else: + super().create_optimizer_and_scheduler(num_training_steps) diff --git a/python/fate_llm/fedkseed/zo_utils.py b/python/fate_llm/fedkseed/zo_utils.py new file mode 100644 index 0000000..9aa9496 --- /dev/null +++ b/python/fate_llm/fedkseed/zo_utils.py @@ -0,0 +1,68 @@ +from typing import List + +import torch + + +def probability_from_amps(amps: List[List[float]], clip): + """ + Get the probability distribution from the amplitude history + + formula: amp_i = clamp(amp_i, -clip, clip).abs().mean() + amp_i = (amp_i - min(amp)) / (max(amp) - min(amp)) + prob_i = softmax(amp)_i + + :param amps: list of amplitude history + :param clip: the clipping value + :return: + """ + amps = [torch.Tensor(amp) for amp in amps] + amp = torch.stack([amp.clamp_(-clip, clip).abs_().mean() for amp in amps]) + return (amp - amp.min()).div_(amp.max() - amp.min() + 1e-10).softmax(0) + + +def directional_derivative_step( + param_groups: List[dict], + directional_derivative_seed: int, + directional_derivative_value: torch.FloatTensor, + lr: float = None, + weight_decay: float = None, +) -> torch.FloatTensor: + """ + perform a step update for the parameters of the model 
+ along the random direction z with the learning rate lr and the step size grad_projected_value + + Input: + - param_groups (List[dict]): list of parameter groups + - directional_derivative_seed (int): seed for the random direction + - directional_derivative_value (torch.FloatTensor): the step size + - lr (float, optional): learning rate + - weight_decay (float, optional): weight decay + """ + + torch.manual_seed(directional_derivative_seed) + for param_group in param_groups: + weight_decay = param_group["weight_decay"] if weight_decay is None else weight_decay + lr = param_group["lr"] if lr is None else lr + for param in param_group["params"]: + z = torch.normal(mean=0, std=1, size=param.data.size(), device=param.data.device, dtype=param.data.dtype) + if weight_decay is not None: + param.data = param.data - lr * (directional_derivative_value * z + weight_decay * param.data) + + else: + param.data = param.data - lr * (directional_derivative_value * z) + + return directional_derivative_value + + +def build_seed_candidates(k, low=0, high=2**32): + """ + Build seed candidates for the random walk optimizer + """ + return torch.randint(low, high, size=(k,), dtype=torch.long) + + +def get_even_seed_probabilities(k): + """ + Get the even seed probabilities, i.e., 1/k for each seed + """ + return torch.ones(k) / k From 3b77e4d37c094c416805a04fb2522a914146cf1e Mon Sep 17 00:00:00 2001 From: sagewe Date: Thu, 22 Feb 2024 18:08:31 +0800 Subject: [PATCH 09/35] feat: add fedkseed runner (#47) Signed-off-by: sagewe --- python/fate_llm/fedkseed/fedkseed.py | 25 ++-- python/fate_llm/fedkseed/trainer.py | 17 +-- python/fate_llm/runner/fedkseed_runner.py | 137 ++++++++++++++++++++++ 3 files changed, 159 insertions(+), 20 deletions(-) create mode 100644 python/fate_llm/runner/fedkseed_runner.py diff --git a/python/fate_llm/fedkseed/fedkseed.py b/python/fate_llm/fedkseed/fedkseed.py index 49bfe5d..532f808 100644 --- a/python/fate_llm/fedkseed/fedkseed.py +++ b/python/fate_llm/fedkseed/fedkseed.py @@ -6,23 +6,20 @@ import torch from fate.arch.context import Context -from fate_llm.fedkseed.args import KSeedTrainingArguments from fate_llm.fedkseed.pytorch_utils import get_optimizer_parameters_grouped_with_decay from fate_llm.fedkseed.trainer import KSeedZOExtendedTrainer from fate_llm.fedkseed.zo_utils import probability_from_amps, directional_derivative_step, get_even_seed_probabilities +from fate_llm.fedkseed.args import KSeedTrainingArguments logger = logging.getLogger(__name__) class Trainer: def __init__( - self, ctx: Context, seed_candidates: torch.LongTensor, args, data_collator, train_dataset, eval_dataset + self, ctx: Context, seed_candidates: torch.LongTensor, args ): self.ctx = ctx self.args = args - self.data_collator = data_collator - self.train_dataset = train_dataset - self.eval_dataset = eval_dataset self.seed_candidates = seed_candidates self.k = len(seed_candidates) @@ -86,19 +83,21 @@ def evaluate(self): class ClientTrainer: - def __init__(self, ctx: Context, model, args, train_dataset, eval_dataset, data_collator, tokenizer): + def __init__(self, ctx: Context, model, fedkseed_args, training_args, train_dataset, eval_dataset, data_collator, + tokenizer): self.ctx = ctx - self.args = args + self.fedkseed_args = fedkseed_args + self.training_args = training_args self.data_collator = data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer - self.weight_decay = args.weight_decay + self.weight_decay = training_args.weight_decay self.model_0 = 
model def serve_loop(self): - for i, sub_ctx in self.ctx.ctxs_range(self.args.num_aggregations): + for i, sub_ctx in self.ctx.ctxs_range(self.fedkseed_args.num_aggregations): # step1: wait for the server to send the seed candidates and probabilities or exit signal logger.info(f"training loop started: {i}") should_exit, kwargs = sub_ctx.arbiter.get("train_once") @@ -122,19 +121,21 @@ def serve_loop(self): def train_once(self, seed_candidates, seed_probabilities, direction_derivative_sum) -> Mapping[int, List[float]]: # build model model = copy.deepcopy(self.model_0) - model.to(self.args.device) + model.to(self.training_args.device) if direction_derivative_sum is not None: param_groups = get_optimizer_parameters_grouped_with_decay(model, self.weight_decay) for seed, grad in direction_derivative_sum.items(): if grad != 0.0: directional_derivative_step( - param_groups, seed, grad, lr=self.args.learning_rate, weight_decay=self.args.weight_decay + param_groups, seed, grad, lr=self.training_args.learning_rate, + weight_decay=self.training_args.weight_decay ) # train trainer = KSeedZOExtendedTrainer( model=model, - args=self.args, + training_args=self.training_args, + kseed_args=self.fedkseed_args, tokenizer=self.tokenizer, data_collator=self.data_collator, train_dataset=self.train_dataset, diff --git a/python/fate_llm/fedkseed/trainer.py b/python/fate_llm/fedkseed/trainer.py index d7872ff..7a1b5cb 100644 --- a/python/fate_llm/fedkseed/trainer.py +++ b/python/fate_llm/fedkseed/trainer.py @@ -21,7 +21,8 @@ class KSeedZOExtendedTrainer(Trainer): def __init__( self, model: Union[PreTrainedModel, nn.Module] = None, - args: Union["KSeedTrainingArguments", TrainingArguments] = None, + training_args: TrainingArguments = None, + kseed_args: "KSeedTrainingArguments" = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, @@ -34,7 +35,7 @@ def __init__( ): super().__init__( model=model, - args=args, + args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, @@ -45,7 +46,7 @@ def __init__( optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) - self.args = args + self.kseed_args = kseed_args self._kseed_optimizer = None self._seed_candidates = None @@ -59,7 +60,7 @@ def get_directional_derivative_history(self): """ hook to get the directional derivative history """ - if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._kseed_optimizer is None: raise ValueError("KSeedZerothOrderOptimizer is not configured") return self._kseed_optimizer.directional_derivative_history @@ -74,7 +75,7 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, """ hook to do the step with KSeedZerothOrderOptimizer """ - if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._kseed_optimizer is None: raise ValueError("KSeedZerothOrderOptimizer is not configured") @@ -101,7 +102,7 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): """ hook to add KSeedZerothOrderOptimizer """ - if KSeedZOExtendedTrainer.k_seed_zo_mode(self.args): + if KSeedZOExtendedTrainer.k_seed_zo_mode(self.kseed_args): if self._seed_candidates is None or self._seed_probabilities is None: raise ValueError("Seed candidates and probabilities are not configured.") @@ -114,9 +115,9 @@ def 
create_optimizer_and_scheduler(self, num_training_steps: int): seed_candidates=self._seed_candidates, seed_probabilities=self._seed_probabilities, lr=self.args.learning_rate, - eps=self.args.eps, + eps=self.kseed_args.eps, weight_decay=self.args.weight_decay, - grad_clip=self.args.grad_clip, + grad_clip=self.kseed_args.grad_clip, ) # we need to keep the reference to the original optimizer to use it in training_step self._kseed_optimizer = self.optimizer diff --git a/python/fate_llm/runner/fedkseed_runner.py b/python/fate_llm/runner/fedkseed_runner.py new file mode 100644 index 0000000..4d30e8c --- /dev/null +++ b/python/fate_llm/runner/fedkseed_runner.py @@ -0,0 +1,137 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Dict +from typing import Literal +from typing import Optional + +from fate.components.components.nn.nn_runner import ( + NNRunner, + load_model_dict_from_path, + dir_warning, + loader_load_from_conf, +) +from fate.components.components.nn.runner.homo_default_runner import DefaultRunner +from transformers.trainer_utils import get_last_checkpoint + +from fate_llm.fedkseed.fedkseed import Trainer, FedKSeedTrainingArguments, ClientTrainer +from fate_llm.fedkseed.zo_utils import build_seed_candidates +from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments + +logger = logging.getLogger(__name__) + +SUPPORTED_ALGO = ["fedkseed"] + + +class FedKSeedRunner(DefaultRunner): + def __init__( + self, + algo: str = "fedkseed", + model_conf: Optional[Dict] = None, + dataset_conf: Optional[Dict] = None, + optimizer_conf: Optional[Dict] = None, + training_args_conf: Optional[Dict] = None, + fed_args_conf: Optional[Dict] = None, + data_collator_conf: Optional[Dict] = None, + tokenizer_conf: Optional[Dict] = None, + task_type: Literal["causal_lm", "other"] = "causal_lm", + local_mode: bool = False, + save_trainable_weights_only: bool = False, + ) -> None: + super(NNRunner, self).__init__() + self.algo = algo + self.model_conf = model_conf + self.dataset_conf = dataset_conf + self.optimizer_conf = optimizer_conf + self.training_args_conf = training_args_conf + self.fed_args_conf = fed_args_conf + self.data_collator_conf = data_collator_conf + self.local_mode = local_mode + self.tokenizer_conf = tokenizer_conf + self.task_type = task_type + self.save_trainable_weights_only = save_trainable_weights_only + + # check param + if self.algo not in SUPPORTED_ALGO: + raise ValueError(f"algo should be one of {SUPPORTED_ALGO}") + if self.task_type not in ["causal_lm", "others"]: + raise ValueError("task_type should be one of [binary, multi, regression, others]") + assert isinstance(self.local_mode, bool), "local should be bool" + + # setup var + self.trainer = None + self.training_args = None + + def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): + if self.algo != "fedkseed": + raise ValueError(f"algo {self.algo} not supported") + + ctx = 
self.get_context() + + model = loader_load_from_conf(self.model_conf) + if model is None: + raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") + + if output_dir is None: + output_dir = "./" + + resume_path = None + if saved_model is not None: + model_dict = load_model_dict_from_path(saved_model) + model.load_state_dict(model_dict) + logger.info(f"loading model dict from {saved_model} to model done") + if get_last_checkpoint(saved_model) is not None: + resume_path = saved_model + logger.info(f"checkpoint detected, resume_path set to {resume_path}") + + data_collator = loader_load_from_conf(self.data_collator_conf) + # load tokenizer if import conf provided + tokenizer = loader_load_from_conf(self.tokenizer_conf) + # args + dir_warning(self.training_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + self.training_args = training_args + # reset to default, saving to arbitrary path is not allowed in + # DefaultRunner + training_args.output_dir = output_dir + training_args.resume_from_checkpoint = resume_path # resume path + + fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + trainer = ClientTrainer( + ctx=ctx, + model=model, + training_args=training_args, + fedkseed_args=fedkseed_args, + data_collator=data_collator, + tokenizer=tokenizer, + train_dataset=train_set, + eval_dataset=validate_set, + ) + return trainer + + def server_setup(self, stage="train"): + + if self.algo != "fedkseed": + raise ValueError(f"algo {self.algo} not supported") + ctx = self.get_context() + + fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + + seed_candidates = build_seed_candidates(fedkseed_args.k, low=0, high=2 ** 32) + trainer = Trainer(ctx=ctx, seed_candidates=seed_candidates, args=training_args) + return trainer From 40ecdeb57c59553d42d28640e656d83cd3626124 Mon Sep 17 00:00:00 2001 From: sagewe Date: Thu, 22 Feb 2024 18:49:55 +0800 Subject: [PATCH 10/35] fix: refactor args (#47) Signed-off-by: sagewe --- python/fate_llm/fedkseed/args.py | 6 +----- python/fate_llm/fedkseed/fedkseed.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/fate_llm/fedkseed/args.py b/python/fate_llm/fedkseed/args.py index e5d3eb9..17db3f9 100644 --- a/python/fate_llm/fedkseed/args.py +++ b/python/fate_llm/fedkseed/args.py @@ -1,12 +1,8 @@ from dataclasses import dataclass, field -from transformers import TrainingArguments -from transformers.utils import add_start_docstrings - @dataclass -@add_start_docstrings(TrainingArguments.__doc__) -class KSeedTrainingArguments(TrainingArguments): +class KSeedTrainingArguments: """ TrainingArguments is the subset of the arguments we use in our example scripts, they are the arguments that diff --git a/python/fate_llm/fedkseed/fedkseed.py b/python/fate_llm/fedkseed/fedkseed.py index 532f808..d2dfb0e 100644 --- a/python/fate_llm/fedkseed/fedkseed.py +++ b/python/fate_llm/fedkseed/fedkseed.py @@ -96,7 +96,7 @@ def __init__(self, ctx: Context, model, fedkseed_args, training_args, train_data self.weight_decay = training_args.weight_decay self.model_0 = model - def serve_loop(self): + def train(self): for i, sub_ctx in self.ctx.ctxs_range(self.fedkseed_args.num_aggregations): # step1: wait for the server to send the seed candidates and probabilities or exit signal logger.info(f"training loop started: {i}") From 
321f42638f7ed8dd077e51f020c97f693d7a8783 Mon Sep 17 00:00:00 2001 From: cwj Date: Thu, 22 Feb 2024 23:40:30 +0800 Subject: [PATCH 11/35] Fix ot bug Signed-off-by: weijingchen Signed-off-by: cwj --- .../data_collator/__init__.py | 0 .../data_collator}/cust_data_collator.py | 0 .../{dataset => data}/tokenizers/__init__.py | 0 .../offsite_tuning/offsite_tuning_model.py | 357 ++++++++---------- python/fate_llm/runner/homo_seq2seq_runner.py | 2 +- 5 files changed, 164 insertions(+), 195 deletions(-) rename python/fate_llm/{dataset => data}/data_collator/__init__.py (100%) rename python/fate_llm/{cust_func => data/data_collator}/cust_data_collator.py (100%) rename python/fate_llm/{dataset => data}/tokenizers/__init__.py (100%) diff --git a/python/fate_llm/dataset/data_collator/__init__.py b/python/fate_llm/data/data_collator/__init__.py similarity index 100% rename from python/fate_llm/dataset/data_collator/__init__.py rename to python/fate_llm/data/data_collator/__init__.py diff --git a/python/fate_llm/cust_func/cust_data_collator.py b/python/fate_llm/data/data_collator/cust_data_collator.py similarity index 100% rename from python/fate_llm/cust_func/cust_data_collator.py rename to python/fate_llm/data/data_collator/cust_data_collator.py diff --git a/python/fate_llm/dataset/tokenizers/__init__.py b/python/fate_llm/data/tokenizers/__init__.py similarity index 100% rename from python/fate_llm/dataset/tokenizers/__init__.py rename to python/fate_llm/data/tokenizers/__init__.py diff --git a/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py b/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py index 6bc6b47..dd48d41 100644 --- a/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py +++ b/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py @@ -12,202 +12,171 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -import torch as t -from torch import nn -from federatedml.util import LOGGER -from transformers import AutoModel -import numpy as np - - - -def get_dropout_emulator_and_adapters( - transformer_layers: nn.ModuleList, - emulator_layer_num: int, - adapter_top_layer_num: int, - adapter_bottom_layer_num: int): - - assert adapter_bottom_layer_num > 0 and adapter_top_layer_num > 0, "adapter layer num must be greater than 0" - assert emulator_layer_num < len( - transformer_layers), "emulator layer num must be less than the number of transformer layers" - assert adapter_bottom_layer_num + adapter_top_layer_num < len( - transformer_layers), "adapter layer num must be less than the number of transformer layers" - assert emulator_layer_num < len( - transformer_layers) and emulator_layer_num > 0, "emulator layer num must be less than the number of transformer layers" - - bottom_idx = adapter_bottom_layer_num - top_idx = len(transformer_layers) - adapter_top_layer_num - bottom_layers = transformer_layers[:bottom_idx] - top_layers = transformer_layers[top_idx:] - kept_layers = transformer_layers[bottom_idx:top_idx] - emulator = nn.ModuleList() - stride = (len(kept_layers) - 1) / (emulator_layer_num - 1) - - layer_idx = [] - for i in range(emulator_layer_num): - idx = int(round(i * stride)) - layer_idx.append(idx) - emulator.append(kept_layers[idx]) - LOGGER.info( - 'take layer {} of the original model as the emulator'.format( - t.Tensor(layer_idx) + - bottom_idx)) - return nn.ModuleList(emulator), nn.ModuleList( - bottom_layers), nn.ModuleList(top_layers) - - - -def split_numpy_array(embedding_matrix, n, suffix): - # Calculate the indices where the splits should occur - embedding_matrix = embedding_matrix['weight'] - indices = np.linspace(0, embedding_matrix.shape[0], n+1, dtype=int) - - # Split the embedding matrix at the calculated indices - slices = [embedding_matrix[indices[i]:indices[i+1]] for i in range(n)] - - # Create a dictionary with the slices - result_dict = {suffix+str(i): slice for i, slice in enumerate(slices)} - return result_dict - - -def recover_numpy_array(slices_dict, suffix=""): - # Get the slices from the dictionary and concatenate them - slices = [slices_dict[suffix + str(i)] for i in range(len(slices_dict))] - complete_array = np.concatenate(slices, axis=0) - return {'weight': complete_array} - - -class OffsiteTuningBaseModel(t.nn.Module): - - def __init__(self, emulator_layer_num: int, adapter_top_layer_num: int = 2, - adapter_bottom_layer_num: int = 2, fp16_mix_precision=False): - super().__init__() - self.fp16_mix_precision = fp16_mix_precision - self.model = self.get_base_model() - self.initialize_model() - self.emulator, self.adapter_bottom, self.adapter_top = get_dropout_emulator_and_adapters( - transformer_layers=self.get_model_transformer_blocks(self.model), - emulator_layer_num=emulator_layer_num, - adapter_top_layer_num=adapter_top_layer_num, - adapter_bottom_layer_num=adapter_bottom_layer_num - ) - self.post_initialization() - - def initialize_model(self): - if self.fp16_mix_precision: - self.model.half() - for param in self.model.parameters(): - param.requires_grad = False - - def post_initialization(self): - pass - - def get_adapter_top(self): - return self.adapter_top - def get_adapter_bottom(self): - return self.adapter_bottom - - def get_emulator(self): - return self.emulator +from fate.components.components.nn.nn_runner import ( + load_model_dict_from_path, + dir_warning, + loader_load_from_conf, + run_dataset_func, +) +from fate.ml.nn.homo.fedavg import 
FedAVGArguments +from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer +from typing import Dict +from fate.components.components.nn.loader import Loader +from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments +from typing import Union, Optional +from transformers.trainer_utils import get_last_checkpoint +from typing import Literal +import logging +from fate.arch.dataframe import DataFrame +from fate_llm.runner.homo_seq2seq_runner import Seq2SeqRunner, _check_instances +from fate_llm.homo.offsite_tuning import OffsiteTuningTrainerClient, OffsiteTuningTrainerServer + + +logger = logging.getLogger(__name__) + + +class OTRunner(Seq2SeqRunner): + + def __init__( + self, + model_conf: Optional[Dict] = None, + dataset_conf: Optional[Dict] = None, + optimizer_conf: Optional[Dict] = None, + training_args_conf: Optional[Dict] = None, + fed_args_conf: Optional[Dict] = None, + data_collator_conf: Optional[Dict] = None, + tokenizer_conf: Optional[Dict] = None, + task_type: Literal["causal_lm", "other"] = "causal_lm", + save_trainable_weights_only: bool = False, + aggregate_model: bool = False, + algo: str = 'ot' + ) -> None: + super(OTRunner, self).__init__( + algo, model_conf, dataset_conf, optimizer_conf, training_args_conf, fed_args_conf, + data_collator_conf, tokenizer_conf, task_type, local_mode=False + ) - def get_additional_param_state_dict(self): - # get parameter of additional parameter - return {} + self.aggregate_model = aggregate_model + self.save_trainable_weights_only = save_trainable_weights_only + + def setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): + + if stage == "predict": + self.local_mode = True + + ctx = self.get_context() + model = loader_load_from_conf(self.model_conf) + + if model is None: + raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") + + if output_dir is None: + output_dir = "./" + + resume_path = None + if saved_model is not None: + model_dict = load_model_dict_from_path(saved_model) + model.load_state_dict(model_dict) + logger.info(f"loading model dict from {saved_model} to model done") + if get_last_checkpoint(saved_model) is not None: + resume_path = saved_model + logger.info(f"checkpoint detected, resume_path set to {resume_path}") + + # load optimizer + if self.optimizer_conf: + optimizer_loader = Loader.from_dict(self.optimizer_conf) + optimizer_ = optimizer_loader.load_item() + optimizer_params = optimizer_loader.kwargs + optimizer = optimizer_(model.parameters(), **optimizer_params) + else: + optimizer = None + # load collator func + data_collator = loader_load_from_conf(self.data_collator_conf) + # load tokenizer if import conf provided + tokenizer = loader_load_from_conf(self.tokenizer_conf) + # args + dir_warning(self.training_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + self.training_args = training_args + # reset to default, saving to arbitrary path is not allowed in + # DefaultRunner + training_args.output_dir = output_dir + training_args.resume_from_checkpoint = resume_path # resume path + fed_args = FedAVGArguments(**self.fed_args_conf) + + # prepare trainer + if self.is_client(): + trainer = OffsiteTuningTrainerClient( + ctx=ctx, + model=model, + optimizer=optimizer, + training_args=training_args, + fed_args=fed_args, + data_collator=data_collator, + tokenizer=tokenizer, + train_set=train_set, + val_set=validate_set, + save_trainable_weights_only=self.save_trainable_weights_only, + 
aggregate_model=self.aggregate_model + ) + + elif self.is_server(): + trainer = OffsiteTuningTrainerServer( + ctx=ctx, + model=model, + aggregate_model=self.aggregate_model + ) + + _check_instances( + trainer=trainer, + model=model, + optimizer=optimizer, + train_args=training_args, + fed_args=fed_args, + data_collator=data_collator, + ) - def load_additional_param_state_dict(self, submodel_weights: dict): - # load additional weights: - pass + return trainer - def _get_numpy_arr(self, v): - if v.dtype == t.bfloat16: - # float 32 - v = v.detach().cpu().float().numpy() + def server_setup(self, stage="train"): + if stage == "predict": + self.local_mode = True + if self.algo == "fedavg": + server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer else: - v = v.detach().cpu().numpy() - - return v - - - def load_numpy_state_dict(self, module_dict, state_dict): - param_dict = module_dict - - for k, v in param_dict.items(): - if k not in state_dict: - continue - addition_weights = { - k: t.tensor(v) for k, - v in state_dict[k].items()} - v.load_state_dict(addition_weights) - - def get_numpy_state_dict(self, module_dict): - - weight_dict = {} - for k, v in module_dict.items(): - weight_dict[k] = { - k: self._get_numpy_arr(v) for k, - v in v.state_dict().items()} - return weight_dict - - def get_submodel_weights(self) -> dict: - submodel_weights = { - "emulator": { - k: self._get_numpy_arr(v) for k, - v in self.get_emulator().state_dict().items()}, - "adapter_top": { - k: self._get_numpy_arr(v) for k, - v in self.get_adapter_top().state_dict().items()}, - "adapter_bottom": { - k: self._get_numpy_arr(v) for k, - v in self.get_adapter_bottom().state_dict().items()}} - addition_weights = self.get_additional_param_state_dict() - submodel_weights.update(addition_weights) - return submodel_weights - - def load_submodel_weights(self, submodel_weights: dict): - - emulator_weights = { - k: t.tensor(v) for k, - v in submodel_weights['emulator'].items()} - adapter_top_weights = { - k: t.tensor(v) for k, - v in submodel_weights['adapter_top'].items()} - adapter_bottom_weights = { - k: t.tensor(v) for k, - v in submodel_weights['adapter_bottom'].items()} - - emulator = self.get_emulator() - adapter_top = self.get_adapter_top() - adapter_bottom = self.get_adapter_bottom() - - emulator.load_state_dict(emulator_weights) - adapter_top.load_state_dict(adapter_top_weights) - adapter_bottom.load_state_dict(adapter_bottom_weights) - self.load_additional_param_state_dict(submodel_weights) - - def forward(self, **kwargs): - raise NotImplementedError() - - def get_base_model(self): - raise NotImplementedError() - - def get_model_transformer_blocks(self, model: t.nn.Module): - raise NotImplementedError() - - -class OffsiteTuningMainModel(OffsiteTuningBaseModel): - - def post_initialization(self): - pass - - -class OffsiteTuningSubModel(OffsiteTuningBaseModel): - - def post_initialization(self): - # mix precision model training - for param in self.adapter_top.parameters(): - param.data = param.data.float() - param.requires_grad = True - for param in self.adapter_bottom.parameters(): - param.data = param.data.float() - param.requires_grad = True \ No newline at end of file + raise ValueError(f"algo {self.algo} not supported") + ctx = self.get_context() + trainer = server_class(ctx=ctx, local_mode=self.local_mode) + _check_instances(trainer) + return trainer + + + def train( + self, + train_data: Optional[Union[str, DataFrame]] = None, + validate_data: Optional[Union[str, DataFrame]] = None, + output_dir: str = None, + 
saved_model_path: str = None, + ): + + if self.is_client(): + train_set = self._prepare_data(train_data, "train_data") + validate_set = self._prepare_data(validate_data, "val_data") + trainer = self.setup( + train_set=train_set, validate_set=validate_set, output_dir=output_dir, saved_model=saved_model_path + ) + self.trainer = trainer + trainer.train() + + elif self.is_server(): + trainer = self.setup( + train_set=None, validate_set=None, output_dir=output_dir, saved_model=saved_model_path + ) + trainer.train() + + if output_dir is not None: + if self.training_args.deepspeed and self.training_args.local_rank != 0: + pass + else: + trainer.save_model(output_dir) diff --git a/python/fate_llm/runner/homo_seq2seq_runner.py b/python/fate_llm/runner/homo_seq2seq_runner.py index ab0bdc1..e5146c7 100644 --- a/python/fate_llm/runner/homo_seq2seq_runner.py +++ b/python/fate_llm/runner/homo_seq2seq_runner.py @@ -40,7 +40,7 @@ logger = logging.getLogger(__name__) -SUPPORTED_ALGO = ["fedavg"] +SUPPORTED_ALGO = ["fedavg", "ot"] def _check_instances( From 69bc8a628822a95022bedf475cddca526424276d Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Fri, 23 Feb 2024 10:45:50 +0800 Subject: [PATCH 12/35] support qwen Signed-off-by: mgqa34 --- python/fate_llm/model_zoo/pellm/chatglm.py | 15 +++----- python/fate_llm/model_zoo/pellm/llama.py | 15 ++++---- .../pellm/parameter_efficient_llm.py | 16 ++++++--- python/fate_llm/model_zoo/pellm/qwen.py | 36 +++++++++++++++++++ 4 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 python/fate_llm/model_zoo/pellm/qwen.py diff --git a/python/fate_llm/model_zoo/pellm/chatglm.py b/python/fate_llm/model_zoo/pellm/chatglm.py index d49c5be..ded8dc0 100644 --- a/python/fate_llm/model_zoo/pellm/chatglm.py +++ b/python/fate_llm/model_zoo/pellm/chatglm.py @@ -18,21 +18,22 @@ class ChatGLM(PELLM): - enable_save_pretrained = True - def __init__(self, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, pre_seq_len: int = None, - prefix_projection: bool = False) -> None: + prefix_projection: bool = False, + **kwargs) -> None: self.pre_seq_len = pre_seq_len self.prefix_projection = prefix_projection super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, - peft_config=peft_config) + peft_config=peft_config, + **kwargs + ) def init_config(self): self.config = AutoConfig.from_pretrained( @@ -40,12 +41,6 @@ def init_config(self): self.config.pre_seq_len = self.pre_seq_len self.config.prefix_projection = self.prefix_projection - def init_base_lm(self): - super( - ChatGLM, - self).init_base_lm( - trust_remote_code=True) - def add_peft(self): if self.pre_seq_len: self._pe_lm.half() diff --git a/python/fate_llm/model_zoo/pellm/llama.py b/python/fate_llm/model_zoo/pellm/llama.py index a2c53c3..a28c29f 100644 --- a/python/fate_llm/model_zoo/pellm/llama.py +++ b/python/fate_llm/model_zoo/pellm/llama.py @@ -21,23 +21,26 @@ class LLaMa(PELLM): config_class = LlamaConfig - enable_save_pretrained = True def __init__(self, pretrained_path: str = None, peft_type: str = None, - peft_config: dict = None) -> None: + peft_config: dict = None, + **kwargs) -> None: super().__init__(pretrained_path=pretrained_path, peft_type=peft_type, - peft_config=peft_config) + peft_config=peft_config, + **kwargs) - def init_base_lm(self): + def init_base_lm(self, **kwargs): if self.config is not None: self._pe_lm = LlamaForCausalLM.from_pretrained(self.config_path, - config=self.config) + config=self.config, + torch_dtype=self.torch_dtype, + **kwargs) elif 
self.config_path is not None: - self._pe_lm = LlamaForCausalLM.from_pretrained(self.config_path) + self._pe_lm = LlamaForCausalLM.from_pretrained(self.config_path, torch_dtype=self.torch_dtype, **kwargs) else: raise ValueError( 'config_path to pretrained model folder cannot be None') diff --git a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py index 73c580e..dbc01fc 100644 --- a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py +++ b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py @@ -42,6 +42,8 @@ def __init__(self, pretrained_path: str = None, peft_type: str = None, peft_config: dict = None, + torch_dtype: str = None, + trust_remote_code: bool = False, **kwargs ) -> None: @@ -51,6 +53,8 @@ def __init__(self, self.config_path = pretrained_path self.peft_type = peft_type self.peft_config = peft_config + self.torch_dtype = None if not torch_dtype else getattr(torch, torch_dtype) + self.trust_remote_code = trust_remote_code assert self.config_path is not None or self.config is not None, \ "At least one of config_path and config must be set." @@ -62,12 +66,12 @@ def _init_pelm(self, **kwargs): def init_lm_with_peft(self, **kwargs): self.init_config(**kwargs) - self.init_base_lm() + self.init_base_lm(**kwargs) self.add_peft() def init_config(self, **kwargs): if self.config_path is not None: - self.config = AutoConfig.from_pretrained(self.config_path) + self.config = AutoConfig.from_pretrained(self.config_path, trust_remote_code=self.trust_remote_code) elif self.config is not None and self.config_class is not None: self.config = self.config_class().from_dict(self.config) else: @@ -82,10 +86,14 @@ def init_base_lm(self, **kwargs): model_loader = self.model_loader if self.model_loader is not None else AutoModel if self.config is not None: self._pe_lm = model_loader.from_pretrained( - self.config_path, config=self.config, **kwargs) + self.config_path, config=self.config, + torch_dtype=self.torch_dtype, **kwargs, + trust_remote_code=self.trust_remote_code + ) elif self.config_path is not None: self._pe_lm = model_loader.from_pretrained( - self.config_path, **kwargs) + self.config_path, torch_dtype=self.torch_dtype, + trust_remote_code=self.trust_remote_code, **kwargs) else: raise ValueError( 'config_path to pretrained model folder cannot be None') diff --git a/python/fate_llm/model_zoo/pellm/qwen.py b/python/fate_llm/model_zoo/pellm/qwen.py new file mode 100644 index 0000000..cf3f292 --- /dev/null +++ b/python/fate_llm/model_zoo/pellm/qwen.py @@ -0,0 +1,36 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
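The torch_dtype hook added above is deliberately a plain string: PELLM resolves it once with getattr(torch, torch_dtype) and forwards the resulting dtype to every from_pretrained call, while trust_remote_code is forwarded to both AutoConfig and the model loader. A small sanity sketch of that resolution (nothing below is part of the patch itself):

import torch

# the string names map directly onto torch dtype attributes
assert getattr(torch, "float16") is torch.float16
assert getattr(torch, "bfloat16") is torch.bfloat16

# an empty or missing value keeps torch_dtype=None, so from_pretrained
# falls back to the dtype stored in the checkpoint config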
+# +from transformers import Qwen2Config +from transformers import Qwen2ForCausalLM +from fate_llm.model_zoo.pellm.parameter_efficient_llm import PELLM + + +class Qwen(PELLM): + + config_class = Qwen2Config + model_loader = Qwen2ForCausalLM + + def __init__(self, config: dict = None, + pretrained_path: str = None, + peft_type: str = None, + peft_config: dict = None, + **kwargs + ) -> None: + + if config is None and pretrained_path is None: + config = Qwen2Config().to_dict() # use default model setting + super().__init__(config=config, pretrained_path=pretrained_path, + peft_type=peft_type, peft_config=peft_config, **kwargs) From 18f82a60623c2676b3bc0b9b5548dbdf79b00818 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Fri, 23 Feb 2024 16:33:53 +0800 Subject: [PATCH 13/35] update data_collator and tokenizer Signed-off-by: mgqa34 --- .../fate_llm/data/data_collator/__init__.py | 13 ----- .../data/data_collator/cust_data_collator.py | 50 ++++++++++++++--- python/fate_llm/data/tokenizers/__init__.py | 29 ---------- .../data/tokenizers/cust_tokenizer.py | 54 +++++++++++++++++++ python/fate_llm/dataset/prompt_dataset.py | 10 ++-- 5 files changed, 103 insertions(+), 53 deletions(-) create mode 100644 python/fate_llm/data/tokenizers/cust_tokenizer.py diff --git a/python/fate_llm/data/data_collator/__init__.py b/python/fate_llm/data/data_collator/__init__.py index 3d7398c..878d3a9 100644 --- a/python/fate_llm/data/data_collator/__init__.py +++ b/python/fate_llm/data/data_collator/__init__.py @@ -13,16 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from transformers.data import data_collator -from ..tokenizers import get_prompt_tokenizer - - -def get_data_collator(data_collator_name, tokenizer_name_or_path=None, pad_token=None, padding_side="left", **kwargs): - if not hasattr(data_collator, data_collator_name): - support_collator_list = list(filter(lambda module_name: "collator" in module_name.lower(), dir(data_collator))) - return ValueError(f"data_collator's name={data_collator_name} does not in support list={support_collator_list}") - - tokenizer = get_prompt_tokenizer(tokenizer_name_or_path=tokenizer_name_or_path, - pad_token=pad_token) - - return getattr(data_collator, data_collator_name)(tokenizer, **kwargs) diff --git a/python/fate_llm/data/data_collator/cust_data_collator.py b/python/fate_llm/data/data_collator/cust_data_collator.py index 1e2b685..d6b8905 100644 --- a/python/fate_llm/data/data_collator/cust_data_collator.py +++ b/python/fate_llm/data/data_collator/cust_data_collator.py @@ -1,8 +1,46 @@ -from transformers import AutoTokenizer +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
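The Qwen wrapper above mirrors the other pellm classes: if neither a config dict nor a pretrained path is given it falls back to a default Qwen2Config, and any extra keyword arguments (torch_dtype, trust_remote_code, ...) flow through to PELLM. A minimal, illustrative construction; the model path and LoRA settings are placeholders rather than values from this patch:

from fate_llm.model_zoo.pellm.qwen import Qwen

model = Qwen(
    pretrained_path="/data/models/Qwen1.5-0.5B",            # placeholder path
    peft_type="LoraConfig",                                  # looked up on the peft package by name
    peft_config={"r": 8, "lora_alpha": 16, "lora_dropout": 0.05},
    torch_dtype="bfloat16",
)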
+# +from transformers.data import data_collator +from ..tokenizers.cust_tokenizer import get_tokenizer -def get_seq2seq_tokenizer(model_path): - from transformers import DataCollatorForSeq2Seq - tokenizer = AutoTokenizer.from_pretrained(model_path) - tokenizer.pad_token = tokenizer.eos_token - return DataCollatorForSeq2Seq(tokenizer=tokenizer) +def get_data_collator(data_collator_name, + tokenizer_name_or_path=None, + pad_token=None, + bos_token=None, + eos_token=None, + pad_token_id=None, + bos_token_id=None, + eos_token_id=None, + trust_remote_code=False, **kwargs): + if not hasattr(data_collator, data_collator_name): + support_collator_list = list(filter(lambda module_name: "collator" in module_name.lower(), dir(data_collator))) + return ValueError(f"data_collator's name={data_collator_name} does not in support list={support_collator_list}") + + tokenizer = get_tokenizer(tokenizer_name_or_path=tokenizer_name_or_path, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + trust_remote_code=trust_remote_code) + + return getattr(data_collator, data_collator_name)(tokenizer, **kwargs) + + +def get_seq2seq_data_collator(tokenizer_name_or_path, **kwargs): + return get_data_collator("DataCollatorForSeq2Seq", tokenizer_name_or_path=tokenizer_name_or_path, **kwargs) diff --git a/python/fate_llm/data/tokenizers/__init__.py b/python/fate_llm/data/tokenizers/__init__.py index 1e47421..878d3a9 100644 --- a/python/fate_llm/data/tokenizers/__init__.py +++ b/python/fate_llm/data/tokenizers/__init__.py @@ -13,32 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from transformers import AutoTokenizer - - -def get_prompt_tokenizer( - tokenizer_name_or_path, - trust_remote_code=False, - padding_side="left", - pad_token=None, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - add_eos_token=True, -): - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - trust_remote_code=trust_remote_code, - add_eos_token=add_eos_token - ) - tokenizer.padding_side = padding_side - if pad_token is not None: - tokenizer.add_special_tokens({'pad_token': pad_token}) - if pad_token_id is not None: - tokenizer.pad_token_id = pad_token_id - if bos_token_id is not None: - tokenizer.bos_token_id = bos_token_id - if eos_token_id is not None: - tokenizer.eos_token_id = eos_token_id - - return tokenizer diff --git a/python/fate_llm/data/tokenizers/cust_tokenizer.py b/python/fate_llm/data/tokenizers/cust_tokenizer.py new file mode 100644 index 0000000..ee6da32 --- /dev/null +++ b/python/fate_llm/data/tokenizers/cust_tokenizer.py @@ -0,0 +1,54 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
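get_data_collator above resolves the collator class by name on transformers.data.data_collator, builds the tokenizer through the shared get_tokenizer helper, and passes any remaining keyword arguments to the collator itself; get_seq2seq_data_collator is simply the DataCollatorForSeq2Seq shortcut. A short usage sketch with a placeholder tokenizer path:

from fate_llm.data.data_collator.cust_data_collator import get_seq2seq_data_collator

# equivalent to get_data_collator("DataCollatorForSeq2Seq", tokenizer_name_or_path=...)
collator = get_seq2seq_data_collator(
    tokenizer_name_or_path="/data/models/bloom-560m",   # placeholder path
    label_pad_token_id=-100,                            # forwarded to DataCollatorForSeq2Seq
)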
+# +from transformers import AutoTokenizer + + +def get_tokenizer( + tokenizer_name_or_path, + trust_remote_code=False, + padding_side=None, + pad_token=None, + bos_token=None, + eos_token=None, + pad_token_id=None, + bos_token_id=None, + eos_token_id=None, + add_eos_token=True, +): + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + trust_remote_code=trust_remote_code, + add_eos_token=add_eos_token + ) + if padding_side is not None: + tokenizer.padding_side = padding_side + if pad_token is not None: + tokenizer.add_special_tokens({'pad_token': pad_token}) + if bos_token is not None: + tokenizer.add_special_tokens({'bos_token': bos_token}) + if eos_token is not None: + tokenizer.add_special_tokens({"eos_token": eos_token}) + if pad_token_id is not None: + tokenizer.pad_token_id = pad_token_id + if bos_token_id is not None: + tokenizer.bos_token_id = bos_token_id + if eos_token_id is not None: + tokenizer.eos_token_id = eos_token_id + + if "llama" in tokenizer_name_or_path.lower() or "gpt2" in tokenizer_name_or_path.lower(): + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer diff --git a/python/fate_llm/dataset/prompt_dataset.py b/python/fate_llm/dataset/prompt_dataset.py index ec127d7..12c8463 100644 --- a/python/fate_llm/dataset/prompt_dataset.py +++ b/python/fate_llm/dataset/prompt_dataset.py @@ -16,7 +16,7 @@ import copy import pandas as pd from fate.ml.nn.dataset.base import Dataset -from .tokenizers import get_prompt_tokenizer +from ..data.tokenizers.cust_tokenizer import get_tokenizer PROMPT_TEMPLATE = "{prompt}" @@ -30,9 +30,9 @@ def __init__(self, padding=False, padding_side='left', pad_token=None, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, + pad_token_id=None, + bos_token_id=None, + eos_token_id=None, add_eos_token=True, prompt_template=None, add_special_tokens=False, @@ -47,7 +47,7 @@ def __init__(self, self.add_special_tokens = add_special_tokens self.max_length = text_max_length - self.tokenizer = get_prompt_tokenizer( + self.tokenizer = get_tokenizer( tokenizer_name_or_path=tokenizer_name_or_path, trust_remote_code=trust_remote_code, pad_token=pad_token, From b7c02c7230e21d617e8b1a7cecba106ad1e60dbc Mon Sep 17 00:00:00 2001 From: cwj Date: Mon, 26 Feb 2024 11:06:46 +0800 Subject: [PATCH 14/35] Fix codes Signed-off-by: weijingchen Signed-off-by: cwj --- .../offsite_tuning/offsite_tuning_model.py | 348 ++++++++++-------- 1 file changed, 190 insertions(+), 158 deletions(-) diff --git a/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py b/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py index dd48d41..73f50ef 100644 --- a/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py +++ b/python/fate_llm/model_zoo/offsite_tuning/offsite_tuning_model.py @@ -12,171 +12,203 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
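get_tokenizer above only overrides the special tokens that are explicitly passed in, and for LLaMA/GPT-2 style checkpoints (which ship without a pad token) it falls back to pad_token = eos_token. A minimal sketch of typical use; the path is a placeholder:

from fate_llm.data.tokenizers.cust_tokenizer import get_tokenizer

tokenizer = get_tokenizer(
    tokenizer_name_or_path="/data/models/llama-7b",   # placeholder; "llama" in the path triggers the pad_token fallback
    padding_side="left",
    add_eos_token=True,
)
batch = tokenizer(["hello fate-llm"], padding=True, return_tensors="pt")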
- -from fate.components.components.nn.nn_runner import ( - load_model_dict_from_path, - dir_warning, - loader_load_from_conf, - run_dataset_func, -) -from fate.ml.nn.homo.fedavg import FedAVGArguments -from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer -from typing import Dict -from fate.components.components.nn.loader import Loader -from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments -from typing import Union, Optional -from transformers.trainer_utils import get_last_checkpoint -from typing import Literal +# +import torch as t +from torch import nn +from transformers import AutoModel +import numpy as np import logging -from fate.arch.dataframe import DataFrame -from fate_llm.runner.homo_seq2seq_runner import Seq2SeqRunner, _check_instances -from fate_llm.homo.offsite_tuning import OffsiteTuningTrainerClient, OffsiteTuningTrainerServer - logger = logging.getLogger(__name__) -class OTRunner(Seq2SeqRunner): - - def __init__( - self, - model_conf: Optional[Dict] = None, - dataset_conf: Optional[Dict] = None, - optimizer_conf: Optional[Dict] = None, - training_args_conf: Optional[Dict] = None, - fed_args_conf: Optional[Dict] = None, - data_collator_conf: Optional[Dict] = None, - tokenizer_conf: Optional[Dict] = None, - task_type: Literal["causal_lm", "other"] = "causal_lm", - save_trainable_weights_only: bool = False, - aggregate_model: bool = False, - algo: str = 'ot' - ) -> None: - super(OTRunner, self).__init__( - algo, model_conf, dataset_conf, optimizer_conf, training_args_conf, fed_args_conf, - data_collator_conf, tokenizer_conf, task_type, local_mode=False +def get_dropout_emulator_and_adapters( + transformer_layers: nn.ModuleList, + emulator_layer_num: int, + adapter_top_layer_num: int, + adapter_bottom_layer_num: int): + + assert adapter_bottom_layer_num > 0 and adapter_top_layer_num > 0, "adapter layer num must be greater than 0" + assert emulator_layer_num < len( + transformer_layers), "emulator layer num must be less than the number of transformer layers" + assert adapter_bottom_layer_num + adapter_top_layer_num < len( + transformer_layers), "adapter layer num must be less than the number of transformer layers" + assert emulator_layer_num < len( + transformer_layers) and emulator_layer_num > 0, "emulator layer num must be less than the number of transformer layers" + + bottom_idx = adapter_bottom_layer_num + top_idx = len(transformer_layers) - adapter_top_layer_num + bottom_layers = transformer_layers[:bottom_idx] + top_layers = transformer_layers[top_idx:] + kept_layers = transformer_layers[bottom_idx:top_idx] + emulator = nn.ModuleList() + stride = (len(kept_layers) - 1) / (emulator_layer_num - 1) + + layer_idx = [] + for i in range(emulator_layer_num): + idx = int(round(i * stride)) + layer_idx.append(idx) + emulator.append(kept_layers[idx]) + logger.info( + 'take layer {} of the original model as the emulator'.format( + t.Tensor(layer_idx) + + bottom_idx)) + return nn.ModuleList(emulator), nn.ModuleList( + bottom_layers), nn.ModuleList(top_layers) + + + +def split_numpy_array(embedding_matrix, n, suffix): + # Calculate the indices where the splits should occur + embedding_matrix = embedding_matrix['weight'] + indices = np.linspace(0, embedding_matrix.shape[0], n+1, dtype=int) + + # Split the embedding matrix at the calculated indices + slices = [embedding_matrix[indices[i]:indices[i+1]] for i in range(n)] + + # Create a dictionary with the slices + result_dict = {suffix+str(i): slice for i, slice in enumerate(slices)} + return 
result_dict + + +def recover_numpy_array(slices_dict, suffix=""): + # Get the slices from the dictionary and concatenate them + slices = [slices_dict[suffix + str(i)] for i in range(len(slices_dict))] + complete_array = np.concatenate(slices, axis=0) + return {'weight': complete_array} + + +class OffsiteTuningBaseModel(t.nn.Module): + + def __init__(self, emulator_layer_num: int, adapter_top_layer_num: int = 2, + adapter_bottom_layer_num: int = 2, fp16_mix_precision=False): + super().__init__() + self.fp16_mix_precision = fp16_mix_precision + self.model = self.get_base_model() + self.initialize_model() + self.emulator, self.adapter_bottom, self.adapter_top = get_dropout_emulator_and_adapters( + transformer_layers=self.get_model_transformer_blocks(self.model), + emulator_layer_num=emulator_layer_num, + adapter_top_layer_num=adapter_top_layer_num, + adapter_bottom_layer_num=adapter_bottom_layer_num ) + self.post_initialization() - self.aggregate_model = aggregate_model - self.save_trainable_weights_only = save_trainable_weights_only - - def setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): - - if stage == "predict": - self.local_mode = True - - ctx = self.get_context() - model = loader_load_from_conf(self.model_conf) - - if model is None: - raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") - - if output_dir is None: - output_dir = "./" - - resume_path = None - if saved_model is not None: - model_dict = load_model_dict_from_path(saved_model) - model.load_state_dict(model_dict) - logger.info(f"loading model dict from {saved_model} to model done") - if get_last_checkpoint(saved_model) is not None: - resume_path = saved_model - logger.info(f"checkpoint detected, resume_path set to {resume_path}") - - # load optimizer - if self.optimizer_conf: - optimizer_loader = Loader.from_dict(self.optimizer_conf) - optimizer_ = optimizer_loader.load_item() - optimizer_params = optimizer_loader.kwargs - optimizer = optimizer_(model.parameters(), **optimizer_params) - else: - optimizer = None - # load collator func - data_collator = loader_load_from_conf(self.data_collator_conf) - # load tokenizer if import conf provided - tokenizer = loader_load_from_conf(self.tokenizer_conf) - # args - dir_warning(self.training_args_conf) - training_args = Seq2SeqTrainingArguments(**self.training_args_conf) - self.training_args = training_args - # reset to default, saving to arbitrary path is not allowed in - # DefaultRunner - training_args.output_dir = output_dir - training_args.resume_from_checkpoint = resume_path # resume path - fed_args = FedAVGArguments(**self.fed_args_conf) - - # prepare trainer - if self.is_client(): - trainer = OffsiteTuningTrainerClient( - ctx=ctx, - model=model, - optimizer=optimizer, - training_args=training_args, - fed_args=fed_args, - data_collator=data_collator, - tokenizer=tokenizer, - train_set=train_set, - val_set=validate_set, - save_trainable_weights_only=self.save_trainable_weights_only, - aggregate_model=self.aggregate_model - ) - - elif self.is_server(): - trainer = OffsiteTuningTrainerServer( - ctx=ctx, - model=model, - aggregate_model=self.aggregate_model - ) - - _check_instances( - trainer=trainer, - model=model, - optimizer=optimizer, - train_args=training_args, - fed_args=fed_args, - data_collator=data_collator, - ) + def initialize_model(self): + if self.fp16_mix_precision: + self.model.half() + for param in self.model.parameters(): + param.requires_grad = False + + def post_initialization(self): + 
pass + + def get_adapter_top(self): + return self.adapter_top + + def get_adapter_bottom(self): + return self.adapter_bottom + + def get_emulator(self): + return self.emulator + + def get_additional_param_state_dict(self): + # get parameter of additional parameter + return {} - return trainer + def load_additional_param_state_dict(self, submodel_weights: dict): + # load additional weights: + pass - def server_setup(self, stage="train"): - if stage == "predict": - self.local_mode = True - if self.algo == "fedavg": - server_class: Seq2SeqFedAVGServer = Seq2SeqFedAVGServer + def _get_numpy_arr(self, v): + if v.dtype == t.bfloat16: + # float 32 + v = v.detach().cpu().float().numpy() else: - raise ValueError(f"algo {self.algo} not supported") - ctx = self.get_context() - trainer = server_class(ctx=ctx, local_mode=self.local_mode) - _check_instances(trainer) - return trainer - - - def train( - self, - train_data: Optional[Union[str, DataFrame]] = None, - validate_data: Optional[Union[str, DataFrame]] = None, - output_dir: str = None, - saved_model_path: str = None, - ): - - if self.is_client(): - train_set = self._prepare_data(train_data, "train_data") - validate_set = self._prepare_data(validate_data, "val_data") - trainer = self.setup( - train_set=train_set, validate_set=validate_set, output_dir=output_dir, saved_model=saved_model_path - ) - self.trainer = trainer - trainer.train() - - elif self.is_server(): - trainer = self.setup( - train_set=None, validate_set=None, output_dir=output_dir, saved_model=saved_model_path - ) - trainer.train() - - if output_dir is not None: - if self.training_args.deepspeed and self.training_args.local_rank != 0: - pass - else: - trainer.save_model(output_dir) + v = v.detach().cpu().numpy() + + return v + + + def load_numpy_state_dict(self, module_dict, state_dict): + param_dict = module_dict + + for k, v in param_dict.items(): + if k not in state_dict: + continue + addition_weights = { + k: t.tensor(v) for k, + v in state_dict[k].items()} + v.load_state_dict(addition_weights) + + def get_numpy_state_dict(self, module_dict): + + weight_dict = {} + for k, v in module_dict.items(): + weight_dict[k] = { + k: self._get_numpy_arr(v) for k, + v in v.state_dict().items()} + return weight_dict + + def get_submodel_weights(self) -> dict: + submodel_weights = { + "emulator": { + k: self._get_numpy_arr(v) for k, + v in self.get_emulator().state_dict().items()}, + "adapter_top": { + k: self._get_numpy_arr(v) for k, + v in self.get_adapter_top().state_dict().items()}, + "adapter_bottom": { + k: self._get_numpy_arr(v) for k, + v in self.get_adapter_bottom().state_dict().items()}} + addition_weights = self.get_additional_param_state_dict() + submodel_weights.update(addition_weights) + return submodel_weights + + def load_submodel_weights(self, submodel_weights: dict): + + emulator_weights = { + k: t.tensor(v) for k, + v in submodel_weights['emulator'].items()} + adapter_top_weights = { + k: t.tensor(v) for k, + v in submodel_weights['adapter_top'].items()} + adapter_bottom_weights = { + k: t.tensor(v) for k, + v in submodel_weights['adapter_bottom'].items()} + + emulator = self.get_emulator() + adapter_top = self.get_adapter_top() + adapter_bottom = self.get_adapter_bottom() + + emulator.load_state_dict(emulator_weights) + adapter_top.load_state_dict(adapter_top_weights) + adapter_bottom.load_state_dict(adapter_bottom_weights) + self.load_additional_param_state_dict(submodel_weights) + + def forward(self, **kwargs): + raise NotImplementedError() + + def get_base_model(self): + 
raise NotImplementedError() + + def get_model_transformer_blocks(self, model: t.nn.Module): + raise NotImplementedError() + + +class OffsiteTuningMainModel(OffsiteTuningBaseModel): + + def post_initialization(self): + pass + + +class OffsiteTuningSubModel(OffsiteTuningBaseModel): + + def post_initialization(self): + # mix precision model training + for param in self.adapter_top.parameters(): + param.data = param.data.float() + param.requires_grad = True + for param in self.adapter_bottom.parameters(): + param.data = param.data.float() + param.requires_grad = True \ No newline at end of file From 86fa886e29ab109b30819ccc78848bb00e63560d Mon Sep 17 00:00:00 2001 From: cwj Date: Mon, 26 Feb 2024 11:27:43 +0800 Subject: [PATCH 15/35] Remove model_zoo codes & fix runner Signed-off-by: weijingchen Signed-off-by: cwj --- python/fate_llm/model_zoo/ipr/alexnet.py | 74 ------- python/fate_llm/model_zoo/ipr/distilbert.py | 64 ------ python/fate_llm/model_zoo/ipr/gpt2.py | 56 ----- python/fate_llm/model_zoo/ipr/resnet.py | 161 -------------- python/fate_llm/model_zoo/ipr/sign_block.py | 197 ------------------ .../fate_llm/runner/offsite_tuning_runner.py | 193 ++++++----------- 6 files changed, 67 insertions(+), 678 deletions(-) delete mode 100644 python/fate_llm/model_zoo/ipr/alexnet.py delete mode 100644 python/fate_llm/model_zoo/ipr/distilbert.py delete mode 100644 python/fate_llm/model_zoo/ipr/gpt2.py delete mode 100644 python/fate_llm/model_zoo/ipr/resnet.py delete mode 100644 python/fate_llm/model_zoo/ipr/sign_block.py diff --git a/python/fate_llm/model_zoo/ipr/alexnet.py b/python/fate_llm/model_zoo/ipr/alexnet.py deleted file mode 100644 index 28c6dc0..0000000 --- a/python/fate_llm/model_zoo/ipr/alexnet.py +++ /dev/null @@ -1,74 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
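OffsiteTuningBaseModel, restored above, keeps get_base_model and get_model_transformer_blocks abstract, so a concrete main/sub model only has to say which backbone to load and where its transformer block list lives; the constructor then slices out the emulator and the top/bottom adapters. A hedged sketch of a GPT-2 based main model under that contract (this class is illustrative and not part of the patch):

from transformers import GPT2LMHeadModel
from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningMainModel


class GPT2MainModel(OffsiteTuningMainModel):  # hypothetical subclass

    def get_base_model(self):
        return GPT2LMHeadModel.from_pretrained("gpt2")

    def get_model_transformer_blocks(self, model):
        return model.transformer.h  # GPT-2 keeps its 12 blocks in transformer.h

    def forward(self, **kwargs):
        return self.model(**kwargs)


# 4 emulator layers are sampled from the 8 middle blocks; 2 + 2 blocks become adapters
main = GPT2MainModel(emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)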
-# -import torch.nn as nn -from fate_llm.model_zoo.ipr.sign_block import SignatureConv, ConvBlock - - -class SignAlexNet(nn.Module): - - """ - This is a modified Alexnet: its 4,5,6 layers are replaced by Singnature Conv Block - """ - - def __init__(self, num_classes): - super().__init__() - in_channels = 3 - maxpoolidx = [1, 3, 7] - signed_layer = [4, 5, 6] - layers = [] - inp = in_channels - - # channels & kennel size - # the same setting as the FedIPR paper - oups = { - 0: 64, - 2: 192, - 4: 384, - 5: 256, - 6: 256 - } - kp = { - 0: (5, 2), - 2: (5, 2), - 4: (3, 1), - 5: (3, 1), - 6: (3, 1) - } - - for layeridx in range(8): - if layeridx in maxpoolidx: - layers.append(nn.MaxPool2d(2, 2)) - else: - k = kp[layeridx][0] - p = kp[layeridx][1] - if layeridx in signed_layer: - layers.append(SignatureConv(inp, oups[layeridx], k, 1, p)) - else: - layers.append(ConvBlock(inp, oups[layeridx], k, 1, p)) - inp = oups[layeridx] - - self.features = nn.Sequential(*layers) - self.classifier = nn.Linear(4 * 4 * 256, num_classes) - - def forward(self, x): - for m in self.features: - x = m(x) - x = x.view(x.size(0), -1) - x = self.classifier(x) - if self.training: - return x - else: # Sofmax - return nn.functional.softmax(x, dim=1) diff --git a/python/fate_llm/model_zoo/ipr/distilbert.py b/python/fate_llm/model_zoo/ipr/distilbert.py deleted file mode 100644 index 063fd13..0000000 --- a/python/fate_llm/model_zoo/ipr/distilbert.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from torch.nn import Module -from transformers import DistilBertForSequenceClassification, DistilBertForTokenClassification -from fate_llm.model_zoo.ipr.sign_block import recursive_replace_layernorm - - -class SignDistilBertForTokenClassification(Module): - - def __init__(self, model_path=None, num_labels=4) -> None: - super().__init__() - if model_path is None: - model_path = 'distilbert-base-uncased' - - self.model_path = model_path - self.model = DistilBertForTokenClassification.from_pretrained( - model_path, num_labels=num_labels) - - # replace layernorm by SignatureLayerNorm - sub_distilbert = self.model.distilbert.transformer.layer[3:] - recursive_replace_layernorm( - sub_distilbert, - layer_name_set={'output_layer_norm'}) - - def forward(self, input_dict): - return self.model(**input_dict) - - -class SignDistilBertForSequenceClassification(Module): - - def __init__( - self, - model_path=None, - num_labels=4, - problem_type=None) -> None: - super().__init__() - if model_path is None: - model_path = 'distilbert-base-uncased' - - self.model_path = model_path - self.model = DistilBertForSequenceClassification.from_pretrained( - model_path, num_labels=num_labels, problem_type=problem_type) - - # replace layernorm by SignatureLayerNorm - sub_distilbert = self.model.distilbert.transformer.layer[3:] - recursive_replace_layernorm( - sub_distilbert, - layer_name_set={'output_layer_norm'}) - - def forward(self, input_dict): - return self.model(**input_dict) diff --git a/python/fate_llm/model_zoo/ipr/gpt2.py b/python/fate_llm/model_zoo/ipr/gpt2.py deleted file mode 100644 index 26c9b4b..0000000 --- a/python/fate_llm/model_zoo/ipr/gpt2.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from torch.nn import Module -from transformers import GPT2ForTokenClassification, GPT2ForSequenceClassification -from fate_llm.model_zoo.ipr.sign_block import recursive_replace_layernorm - - -class SignGPT2ForTokenClassification(Module): - - def __init__(self, model_path=None, num_labels=4) -> None: - super().__init__() - if model_path is None: - model_path = 'gpt2' - - self.model_path = model_path - self.model = GPT2ForTokenClassification.from_pretrained( - model_path, num_labels=num_labels) - - # replace layernorm by SignatureLayerNorm - sub_gpt2 = self.model.transformer.h[10:] - recursive_replace_layernorm(sub_gpt2) - - def forward(self, input_dict): - return self.model(**input_dict) - - -class SignGPT2ForSequenceClassification(Module): - - def __init__(self, model_path=None, num_labels=2) -> None: - super().__init__() - if model_path is None: - model_path = 'gpt2' - - self.model_path = model_path - self.model = GPT2ForSequenceClassification.from_pretrained( - model_path, num_labels=num_labels) - - # replace layernorm by SignatureLayerNorm - sub_gpt2 = self.model.transformer.h[10:] - recursive_replace_layernorm(sub_gpt2) - - def forward(self, input_dict): - return self.model(**input_dict) diff --git a/python/fate_llm/model_zoo/ipr/resnet.py b/python/fate_llm/model_zoo/ipr/resnet.py deleted file mode 100644 index d03e870..0000000 --- a/python/fate_llm/model_zoo/ipr/resnet.py +++ /dev/null @@ -1,161 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import torch.nn as nn -import torch.nn.functional as F -from fate_llm.model_zoo.ipr.sign_block import ConvBlock, SignatureConv - - -# The layer define for ResNet18, add signature to last layer -signed_layer_define = { - 'layer1': { - '0': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}}, - '1': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}} - }, - 'layer2': { - '0': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}, 'shortcut': {'flag': False}}, - '1': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}} - }, - 'layer3': { - '0': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}, 'shortcut': {'flag': False}}, - '1': {'convbnrelu_1': {'flag': False}, 'convbn_2': {'flag': False}} - }, - 'layer4': { - '0': {'convbnrelu_1': {'flag': True}, 'convbn_2': {'flag': True}, 'shortcut': {'flag': False}}, - '1': {'convbnrelu_1': {'flag': True}, 'convbn_2': {'flag': True}} - } -} - - -def get_convblock(passport_kwargs): - def convblock_(*args, **kwargs): - if passport_kwargs['flag']: - return SignatureConv(*args, **kwargs) - else: - return ConvBlock(*args, **kwargs) - - return convblock_ - - -class BasicPrivateBlock(nn.Module): - - expansion = 1 - - def __init__(self, in_planes, planes, stride=1, kwargs={}): # (512, 512, 2) (512, 512, 1) - super(BasicPrivateBlock, self).__init__() - - self.convbnrelu_1 = get_convblock( - kwargs['convbnrelu_1'])( - in_planes, planes, 3, stride, 1) - self.convbn_2 = get_convblock( - kwargs['convbn_2'])( - planes, planes, 3, 1, 1) - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion * planes: - self.shortcut = get_convblock( - kwargs['shortcut'])( - in_planes, - self.expansion * planes, - 1, - stride, - 0) # input, output, kernel_size=1 - - def forward(self, x): - - out = self.convbnrelu_1(x) - out = self.convbn_2(out) - - if not isinstance(self.shortcut, nn.Sequential): - out = out + self.shortcut(x) - else: - out = out + x - out = F.relu(out) - return out - - -class SignResnet18(nn.Module): - - # BasicPrivateBlock, [2, 2, 2, 2], **model_kwargs - def __init__(self, num_classes=100): - - super(SignResnet18, self).__init__() - num_blocks = [2, 2, 2, 2] - self.in_planes = 64 - block = BasicPrivateBlock - model_define = signed_layer_define - - self.convbnrelu_1 = ConvBlock(3, 64, 3, 1, 1) - self.layer1 = self._make_layer( - block, - 64, - num_blocks[0], - stride=1, - model_define=model_define['layer1']) - self.layer2 = self._make_layer( - block, - 128, - num_blocks[1], - stride=2, - model_define=model_define['layer2']) - self.layer3 = self._make_layer( - block, - 256, - num_blocks[2], - stride=2, - model_define=model_define['layer3']) - self.layer4 = self._make_layer( - block, - 512, - num_blocks[3], - stride=2, - model_define=model_define['layer4']) - self.linear = nn.Linear(512 * block.expansion, num_classes) - - # BasicPrivateBlock, planes = 512, numblocks = 2, stride =2, **model_kwargs - def _make_layer(self, block, planes, num_blocks, stride, model_define): - strides = [stride] + [1] * (num_blocks - 1) # [2] + [1]*1 = [2, 1] - layers = [] - for i, stride in enumerate(strides): # stride = 2 & 1 - layers.append(block(self.in_planes, planes, stride, - model_define[str(i)])) # (512, 512, 2) - self.in_planes = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - - out = self.convbnrelu_1(x) - - for block in self.layer1: - out = block(out) - for block in self.layer2: - out = block(out) - for block in self.layer3: - out = block(out) - for block in 
self.layer4: - out = block(out) - - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - - if self.training: - return out - else: - return F.softmax(out, dim=1) - - -if __name__ == '__main__': - - net = SignResnet18(num_classes=10) diff --git a/python/fate_llm/model_zoo/ipr/sign_block.py b/python/fate_llm/model_zoo/ipr/sign_block.py deleted file mode 100644 index 5cef62e..0000000 --- a/python/fate_llm/model_zoo/ipr/sign_block.py +++ /dev/null @@ -1,197 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import torch -import torch.nn as nn -import torch.nn.init as init -from torch.nn import functional as F -from federatedml.util import LOGGER - -""" -Base -""" - - -class SignatureBlock(nn.Module): - - def __init__(self) -> None: - super().__init__() - - @property - def embeded_param(self): - return None - - def embeded_param_num(self): - return None - - def extract_sign(self, W): - pass - - def sign_loss(self, W, sign): - pass - - -def is_sign_block(block): - return issubclass(type(block), SignatureBlock) - - -class ConvBlock(nn.Module): - def __init__(self, i, o, ks=3, s=1, pd=1, relu=True): - super().__init__() - - self.conv = nn.Conv2d(i, o, ks, s, pd, bias= False) - - if relu: - self.relu = nn.ReLU(inplace=True) - else: - self.relu = None - - self.reset_parameters() - - def reset_parameters(self): - init.kaiming_normal_(self.conv.weight, mode='fan_out', nonlinearity='relu') - - def forward(self, x): - x = self.conv(x) - if self.relu is not None: - x = self.relu(x) - return x - - -def generate_signature(conv_block: SignatureBlock, num_bits): - - sign = torch.sign(torch.rand(num_bits) - 0.5) - W = torch.randn(len(conv_block.embeded_param.flatten()), num_bits) - - return (W, sign) - - -""" -Function & Class for Conv Layer -""" - - -class SignatureConv(SignatureBlock): - - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False): - super(SignatureConv, self).__init__() - - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias) - self.weight = self.conv.weight - - self.init_scale() - self.init_bias() - self.bn = nn.BatchNorm2d(out_channels, affine=False) - self.relu = nn.ReLU(inplace=True) - self.reset_parameters() - - def init_bias(self): - self.bias = nn.Parameter(torch.Tensor(self.conv.out_channels).to(self.weight.device)) - init.zeros_(self.bias) - - def init_scale(self): - self.scale = nn.Parameter(torch.Tensor(self.conv.out_channels).to(self.weight.device)) - init.ones_(self.scale) - - def reset_parameters(self): - init.kaiming_normal_(self.weight, mode='fan_out', nonlinearity='relu') - - @property - def embeded_param(self): - # embedded in the BatchNorm param, as the same in the paper - return self.scale - - def embeded_param_num(self): - return len(self.scale) - - def extract_sign(self, W): - # W is the linear weight for extracting signature - with torch.no_grad(): - return self.scale.view([1, 
-1]).mm(W).sign().flatten() - - def sign_loss(self, W, sign): - loss = F.relu(-self.scale.view([1, -1]).mm(W).mul(sign.view(-1))).sum() - return loss - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = x * self.scale[None, :, None, None] + self.bias[None, :, None, None] - x = self.relu(x) - return x - - -""" -Function & Class for LM -""" - - -def recursive_replace_layernorm(module, layer_name_set=None): - - """ - Recursively replaces the LayerNorm layers of a given module with SignatureLayerNorm layers. - - Parameters: - module (torch.nn.Module): The module in which LayerNorm layers should be replaced. - layer_name_set (set[str], optional): A set of layer names to be replaced. If None, - all LayerNorm layers in the module will be replaced. - """ - - for name, sub_module in module.named_children(): - if isinstance(sub_module, nn.LayerNorm): - if layer_name_set is not None and name not in layer_name_set: - continue - setattr(module, name, SignatureLayerNorm.from_layer_norm_layer(sub_module)) - LOGGER.debug(f"Replace {name} with SignatureLayerNorm") - recursive_replace_layernorm(sub_module, layer_name_set) - - -class SignatureLayerNorm(SignatureBlock): - - def __init__(self, normalized_shape=None, eps=1e-5, elementwise_affine=True, layer_norm_inst=None): - super(SignatureLayerNorm, self).__init__() - if layer_norm_inst is not None and isinstance(layer_norm_inst, nn.LayerNorm): - self.ln = layer_norm_inst - else: - self.ln = nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - @property - def embeded_param(self): - return self.ln.weight - - def embeded_param_num(self): - return self.ln.weight.numel() - - @staticmethod - def from_layer_norm_layer(layer_norm_layer: nn.LayerNorm): - return SignatureLayerNorm(layer_norm_inst=layer_norm_layer) - - def extract_sign(self, W): - # W is the linear weight for extracting signature - with torch.no_grad(): - return self.ln.weight.view([1, -1]).mm(W).sign().flatten() - - def sign_loss(self, W, sign): - loss = F.relu(-self.ln.weight.view([1, -1]).mm(W).mul(sign.view(-1))).sum() - return loss - - def forward(self, x): - return self.ln(x) - - -if __name__ == "__main__": - conv = SignatureConv(3, 384, 3, 1, 1) - layer_norm = SignatureLayerNorm((768, )) - layer_norm_2 = SignatureLayerNorm.from_layer_norm_layer(layer_norm.ln) \ No newline at end of file diff --git a/python/fate_llm/runner/offsite_tuning_runner.py b/python/fate_llm/runner/offsite_tuning_runner.py index ab0bdc1..047ecad 100644 --- a/python/fate_llm/runner/offsite_tuning_runner.py +++ b/python/fate_llm/runner/offsite_tuning_runner.py @@ -14,27 +14,23 @@ # limitations under the License. 
from fate.components.components.nn.nn_runner import ( - NNRunner, load_model_dict_from_path, dir_warning, loader_load_from_conf, run_dataset_func, ) -from fate.components.components.nn.runner.homo_default_runner import DefaultRunner from fate.ml.nn.homo.fedavg import FedAVGArguments from fate_llm.homo.fedavg import Seq2SeqFedAVGClient, Seq2SeqFedAVGServer from typing import Dict from fate.components.components.nn.loader import Loader -import torch.nn as nn -import torch.optim as optim -from fate.ml.nn.trainer.trainer_base import FedArguments, HomoTrainerServer -from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments, HomoSeq2SeqTrainerClient -from typing import Union, Type, Callable, Optional +from fate_llm.trainer.seq2seq_trainer import Seq2SeqTrainingArguments +from typing import Union, Optional from transformers.trainer_utils import get_last_checkpoint from typing import Literal import logging from fate.arch.dataframe import DataFrame -from transformers.modeling_utils import PreTrainedModel, unwrap_model +from fate_llm.runner.homo_seq2seq_runner import Seq2SeqRunner, _check_instances +from fate_llm.homo.offsite_tuning import OffsiteTuningTrainerClient, OffsiteTuningTrainerServer logger = logging.getLogger(__name__) @@ -43,47 +39,10 @@ SUPPORTED_ALGO = ["fedavg"] -def _check_instances( - trainer: Union[Type[HomoSeq2SeqTrainerClient], Type[HomoTrainerServer]] = None, - fed_args: FedArguments = None, - model: nn.Module = None, - optimizer: optim.Optimizer = None, - train_args: Seq2SeqTrainingArguments = None, - data_collator: Callable = None, -) -> None: - if trainer is not None and not ( - issubclass(type(trainer), HomoSeq2SeqTrainerClient) or issubclass(type(trainer), HomoTrainerServer) - ): - raise TypeError( - f"SetupReturn Error: trainer must be a subclass of either " - f"HomoSeq2SeqTrainerClient or HomoSeq2SeqTrainerClient but got {type(trainer)}" - ) - - if fed_args is not None and not isinstance(fed_args, FedArguments): - raise TypeError(f"SetupReturn Error: fed_args must be an instance of FedArguments but got {type(fed_args)}") - - if model is not None and not issubclass(type(model), nn.Module): - raise TypeError(f"SetupReturn Error: model must be a subclass of torch.nn.Module but got {type(model)}") - - if optimizer is not None and not issubclass(type(optimizer), optim.Optimizer): - raise TypeError( - f"SetupReturn Error: optimizer must be a subclass of torch.optim.Optimizer but got {type(optimizer)}" - ) - - if train_args is not None and not isinstance(train_args, Seq2SeqTrainingArguments): - raise TypeError( - f"SetupReturn Error: train_args must be an instance of Seq2SeqTrainingArguments " - f"but got {type(train_args)}" - ) - - if data_collator is not None and not callable(data_collator): - raise TypeError(f"SetupReturn Error: data_collator must be callable but got {type(data_collator)}") - +class OTRunner(Seq2SeqRunner): -class Seq2SeqRunner(DefaultRunner): def __init__( self, - algo: str = "fedavg", model_conf: Optional[Dict] = None, dataset_conf: Optional[Dict] = None, optimizer_conf: Optional[Dict] = None, @@ -92,47 +51,29 @@ def __init__( data_collator_conf: Optional[Dict] = None, tokenizer_conf: Optional[Dict] = None, task_type: Literal["causal_lm", "other"] = "causal_lm", - local_mode: bool = False, save_trainable_weights_only: bool = False, + aggregate_model: bool = False, + algo: str = 'ot' ) -> None: - super(NNRunner, self).__init__() - self.algo = algo - self.model_conf = model_conf - self.dataset_conf = dataset_conf - self.optimizer_conf = 
optimizer_conf - self.training_args_conf = training_args_conf - self.fed_args_conf = fed_args_conf - self.data_collator_conf = data_collator_conf - self.local_mode = local_mode - self.tokenizer_conf = tokenizer_conf - self.task_type = task_type - self.save_trainable_weights_only = save_trainable_weights_only + super(OTRunner, self).__init__( + algo, model_conf, dataset_conf, optimizer_conf, training_args_conf, fed_args_conf, + data_collator_conf, tokenizer_conf, task_type, local_mode=False + ) - # check param - if self.algo not in SUPPORTED_ALGO: - raise ValueError(f"algo should be one of {SUPPORTED_ALGO}") - if self.task_type not in ["causal_lm", "others"]: - raise ValueError("task_type should be one of [binary, multi, regression, others]") - assert isinstance(self.local_mode, bool), "local should be bool" + self.aggregate_model = aggregate_model + self.save_trainable_weights_only = save_trainable_weights_only - # setup var - self.trainer = None - self.training_args = None + def setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): - def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved_model=None, stage="train"): if stage == "predict": self.local_mode = True - - if self.algo == "fedavg": - client_class: Seq2SeqFedAVGClient = Seq2SeqFedAVGClient - else: - raise ValueError(f"algo {self.algo} not supported") - + ctx = self.get_context() model = loader_load_from_conf(self.model_conf) + if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") - + if output_dir is None: output_dir = "./" @@ -144,6 +85,7 @@ def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved if get_last_checkpoint(saved_model) is not None: resume_path = saved_model logger.info(f"checkpoint detected, resume_path set to {resume_path}") + # load optimizer if self.optimizer_conf: optimizer_loader = Loader.from_dict(self.optimizer_conf) @@ -167,19 +109,27 @@ def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved fed_args = FedAVGArguments(**self.fed_args_conf) # prepare trainer - trainer = client_class( - ctx=ctx, - model=model, - optimizer=optimizer, - training_args=training_args, - fed_args=fed_args, - data_collator=data_collator, - tokenizer=tokenizer, - train_set=train_set, - val_set=validate_set, - local_mode=self.local_mode, - save_trainable_weights_only=self.save_trainable_weights_only, - ) + if self.is_client(): + trainer = OffsiteTuningTrainerClient( + ctx=ctx, + model=model, + optimizer=optimizer, + training_args=training_args, + fed_args=fed_args, + data_collator=data_collator, + tokenizer=tokenizer, + train_set=train_set, + val_set=validate_set, + save_trainable_weights_only=self.save_trainable_weights_only, + aggregate_model=self.aggregate_model + ) + + elif self.is_server(): + trainer = OffsiteTuningTrainerServer( + ctx=ctx, + model=model, + aggregate_model=self.aggregate_model + ) _check_instances( trainer=trainer, @@ -189,6 +139,7 @@ def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved fed_args=fed_args, data_collator=data_collator, ) + return trainer def server_setup(self, stage="train"): @@ -202,43 +153,33 @@ def server_setup(self, stage="train"): trainer = server_class(ctx=ctx, local_mode=self.local_mode) _check_instances(trainer) return trainer + - def predict(self, test_data: Union[str, DataFrame], saved_model_path: str = None) -> Union[DataFrame, None]: + def train( + self, + train_data: Optional[Union[str, 
DataFrame]] = None, + validate_data: Optional[Union[str, DataFrame]] = None, + output_dir: str = None, + saved_model_path: str = None, + ): + if self.is_client(): - test_set = self._prepare_data(test_data, "test_data") - if self.trainer is not None: - trainer = self.trainer - logger.info("trainer found, skip setting up") - else: - trainer = self.client_setup(saved_model=saved_model_path, stage="predict") - - classes = run_dataset_func(test_set, "get_classes") - match_ids = run_dataset_func(test_set, "get_match_ids") - sample_ids = run_dataset_func(test_set, "get_sample_ids") - match_id_name = run_dataset_func(test_set, "get_match_id_name") - sample_id_name = run_dataset_func(test_set, "get_sample_id_name") - - if not self.training_args.predict_with_generate: - return - - pred_rs = trainer.predict(test_set) - - if self.training_args and self.training_args.deepspeed and self.training_args.local_rank != 0: - return - - rs_df = self.get_nn_output_dataframe( - self.get_context(), - pred_rs.predictions, - pred_rs.label_ids if hasattr(pred_rs, "label_ids") else None, - match_ids, - sample_ids, - match_id_name=match_id_name, - sample_id_name=sample_id_name, - dataframe_format="dist_df", - task_type=self.task_type, - classes=classes, + train_set = self._prepare_data(train_data, "train_data") + validate_set = self._prepare_data(validate_data, "val_data") + trainer = self.setup( + train_set=train_set, validate_set=validate_set, output_dir=output_dir, saved_model=saved_model_path ) - return rs_df - else: - # server not predict - return + self.trainer = trainer + trainer.train() + + elif self.is_server(): + trainer = self.setup( + train_set=None, validate_set=None, output_dir=output_dir, saved_model=saved_model_path + ) + trainer.train() + + if output_dir is not None: + if self.training_args.deepspeed and self.training_args.local_rank != 0: + pass + else: + trainer.save_model(output_dir) From a8add722beb9d93484cd646626c5e0ed5fd290ec Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Mon, 26 Feb 2024 15:20:57 +0800 Subject: [PATCH 16/35] update code to support seq_cls in pellm Signed-off-by: mgqa34 --- .../pellm/parameter_efficient_llm.py | 19 +++++++++++++------ python/fate_llm/trainer/seq2seq_trainer.py | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py index dbc01fc..cd93938 100644 --- a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py +++ b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py @@ -15,7 +15,8 @@ # import peft import torch -from peft import PeftModel +from collections.abc import Mapping +from peft import PeftModel, TaskType from transformers import AutoConfig from transformers import AutoModel from transformers.configuration_utils import PretrainedConfig @@ -41,7 +42,7 @@ def __init__(self, config: dict = None, pretrained_path: str = None, peft_type: str = None, - peft_config: dict = None, + peft_config=None, torch_dtype: str = None, trust_remote_code: bool = False, **kwargs @@ -66,7 +67,7 @@ def _init_pelm(self, **kwargs): def init_lm_with_peft(self, **kwargs): self.init_config(**kwargs) - self.init_base_lm(**kwargs) + self.init_base_lm() self.add_peft() def init_config(self, **kwargs): @@ -110,16 +111,22 @@ def add_peft(self): raise ValueError(f"Can not parse peft_config of {type(self.peft_config)}") self._pe_lm = peft.get_peft_model(self._pe_lm, peft_config) + self.peft_config = peft_config def model_summary(self): if 
hasattr(self._pe_lm, "print_trainable_parameters"): summary = self._pe_lm.print_trainable_parameters() logger.debug(f'PELLM model summary: \n{summary}') - def forward(self, **tokenized_data): - return self._pe_lm(**tokenized_data) + def forward(self, *args, **kwargs): + forward_ret = self._pe_lm.forward(*args, **kwargs) + + if self.peft_config.task_type == TaskType.SEQ_CLS: + return forward_ret.logits + else: + return forward_ret - def save_pretrained(self, output_path): + def save_trainable(self, output_path): state_dict = { k: p.to("cpu") for k, p in self._pe_lm.named_parameters() if p.requires_grad} diff --git a/python/fate_llm/trainer/seq2seq_trainer.py b/python/fate_llm/trainer/seq2seq_trainer.py index 7046cd3..d4b5906 100644 --- a/python/fate_llm/trainer/seq2seq_trainer.py +++ b/python/fate_llm/trainer/seq2seq_trainer.py @@ -155,7 +155,7 @@ def _save( else: model = unwrap_model(self.model) - if hasattr(model, "save_pretrained"): + if hasattr(model, "save_trainable"): model.save_pretrained(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) else: state_dict = { From 2a20d8686bffb1b9c941d1d60ee3a50e3e00bda9 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Mon, 26 Feb 2024 15:27:45 +0800 Subject: [PATCH 17/35] fix trainer save trainable bug Signed-off-by: mgqa34 --- python/fate_llm/trainer/seq2seq_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fate_llm/trainer/seq2seq_trainer.py b/python/fate_llm/trainer/seq2seq_trainer.py index d4b5906..bd29bf3 100644 --- a/python/fate_llm/trainer/seq2seq_trainer.py +++ b/python/fate_llm/trainer/seq2seq_trainer.py @@ -156,7 +156,7 @@ def _save( model = unwrap_model(self.model) if hasattr(model, "save_trainable"): - model.save_pretrained(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) + model.save_trainable(os.path.join(output_dir, TRAINABLE_WEIGHTS_NAME)) else: state_dict = { k: p.to("cpu") for k, From 17e2d49e562781c2b648983e40de97c8831db37f Mon Sep 17 00:00:00 2001 From: cwj Date: Mon, 26 Feb 2024 18:23:13 +0800 Subject: [PATCH 18/35] Fix model forward Signed-off-by: weijingchen Signed-off-by: cwj --- .../model_zoo/offsite_tuning/bloom.py | 67 ++++++++++++++++--- .../model_zoo/offsite_tuning/llama.py | 14 ++-- 2 files changed, 61 insertions(+), 20 deletions(-) diff --git a/python/fate_llm/model_zoo/offsite_tuning/bloom.py b/python/fate_llm/model_zoo/offsite_tuning/bloom.py index d68f094..09cfc26 100644 --- a/python/fate_llm/model_zoo/offsite_tuning/bloom.py +++ b/python/fate_llm/model_zoo/offsite_tuning/bloom.py @@ -17,6 +17,7 @@ from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomModel, BloomConfig from torch import nn import torch +from typing import Optional, Tuple class BloomMainModel(OffsiteTuningMainModel): @@ -40,9 +41,6 @@ def get_base_model(self): def get_model_transformer_blocks(self, model: BloomForCausalLM): return model.transformer.h - def forward(self, x): - return self.model(**x) - def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model @@ -79,8 +77,34 @@ def load_additional_param_state_dict(self, submodel_weights: dict): new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) - def forward(self, x): - return self.model(**x) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: 
Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ): + + return self.model( + input_ids, + past_key_values, + attention_mask, + head_mask, + inputs_embeds, + labels, + use_cache, + output_attentions, + output_hidden_states, + return_dict, + **deprecated_arguments, + ) class BloomSubModel(OffsiteTuningSubModel): @@ -116,9 +140,6 @@ def get_base_model(self): def get_model_transformer_blocks(self, model: BloomForCausalLM): return model.transformer.h - def forward(self, x): - return self.model(**x) - def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model @@ -155,8 +176,34 @@ def load_additional_param_state_dict(self, submodel_weights: dict): new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) - def forward(self, x): - return self.model(**x) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ): + + return self.model( + input_ids, + past_key_values, + attention_mask, + head_mask, + inputs_embeds, + labels, + use_cache, + output_attentions, + output_hidden_states, + return_dict, + **deprecated_arguments, + ) def parameters(self, recurse=True): if self.partial_weight_decay is None: diff --git a/python/fate_llm/model_zoo/offsite_tuning/llama.py b/python/fate_llm/model_zoo/offsite_tuning/llama.py index 7acb02c..a034d6d 100644 --- a/python/fate_llm/model_zoo/offsite_tuning/llama.py +++ b/python/fate_llm/model_zoo/offsite_tuning/llama.py @@ -37,9 +37,6 @@ def get_base_model(self): def get_model_transformer_blocks(self, model: LlamaForCausalLM): return model.model.layers - - def forward(self, x): - return self.model(**x) def get_additional_param_state_dict(self): # get parameter of additional parameter @@ -74,8 +71,8 @@ def load_additional_param_state_dict(self, submodel_weights: dict): new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) - def forward(self, x): - return self.model(**x) + def forward(self, **kwargs): + return self.model(**kwargs) class LlamaSubModel(OffsiteTuningSubModel): @@ -111,9 +108,6 @@ def get_base_model(self): def get_model_transformer_blocks(self, model: LlamaForCausalLM): return model.model.layers - def forward(self, x): - return self.model(**x) - def get_additional_param_state_dict(self): # get parameter of additional parameter model = self.model @@ -147,8 +141,8 @@ def load_additional_param_state_dict(self, submodel_weights: dict): new_submodel_weight['wte'] = wte self.load_numpy_state_dict(param_dict, new_submodel_weight) - def forward(self, x): - return self.model(**x) + def forward(self, **kwargs): + return self.model(**kwargs) def parameters(self, recurse=True): if self.partial_weight_decay is None: From 7654d937a580f3c9ea31088873bf454ea38af6af Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Mon, 26 Feb 2024 20:03:23 +0800 Subject: [PATCH 19/35] update requirements 
Signed-off-by: mgqa34 --- python/requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 python/requirements.txt diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..0f39c38 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,5 @@ +accelerate==0.27.2 +deepspeed==0.13.3 +peft==0.8.2 +sentencepiece==0.2.0 + From e0e7cacd85e0f100ab69e088cb388234dc8c31fe Mon Sep 17 00:00:00 2001 From: sagewe Date: Mon, 26 Feb 2024 20:25:00 +0800 Subject: [PATCH 20/35] fix: add hf dataset and hf model (#47) Signed-off-by: sagewe --- python/fate_llm/dataset/hf_dataset.py | 188 ++++++++++++++++++++++ python/fate_llm/fedkseed/fedkseed.py | 26 ++- python/fate_llm/model_zoo/hf_model.py | 15 ++ python/fate_llm/runner/fedkseed_runner.py | 38 ++--- 4 files changed, 238 insertions(+), 29 deletions(-) create mode 100644 python/fate_llm/dataset/hf_dataset.py create mode 100644 python/fate_llm/model_zoo/hf_model.py diff --git a/python/fate_llm/dataset/hf_dataset.py b/python/fate_llm/dataset/hf_dataset.py new file mode 100644 index 0000000..646401b --- /dev/null +++ b/python/fate_llm/dataset/hf_dataset.py @@ -0,0 +1,188 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +from typing import Optional, Union, Sequence, Mapping, Dict + +from datasets import load_dataset, Features, Split, DownloadConfig, DownloadMode, VerificationMode, Version + +from fate.ml.nn.dataset.base import Dataset + +# avoid tokenizer parallelism +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +class HuggingfaceDataset(Dataset): + """ + A dataset class for huggingface datasets + """ + + def __init__( + self, + name: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, + split: Optional[Union[str, Split]] = None, + cache_dir: Optional[str] = None, + features: Optional[Features] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + verification_mode: Optional[Union[VerificationMode, str]] = None, + ignore_verifications="deprecated", + keep_in_memory: Optional[bool] = None, + save_infos: bool = False, + revision: Optional[Union[str, Version]] = None, + token: Optional[Union[bool, str]] = None, + use_auth_token="deprecated", + task="deprecated", + streaming: bool = False, + num_proc: Optional[int] = None, + storage_options: Optional[Dict] = None, + trust_remote_code: bool = None, + tokenizer_params: Optional[Dict] = None, + tokenizer_apply_params: Optional[Dict] = None, + **config_kwargs, + ): + self.name = name + self.data_dir = data_dir + self.data_files = data_files + self.split = split + self.cache_dir = cache_dir + self.features = features + self.download_config = download_config + self.download_mode = download_mode + self.verification_mode = verification_mode + self.ignore_verifications = ignore_verifications + self.keep_in_memory = keep_in_memory + self.save_infos = save_infos + self.revision = revision + self.token = token + self.use_auth_token = use_auth_token + self.task = task + self.streaming = streaming + self.num_proc = num_proc + self.storage_options = storage_options + self.trust_remote_code = trust_remote_code + self.tokenizer_params = tokenizer_params + self.tokenizer_apply_params = tokenizer_apply_params + self.config_kwargs = config_kwargs + + super(HuggingfaceDataset, self).__init__() + + def load(self, file_path): + return load_dataset(path=file_path, name=self.name, data_dir=self.data_dir, data_files=self.data_files, + split=self.split, cache_dir=self.cache_dir, features=self.features, + download_config=self.download_config, download_mode=self.download_mode, + verification_mode=self.verification_mode, ignore_verifications=self.ignore_verifications, + keep_in_memory=self.keep_in_memory, save_infos=self.save_infos, revision=self.revision, + token=self.token, use_auth_token=self.use_auth_token, task=self.task, + streaming=self.streaming, num_proc=self.num_proc, storage_options=self.storage_options, + trust_remote_code=self.trust_remote_code, **self.config_kwargs) + + +class Dolly15K(HuggingfaceDataset): + INSTRUCTION_KEY = "### Instruction:" + INPUT_KEY = "Input:" + RESPONSE_KEY = "### Response:" + END_KEY = "### End" + RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" + DEFAULT_SEED = 42 + INTRO_BLURB = ( + "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
+ ) + PROMPT_NO_INPUT_FORMAT = """{intro} +{instruction_key} +{instruction} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, + ) + + # This is a training prompt that contains an input string that serves as context for the instruction. For example, + # the input might be a passage from Wikipedia and the intruction is to extract some information from it. + PROMPT_WITH_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{input_key} +{input} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + input_key=INPUT_KEY, + input="{input}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, + ) + + def __init__(self, *args, **kwargs): + super(Dolly15K, self).__init__(*args, **kwargs) + + def load(self, file_path): + dataset = super().load(file_path) + return self._post_process(dataset) + + def _post_process(self, dataset): + + def _add_text(rec): + instruction = rec["instruction"] + response = rec["response"] + context = rec.get("context") + + if not instruction: + raise ValueError(f"Expected an instruction in: {rec}") + + if not response: + raise ValueError(f"Expected a response in: {rec}") + + # For some instructions there is an input that goes along with the instruction, providing context for the + # instruction. For example, the input might be a passage from Wikipedia and the instruction says to extract + # some piece of information from it. The response is that information to extract. In other cases there is + # no input. For example, the instruction might be open QA such as asking what year some historic figure was + # born. 
+ if context: + rec["text"] = self.PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, + input=context) + else: + rec["text"] = self.PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response) + return rec + + dataset = dataset.map(_add_text) + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(**self.tokenizer_params) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "right" + + def tokenize_function(examples): + return tokenizer(examples["text"], **self.tokenizer_apply_params) + + dataset = dataset.map(tokenize_function, batched=True) + return dataset diff --git a/python/fate_llm/fedkseed/fedkseed.py b/python/fate_llm/fedkseed/fedkseed.py index d2dfb0e..6fbaca0 100644 --- a/python/fate_llm/fedkseed/fedkseed.py +++ b/python/fate_llm/fedkseed/fedkseed.py @@ -16,36 +16,45 @@ class Trainer: def __init__( - self, ctx: Context, seed_candidates: torch.LongTensor, args + self, ctx: Context, seed_candidates: torch.LongTensor, args, fedkseed_args, ): self.ctx = ctx self.args = args + self.fedkseed_args = fedkseed_args self.seed_candidates = seed_candidates self.k = len(seed_candidates) - self.clients = ctx.hosts self.model = None + @staticmethod + def get_clients(ctx: Context): + clients = [ctx.guest] + try: + clients.extend(ctx.hosts) + except: + pass + return clients + def load_model(self): raise NotImplementedError def train(self): - direction_derivative_history = {seed.item(): [self.args.grad_initial] for seed in self.seed_candidates} + direction_derivative_history = {seed.item(): [self.fedkseed_args.grad_initial] for seed in self.seed_candidates} direction_derivative_sum = None seed_probabilities = None - for aggregation_iter, sub_ctx in self.ctx.ctxs_range(self.args.num_aggregations): + for aggregation_iter, sub_ctx in self.ctx.ctxs_range(self.fedkseed_args.num_aggregations): # step1: re-calculate sample probabilities for each seed if seed_probabilities is None: seed_probabilities = get_even_seed_probabilities(self.k) else: seed_probabilities = probability_from_amps( [direction_derivative_history[seed.item()] for seed in self.seed_candidates], - self.args.bias_loss_clip, + self.fedkseed_args.bias_loss_clip, ) # step2(rpc): remote call to the clients to get the directional derivative history # proposal - for client in sub_ctx.hosts: + for client in self.get_clients(sub_ctx): client.put( "train_once", ( @@ -61,7 +70,7 @@ def train(self): if direction_derivative_sum is None: direction_derivative_sum = {seed.item(): 0.0 for seed in self.seed_candidates} # wait for reply and update the directional derivative history - for client in sub_ctx.hosts: + for client in self.get_clients(sub_ctx): client_directional_derivative_history = client.get("direction_derivative_history") for seed, history in client_directional_derivative_history.items(): # torch.LongTensor -> int @@ -143,7 +152,8 @@ def train_once(self, seed_candidates, seed_probabilities, direction_derivative_s ) trainer.configure_seed_candidates(seed_candidates, seed_probabilities) trainer.train() - logger.info(f"evaluate: {trainer.evaluate()}") + if self.eval_dataset is not None: + logger.info(f"evaluate: {trainer.evaluate()}") # get directional derivative history return trainer.get_directional_derivative_history() diff --git a/python/fate_llm/model_zoo/hf_model.py b/python/fate_llm/model_zoo/hf_model.py new file mode 100644 index 0000000..a7701c5 --- /dev/null +++ b/python/fate_llm/model_zoo/hf_model.py @@ -0,0 +1,15 @@ +from transformers import 
AutoModelForCausalLM + + +class HFAutoModelForCausalLM: + + def __init__(self, pretrained_model_name_or_path, *model_args, **kwargs) -> None: + self.pretrained_model_name_or_path = pretrained_model_name_or_path + self.model_args = model_args + self.kwargs = kwargs + + def load(self): + model = AutoModelForCausalLM.from_pretrained( + self.pretrained_model_name_or_path, *self.model_args, **self.kwargs + ) + return model diff --git a/python/fate_llm/runner/fedkseed_runner.py b/python/fate_llm/runner/fedkseed_runner.py index 4d30e8c..0af057a 100644 --- a/python/fate_llm/runner/fedkseed_runner.py +++ b/python/fate_llm/runner/fedkseed_runner.py @@ -18,14 +18,13 @@ from typing import Literal from typing import Optional +import transformers from fate.components.components.nn.nn_runner import ( NNRunner, - load_model_dict_from_path, dir_warning, loader_load_from_conf, ) from fate.components.components.nn.runner.homo_default_runner import DefaultRunner -from transformers.trainer_utils import get_last_checkpoint from fate_llm.fedkseed.fedkseed import Trainer, FedKSeedTrainingArguments, ClientTrainer from fate_llm.fedkseed.zo_utils import build_seed_candidates @@ -81,34 +80,22 @@ def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved ctx = self.get_context() - model = loader_load_from_conf(self.model_conf) + model = maybe_loader_load_from_conf(self.model_conf) if model is None: raise ValueError(f"model is None, cannot load model from conf {self.model_conf}") if output_dir is None: output_dir = "./" - resume_path = None - if saved_model is not None: - model_dict = load_model_dict_from_path(saved_model) - model.load_state_dict(model_dict) - logger.info(f"loading model dict from {saved_model} to model done") - if get_last_checkpoint(saved_model) is not None: - resume_path = saved_model - logger.info(f"checkpoint detected, resume_path set to {resume_path}") - - data_collator = loader_load_from_conf(self.data_collator_conf) - # load tokenizer if import conf provided - tokenizer = loader_load_from_conf(self.tokenizer_conf) - # args + tokenizer = transformers.AutoTokenizer.from_pretrained(**self.data_collator_conf["kwargs"]["tokenizer_params"]) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "right" + + data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) dir_warning(self.training_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) self.training_args = training_args - # reset to default, saving to arbitrary path is not allowed in - # DefaultRunner training_args.output_dir = output_dir - training_args.resume_from_checkpoint = resume_path # resume path - fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) training_args = Seq2SeqTrainingArguments(**self.training_args_conf) trainer = ClientTrainer( @@ -133,5 +120,14 @@ def server_setup(self, stage="train"): training_args = Seq2SeqTrainingArguments(**self.training_args_conf) seed_candidates = build_seed_candidates(fedkseed_args.k, low=0, high=2 ** 32) - trainer = Trainer(ctx=ctx, seed_candidates=seed_candidates, args=training_args) + trainer = Trainer(ctx=ctx, seed_candidates=seed_candidates, args=training_args, fedkseed_args=fedkseed_args) return trainer + + +def maybe_loader_load_from_conf(conf): + from fate_llm.model_zoo.hf_model import HFAutoModelForCausalLM + + model = loader_load_from_conf(conf) + if isinstance(model, HFAutoModelForCausalLM): + model = model.load() + return model From af63457c3d1b178aec836979a720eb66b13ccea2 Mon 
Sep 17 00:00:00 2001 From: sagewe Date: Tue, 27 Feb 2024 12:45:49 +0800 Subject: [PATCH 21/35] chore: add log (#47) Signed-off-by: sagewe --- python/fate_llm/dataset/hf_dataset.py | 4 +--- python/fate_llm/runner/fedkseed_runner.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/fate_llm/dataset/hf_dataset.py b/python/fate_llm/dataset/hf_dataset.py index 646401b..d38a09e 100644 --- a/python/fate_llm/dataset/hf_dataset.py +++ b/python/fate_llm/dataset/hf_dataset.py @@ -17,6 +17,7 @@ from typing import Optional, Union, Sequence, Mapping, Dict from datasets import load_dataset, Features, Split, DownloadConfig, DownloadMode, VerificationMode, Version +from transformers import AutoTokenizer from fate.ml.nn.dataset.base import Dataset @@ -176,10 +177,7 @@ def _add_text(rec): dataset = dataset.map(_add_text) - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(**self.tokenizer_params) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" def tokenize_function(examples): return tokenizer(examples["text"], **self.tokenizer_apply_params) diff --git a/python/fate_llm/runner/fedkseed_runner.py b/python/fate_llm/runner/fedkseed_runner.py index 0af057a..f48b9e8 100644 --- a/python/fate_llm/runner/fedkseed_runner.py +++ b/python/fate_llm/runner/fedkseed_runner.py @@ -88,16 +88,16 @@ def client_setup(self, train_set=None, validate_set=None, output_dir=None, saved output_dir = "./" tokenizer = transformers.AutoTokenizer.from_pretrained(**self.data_collator_conf["kwargs"]["tokenizer_params"]) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "right" data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) dir_warning(self.training_args_conf) + training_args = Seq2SeqTrainingArguments(**self.training_args_conf) self.training_args = training_args training_args.output_dir = output_dir fedkseed_args = FedKSeedTrainingArguments(**self.fed_args_conf) - training_args = Seq2SeqTrainingArguments(**self.training_args_conf) + logger.debug(f"training_args: {training_args}") + logger.debug(f"fedkseed_args: {fedkseed_args}") trainer = ClientTrainer( ctx=ctx, model=model, From 50fa2421233dfefd60eb21ef0e8434282c361d0d Mon Sep 17 00:00:00 2001 From: cwj Date: Tue, 27 Feb 2024 15:35:14 +0800 Subject: [PATCH 22/35] Fix multi-GPU bugs Signed-off-by: weijingchen Signed-off-by: cwj --- python/fate_llm/homo/fedavg.py | 1 + python/fate_llm/homo/offsite_tuning.py | 44 +++++++++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/python/fate_llm/homo/fedavg.py b/python/fate_llm/homo/fedavg.py index 82e0190..be28d10 100644 --- a/python/fate_llm/homo/fedavg.py +++ b/python/fate_llm/homo/fedavg.py @@ -29,6 +29,7 @@ from transformers import TrainerState, TrainerControl, PreTrainedTokenizer, EvalPrediction + logger = logging.getLogger(__name__) diff --git a/python/fate_llm/homo/offsite_tuning.py b/python/fate_llm/homo/offsite_tuning.py index 1fa3ae3..109a2e7 100644 --- a/python/fate_llm/homo/offsite_tuning.py +++ b/python/fate_llm/homo/offsite_tuning.py @@ -14,6 +14,8 @@ from fate_llm.model_zoo.offsite_tuning.offsite_tuning_model import OffsiteTuningBaseModel import logging import torch +import torch.distributed as dist +from transformers.modeling_utils import unwrap_model logger = logging.getLogger(__name__) @@ -66,15 +68,38 @@ def __init__( ) self._aggregate_model = aggregate_model + + def _share_model(self, model, args: Seq2SeqTrainingArguments, 
sync_trainable_only=True): + + if args.local_rank == 0: + for p in model.parameters(): + if (not sync_trainable_only) or (sync_trainable_only and p.requires_grad): + scatter_list = [p.data for _ in range(args.world_size)] + dist.scatter(p.data, scatter_list, async_op=False) + else: + for p in model.parameters(): + if (not sync_trainable_only) or (sync_trainable_only and p.requires_grad): + dist.scatter(p.data, src=0, async_op=False) + def on_train_begin(self, ctx: Context, aggregator: Aggregator, fed_args: FedArguments, args: TrainingArguments, model: Module = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None, dataloader: Tuple[DataLoader]= None, control: TrainerControl= None, state: TrainerState = None, **kwargs): - logger.info('receving weights from server') - parameters_to_get = ctx.arbiter.get('sub_model_para') - model.load_submodel_weights(parameters_to_get) - logger.info('received submodel weigths from the server') + if args.local_rank == 0: # master + logger.info('receving weights from server') + parameters_to_get = ctx.arbiter.get('sub_model_para') + model = unwrap_model(model) + model.load_submodel_weights(parameters_to_get) + logger.info('received submodel weigths from the server') + if args.world_size > 1: + self._share_model(model, args) + logger.info('sharing model parameters done') + else: + if args.world_size > 1: + model = unwrap_model(model) + self._share_model(model, args) + logger.info('sharing model parameters done') def on_federation( self, @@ -98,10 +123,13 @@ def on_train_end(self, ctx: Context, aggregator: Aggregator, fed_args: FedArgume args: TrainingArguments, model: OffsiteTuningBaseModel = None, optimizer: Optimizer = None, scheduler: _LRScheduler = None, dataloader: Tuple[DataLoader]= None, control: TrainerControl= None, state: TrainerState = None, **kwargs): - logger.info('receving weights from server') - return_weights = model.get_submodel_weights() - ctx.arbiter.put('trained_sub_model_para', return_weights) - logger.info('weights sent back to the server') + + if args.local_rank == 0: + if args.world_size > 1: + model = unwrap_model(model) + return_weights = model.get_submodel_weights() + ctx.arbiter.put('trained_sub_model_para', return_weights) + logger.info('weights sent back to the server') def init_aggregator(self, ctx: Context, fed_args: FedArguments): if self._aggregate_model: From 7ad859f88d6c4e19d093493f4037c06831cd88e9 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Thu, 29 Feb 2024 17:31:13 +0800 Subject: [PATCH 23/35] fix pellm Signed-off-by: mgqa34 --- python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py index cd93938..1c18a72 100644 --- a/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py +++ b/python/fate_llm/model_zoo/pellm/parameter_efficient_llm.py @@ -121,10 +121,10 @@ def model_summary(self): def forward(self, *args, **kwargs): forward_ret = self._pe_lm.forward(*args, **kwargs) - if self.peft_config.task_type == TaskType.SEQ_CLS: - return forward_ret.logits - else: + if self.peft_config is None or self.peft_config.task_type != TaskType.SEQ_CLS: return forward_ret + else: + return forward_ret.logits def save_trainable(self, output_path): state_dict = { From f6924c463891cc2ae59640f655d49fe7dbfb479c Mon Sep 17 00:00:00 2001 From: sagewe Date: Thu, 29 Feb 2024 19:28:17 +0800 Subject: [PATCH 24/35] docs: add fedkseed (#47) 
Signed-off-by: sagewe --- doc/tutorial/fedkseed/fedkseed-example.ipynb | 389 +++++++++++++++++++ 1 file changed, 389 insertions(+) create mode 100644 doc/tutorial/fedkseed/fedkseed-example.ipynb diff --git a/doc/tutorial/fedkseed/fedkseed-example.ipynb b/doc/tutorial/fedkseed/fedkseed-example.ipynb new file mode 100644 index 0000000..a20002b --- /dev/null +++ b/doc/tutorial/fedkseed/fedkseed-example.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Federated Tuning with FedKSeed methods in FATE-LLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will demonstrate how to efficiently train federated large language models using the FATE-LLM framework. In FATE-LLM, we introduce the \"FedKSeed\" module, specifically designed for federated learning with large language models. The Idea of FedKSeed is to use Zeroth-Order-Optimizer to optimize model along given direction that generated with random seed. This method can be used to train large language models in a federated learning setting with extremely low communication cost.\n", + "\n", + "The Algorithm is based on the paper: [Federated Full-Parameter Tuning of Billion-Sized Language Models\n", + "with Communication Cost under 18 Kilobytes](https://arxiv.org/pdf/2312.06353.pdf) and the code is modified from the https://github.com/alibaba/FederatedScope/tree/FedKSeed. We refactor the code to make it more compatible with (transformers/PyTorch) framework and integrate it into the FATE-LLM framework.\n", + "\n", + "The main works include:\n", + "1. An KSeedZerothOrderOptimizer class that can be used to optimize model along given direction that generated with random seed.\n", + "2. An KSeedZOExtendedTrainer subclass of Trainer from transformers that can be used to train large language models with KSeedZerothOrderOptimizer.\n", + "3. Trainers for federated learning with large language models.\n", + "\n", + "In this tutorial, we will demonstrate how to use the FedKSeed method to train a large language model in a federated learning setting. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model: datajuicer/LLaMA-1B-dj-refine-150B\n", + "\n", + "This is the introduction from the Huggingface model hub: [datajuicer/LLaMA-1B-dj-refine-150B](https://huggingface.co/datajuicer/LLaMA-1B-dj-refine-150B)\n", + "\n", + "> The model architecture is LLaMA-1.3B and we adopt the OpenLLaMA implementation. The model is pre-trained on 150B tokens of Data-Juicer's refined RedPajama and Pile. 
It achieves an average score of 34.21 over 16 HELM tasks, beating Falcon-1.3B (trained on 350B tokens from RefinedWeb), Pythia-1.4B (trained on 300B tokens from original Pile) and Open-LLaMA-1.3B (trained on 150B tokens from original RedPajama and Pile).\n", + "\n", + "> For more details, please refer to our [paper](https://arxiv.org/abs/2309.02033).\n" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "# model_name_or_path = \"datajuicer/LLaMA-1B-dj-refine-150B\"\n", + "model_name_or_path = \"gpt2\"" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-29T09:27:23.512735Z", + "start_time": "2024-02-29T09:27:23.508790Z" + } + }, + "execution_count": 1 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset: databricks/databricks-dolly-15k\n", + "\n", + "This is the introduction from the Huggingface dataset hub: [databricks/databricks-dolly-15k](https://huggingface.co/dataset/databricks/databricks-dolly-15k)\n", + "\n", + "> databricks-dolly-15k is a corpus of more than 15,000 records generated by thousands of Databricks employees to enable large language models to exhibit the magical interactivity of ChatGPT. Databricks employees were invited to create prompt / response pairs in each of eight different instruction categories, including the seven outlined in the InstructGPT paper, as well as an open-ended free-form category. The contributors were instructed to avoid using information from any source on the web with the exception of Wikipedia (for particular subsets of instruction categories), and explicitly instructed to avoid using generative AI in formulating instructions or responses. Examples of each behavior were provided to motivate the types of questions and instructions appropriate to each category\n", + "\n", + "To use this dataset, you first need to download it from the Huggingface dataset hub:\n", + "\n", + "```bash\n", + "mkdir -p ../../../examples/data/dolly && cd ../../../examples/data/dolly && wget wget https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl\\?download\\=true -O databricks-dolly-15k.jsonl\n", + "```\n", + "\n", + "### Check Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2024-02-29T09:27:26.987779Z", + "start_time": "2024-02-29T09:27:24.706218Z" + } + }, + "outputs": [], + "source": [ + "from fate_llm.dataset.hf_dataset import Dolly15K\n", + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n", + "special_tokens = tokenizer.special_tokens_map\n", + "if \"pad_token\" not in tokenizer.special_tokens_map:\n", + " special_tokens[\"pad_token\"] = special_tokens[\"eos_token\"]\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "ds = Dolly15K(split=\"train\", tokenizer_params={\"pretrained_model_name_or_path\": model_name_or_path, **special_tokens},\n", + " tokenizer_apply_params=dict(truncation=True, max_length=tokenizer.model_max_length, padding=\"max_length\", return_tensors=\"pt\"))\n", + "ds = ds.load('../../../examples/data/dolly')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2024-02-29T09:27:27.875025Z", + "start_time": "2024-02-29T09:27:27.867839Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "Dataset({\n features: ['instruction', 'context', 'response', 'category', 'text', 
'input_ids', 'attention_mask'],\n num_rows: 15011\n})" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more details of FATE-LLM dataset setting, we recommend that you read through these tutorials first: [NN Dataset Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-your-Dataset.ipynb), [Some Built-In Dataset](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Introduce-Built-In-Dataset.ipynb)," + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check local training\n", + "\n", + "Before submitting a federated learning task, we will demonstrate how to perform local testing to ensure the proper functionality of your custom dataset, model. " + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling\n", + "from fate_llm.fedkseed.trainer import KSeedZOExtendedTrainer, KSeedTrainingArguments\n", + "from fate_llm.fedkseed.zo_utils import build_seed_candidates, get_even_seed_probabilities\n", + "\n", + "def test_training(zo_mode=True):\n", + " tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path, **special_tokens)\n", + " data_collector = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n", + " model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n", + "\n", + " training_args = TrainingArguments(output_dir='./',\n", + " dataloader_num_workers=1,\n", + " dataloader_prefetch_factor=1,\n", + " remove_unused_columns=True,\n", + " learning_rate=1e-5,\n", + " per_device_train_batch_size=1,\n", + " num_train_epochs=0.01,\n", + " )\n", + " kseed_args = KSeedTrainingArguments(zo_optim=zo_mode)\n", + " trainer = KSeedZOExtendedTrainer(model=model, train_dataset=ds, training_args=training_args, kseed_args=kseed_args,\n", + " tokenizer=tokenizer, data_collator=data_collector)\n", + " if zo_mode:\n", + " seed_candidates = build_seed_candidates(k=kseed_args.k)\n", + " seed_probabilities = get_even_seed_probabilities(k=kseed_args.k)\n", + " trainer.configure_seed_candidates(seed_candidates, seed_probabilities)\n", + " return trainer.train()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-29T09:38:33.175079Z", + "start_time": "2024-02-29T09:38:33.168844Z" + } + }, + "execution_count": 16 + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2024-02-29T09:39:37.602070Z", + "start_time": "2024-02-29T09:38:34.024223Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n
\n [151/151 00:59, Epoch 0/1]\n Step | Training Loss (HTML progress table output omitted)\n
" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": "TrainOutput(global_step=151, training_loss=1.2660519429390005, metrics={'train_runtime': 61.8249, 'train_samples_per_second': 2.428, 'train_steps_per_second': 2.442, 'total_flos': 78910193664000.0, 'train_loss': 1.2660519429390005, 'epoch': 0.01})" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_training(zo_mode=True)" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n

\n [151/151 01:29, Epoch 0/1]\n Step | Training Loss (HTML progress table output omitted)\n
" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": "TrainOutput(global_step=151, training_loss=0.6093456950408733, metrics={'train_runtime': 92.6158, 'train_samples_per_second': 1.621, 'train_steps_per_second': 1.63, 'total_flos': 78910193664000.0, 'train_loss': 0.6093456950408733, 'epoch': 0.01})" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_training(zo_mode=False)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-29T09:41:28.949449Z", + "start_time": "2024-02-29T09:39:54.802705Z" + } + }, + "execution_count": 18 + }, + { + "cell_type": "markdown", + "source": [ + "You can see that Zeroth-Order-Optimizer has much worse performance than AdamW, that's the price we need to pay for the low communication cost. " + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submit Federated Task\n", + "Once you have successfully completed local testing, We can submit a task to FATE. Please notice that this tutorial is ran on a standalone version. **Please notice that in this tutorial we are using a standalone version, if you are using a cluster version, you need to bind the data with the corresponding name&namespace on each machine.**\n", + "\n", + "In this example we load pretrained weights for gpt2 model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from fate_client.pipeline.components.fate.reader import Reader\n", + "from fate_client.pipeline import FateFlowPipeline\n", + "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_seq2seq_runner\n", + "from fate_client.pipeline.components.fate.nn.algo_params import TrainingArguments, FedAVGArguments\n", + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", + "\n", + "guest = '10000'\n", + "host = '10000'\n", + "arbiter = '10000'\n", + "\n", + "epochs = 0.01\n", + "batch_size = 1\n", + "lr = 1e-5\n", + "\n", + "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", + "pipeline.bind_local_path(path=\"/data/projects/fate/examples/data/dolly\", namespace=\"experiment\",\n", + " name=\"dolly\")\n", + "time.sleep(5)\n", + "\n", + "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", + "reader_0.guest.task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"dolly\"\n", + ")\n", + "reader_0.hosts[0].task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"dolly\"\n", + ")\n", + "\n", + "tokenizer_params = dict(\n", + " pretrained_model_name_or_path=\"gpt2\",\n", + " trust_remote_code=True,\n", + ")\n", + "conf = get_config_of_seq2seq_runner(\n", + " algo='fedkseed',\n", + " model=LLMModelLoader(\n", + " \"hf_model\",\n", + " \"HFAutoModelForCausalLM\",\n", + " # pretrained_model_name_or_path=\"datajuicer/LLaMA-1B-dj-refine-150B\",\n", + " pretrained_model_name_or_path=\"gpt2\",\n", + " trust_remote_code=True\n", + " ),\n", + " dataset=LLMDatasetLoader(\n", + " \"hf_dataset\",\n", + " \"Dolly15K\",\n", + " split=\"train\",\n", + " tokenizer_params=tokenizer_params,\n", + " tokenizer_apply_params=dict(\n", + " truncation=True,\n", + " max_length=1024,\n", + " )),\n", + " data_collator=LLMDataFuncLoader(\n", + " \"cust_func.cust_data_collator\",\n", + " \"get_seq2seq_tokenizer\",\n", + " 
tokenizer_params=tokenizer_params,\n", + " ),\n", + " training_args=TrainingArguments(\n", + " num_train_epochs=0.01,\n", + " per_device_train_batch_size=batch_size,\n", + " remove_unused_columns=True,\n", + " learning_rate=lr,\n", + " fp16=False,\n", + " use_cpu=False,\n", + " disable_tqdm=False,\n", + " use_mps_device=True,\n", + " ),\n", + " fed_args=FedAVGArguments(),\n", + " task_type='causal_lm',\n", + " save_trainable_weights_only=True,\n", + ")\n", + "\n", + "conf[\"fed_args_conf\"] = {}\n", + "\n", + "homo_nn_0 = HomoNN(\n", + " 'nn_0',\n", + " runner_conf=conf,\n", + " train_data=reader_0.outputs[\"output_data\"],\n", + " runner_module=\"fedkseed_runner\",\n", + " runner_class=\"FedKSeedRunner\",\n", + ")\n", + "\n", + "pipeline.add_tasks([reader_0, homo_nn_0])\n", + "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1}))\n", + "\n", + "pipeline.compile()\n", + "pipeline.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b84c88f671b8a91603b6a0a1dd469344c39eeeb6 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Thu, 29 Feb 2024 20:55:33 +0800 Subject: [PATCH 25/35] update doc of pellm Signed-off-by: mgqa34 --- doc/tutorial/builtin_models.md | 21 - doc/tutorial/builtin_pellm_models.md | 22 + doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb | 828 ------------------ .../ChatGLM3-6B_ds.ipynb | 558 ++++++++++++ .../GPT2-example.ipynb | 673 -------------- 5 files changed, 580 insertions(+), 1522 deletions(-) delete mode 100644 doc/tutorial/builtin_models.md create mode 100644 doc/tutorial/builtin_pellm_models.md delete mode 100644 doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb create mode 100644 doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb delete mode 100644 doc/tutorial/parameter_efficient_llm/GPT2-example.ipynb diff --git a/doc/tutorial/builtin_models.md b/doc/tutorial/builtin_models.md deleted file mode 100644 index 5069246..0000000 --- a/doc/tutorial/builtin_models.md +++ /dev/null @@ -1,21 +0,0 @@ -## Builtin Models -FATE-LLM provide some builtin models, users can use them simply to efficiently train their language models. -To use these models, please read the using tutorial of [ChatGLM-6B Training Guide](./ChatGLM-6B_ds.ipynb) and [GPT2 Training Guide](GPT2-example.ipynb). -After reading the training tutorial above, it's easy to use other models listing in the following tabular by changing `module_name`, `class_name`, `dataset` to `ModuleName`, `ClassName`, `DatasetName` respectively list below. 
- 
- 
- 
-| Model          | ModuleName        | ClassName                         | DataSetName      |
-| -------------- | ----------------- | --------------------------------- | ---------------- |
-| Bloom-7B1      | pellm.bloom       | BloomForCausalLM                  | prompt_tokenizer |
-| LLaMA-2-7B     | pellm.llama       | LLAMAForCausalLM                  | prompt_tokenizer |
-| LLaMA-7B       | pellm.llama       | LLAMAForCausalLM                  | prompt_tokenizer |
-| ChatGLM2-6B    | pellm.chatglm     | ChatGLMForConditionalGeneration   | glm_tokenizer    |
-| ChatGLM-6B     | pellm.chatglm     | ChatGLMForConditionalGeneration   | glm_tokenizer    |
-| GPT-2          | pellm.gpt2        | GPT2                              | nlp_tokenizer    |
-| ALBERT         | pellm.albert      | Albert                            | nlp_tokenizer    |
-| BART           | pellm.bart        | Bart                              | nlp_tokenizer    |
-| BERT           | pellm.bert        | Bert                              | nlp_tokenizer    |
-| DeBERTa        | pellm.deberta     | Deberta                           | nlp_tokenizer    |
-| DistilBERT     | pellm.distilbert  | DistilBert                        | nlp_tokenizer    |
-| RoBERTa        | pellm.roberta     | Roberta                           | nlp_tokenizer    |
diff --git a/doc/tutorial/builtin_pellm_models.md b/doc/tutorial/builtin_pellm_models.md
new file mode 100644
index 0000000..70c3f37
--- /dev/null
+++ b/doc/tutorial/builtin_pellm_models.md
@@ -0,0 +1,22 @@
+## Builtin PELLM Models
+FATE-LLM provides some builtin PELLM models that users can use directly to train their language models efficiently.
+To use these models, please read the [ChatGLM3-6B Training Guide](./parameter_efficient_llm/ChatGLM3-6B_ds.ipynb) tutorial first.
+After reading the training tutorial above, it is easy to use the other models listed in the following table by changing `module_name`, `class_name`, and `dataset` to the `ModuleName`, `ClassName`, and `DataSetName` values listed below.
+
+
+
+| Model          | ModuleName        | ClassName     | DataSetName     |
+| -------------- | ----------------- | ------------- | --------------- |
+| Qwen2          | pellm.qwen        | Qwen          | prompt_dataset  |
+| Bloom-7B1      | pellm.bloom       | Bloom         | prompt_dataset  |
+| LLaMA-2-7B     | pellm.llama       | LLaMa         | prompt_dataset  |
+| LLaMA-7B       | pellm.llama       | LLaMa         | prompt_dataset  |
+| ChatGLM3-6B    | pellm.chatglm     | ChatGLM       | prompt_dataset  |
+| ChatGLM-6B     | pellm.chatglm     | ChatGLM       | prompt_dataset  |
+| GPT-2          | pellm.gpt2        | GPT2          | seq_cls_dataset |
+| ALBERT         | pellm.albert      | Albert        | seq_cls_dataset |
+| BART           | pellm.bart        | Bart          | seq_cls_dataset |
+| BERT           | pellm.bert        | Bert          | seq_cls_dataset |
+| DeBERTa        | pellm.deberta     | Deberta       | seq_cls_dataset |
+| DistilBERT     | pellm.distilbert  | DistilBert    | seq_cls_dataset |
+| RoBERTa        | pellm.roberta     | Roberta       | seq_cls_dataset |
diff --git a/doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb b/doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb
deleted file mode 100644
index 8b32781..0000000
--- a/doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb
+++ /dev/null
@@ -1,828 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# FedIPR Tutorial: Guide to Adding Watermarks to Image and Language Models"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this tutorial, you'll learn how to add both backdoor-based and feature-based watermarks to your models in the federated training. \n",
-    "We'll dive into using backdoor-watermark datasets for backdoor-based watermarking and exploring signblock—a tool that learns feature-based watermarks during traning. We will show you how to apply these techniques to both computer vision and language models. We'll also offer a hands-on example with a CV task, share how to verify the watermarks you've embedded, and introduce some ready-to-use models provided by the FATE framework. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FedIPR Introduction\n", - "FedIPR stands for Federated Intellectual Property Rights, a technology designed to protect the ownership of models developed under federated conditions. At its core, the FedIPR approach is described in the original paper [FedIPR](https://arxiv.org/pdf/2109.13236.pdf), introducing two primary watermarking techniques to safeguard your model: Backdoor-based and Feature-based watermarks.\n", - "\n", - "Backdoor-based methods: These methods use specific input triggers to produce intentional, incorrect labels. The goal here is to create a unique \"signature\" for the model, allowing for ownership verification through remote APIs, without requiring access to the model's internal parameters.\n", - "\n", - "Feature-based methods: These techniques encode designated binary strings as watermarks directly into the model's layer parameters. Various schemes have been proposed, such as embedding these watermarks into convolution layer weights using a binary cross-entropy loss function, or into normalization layer scale parameters using a hinge-like regularization term. In our implementations, we embed signatures into normalization layers as the same as \n", - "\n", - "Through these watermarking techniques, FedIPR ensures a robust way to assert ownership of your federated models without compromising their performance." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preliminary\n", - "\n", - "We strongly recommend you finish reading our NN tutorial to get familiar with Model and Dataset customizations: [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/README.md)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Backdoor Dataset for Backdoor Watermark" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can develop your own backdoor dataset and use it in FedIPRTrainer. If watermark dataset is detected, it will be used to train models along with your task dataset. If not provided, it will perform normal training.\n", - "\n", - "You can add python path so that you can run codes in the notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "your_path_to_fate_python = 'xxx/fate/python'\n", - "sys.path.append(your_path_to_fate_python)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Interfaces" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The WaterMarkDataset class serves as a base class for handling watermark datasets in federated learning environments. It’s crucial for you to implement the load method. The primary task when subclassing WaterMarkDataset is to fill in the load method. This method should take a path argument and use it to load both your normal and watermark datasets.\n", - "\n", - "Besides you need to implement other interfaces like get_item, len like using a pytorch dataset to make it work correctly in FATE.\n", - "You can refer to this tutorial: [Dataset Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-your-Dataset.ipynb)\n", - "\n", - "Here show you the source code of the watermark dataset class." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from federatedml.nn.dataset.base import Dataset\n", - "from federatedml.util import LOGGER\n", - "from federatedml.nn.dataset.image import ImageDataset\n", - "\n", - "\n", - "class WaterMarkDataset(Dataset):\n", - "\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.normal_dataset = None\n", - " self.watermark_dataset = None\n", - "\n", - " def load(self, path):\n", - " raise NotImplementedError()\n", - "\n", - " def get_normal_dataset(self):\n", - " return self.normal_dataset\n", - "\n", - " def get_watermark_dataset(self):\n", - " return self.watermark_dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To make you better understand how our watermark dataset work, here we show the implementation of load function of our built-in WaterMarkImageDataset.\n", - "The WaterMarkImageDataset class is designed to automatically identify and load two distinct folders from the specified file path: one containing 'normal' training samples and another containing 'watermark' trigger samples." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def load(self, file_path):\n", - "\n", - " # normal dataset path\n", - " normal_path = os.path.join(file_path, self.normal_folder_name)\n", - " # watermark dataset path\n", - " watermark_path = os.path.join(file_path, self.watermark_folder_name)\n", - "\n", - " # load normal dataset\n", - " self.normal_dataset = ImageDataset(\n", - " center_crop=self.center_crop,\n", - " center_crop_shape=self.size,\n", - " generate_id_from_file_name=self.generate_id_from_file_name,\n", - " file_suffix=self.file_suffix,\n", - " float64=self.float64,\n", - " label_dtype=self.label_type\n", - " )\n", - " if os.path.exists(normal_path):\n", - " self.normal_dataset.load(normal_path)\n", - " else:\n", - " self.normal_dataset = None\n", - " LOGGER.info(\n", - " f'normal dataset not found in {normal_path}, will not load normal dataset')\n", - " # load watermark dataset\n", - " self.watermark_dataset = ImageDataset(\n", - " center_crop=self.center_crop,\n", - " center_crop_shape=self.size,\n", - " generate_id_from_file_name=self.generate_id_from_file_name,\n", - " file_suffix=self.file_suffix,\n", - " float64=self.float64,\n", - " label_dtype=self.label_type\n", - " )\n", - " if os.path.exists(watermark_path):\n", - " self.watermark_dataset.load(watermark_path)\n", - " else:\n", - " self.watermark_dataset = None\n", - " LOGGER.info(\n", - " f'watermark dataset not found in {watermark_path}, will not load watermark dataset')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can try our WaterMarkImageDataset: use it load our provided cifar-10 watermarked dataset which contains 100 trigger samples.Each image in these folders has been augmented with a pattern of structured noise in one corner. 
Download the dataset and place it in example/data folder in your fate project: [Dowload Path]()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.dataset.watermark import WaterMarkImageDataset\n", - "\n", - "ds = WaterMarkImageDataset()\n", - "ds.load('../../../examples/data/cifar_10_ipr/fedipr_cifar10_guest/')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset ImageFolder\n", - " Number of datapoints: 25000\n", - " Root location: ../../examples/data/cifar_10_ipr/fedipr_cifar10_guest/normal\n", - " StandardTransform\n", - "Transform: Compose(\n", - " ToTensor()\n", - " )" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds.get_normal_dataset()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset ImageFolder\n", - " Number of datapoints: 100\n", - " Root location: ../../examples/data/cifar_10_ipr/fedipr_cifar10_guest/watermark\n", - " StandardTransform\n", - "Transform: Compose(\n", - " ToTensor()\n", - " )" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds.get_watermark_dataset() # water mark dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "25100" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point, you can now customize a watermark dataset for your own tasks to add watermarks to your models. In the upcoming CIFAR-10 task, we will be using FATE's built-in image watermark dataset class." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Built-in BacthNorm and LayerNorm Blocks for Feature-based Watermark" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this section, we will delve into the workings of feature-based watermarking. Feature-based watermarking involves embedding binary watermarks vectors into specific model parameters. In FATE implementations, we use the same design as the FATE-IPR paper: In the case of CNN, binary water mark are embeded into BatchNorm Layer. In transformers, watermarks are embeded into LayerNorm layers.\n", - "\n", - "You can use SignatureConv, SignatureLayerNorm to build your model. Once these blocks are detected in the FedIPR trainer, trainer will automatically assign binary watermark vector whose bit length is computed by Equation (15) in the origin paper.\n", - "\n", - "You can import them from:model's proprietary elements." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.model_zoo.ipr.sign_block import SignatureConv, SignatureLayerNorm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we show you the source code of our built in alexnet and distilbert to show you how to quickly build a model with featurebased watermark:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import torch.nn as nn\n", - "from fate_llm.model_zoo.ipr.sign_block import SignatureConv, ConvBlock\n", - "\n", - "\n", - "class SignAlexNet(nn.Module):\n", - "\n", - " \"\"\"\n", - " This is a modified Alexnet: its 4,5,6 layers are replaced by Singnature Conv Block\n", - " \"\"\"\n", - "\n", - " def __init__(self, num_classes):\n", - " super().__init__()\n", - " in_channels = 3\n", - " maxpoolidx = [1, 3, 7]\n", - " signed_layer = [4, 5, 6]\n", - " layers = []\n", - " inp = in_channels\n", - "\n", - " # channels & kennel size\n", - " # the same setting as the FedIPR paper\n", - " oups = {\n", - " 0: 64,\n", - " 2: 192,\n", - " 4: 384,\n", - " 5: 256,\n", - " 6: 256\n", - " }\n", - " kp = {\n", - " 0: (5, 2),\n", - " 2: (5, 2),\n", - " 4: (3, 1),\n", - " 5: (3, 1),\n", - " 6: (3, 1)\n", - " }\n", - "\n", - " for layeridx in range(8):\n", - " if layeridx in maxpoolidx:\n", - " layers.append(nn.MaxPool2d(2, 2))\n", - " else:\n", - " k = kp[layeridx][0]\n", - " p = kp[layeridx][1]\n", - " if layeridx in signed_layer:\n", - " layers.append(SignatureConv(inp, oups[layeridx], k, 1, p))\n", - " else:\n", - " layers.append(ConvBlock(inp, oups[layeridx], k, 1, p))\n", - " inp = oups[layeridx]\n", - "\n", - " self.features = nn.Sequential(*layers)\n", - " self.classifier = nn.Linear(4 * 4 * 256, num_classes)\n", - "\n", - " def forward(self, x):\n", - " for m in self.features:\n", - " x = m(x)\n", - " x = x.view(x.size(0), -1)\n", - " x = self.classifier(x)\n", - " if self.training:\n", - " return x\n", - " else: # Sofmax\n", - " return nn.functional.softmax(x, dim=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By inserting signconv block you can easily build a cv model with feature-based signature, in the case of NLP models, by useing 'recursive_replace_layernorm' you can quickly replace the original LayerNorm with our sign layernorm. 
Codes below show that you can quickly add feature-based watermarks to a huggingface pretraind model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from torch.nn import Module\n", - "from transformers import DistilBertForSequenceClassification, DistilBertForTokenClassification\n", - "from fate_llm.model_zoo.ipr.sign_block import recursive_replace_layernorm\n", - "\n", - "\n", - "class SignDistilBertForTokenClassification(Module):\n", - "\n", - " def __init__(self, model_path=None, num_labels=4) -> None:\n", - " super().__init__()\n", - " if model_path is None:\n", - " model_path = 'distilbert-base-uncased'\n", - "\n", - " self.model_path = model_path\n", - " self.model = DistilBertForTokenClassification.from_pretrained(\n", - " model_path, num_labels=num_labels)\n", - "\n", - " # replace layernorm by SignatureLayerNorm\n", - " sub_distilbert = self.model.distilbert.transformer.layer[3:] # replace layernorm by SingLayerNorm in the last 3 layer\n", - " recursive_replace_layernorm(\n", - " sub_distilbert,\n", - " layer_name_set={'output_layer_norm'})\n", - "\n", - " def forward(self, input_dict):\n", - " return self.model(**input_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Verify Feature-based watermark with our tools\n", - "\n", - "After training is done, feature-based watermarks' signatures will be saved together with model. You can use our tool to verify the model ownership." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.trainer.fedipr_trainer import verify_feature_based_signature" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "See the example below for usage." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FedIPR on FATE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In FATE-LLM-1.3’s model_zoo we have these built-in models which are automatically integrated with feature-based watermarking capabilities:\n", - "\n", - "#### Model List\n", - "\n", - "- `alexnet.py` - Alexnet\n", - "- `resnet.py` - Resnet18\n", - "- `distilbert.py` - Distilbert (Configurations match those in the FedIPR paper)\n", - "- `gpt2.py` - Standard GPT-2 (Watermarks are added to the last 2 transformer layers)\n", - "t.py`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have verified the effectiveness of our watermarking features through a series of tests:\n", - "- For computer vision tasks, we evaluated both backdoor watermarking and feature-based watermarking techniques on the CIFAR-10 and CIFAR-100 datasets. Our testing involved the use of ResNet and AlexNet models.\n", - "- For natural language processing tasks, we assessed the performance of DistilBERT and GPT2 models on the IMDB & CoNLL-2003 datasets, which are sequence classification tasn and token classification respectively. \n", - "During the testing phase, the sign bit was automatically allocated, and the data was evenly divided between the guest and host parties. 
For backdoor watermarking, each party supplied 100 trigger samples, all of which were augmented with noises.\n", - "\n", - "Here we display the results of the experiments:\n", - "\n", - "AlexNet & Resnet:\n", - "\n", - "| Test Configuration | AlexNet Feature-Based Watermark Accuracy | AlexNet Backdoor Watermark Accuracy | ResNet18 Feature-Based Watermark Accuracy | ResNet18 Backdoor Watermark Accuracy |\n", - "|--------------------|-----------------------------------------|------------------------------------|------------------------------------------|-------------------------------------|\n", - "| Two-party federation on CIFAR-10 with 100 trigger samples, SignBit auto-assigned | 1.0 (All Parties) | 1.0 (All Parties) | 1.0 (All Parties) | 1.0 (All Parties) |\n", - "| Two-party federation on CIFAR-100 with 100 trigger samples, SignBit auto-assigned | 1.0 (All Parties) | 1.0 (Guest), 0.991 (Host) | 1.0 (All Parties) | 1.0 (All Parties) |\n", - "\n", - "DistilBert & GPT2:\n", - "\n", - "| Test Configuration | DistillBERT Feature-Based Watermark Accuracy | GPT-2 Feature-Based Watermark Accuracy |\n", - "|--------------------|----------------------------------------------|---------------------------------------|\n", - "| Two-party federation on CoNLL-2003 Token Classification with SignBit auto-assigned | 1.0 (All Parties) | 1.0 (All Parties) |\n", - "| Two-party federation on IMDB Classification with SignBit auto-assigned | 1.0 (All Parties) | 1.0 (All Parties) |\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## A Cifar-10 Example & Verifying Watermark\n", - "\n", - "At last, we will show you a CV example: we will train a AlexNet with backdoor watermark & feature-based watermark at the same time. And after training is done, we use built in tools to verify feature-based watermark. You can verify the backdoor watermark yourself by simply predicting trigger samples with your models." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### FedIPR Parameters\n", - "\n", - "The FedIPRTrainer's parameters are basically the same as the FedAVGTrainer except for 3 parameters: alpha, verify_freq and backdoor_verify_method\n", - "alpha is the weight for sign loss; verify_freq is the frequency of verifying your watermark during training(you can check result in logs) and backdoor_verify_method allows you to choose the method for verifying your datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class FedIPRTrainer(FedAVGTrainer):\n", - "\n", - " def __init__(self,\n", - " epochs=10,\n", - " noraml_dataset_batch_size=32,\n", - " watermark_dataset_batch_size=2,\n", - " early_stop=None,\n", - " tol=0.0001,\n", - " secure_aggregate=True,\n", - " weighted_aggregation=True,\n", - " aggregate_every_n_epoch=None,\n", - " cuda=None,\n", - " pin_memory=True,\n", - " shuffle=True,\n", - " data_loader_worker=0,\n", - " validation_freqs=None,\n", - " checkpoint_save_freqs=None,\n", - " task_type='auto',\n", - " save_to_local_dir=False,\n", - " collate_fn=None,\n", - " collate_fn_params=None,\n", - " alpha=0.01,\n", - " verify_freqs=1,\n", - " backdoor_verify_method: Literal['accuracy',\n", - " 'loss'] = 'accuracy'):\n", - " ..." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit a pipeline to run FedIPR CV task\n", - "\n", - "This a standalone version example, if you are running on the cluster version, you have to bind name&namespace on guest&host machines correspondingly" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import torch as t\n", - "from torch import nn\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader, Evaluation, DataTransform\n", - "from pipeline.interface import Data, Model\n", - "\n", - "t = fate_torch_hook(t)\n", - "\n", - "import os\n", - "# bind data path to name & namespace\n", - "fate_project_path = os.path.abspath('../../../')\n", - "host = 9997\n", - "guest = 9997\n", - "arbiter = 9997\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host,\n", - " arbiter=arbiter)\n", - "\n", - "data_0 = {\"name\": \"watermark_cifar10_guest\", \"namespace\": \"experiment\"}\n", - "data_1 = {\"name\": \"watermark_cifar10_host\", \"namespace\": \"experiment\"}\n", - "\n", - "data_path_0 = fate_project_path + '/examples/data/cifar_10_ipr/fedipr_cifar10_guest'\n", - "data_path_1 = fate_project_path + '/examples/data/cifar_10_ipr/fedipr_cifar10_host'\n", - "pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path_0)\n", - "pipeline.bind_table(name=data_1['name'], namespace=data_1['namespace'], path=data_path_1)\n", - "\n", - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=data_0)\n", - "reader_0.get_party_instance(role='host', party_id=host).component_param(table=data_1)\n", - "\n", - "from pipeline.component.nn import DatasetParam\n", - "\n", - "dataset_param = DatasetParam(dataset_name='watermark')\n", - "\n", - "from pipeline.component.homo_nn import TrainerParam # Interface\n", - "\n", - "# our simple classification model:\n", - "model = t.nn.CustModel(module_name='ipr.alexnet', class_name='SignAlexNet', num_classes=10)\n", - "\n", - "nn_component = HomoNN(name='nn_0',\n", - " model=model, # model\n", - " dataset=dataset_param, # dataset\n", - " # Notice that for the convenience of getting result model we set save_to_local_dir=True\n", - " trainer=TrainerParam(trainer_name='fedipr_trainer', epochs=5, save_to_local_dir=True, cuda=0),\n", - " optimizer=t.optim.Adam(lr=0.001),\n", - " loss=t.nn.CrossEntropyLoss(),\n", - " torch_seed=100 # random seed\n", - " )\n", - "\n", - "\n", - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", - "pipeline.compile()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline.fit() # submit!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load Model and Verify\n", - "\n", - "Since we enable 'save_to_local_dir', we can directly load trained model from fateflow job folder, and verify its watermarks" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.trainer.fedipr_trainer import verify_feature_based_signature" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "job_id = '202309041103336933850' # your job id\n", - "import os\n", - "fate_project_path = os.path.abspath('../../../')\n", - "local_dir = fate_project_path + '/fateflow/jobs/{}/guest/9997/nn_0/'.format(job_id)\n", - "state_dict = t.load(local_dir + 'model.pkl')" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.model_zoo.ipr.alexnet import SignAlexNet\n", - "\n", - "model = SignAlexNet(num_classes=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.load_state_dict(state_dict['model'])" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "keys = state_dict['extra_data']['keys'] # W and watermark vectors" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'features.4': (tensor([[-7.3380e-02, 1.6275e+00, -1.5404e+00, ..., 3.4250e-01,\n", - " -1.0067e+00, -5.4504e-01],\n", - " [ 2.9928e-01, -4.0935e-01, -6.1239e-01, ..., 7.2356e-01,\n", - " 2.7019e-01, -9.1200e-01],\n", - " [-4.3889e-02, 2.1774e+00, -1.3706e+00, ..., -8.5879e-01,\n", - " 2.3445e-01, 2.0458e+00],\n", - " ...,\n", - " [-5.1755e-01, 5.9240e-01, 2.6353e-01, ..., -1.0465e+00,\n", - " -5.3456e-01, -6.0439e-01],\n", - " [-2.4679e-01, -1.4290e+00, -5.9567e-01, ..., 7.7682e-01,\n", - " -6.2445e-01, 1.3682e+00],\n", - " [ 1.1148e+00, -8.7518e-01, 7.6818e-01, ..., 6.5654e-01,\n", - " -1.8362e+00, -5.5355e-04]]),\n", - " tensor([-1., -1., 1., 1., 1., -1., 1., 1., -1., -1., -1., 1., -1., 1.,\n", - " -1., -1., 1., -1., -1., -1., -1., 1., -1., 1., -1., -1., 1., -1.,\n", - " 1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., -1., -1., 1.,\n", - " 1., -1., 1., -1., -1., 1., -1., 1., -1., -1., -1., 1., -1., -1.,\n", - " -1., -1., -1., -1., 1., -1., -1., 1., 1., -1., -1., 1., -1., -1.,\n", - " 1., 1., 1., -1., -1., -1., -1., 1., -1., 1., -1., 1., -1., 1.,\n", - " -1., -1., 1., -1., -1., -1., -1., 1., -1., 1., 1., -1., -1., 1.,\n", - " -1., -1., 1., -1., -1., -1., -1., 1., 1., 1., -1., -1., 1., -1.,\n", - " -1., -1., 1., -1., 1., -1., -1., -1., -1., 1., -1., 1., 1., -1.,\n", - " 1., 1., 1., -1., 1., 1., -1., -1., 1., 1., -1., -1., -1., 1.,\n", - " -1., -1., -1., -1., 1., -1., -1., 1., 1., 1., -1., 1., -1., 1.,\n", - " 1., 1., -1., 1., 1., 1., 1., 1., 1., 1., -1., -1., -1., 1.,\n", - " -1., -1., -1., -1., -1., -1., -1., -1., -1., 1., -1., -1., 1., 1.,\n", - " 1., -1., 1., -1., -1., -1., 1., 1., 1.])),\n", - " 'features.5': (tensor([[-1.2336, 0.1894, -0.3584, ..., -0.5398, 0.5318, -1.6536],\n", - " [ 0.1128, 0.3999, 1.2841, ..., 1.6082, -0.1920, -0.0636],\n", - " [-0.9447, -0.2025, 0.4786, ..., 1.5100, -0.7834, 0.8102],\n", - " ...,\n", - " [-0.7941, 2.0311, -0.9690, ..., -1.1630, 0.6953, 1.6115],\n", - " 
[ 0.0314, 0.3718, 0.5974, ..., -1.6695, 1.8833, -0.1461],\n", - " [ 0.4956, 0.7747, -0.0847, ..., -0.3533, 0.0763, 0.0952]]),\n", - " tensor([-1., 1., -1., 1., 1., 1., 1., 1., 1., -1., -1., -1., 1., -1.,\n", - " -1., -1., -1., 1., -1., 1., -1., -1., 1., -1., -1., -1., 1., -1.,\n", - " 1., 1., 1., -1., -1., -1., 1., 1., 1., 1., 1., 1., -1., 1.,\n", - " -1., -1., -1., -1., 1., -1., 1., -1., 1., -1., 1., -1., -1., 1.,\n", - " 1., 1., 1., 1., -1., 1., 1., 1., -1., -1., -1., -1., -1., -1.,\n", - " -1., 1., 1., -1., 1., -1., -1., 1., -1., -1., 1., 1., -1., -1.,\n", - " -1., -1., -1., 1., 1., 1., 1., 1., 1., -1., 1., -1., -1., 1.,\n", - " 1., 1., 1., -1., 1., -1., 1., 1., -1., -1., 1., 1., -1., -1.,\n", - " 1., -1., -1., 1., 1., -1., 1., 1., 1., 1., -1., -1., -1., -1.,\n", - " 1.])),\n", - " 'features.6': (tensor([[ 2.6993e+00, 1.0507e+00, -6.6219e-01, ..., 6.3679e-01,\n", - " 7.7061e-01, 1.4231e+00],\n", - " [-1.0477e+00, 2.0904e-01, -3.4522e-01, ..., -4.9581e-01,\n", - " 1.4211e+00, -2.1041e+00],\n", - " [ 1.0036e+00, 1.0025e+00, -2.5215e-03, ..., 1.1413e+00,\n", - " -1.8600e+00, 2.0058e-02],\n", - " ...,\n", - " [ 1.2943e+00, 5.6073e-01, -1.9590e+00, ..., -1.4320e+00,\n", - " -1.6486e+00, -3.0871e-01],\n", - " [ 4.2747e-01, 1.8310e+00, -2.7685e-01, ..., -1.0765e+00,\n", - " -4.6004e-01, 3.6701e-02],\n", - " [-4.9978e-01, 4.4728e-01, -7.3183e-01, ..., 7.5242e-01,\n", - " 8.4118e-01, 8.3414e-02]]),\n", - " tensor([ 1., -1., 1., 1., -1., -1., 1., -1., -1., 1., 1., 1., 1., -1.,\n", - " -1., -1., -1., 1., 1., 1., 1., -1., -1., -1., 1., 1., 1., 1.,\n", - " -1., 1., 1., -1., 1., -1., -1., 1., -1., -1., 1., 1., -1., 1.,\n", - " -1., -1., -1., -1., -1., 1., 1., -1., 1., -1., -1., -1., -1., -1.,\n", - " -1., -1., 1., -1., 1., 1., -1., 1., -1., -1., 1., -1., -1., 1.,\n", - " 1., 1., -1., -1., -1., 1., -1., -1., 1., -1., 1., -1., 1., -1.,\n", - " -1., 1., 1., 1., -1., 1., 1., 1., 1., 1., 1., -1., 1., -1.,\n", - " -1., 1., 1., 1., -1., 1., 1., -1., -1., 1., 1., 1., 1., 1.,\n", - " -1., -1., -1., -1., 1., 1., 1., -1., 1., -1., 1., -1., 1., 1.,\n", - " -1., -1., -1.]))}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "keys" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.0\n" - ] - } - ], - "source": [ - "acc = verify_feature_based_signature(model, keys)\n", - "print(acc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The accuracy is 100%! Congratulations. Now you can use FATE to build your own IPR protected models." 
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb b/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb
new file mode 100644
index 0000000..2b816b5
--- /dev/null
+++ b/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb
@@ -0,0 +1,558 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Federated ChatGLM3 Tuning with Parameter-Efficient Methods in FATE-LLM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial, we will demonstrate how to efficiently train a federated ChatGLM3-6B with DeepSpeed using the FATE-LLM framework. In FATE-LLM, we introduce the \"pellm\" (Parameter Efficient Large Language Model) module, specifically designed for federated learning with large language models. It enables the implementation of parameter-efficient methods in federated learning, reducing communication overhead while maintaining model performance. In this tutorial we particularly focus on ChatGLM3-6B, and we will also emphasize the use of the Adapter mechanism for fine-tuning ChatGLM3-6B, which enables us to effectively reduce communication volume and improve overall efficiency.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## FATE-LLM: ChatGLM3-6B\n",
+    "\n",
+    "### ChatGLM3-6B\n",
+    "ChatGLM3-6B is a large transformer-based language model with 5.977 billion parameters; it is an open bilingual language model based on the General Language Model (GLM) architecture. You can download the pretrained model from [here](https://github.com/THUDM/ChatGLM3), or let the program automatically download it when you use it later.\n",
+    "\n",
+    "### Current Features\n",
+    "\n",
+    "In the current version, FATE-LLM: ChatGLM3-6B supports the following features:\n",
+    "\n",
+    "- Parameter-efficient fine-tuning of ChatGLM3-6B with adapters (e.g. LoRA, P-Tuning v2) through the pellm module\n",
+    "- DeepSpeed integration for efficient local training\n",
+    "- Federated aggregation of the trainable adapter weights only, which greatly reduces communication cost"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Experiment Setting"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before running the experiment, please make sure that the [FATE-LLM Cluster](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) has been deployed. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dataset: Advertising Text Generation\n",
+    "\n",
+    "This is an advertising text generation dataset. You can download the dataset from the following links and place it in the examples/data folder. \n",
+    "- [data link 1](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view)\n",
+    "- [data link 2](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) \n",
+    "\n",
+    "You can refer to the following link for more details about the [data](https://aclanthology.org/D19-1321.pdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "df = pd.read_json('${fate_install}/examples/data/AdvertiseGen/train.json', lines=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ChatGLM3-6B with Adapter\n",
+    "\n",
+    "In this section, we will guide you through the process of fine-tuning ChatGLM3-6B with adapters using the FATE-LLM framework. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The ChatGLM model is located in fate_llm/model_zoo/pellm/chatglm.py and can be used directly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "albert.py    bloom.py\tdistilbert.py  parameter_efficient_llm.py\n",
+      "bart.py      chatglm.py\tgpt2.py\t       qwen.py\n",
+      "bert.py      deberta.py\tllama.py       roberta.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls ../../../../fate_llm/python/fate_llm/model_zoo/pellm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Adapters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can directly use adapters from peft; see the [Adapter Methods](https://huggingface.co/docs/peft/index) page for more details. 
By specifying the adapter name and the adapter\n", + "config dict we can insert adapters into our language models:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from peft import LoraConfig, TaskType\n", + "\n", + "lora_config = LoraConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", + " target_modules=['query_key_value'],\n", + ")\n", + "lora_config.target_modules = list(lora_config.target_modules) # this line is needed to ensure lora_config is jsonable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Init ChatGLM3 Model " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader\n", + "\n", + "pretrained_model_path = \"fill with pretrained model download path please\"\n", + "\n", + "model = LLMModelLoader(\n", + " \"pellm.chatglm\",\n", + " \"ChatGLM\",\n", + " pretrained_path=pretrained_model_path,\n", + " peft_type=\"LoraConfig\",\n", + " peft_config=lora_config.to_dict(),\n", + " trust_remote_code=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**During the training process, all weights of the pretrained language model will be frozen, and weights of adapters are traininable. Thus, FATE-LLM only train in the local training and aggregate adapters' weights in the fedederation process**\n", + "\n", + "Now available adapters are [Adapters Overview](https://huggingface.co/docs/peft/index) for details.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specify Dataset And DataCollator To Process Data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from fate_client.pipeline.components.fate.nn.loader import LLMDatasetLoader, LLMDataFuncLoader\n", + "\n", + "tokenizer_params = dict(\n", + " tokenizer_name_or_path=pretrained_model_path,\n", + " trust_remote_code=True,\n", + ")\n", + "\n", + "dataset = LLMDatasetLoader(\n", + " \"prompt_dataset\",\n", + " \"PromptDataset\",\n", + " **tokenizer_params,\n", + ")\n", + "\n", + "data_collator = LLMDataFuncLoader(\n", + " \"data_collator.cust_data_collator\",\n", + " \"get_seq2seq_data_collator\",\n", + " **tokenizer_params,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Init DeepSpeed Config" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "ds_config = {\n", + " \"train_micro_batch_size_per_gpu\": 1,\n", + " \"optimizer\": {\n", + " \"type\": \"Adam\",\n", + " \"params\": {\n", + " \"lr\": 5e-4\n", + " }\n", + " },\n", + " \"fp16\": {\n", + " \"enabled\": True\n", + " },\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"zero_optimization\": {\n", + " \"stage\": 2,\n", + " \"allgather_partitions\": True,\n", + " \"allgather_bucket_size\": 1e8,\n", + " \"overlap_comm\": True,\n", + " \"reduce_scatter\": True,\n", + " \"reduce_bucket_size\": 1e8,\n", + " \"contiguous_gradients\": True,\n", + " \"offload_optimizer\": {\n", + " \"device\": \"cpu\"\n", + " },\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\"\n", + " }\n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit Federated Task\n", + "To run federated task, please make sure to ues fate>=2.1.0 and deploy it 
with gpu machines. To running this code, make sure training data path is already binded. The following code shoud be copy to a script and run in a command line like \"python federated_chatglm.py\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from fate_client.pipeline.components.fate.reader import Reader\n", + "from fate_client.pipeline import FateFlowPipeline\n", + "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_config_of_seq2seq_runner\n", + "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", + "from peft import LoraConfig, TaskType\n", + "\n", + "\n", + "guest = '10000'\n", + "host = '10000'\n", + "arbiter = '10000'\n", + "\n", + "epochs = 1\n", + "batch_size = 1\n", + "lr = 5e-4\n", + "\n", + "ds_config = {\n", + " \"train_micro_batch_size_per_gpu\": batch_size,\n", + " \"optimizer\": {\n", + " \"type\": \"Adam\",\n", + " \"params\": {\n", + " \"lr\": lr,\n", + " \"torch_adam\": True,\n", + " \"adam_w_mode\": False\n", + " }\n", + " },\n", + " \"fp16\": {\n", + " \"enabled\": True\n", + " },\n", + " \"gradient_accumulation_steps\": 1,\n", + " \"zero_optimization\": {\n", + " \"stage\": 2,\n", + " \"allgather_partitions\": True,\n", + " \"allgather_bucket_size\": 1e8,\n", + " \"overlap_comm\": True,\n", + " \"reduce_scatter\": True,\n", + " \"reduce_bucket_size\": 1e8,\n", + " \"contiguous_gradients\": True,\n", + " \"offload_optimizer\": {\n", + " \"device\": \"cpu\"\n", + " },\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\"\n", + " }\n", + " }\n", + "}\n", + "\n", + "pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", + "# pipeline.bind_local_path(path=\"\", namespace=\"experiment\", name=\"ad\")\n", + "time.sleep(5)\n", + "\n", + "\n", + "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", + "reader_0.guest.task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"ad\"\n", + ")\n", + "reader_0.hosts[0].task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"ad\"\n", + ")\n", + "\n", + "# define lora config\n", + "lora_config = LoraConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", + " target_modules=['query_key_value'],\n", + ")\n", + "lora_config.target_modules = list(lora_config.target_modules)\n", + "\n", + "pretrained_model_path = \"/data/cephfs/llm/models/chatglm3-6b\"\n", + "\n", + "model = LLMModelLoader(\n", + " \"pellm.chatglm\",\n", + " \"ChatGLM\",\n", + " pretrained_path=pretrained_model_path,\n", + " peft_type=\"LoraConfig\",\n", + " peft_config=lora_config.to_dict(),\n", + " trust_remote_code=True\n", + ")\n", + "\n", + "\n", + "tokenizer_params = dict(\n", + " tokenizer_name_or_path=pretrained_model_path,\n", + " trust_remote_code=True,\n", + ")\n", + "\n", + "dataset = LLMDatasetLoader(\n", + " \"prompt_dataset\",\n", + " \"PromptDataset\",\n", + " **tokenizer_params,\n", + ")\n", + "\n", + "data_collator = LLMDataFuncLoader(\n", + " \"data_collator.cust_data_collator\",\n", + " 
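# note: data_collator.cust_data_collator is assumed to be the collator module shipped with this example; the loaded get_seq2seq_data_collator is expected to wrap the tokenizer settings above into a seq2seq data collator\n",
+    "    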
\"get_seq2seq_data_collator\",\n", + " **tokenizer_params,\n", + ")\n", + "\n", + "conf = get_config_of_seq2seq_runner(\n", + " algo='fedavg',\n", + " model=model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=Seq2SeqTrainingArguments(\n", + " num_train_epochs=epochs,\n", + " per_device_train_batch_size=batch_size,\n", + " remove_unused_columns=False, \n", + " predict_with_generate=False,\n", + " deepspeed=ds_config,\n", + " learning_rate=lr,\n", + " use_cpu=False, # this must be set as we will gpu\n", + " fp16=True,\n", + " ),\n", + " fed_args=FedAVGArguments(),\n", + " task_type='causal_lm',\n", + " save_trainable_weights_only=True # only save trainable weights\n", + ")\n", + "\n", + "homo_nn_0 = HomoNN(\n", + " 'nn_0',\n", + " runner_conf=conf,\n", + " train_data=reader_0.outputs[\"output_data\"],\n", + " runner_module=\"homo_seq2seq_runner\",\n", + " runner_class=\"Seq2SeqRunner\",\n", + ")\n", + "\n", + "homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", + "homo_nn_0.hosts[0].conf.set(\"launcher_name\", \"deepspeed\") # tell schedule engine to run task with deepspeed\n", + "\n", + "pipeline.add_tasks([reader_0, homo_nn_0])\n", + "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 1})) # the number of gpus of each party\n", + "\n", + "pipeline.compile()\n", + "pipeline.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training With P-Tuning V2 Adapter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use another adapter lke P-Tuning V2, slightly changes is needed!" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "model = LLMModelLoader(\n", + " \"pellm.chatglm\",\n", + " \"ChatGLM\",\n", + " pretrained_path=pretrained_model_path,\n", + " pre_seq_len=128,\n", + " trust_remote_code=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Models trained with FATE-LLM can be find under the directory `${fate_install}/fateflow/model/$job_id/${role}/${party_id}/$cpn_name/0/output/output_model/model_directory/adapter_model.bin}`,\n", + "The following code is an example to load trained lora adapter weights:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "OSError", + "evalue": "Incorrect path_or_model_id: ''. 
Please provide either the path to a local folder or the repo_id of a model on the Hub.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mHFValidationError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/utils/hub.py:385\u001b[0m, in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m \u001b[38;5;66;03m# Load from URL or cache if already cached\u001b[39;00m\n\u001b[0;32m--> 385\u001b[0m resolved_file \u001b[38;5;241m=\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GatedRepoError \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File 
\u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/huggingface_hub/utils/_validators.py:110\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrepo_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m--> 110\u001b[0m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arg_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m arg_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/huggingface_hub/utils/_validators.py:164\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[0;34m(repo_id)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX\u001b[38;5;241m.\u001b[39mmatch(repo_id):\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[1;32m 165\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepo id must use alphanumeric chars or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m are\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m forbidden, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m cannot start or end the name, max length is 96:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 168\u001b[0m )\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id:\n", + "\u001b[0;31mHFValidationError\u001b[0m: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' 
cannot start or end the name, max length is 96: ''.", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 25\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m json\u001b[38;5;241m.\u001b[39mloads(_l\u001b[38;5;241m.\u001b[39mstrip())\n\u001b[1;32m 24\u001b[0m chatglm_model_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 25\u001b[0m model, tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchatglm_model_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m test_data_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{fate_install}\u001b[39;00m\u001b[38;5;124m/examples/data/AdvertiseGen/dev.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 28\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_data(test_data_path)\n", + "Cell \u001b[0;32mIn[21], line 9\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(pretrained_model_path)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m(pretrained_model_path):\n\u001b[0;32m----> 9\u001b[0m _tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m _model \u001b[38;5;241m=\u001b[39m AutoModel\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_path, trust_remote_code\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m _model \u001b[38;5;241m=\u001b[39m _model\u001b[38;5;241m.\u001b[39mhalf()\n", + "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:758\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 757\u001b[0m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n\u001b[0;32m--> 758\u001b[0m tokenizer_config \u001b[38;5;241m=\u001b[39m \u001b[43mget_tokenizer_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m tokenizer_config:\n\u001b[1;32m 760\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m tokenizer_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File 
\u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:590\u001b[0m, in \u001b[0;36mget_tokenizer_config\u001b[0;34m(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)\u001b[0m\n\u001b[1;32m 587\u001b[0m token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[1;32m 589\u001b[0m commit_hash \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 590\u001b[0m resolved_config_file \u001b[38;5;241m=\u001b[39m \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 592\u001b[0m \u001b[43m \u001b[49m\u001b[43mTOKENIZER_CONFIG_FILE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 593\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 594\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 595\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 596\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 598\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 599\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 600\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_missing_entries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 602\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_connection_errors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 603\u001b[0m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 604\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 606\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not locate the tokenizer configuration file, will try to use the model config instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/utils/hub.py:450\u001b[0m, in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, 
cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a specific connection error when trying to load \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HFValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 450\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 451\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncorrect path_or_model_id: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Please provide either the path to a local folder or the repo_id of a model on the Hub.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 452\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resolved_file\n", + "\u001b[0;31mOSError\u001b[0m: Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub." + ] + } + ], + "source": [ + "import json\n", + "import sys\n", + "import torch\n", + "from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model\n", + "from transformers import AutoModel, AutoTokenizer\n", + "\n", + "\n", + "def load_model(pretrained_model_path):\n", + " _tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", + " _model = AutoModel.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", + "\n", + " _model = _model.half()\n", + " _model = _model.eval()\n", + "\n", + " return _model, _tokenizer\n", + "\n", + "\n", + "def load_data(data_path):\n", + " with open(data_path, \"r\") as fin:\n", + " for _l in fin:\n", + " yield json.loads(_l.strip())\n", + "\n", + "\n", + "chatglm_model_path = \"\"\n", + "model, tokenizer = load_model(chatglm_model_path)\n", + "\n", + "test_data_path = \"{fate_install}/examples/data/AdvertiseGen/dev.json\"\n", + "dataset = load_data(test_data_path)\n", + "\n", + "peft_path = \"${fate_install}/fateflow/model/$job_id/${role}/${party_id}/$cpn_name/0/output/output_model/model_directory/adapter_model.bin}\"\n", + "peft_config = LoraConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", + " target_modules=['query_key_value'],\n", + ")\n", + "\n", + "model = get_peft_model(model, peft_config)\n", + "model.load_state_dict(torch.load(peft_path), strict=False)\n", + "model = model.half()\n", + "model.eval()\n", + "\n", + "for p in model.parameters():\n", + " if p.requires_grad:\n", + " print(p)\n", + "\n", + "model.cuda(\"cuda:0\")\n", + "\n", + "content = list(dataset)[0][\"content\"]\n", + "print(model.chat(tokenizer, content, do_sample=False))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/tutorial/parameter_efficient_llm/GPT2-example.ipynb b/doc/tutorial/parameter_efficient_llm/GPT2-example.ipynb deleted file mode 100644 index 09f3ee7..0000000 --- a/doc/tutorial/parameter_efficient_llm/GPT2-example.ipynb +++ /dev/null @@ -1,673 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated GPT-2 Tuning with Parameter Efficient methods in FATE-LLM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will demonstrate how to efficiently train federated large language models using the FATE-LLM framework. In FATE-LLM, we introduce the \"pellm\"(Parameter Efficient Large Language Model) module, specifically designed for federated learning with large language models. We enable the implementation of parameter-efficient methods in federated learning, reducing communication overhead while maintaining model performance. In this tutorial we particularlly focus on GPT-2, and we will also emphasize the use of the Adapter mechanism for fine-tuning GPT-2, which enables us to effectively reduce communication volume and improve overall efficiency.\n", - "\n", - "By following this tutorial, you will learn how to leverage the FATE-LLM framework to rapidly fine-tune federated large language models, such as GPT-2, with ease and efficiency." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GPT2\n", - "\n", - "GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset of 8 million web pages. GPT-2 is trained with a causal language modeling (CLM) objective, conditioning on a left-to-right context window of 1024 tokens. In this tutorial, we will use GPT2, you can download the pretrained model from [here](https://huggingface.co/gpt2) (We choose the smallest version for this tutorial), or let the program automatically download it when you use it later." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset: IMDB Sentimental\n", - "\n", - "In this section, we will introduce the process of preparing the IMDB dataset for use in our federated learning task. We use our tokenizer dataset(based on HuggingFace tokenizer) to preprocess the text data.\n", - "\n", - "About IMDB Sentimental Dataset:\n", - "\n", - "This is an binary classification dataset, you can download our processed dataset from here: \n", - "- https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate/examples/data/IMDB.csv\n", - "and place it in the examples/data folder. \n", - "\n", - "The orgin data is from: \n", - "- https://ai.stanford.edu/~amaas/data/sentiment/\n", - "\n", - "### Check Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "df = pd.read_csv('../../../examples/data/IMDB.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtextlabel
00One of the other reviewers has mentioned that ...1
11A wonderful little production. <br /><br />The...1
22I thought this was a wonderful way to spend ti...1
33Basically there's a family where a little boy ...0
44Petter Mattei's \"Love in the Time of Money\" is...1
............
19961996THE CELL (2000) Rating: 8/10<br /><br />The Ce...1
19971997This movie, despite its list of B, C, and D li...0
19981998I loved this movie! It was all I could do not ...1
19991999This was the worst movie I have ever seen Bill...0
20002000Stranded in Space (1972) MST3K version - a ver...0
\n", - "

2001 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " id text label\n", - "0 0 One of the other reviewers has mentioned that ... 1\n", - "1 1 A wonderful little production.

The... 1\n", - "2 2 I thought this was a wonderful way to spend ti... 1\n", - "3 3 Basically there's a family where a little boy ... 0\n", - "4 4 Petter Mattei's \"Love in the Time of Money\" is... 1\n", - "... ... ... ...\n", - "1996 1996 THE CELL (2000) Rating: 8/10

The Ce... 1\n", - "1997 1997 This movie, despite its list of B, C, and D li... 0\n", - "1998 1998 I loved this movie! It was all I could do not ... 1\n", - "1999 1999 This was the worst movie I have ever seen Bill... 0\n", - "2000 2000 Stranded in Space (1972) MST3K version - a ver... 0\n", - "\n", - "[2001 rows x 3 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.dataset.nlp_tokenizer import TokenizerDataset\n", - "\n", - "ds = TokenizerDataset(tokenizer_name_or_path=\"your model path\", text_max_length=128, \n", - " padding_side=\"left\", return_input_ids=False, \n", - " pad_token='<|endoftext|>') # load tokenizer config from local pretrained tokenizer\n", - "\n", - "ds.load('../../../examples/data/IMDB.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "({'input_ids': tensor([ 3198, 286, 262, 584, 30702, 468, 4750, 326, 706, 4964,\n", - " 655, 352, 18024, 4471, 345, 1183, 307, 23373, 13, 1119,\n", - " 389, 826, 11, 355, 428, 318, 3446, 644, 3022, 351,\n", - " 502, 29847, 1671, 1220, 6927, 1671, 11037, 464, 717, 1517,\n", - " 326, 7425, 502, 546, 18024, 373, 663, 24557, 290, 42880,\n", - " 8589, 278, 8188, 286, 3685, 11, 543, 900, 287, 826,\n", - " 422, 262, 1573, 10351, 13, 9870, 502, 11, 428, 318,\n", - " 407, 257, 905, 329, 262, 18107, 2612, 276, 393, 44295,\n", - " 13, 770, 905, 16194, 645, 25495, 351, 13957, 284, 5010,\n", - " 11, 1714, 393, 3685, 13, 6363, 318, 22823, 11, 287,\n", - " 262, 6833, 779, 286, 262, 1573, 29847, 1671, 1220, 6927,\n", - " 1671, 11037, 1026, 318, 1444, 440, 57, 355, 326, 318,\n", - " 262, 21814, 1813, 284, 262, 34374, 22246, 4765]),\n", - " 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1])},\n", - " array([1.], dtype=float32))" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more details of FATE-LLM dataset setting, we recommend that you read through these tutorials first: [NN Dataset Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-your-Dataset.ipynb), [Some Built-In Dataset](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Introduce-Built-In-Dataset.ipynb)," - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PELLM Model with Adapter\n", - "\n", - "In this section, we will guide you through the process of building a parameter-efficient language model using the FATE-LLM framework. We will focus on the implementation of the PELLM model and the integration of the Adapter mechanism, which enables efficient fine-tuning and reduces communication overhead in federated learning settings. 
Take GPT-2 as example you will learn how to leverage the FATE-LLM framework to rapidly develop and deploy a parameter-efficient language model using FATE-LLM built-in classes. Before starting this section, we recommend that you read through this tutorial first: [Model Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-Model.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PELLM Models\n", - "\n", - "In this section we introduce the PELLM model, which is a parameter-efficient language model that can be used in federated learning settings. They are designed to be compatible with the FATE-LLM framework to enable federated model tuning/training.\n", - "\n", - "PELLM models are located at federatedml.nn.model_zoo.pellm(federatedml/nn/model_zoo/pellm):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "albert.py bert.py deberta.py gpt2.py\t\t\t __pycache__\r\n", - "bart.py chatglm.py distilbert.py parameter_efficient_llm.py roberta.py\r\n" - ] - } - ], - "source": [ - "! ls ../../../fate/python/fate_llm/model_zoo/pellm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can initialize your GPT2 model by loading the pretrained model from the model folder, or downloading the pretrained model from the Huggingface,\n", - "here we initialize the GPT2 model with the Lora Adapter, we will introduce Adapters in the following sub" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Adapters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can directly use adapters from the peft. See details for adapters on this page [Adapter Methods](https://huggingface.co/docs/peft/index#supported-methods) for more details. 
By specifying the adapter name and the adapter\n", - "config dict we can insert adapters into our language models:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from peft import LoraConfig, TaskType\n", - "\n", - "# define lora config\n", - "lora_config = LoraConfig(\n", - " task_type=TaskType.SEQ_CLS,\n", - " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", - " target_modules=['c_attn'],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Init PELLM Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.model_zoo.pellm.gpt2 import GPT2\n", - "\n", - "# case 1 load pretrained weights from local pretrained weights, same as using the huggingface pretrained model\n", - "path_to_pretrained_folder = 'your model path'\n", - "gpt2 = GPT2(pretrained_path=path_to_pretrained_folder, \n", - " peft_type=\"LoraConfig\", peft_config=lora_config.to_dict(), \n", - " num_labels=1, pad_token_id=50256)\n", - "\n", - "# case 2 directly download models from huggingface\n", - "# gpt2 = GPT2(pretrained_path=\"gpt2\", \n", - "# peft_type=\"LoraConfig\", peft_config=lora_config, \n", - "# num_labels=1, pad_token_id=50256)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this version we currently support these language model for federated training:\n", - "- ChatGLM\n", - "- Bert\n", - "- ALBert\n", - "- RoBerta\n", - "- GPT-2\n", - "- Bart\n", - "- DeBerta\n", - "- DistillBert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**During the training process, all weights of the pretrained language model exclusive classifier head's weihgts will be frozen, and weights of adapters are traininable. Thus, FATE-LLM only train in the local training and aggregate adapters' weights and classifier head's weights(If has) in the fedederation process**\n", - "\n", - "Now available adapters are [Adapters Overview](https://huggingface.co/docs/peft/index) for details.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use PELLM Model in FATE with CustModel\n", - "\n", - "In this [Model Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-Model.ipynb) tutorial, we demonstrate how to employ the t.nn.CustomModel class in fate_torch to parse a model's structure and submit it to a federated learning task. The CustomModel automatically imports the model class from the model_zoo and initializes the models with the parameters provided. 
Since these language models are built-in, we can directly use them in the CustomModel and easily add a classifier head to address the classification task at hand:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch as t\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component.nn import save_to_fate_llm\n", - "fate_torch_hook(t)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "%%save_to_fate_llm model sigmoid.py\n", - "\n", - "import torch as t\n", - "\n", - "class Sigmoid(t.nn.Module):\n", - " \n", - " def __init__(self):\n", - " super().__init__()\n", - " self.sigmoid = t.nn.Sigmoid()\n", - " \n", - " def forward(self, x):\n", - " return self.sigmoid(x.logits)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# build CustModel with PELLM, and add a classifier head\n", - "from transformers import GPT2Config\n", - "\n", - "checkpoint_path = \"your model path\"\n", - "model = t.nn.Sequential(\n", - " t.nn.CustModel(module_name='pellm.gpt2', class_name='GPT2', \n", - " pretrained_path=checkpoint_path, \n", - " peft_config=lora_config.to_dict(), peft_type=\"LoraConfig\", \n", - " num_labels=1, pad_token_id=50256),\n", - " t.nn.CustModel(module_name='sigmoid', class_name='Sigmoid')\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Please note that during the training process, only trainable parameters will participate in the federated learning process." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Local Test\n", - "\n", - "Before submitting a federated learning task, we will demonstrate how to perform local testing to ensure the proper functionality of your custom dataset, model. We use the local mode of our FedAVGTrainer to test if our setting can run correctly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fate_llm.model_zoo.pellm.gpt2 import GPT2\n", - "from fate_llm.model_zoo.sigmoid import Sigmoid\n", - "from federatedml.nn.homo.trainer.fedavg_trainer import FedAVGTrainer\n", - "from transformers import GPT2Config\n", - "from fate_llm.dataset.nlp_tokenizer import TokenizerDataset\n", - "\n", - "# load dataset\n", - "ds = TokenizerDataset(tokenizer_name_or_path=\"your model path\", text_max_length=128, \n", - " padding_side=\"left\", return_input_ids=False, pad_token='<|endoftext|>') # you can load tokenizer config from local pretrained tokenizer\n", - "\n", - "ds.load('../../../examples/data/IMDB.csv')\n", - "\n", - "checkpoint_path = \"your model path\"\n", - "model = t.nn.Sequential(\n", - " GPT2(pretrained_path=checkpoint_path, peft_config=lora_config.to_dict(), peft_type=\"LoraConfig\", num_labels=1, pad_token_id=50256),\n", - " Sigmoid()\n", - ")\n", - "\n", - "trainer = FedAVGTrainer(epochs=1, batch_size=8, shuffle=True, data_loader_worker=8)\n", - "trainer.local_mode()\n", - "trainer.set_model(model)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "epoch is 0\n", - "100%|██████████| 251/251 [04:39<00:00, 1.11s/it]\n", - "epoch loss is 0.5148034488660345\n" - ] - } - ], - "source": [ - "opt = t.optim.Adam(model.parameters(), lr=0.001)\n", - "loss = t.nn.BCELoss()\n", - "# local test, here we only use CPU for training\n", - "trainer.train(ds, None, opt, loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submit Federated Task\n", - "Once you have successfully completed local testing, We can submit a task to FATE. Please notice that this tutorial is ran on a standalone version. **Please notice that in this tutorial we are using a standalone version, if you are using a cluster version, you need to bind the data with the corresponding name&namespace on each machine.**\n", - "\n", - "In this example we load pretrained weights for gpt2 model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch as t\n", - "import os\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.component.homo_nn import DatasetParam, TrainerParam\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader\n", - "from pipeline.interface import Data\n", - "from transformers import GPT2Config\n", - "\n", - "\n", - "fate_torch_hook(t)\n", - "\n", - "\n", - "fate_project_path = \"your model path\"\n", - "guest_0 = 9999\n", - "host_1 = 9999\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest_0).set_roles(guest=guest_0, host=host_1,\n", - " arbiter=guest_0)\n", - "data_0 = {\"name\": \"imdb\", \"namespace\": \"experiment\"}\n", - "data_path = fate_project_path + '/examples/data/IMDB.csv'\n", - "pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path)\n", - "pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path)\n", - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)\n", - "reader_0.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)\n", - "\n", - "reader_1 = Reader(name=\"reader_1\")\n", - "reader_1.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)\n", - "reader_1.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)\n", - "\n", - "\n", - "## Add your pretriained model path here, will load model&tokenizer from this path\n", - "\n", - "\n", - "## LoraConfig\n", - "from peft import LoraConfig, TaskType\n", - "lora_config = LoraConfig(\n", - " task_type=TaskType.SEQ_CLS,\n", - " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", - " target_modules=['c_attn']\n", - ")\n", - "\n", - "\n", - "model_path = 'your model path'\n", - "model = t.nn.Sequential(\n", - " t.nn.CustModel(module_name='pellm.gpt2', class_name='GPT2', pretrained_path=model_path,\n", - " peft_config=lora_config.to_dict(), peft_type=\"LoraConfig\", num_labels=1, pad_token_id=50256),\n", - " t.nn.CustModel(module_name='sigmoid', class_name='Sigmoid')\n", - ")\n", - "\n", - "# DatasetParam\n", - "dataset_param = DatasetParam(dataset_name='nlp_tokenizer',text_max_length=128, tokenizer_name_or_path=model_path, \n", - " padding_side=\"left\", return_input_ids=False, pad_token='<|endoftext|>')\n", - "# TrainerParam\n", - "trainer_param = TrainerParam(trainer_name='fedavg_trainer', epochs=1, batch_size=8,\n", - " data_loader_worker=8)\n", - "\n", - "\n", - "nn_component = HomoNN(name='nn_0', model=model)\n", - "\n", - "# set parameter for client 1\n", - "nn_component.get_party_instance(role='guest', party_id=guest_0).component_param(\n", - " loss=t.nn.BCELoss(),\n", - " optimizer = t.optim.Adam(lr=0.0001, eps=1e-8),\n", - " dataset=dataset_param, \n", - " trainer=trainer_param,\n", - " torch_seed=100 \n", - ")\n", - "\n", - "# set parameter for client 2\n", - "nn_component.get_party_instance(role='host', party_id=host_1).component_param(\n", - " loss=t.nn.BCELoss(),\n", - " optimizer = t.optim.Adam(lr=0.0001, eps=1e-8),\n", - " dataset=dataset_param, \n", - " trainer=trainer_param,\n", - " torch_seed=100 \n", - ")\n", - "\n", - "# set parameter for server\n", - "nn_component.get_party_instance(role='arbiter', party_id=guest_0).component_param( \n", - " trainer=trainer_param\n", - 
")\n", - "\n", - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", - "pipeline.compile()\n", - "\n", - "pipeline.fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training with CUDA\n", - "\n", - "You can use GPU by setting the cuda parameter of the FedAVGTrainer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer_param = TrainerParam(trainer_name='fedavg_trainer', epochs=1, batch_size=8, \n", - " data_loader_worker=8, cuda=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The cuda parameter here accepts an integer value that corresponds to the index of the GPU you want to use for training. \n", - "In the example above, the value is set to 0, which means that on every client the first available GPU in the system will be used. \n", - "If you have multiple GPUs and would like to use a specific one, simply change the value of the cuda parameter to the appropriate index." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 955e511fe526b6dd1ef764276c32992084374704 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Thu, 29 Feb 2024 21:19:08 +0800 Subject: [PATCH 26/35] update release note of fate-llm 2.0 Signed-off-by: mgqa34 --- RELEASE.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 043ed9f..f7e5bbe 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,11 @@ +## Release 2.0.0 +### Major Features and Improvements +* Adapt to fate-v2.0 framework: + * Migrate parameter-efficient fine-tuning training methods and models. 
+  * Migrate Standard Offsite-Tuning and Extended Offsite-Tuning (Federated Offsite-Tuning+)
+  * New trainer, dataset, and data_processing function designs
+* New FedKSeed Federated Tuning Algorithm: train large language models in a federated learning setting with extremely low communication cost
+
 ## Release 1.3.0
 ### Major Features and Improvements
 * FTL-LLM(Fedrated Learning + Transfer Learning + LLM)

From 5397005b3b7858d4cb736b9a8e8375da37f9df61 Mon Sep 17 00:00:00 2001
From: sagewe
Date: Fri, 1 Mar 2024 11:34:02 +0800
Subject: [PATCH 27/35] chore: add licence (#47)

Signed-off-by: sagewe
---
 python/fate_llm/fedkseed/README.md    | 12 ++++++++
 python/fate_llm/fedkseed/optimizer.py | 44 +++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 python/fate_llm/fedkseed/README.md

diff --git a/python/fate_llm/fedkseed/README.md b/python/fate_llm/fedkseed/README.md
new file mode 100644
index 0000000..950e7fd
--- /dev/null
+++ b/python/fate_llm/fedkseed/README.md
@@ -0,0 +1,12 @@
+## FedKSeed
+
+The algorithm is based on the paper [Federated Full-Parameter Tuning of Billion-Sized Language Models
+with Communication Cost under 18 Kilobytes](https://arxiv.org/pdf/2312.06353.pdf), and the code is adapted
+from https://github.com/alibaba/FederatedScope/tree/FedKSeed.
+We refactor the code to make it more compatible with the transformers/PyTorch framework
+and integrate it into the FATE-LLM framework.
+
+The main works include:
+1. A KSeedZerothOrderOptimizer class that can be used to optimize a model along given directions generated from random seeds.
+2. A KSeedZOExtendedTrainer, a subclass of the transformers Trainer, that can be used to train large language models with KSeedZerothOrderOptimizer.
+3. Trainers for federated learning with large language models.
\ No newline at end of file
diff --git a/python/fate_llm/fedkseed/optimizer.py b/python/fate_llm/fedkseed/optimizer.py
index bad789f..dd53eab 100644
--- a/python/fate_llm/fedkseed/optimizer.py
+++ b/python/fate_llm/fedkseed/optimizer.py
@@ -1,3 +1,47 @@
+"""
+The implementations of ZerothOrderOptimizer and KSeedZerothOrderOptimizer are
+adapted from https://github.com/princeton-nlp/MeZO (MIT License) and
+https://github.com/alibaba/FederatedScope/tree/FedKSeed (Apache License 2.0)
+
+Copyright (c) 2021 Princeton Natural Language Processing
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+---
+#
+# Copyright 2023 The FederatedScope Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" + + import math from typing import Mapping, Optional, Callable, Tuple, List From 646c6ac97f77de59f0815d8b207bcd1eb5576d35 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Fri, 1 Mar 2024 16:05:08 +0800 Subject: [PATCH 28/35] update doc of ChatGLM3 Signed-off-by: mgqa34 --- .../ChatGLM3-6B_ds.ipynb | 33 ++----------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb b/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb index 2b816b5..dcdad0a 100644 --- a/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb +++ b/doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb @@ -451,31 +451,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "OSError", - "evalue": "Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mHFValidationError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/utils/hub.py:385\u001b[0m, in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m \u001b[38;5;66;03m# Load from URL or cache if already cached\u001b[39;00m\n\u001b[0;32m--> 385\u001b[0m resolved_file \u001b[38;5;241m=\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GatedRepoError \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/huggingface_hub/utils/_validators.py:110\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrepo_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m--> 110\u001b[0m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arg_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m arg_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/huggingface_hub/utils/_validators.py:164\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[0;34m(repo_id)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX\u001b[38;5;241m.\u001b[39mmatch(repo_id):\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[1;32m 165\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepo id must use alphanumeric chars or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m--\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m are\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m forbidden, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m cannot start or end the name, max length is 96:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 168\u001b[0m )\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m repo_id:\n", - "\u001b[0;31mHFValidationError\u001b[0m: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: ''.", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[21], line 25\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m json\u001b[38;5;241m.\u001b[39mloads(_l\u001b[38;5;241m.\u001b[39mstrip())\n\u001b[1;32m 24\u001b[0m chatglm_model_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 25\u001b[0m model, tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchatglm_model_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m test_data_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{fate_install}\u001b[39;00m\u001b[38;5;124m/examples/data/AdvertiseGen/dev.json\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 28\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_data(test_data_path)\n", - "Cell \u001b[0;32mIn[21], line 9\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(pretrained_model_path)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m(pretrained_model_path):\n\u001b[0;32m----> 9\u001b[0m _tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m _model \u001b[38;5;241m=\u001b[39m AutoModel\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_path, trust_remote_code\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m _model \u001b[38;5;241m=\u001b[39m _model\u001b[38;5;241m.\u001b[39mhalf()\n", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:758\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[0;34m(cls, 
pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 757\u001b[0m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n\u001b[0;32m--> 758\u001b[0m tokenizer_config \u001b[38;5;241m=\u001b[39m \u001b[43mget_tokenizer_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m tokenizer_config:\n\u001b[1;32m 760\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m tokenizer_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:590\u001b[0m, in \u001b[0;36mget_tokenizer_config\u001b[0;34m(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)\u001b[0m\n\u001b[1;32m 587\u001b[0m token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[1;32m 589\u001b[0m commit_hash \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 590\u001b[0m resolved_config_file \u001b[38;5;241m=\u001b[39m \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 592\u001b[0m \u001b[43m \u001b[49m\u001b[43mTOKENIZER_CONFIG_FILE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 593\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 594\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 595\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 596\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 597\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 598\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 599\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 600\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_missing_entries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 602\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_connection_errors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 603\u001b[0m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 604\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 606\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not locate the tokenizer configuration file, will try to use the model config instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/data/projects/fate/common/python/venv/lib/python3.8/site-packages/transformers/utils/hub.py:450\u001b[0m, in \u001b[0;36mcached_file\u001b[0;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a specific connection error when trying to load \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HFValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m--> 450\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 451\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncorrect path_or_model_id: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Please provide either the path to a local folder or the repo_id of a model on the Hub.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 452\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resolved_file\n", - "\u001b[0;31mOSError\u001b[0m: Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub." 
- ] - } - ], + "outputs": [], "source": [ "import json\n", "import sys\n", @@ -525,13 +503,8 @@ "model.cuda(\"cuda:0\")\n", "\n", "content = list(dataset)[0][\"content\"]\n", - "print(model.chat(tokenizer, content, do_sample=False))\n" + "print(model.chat(tokenizer, content, do_sample=False))" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { From d01a77a3a2bf57db54673502cb64c42265bcc108 Mon Sep 17 00:00:00 2001 From: cwj Date: Fri, 1 Mar 2024 16:58:51 +0800 Subject: [PATCH 29/35] Update ot doc Signed-off-by: weijingchen Signed-off-by: cwj --- README.md | 5 +- .../Offsite_tuning_tutorial.ipynb | 961 +++++++++++------- 2 files changed, 611 insertions(+), 355 deletions(-) diff --git a/README.md b/README.md index 7c0410f..086a4c8 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,6 @@ Use [FATE-LLM deployment packages](https://github.com/FederatedAI/FATE/wiki/Down ## Quick Start - [Offsite-tuning Tutorial: Model Definition and Job Submission](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) -- [FedIPR Tutorial: Add Watermarks to Your Model](./doc/tutorial/fed_ipr/FedIPR-tutorial.ipynb) - [Federated ChatGLM-6B Training](./doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb) -- [GPT-2 Training](./doc/tutorial/parameter_efficient_llm/GPT2-example.ipynb) -- [Builtin Models In PELLM](./doc/tutorial/builtin_models.md) \ No newline at end of file +- [Builtin Models In PELLM](./doc/tutorial/builtin_models.md) +- [Offsite Tuning Tutorial](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) \ No newline at end of file diff --git a/doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb b/doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb index b8b90ea..f522b08 100644 --- a/doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb +++ b/doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb @@ -13,7 +13,7 @@ "id": "9f1d728c-09e1-418e-8d80-53dd0ec467b1", "metadata": {}, "source": [ - "In this tutorial, we'll focus on how to leverage Offsite-Tuning framework in FATE to fine-tune your LLM. You'll learn how to:\n", + "In this tutorial, we'll focus on how to leverage Offsite-Tuning framework in FATE-LLM-2.0 to fine-tune your LLM. You'll learn how to:\n", "\n", "1. Define models, including main models(which are at server side and will offer adapters and emulators) and submodel(which are at client side and will load adapters and emulators for local fine-tuning) compatible with Offsite-Tuning framework.\n", "2. Get hands-on experience with the Offsite-Tuning trainer.\n", @@ -31,12 +31,7 @@ "\n", "Offsite-Tuning addresses the challenge of unequal distribution of computational power and data. It allows thLLMel owner to enhance the model's capabilities without direct access to private data, while also enabling data owners who may not have the resources to train a full-scale model to fine-tune a portion of it using less computational power. This mutually beneficial arrangement accommodates both parties involve.\n", "\n", - "Beyond the standard two-party setup involving the model owner and the data ownin FATE-LLM, er, Offsite-Tunframework ing is also extendable to scenarios with multiple data owners. 
FATE supports multi-party Offsite-Tuning, allowing multiple data owners to fine-tune and aggregate their Adapters locally, further enhancing the flexibility and applicability of this framewrFor more details of Offsite-tuning, please refer to the [original paper](https://arxiv.org/pdf/2302.04870.pdf).\n", - "\n", - "\n", - "\n", - "\n", - "\n" + "Beyond the standard two-party setup involving the model owner and the data ownin FATE-LLM, er, Offsite-Tunframework ing is also extendable to scenarios with multiple data owners. FATE supports multi-party Offsite-Tuning, allowing multiple data owners to fine-tune and aggregate their Adapters locally, further enhancing the flexibility and applicability of this framewrFor more details of Offsite-tuning, please refer to the [original paper](https://arxiv.org/pdf/2302.04870.pdf).\n" ] }, { @@ -46,13 +41,14 @@ "source": [ "## Preliminary\n", "\n", - "We strongly recommend you finish reading our NN tutorial to get familiar with Model and Dataset customizations: [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/README.md)\n", - "You can add python path so that you can run codes in the notebook." + "We strongly recommend you finish reading our NN tutorial to get familiar with Model and Dataset customizations: [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/2.0/fate/components/pipeline_nn_cutomization_tutorial.md)\n", + "\n", + "In this tutorial, we assume that you have deploy the codes of FATE(including fateflow & fate-client) & FATE-LLM-2.0. You can add python path so that you can run codes in the notebook." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "f33516e8-0d28-4c97-bc38-ba28d60acf37", "metadata": {}, "outputs": [], @@ -62,6 +58,14 @@ "sys.path.append(your_path_to_fate_python)" ] }, + { + "cell_type": "markdown", + "id": "2f2fc794", + "metadata": {}, + "source": [ + "If you install FATE & FATE-LLM-2.0 via pip, you can directly use the following codes." + ] + }, { "cell_type": "markdown", "id": "7309281b-5956-4158-9256-d6db230e086d", @@ -186,11 +190,7 @@ "source": [ "### Share additional parameters with clients\n", "\n", - "Additionally, beyond the weights of emulators and adapters, you may also want to share other model parameters, such as embedding weights, with your client partners. To achieve this, you'll need to implement two more interfaces: get_additional_param_state_dict and load_additional_param_state_dict for both the Main and Sub Models.\n", - "\n", - "### Special Attention for Large Objects\n", - "\n", - "Please note that special attention is required when you need to share large objects, any object potentially exceeding 2GB, such as embedding weights. You should slice these large objects to manage them more efficiently. Below is a code snippet demonstrating this practice, taken directly from FATE's native GPT-2 implementation:" + "Additionally, beyond the weights of emulators and adapters, you may also want to share other model parameters, such as embedding weights, with your client partners. To achieve this, you'll need to implement two more interfaces: get_additional_param_state_dict and load_additional_param_state_dict for both the Main and Sub Models." ] }, { @@ -263,7 +263,7 @@ "\n", "### Prepare QA Dataset - Sciq\n", "\n", - "In this example, we use sciq dataset. You can use tools provided in our qa_dataset.py to tokenize the sciq dataset and save the tokenized result. " + "In this example, we use sciq dataset. 
You can use tools provided in our qa_dataset.py to tokenize the sciq dataset and save the tokenized result. **Remember to modify the save_path to your own path.** For the sake of simplicity, in this tutorial, for every party we only use this dataset to train the model." ] }, { @@ -276,7 +276,7 @@ "from fate_llm.dataset.qa_dataset import tokenize_qa_dataset\n", "from transformers import AutoTokenizer\n", "tokenizer_name_or_path = 'gpt2'\n", - "tokenizer = AutoTokenizer.from_pretrained(gpt2_path)\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)\n", "\n", "if 'llama' in tokenizer_name_or_path:\n", " tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, unk_token=\"\", bos_token=\"\", eos_token=\"\", add_eos_token=True) \n", @@ -288,8 +288,8 @@ "\n", "import os\n", "# bind data path to name & namespace\n", - "fate_project_path = os.path.abspath('../../../')\n", - "rs = tokenize_qa_dataset('sciq', tokenizer, fate_project_path + '/sciq/', seq_max_len=600) # we save the cache dataset to the fate root folder" + "save_path = 'xxxx/sciq'\n", + "rs = tokenize_qa_dataset('sciq', tokenizer, save_path, seq_max_len=600) # we save the cache dataset to the fate root folder" ] }, { @@ -310,7 +310,7 @@ "from fate_llm.dataset.qa_dataset import QaDataset\n", "\n", "ds = QaDataset(tokenizer_name_or_path=tokenizer_name_or_path)\n", - "ds.load(fate_project_path + '/sciq/')" + "ds.load(save_path)" ] }, { @@ -340,86 +340,406 @@ "source": [ "## Submit a Task\n", "\n", - "Now the model and the dataset is prepared! We can submit a training task. \n", - "After we submit the task below, the following process will occur: The server and client each initialize their respective models. The server extracts shared parameters and sends them to the client. The client then loads these parameters and conducts training on a miniaturized GPT-2 model composed of an emulator and adaptesr onSciqP We speicify the OffsiteTuningTrainer via TrainerParam. If you are not familiar with trainer configuration, please refer to [FATE-NN Tutorial](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/README.md).\n", + "Now the model and the dataset is prepared! We can submit a training task. In the FATE-2.0, you can define your pipeline in a much easier manner.\n", + "\n", + "After we submit the task below, the following process will occur: The server and client each initialize their respective models. The server extracts shared parameters and sends them to the client. The client then loads these parameters and conducts training on a miniaturized GPT-2 model composed of an emulator and adapter on SciqP \n", + "\n", + "If you are not familiar with trainer configuration, please refer to [NN Tutorials](https://github.com/FederatedAI/FATE/blob/master/doc/2.0/fate/components/pipeline_nn_cutomization_tutorial.md).\n", + "\n", " Upon completion of the training, the client sends the adapter parameters back to the server. Since we are directly using Hugging Face's LMHeadGPT2, there's no need to supply a loss function. Simply inputting the preprocessed data and labels into the model will calculate the correct loss and proceed with gradient descent\n", "\n", - "One thing to pay special attention to is that Offsite-Tuning differs from FedAvg within FATE. In Offsite-Tuning, the server (the arbiter role) needs to initialize the model. Therefore, please refer to the example below and set the 'nn_component' parameters separately for the client and the server. 
Also, don't forget to add the 'server_init=True' parameter to the server; otherwise, the arbiter side will not initialize the model.\n", + "One thing to pay special attention to is that Offsite-Tuning differs from FedAvg within FATE. In Offsite-Tuning, the server (the arbiter role) needs to initialize the model. Therefore, please refer to the example below and set the runner conf separately for the client and the server.\n", + "\n", + "To make this a quick demo, we only select 100 samples from the original qa dataset, see 'select_num=100' in the LLMDatasetLoader." + ] + }, + { + "cell_type": "markdown", + "id": "261dfb43", + "metadata": {}, + "source": [ + "### Bind Dataset Path with Name & Namespace\n", "\n", - "To make this a quick demo, we only select 100 samples from the origin qa datset, see 'select_num=100' in the DatasetParam." + "Please execute the following code to bind the dataset path with name & namespace. Remember to modify the path to your own dataset save path." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "8dc1e82b", + "metadata": {}, + "outputs": [], + "source": [ + "! flow table bind --namespace experiment --name sciq --path YOUR_SAVE_PATH" + ] + }, + { + "cell_type": "markdown", + "id": "0e8c5ff4", + "metadata": {}, + "source": [ + "### Pipeline codes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "c9113d10-c3e7-4875-9502-ce46aa0b86b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import torch as t\n", - "from torch import nn\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader, Evaluation, DataTransform\n", - "from pipeline.interface import Data, Model\n", - "\n", - "t = fate_torch_hook(t)\n", - "\n", - "import os\n", - "# bind data path to name & namespace\n", - "fate_project_path = os.path.abspath('../../../')\n", - "guest = 9997\n", - "arbiter = 9997\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, arbiter=arbiter)\n", - "\n", - "# bind data path with name & namespace\n", - "data_0 = {\"name\": \"sciq\", \"namespace\": \"experiment\"}\n", - "data_path_0 = fate_project_path + '/sciq/'\n", - "pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path_0)\n", - "\n", - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=data_0)\n", - "\n", - "gpt2_type = 'gpt2'\n", - "\n", - "from pipeline.component.nn import DatasetParam\n", - "dataset_param = DatasetParam(dataset_name='qa_dataset', tokenizer_name_or_path=gpt2_type, select_num=100)\n", - "\n", - "from pipeline.component.homo_nn import TrainerParam # Interface\n", - "sub_model_client = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadSubModel', model_name_or_path=gpt2_type \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)\n", - "main_model_server = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadMainModel', model_name_or_path=gpt2_type \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)\n", - "\n", - "nn_component = HomoNN(name='nn_0')\n", - "\n", - "nn_component.get_party_instance(role='guest', 
party_id=guest).component_param(model=sub_model_client, dataset=dataset_param, # dataset\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=3, batch_size=4, collate_fn='DataCollatorForTokenClassification', task_type='causal_ml', \\\n", - " save_to_local_dir=True, cuda=0),\n", - " optimizer=t.optim.Adam(lr=5e-5)\n", - " )\n", - "nn_component.get_party_instance(role='arbiter', party_id=arbiter).component_param(model=main_model_server, \n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', collate_fn='DataCollatorForTokenClassification', save_to_local_dir=True),\n", - " # Attention here\n", - " server_init=True # This parameter must be set True !!!!!!!!!!!\n", - " )\n", - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", + "import time\n", + "from fate_client.pipeline.components.fate.reader import Reader\n", + "from fate_client.pipeline import FateFlowPipeline\n", + "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", + "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", + "from fate_client.pipeline.components.fate.nn.torch.base import Sequential\n", + "from fate_client.pipeline.components.fate.nn.torch import nn\n", + "\n", + "\n", + "guest = '10000'\n", + "host = '10000'\n", + "arbiter = '10000'\n", + "\n", + "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", + "\n", + "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", + "reader_0.guest.task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"sciq\"\n", + ")\n", + "\n", + "client_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=4,\n", + " adapter_top_layer_num=1,\n", + " adapter_bottom_layer_num=1\n", + ")\n", + "\n", + "server_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=4,\n", + " adapter_top_layer_num=1,\n", + " adapter_bottom_layer_num=1 \n", + ")\n", + "\n", + "train_args = Seq2SeqTrainingArguments(\n", + " per_device_train_batch_size=1,\n", + " learning_rate=5e-5,\n", + " disable_tqdm=False,\n", + " num_train_epochs=1,\n", + " logging_steps=10,\n", + " logging_strategy='steps',\n", + " use_cpu=False\n", + ")\n", + "\n", + "dataset = LLMDatasetLoader(\n", + " module_name='qa_dataset', item_name='QaDataset',\n", + " tokenizer_name_or_path='gpt2',\n", + " select_num=100\n", + ")\n", + "\n", + "data_collator = LLMDataFuncLoader(module_name='data_collator.cust_data_collator', item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2')\n", + "\n", + "client_conf = get_conf_of_ot_runner(\n", + " model=client_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " aggregate_model=False\n", + ")\n", + "\n", + "server_conf = get_conf_of_ot_runner(\n", + " model=server_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " aggregate_model=False\n", + ")\n", + "\n", + "homo_nn_0 = HomoNN(\n", + " 'nn_0',\n", + " 
train_data=reader_0.outputs[\"output_data\"],\n", + " runner_module=\"offsite_tuning_runner\",\n", + " runner_class=\"OTRunner\"\n", + ")\n", + "\n", + "homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", + "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", + "pipeline.add_tasks([reader_0, homo_nn_0])\n", "pipeline.compile()" ] }, + { + "cell_type": "markdown", + "id": "e97c2823", + "metadata": {}, + "source": [ + "You can try to initialize your models, datasets to check if they can be loaded correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "872817e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT2LMHeadSubModel(\n", + " (model): GPT2LMHeadModel(\n", + " (transformer): GPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (1): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (2): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (3): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (4): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): 
Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (5): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + " )\n", + " (emulator): ModuleList(\n", + " (0): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (1): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (2): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (3): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (adapter_bottom): ModuleList(\n", + " (0): GPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): GPT2Attention(\n", + " (c_attn): Conv1D()\n", + " (c_proj): Conv1D()\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): 
Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (adapter_top): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", ")\n", "**********\n", "\n", "**********\n", "DataCollatorForSeq2Seq(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n", "\t50256: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n", "}, model=None, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')\n" ] } ], "source": [ "print(client_model())\n", "print('*' * 10)\n", "print(dataset())\n", "print('*' * 10)\n", "print(data_collator())" ] }, { "cell_type": "markdown", "id": "898c3491", "metadata": {}, "source": [ "Seems that everything is ready! Now we can run the code below to submit the task." ] }, { "cell_type": "code", "execution_count": 2, @@ -437,7 +757,7 @@ "source": [ "## Add Deepspeed Setting\n", "\n", - "By simply adding a ds_config, we can run our task with a deepspeed backend:" + "By simply adding a ds_config, we can run our task with a deepspeed backend. If you have deployed an eggroll environment, you can submit the task with deepspeed to eggroll to accelerate your training."
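Before the full pipeline below, here is a condensed sketch of the single change this section is about: the DeepSpeed config is a plain Python dict handed to `Seq2SeqTrainingArguments` through its `deepspeed` argument. The import path and argument names mirror the pipeline code that follows; the config values here are illustrative placeholders rather than recommended settings.

```python
# Minimal sketch (not the full pipeline below): attach a DeepSpeed ZeRO config
# to the Seq2Seq training arguments used throughout this tutorial.
from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments

ds_config = {
    "train_micro_batch_size_per_gpu": 1,                    # per-GPU micro batch size
    "optimizer": {"type": "Adam", "params": {"lr": 5e-5}},  # optimizer is handled by DeepSpeed
    "fp16": {"enabled": True},                              # mixed-precision training
    "zero_optimization": {"stage": 2},                      # ZeRO stage-2 partitioning
}

train_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    num_train_epochs=1,
    deepspeed=ds_config,  # the only extra argument needed to enable the DeepSpeed backend
    fp16=True,
)
```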
] }, { @@ -458,98 +778,136 @@ } ], "source": [ - "import torch as t\n", - "from torch import nn\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader, Evaluation, DataTransform\n", - "from pipeline.interface import Data, Model\n", - "\n", - "t = fate_torch_hook(t)\n", - "\n", - "import os\n", - "# bind data path to name & namespace\n", - "fate_project_path = os.path.abspath('../../../')\n", - "guest = 9997\n", - "arbiter = 9997\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, arbiter=arbiter)\n", - "\n", - "# bind data path with name & namespace\n", - "data_0 = {\"name\": \"sciq\", \"namespace\": \"experiment\"}\n", - "data_path_0 = fate_project_path + '/sciq/'\n", - "pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path_0)\n", - "\n", - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=data_0)\n", - "\n", - "# deepspeed config\n", + "import time\n", + "from fate_client.pipeline.components.fate.reader import Reader\n", + "from fate_client.pipeline import FateFlowPipeline\n", + "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", + "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader\n", + "from peft import LoraConfig, TaskType\n", + "from transformers.modeling_utils import unwrap_model\n", + "\n", + "\n", + "guest = '10000'\n", + "host = '10000'\n", + "arbiter = '10000'\n", + "\n", + "# pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", + "pipeline = FateFlowPipeline().set_parties(guest=guest, arbiter=arbiter)\n", + "\n", + "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest))\n", + "reader_0.guest.task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"sciq\"\n", + ")\n", + "\n", + "client_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=18,\n", + " adapter_top_layer_num=2,\n", + " adapter_bottom_layer_num=2\n", + ")\n", + "\n", + "server_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=18,\n", + " adapter_top_layer_num=2,\n", + " adapter_bottom_layer_num=2 \n", + ")\n", + "\n", + "dataset = LLMDatasetLoader(\n", + " module_name='qa_dataset', item_name='QaDataset',\n", + " tokenizer_name_or_path='gpt2',\n", + " select_num=100\n", + ")\n", + "\n", + "data_collator = LLMDataFuncLoader(module_name='data_collator.cust_data_collator', item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2')\n", + "\n", + "batch_size = 1\n", + "lr = 5e-5\n", "ds_config = {\n", - " \"train_micro_batch_size_per_gpu\": 2,\n", - " \"gradient_accumulation_steps\": 2,\n", + " \"train_micro_batch_size_per_gpu\": batch_size,\n", " \"optimizer\": {\n", - " \"type\": \"AdamW\",\n", + " \"type\": \"Adam\",\n", " \"params\": {\n", - " \"lr\": 5e-5\n", + " \"lr\": lr,\n", + " \"torch_adam\": True,\n", + " \"adam_w_mode\": False\n", " }\n", - " }\n", - " ,\n", + " },\n", " \"fp16\": {\n", - " \"enabled\": False\n", - " }\n", - " ,\n", + " 
\"enabled\": True\n", + " },\n", + " \"gradient_accumulation_steps\": 1,\n", " \"zero_optimization\": {\n", - " \"stage\": 1,\n", + " \"stage\": 2,\n", + " \"allgather_partitions\": True,\n", + " \"allgather_bucket_size\": 1e8,\n", + " \"overlap_comm\": True,\n", + " \"reduce_scatter\": True,\n", + " \"reduce_bucket_size\": 1e8,\n", + " \"contiguous_gradients\": True,\n", " \"offload_optimizer\": {\n", " \"device\": \"cpu\"\n", " },\n", - " \"contiguous_gradients\": True,\n", - " \"overlap_comm\": True\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\"\n", + " }\n", " }\n", "}\n", "\n", - "gpt2_type = 'gpt2'\n", - "\n", - "from pipeline.component.nn import DatasetParam\n", - "dataset_param = DatasetParam(dataset_name='qa_dataset', tokenizer_name_or_path=gpt2_type, select_num=100)\n", - "\n", - "from pipeline.component.homo_nn import TrainerParam # Interface\n", - "sub_model_client = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadSubModel', model_name_or_path=gpt2_type \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)\n", - "main_model_server = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadMainModel', model_name_or_path=gpt2_type \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)\n", - "\n", - "nn_component = HomoNN(name='nn_0')\n", - "\n", - "nn_component.get_party_instance(role='guest', party_id=guest).component_param(model=sub_model_client, dataset=dataset_param, # dataset\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=3, batch_size=4, collate_fn='DataCollatorForTokenClassification', task_type='causal_ml', \\\n", - " save_to_local_dir=True),\n", - " optimizer=t.optim.Adam(lr=5e-5)\n", - " )\n", - "nn_component.get_party_instance(role='arbiter', party_id=arbiter).component_param(model=main_model_server, \n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', collate_fn='DataCollatorForTokenClassification', save_to_local_dir=True),\n", - " # Attention here\n", - " server_init=True # This parameter must be set True !!!!!!!!!!!\n", - " )\n", - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", - "pipeline.compile()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "23320cb9-d06a-44ac-8966-398b0f7bbaae", - "metadata": {}, - "outputs": [], - "source": [ - "from pipeline.runtime.entity import JobParameters\n", - "pipeline.fit(JobParameters(task_conf={\n", - " \"nn_0\": {\n", - " \"launcher\": \"deepspeed\",\n", - " \"world_size\": 4\n", - " }\n", - "}))" + "train_args = Seq2SeqTrainingArguments(\n", + " per_device_train_batch_size=1,\n", + " learning_rate=5e-5,\n", + " disable_tqdm=False,\n", + " num_train_epochs=1,\n", + " logging_steps=10,\n", + " logging_strategy='steps',\n", + " dataloader_num_workers=4,\n", + " use_cpu=False,\n", + " deepspeed=ds_config, # Add deepspeed config here\n", + " remove_unused_columns=False,\n", + " fp16=True\n", + ")\n", + "\n", + "client_conf = get_conf_of_ot_runner(\n", + " model=client_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " aggregate_model=False,\n", + ")\n", + "\n", + "server_conf = get_conf_of_ot_runner(\n", + " model=server_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " 
aggregate_model=False\n", + ")\n", + "\n", + "\n", + "homo_nn_0 = HomoNN(\n", + " 'nn_0',\n", + " train_data=reader_0.outputs[\"output_data\"],\n", + " runner_module=\"offsite_tuning_runner\",\n", + " runner_class=\"OTRunner\"\n", + ")\n", + "\n", + "homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", + "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", + "\n", + "# if you have deployed eggroll, you can add this line to submit your job to eggroll\n", + "homo_nn_0.guest.conf.set(\"launcher_name\", \"deepspeed\")\n", + "\n", + "pipeline.add_tasks([reader_0, homo_nn_0])\n", + "pipeline.conf.set(\"task\", dict(engine_run={\"cores\": 4}))\n", + "pipeline.compile()\n", + "pipeline.fit()\n" ] }, { @@ -560,11 +918,7 @@ "## Offsite-tuning + Multi Client Federation\n", "\n", "\n", - "The Offsite-Tuning + FedAVG federation is configured based on the standard Offsite-Tuning. The setup is a bit more complex, but we will walk you through it step by step. The pipeline code below contains detailed comments. When reading, please pay attention to the following points:\n", - "\n", - "1. In a multi-party scenario, please fill in different party_ids based on your deployment.\n", - "2. The operation to bind the data path with the name & namespace needs to be run on the machines of all parties. For convenience, we've placed the code in one location.\n", - "3. When configuring Trainer parameters, make sure to add the 'need_aggregate=True' parameter to the OffsiteTuningTrainer for each client and server. So adapters will be aggregated during training." + "The Offsite-Tuning + FedAVG federation is configured based on the standard Offsite-Tuning. In this situation, you need to add data input & configurations for all clients. And do remember to add 'aggregate_model=True' for client & server conf so that model federation will be conducted during the training." 
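As a quick orientation before the full multi-client example, here is a condensed sketch of the two changes described above: every client party gets its own data input, and `aggregate_model=True` is set in both the client and the server runner conf. The party ids, loader arguments and training arguments below are placeholders that simply mirror the complete pipeline shown in the cells that follow.

```python
# Condensed sketch of the multi-client deltas; see the complete pipeline below.
from fate_client.pipeline import FateFlowPipeline
from fate_client.pipeline.components.fate.reader import Reader
from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner
from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments
from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMDataFuncLoader

guest, host, arbiter = '9999', '10000', '9999'  # placeholder party ids
pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)

# 1. every client party needs its own bound data table
reader_0 = Reader("reader_0", runtime_parties=dict(guest=guest, host=host))
reader_0.guest.task_parameters(namespace="experiment", name="sciq")
reader_0.hosts[0].task_parameters(namespace="experiment", name="sciq")

# model, dataset and collator loaders, as in the previous sections
client_model = LLMModelLoader(module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',
                              model_name_or_path='gpt2', emulator_layer_num=4,
                              adapter_top_layer_num=1, adapter_bottom_layer_num=1)
server_model = LLMModelLoader(module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',
                              model_name_or_path='gpt2', emulator_layer_num=4,
                              adapter_top_layer_num=1, adapter_bottom_layer_num=1)
dataset = LLMDatasetLoader(module_name='qa_dataset', item_name='QaDataset',
                           tokenizer_name_or_path='gpt2', select_num=100)
data_collator = LLMDataFuncLoader(module_name='data_collator.cust_data_collator',
                                  item_name='get_seq2seq_data_collator', tokenizer_name_or_path='gpt2')
train_args = Seq2SeqTrainingArguments(per_device_train_batch_size=1, learning_rate=5e-5, num_train_epochs=1)

# 2. aggregate_model=True on every party, so adapters are federated during training
client_conf = get_conf_of_ot_runner(model=client_model, dataset=dataset, data_collator=data_collator,
                                    training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=True)
server_conf = get_conf_of_ot_runner(model=server_model, dataset=dataset, data_collator=data_collator,
                                    training_args=train_args, fed_args=FedAVGArguments(), aggregate_model=True)

homo_nn_0 = HomoNN('nn_0', train_data=reader_0.outputs["output_data"],
                   runner_module="offsite_tuning_runner", runner_class="OTRunner")
homo_nn_0.guest.task_parameters(runner_conf=client_conf)
homo_nn_0.hosts[0].task_parameters(runner_conf=client_conf)   # repeat for each additional host
homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)

pipeline.add_tasks([reader_0, homo_nn_0])
pipeline.compile()
```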
] }, { @@ -574,195 +928,98 @@ "metadata": {}, "outputs": [], "source": [ - "import torch as t\n", - "from torch import nn\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader, Evaluation, DataTransform\n", - "from pipeline.interface import Data, Model\n", - "\n", - "t = fate_torch_hook(t)\n", - "\n", - "import os\n", - "# bind data path to name & namespace\n", - "fate_project_path = os.path.abspath('../../../')\n", - "guest = 9997\n", - "hosts = [9999, 10000]\n", - "arbiter = 9997\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, arbiter=arbiter, host=hosts)\n", - "\n", - "data_9997 = {\"name\": \"sciq-9997-gpt2\", \"namespace\": \"experiment\"}\n", - "data_9999 = {\"name\": \"sciq-9999-gpt2\", \"namespace\": \"experiment\"}\n", - "data_10000 = {\"name\": \"sciq-10000-gpt2\", \"namespace\": \"experiment\"}\n", - "\n", - "# run the binding codes on 9997\n", - "data_path_9997 = fate_project_path + '/sciq/'\n", - "pipeline.bind_table(name=data_9997['name'], namespace=data_9997['namespace'], path=data_path_9997)\n", - "\n", - "# run the binding codes on 9998\n", - "data_path_9999 = fate_project_path + '/sciq/'\n", - "pipeline.bind_table(name=data_9999['name'], namespace=data_9999['namespace'], path=data_path_9999)\n", - "\n", - "# run the binding codes on 10000\n", - "data_path_10000 = fate_project_path + '/sciq/'\n", - "pipeline.bind_table(name=data_10000['name'], namespace=data_10000['namespace'], path=data_path_10000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "253499d2-37a1-4fbe-9427-646d51fd6edd", - "metadata": {}, - "outputs": [], - "source": [ - "# deepspeed config\n", - "ds_config = {\n", - " \"train_micro_batch_size_per_gpu\": 2,\n", - " \"gradient_accumulation_steps\": 2,\n", - " \"optimizer\": {\n", - " \"type\": \"AdamW\",\n", - " \"params\": {\n", - " \"lr\": 5e-5\n", - " }\n", - " }\n", - " ,\n", - " \"fp16\": {\n", - " \"enabled\": False\n", - " }\n", - " ,\n", - " \"zero_optimization\": {\n", - " \"stage\": 1,\n", - " \"offload_optimizer\": {\n", - " \"device\": \"cpu\"\n", - " },\n", - " \"contiguous_gradients\": True,\n", - " \"overlap_comm\": True\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "909dc4fb-8d1e-4831-a6f7-744cf7d826c1", - "metadata": {}, - "outputs": [], - "source": [ - "model_path = 'gpt2'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2283025d-9acf-4ffa-8a25-648aa619528e", - "metadata": {}, - "outputs": [], - "source": [ - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=data_9997)\n", - "reader_0.get_party_instance(role='host', party_id=hosts[0]).component_param(table=data_9999)\n", - "reader_0.get_party_instance(role='host', party_id=hosts[1]).component_param(table=data_10000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ce1cc8a-1003-4379-aa4f-bf3fa28237c8", - "metadata": {}, - "outputs": [], - "source": [ - "from pipeline.component.nn import DatasetParam\n", - "\n", - "# This demo utilizes the same dataset but selects distinct segments to mimic an equal data distribution across different parties. 
\n", - "# We adopt this strategy for the sake of convenience.\n", - "dataset_param_0 = DatasetParam(dataset_name='qa_ds', tokenizer_name_or_path=model_path, start_idx=0, select_num=3893)\n", - "dataset_param_1 = DatasetParam(dataset_name='qa_ds', tokenizer_name_or_path=model_path, start_idx=3893, select_num=3893)\n", - "dataset_param_2 = DatasetParam(dataset_name='qa_ds', tokenizer_name_or_path=model_path, start_idx=7786, select_num=3893)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50ea1168-417c-41da-b7da-b2625c26af50", - "metadata": {}, - "outputs": [], - "source": [ - "from pipeline.component.homo_nn import TrainerParam # Interface\n", - "\n", - "# define model structure\n", - "sub_model_client = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadSubModel', model_name_or_path=model_path \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)\n", - "main_model_server = t.nn.CustModel(module_name='offsite_tuning.gpt2_ot', class_name='GPT2LMHeadMainModel', model_name_or_path=model_path \\\n", - " ,emulator_layer_num=4, adapter_top_layer_num=2, adapter_bottom_layer_num=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dffcace2-0d59-411e-856f-512e7eafd793", - "metadata": {}, - "outputs": [], - "source": [ - "nn_component = HomoNN(name='nn_0')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c854117-3fe1-4a7b-9505-bb131d95f178", - "metadata": {}, - "outputs": [], - "source": [ - "epochs = 8\n", - "# We have 4 party to set\n", - "# Please make sure that need_aggregate is True, and epochs parameter of all parties are the same\n", - "nn_component.get_party_instance(role='guest', party_id=guest).component_param(model=sub_model_client, dataset=dataset_param_0, # dataset\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=epochs, batch_size=4, collate_fn='DataCollatorForTokenClassification', task_type='causal_ml', \\\n", - " save_to_local_dir=True, need_aggregate=True), ds_config=ds_config)\n", - "\n", - "nn_component.get_party_instance(role='host', party_id=hosts[0]).component_param(model=sub_model_client, dataset=dataset_param_1, # dataset\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=epochs, batch_size=4, collate_fn='DataCollatorForTokenClassification', task_type='causal_ml', \\\n", - " save_to_local_dir=True, need_aggregate=True), ds_config=ds_config)\n", - "\n", - "nn_component.get_party_instance(role='host', party_id=hosts[1]).component_param(model=sub_model_client, dataset=dataset_param_2, # dataset\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=epochs, batch_size=4, collate_fn='DataCollatorForTokenClassification', task_type='causal_ml', \\\n", - " save_to_local_dir=True, need_aggregate=True), ds_config=ds_config)\n", - "\n", - "\n", - "nn_component.get_party_instance(role='arbiter', party_id=arbiter).component_param(model=main_model_server,\n", - " trainer=TrainerParam(trainer_name='offsite_tuning_trainer', epochs=epochs, save_to_local_dir=True,\n", - " need_aggregate=True),\n", - " server_init=True\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5d173c1-5d72-4d25-9b78-91e6ef766d8c", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", - "pipeline.compile()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "f6674178-2c59-43d6-b6ce-888e426f27b3", - "metadata": {}, - "outputs": [], - "source": [ - "from pipeline.runtime.entity import JobParameters\n", - "pipeline.fit(JobParameters(task_conf={\n", - " \"nn_0\": {\n", - " \"launcher\": \"deepspeed\",\n", - " \"world_size\": 4\n", - " }\n", - "}))" + "import time\n", + "from fate_client.pipeline.components.fate.reader import Reader\n", + "from fate_client.pipeline import FateFlowPipeline\n", + "from fate_client.pipeline.components.fate.homo_nn import HomoNN, get_conf_of_ot_runner\n", + "from fate_client.pipeline.components.fate.nn.algo_params import Seq2SeqTrainingArguments, FedAVGArguments\n", + "from fate_client.pipeline.components.fate.nn.loader import LLMModelLoader, LLMDatasetLoader, LLMCustFuncLoader\n", + "from peft import LoraConfig, TaskType\n", + "\n", + "\n", + "guest = '10000'\n", + "host = '10000'\n", + "arbiter = '10000'\n", + "\n", + "pipeline = FateFlowPipeline().set_parties(guest=guest, host=host, arbiter=arbiter)\n", + "\n", + "reader_0 = Reader(\"reader_0\", runtime_parties=dict(guest=guest, host=host))\n", + "reader_0.guest.task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"sciq\"\n", + ")\n", + "reader_0.hosts[0].task_parameters(\n", + " namespace=\"experiment\",\n", + " name=\"sciq\"\n", + ")\n", + "\n", + "client_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadSubModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=4,\n", + " adapter_top_layer_num=1,\n", + " adapter_bottom_layer_num=1\n", + ")\n", + "\n", + "server_model = LLMModelLoader(\n", + " module_name='offsite_tuning.gpt2', item_name='GPT2LMHeadMainModel',\n", + " model_name_or_path='gpt2',\n", + " emulator_layer_num=4,\n", + " adapter_top_layer_num=1,\n", + " adapter_bottom_layer_num=1 \n", + ")\n", + "\n", + "dataset = LLMDatasetLoader(\n", + " module_name='qa_dataset', item_name='QaDataset',\n", + " tokenizer_name_or_path='gpt2',\n", + " select_num=100\n", + ")\n", + "\n", + "data_collator = LLMCustFuncLoader(module_name='cust_data_collator', item_name='get_seq2seq_tokenizer', model_path='gpt2')\n", + "\n", + "train_args = Seq2SeqTrainingArguments(\n", + " per_device_train_batch_size=1,\n", + " learning_rate=5e-5,\n", + " disable_tqdm=False,\n", + " num_train_epochs=1,\n", + " logging_steps=10,\n", + " logging_strategy='steps',\n", + " dataloader_num_workers=4\n", + ")\n", + "\n", + "client_conf = get_conf_of_ot_runner(\n", + " model=client_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " aggregate_model=True\n", + ")\n", + "\n", + "server_conf = get_conf_of_ot_runner(\n", + " model=server_model,\n", + " dataset=dataset,\n", + " data_collator=data_collator,\n", + " training_args=train_args,\n", + " fed_args=FedAVGArguments(),\n", + " aggregate_model=True\n", + ")\n", + "\n", + "homo_nn_0 = HomoNN(\n", + " 'nn_0',\n", + " train_data=reader_0.outputs[\"output_data\"],\n", + " runner_module=\"offsite_tuning_runner\",\n", + " runner_class=\"OTRunner\"\n", + ")\n", + "\n", + "homo_nn_0.guest.task_parameters(runner_conf=client_conf)\n", + "homo_nn_0.hosts[0].task_parameters(runner_conf=client_conf)\n", + "homo_nn_0.arbiter.task_parameters(runner_conf=server_conf)\n", + "\n", + "pipeline.add_tasks([reader_0, homo_nn_0])\n", + "\n", + "pipeline.compile()\n", + "pipeline.fit()" ] } ], From efc1074df1702559398ad0f3cb33c027bb1e23ee Mon Sep 17 00:00:00 2001 From: cwj Date: Fri, 1 Mar 2024 17:01:17 +0800 
Subject: [PATCH 30/35] Fix doc path Signed-off-by: weijingchen Signed-off-by: cwj --- README.md | 2 +- .../ChatGLM-6B_ds.ipynb | 463 ------------------ 2 files changed, 1 insertion(+), 464 deletions(-) delete mode 100644 doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb diff --git a/README.md b/README.md index 086a4c8..20a5f0c 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,6 @@ Use [FATE-LLM deployment packages](https://github.com/FederatedAI/FATE/wiki/Down ## Quick Start - [Offsite-tuning Tutorial: Model Definition and Job Submission](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) -- [Federated ChatGLM-6B Training](./doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb) +- [Federated ChatGLM3-6B Training](./doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb) - [Builtin Models In PELLM](./doc/tutorial/builtin_models.md) - [Offsite Tuning Tutorial](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) \ No newline at end of file diff --git a/doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb b/doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb deleted file mode 100644 index f3a43c1..0000000 --- a/doc/tutorial/parameter_efficient_llm/ChatGLM-6B_ds.ipynb +++ /dev/null @@ -1,463 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Federated ChatGLM Tuning with Parameter Efficient methods in FATE-LLM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will demonstrate how to efficiently train federated ChatGLM-6B with deepspeed using the FATE-LLM framework. In FATE-LLM, we introduce the \"pellm\"(Parameter Efficient Large Language Model) module, specifically designed for federated learning with large language models. We enable the implementation of parameter-efficient methods in federated learning, reducing communication overhead while maintaining model performance. In this tutorial we particularlly focus on ChatGLM-^b, and we will also emphasize the use of the Adapter mechanism for fine-tuning ChatGLM-6B, which enables us to effectively reduce communication volume and improve overall efficiency.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FATE-LLM: ChatGLM-6B\n", - "\n", - "### ChatGLM-6B\n", - "ChatGLM-6B is a large transformer-based language model with 6.2 billion parameters, trained on about 1T tokens of Chinese and English corpus. ChatGLM-6B is an open bilingual language model based on General Language Model. You can download the pretrained model from [here](https://huggingface.co/THUDM/chatglm-6b), or let the program automatically download it when you use it later.\n", - "\n", - "### Current Features\n", - "\n", - "In current version, FATE-LLM: ChatGLM-6B supports the following features:\n", - "
\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Experiment Setting" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before running experiment, please make sure that [FATE-LLM Cluster](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) has been deployed. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dataset: Advertising Text Generation\n", - "\n", - "This is an advertising test generateion dataset, you can download dataset from the following links and place it in the examples/data folder. \n", - "- [data link 1](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view)\n", - "- [data link 2](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1) \n", - "\n", - "You can refer to following link for more details about [data](https://aclanthology.org/D19-1321.pdf)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "df = pd.read_json('${fate_install}/examples/data/AdvertiseGen/train.json', lines=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### ChatGLM-6B with Adapter\n", - "\n", - "In this section, we will guide you through the process of finetuning ChatGLM-6B with adapters using the FATE-LLM framework. Before starting this section, we recommend that you read through this tutorial first: [Model Customization](https://github.com/FederatedAI/FATE/blob/master/doc/tutorial/pipeline/nn_tutorial/Homo-NN-Customize-Model.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "ChatGLM model is located on fate_llm/model_zoo/chatglm.py, can be use directly" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "albert.py bert.py deberta.py gpt2.py\t\t\t __pycache__\r\n", - "bart.py chatglm.py distilbert.py parameter_efficient_llm.py roberta.py\r\n" - ] - } - ], - "source": [ - "! ls ../../../fate/python/fate_llm/model_zoo/pellm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Adapters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can directly use adapters from the peft. See details for adapters on this page [Adapter Methods](https://huggingface.co/docs/peft/index) for more details. 
By specifying the adapter name and the adapter\n", - "config dict we can insert adapters into our language models:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from peft import LoraConfig, TaskType\n", - "\n", - "# define lora config\n", - "lora_config = LoraConfig(\n", - " task_type=TaskType.SEQ_CLS,\n", - " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", - " target_modules=['c_attn'],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Init ChatGLM Model " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import torch as t\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component.nn import save_to_fate_llm\n", - "fate_torch_hook(t)\n", - "\n", - "model_path = \"your download chatglm path\"\n", - "model = t.nn.Sequential(\n", - " t.nn.CustModel(module_name='pellm.chatglm', class_name='ChatGLMForConditionalGeneration',\n", - " peft_config=lora_config.to_dict(), peft_type='LoraConfig',\n", - " pretrained_path=model_path)\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**During the training process, all weights of the pretrained language model will be frozen, and weights of adapters are traininable. Thus, FATE-LLM only train in the local training and aggregate adapters' weights in the fedederation process**\n", - "\n", - "Now available adapters are [Adapters Overview](https://huggingface.co/docs/peft/index) for details.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Inint DeepSpeed Config" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "ds_config = {\n", - " \"train_micro_batch_size_per_gpu\": 1,\n", - " \"optimizer\": {\n", - " \"type\": \"Adam\",\n", - " \"params\": {\n", - " \"lr\": 5e-4\n", - " }\n", - " },\n", - " \"fp16\": {\n", - " \"enabled\": True\n", - " },\n", - " \"zero_optimization\": {\n", - " \"stage\": 2,\n", - " \"allgather_partitions\": True,\n", - " \"allgather_bucket_size\": 5e8,\n", - " \"overlap_comm\": False,\n", - " \"reduce_scatter\": True,\n", - " \"reduce_bucket_size\": 5e8,\n", - " \"contiguous_gradients\": True\n", - " }\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit Federated Task\n", - "To run federated task, please make sure to ues fate>=v1.11.2 and deploy it with gpu machines. To running this code, make sure training data path is already binded. The following code shoud be copy to a script and run in a command line like \"python federated_chatglm.py\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch as t\n", - "import os\n", - "from pipeline import fate_torch_hook\n", - "from pipeline.component import HomoNN\n", - "from pipeline.backend.pipeline import PipeLine\n", - "from pipeline.component import Reader\n", - "from pipeline.interface import Data\n", - "from pipeline.runtime.entity import JobParameters\n", - "\n", - "fate_torch_hook(t)\n", - "\n", - "\n", - "guest_0 = 9999\n", - "host_1 = 10000\n", - "pipeline = PipeLine().set_initiator(role='guest', party_id=guest_0).set_roles(guest=guest_0, host=host_1,\n", - " arbiter=guest_0)\n", - "data_guest = {\"name\": \"ad_guest\", \"namespace\": \"experiment\"}\n", - "data_host = {\"name\": \"ad_host\", \"namespace\": \"experiment\"}\n", - "guest_data_path = \"${fate_install}/examples/data/AdvertiseGen/train.json_guest\"\n", - "host_data_path = \"${fate_install}/examples/data/AdvertiseGen/train.json_host\"\n", - "# make sure the guest and host's training data are already binded\n", - "\n", - "reader_0 = Reader(name=\"reader_0\")\n", - "reader_0.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_guest)\n", - "reader_0.get_party_instance(role='host', party_id=host_1).component_param(table=data_host)\n", - "\n", - "## Add your pretriained model path here, will load model&tokenizer from this path\n", - "\n", - "from peft import LoraConfig, TaskType\n", - "lora_config = LoraConfig(\n", - " task_type=TaskType.CAUSAL_LM,\n", - " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", - " target_modules=['query_key_value'],\n", - ")\n", - "ds_config = {\n", - " \"train_micro_batch_size_per_gpu\": 1,\n", - " \"optimizer\": {\n", - " \"type\": \"Adam\",\n", - " \"params\": {\n", - " \"lr\": 5e-4\n", - " }\n", - " },\n", - " \"fp16\": {\n", - " \"enabled\": True\n", - " },\n", - " \"zero_optimization\": {\n", - " \"stage\": 2,\n", - " \"allgather_partitions\": True,\n", - " \"allgather_bucket_size\": 5e8,\n", - " \"overlap_comm\": False,\n", - " \"reduce_scatter\": True,\n", - " \"reduce_bucket_size\": 5e8,\n", - " \"contiguous_gradients\": True\n", - " }\n", - "}\n", - "\n", - "model_path = \"your download chatglm path\"\n", - "from pipeline.component.homo_nn import DatasetParam, TrainerParam\n", - "model = t.nn.Sequential(\n", - " t.nn.CustModel(module_name='pellm.chatglm', class_name='ChatGLMForConditionalGeneration',\n", - " peft_config=lora_config.to_dict(), peft_type='LoraConfig',\n", - " pretrained_path=model_path)\n", - ")\n", - "\n", - "# DatasetParam\n", - "dataset_param = DatasetParam(dataset_name='glm_tokenizer', text_max_length=64, tokenizer_name_or_path=model_path,\n", - " padding_side=\"left\")\n", - "# TrainerParam\n", - "trainer_param = TrainerParam(trainer_name='fedavg_trainer', epochs=5, batch_size=4, \n", - " checkpoint_save_freqs=1, pin_memory=False, \n", - " task_type=\"seq_2_seq_lm\",\n", - " data_loader_worker=8, \n", - " save_to_local_dir=True, # pay attention to tihs parameter\n", - " collate_fn=\"DataCollatorForSeq2Seq\")\n", - "\n", - "\n", - "nn_component = HomoNN(name='nn_0', model=model , ds_config=ds_config)\n", - "\n", - "# set parameter for client 1\n", - "nn_component.get_party_instance(role='guest', party_id=guest_0).component_param(\n", - " dataset=dataset_param,\n", - " trainer=trainer_param,\n", - " torch_seed=100\n", - ")\n", - "\n", - "# set parameter for client 2\n", - "nn_component.get_party_instance(role='host', 
party_id=host_1).component_param(\n", - " dataset=dataset_param,\n", - " trainer=trainer_param,\n", - " torch_seed=100\n", - ")\n", - "\n", - "# set parameter for server\n", - "nn_component.get_party_instance(role='arbiter', party_id=guest_0).component_param(\n", - " trainer=trainer_param\n", - ")\n", - "\n", - "pipeline.add_component(reader_0)\n", - "pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))\n", - "pipeline.compile()\n", - "\n", - "pipeline.fit(JobParameters(task_conf={\n", - " \"nn_0\": {\n", - " \"launcher\": \"deepspeed\",\n", - " \"world_size\": 8 # world_size means num of gpus to train in a single client\n", - " }\n", - "}))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training With P-Tuning V2 Adapter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use another adapter lke P-Tuning V2, slightly changes is needed!" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from pipeline.component.homo_nn import DatasetParam, TrainerParam\n", - "model = t.nn.Sequential(\n", - " t.nn.CustModel(module_name='pellm.chatglm', class_name='ChatGLMForConditionalGeneration',\n", - " pre_seq_len=128, # only this parameters is needed\n", - " pretrained_path=model_path)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Models trained with FATE-LLM can be find under the directory `${fate_install}/fateflow/model/$jobids/$cpn_name/{model.pkl, checkpoint_xxx.pkl/adapter_model.bin}`, users must may sure \"save_to_local_dir=True\". \n", - "The following code is an example to load trained lora adapter weights:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import sys\n", - "import torch\n", - "from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model\n", - "from transformers import AutoModel, AutoTokenizer\n", - "\n", - "\n", - "def load_model(pretrained_model_path):\n", - " _tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", - " _model = AutoModel.from_pretrained(pretrained_model_path, trust_remote_code=True)\n", - "\n", - " _model = _model.half()\n", - " _model = _model.eval()\n", - "\n", - " return _model, _tokenizer\n", - "\n", - "\n", - "def load_data(data_path):\n", - " with open(data_path, \"r\") as fin:\n", - " for _l in fin:\n", - " yield json.loads(_l.strip())\n", - "\n", - "chatglm_model_path = \"\"\n", - "model, tokenizer = load_model(chatglm_model_path)\n", - "\n", - "test_data_path = \"{fate_install}/examples/data/AdvertiseGen/dev.json\"\n", - "dataset = load_data(test_data_path)\n", - "\n", - "peft_path = trained_model_path\n", - "peft_config = LoraConfig(\n", - " task_type=TaskType.CAUSAL_LM,\n", - " inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,\n", - " target_modules=['query_key_value'],\n", - ")\n", - "\n", - "model = get_peft_model(model, peft_config)\n", - "model.load_state_dict(torch.load(peft_path), strict=False)\n", - "model = model.half()\n", - "model.eval()\n", - "\n", - "for p in model.parameters():\n", - " if p.requires_grad:\n", - " print(p)\n", - "\n", - "model.cuda(\"cuda:0\")\n", - "\n", - "content = \"advertisement keywords\"\n", - "model.chat(tokenizer, content, do_sample=False)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From c7cdd027a1af9e76d89dffcaed397129629a52e8 Mon Sep 17 00:00:00 2001 From: cwj Date: Fri, 1 Mar 2024 17:06:40 +0800 Subject: [PATCH 31/35] Update Readme Signed-off-by: weijingchen gi Signed-off-by: cwj --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 20a5f0c..79fabf4 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Deploy FATE-Standalone version with 1.11.3 <= version < 2.0, then copy directory Use [FATE-LLM deployment packages](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) to deploy, refer to [FATE-Cluster deployment](https://github.com/FederatedAI/FATE#cluster-deployment) for more deployment details. ## Quick Start -- [Offsite-tuning Tutorial: Model Definition and Job Submission](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) - [Federated ChatGLM3-6B Training](./doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb) - [Builtin Models In PELLM](./doc/tutorial/builtin_models.md) -- [Offsite Tuning Tutorial](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) \ No newline at end of file +- [Offsite Tuning Tutorial](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) +- [FedKSeed](./doc/tutorial/fedkseed/fedkseed-example.ipynb) \ No newline at end of file From 7a9a02a655a1544d8866553ef4a2747a373535b8 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Wed, 6 Mar 2024 15:42:49 +0800 Subject: [PATCH 32/35] update readme Signed-off-by: mgqa34 --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 79fabf4..02d1f5b 100644 --- a/README.md +++ b/README.md @@ -17,13 +17,14 @@ FATE-LLM is a framework to support federated learning for large language models( ### Standalone deployment Please refer to [FATE-Standalone deployment](https://github.com/FederatedAI/FATE#standalone-deployment). -Deploy FATE-Standalone version with 1.11.3 <= version < 2.0, then copy directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm` +* To deploy FATE-LLM v2.0, deploy FATE-Standalone with version >= 2.1, then copy directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm` +* To deploy FATE-LLM v1.x, deploy FATE-Standalone with 1.11.3 <= version < 2.0, then copy directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm` ### Cluster deployment Use [FATE-LLM deployment packages](https://github.com/FederatedAI/FATE/wiki/Download#llm%E9%83%A8%E7%BD%B2%E5%8C%85) to deploy, refer to [FATE-Cluster deployment](https://github.com/FederatedAI/FATE#cluster-deployment) for more deployment details. 
## Quick Start - [Federated ChatGLM3-6B Training](./doc/tutorial/parameter_efficient_llm/ChatGLM3-6B_ds.ipynb) -- [Builtin Models In PELLM](./doc/tutorial/builtin_models.md) +- [Builtin Models In PELLM](./doc/tutorial/builtin_pellm_models.md) - [Offsite Tuning Tutorial](./doc/tutorial/offsite_tuning/Offsite_tuning_tutorial.ipynb) - [FedKSeed](./doc/tutorial/fedkseed/fedkseed-example.ipynb) \ No newline at end of file From 10643aa77e190668d65bb3dc07bd15374e0310e8 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Wed, 6 Mar 2024 15:44:51 +0800 Subject: [PATCH 33/35] update builtin_pellm_models doc Signed-off-by: mgqa34 --- doc/tutorial/builtin_pellm_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial/builtin_pellm_models.md b/doc/tutorial/builtin_pellm_models.md index 70c3f37..a812102 100644 --- a/doc/tutorial/builtin_pellm_models.md +++ b/doc/tutorial/builtin_pellm_models.md @@ -6,7 +6,7 @@ After reading the training tutorial above, it's easy to use other models listing | Model | ModuleName | ClassName | DataSetName | -| -------------- | ----------------- | --------------| --------------- | | +| -------------- | ----------------- | --------------| --------------- | | Qwen2 | pellm.qwen | Qwen | prompt_dataset | | Bloom-7B1 | pellm.bloom | Bloom | prompt_dataset | | LLaMA-2-7B | pellm.llama | LLaMa | prompt_dataset | From 9de48e8cfb865a9bcd7f731e528d718b3bd65a92 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Wed, 6 Mar 2024 15:45:37 +0800 Subject: [PATCH 34/35] fix builtin_pellm_models doc Signed-off-by: mgqa34 --- doc/tutorial/builtin_pellm_models.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/tutorial/builtin_pellm_models.md b/doc/tutorial/builtin_pellm_models.md index a812102..e2a3d49 100644 --- a/doc/tutorial/builtin_pellm_models.md +++ b/doc/tutorial/builtin_pellm_models.md @@ -12,7 +12,6 @@ After reading the training tutorial above, it's easy to use other models listing | LLaMA-2-7B | pellm.llama | LLaMa | prompt_dataset | | LLaMA-7B | pellm.llama | LLaMa | prompt_dataset | | ChatGLM3-6B | pellm.chatglm | ChatGLM | prompt_dataset | -| ChatGLM-6B | pellm.chatglm | ChatGLM | prompt_dataset | | GPT-2 | pellm.gpt2 | GPT2 | seq_cls_dataset | | ALBERT | pellm.albert | Albert | seq_cls_dataset | | BART | pellm.bart | Bart | seq_cls_dataset | From c53749bfaa29384fc63a31a11f83253125d3dce3 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Wed, 6 Mar 2024 15:49:38 +0800 Subject: [PATCH 35/35] update deployment desc of llm-2.0 Signed-off-by: mgqa34 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 02d1f5b..cfd98ae 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ FATE-LLM is a framework to support federated learning for large language models( ### Standalone deployment Please refer to [FATE-Standalone deployment](https://github.com/FederatedAI/FATE#standalone-deployment). -* To deploy FATE-LLM v2.0, deploy FATE-Standalone with version >= 2.1, then copy directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm` +* To deploy FATE-LLM v2.0, deploy FATE-Standalone with version >= 2.1, then make a new directory `{fate_install}/fate_llm` and clone the code into it, install the python requirements, and add `{fate_install}/fate_llm/python` to `PYTHONPATH` * To deploy FATE-LLM v1.x, deploy FATE-Standalone with 1.11.3 <= version < 2.0, then copy directory `python/fate_llm` to `{fate_install}/fate/python/fate_llm` ### Cluster deployment