diff --git a/llm_ft/fastchat/train/train.py b/llm_ft/fastchat/train/train.py
index 5870e5e6..6130d599 100644
--- a/llm_ft/fastchat/train/train.py
+++ b/llm_ft/fastchat/train/train.py
@@ -29,7 +29,8 @@
 from transformers.trainer_pt_utils import LabelSmoother
 
 from fastchat.conversation import SeparatorStyle
-from fastchat.model.model_adapter import get_conversation_template
+# from fastchat.model.model_adapter import get_conversation_template
+from fastchat.conversation import get_conv_template
 
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
@@ -50,7 +51,7 @@ class DataArguments:
     )
     lazy_preprocess: bool = False
     shuffle: bool = True
-    abort_long_seq: bool = False
+    drop_long_seq: bool = False
 
 
 @dataclass
@@ -87,9 +88,9 @@ def trainer_save_model_safe(trainer: transformers.Trainer):
 def preprocess(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
-    abort_long_seq: bool = False,
+    drop_long_seq: bool = False,
 ) -> Dict:
-    conv = get_conversation_template("vicuna")
+    conv = get_conv_template("sotopia")
     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 
     # Apply prompt templates
@@ -106,13 +107,13 @@ def preprocess(
             conv.append_message(role, sentence["value"])
         conversations.append(conv.get_prompt())
 
-    if abort_long_seq:
+    if drop_long_seq:
         new_conversation = []
         for temp_conv in conversations:
             token_len = tokenizer(temp_conv, return_tensors="pt", padding=False, truncation=False).input_ids.size()[1]
             if token_len <= tokenizer.model_max_length:
                 new_conversation.append(temp_conv)
         conversation = new_conversation
-        print(f"Aborted conversations longer than {tokenizer.model_max_length}; Now have {len(conversation)} conversations")
+        print(f"Dropping conversations longer than {tokenizer.model_max_length}; Now have {len(conversation)} conversations")
 
     # Tokenize conversations
     input_ids = tokenizer(
@@ -174,12 +175,12 @@
 class SupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
+    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, drop_long_seq: bool = False):
         super(SupervisedDataset, self).__init__()
 
         rank0_print("Formatting inputs...")
         sources = [example["conversations"] for example in raw_data]
-        data_dict = preprocess(sources, tokenizer, abort_long_seq=abort_long_seq)
+        data_dict = preprocess(sources, tokenizer, drop_long_seq=drop_long_seq)
 
         self.input_ids = data_dict["input_ids"]
         self.labels = data_dict["labels"]
@@ -199,10 +200,10 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 class LazySupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
-    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
+    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, drop_long_seq: bool = False):
         super(LazySupervisedDataset, self).__init__()
         self.tokenizer = tokenizer
-        self.abort_long_seq = abort_long_seq
+        self.drop_long_seq = drop_long_seq
 
         rank0_print("Formatting inputs...Skip in lazy mode")
         self.tokenizer = tokenizer
@@ -239,13 +240,13 @@ def make_supervised_data_module(
     train_json = json.load(open(data_args.data_path, "r"))
     if data_args.shuffle:
         random.shuffle(train_json)
-    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, abort_long_seq = data_args.abort_long_seq)
+    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, drop_long_seq = data_args.drop_long_seq)
 
     if data_args.eval_data_path:
         eval_json = json.load(open(data_args.eval_data_path, "r"))
         if data_args.shuffle:
             random.shuffle(train_json)
-        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, abort_long_seq = data_args.abort_long_seq)
+        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, drop_long_seq = data_args.drop_long_seq)
     else:
         eval_dataset = None
 
diff --git a/llm_ft/fastchat/train/train_lora.py b/llm_ft/fastchat/train/train_lora.py
index 4d845e49..77032654 100644
--- a/llm_ft/fastchat/train/train_lora.py
+++ b/llm_ft/fastchat/train/train_lora.py
@@ -112,6 +112,8 @@ def train():
         training_args,
         lora_args,
     ) = parser.parse_args_into_dataclasses()
+
+    print(data_args)
 
     device_map = None
     world_size = int(os.environ.get("WORLD_SIZE", 1))
diff --git a/llm_ft/llama2-13b_qlora_train.sh b/llm_ft/llama2-13b_qlora_train.sh
index f24332e2..a715a40d 100644
--- a/llm_ft/llama2-13b_qlora_train.sh
+++ b/llm_ft/llama2-13b_qlora_train.sh
@@ -1,16 +1,16 @@
 deepspeed --num_gpus=1 fastchat/train/train_lora.py \
-    --model_name_or_path meta-llama/Llama-2-13b-chat-hf \
+    --model_name_or_path meta-llama/Llama-2-13b-hf \
     --lora_r 8 \
     --lora_alpha 16 \
     --lora_dropout 0.05 \
-    --data_path ./data/fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak-drop-long.json \
+    --data_path ./data/fastchat-ft-gpt4-gpt4-easy-2-side-partial.json \
     --shuffle True \
     --bf16 True \
-    --output_dir ./checkpoint-shuffle-speak-drop-long \
-    --num_train_epochs 4 \
+    --output_dir ./checkpoint-shuffle-drop-long \
+    --num_train_epochs 20 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 32 \
+    --gradient_accumulation_steps 16 \
     --evaluation_strategy "no" \
     --save_strategy "epoch" \
     --save_total_limit 6 \
@@ -25,12 +25,12 @@ deepspeed --num_gpus=1 fastchat/train/train_lora.py \
     --hf_access_token "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" \
     --tf32 True \
     --flash_attn True \
-    --abort_long_seq True \
+    --drop_long_seq True \
 
 # Possible other options
 # --flash_attn True \
 # --tf32 True \
 # --save_strategy "steps" \
 # --save_steps 1200 \
-# --abort_long_seq True \
+# --drop_long_seq True \
 # --lazy_preprocess True \
\ No newline at end of file