
Commit

added sotopia template
Jasonqi146 committed Oct 26, 2023
1 parent 30dc2a4 commit 9a01411
Showing 3 changed files with 22 additions and 19 deletions.
25 changes: 13 additions & 12 deletions llm_ft/fastchat/train/train.py
@@ -29,7 +29,8 @@
from transformers.trainer_pt_utils import LabelSmoother

from fastchat.conversation import SeparatorStyle
- from fastchat.model.model_adapter import get_conversation_template
+ # from fastchat.model.model_adapter import get_conversation_template
+ from fastchat.conversation import get_conv_template

IGNORE_TOKEN_ID = LabelSmoother.ignore_index

@@ -50,7 +51,7 @@ class DataArguments:
)
lazy_preprocess: bool = False
shuffle: bool = True
- abort_long_seq: bool = False
+ drop_long_seq: bool = False


@dataclass
@@ -87,9 +88,9 @@ def trainer_save_model_safe(trainer: transformers.Trainer):
def preprocess(
sources,
tokenizer: transformers.PreTrainedTokenizer,
- abort_long_seq: bool = False,
+ drop_long_seq: bool = False,
) -> Dict:
- conv = get_conversation_template("vicuna")
+ conv = get_conv_template("sotopia")
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

# Apply prompt templates
@@ -106,13 +107,13 @@ def preprocess(
conv.append_message(role, sentence["value"])
conversations.append(conv.get_prompt())

- if abort_long_seq:
+ if drop_long_seq:
new_conversation = []
for temp_conv in conversations:
token_len = tokenizer(temp_conv, return_tensors="pt", padding=False, truncation=False).input_ids.size()[1]
if token_len <= tokenizer.model_max_length: new_conversation.append(temp_conv)
conversation = new_conversation
print(f"Aborted conversations longer than {tokenizer.model_max_length}; Now have {len(conversation)} conversations")
print(f"Dropping conversations longer than {tokenizer.model_max_length}; Now have {len(conversation)} conversations")

# Tokenize conversations
input_ids = tokenizer(
@@ -174,12 +175,12 @@ def preprocess(
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

- def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
+ def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, drop_long_seq: bool = False):
super(SupervisedDataset, self).__init__()

rank0_print("Formatting inputs...")
sources = [example["conversations"] for example in raw_data]
- data_dict = preprocess(sources, tokenizer, abort_long_seq=abort_long_seq)
+ data_dict = preprocess(sources, tokenizer, drop_long_seq=drop_long_seq)

self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]
@@ -199,10 +200,10 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class LazySupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

- def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
+ def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, drop_long_seq: bool = False):
super(LazySupervisedDataset, self).__init__()
self.tokenizer = tokenizer
- self.abort_long_seq = abort_long_seq
+ self.drop_long_seq = drop_long_seq

rank0_print("Formatting inputs...Skip in lazy mode")
self.tokenizer = tokenizer
@@ -239,13 +240,13 @@ def make_supervised_data_module(
train_json = json.load(open(data_args.data_path, "r"))
if data_args.shuffle: random.shuffle(train_json)

- train_dataset = dataset_cls(train_json, tokenizer=tokenizer, abort_long_seq = data_args.abort_long_seq)
+ train_dataset = dataset_cls(train_json, tokenizer=tokenizer, drop_long_seq = data_args.drop_long_seq)

if data_args.eval_data_path:
eval_json = json.load(open(data_args.eval_data_path, "r"))
if data_args.shuffle: random.shuffle(train_json)

- eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, abort_long_seq = data_args.abort_long_seq)
+ eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, drop_long_seq = data_args.drop_long_seq)
else:
eval_dataset = None

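The core of the train.py change is twofold: prompts are now rendered with the "sotopia" conversation template instead of the default "vicuna" one, and an opt-in drop_long_seq pass discards any rendered conversation whose token count exceeds tokenizer.model_max_length. A minimal sketch of that flow is below; it is not the repository's exact code, the helper name is hypothetical, and it assumes FastChat is installed with a "sotopia" template registered in fastchat.conversation, as this commit presupposes.

# Hypothetical standalone sketch of the template + length-filter flow.
from fastchat.conversation import get_conv_template

def build_and_filter(sources, tokenizer, drop_long_seq=True):
    # Render each conversation (a list of {"from": ..., "value": ...} turns)
    # with the sotopia template.
    conversations = []
    for source in sources:
        conv = get_conv_template("sotopia")
        roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
        for sentence in source:
            conv.append_message(roles[sentence["from"]], sentence["value"])
        conversations.append(conv.get_prompt())
    # Mirror the drop_long_seq branch: keep only prompts that fit the model context.
    if drop_long_seq:
        conversations = [
            text for text in conversations
            if tokenizer(text, return_tensors="pt", padding=False,
                         truncation=False).input_ids.size(1) <= tokenizer.model_max_length
        ]
    return conversations
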
2 changes: 2 additions & 0 deletions llm_ft/fastchat/train/train_lora.py
@@ -112,6 +112,8 @@ def train():
training_args,
lora_args,
) = parser.parse_args_into_dataclasses()
+
+ print(data_args)

device_map = None
world_size = int(os.environ.get("WORLD_SIZE", 1))
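For context on the debug print added above: train_lora.py parses command-line flags such as --drop_long_seq directly into dataclasses via transformers.HfArgumentParser, so the renamed field on DataArguments is what the shell script below must now reference. A trimmed, hypothetical sketch of that mechanism (not the repository's full argument set):

# parse_demo.py -- run e.g.: python parse_demo.py --data_path data.json --drop_long_seq True
from dataclasses import dataclass, field
from typing import Optional
import transformers

@dataclass
class DataArguments:
    data_path: Optional[str] = field(default=None)
    eval_data_path: Optional[str] = field(default=None)
    lazy_preprocess: bool = False
    shuffle: bool = True
    drop_long_seq: bool = False  # renamed from abort_long_seq in this commit

if __name__ == "__main__":
    parser = transformers.HfArgumentParser((DataArguments,))
    (data_args,) = parser.parse_args_into_dataclasses()
    print(data_args)  # the same kind of sanity check as the line added in train()
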
14 changes: 7 additions & 7 deletions llm_ft/llama2-13b_qlora_train.sh
@@ -1,16 +1,16 @@
deepspeed --num_gpus=1 fastchat/train/train_lora.py \
- --model_name_or_path meta-llama/Llama-2-13b-chat-hf \
+ --model_name_or_path meta-llama/Llama-2-13b-hf \
--lora_r 8 \
--lora_alpha 16 \
--lora_dropout 0.05 \
- --data_path ./data/fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak-drop-long.json \
+ --data_path ./data/fastchat-ft-gpt4-gpt4-easy-2-side-partial.json \
--shuffle True \
--bf16 True \
- --output_dir ./checkpoint-shuffle-speak-drop-long \
- --num_train_epochs 4 \
+ --output_dir ./checkpoint-shuffle-drop-long \
+ --num_train_epochs 20 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
- --gradient_accumulation_steps 32 \
+ --gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "epoch" \
--save_total_limit 6 \
@@ -25,12 +25,12 @@ deepspeed --num_gpus=1 fastchat/train/train_lora.py \
--hf_access_token "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" \
--tf32 True \
--flash_attn True \
- --abort_long_seq True \
+ --drop_long_seq True \

# Possible other options
# --flash_attn True \
# --tf32 True \
# --save_strategy "steps" \
# --save_steps 1200 \
- # --abort_long_seq True \
+ # --drop_long_seq True \
# --lazy_preprocess True \
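
One consequence of the new settings worth keeping in mind: with deepspeed --num_gpus=1 and a per-device batch of 1, halving --gradient_accumulation_steps from 32 to 16 halves the number of sequences per optimizer step, while --num_train_epochs rises from 4 to 20. A back-of-the-envelope check (plain arithmetic, not repository code):

# Effective batch size = GPUs * per-device batch * gradient accumulation steps.
num_gpus = 1                      # deepspeed --num_gpus=1
per_device_train_batch_size = 1   # --per_device_train_batch_size 1
old_accum, new_accum = 32, 16     # --gradient_accumulation_steps before/after
print(num_gpus * per_device_train_batch_size * old_accum)  # 32 sequences per step before
print(num_gpus * per_device_train_batch_size * new_accum)  # 16 sequences per step after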
