Organize together data #72

Merged
16 commits merged on Oct 25, 2023
2 changes: 1 addition & 1 deletion .gitignore
@@ -33,4 +33,4 @@ tests/state_of_the_union.txt

# Build
build
!dummy_file
!dummy_file
89 changes: 0 additions & 89 deletions data_process/data/multiturn_data/multiturn_data_preprocess.py

This file was deleted.

12 changes: 12 additions & 0 deletions llm_ft/data/create_dummy.py
@@ -0,0 +1,12 @@
import json

dummy_qa = {"id": "", "conversations": [{"from": "human", "value": "How old is Haofei?"}, {"from": "gpt", "value": "He is one year old."}]}

res = []
for i in range(1000):
    new_qa = dict(dummy_qa)
    new_qa["id"] = f"identity_{i}"
    res.append(new_qa)

with open("./dummy_convs.json", "w") as f:
    json.dump(res, f, indent=4)
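
For reference, each of the 1,000 records written to dummy_convs.json has this shape (ids run identity_0 through identity_999):

{
    "id": "identity_0",
    "conversations": [
        {"from": "human", "value": "How old is Haofei?"},
        {"from": "gpt", "value": "He is one year old."}
    ]
}

The shallow dict(dummy_qa) copy is safe here because only the top-level "id" field is rewritten per record.
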
28 changes: 28 additions & 0 deletions llm_ft/data/data_filter_out_long.py
@@ -0,0 +1,28 @@
import json
import transformers

INPUT_PATH = "fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak.json"
OUTPUT_PATH = "fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak-no-long.json"
MODEL_CHECKPOINT = "meta-llama/Llama-2-13b-chat-hf"
HF_TOKEN = "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"

with open(INPUT_PATH, 'r') as f:
    data = json.load(f)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    padding=False,
    truncation=False,
    token=HF_TOKEN,
)

res = []
for d in data:
    # Keep a conversation only if every human turn fits in 2048 tokens.
    # Note: tokenizer(...) returns a BatchEncoding; the token count is the
    # length of its input_ids, not of the encoding object itself.
    if all(len(tokenizer(conv['value']).input_ids) <= 2048
           for conv in d['conversations'] if conv['from'] == "human"):
        res.append(d)

with open(OUTPUT_PATH, 'w') as f:
    json.dump(res, f, indent=4)
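
A quick way to sanity-check the filtered output, sketched under the assumption that the new JSON file sits in the working directory (this snippet is not part of the PR):

import json
import transformers

# This checkpoint is gated, so a valid Hugging Face token is needed.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf", token="hf_...",
)
with open("fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak-no-long.json") as f:
    kept = json.load(f)

# Every human turn should now tokenize to at most 2048 tokens.
assert all(
    len(tokenizer(conv["value"]).input_ids) <= 2048
    for d in kept
    for conv in d["conversations"]
    if conv["from"] == "human"
)
print(f"{len(kept)} conversations kept")
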
16 changes: 16 additions & 0 deletions llm_ft/data/data_keep_only_speak.py
@@ -0,0 +1,16 @@
import json

INPUT_PATH = "fastchat-ft-gpt4-gpt4-easy-2-side-partial.json"
OUTPUT_PATH = "fastchat-ft-gpt4-gpt4-easy-2-side-partial-speak.json"

with open(INPUT_PATH, 'r') as f:
    data = json.load(f)

res = []
for d in data:
    # Keep a conversation if any gpt turn is a 'speak' action; appending
    # inside the inner loop would duplicate conversations with several matches.
    if any(conv['from'] == "gpt" and "'action_type': 'speak'" in conv['value']
           for conv in d['conversations']):
        res.append(d)

with open(OUTPUT_PATH, 'w') as f:
    json.dump(res, f, indent=4)
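
The filter keys on a literal substring, so it expects gpt turns whose value serializes an action dict. A turn that survives would look roughly like this (the argument text, and the presence of an 'argument' key at all, are illustrative assumptions):

{"from": "gpt", "value": "{'action_type': 'speak', 'argument': 'Hi, nice to meet you!'}"}
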
8 changes: 7 additions & 1 deletion llm_ft/fastchat/model/model_adapter.py
@@ -60,10 +60,12 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
use_fast=self.use_fast_tokenizer,
revision=revision,
trust_remote_code=True,
token=from_pretrained_kwargs.get("token"),
)
except TypeError:
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=False, revision=revision, trust_remote_code=True
model_path, use_fast=False, revision=revision, trust_remote_code=True,
token=from_pretrained_kwargs.get("token"),
)
try:
model = AutoModelForCausalLM.from_pretrained(
@@ -154,6 +156,7 @@ def load_model(
awq_config: Optional[AWQConfig] = None,
revision: str = "main",
debug: bool = False,
hf_access_token: Optional[str] = None,
):
"""Load a model from Hugging Face."""
# get model adapter
@@ -280,6 +283,9 @@ def load_model(

if dtype is not None: # Overwrite dtype if it is provided in the arguments.
kwargs["torch_dtype"] = dtype

if hf_access_token:
kwargs["token"] = hf_access_token

# Load model
model, tokenizer = adapter.load_model(model_path, kwargs)
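With the token threaded through kwargs, a gated checkpoint can now be loaded in one call; a minimal sketch (the device and num_gpus values are assumptions, not part of this diff):

from fastchat.model.model_adapter import load_model

model, tokenizer = load_model(
    "meta-llama/Llama-2-13b-chat-hf",
    device="cuda",
    num_gpus=1,
    hf_access_token="hf_...",  # forwarded into kwargs["token"] above
)
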
4 changes: 4 additions & 0 deletions llm_ft/fastchat/serve/cli.py
@@ -236,6 +236,7 @@ def main(args):
judge_sent_end=args.judge_sent_end,
debug=args.debug,
history=not args.no_history,
hf_access_token=args.hf_access_token,
)
except KeyboardInterrupt:
print("exit...")
@@ -281,5 +282,8 @@ def main(args):
action="store_true",
help="Print useful debug information (e.g., prompts)",
)
parser.add_argument(
"--hf-access-token", type=str, default=None, help="Optional access token for Hugging Face."
)
args = parser.parse_args()
main(args)
2 changes: 2 additions & 0 deletions llm_ft/fastchat/serve/inference.py
@@ -308,6 +308,7 @@ def chat_loop(
judge_sent_end: bool = True,
debug: bool = True,
history: bool = True,
hf_access_token: Optional[str] = None,
):
# Model
model, tokenizer = load_model(
@@ -322,6 +323,7 @@
awq_config=awq_config,
revision=revision,
debug=debug,
hf_access_token=hf_access_token,
)
generate_stream_func = get_generate_stream_function(model, model_path)

32 changes: 25 additions & 7 deletions llm_ft/fastchat/train/train.py
@@ -18,6 +18,7 @@
import json
import math
import pathlib
import random
from typing import Dict, Optional, Sequence

import numpy as np
@@ -36,6 +37,7 @@
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
hf_access_token: Optional[str] = field(default=None)


@dataclass
@@ -47,6 +49,8 @@ class DataArguments:
default=None, metadata={"help": "Path to the evaluation data."}
)
lazy_preprocess: bool = False
shuffle: bool = True
abort_long_seq: bool = False


@dataclass
@@ -83,6 +87,7 @@ def trainer_save_model_safe(trainer: transformers.Trainer):
def preprocess(
sources,
tokenizer: transformers.PreTrainedTokenizer,
abort_long_seq: bool = False,
) -> Dict:
conv = get_conversation_template("vicuna")
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
@@ -100,7 +105,15 @@
assert role == conv.roles[j % 2], f"{i}"
conv.append_message(role, sentence["value"])
conversations.append(conv.get_prompt())


if abort_long_seq:
    # Drop conversations whose tokenized length exceeds the model context window.
    new_conversations = []
    for temp_conv in conversations:
        token_len = tokenizer(temp_conv, return_tensors="pt", padding=False, truncation=False).input_ids.size(1)
        if token_len <= tokenizer.model_max_length:
            new_conversations.append(temp_conv)
    conversations = new_conversations
    print(f"Aborted conversations longer than {tokenizer.model_max_length} tokens; {len(conversations)} conversations remain.")

# Tokenize conversations
input_ids = tokenizer(
conversations,
@@ -151,7 +164,6 @@ def preprocess(
f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
f" (ignored)"
)

return dict(
input_ids=input_ids,
labels=targets,
@@ -162,12 +174,12 @@
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
super(SupervisedDataset, self).__init__()

rank0_print("Formatting inputs...")
sources = [example["conversations"] for example in raw_data]
data_dict = preprocess(sources, tokenizer)
data_dict = preprocess(sources, tokenizer, abort_long_seq=abort_long_seq)

self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]
@@ -187,9 +199,10 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class LazySupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, abort_long_seq: bool = False):
super(LazySupervisedDataset, self).__init__()
self.tokenizer = tokenizer
self.abort_long_seq = abort_long_seq

rank0_print("Formatting inputs...Skip in lazy mode")
self.tokenizer = tokenizer
@@ -224,13 +237,18 @@ def make_supervised_data_module(
rank0_print("Loading data...")

train_json = json.load(open(data_args.data_path, "r"))
train_dataset = dataset_cls(train_json, tokenizer=tokenizer)
if data_args.shuffle: random.shuffle(train_json)

train_dataset = dataset_cls(train_json, tokenizer=tokenizer, abort_long_seq=data_args.abort_long_seq)

if data_args.eval_data_path:
eval_json = json.load(open(data_args.eval_data_path, "r"))
eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer)
if data_args.shuffle: random.shuffle(eval_json)

eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, abort_long_seq=data_args.abort_long_seq)
else:
eval_dataset = None
return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

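The new DataArguments fields can be exercised end to end; a sketch assuming the dummy data generated above and a placeholder checkpoint name:

import transformers
from fastchat.train.train import DataArguments, make_supervised_data_module

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    model_max_length=2048,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token

# shuffle reorders the raw records; abort_long_seq drops over-length conversations in preprocess().
data_args = DataArguments(data_path="dummy_convs.json", shuffle=True, abort_long_seq=True)
data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
print(len(data_module["train_dataset"]))
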
8 changes: 7 additions & 1 deletion llm_ft/fastchat/train/train_lora.py
@@ -20,6 +20,11 @@
import pathlib
import typing
import os
import sys

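# Make the repo root importable so `fastchat` resolves when this file is run directly.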
current_directory = os.path.dirname(os.path.abspath(__file__))
root_directory = os.path.join(current_directory, '..', '..')
sys.path.append(root_directory)

from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
@@ -35,7 +40,6 @@
)



@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: typing.Optional[str] = field(default=None)
@@ -137,6 +141,7 @@ def train():
)
if lora_args.q_lora
else None,
token=model_args.hf_access_token or None,
)
lora_config = LoraConfig(
r=lora_args.lora_r,
@@ -176,6 +181,7 @@ def train():
model_max_length=training_args.model_max_length,
padding_side="right",
use_fast=False,
token=model_args.hf_access_token or None,
)
tokenizer.pad_token = tokenizer.unk_token

1 change: 0 additions & 1 deletion llm_ft/inference.sh

This file was deleted.

1 change: 1 addition & 0 deletions llm_ft/llama2-13b_inference.sh
@@ -0,0 +1 @@
python3 -m fastchat.serve.cli --model-path ./checkpoint-shuffle/checkpoint-161 --hf-access-token "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" --conv-template "vicuna_v1.1"