From 491c1b0ae292ce2a4c2ac1f50aac04c5911318cc Mon Sep 17 00:00:00 2001
From: Ruiyi Wang <76935534+ruiyiw@users.noreply.github.com>
Date: Mon, 6 Nov 2023 18:38:41 -0500
Subject: [PATCH] Feature/move multiturn data (#84)

* support qlora mistral training

* added deep speed to requirements

* temporary save for switching disk region

* added shuffle and access token

* finished training pipeline; need to fix inference

* finished training pipeline; need to fix inference

* fixed inference pipeline

* committing to test deepspeed

* added feature to remove seq longer than 2048

* try to merge

* minor changes

* minor changes

* Move together data

* rename data process files and add together multiturn data preprocess

---------

Co-authored-by: lwaekfjlk <1125027232@qq.com>
Co-authored-by: Jasonqi146
Co-authored-by: zqi2cmu
Co-authored-by: Wonderplex <50866817+Jasonqi146@users.noreply.github.com>
(cherry picked from commit 6f285ba8559a3b5cd7cebd6839b22b8a41cff038)
---
Review note: the added script was revised relative to the cherry-picked
commit (sorted input order, dotfiles and sub-directories skipped, explicit
UTF-8, trailing newline). The blob id on the "index" line below still
refers to the original 20-line version of the file.

 data_process/dummyfile                   |  0
 data_process/fastchat_data_preprocess.py | 39 ++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 delete mode 100644 data_process/dummyfile
 create mode 100644 data_process/fastchat_data_preprocess.py

diff --git a/data_process/dummyfile b/data_process/dummyfile
deleted file mode 100644
index e69de29b..00000000
diff --git a/data_process/fastchat_data_preprocess.py b/data_process/fastchat_data_preprocess.py
new file mode 100644
index 00000000..1674f966
--- /dev/null
+++ b/data_process/fastchat_data_preprocess.py
@@ -0,0 +1,39 @@
+"""Convert Sotopia fine-tuning samples into FastChat conversation format.
+
+Every file in ``sotopia_data_dir`` is read as a JSON object with "prompt" and
+"result" keys and becomes one single-turn human/gpt conversation in the
+output list.
+"""
+import json
+import os
+
+sotopia_data_dir = "/Users/pamela/Documents/capstone/sotopia-ft-data/ft-data-gpt4-gpt4-easy-2-side-partial"
+# NOTE(review): "gp4" looks like a typo for "gpt4"; the name is kept as-is
+# because other scripts or configs may already reference this file.
+output_path = "fastchat-ft-gp4-gpt4-easy-2-side-partial.json"
+
+# Sort so the "identity_<n>" ids are reproducible (os.listdir order is
+# arbitrary), and skip dotfiles (e.g. .DS_Store) and sub-directories.
+sample_files = [
+    name
+    for name in sorted(os.listdir(sotopia_data_dir))
+    if not name.startswith(".")
+    and os.path.isfile(os.path.join(sotopia_data_dir, name))
+]
+
+ft_data_list = []
+for count, file in enumerate(sample_files):
+    with open(os.path.join(sotopia_data_dir, file), encoding="utf-8") as f:
+        file_dict = json.load(f)
+    ft_data_list.append(
+        {
+            "id": f"identity_{count}",
+            "conversations": [
+                {"from": "human", "value": file_dict["prompt"]},
+                {"from": "gpt", "value": file_dict["result"]},
+            ],
+        }
+    )
+
+with open(output_path, "w", encoding="utf-8") as f:
+    json.dump(ft_data_list, f, indent=4)