From 896854b23f926ce4f6732eb8bc5fd3152352401f Mon Sep 17 00:00:00 2001 From: Ruiyi Wang <76935534+ruiyiw@users.noreply.github.com> Date: Mon, 6 Nov 2023 18:38:41 -0500 Subject: [PATCH] Feature/move multiturn data (#84) * support qlora mistral training * added deep speed to requirements * temporary save for switching disk region * added shuffle and access token * finished training pipeline; need to fix inference * finished training pipeline; need to fix inference * fixed inference pipeline * commiting to test deepspeed * added featurere to remove seq longer than 2048 * try to merge * minor changes * minor changes * Move together data * rename data process files and add together multiturn data preprocess --------- Co-authored-by: lwaekfjlk <1125027232@qq.com> Co-authored-by: Jasonqi146 Co-authored-by: zqi2cmu Co-authored-by: Wonderplex <50866817+Jasonqi146@users.noreply.github.com> (cherry picked from commit 6f285ba8559a3b5cd7cebd6839b22b8a41cff038) Signed-off-by: Haofei Yu <1125027232@qq.com> --- data_process/data/data_process.py | 72 ------------- data_process/dummyfile | 0 .../fastchat_data_preprocess.py | 0 .../completion_data_preprocess.py | 17 +++ .../data_process/multiturn_data_preprocess.py | 19 ++++ together_ai_ft/together_ai_data.jsonl | 100 ------------------ 6 files changed, 36 insertions(+), 172 deletions(-) delete mode 100644 data_process/data/data_process.py delete mode 100644 data_process/dummyfile rename data_process/{data/fastchat_data => }/fastchat_data_preprocess.py (100%) create mode 100644 together_ai_ft/data_process/completion_data_preprocess.py create mode 100644 together_ai_ft/data_process/multiturn_data_preprocess.py delete mode 100644 together_ai_ft/together_ai_data.jsonl diff --git a/data_process/data/data_process.py b/data_process/data/data_process.py deleted file mode 100644 index d95ee753..00000000 --- a/data_process/data/data_process.py +++ /dev/null @@ -1,72 +0,0 @@ -import json -import os - -data_dir = "sotopia-data/" -file_list = os.listdir(data_dir) -full_data = [] - -for data_file in file_list: - with open(os.path.join(data_dir, data_file), 'r') as f: - # with open("sotopia-data/01H8D2KJXFPCNB7GDJMVVCQ45Z.json", 'r') as f: - dic = json.load(f) - id = dic["pk"] - messages = dic["messages"] - - human_conv, ai_conv = [], [] - for i in range(len(messages)-1): - msg = messages[i] - if len(msg) != 4: - break - if i == 0: - # env_to_human = msg[0][2] - env_to_ai = msg[1][2] - # human_conv.append(env_to_human) - human_conv.append(env_to_ai) - # ai_conv.append(env_to_ai) - ai_conv.append("Sure!") - - if i % 2 == 0: - if msg[2][2][0] == "s": - human_to_ai = msg[2][2][6:] - else: - human_to_ai = msg[2][2] - human_conv.append(human_to_ai) - else: - if msg[3][2][0] == "s": - ai_to_human = msg[3][2][6:] - else: - ai_to_human = msg[3][2] - ai_conv.append(ai_to_human) - - conv_dic = {} - conv_dic["id"] = id - conversations = [] - - for i in range(min(len(human_conv), len(ai_conv))): - human_dic = {} - human_dic["from"] = "human" - human_dic["value"] = human_conv[i] - ai_dic = {} - ai_dic["from"] = "gpt" - ai_dic["value"] = ai_conv[i] - - conversations.append(human_dic) - conversations.append(ai_dic) - - if len(human_conv) > len(ai_conv): - human_dic = {} - human_dic["from"] = "human" - human_dic["value"] = human_conv[i] - conversations.append(human_dic) - elif len(human_conv) < len(ai_conv): - ai_dic = {} - ai_dic["from"] = "gpt" - ai_dic["value"] = ai_conv[i] - conversations.append(ai_dic) - - conv_dic["conversations"] = conversations - - full_data.append(conv_dic) - -with open("full-data.json", "w") as f: - json.dump(full_data, f, indent=4) diff --git a/data_process/dummyfile b/data_process/dummyfile deleted file mode 100644 index e69de29b..00000000 diff --git a/data_process/data/fastchat_data/fastchat_data_preprocess.py b/data_process/fastchat_data_preprocess.py similarity index 100% rename from data_process/data/fastchat_data/fastchat_data_preprocess.py rename to data_process/fastchat_data_preprocess.py diff --git a/together_ai_ft/data_process/completion_data_preprocess.py b/together_ai_ft/data_process/completion_data_preprocess.py new file mode 100644 index 00000000..c5df559f --- /dev/null +++ b/together_ai_ft/data_process/completion_data_preprocess.py @@ -0,0 +1,17 @@ +import json +import os + +sotopia_data_dir = "ft-data-gpt4-gpt4-easy-2-side-partial/" + +ft_data_list = [] +for file in os.listdir(sotopia_data_dir): + with open(os.path.join(sotopia_data_dir, file), 'r') as f: # 2510 + file_dict = json.load(f) + output = file_dict["prompt"] + " " + file_dict["result"] + ft_data_list.append(output) + + +with open("human-bot-train-gpt4-gpt4-easy-2-side-partial.jsonl", 'w') as f: + for data in ft_data_list: + f.write(json.dumps({"text": data})) + f.write('\n') diff --git a/together_ai_ft/data_process/multiturn_data_preprocess.py b/together_ai_ft/data_process/multiturn_data_preprocess.py new file mode 100644 index 00000000..1154e038 --- /dev/null +++ b/together_ai_ft/data_process/multiturn_data_preprocess.py @@ -0,0 +1,19 @@ +import json +import os + +sotopia_data_dir = "/Users/pamela/Documents/capstone/sotopia-ft-data/GPT4-4_Redis_Easy_No_Filter" + +together_data_template = """[INST] {user_msg} [/INST] {assistant_msg} """ + +lines = [] +for file in os.listdir(sotopia_data_dir): + with open(os.path.join(sotopia_data_dir, file), 'r') as f: + file_dict = json.load(f) + text = together_data_template.format( + user_msg=file_dict["prompt"], assistant_msg=file_dict["result"]) + lines.append(text) + +with open("together-ft-gpt4-gpt4-easy-no-filter.jsonl", 'w') as f: + for line in lines: + f.write(json.dumps({"text": line})) + f.write('\n') diff --git a/together_ai_ft/together_ai_data.jsonl b/together_ai_ft/together_ai_data.jsonl deleted file mode 100644 index 75367e46..00000000 --- a/together_ai_ft/together_ai_data.jsonl +++ /dev/null @@ -1,100 +0,0 @@ -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} -{"text":"[INST] how old are you? [/INST] one yaer old. "} \ No newline at end of file