From c68d3074089f4eaa1ebfb1c3805a66a3411480dd Mon Sep 17 00:00:00 2001
From: Ruiyi Wang <76935534+ruiyiw@users.noreply.github.com>
Date: Wed, 11 Oct 2023 16:12:17 -0400
Subject: [PATCH] add converting file for fastchat and together (#46)

(cherry picked from commit bc855958d8f7bc7ed9f40e941525e38189f095eb)
---
Review note: both scripts were revised during review (sorted input listing,
hidden files and subdirectories skipped, explicit UTF-8, optional input
directory argument, documentation). The blob ids on the "index" lines below
are those of the original commit and were not recomputed.

 .../fastchat_data/fastchat_data_preprocess.py | 37 +++++++++++++++++++
 .../together_data/together_data_preprocess.py | 31 ++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 data_process/data/fastchat_data/fastchat_data_preprocess.py
 create mode 100644 data_process/data/together_data/together_data_preprocess.py

diff --git a/data_process/data/fastchat_data/fastchat_data_preprocess.py b/data_process/data/fastchat_data/fastchat_data_preprocess.py
new file mode 100644
index 00000000..1674f966
--- /dev/null
+++ b/data_process/data/fastchat_data/fastchat_data_preprocess.py
@@ -0,0 +1,37 @@
+"""Convert Sotopia fine-tuning samples into one FastChat conversation file.
+
+Each input file must be a JSON object with "prompt" and "result" keys; each
+one becomes a single human/gpt exchange in the output list.
+
+Usage: python fastchat_data_preprocess.py [SOTOPIA_DATA_DIR]
+"""
+import json
+import os
+import sys
+
+# Default input directory; pass another one as the first CLI argument.
+sotopia_data_dir = "/Users/pamela/Documents/capstone/sotopia-ft-data/ft-data-gpt4-gpt4-easy-2-side-partial"
+if len(sys.argv) > 1:
+    sotopia_data_dir = sys.argv[1]
+
+ft_data_list = []
+# os.listdir() order is arbitrary; sort so the generated ids are reproducible.
+for file in sorted(os.listdir(sotopia_data_dir)):
+    path = os.path.join(sotopia_data_dir, file)
+    # Skip hidden files (e.g. .DS_Store) and subdirectories: not samples.
+    if file.startswith(".") or not os.path.isfile(path):
+        continue
+    with open(path, "r", encoding="utf-8") as f:
+        file_dict = json.load(f)
+    ft_data_list.append({
+        "id": f"identity_{len(ft_data_list)}",
+        "conversations": [
+            {"from": "human", "value": file_dict["prompt"]},
+            {"from": "gpt", "value": file_dict["result"]},
+        ],
+    })
+
+# NOTE: "gp4" in the output name looks like a typo for "gpt4"; it is kept
+# unchanged so that existing consumers of this path keep working.
+with open("fastchat-ft-gp4-gpt4-easy-2-side-partial.json", "w", encoding="utf-8") as f:
+    json.dump(ft_data_list, f, indent=4)
diff --git a/data_process/data/together_data/together_data_preprocess.py b/data_process/data/together_data/together_data_preprocess.py
new file mode 100644
index 00000000..c5df559f
--- /dev/null
+++ b/data_process/data/together_data/together_data_preprocess.py
@@ -0,0 +1,31 @@
+"""Convert Sotopia fine-tuning samples into a JSONL file of {"text": ...} rows.
+
+Each input file must be a JSON object with "prompt" and "result" keys; the
+two are joined with a single space and written as one JSON object per line.
+
+Usage: python together_data_preprocess.py [SOTOPIA_DATA_DIR]
+"""
+import json
+import os
+import sys
+
+# Default input directory; pass another one as the first CLI argument.
+sotopia_data_dir = "ft-data-gpt4-gpt4-easy-2-side-partial/"
+if len(sys.argv) > 1:
+    sotopia_data_dir = sys.argv[1]
+
+ft_data_list = []
+# os.listdir() order is arbitrary; sort so the output order is reproducible.
+# The original author annotated the open() call with "2510" (presumably the
+# number of samples at the time -- TODO confirm).
+for file in sorted(os.listdir(sotopia_data_dir)):
+    path = os.path.join(sotopia_data_dir, file)
+    # Skip hidden files (e.g. .DS_Store) and subdirectories: not samples.
+    if file.startswith(".") or not os.path.isfile(path):
+        continue
+    with open(path, "r", encoding="utf-8") as f:
+        file_dict = json.load(f)
+    ft_data_list.append(file_dict["prompt"] + " " + file_dict["result"])
+
+with open("human-bot-train-gpt4-gpt4-easy-2-side-partial.jsonl", "w", encoding="utf-8") as f:
+    for data in ft_data_list:
+        f.write(json.dumps({"text": data}) + "\n")