"""Build a fine-tuning dataset from prompt/result JSON files.

Provenance: added as ``together_ai_ft/prompt_res_data_process.py`` in commit
c743430a2f3c15fc3e38be15b2427a8575a07aea ("adding py (#13)").

Each input file is expected to be a JSON object with a ``"prompt"`` and a
``"result"`` key.  Every usable file becomes one ``{"text": ...}`` record
rendered through ``TGTAI_FORMAT``.
"""
import json
import os
import re
import ast

data_dir = "data-pair-store/"  # change this to the directory of parse chat data

# Every record produced by run_processing() since import.  Kept at module
# level for backward compatibility with code that reads it directly.
full_data = []

# NOTE(review): the trailing space is preserved exactly as found; confirm
# whether the trainer also expects BOS/EOS markers around the sample.
TGTAI_FORMAT = """[INST] {user_msg} [/INST] {model_answer} """


def run_processing(data_dir, output_path="full-data.jsonl"):
    """Convert every prompt/result JSON file in *data_dir* into one JSONL file.

    Args:
        data_dir: Directory whose entries are read as JSON objects holding
            the keys ``"prompt"`` and ``"result"``.
        output_path: Destination file.  One JSON object per line is written
            (JSON Lines), matching the ``.jsonl`` extension of the default.

    Returns:
        The list of ``{"text": ...}`` records written by this call.

    Side effects:
        Prints the number of directory entries and the number of unusable
        entries, overwrites *output_path*, and appends this call's records
        to the module-level ``full_data`` list.

    Raises:
        OSError: If *data_dir* cannot be listed or *output_path* cannot be
            written.  Problems with individual input files are counted as
            unusable instead of being raised.
    """
    # os.listdir() order is arbitrary; sort so the output is reproducible.
    file_list = sorted(os.listdir(data_dir))
    print(len(file_list))

    # Collect locally so a repeated call does not re-emit earlier records.
    records = []
    unusable = 0
    for data_file in file_list:
        try:
            with open(os.path.join(data_dir, data_file), "r", encoding="utf-8") as f:
                dic = json.load(f)
            format_str = TGTAI_FORMAT.format(
                user_msg=dic["prompt"], model_answer=dic["result"]
            )
        except (OSError, ValueError, KeyError, TypeError):
            # OSError: unreadable entry (e.g. a sub-directory).
            # ValueError: invalid JSON or undecodable bytes.
            # KeyError / TypeError: JSON without the expected object shape.
            unusable += 1
        else:
            records.append({"text": format_str})

    print(unusable)

    # JSON Lines: exactly one compact JSON object per line.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

    full_data.extend(records)
    return records


if __name__ == "__main__":
    run_processing(data_dir)