forked from nomic-ai/gpt4all
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
74 lines (61 loc) · 2.17 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np
import glob
import os
import json
import jsonlines
import pandas as pd
prompt_generation_dir = "raw_data_sanity_cleaned_without_p3/"

# Only these keys survive cleaning; every other key (e.g. 'model_settings')
# is dropped from each record.
KEPT_KEYS = ("source", "prompt", "response")


def load_jsonl(path):
    """Read a JSON-lines file and return ``(records, skipped)``.

    ``records`` is the list of decoded JSON values, one per non-blank line.
    ``skipped`` counts the non-blank lines that were not valid JSON; such
    lines are dropped rather than aborting the whole file.
    """
    records = []
    skipped = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated. KeyboardInterrupt and
                # other unrelated errors must still propagate.
                skipped += 1
    return records, skipped


def unwrap_text(value):
    """Return the text held by a prompt/response field, or None if unusable.

    A string is returned unchanged. A dict is unwrapped through its "value"
    key, or through "description" when "value" is absent. Anything that does
    not end up as a string yields None.
    """
    if isinstance(value, dict):
        if "value" in value:
            value = value["value"]
        elif "description" in value:
            value = value["description"]
        else:
            return None
    # Unwrapped values are type-checked too, so the later ``.str`` filtering
    # only ever sees real strings.
    return value if isinstance(value, str) else None


def normalize_record(item):
    """Reduce one decoded JSON value to a source/prompt/response dict.

    Returns a new dict holding only the keys in ``KEPT_KEYS`` (in their
    original order, with a missing 'source' appended as 'unspecified'), or
    None when the value is not a dict, lacks 'prompt' or 'response', or
    either field cannot be unwrapped to a string. The input is not mutated.
    """
    if not isinstance(item, dict):
        return None
    if "prompt" not in item or "response" not in item:
        return None

    prompt = unwrap_text(item["prompt"])
    response = unwrap_text(item["response"])
    if prompt is None or response is None:
        return None

    record = {key: value for key, value in item.items() if key in KEPT_KEYS}
    record["prompt"] = prompt
    record["response"] = response
    record.setdefault("source", "unspecified")
    return record


def clean_file(file):
    """Clean one .jsonl file and write the result next to it.

    Records are normalized with ``normalize_record``; rows with a null or
    empty prompt/response, or a prompt of length 1 or less, are removed.
    The survivors are written as JSON lines to ``<name>_clean.jsonl``.

    Returns the path written, or None when the file holds no usable record
    (in which case nothing is written).
    """
    print(file)
    data, skipped = load_jsonl(file)
    if skipped:
        print(f"Skipped {skipped} lines that are not valid JSON")

    processed = [
        record for record in map(normalize_record, data) if record is not None
    ]
    if not processed:
        # A DataFrame built from an empty list has no columns, so the
        # column-based filters below would raise KeyError.
        print(f"No usable records in {file}; nothing written")
        return None

    df = pd.DataFrame(processed)
    prev_len = len(df)

    # drop empty or null string
    df = df.dropna(subset=["prompt", "response"])
    df = df[df["prompt"] != ""]
    df = df[df["response"] != ""]
    df = df[df["prompt"].str.len() > 1]
    curr_len = len(df)

    print(f"Removed {prev_len - curr_len} rows")

    # splitext strips only the final extension, so a ".jsonl" appearing
    # earlier in the path cannot truncate the output name.
    clean_name = os.path.splitext(file)[0] + "_clean.jsonl"
    print(f"writing to {curr_len} rows to {clean_name}")
    df.to_json(clean_name, orient="records", lines=True)
    return clean_name


for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
    # Output files are written into the same directory as their inputs;
    # skip them so a re-run does not clean already-cleaned files.
    if "clean.jsonl" in file:
        continue
    clean_file(file)