# dataset_stat.py -- forked from rycolab/probing-via-prompting
# -*- coding: utf-8 -*-
import json
import os
from run_pp import DataTrainingArguments
# Configure the data directory and task whose split files will be counted.
data_args = DataTrainingArguments()
data_args.data_dir = "./ontonotes/pp/"
data_args.task = "ner"

# Map each split name to its file under <data_dir>/<task>/.
data_files = {
    split: os.path.join(data_args.data_dir, data_args.task, file_name)
    for split, file_name in (
        ("train", 'train.json'),
        ("validation", 'development.json'),
        ("test", 'test.json'),
    )
}
def count_data_in_json(file_path):
    """Count the lines of a file that parse as valid JSON.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        The number of lines accepted by ``json.loads``.

    Lines that fail to decode are reported on stdout and excluded from the
    count. Whitespace-only lines are skipped silently, since they are not
    records and would otherwise be reported as decode errors.
    """
    count = 0
    # Explicit UTF-8 keeps the result independent of the platform's locale
    # encoding; iterating the handle avoids loading the whole file at once.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                json.loads(line)
            except json.JSONDecodeError:
                print(f"Error decoding line: {line}")
                continue
            count += 1
    return count
# Count the decodable records in each split and accumulate the grand total.
counts = {}
total = 0
for key, file_path in data_files.items():
    counts[key] = count_data_in_json(file_path)
    total += counts[key]

# Express each split as a percentage of the total. When every file is empty
# (or nothing decoded) the total is 0; report 0.0 instead of raising
# ZeroDivisionError.
percentages = {}
for key, count in counts.items():
    percentages[key] = (count / total) * 100 if total else 0.0

print("Counts:", counts)
print("Percentages:", percentages)
# raw_datasets: DatasetDict = load_dataset("json", data_files=data_files)
# raw_train: Dataset = raw_datasets["train"]
# raw_dev: Dataset = raw_datasets["validation"]
# raw_test: Dataset = raw_datasets["test"]
# token = "<|endoftext|>"
#
# labels = []
# for ex in raw_dev['text']:
#     idx = ex.find(token) + len(token)
#     labels.append(ex[idx:])
#
# labels_cnt = Counter(labels)
# labels_cnt_values = sorted(list(labels_cnt.values()), reverse=True)
#
# print(labels_cnt)
# print(labels_cnt_values)
# print("Mode pct: ", labels_cnt_values[0] / len(labels))