-
Notifications
You must be signed in to change notification settings - Fork 0
/
utk_dataset_builder.py
87 lines (62 loc) · 2.3 KB
/
utk_dataset_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#%%
import os
import shutil
import torch
from torchvision import transforms
from torch.utils.data import random_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from PIL import Image
import pytorch_lightning as pl
#%% CONSTANTS
# Directory containing the raw UTKFace images (filenames encode the labels).
DATASET_SRC = 'data/UTKFace'
# Destination root for the train/val/test split written by this script.
UTK_DIST = 'data/utk_races'
# Split proportions; validated below instead of relying on the comment alone.
TRAIN_RATIO = 0.6
VAL_RATIO = 0.2
TEST_RATIO = 0.2
# Enforce the "must sum to 1.0" contract up front with a clear error,
# rather than letting random_split fail later with a less obvious message.
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError('TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1.0')
SEED = 101
# Seed python, numpy and torch RNGs so the random_split below is reproducible.
pl.seed_everything(SEED)
class UTKFacesDataset(Dataset):
    """UTKFace images labelled by race.

    UTKFace filenames follow ``age_gender_race_datetime.ext``; this dataset
    parses the race field and yields ``(PIL.Image, race, image_path)`` per item.
    """

    def __init__(self, folder_path):
        """Index every file in *folder_path* as one sample.

        folder_path: directory of UTKFace images named ``age_gender_race_*``.
        """
        self.folder_path = folder_path
        # Sort the listing: os.listdir order is filesystem-dependent, which
        # would make the seeded train/val/test split non-reproducible
        # across machines despite pl.seed_everything().
        self.image_list = sorted(os.listdir(folder_path))
        # Race labels this project keeps; label 4 ("others"/unknown) is
        # filtered out downstream.
        self.classes = [0, 1, 2, 3]

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        """Return ``(image, race, image_path)`` for sample *idx*.

        Raises ValueError if the filename does not have exactly four
        underscore-separated fields (some UTKFace files are malformed).
        """
        image_name = self.image_list[idx]
        image_path = os.path.join(self.folder_path, image_name)
        image = Image.open(image_path).convert('RGB')
        # Labels are encoded in the filename: age_gender_race_datetime.ext
        _age, _gender, race, _ = image_name.split('_')
        race = int(race)
        return image, race, image_path
# Load UTKFace and build seeded train/val/test splits.
dataset = UTKFacesDataset(DATASET_SRC)
# Exclude race label 4, which is the unknown/"others" ethnicity. Read the
# label straight from the filename (field index 2 of age_gender_race_datetime)
# rather than via dataset[idx], which would decode every image with PIL just
# to inspect its label — same indices, dramatically faster.
included_labels = [
    idx for idx, name in enumerate(dataset.image_list)
    if int(name.split('_')[2]) != 4
]
dataset = torch.utils.data.Subset(dataset, included_labels)
print('Number of instances:', len(included_labels))
# Per-split sizes; the remainder goes to test so the three always add up to
# len(dataset) despite int() truncation of the ratio products.
train_size = int(TRAIN_RATIO * len(dataset))
val_size = int(VAL_RATIO * len(dataset))
test_size = len(dataset) - train_size - val_size
# Split the dataset into train, validation, and test sets (uses the torch
# RNG seeded above, so the assignment is reproducible).
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
#%% Copy each image into <UTK_DIST>/<split>/<race>/ so the output can be
# consumed by ImageFolder-style loaders.
os.makedirs(UTK_DIST, exist_ok=True)
for ds, split in zip([train_dataset, val_dataset, test_dataset], ['train', 'val', 'test']):
    # Hoist the split directory out of the inner loop — it is loop-invariant.
    split_dir = os.path.join(UTK_DIST, split)
    os.makedirs(split_dir, exist_ok=True)
    for i in range(len(ds)):
        # Idiomatic indexing (ds[i]) instead of calling __getitem__ directly.
        # NOTE: indexing also decodes the image; only label and path are used.
        _, label, img_path = ds[i]
        dist = os.path.join(split_dir, str(label))
        os.makedirs(dist, exist_ok=True)
        shutil.copy(img_path, dist)