-
Notifications
You must be signed in to change notification settings - Fork 0
/
setup_chexpert.py
88 lines (63 loc) · 3.15 KB
/
setup_chexpert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import glob
import os
import shutil

from PIL import Image
from tqdm import tqdm

from source.constants import CHEXPERT_PATH
"""
The following directory uses the contents of
https://stanfordaimi.azurewebsites.net/datasets/8cbd9ed4-2eb9-4565-affc-111cf4f7ebe2
which contains the train and val sets with labels.
Furthermore, the test set with labels is obtained through
https://stanfordaimi.azurewebsites.net/datasets/23c56a0d-15de-405b-87c8-99c30138950c.
Finally, the CHEXPERT_DEMO file is obtained from
https://stanfordaimi.azurewebsites.net/datasets/192ada7c-4d43-466e-b8bb-b81992bb80cf.
All downloads require registration at Stanford AIMI.
The total download size is about 500 GB, and more than a terabyte after decompression.
Downsizing as done here substantially reduces the size.
A little bit of moving things around was necessary.
I moved / merged all folders I obtained after decompression into 'val' and 'train' respectively.
"""
#! This needs to be modified for the actual path
SOURCE_PATH = "/system/user/publicdata/chexpertchestxrays-u20210408"
#! = True if need to move images to local SSD
move_images = True

# make sure directory to copy data to exists
os.makedirs(CHEXPERT_PATH, exist_ok=True)

########################
# Cleaning annotations #
########################
# Maps each file name under SOURCE_PATH to the name it gets under CHEXPERT_PATH.
# The metadata sheet and the valid/test labels keep their names. For train there
# are three different label options; we use the latest (best) one, i.e. the
# visualCheXbert labels, and store them as plain "train.csv".
ANNOTATION_FILES = {
    "CHEXPERT_DEMO.xlsx": "CHEXPERT_DEMO.xlsx",
    "train_visualCheXbert.csv": "train.csv",
    "valid.csv": "valid.csv",
    "test.csv": "test.csv",
}

# shutil.copy is used instead of shelling out to `cp`: paths containing spaces
# or shell metacharacters are handled correctly, and a missing source file
# raises an exception instead of being silently ignored.
for source_name, target_name in ANNOTATION_FILES.items():
    shutil.copy(
        os.path.join(SOURCE_PATH, source_name),
        os.path.join(CHEXPERT_PATH, target_name),
    )
#########################
# Moving images to disk #
#########################
# (width, height) every image is resized to before being written to CHEXPERT_PATH.
# Note that resizing to a fixed square does not preserve the aspect ratio.
IMAGE_SIZE = (224, 224)

if move_images:
    for split in ["train", "valid", "test"]:
        target_dir = os.path.join(CHEXPERT_PATH, split)
        source_dir = os.path.join(SOURCE_PATH, split)

        # Create the target directory if it doesn't exist
        os.makedirs(target_dir, exist_ok=True)

        print(f"searching for images in the {split} source directory...")
        # Recursively collect all .jpg files below the split's source directory
        image_files = glob.glob(os.path.join(source_dir, "**", "*.jpg"), recursive=True)
        print("done")

        # Copy and resize each image file to the target directory
        for image_file in tqdm(image_files):
            # Mirror the source directory layout below the target directory
            relative_path = os.path.relpath(image_file, source_dir)
            target_path = os.path.join(target_dir, relative_path)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)

            # The context manager closes the source file right away, so file
            # handles are not left open while iterating over many images.
            with Image.open(image_file) as image:
                image.resize(IMAGE_SIZE).save(target_path)

    # Only report completion when images were actually processed
    print("Copying and resizing images complete.")