
Commit

Feature: rescale dataset to the target resolution and different zoom levels
IvanKuchin committed Jul 23, 2024
1 parent 52beeb1 commit 2c4fbc1
Showing 3 changed files with 119 additions and 68 deletions.
77 changes: 77 additions & 0 deletions dataset/borders.py
@@ -0,0 +1,77 @@
import tensorflow as tf
import time
import os
import sys
import inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from tools import resize_3d
import config as config


DEBUG_DATALOADER = True
DEBUG_DATA_LOADING_PERFORMANCE = False

# cut out and resize a 3-d tensor with shape [w,h,d]
# 1) crop from top_left to bottom_right + 1, so that both
#    top_left and bottom_right are kept in the result
# 2) resize the crop from step (1) to the final shape
#    (the final shape is taken from the config)
def cutout_and_resize_tensor(tensor, top_left, bottom_right):
    # assert tensor.ndim == 3
    t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1]
    t = tf.squeeze(t)
    # assert tf.rank(t) == 3
    final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z])
    t = resize_3d.resize_3d_image(t, final_shape)
    return t
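# for example, with top_left = [0, 0, 0] and bottom_right = [9, 9, 9]
# the crop above keeps tensor[0:10, 0:10, 0:10], i.e. a 10x10x10 block
# in which both corner voxels survive, before the resize to the config shape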


# cut out and resize a 3-d tensor with shape [w,h,d]
# crop from top_left to bottom_right + 1
# the percentages define how much of the margin between the volume
# border and the pancreas bounding box is cut away on each side
#
# for example, if the volume spans [0,0,0] to [100,100,100]
# and the pancreas occupies [10,10,10] to [90,90,90],
# then with all percentages set to 0.1 (which is 10%)
# the cut will be from [1,1,1] to [99,99,99]
def cut_and_resize_including_pancreas(data, label, top_left_percentage, bottom_right_percentage):

    start_prep = time.time()
    top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0)
    bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0)
    # random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32))
    top_left_offset = top_left_percentage * tf.cast(top_left_label_position, dtype=tf.float32)
    top_left_offset = tf.cast(top_left_offset, dtype = tf.int32)
    # random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32))
    bottom_right_offset = tf.cast(tf.shape(data), dtype=tf.float32) - bottom_right_percentage * tf.cast(tf.shape(data) - tf.cast(bottom_right_label_position, dtype=tf.int32), dtype=tf.float32)
    bottom_right_offset = tf.cast(bottom_right_offset, dtype = tf.int32)
    finish_prep = time.time()

    if DEBUG_DATALOADER:
        print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy())
        print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy())
        print(f"\toriginal shape: {label.shape}")
        print(f"\tpercentages: top_left: {top_left_percentage:.2f}, bottom_right: {bottom_right_percentage:.2f}")
        print("\toffset_top_left:", top_left_offset.numpy(), "bottom_right_offset:", bottom_right_offset.numpy())
        print("\tslice shape:", (bottom_right_offset - top_left_offset + 1).numpy())

    start_data = time.time()
    _data = cutout_and_resize_tensor(data, top_left_offset, bottom_right_offset)
    finish_data = time.time()

    start_label = time.time()
    _label = cutout_and_resize_tensor(label, top_left_offset, bottom_right_offset)
    finish_label = time.time()

    if DEBUG_DATA_LOADING_PERFORMANCE:
        print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}")

    if DEBUG_DATALOADER:
        print("\t_data shape:", _data.shape, "_label shape:", _label.shape)

    return _data, _label
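
# A minimal usage sketch (toy tensors; the real pipeline feeds CT volumes
# and pancreas masks loaded by the dataset code):
#
#     import tensorflow as tf
#     import borders
#
#     data = tf.random.uniform(shape=[64, 64, 64])
#     label = tf.tensor_scatter_nd_update(
#         tf.zeros([64, 64, 64], dtype=tf.int32), [[32, 32, 32]], [1])
#
#     # remove 50% of the margin between the volume border and the
#     # labeled bounding box on each side, then resize to the target shape
#     data_zoomed, label_zoomed = borders.cut_and_resize_including_pancreas(
#         data, label, 0.5, 0.5)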

53 changes: 3 additions & 50 deletions dataset/craft_datasets.py
@@ -2,6 +2,7 @@
import time
import tensorflow as tf
import numpy as np
import borders
import os
import sys
import inspect
@@ -15,7 +16,7 @@


DEBUG_DATALOADER = False
DEBUG_DATA_LOADING_PERFORMANCE = False
DEBUG_DATA_LOADING_PERFORMANCE = True

def fname_to_patientid(fname_src:str):
    if DEBUG_DATALOADER:
@@ -47,54 +48,6 @@ def read_data_and_label(patient_id:str, src_folder:str):
    return data_array, label_array


# cutout and resize 3-d tensor with shape [w,h,d]
# 1) cut overheads off from top_left to bottom_right + 1
# top_left and bottom_right will be present in the final shape
# 2) resize shape from step(1) to final shape
# final shape taken from the config
def cutout_and_resize_tensor(tensor, top_left, bottom_right):
    # assert tensor.ndim == 3
    t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1]
    t = tf.squeeze(t)
    # assert tf.rank(t) == 3
    final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z])
    t = resize_3d.resize_3d_image(t, final_shape)
    return t


def random_slice_including_pancreas(data, label):

    start_prep = time.time()
    top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0)
    bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0)
    random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32))
    random_offset_top_left = tf.cast(random_offset_top_left, dtype = tf.int32)
    random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32))
    random_offset_bottom_right = tf.cast(random_offset_bottom_right, dtype = tf.int32)
    finish_prep = time.time()

    if DEBUG_DATALOADER:
        print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy())
        print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy())
        print("\trandom_offset_top_left:", random_offset_top_left.numpy(), "random_offset_bottom_right:", random_offset_bottom_right.numpy())
        print("\tslice shape:", (random_offset_bottom_right - random_offset_top_left + 1).numpy())

    start_data = time.time()
    _data = cutout_and_resize_tensor(data, random_offset_top_left, random_offset_bottom_right)
    finish_data = time.time()

    start_label = time.time()
    _label = cutout_and_resize_tensor(label, random_offset_top_left, random_offset_bottom_right)
    finish_label = time.time()

    if DEBUG_DATA_LOADING_PERFORMANCE:
        print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}")

    if DEBUG_DATALOADER:
        print("\t_data shape:", _data.shape, "_label shape:", _label.shape)

    return _data, _label

class FileIterator:
    def __init__(self, folder):
        self.folder = folder
@@ -127,7 +80,7 @@ def __call__(self):
        finish_reading = time.time()

        start_resize = time.time()
        data, label = random_slice_including_pancreas(data, label)
        # data, label = borders.cut_and_resize_including_pancreas(data, label, np.random.rand(), np.random.rand())
        finish_resize = time.time()

        if DEBUG_DATA_LOADING_PERFORMANCE:
57 changes: 39 additions & 18 deletions dataset/pomc_dataset.py
@@ -8,6 +8,7 @@
import numpy as np
import pydicom
import nrrd
import borders

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
@@ -26,6 +27,15 @@

DEBUG = True

# Stored Values (SV) are the values stored in the image pixel data attribute.
# The rescaled value is calculated as:
# Rescaled value = SV * Rescale Slope + Rescale Intercept
# https://dicom.innolitics.com/ciods/digital-x-ray-image/dx-image/00281052
def dcim_slice_stored_value_to_rescaled_value(slice):
    rescale_intercept = slice.RescaleIntercept if hasattr(slice, "RescaleIntercept") else 0
    rescale_slope = slice.RescaleSlope if hasattr(slice, "RescaleSlope") else 1
    return slice.pixel_array * rescale_slope + rescale_intercept
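# e.g. a typical CT slice (illustrative values, not taken from this dataset)
# stores pixels with RescaleSlope = 1 and RescaleIntercept = -1024, so a
# stored value of 1024 rescales to 1024 * 1 + (-1024) = 0 HU (water) and
# a stored value of 24 rescales to -1000 HU (air)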

class POMCDataset:
    def __init__(self, patients_src_folder, labels_src_folder, TFRECORD_FOLDER):
        self.patients_src_folder = patients_src_folder
@@ -60,7 +70,8 @@ def read_dicom_data_from_files(files):
        ]

        if len(slices):
            result = np.stack([_.pixel_array for _ in slices], axis = -1)
            result = np.stack([dcim_slice_stored_value_to_rescaled_value(_) for _ in slices], axis = -1)
            # result = np.stack([_.pixel_array for _ in slices], axis = -1)
        else:
            print("ERROR: can't dcmread from files:", files)

@@ -146,6 +157,10 @@ def consistency_check(self, data, label, data_metadata, label_metadata):
print("ERROR: label space origin(", label_metadata["space origin"], ") is not close to the data first slice origin(", data_metadata["min"], ")")
return False

if np.mean(data) > 1000:
print("ERROR: data mean(", np.mean(data), ") is too high. Probably probleam with reading DCIM data")
return False

##############################
# Thos is not relevant check #
##############################
@@ -161,6 +176,7 @@ def print_statistic(self, tensor_old, tensor_new):
print("\tmin/max:\t{}/{} -> {}/{}".format(np.min(tensor_old), np.max(tensor_old), np.min(tensor_new),
np.max(tensor_new)))
print("\tmean std:\t{:.3f} {:.3f} -> {:.3f} {:.3f}".format(np.mean(tensor_old), np.std(tensor_old), np.mean(tensor_new), np.std(tensor_new)))
print("\tsum:\t\t{} -> {}".format(np.sum(tensor_old), np.sum(tensor_new)))

    def preprocess_data(self, data, label):
        # zoom = AUGMENT_SCALED_DIMS / data.shape
@@ -244,15 +260,15 @@ def sanity_check_after_preprocessing(self, data, label):

        return result

    def save_npy(self, subfolder: str, patient_id:str, original_data, original_label, scaled_data, scaled_label):
    def save_npy(self, subfolder: str, patient_id:str, percentage: int, original_data, original_label, scaled_data, scaled_label):
        result = True
        scaled_data = np.cast[np.float32](scaled_data)
        scaled_label = np.cast[np.int8](scaled_label)
        np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + ".npz"), [scaled_data, scaled_label])
        np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + f"_{percentage}.npz"), [scaled_data, scaled_label])

        return result

    def pickle_src_data(self, ratio=0.2):
    def pickle_src_data(self, train_valid_percentage=0.2):
        if not os.path.exists(self.TFRECORD_FOLDER):
            print("ERROR: can't find TFRecord folder:", self.TFRECORD_FOLDER)
            return
@@ -264,7 +280,7 @@ def pickle_src_data(self, ratio=0.2):
        folder_list = glob.glob(os.path.join(self.patients_src_folder, "*"))

        for folder in folder_list:
            subfolder = "train" if np.random.rand() > ratio else "valid"
            subfolder = "train" if np.random.rand() > train_valid_percentage else "valid"

            patient_id = self.get_patient_id_from_folder(folder)

@@ -293,24 +309,29 @@
print("ERROR: data & labels are not consistent patient_id:", patient_id)
continue

start_ts = time.time()
scaled_src_data, scaled_label_data = self.preprocess_data(src_data, label_data)
print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts))
for percentage in [0, 30, 60, 90]:
print(f"\n\tPreprocess data for {percentage}%")

scaled_data, scaled_label = borders.cut_and_resize_including_pancreas(src_data, label_data, percentage/100, percentage/100)

if DEBUG:
print("\tData")
self.print_statistic(src_data, scaled_src_data)
print("\tLabel")
self.print_statistic(label_data, scaled_label_data)
start_ts = time.time()
scaled_src_data, scaled_label_data = self.preprocess_data(scaled_data.numpy(), scaled_label.numpy())
print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts))


if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False:
print("ERROR: data or label failed sanity check")
continue
if DEBUG:
print("\tData")
self.print_statistic(src_data, scaled_src_data)
print("\tLabel")
self.print_statistic(label_data, scaled_label_data)


if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False:
print("ERROR: data or label failed sanity check")
continue

if self.save_npy(subfolder, patient_id, src_data, label_data, scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save TFRecord patient id:", patient_id)
if self.save_npy(subfolder, patient_id, percentage, src_data, label_data, scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save TFRecord patient id:", patient_id)


def main():
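With the per-percentage loop above, each patient now yields four archives, one per zoom level. A sketch of the resulting layout (the patient id is hypothetical; folder and suffix names come from the code):

    TFRECORD_FOLDER/
        train/              (or valid/, chosen at random per patient)
            P001_0.npz      (0%: no margin removed, whole volume)
            P001_30.npz
            P001_60.npz
            P001_90.npz     (90%: most of the margin removed, tightest zoom on the pancreas)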
