
Commit

Feature: rescale dataset to the target resolution and different zoom levels
IvanKuchin committed Jul 23, 2024
1 parent 52beeb1 commit 2c4fbc1
Showing 3 changed files with 119 additions and 68 deletions.
77 changes: 77 additions & 0 deletions dataset/borders.py
@@ -0,0 +1,77 @@
import tensorflow as tf
import time
import os
import sys
import inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from tools import resize_3d
import config as config


DEBUG_DATALOADER = True
DEBUG_DATA_LOADING_PERFORMANCE = False

# cut out and resize a 3-d tensor with shape [w,h,d]
# 1) crop from top_left to bottom_right + 1, so that both
#    top_left and bottom_right are kept in the result
# 2) resize the crop from step (1) to the final shape
#    (the final shape is taken from the config)
def cutout_and_resize_tensor(tensor, top_left, bottom_right):
    # assert tensor.ndim == 3
    t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1]
    t = tf.squeeze(t)
    # assert tf.rank(t) == 3
    final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z])
    t = resize_3d.resize_3d_image(t, final_shape)
    return t
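# for example, with top_left = [0, 0, 0] and bottom_right = [9, 9, 9]
# the crop above keeps tensor[0:10, 0:10, 0:10], i.e. a 10x10x10 block
# in which both corner voxels survive, before the resize to the config shape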


# cut out and resize a 3-d tensor with shape [w,h,d]
# crop from top_left to bottom_right + 1
# the percentages define how much of the margin between the volume
# border and the pancreas bounding box is cut away on each side
#
# for example, if the volume spans [0,0,0] to [100,100,100]
# and the pancreas occupies [10,10,10] to [90,90,90],
# then with all percentages set to 0.1 (which is 10%)
# the cut will be from [1,1,1] to [99,99,99]
def cut_and_resize_including_pancreas(data, label, top_left_percentage, bottom_right_percentage):

    start_prep = time.time()
    top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0)
    bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0)
    # random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32))
    top_left_offset = top_left_percentage * tf.cast(top_left_label_position, dtype=tf.float32)
    top_left_offset = tf.cast(top_left_offset, dtype = tf.int32)
    # random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32))
    bottom_right_offset = tf.cast(tf.shape(data), dtype=tf.float32) - bottom_right_percentage * tf.cast(tf.shape(data) - tf.cast(bottom_right_label_position, dtype=tf.int32), dtype=tf.float32)
    bottom_right_offset = tf.cast(bottom_right_offset, dtype = tf.int32)
    finish_prep = time.time()

    if DEBUG_DATALOADER:
        print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy())
        print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy())
        print(f"\toriginal shape: {label.shape}")
        print(f"\tpercentages: top_left: {top_left_percentage:.2f}, bottom_right: {bottom_right_percentage:.2f}")
        print("\toffset_top_left:", top_left_offset.numpy(), "bottom_right_offset:", bottom_right_offset.numpy())
        print("\tslice shape:", (bottom_right_offset - top_left_offset + 1).numpy())

    start_data = time.time()
    _data = cutout_and_resize_tensor(data, top_left_offset, bottom_right_offset)
    finish_data = time.time()

    start_label = time.time()
    _label = cutout_and_resize_tensor(label, top_left_offset, bottom_right_offset)
    finish_label = time.time()

    if DEBUG_DATA_LOADING_PERFORMANCE:
        print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}")

    if DEBUG_DATALOADER:
        print("\t_data shape:", _data.shape, "_label shape:", _label.shape)

    return _data, _label
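
# A minimal usage sketch (toy tensors; the real pipeline feeds CT volumes
# and pancreas masks loaded by the dataset code):
#
#     import tensorflow as tf
#     import borders
#
#     data = tf.random.uniform(shape=[64, 64, 64])
#     label = tf.tensor_scatter_nd_update(
#         tf.zeros([64, 64, 64], dtype=tf.int32), [[32, 32, 32]], [1])
#
#     # remove 50% of the margin between the volume border and the
#     # labeled bounding box on each side, then resize to the target shape
#     data_zoomed, label_zoomed = borders.cut_and_resize_including_pancreas(
#         data, label, 0.5, 0.5)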

53 changes: 3 additions & 50 deletions dataset/craft_datasets.py
@@ -2,6 +2,7 @@
import time
import tensorflow as tf
import numpy as np
import borders
import os
import sys
import inspect
@@ -15,7 +16,7 @@


DEBUG_DATALOADER = False
DEBUG_DATA_LOADING_PERFORMANCE = False
DEBUG_DATA_LOADING_PERFORMANCE = True

def fname_to_patientid(fname_src:str):
    if DEBUG_DATALOADER:
@@ -47,54 +48,6 @@ def read_data_and_label(patient_id:str, src_folder:str):
    return data_array, label_array


# cutout and resize 3-d tensor with shape [w,h,d]
# 1) cut overheads off from top_left to bottom_right + 1
# top_left and bottom_right will be present in the final shape
# 2) resize shape from step(1) to final shape
# final shape taken from the config
def cutout_and_resize_tensor(tensor, top_left, bottom_right):
    # assert tensor.ndim == 3
    t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1]
    t = tf.squeeze(t)
    # assert tf.rank(t) == 3
    final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z])
    t = resize_3d.resize_3d_image(t, final_shape)
    return t


def random_slice_including_pancreas(data, label):

    start_prep = time.time()
    top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0)
    bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0)
    random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32))
    random_offset_top_left = tf.cast(random_offset_top_left, dtype = tf.int32)
    random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32))
    random_offset_bottom_right = tf.cast(random_offset_bottom_right, dtype = tf.int32)
    finish_prep = time.time()

    if DEBUG_DATALOADER:
        print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy())
        print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy())
        print("\trandom_offset_top_left:", random_offset_top_left.numpy(), "random_offset_bottom_right:", random_offset_bottom_right.numpy())
        print("\tslice shape:", (random_offset_bottom_right - random_offset_top_left + 1).numpy())

    start_data = time.time()
    _data = cutout_and_resize_tensor(data, random_offset_top_left, random_offset_bottom_right)
    finish_data = time.time()

    start_label = time.time()
    _label = cutout_and_resize_tensor(label, random_offset_top_left, random_offset_bottom_right)
    finish_label = time.time()

    if DEBUG_DATA_LOADING_PERFORMANCE:
        print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}")

    if DEBUG_DATALOADER:
        print("\t_data shape:", _data.shape, "_label shape:", _label.shape)

    return _data, _label

class FileIterator:
    def __init__(self, folder):
        self.folder = folder
@@ -127,7 +80,7 @@ def __call__(self):
        finish_reading = time.time()

        start_resize = time.time()
        data, label = random_slice_including_pancreas(data, label)
        # data, label = borders.cut_and_resize_including_pancreas(data, label, np.random.rand(), np.random.rand())
        finish_resize = time.time()

        if DEBUG_DATA_LOADING_PERFORMANCE:
57 changes: 39 additions & 18 deletions dataset/pomc_dataset.py
@@ -8,6 +8,7 @@
import numpy as np
import pydicom
import nrrd
import borders

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
@@ -26,6 +27,15 @@

DEBUG = True

# Stored Values (SV) are the values stored in the image pixel data attribute.
# The rescaled value is calculated as:
# Rescaled value = SV * Rescale Slope + Rescale Intercept
# https://dicom.innolitics.com/ciods/digital-x-ray-image/dx-image/00281052
def dcim_slice_stored_value_to_rescaled_value(slice):
    rescale_intercept = slice.RescaleIntercept if hasattr(slice, "RescaleIntercept") else 0
    rescale_slope = slice.RescaleSlope if hasattr(slice, "RescaleSlope") else 1
    return slice.pixel_array * rescale_slope + rescale_intercept
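# e.g. a typical CT slice (illustrative values, not taken from this dataset)
# stores pixels with RescaleSlope = 1 and RescaleIntercept = -1024, so a
# stored value of 1024 rescales to 1024 * 1 + (-1024) = 0 HU (water) and
# a stored value of 24 rescales to -1000 HU (air)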

class POMCDataset:
    def __init__(self, patients_src_folder, labels_src_folder, TFRECORD_FOLDER):
        self.patients_src_folder = patients_src_folder
@@ -60,7 +70,8 @@ def read_dicom_data_from_files(files):
        ]

        if len(slices):
            result = np.stack([_.pixel_array for _ in slices], axis = -1)
            result = np.stack([dcim_slice_stored_value_to_rescaled_value(_) for _ in slices], axis = -1)
            # result = np.stack([_.pixel_array for _ in slices], axis = -1)
        else:
            print("ERROR: can't dcmread from files:", files)

@@ -146,6 +157,10 @@ def consistency_check(self, data, label, data_metadata, label_metadata):
print("ERROR: label space origin(", label_metadata["space origin"], ") is not close to the data first slice origin(", data_metadata["min"], ")")
return False

if np.mean(data) > 1000:
print("ERROR: data mean(", np.mean(data), ") is too high. Probably probleam with reading DCIM data")
return False

##############################
# Thos is not relevant check #
##############################
@@ -161,6 +176,7 @@ def print_statistic(self, tensor_old, tensor_new):
print("\tmin/max:\t{}/{} -> {}/{}".format(np.min(tensor_old), np.max(tensor_old), np.min(tensor_new),
np.max(tensor_new)))
print("\tmean std:\t{:.3f} {:.3f} -> {:.3f} {:.3f}".format(np.mean(tensor_old), np.std(tensor_old), np.mean(tensor_new), np.std(tensor_new)))
print("\tsum:\t\t{} -> {}".format(np.sum(tensor_old), np.sum(tensor_new)))

    def preprocess_data(self, data, label):
        # zoom = AUGMENT_SCALED_DIMS / data.shape
@@ -244,15 +260,15 @@ def sanity_check_after_preprocessing(self, data, label):

        return result

    def save_npy(self, subfolder: str, patient_id:str, original_data, original_label, scaled_data, scaled_label):
    def save_npy(self, subfolder: str, patient_id:str, percentage: int, original_data, original_label, scaled_data, scaled_label):
        result = True
        scaled_data = np.cast[np.float32](scaled_data)
        scaled_label = np.cast[np.int8](scaled_label)
        np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + ".npz"), [scaled_data, scaled_label])
        np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + f"_{percentage}.npz"), [scaled_data, scaled_label])

        return result

    def pickle_src_data(self, ratio=0.2):
    def pickle_src_data(self, train_valid_percentage=0.2):
        if not os.path.exists(self.TFRECORD_FOLDER):
            print("ERROR: can't find TFRecord folder:", self.TFRECORD_FOLDER)
            return
@@ -264,7 +280,7 @@ def pickle_src_data(self, ratio=0.2):
        folder_list = glob.glob(os.path.join(self.patients_src_folder, "*"))

        for folder in folder_list:
            subfolder = "train" if np.random.rand() > ratio else "valid"
            subfolder = "train" if np.random.rand() > train_valid_percentage else "valid"

            patient_id = self.get_patient_id_from_folder(folder)

@@ -293,24 +309,29 @@
print("ERROR: data & labels are not consistent patient_id:", patient_id)
continue

start_ts = time.time()
scaled_src_data, scaled_label_data = self.preprocess_data(src_data, label_data)
print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts))
for percentage in [0, 30, 60, 90]:
print(f"\n\tPreprocess data for {percentage}%")

scaled_data, scaled_label = borders.cut_and_resize_including_pancreas(src_data, label_data, percentage/100, percentage/100)

if DEBUG:
print("\tData")
self.print_statistic(src_data, scaled_src_data)
print("\tLabel")
self.print_statistic(label_data, scaled_label_data)
start_ts = time.time()
scaled_src_data, scaled_label_data = self.preprocess_data(scaled_data.numpy(), scaled_label.numpy())
print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts))


if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False:
print("ERROR: data or label failed sanity check")
continue
if DEBUG:
print("\tData")
self.print_statistic(src_data, scaled_src_data)
print("\tLabel")
self.print_statistic(label_data, scaled_label_data)


if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False:
print("ERROR: data or label failed sanity check")
continue

if self.save_npy(subfolder, patient_id, src_data, label_data, scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save TFRecord patient id:", patient_id)
if self.save_npy(subfolder, patient_id, percentage, src_data, label_data, scaled_src_data, scaled_label_data) == False:
print("ERROR: can't save TFRecord patient id:", patient_id)


def main():
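With the per-percentage loop above, each patient now yields four archives, one per zoom level. A sketch of the resulting layout (the patient id is hypothetical; folder and suffix names come from the code):

    TFRECORD_FOLDER/
        train/              (or valid/, chosen at random per patient)
            P001_0.npz      (0%: no margin removed, whole volume)
            P001_30.npz
            P001_60.npz
            P001_90.npz     (90%: most of the margin removed, tightest zoom on the pancreas)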
