Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/avoid overwrite #25

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion src/cautiousrobot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from cautiousrobot.__main__ import download_images
from cautiousrobot.buddy_check import BuddyCheck
from cautiousrobot.utils import downsample_and_save_image


# Create instance of the class
Expand All @@ -9,4 +10,4 @@
buddy_check_instance.validate_download
buddy_check_instance.check_alignment

__all__ = ["download_images", "validate_download", "check_alignment"]
__all__ = ["download_images", "validate_download", "check_alignment", "downsample_and_save_image"]
179 changes: 91 additions & 88 deletions src/cautiousrobot/__main__.py
egrace479 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import time
from PIL import Image
from sumbuddy import get_checksums
from cautiousrobot.utils import log_response, update_log, process_csv
from cautiousrobot.utils import log_response, update_log, process_csv, downsample_and_save_image
from cautiousrobot.buddy_check import BuddyCheck


Expand Down Expand Up @@ -88,99 +88,104 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename =
image_name = data[filename][i]
if subfolders:
image_dir_path = img_dir + "/" + data[subfolders][i]

# get image from url
url = data[file_url][i]
if not url:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = "no url")
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
#download the image
redo = True
max_redos = retry
while redo and max_redos > 0:
try:
response = requests.get(url, stream = True)
except Exception as e:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = str(e))
update_log(log = log_errors, index = i, filepath = error_log_filepath)
continue

if response.status_code == 200:
redo = False
# log status
log_data = log_response(log_data,

if not os.path.exists(image_dir_path + "/" + image_name):

# get image from url
url = data[file_url][i]
if not url:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = response.status_code
)
update_log(log = log_data, index = i, filepath = log_filepath)

#create the appropriate folders if necessary

if os.path.exists(image_dir_path) != True:
os.makedirs(image_dir_path, exist_ok=False)

# save full size image to appropriate folder
with open(f"{image_dir_path}/{image_name}", "wb") as out_file:
shutil.copyfileobj(response.raw, out_file)

if downsample:
downsample_dir_path = downsample_path
if subfolders:
downsample_dir_path = downsample_path + "/" + data[subfolders][i]
if os.path.exists(downsample_dir_path) != True:
os.makedirs(downsample_dir_path, exist_ok=False)
# Downsample & save image
try:
img = Image.open(f"{image_dir_path}/{image_name}")
img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name)
except Exception as e:
print(e)
file_path = url,
response_code = "no url")
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
#download the image
redo = True
max_redos = retry
while redo and max_redos > 0:
try:
response = requests.get(url, stream = True)
except Exception as e:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = "downsized_" + image_name,
url = url,
response_code = str(e))
index = i,
image = image_name,
file_path = url,
response_code = str(e))
update_log(log = log_errors, index = i, filepath = error_log_filepath)

# check for too many requests
elif response.status_code in REDO_CODE_LIST:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)
continue

if response.status_code == 200:
redo = False
# log status
log_data = log_response(log_data,
index = i,
image = image_name,
file_path = url,
response_code = response.status_code
)
update_log(log = log_data, index = i, filepath = log_filepath)

#create the appropriate folders if necessary

if os.path.exists(image_dir_path) != True:
os.makedirs(image_dir_path, exist_ok=False)

# save full size image to appropriate folder
with open(f"{image_dir_path}/{image_name}", "wb") as out_file:
shutil.copyfileobj(response.raw, out_file)

else:
time.sleep(wait)
else: #other fail, eg. 404
redo = False
log_errors = log_response(log_errors,
# check for too many requests
elif response.status_code in REDO_CODE_LIST:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
file_path= url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
time.sleep(wait)
else: #other fail, eg. 404
redo = False
log_errors = log_response(log_errors,
index = i,
image = image_name,
file_path = url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)

del response

if downsample:
# Since we have image resize within a try and log failure, seems reasonable to not check for the image again.
downsample_dir_path = downsample_path
if subfolders:
downsample_dir_path = downsample_path + "/" + data[subfolders][i]
if os.path.exists(downsample_dir_path + "/" + image_name):
# Don't overwrite resized images either
continue

downsample_and_save_image(
image_dir_path=image_dir_path,
image_name=image_name,
downsample_dir_path=downsample_dir_path,
downsample_size=downsample,
log_errors=log_errors,
image_index=i,
file_path=url,
error_log_filepath=error_log_filepath
)


del response

return

Expand Down Expand Up @@ -222,9 +227,7 @@ def main():
# Check for img_dir
img_dir = args.output_dir
if os.path.exists(img_dir):
overwrite = input(f"'{img_dir}' already exists (may impact downsizing too). Overwrite? [y/n]: ")
if overwrite.lower() != "y":
sys.exit("Exited without executing.")
sys.exit(f"'{img_dir}' already exists. Exited without executing.")

# Set location for logs
metadata_path = csv_path.split(".")[0]
Expand Down
41 changes: 39 additions & 2 deletions src/cautiousrobot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

import json
import pandas as pd
import os
from PIL import Image


def log_response(log_data, index, image, url, response_code):
def log_response(log_data, index, image, file_path, response_code):
# log status
log_entry = {}
log_entry["Image"] = image
log_entry["file_url"] = url
log_entry["file_path"] = file_path
log_entry["Response_status"] = str(response_code) #int64 has problems sometimes
log_data[index] = log_entry

Expand Down Expand Up @@ -44,3 +46,38 @@ def process_csv(csv_path, expected_cols):
raise Exception(f"The CSV at {csv_path} is missing column(s): {missing_cols}, defined as {[expected_cols[col] for col in missing_cols]}.")

return df

def downsample_and_save_image(image_dir_path, image_name, downsample_dir_path, downsample_size, log_errors, image_index, file_path, error_log_filepath):
    """
    Downsample an image and save it to the specified directory.

    Failures are logged (not raised): any exception during open/resize/save
    is printed and recorded via log_response/update_log so the caller's
    download loop can continue.

    Parameters:
    - image_dir_path (str): The path to the directory containing the original image.
    - image_name (str): The name of the image to be downsampled.
    - downsample_dir_path (str): The path to the directory where the downsampled image will be saved.
    - downsample_size (int): The new size (both width and height) for the downsampled image.
    - log_errors (dict): A dictionary to store errors encountered during the downsampling process.
    - image_index (int): The index of the current image being processed, used for logging.
    - file_path (str): The file path or URL associated with the image, used for logging errors.
    - error_log_filepath (str): The file path where error logs are stored.

    Returns:
    None
    """
    # exist_ok=True replaces the previous check-then-create pattern
    # (`if not os.path.exists(...)`), which was racy: another process could
    # create the directory between the check and makedirs and crash us.
    os.makedirs(downsample_dir_path, exist_ok=True)

    try:
        # Context manager closes the source file handle even if resize/save
        # raises; Image.open keeps the underlying file open lazily otherwise,
        # leaking a handle per image.
        with Image.open(f"{image_dir_path}/{image_name}") as img:
            img.resize((downsample_size, downsample_size)).save(
                f"{downsample_dir_path}/{image_name}"
            )
    except Exception as e:
        # Best-effort: surface the error on stdout and persist it in the
        # error log, then return normally so the download loop continues.
        print(e)
        log_errors = log_response(
            log_errors,
            index=image_index,
            image="downsized_" + image_name,
            file_path=file_path,
            response_code=str(e)
        )
        update_log(log=log_errors, index=image_index, filepath=error_log_filepath)

3 changes: 2 additions & 1 deletion tests/test_download_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ def test_main_directory_exists(self, mock_exists, mock_input, mock_process_csv,
with self.assertRaises(SystemExit) as cm:
main()

self.assertEqual(cm.exception.code, "Exited without executing.")
# self.assertEqual(cm.exception.code, "mock_args.output_dir Exited without executing.")
self.assertEqual(cm.exception.code, f"'{mock_args.output_dir}' already exists. Exited without executing.")


if __name__ == '__main__':
Expand Down
Loading