Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/avoid overwrite #25

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion src/cautiousrobot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from cautiousrobot.__main__ import download_images
from cautiousrobot.buddy_check import BuddyCheck
from cautiousrobot.utils import downsample_and_save_image


# Create instance of the class
Expand All @@ -9,4 +10,4 @@
buddy_check_instance.validate_download
buddy_check_instance.check_alignment

__all__ = ["download_images", "validate_download", "check_alignment"]
__all__ = ["download_images", "validate_download", "check_alignment", "downsample_and_save_image"]
179 changes: 91 additions & 88 deletions src/cautiousrobot/__main__.py
egrace479 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import time
from PIL import Image
from sumbuddy import get_checksums
from cautiousrobot.utils import log_response, update_log, process_csv
from cautiousrobot.utils import log_response, update_log, process_csv, downsample_and_save_image
from cautiousrobot.buddy_check import BuddyCheck


Expand Down Expand Up @@ -88,99 +88,104 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename =
image_name = data[filename][i]
if subfolders:
image_dir_path = img_dir + "/" + data[subfolders][i]

# get image from url
url = data[file_url][i]
if not url:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = "no url")
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
#download the image
redo = True
max_redos = retry
while redo and max_redos > 0:
try:
response = requests.get(url, stream = True)
except Exception as e:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = str(e))
update_log(log = log_errors, index = i, filepath = error_log_filepath)
continue

if response.status_code == 200:
redo = False
# log status
log_data = log_response(log_data,

if not os.path.exists(image_dir_path + "/" + image_name):

# get image from url
url = data[file_url][i]
if not url:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = response.status_code
)
update_log(log = log_data, index = i, filepath = log_filepath)

#create the appropriate folders if necessary

if os.path.exists(image_dir_path) != True:
os.makedirs(image_dir_path, exist_ok=False)

# save full size image to appropriate folder
with open(f"{image_dir_path}/{image_name}", "wb") as out_file:
shutil.copyfileobj(response.raw, out_file)

if downsample:
downsample_dir_path = downsample_path
if subfolders:
downsample_dir_path = downsample_path + "/" + data[subfolders][i]
if os.path.exists(downsample_dir_path) != True:
os.makedirs(downsample_dir_path, exist_ok=False)
# Downsample & save image
try:
img = Image.open(f"{image_dir_path}/{image_name}")
img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name)
except Exception as e:
print(e)
file_path = url,
response_code = "no url")
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
#download the image
redo = True
max_redos = retry
while redo and max_redos > 0:
try:
response = requests.get(url, stream = True)
except Exception as e:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = "downsized_" + image_name,
url = url,
response_code = str(e))
index = i,
image = image_name,
file_path = url,
response_code = str(e))
update_log(log = log_errors, index = i, filepath = error_log_filepath)

# check for too many requests
elif response.status_code in REDO_CODE_LIST:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)
continue

if response.status_code == 200:
redo = False
# log status
log_data = log_response(log_data,
index = i,
image = image_name,
file_path = url,
response_code = response.status_code
)
update_log(log = log_data, index = i, filepath = log_filepath)

#create the appropriate folders if necessary

if os.path.exists(image_dir_path) != True:
os.makedirs(image_dir_path, exist_ok=False)

# save full size image to appropriate folder
with open(f"{image_dir_path}/{image_name}", "wb") as out_file:
shutil.copyfileobj(response.raw, out_file)

else:
time.sleep(wait)
else: #other fail, eg. 404
redo = False
log_errors = log_response(log_errors,
# check for too many requests
elif response.status_code in REDO_CODE_LIST:
redo = True
max_redos -= 1
if max_redos <= 0:
log_errors = log_response(log_errors,
index = i,
image = image_name,
url = url,
file_path= url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)
update_log(log = log_errors, index = i, filepath = error_log_filepath)

else:
time.sleep(wait)
else: #other fail, eg. 404
redo = False
log_errors = log_response(log_errors,
index = i,
image = image_name,
file_path = url,
response_code = response.status_code)
update_log(log = log_errors, index = i, filepath = error_log_filepath)

del response

if downsample:
# Since we have image resize within a try and log failure, seems reasonable to not check for the image again.
downsample_dir_path = downsample_path
if subfolders:
downsample_dir_path = downsample_path + "/" + data[subfolders][i]
if os.path.exists(downsample_dir_path + "/" + image_name):
# Don't overwrite resized images either
continue

downsample_and_save_image(
image_dir_path=image_dir_path,
image_name=image_name,
downsample_dir_path=downsample_dir_path,
downsample_size=downsample,
log_errors=log_errors,
image_index=i,
file_path=url,
error_log_filepath=error_log_filepath
)


del response

return

Expand Down Expand Up @@ -222,9 +227,7 @@ def main():
# Check for img_dir
img_dir = args.output_dir
if os.path.exists(img_dir):
overwrite = input(f"'{img_dir}' already exists (may impact downsizing too). Overwrite? [y/n]: ")
if overwrite.lower() != "y":
sys.exit("Exited without executing.")
sys.exit(f"'{img_dir}' already exists. Exited without executing.")

# Set location for logs
metadata_path = csv_path.split(".")[0]
Expand Down
41 changes: 39 additions & 2 deletions src/cautiousrobot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

import json
import pandas as pd
import os
from PIL import Image


def log_response(log_data, index, image, url, response_code):
def log_response(log_data, index, image, file_path, response_code):
# log status
log_entry = {}
log_entry["Image"] = image
log_entry["file_url"] = url
log_entry["file_path"] = file_path
log_entry["Response_status"] = str(response_code) #int64 has problems sometimes
log_data[index] = log_entry

Expand Down Expand Up @@ -44,3 +46,38 @@ def process_csv(csv_path, expected_cols):
raise Exception(f"The CSV at {csv_path} is missing column(s): {missing_cols}, defined as {[expected_cols[col] for col in missing_cols]}.")

return df

def downsample_and_save_image(image_dir_path, image_name, downsample_dir_path, downsample_size, log_errors, image_index, file_path, error_log_filepath):
    """
    Downsample an image and save it to the specified directory.

    Failures are logged (not raised): any exception during open/resize/save
    is printed and recorded via log_response/update_log so the caller's
    download loop can continue.

    Parameters:
    - image_dir_path (str): The path to the directory containing the original image.
    - image_name (str): The name of the image to be downsampled.
    - downsample_dir_path (str): The path to the directory where the downsampled image will be saved.
    - downsample_size (int): The new size (both width and height) for the downsampled image.
    - log_errors (dict): A dictionary to store errors encountered during the downsampling process.
    - image_index (int): The index of the current image being processed, used for logging.
    - file_path (str): The file path or URL associated with the image, used for logging errors.
    - error_log_filepath (str): The file path where error logs are stored.

    Returns:
    None
    """
    # exist_ok=True replaces the previous check-then-create pattern
    # (`if not os.path.exists(...)`), which was racy: another process could
    # create the directory between the check and makedirs and crash us.
    os.makedirs(downsample_dir_path, exist_ok=True)

    try:
        # Context manager closes the source file handle even if resize/save
        # raises; Image.open keeps the underlying file open lazily otherwise,
        # leaking a handle per image.
        with Image.open(f"{image_dir_path}/{image_name}") as img:
            img.resize((downsample_size, downsample_size)).save(
                f"{downsample_dir_path}/{image_name}"
            )
    except Exception as e:
        # Best-effort: surface the error on stdout and persist it in the
        # error log, then return normally so the download loop continues.
        print(e)
        log_errors = log_response(
            log_errors,
            index=image_index,
            image="downsized_" + image_name,
            file_path=file_path,
            response_code=str(e)
        )
        update_log(log=log_errors, index=image_index, filepath=error_log_filepath)

3 changes: 2 additions & 1 deletion tests/test_download_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ def test_main_directory_exists(self, mock_exists, mock_input, mock_process_csv,
with self.assertRaises(SystemExit) as cm:
main()

self.assertEqual(cm.exception.code, "Exited without executing.")
# self.assertEqual(cm.exception.code, "mock_args.output_dir Exited without executing.")
self.assertEqual(cm.exception.code, f"'{mock_args.output_dir}' already exists. Exited without executing.")


if __name__ == '__main__':
Expand Down
Loading