diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5776df4 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +.PHONY: shoots/clean %.sliced +.SECONDARY: + +# Remove intermediate/final files from the shoots folder +shoots/clean: + rm shoots/*restored + rm shoots/*transferred + rm shoots/*slice* + +# Request the Glacier restoration of the shoots in the given file +# The file is expected to contain one shoot identifier per line. +# In order to run this, set your AWS profile to one with authority in the workflow account. +%.restored : % + cat $< | python src/restore.py + cp $< $@ + + +# Request the Glacier transfer of the shoots in the given file +# This rule depends on restoration having completed, which is not guaranteed +# (or even likely) if you run this rule without having previously requested the restoration +# Any shoots that are not yet fully restored will result in a DLQ message that can eventually +# be redriven when the s3 objects are finally available for download +# In order to run this, set your AWS profile to one with authority in the digitisation account. + +# transfer to staging (see above) +%.transferred.staging: %.restored + cat $< | python src/start_transfers.py staging + cp $< $@ + + +# transfer to production (see above) +%.transferred.production: %.restored + cat $< | python src/start_transfers.py production + cp $< $@ + +# Slice a given input file into manageable chunks, so that you can run them through the +# transfer process separately without overwhelming the target system. +# The right number for archivematica is probably about 20. + +%.sliced: % + split -l 20 $< $<. + +# Touch the files already on AWS. This will stimulate the corresponding transfer lambdas. +# The target system can sometimes be unexpectedly unavailable or overwhelmed, +# resulting in failures. +# This allows us to invoke the process from just before the failure +# In order to run this, set your AWS profile to one with authority in the digitisation account. 
+%.touched.staging: % + cat $< | python src/touch.py staging + +%.touched.production: % + cat $< | python src/touch.py production diff --git a/src/check_status.py b/src/check_status.py new file mode 100644 index 0000000..8983328 --- /dev/null +++ b/src/check_status.py @@ -0,0 +1,26 @@ +import logging +import boto3 +from transferrer.common import should_download_file, shoot_number_to_folder_path, get_source_bucket + +logger = logging.getLogger(__name__) + + +def check_shoot_restore_status(bucket, shoot_number): + check_folder_restore_status(bucket, shoot_number_to_folder_path(shoot_number)) + + +def check_folder_restore_status(bucket, s3_folder: str): + logger.info(s3_folder) + for obj in bucket.objects.filter(Prefix=s3_folder, OptionalObjectAttributes=[ + 'RestoreStatus', + ]): + if should_download_file(obj.key): + status = obj.restore_status + else: + status = "ignored" + logger.info(f"{obj.key}\t{status}") + + +if __name__ == "__main__": + import sys + check_shoot_restore_status(get_source_bucket(boto3.Session()), sys.argv[1]) diff --git a/src/lambda_function.py b/src/lambda_function.py new file mode 100644 index 0000000..84c8590 --- /dev/null +++ b/src/lambda_function.py @@ -0,0 +1,35 @@ +import os +import json +import boto3 +from transferrer.transfer import transfer_shoot +import logging +logging.basicConfig(level=logging.INFO) + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +ACCESSION_NUMBER = os.getenv("ACCESSION_NUMBER", "2754") + + +def lambda_handler(event, context): + shoots = [single_handler(shoot_id) for shoot_id in get_shoot_ids(event)] + return { + 'statusCode': 200, + 'body': shoots + } + + +def single_handler(shoot_id): + logger.info(f"transferring {shoot_id}") + transfer_shoot( + from_session=boto3.Session(), + to_session=boto3.Session(), + shoot_number=shoot_id, + accession_number=ACCESSION_NUMBER + ) + return shoot_id + + +def get_shoot_ids(event): + for record in event['Records']: + yield json.loads(record['body'])["Message"] \ No newline at end of file diff --git a/src/restore.py b/src/restore.py new file mode 100644 index 0000000..61ea911 --- /dev/null +++ b/src/restore.py @@ -0,0 +1,52 @@ +import logging +import boto3 +from botocore.exceptions import ClientError + +from transferrer.common import should_download_file, shoot_number_to_folder_path, get_source_bucket + +logger = logging.getLogger(__name__) + + +def restore_s3_folder(bucket, s3_folder: str, days_to_keep=1): + logger.info(f"restoring folder: {s3_folder}") + for obj in bucket.objects.filter(Prefix=s3_folder): + if should_download_file(obj.key): + try: + logger.info(f"restoring object: {obj.key}") + obj.restore_object( + RestoreRequest={ + 'Days': days_to_keep, + 'GlacierJobParameters': { + 'Tier': 'Bulk' + } + } + ) + except ClientError as e: + if "The operation is not valid for the object's storage class" in str(e): + logger.info(f"attempt to restore non-glacier object: {obj.key}") + elif "RestoreAlreadyInProgress" in str(e): + logger.info(f"redundant attempt to restore object: {obj.key}") + else: + raise + else: + logger.info(f"ignoring {obj.key}") + + +def restore_shoot_folder(bucket, shoot_number): + restore_s3_folder(bucket, shoot_number_to_folder_path(shoot_number)) + + +def restore_shoot_folders(shoot_numbers): + bucket = get_source_bucket(boto3.Session()) + for shoot_number in (n.strip() for n in shoot_numbers): + logger.info(f"restoring shoot {shoot_number}") + restore_shoot_folder(bucket, shoot_number) + + +def main(): + import sys + restore_shoot_folders(sys.stdin.readlines()) + +
+if __name__ == "__main__": + main() diff --git a/src/start_transfers.py b/src/start_transfers.py new file mode 100644 index 0000000..a706205 --- /dev/null +++ b/src/start_transfers.py @@ -0,0 +1,14 @@ +import sys +import boto3 + + +def post_messages(session, environment, shoot_numbers): + sns = session.resource("sns") + topic = sns.Topic(f"arn:aws:sns:eu-west-1:404315009621:transfer-shoots-{environment}") + for shoot_number in shoot_numbers: + print(f"requesting transfer of {shoot_number}") + topic.publish(Message=shoot_number.strip()) + + +if __name__ == "__main__": + post_messages(boto3.Session(), sys.argv[1], sys.stdin.readlines()) diff --git a/src/touch.py b/src/touch.py new file mode 100644 index 0000000..73f3219 --- /dev/null +++ b/src/touch.py @@ -0,0 +1,42 @@ +""" +Like unix touch, this updates objects in s3 without substantive change. + +Call with the target environment and a newline-separated list of keys on stdin, thus: + +printf 'born-digital-accessions/2754_CP000179.zip\nborn-digital-accessions/2754_CP000181.zip\n' | python src/touch.py staging + +This is intended for use when Archivematica has transiently failed to accept some of the shoot zips. + +A fortnightly list of failed zips is published to Slack, but you could also generate it by looking elsewhere. +""" +import sys + +import boto3 +import datetime + +BUCKETS = { + "staging": "wellcomecollection-archivematica-staging-transfer-source", + "production": "wellcomecollection-archivematica-transfer-source" +} + + +def touch_object(session, bucket, key): + print(f"touching: {bucket}/{key}") + session.client('s3').copy_object( + Bucket=bucket, + CopySource={'Bucket': bucket, 'Key': key}, + Key=key, + Metadata={ + 'touched': datetime.datetime.now().isoformat() + }, + MetadataDirective="REPLACE" + ) + + +def touch_objects(session, bucket, object_keys): + for object_key in object_keys: + touch_object(session, bucket, object_key.strip()) + + +if __name__ == '__main__': + touch_objects(boto3.Session(), BUCKETS[sys.argv[1]], sys.stdin.readlines()) diff --git a/src/transferrer/common.py b/src/transferrer/common.py index 4444c8e..1bcefa0 100644 --- a/src/transferrer/common.py +++ b/src/transferrer/common.py @@ -5,12 +5,11 @@ SOURCE_BUCKET = "wellcomecollection-editorial-photography" -def get_source_bucket(max_connections=10): - return get_source_client(max_connections).Bucket(SOURCE_BUCKET) +def get_source_bucket(session: boto3.session.Session, max_connections=10): + return get_source_client(session, max_connections).Bucket(SOURCE_BUCKET) -def get_source_client(max_connections): - session = boto3.Session() +def get_source_client(session: boto3.session.Session, max_connections): return session.resource('s3', config=botocore.config.Config( max_pool_connections=max_connections )) diff --git a/src/transferrer/download.py b/src/transferrer/download.py index 7b61aac..ff83b22 100644 --- a/src/transferrer/download.py +++ b/src/transferrer/download.py @@ -2,6 +2,7 @@ import logging from concurrent.futures import ThreadPoolExecutor from functools import partial +import boto3 from transferrer.common import should_download_file, shoot_number_to_folder_path, get_source_bucket @@ -12,11 +13,11 @@ THREADS = 10 -def download_shoot(shoot_number, local_dir): +def download_shoot(session: boto3.session.Session, shoot_number, local_dir): # Allowing enough connections for each thread to have two of them # prevents the `urllib3.connectionpool:Connection pool is full` warning # and allows for better connection reuse.
- download_shoot_folder(get_source_bucket(max_connections=THREADS * 2), shoot_number, local_dir) + download_shoot_folder(get_source_bucket(session, max_connections=THREADS * 2), shoot_number, local_dir) def download_shoot_folder(bucket, shoot_number, local_dir): @@ -51,4 +52,4 @@ def download_s3_file(object_summary, *, local_dir: str, s3_folder: str): import sys shoot_number = sys.argv[1] download_folder = os.path.join("download", shoot_number) - download_shoot(sys.argv[1], download_folder) + download_shoot(boto3.Session(), sys.argv[1], download_folder) diff --git a/src/transferrer/make_zip.py b/src/transferrer/make_zip.py index cd798cc..bf5e929 100644 --- a/src/transferrer/make_zip.py +++ b/src/transferrer/make_zip.py @@ -56,12 +56,11 @@ def create_born_digital_folder(source_files, target_directory, accession_id): def move_files_to_objects_folder(source_files, target_directory): """ - Move the contents of source directory into the objects subfolder in the target directory - + Move the contents of source directory into the target directory """ - os.makedirs(os.path.join(target_directory, "objects")) + os.makedirs(target_directory, exist_ok=True) for file_abspath in source_files: - shutil.move(file_abspath, os.path.join(target_directory, "objects")) + shutil.move(file_abspath, target_directory) def files_in_folder(source_directory): @@ -103,21 +102,22 @@ def create_born_digital_zips( accession_id = f"{shoot_number_to_accession_id(accession_number, shoot_number)}_{ix:03d}" yield create_born_digital_zip( full_paths(source_directory, batch), - os.path.join(target_directory, accession_id), + target_directory, accession_id, ) else: accession_id = shoot_number_to_accession_id(accession_number, shoot_number) yield create_born_digital_zip( full_paths(source_directory, filenames), - os.path.join(target_directory, accession_id), + target_directory, accession_id, ) -def create_born_digital_zip(source_files, target_directory, accession_id): - folder = create_born_digital_folder(source_files, target_directory, accession_id) - return shutil.make_archive(accession_id, "zip", folder) +def create_born_digital_zip(source_files, target_directory: str, accession_id: str): + accession_path = os.path.join(target_directory, accession_id) + folder = create_born_digital_folder(source_files, accession_path, accession_id) + return shutil.make_archive(accession_path, "zip", folder) # copied from itertools 3.12 documentation. 
diff --git a/src/transferrer/transfer.py b/src/transferrer/transfer.py index e69de29..7f888a3 100644 --- a/src/transferrer/transfer.py +++ b/src/transferrer/transfer.py @@ -0,0 +1,25 @@ +import os +import tempfile +from transferrer.download import download_shoot +from transferrer.make_zip import create_born_digital_zips +from transferrer.upload import upload +import boto3 + + +def transfer_shoot(from_session, to_session, shoot_number, accession_number): + with tempfile.TemporaryDirectory() as tmpfolder: + source_folder = os.path.join(tmpfolder, "source") + target_folder = os.path.join(tmpfolder, "target") + download_shoot(from_session, shoot_number, source_folder) + for zipfile in create_born_digital_zips(source_folder, target_folder, accession_number, shoot_number, 250): + upload(to_session, zipfile) + + +if __name__ == "__main__": + import sys + transfer_shoot( + from_session=boto3.Session(profile_name=os.environ["AWS_SOURCE_PROFILE"]), + to_session=boto3.Session(profile_name=os.environ["AWS_TARGET_PROFILE"]), + shoot_number=sys.argv[1], + accession_number="2754" + ) diff --git a/src/transferrer/upload.py b/src/transferrer/upload.py index abec15f..b8049dc 100644 --- a/src/transferrer/upload.py +++ b/src/transferrer/upload.py @@ -7,15 +7,17 @@ logger = logging.getLogger(__name__) +TARGET_BUCKET = os.getenv("TARGET_BUCKET", "wellcomecollection-archivematica-staging-transfer-source") -def upload(s3, zipfile_path, target_bucket_name="wellcomecollection-archivematica-staging-transfer-source"): + +def upload(session, zipfile_path, target_bucket_name=TARGET_BUCKET): logger.info(f"uploading {zipfile_path} to {target_bucket_name}") - get_target_bucket(s3, target_bucket_name).upload_file(zipfile_path, f"born-digital-accessions/{os.path.basename(zipfile_path)}") + get_target_bucket(session, target_bucket_name).upload_file(zipfile_path, f"born-digital-accessions/{os.path.basename(zipfile_path)}") -def get_target_bucket(s3, target_bucket): - return s3.Bucket(target_bucket) +def get_target_bucket(session, target_bucket): + return session.resource('s3').Bucket(target_bucket) if __name__ == "__main__": - upload( boto3.Session(profile_name=os.environ["AWS_TARGET_PROFILE"]).resource('s3'), sys.argv[1]) + upload(boto3.Session(profile_name=os.environ["AWS_TARGET_PROFILE"]), sys.argv[1]) diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl new file mode 100644 index 0000000..9136804 --- /dev/null +++ b/terraform/.terraform.lock.hcl @@ -0,0 +1,43 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/archive" { + version = "2.4.2" + hashes = [ + "h1:WfIjVbYA9s/uN2FwhGoiffT7CLFydy7MT1waFbt9YrY=", + "zh:08faed7c9f42d82bc3d406d0d9d4971e2d1c2d34eae268ad211b8aca57b7f758", + "zh:3564112ed2d097d7e0672378044a69b06642c326f6f1584d81c7cdd32ebf3a08", + "zh:53cd9afd223c15828c1916e68cb728d2be1cbccb9545568d6c2b122d0bac5102", + "zh:5ae4e41e3a1ce9d40b6458218a85bbde44f21723943982bca4a3b8bb7c103670", + "zh:5b65499218b315b96e95c5d3463ea6d7c66245b59461217c99eaa1611891cd2c", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7f45b35a8330bebd184c2545a41782ff58240ed6ba947274d9881dd5da44b02e", + "zh:87e67891033214e55cfead1391d68e6a3bf37993b7607753237e82aa3250bb71", + "zh:de3590d14037ad81fc5cedf7cfa44614a92452d7b39676289b704a962050bc5e", + "zh:e7e6f2ea567f2dbb3baa81c6203be69f9cd6aeeb01204fd93e3cf181e099b610", + "zh:fd24d03c89a7702628c2e5a3c732c0dede56fa75a08da4a1efe17b5f881c88e2", + "zh:febf4b7b5f3ff2adff0573ef6361f09b6638105111644bdebc0e4f575373935f", + ] +} + +provider "registry.terraform.io/hashicorp/aws" { + version = "5.57.0" + hashes = [ + "h1:9yi3yb3XOMjj/xsSbOfscfmQzPUQ7sZqSYSBfGSfkBA=", + "zh:03761bedb72290599aef0040d3cefb77842f0ef4338673a7e5b53557b0ca4960", + "zh:1c70c050116370688abd239979b06f33c5c8cb7f6e59e89f60cf08ee01666064", + "zh:1cc3b259028a65b2f68ffc25df876bbb0f46d108f262b8ec7c56fc597ac697af", + "zh:3bcdf1415b37f39b71e07d4d92977cf8697f07602382d63687d5f683fee0231a", + "zh:40b1774a2cacc84002ac88ef30fb017c273009456d7a1f9f7c5a4a057041ec75", + "zh:46d51fa066c6441594a1e242c9491cc31dbb2dc85f1acf8bc54ad6faa4de524b", + "zh:550e5635b0cd5d98fa66c2afd5dbb1563a8e019be9f760bd1543fbcca763f0c1", + "zh:7acc8357b5e02ed3eb478125614d049511d6faeb9850c084d6e6519db875f0d1", + "zh:7f7367299811ddf5560a0586e525d57dd52f1a0ca37e42e2c5284308069bf2b6", + "zh:8766cc10c83b1fc2e971c4e645bc4d3c871d4758eb54b0a3216600c66e3db681", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a1e85b1fb9004d8ffab7600304e02bce4aa14cea9f0ad77fbd7b84aae6390760", + "zh:bcf2fc83bd9e20e5a930d9d596eb813c319f2b007c620b1818e574c1702eb9a9", + "zh:d2538fcb20dc2afc04b716f67969944eef7f4fc4296410116d5b7af1811100f2", + "zh:e0e47c5d8710bbfcfe4db1cfa81c67e320056006d08063e69640cd2d492c6f64", + ] +} diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 0000000..5325e07 --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,36 @@ +locals { + event_batching_window_timeout = 20 + lambda_timeout = 120 //two minutes + + # The lambda event source pulls messages from SQS in batches, finally triggering the lambda + # when either it has enough messages, or enough time has elapsed. + # A message becomes invisible when it joins the event source buffer, so could wait for + # the whole timeout window plus the whole execution time before being confirmed. + # The value of visibility timeout must be at least 20 seconds more than the lambda timeout + # This doesn't necessarily need to exist with a longer batching window, but + # always adding 20 here should mean that you can safely set batching window to 0 + # if you wish. + # See: https://docs.aws.amazon.com/lambda/latest/dg/with-sqs.html + # "Lambda might wait for up to 20 seconds before invoking your function." 
+ queue_visibility_timeout = local.event_batching_window_timeout + local.lambda_timeout + 20 +} + +data "archive_file" "lambda_zip" { + type = "zip" + output_path = "lambda.zip" + source_dir = "../src" +} + +module "staging_lambda" { + source = "./modules/transferrer_pipe" + environment = "staging" + queue_visibility_timeout = local.queue_visibility_timeout + lambda_zip = data.archive_file.lambda_zip +} + +module "production_lambda" { + source = "./modules/transferrer_pipe" + environment = "production" + queue_visibility_timeout = local.queue_visibility_timeout + lambda_zip = data.archive_file.lambda_zip +} \ No newline at end of file diff --git a/terraform/modules/notification_queue/main.tf b/terraform/modules/notification_queue/main.tf new file mode 100644 index 0000000..e87bfb4 --- /dev/null +++ b/terraform/modules/notification_queue/main.tf @@ -0,0 +1,22 @@ + +module "transfer_shoots_topic" { + source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.1" + name = "transfer-shoots-${var.environment}" +} + +module "dlq_alarm_topic" { + source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.1" + name = "transfer-shoots-alarm-${var.environment}" +} + +module "input_queue" { + source = "github.com/wellcomecollection/terraform-aws-sqs//queue?ref=v1.2.1" + + queue_name = "transfer-shoots-${var.environment}" + + topic_arns = [module.transfer_shoots_topic.arn] + visibility_timeout_seconds = var.queue_visibility_timeout + max_receive_count = 1 + message_retention_seconds = 1200 + alarm_topic_arn = module.dlq_alarm_topic.arn +} diff --git a/terraform/modules/notification_queue/outputs.tf b/terraform/modules/notification_queue/outputs.tf new file mode 100644 index 0000000..d8a0c89 --- /dev/null +++ b/terraform/modules/notification_queue/outputs.tf @@ -0,0 +1,3 @@ +output "queue_arn" { + value = module.input_queue.arn +} \ No newline at end of file diff --git a/terraform/modules/notification_queue/provider.tf b/terraform/modules/notification_queue/provider.tf new file mode 100644 index 0000000..7afdcf4 --- /dev/null +++ b/terraform/modules/notification_queue/provider.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } +} \ No newline at end of file diff --git a/terraform/modules/notification_queue/variables.tf b/terraform/modules/notification_queue/variables.tf new file mode 100644 index 0000000..f625bfb --- /dev/null +++ b/terraform/modules/notification_queue/variables.tf @@ -0,0 +1,11 @@ +variable "queue_visibility_timeout" { + type = number +} + +variable "environment" { + type = string + validation { + condition = contains(["staging", "production"], var.environment) + error_message = "environment must be one of staging or production" + } +} \ No newline at end of file diff --git a/terraform/modules/sqs_lambda_trigger/main.tf b/terraform/modules/sqs_lambda_trigger/main.tf new file mode 100644 index 0000000..814f226 --- /dev/null +++ b/terraform/modules/sqs_lambda_trigger/main.tf @@ -0,0 +1,33 @@ + +data "aws_iam_policy_document" "allow_sqs_pull" { + statement { + actions = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes" + ] + resources = [ + var.queue_arn + ] + } +} + +resource "aws_iam_role_policy" "allow_sqs_pull" { + name = "${var.trigger_name}-pull-from-queue" + role = var.role_name + policy = data.aws_iam_policy_document.allow_sqs_pull.json +} + +resource "aws_lambda_event_source_mapping" "lambda_trigger" { + event_source_arn = var.queue_arn + enabled = true + function_name = 
var.function_name + batch_size = var.batch_size +} + +resource "aws_lambda_permission" "allow_lambda_sqs_trigger" { + action = "lambda:InvokeFunction" + function_name = var.function_name + principal = "sqs.amazonaws.com" + source_arn = var.queue_arn +} \ No newline at end of file diff --git a/terraform/modules/sqs_lambda_trigger/provider.tf b/terraform/modules/sqs_lambda_trigger/provider.tf new file mode 100644 index 0000000..7afdcf4 --- /dev/null +++ b/terraform/modules/sqs_lambda_trigger/provider.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } +} \ No newline at end of file diff --git a/terraform/modules/sqs_lambda_trigger/variables.tf b/terraform/modules/sqs_lambda_trigger/variables.tf new file mode 100644 index 0000000..4a6277c --- /dev/null +++ b/terraform/modules/sqs_lambda_trigger/variables.tf @@ -0,0 +1,19 @@ +variable "queue_arn" { + type = string +} + +variable "function_name" { + type = string +} + +variable "role_name" { + type = string +} + +variable "trigger_name" { + type = string +} +variable "batch_size" { + type = number + default = 1 +} \ No newline at end of file diff --git a/terraform/modules/transferrer_lambda/main.tf b/terraform/modules/transferrer_lambda/main.tf new file mode 100644 index 0000000..6feb827 --- /dev/null +++ b/terraform/modules/transferrer_lambda/main.tf @@ -0,0 +1,74 @@ +locals { + lambda_name = "editorial-photography-transfer-${var.environment}" + lambda_timeout = 300 //five minutes + buckets = tomap( + { + staging = "wellcomecollection-archivematica-staging-transfer-source", + production = "wellcomecollection-archivematica-transfer-source" + } + ) + target_bucket = lookup(local.buckets, var.environment) + +} + + +module "transfer_lambda" { + source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" + + name = local.lambda_name + runtime = "python3.12" + handler = "lambda_function.lambda_handler" + + filename = var.lambda_zip.output_path + memory_size = 2048 + timeout = local.lambda_timeout + + environment = { + variables = { + ACCESSION_NUMBER = "2754" + TARGET_BUCKET = local.target_bucket + } + } + source_code_hash = var.lambda_zip.output_base64sha256 + ephemeral_storage = { + size = 4096 + } +} + +resource "aws_iam_role_policy" "write_to_archivematica_transfer_source" { + role = module.transfer_lambda.lambda_role.name + name = "write_to_archivematica_transfer_source-${var.environment}" + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + "Effect": "Allow", + "Action": "s3:PutObject", + "Resource": "arn:aws:s3:::${local.target_bucket}/*" + }, + ] + } + ) +} + +resource "aws_iam_role_policy" "read_from_editorial_photography" { + role = module.transfer_lambda.lambda_role.name + name = "read_from_editorial_photography-${var.environment}" + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + "Effect" = "Allow", + "Action" = [ + "s3:GetObject", + "s3:ListBucket" + ], + "Resource" = [ + "arn:aws:s3:::wellcomecollection-editorial-photography", + "arn:aws:s3:::wellcomecollection-editorial-photography/*" + ], + }, + ] + }) + +} \ No newline at end of file diff --git a/terraform/modules/transferrer_lambda/outputs.tf b/terraform/modules/transferrer_lambda/outputs.tf new file mode 100644 index 0000000..f504240 --- /dev/null +++ b/terraform/modules/transferrer_lambda/outputs.tf @@ -0,0 +1,7 @@ +output "lambda" { + value = module.transfer_lambda.lambda +} + +output "role" { + value = module.transfer_lambda.lambda_role +} \ No newline at end of file diff 
--git a/terraform/modules/transferrer_lambda/provider.tf b/terraform/modules/transferrer_lambda/provider.tf new file mode 100644 index 0000000..7afdcf4 --- /dev/null +++ b/terraform/modules/transferrer_lambda/provider.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } +} \ No newline at end of file diff --git a/terraform/modules/transferrer_lambda/variables.tf b/terraform/modules/transferrer_lambda/variables.tf new file mode 100644 index 0000000..537c342 --- /dev/null +++ b/terraform/modules/transferrer_lambda/variables.tf @@ -0,0 +1,16 @@ +variable "environment" { + type = string + validation { + condition = contains(["staging", "production"], var.environment) + error_message = "environment must be one of staging or production" + } +} + +variable "lambda_zip" { + type = object( + { + output_path = string, + output_base64sha256 = string + } + ) +} \ No newline at end of file diff --git a/terraform/modules/transferrer_pipe/main.tf b/terraform/modules/transferrer_pipe/main.tf new file mode 100644 index 0000000..45d66bb --- /dev/null +++ b/terraform/modules/transferrer_pipe/main.tf @@ -0,0 +1,20 @@ + +module "transfer_lambda" { + source = "../transferrer_lambda" + environment = var.environment + lambda_zip = var.lambda_zip +} + +module "input_queue" { + source = "../notification_queue" + environment = var.environment + queue_visibility_timeout = var.queue_visibility_timeout +} + +module "trigger" { + source = "../sqs_lambda_trigger" + queue_arn = module.input_queue.queue_arn + function_name = module.transfer_lambda.lambda.function_name + role_name = module.transfer_lambda.role.name + trigger_name = "editorial-photography-${var.environment}" +} \ No newline at end of file diff --git a/terraform/modules/transferrer_pipe/provider.tf b/terraform/modules/transferrer_pipe/provider.tf new file mode 100644 index 0000000..7afdcf4 --- /dev/null +++ b/terraform/modules/transferrer_pipe/provider.tf @@ -0,0 +1,7 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } +} \ No newline at end of file diff --git a/terraform/modules/transferrer_pipe/variables.tf b/terraform/modules/transferrer_pipe/variables.tf new file mode 100644 index 0000000..2e70377 --- /dev/null +++ b/terraform/modules/transferrer_pipe/variables.tf @@ -0,0 +1,20 @@ +variable "environment" { + type = string + validation { + condition = contains(["staging", "production"], var.environment) + error_message = "environment must be one of staging or production" + } +} + +variable "queue_visibility_timeout" { + type = number +} + +variable "lambda_zip" { + type = object( + { + output_path = string, + output_base64sha256 = string + } + ) +} \ No newline at end of file diff --git a/terraform/provider.tf b/terraform/provider.tf new file mode 100644 index 0000000..474feb9 --- /dev/null +++ b/terraform/provider.tf @@ -0,0 +1,8 @@ + +provider "aws" { + region = "eu-west-1" + + assume_role { + role_arn = "arn:aws:iam::404315009621:role/digitisation-developer" + } +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py index ddff173..bf30b83 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -17,6 +17,18 @@ def moto_s3(): finally: moto_fake.stop() + +@pytest.fixture +def moto_session(): + moto_fake = moto.mock_aws() + try: + moto_fake.start() + yield boto3 + finally: + moto_fake.stop() + + + @pytest.fixture def empty_bucket(moto_s3): @contextmanager diff --git 
a/test/test_make_zips.py b/test/test_make_zips.py index 1bfd257..20180ca 100644 --- a/test/test_make_zips.py +++ b/test/test_make_zips.py @@ -50,13 +50,13 @@ def test_single_zip(fs): populate_source_dir_with_images(fs, "G00DCAFE", 2) next(create_born_digital_zips("/in", "/out", "1234", "G00DCAFE", 10)) # it creates a zip named using the accession and shoot numbers - with zipfile.ZipFile("./1234_G00DCAFE.zip") as zip_file: + with zipfile.ZipFile("./out/1234_G00DCAFE.zip") as zip_file: zip_file.extractall("/unzipped") # with a metadata csv in the root of the zip assert_csv_has_accession_id("/unzipped/metadata/metadata.csv", "1234_G00DCAFE") - # and the photos in an ./objects folder - assert os.path.exists("/unzipped/objects/G00DCAFE_001.tif") - assert os.path.exists("/unzipped/objects/G00DCAFE_002.tif") + # and the photos in the root of the zip + assert os.path.exists("/unzipped/G00DCAFE_001.tif") + assert os.path.exists("/unzipped/G00DCAFE_002.tif") def test_multiple_zips(fs): @@ -66,19 +66,19 @@ list(create_born_digital_zips("/in", "/out", "1234", "G00DCAFE", 10)) # it creates zips named using the accession and shoot numbers, with a three-digit numeric suffix - with zipfile.ZipFile("./1234_G00DCAFE_001.zip") as zip_file: + with zipfile.ZipFile("./out/1234_G00DCAFE_001.zip") as zip_file: zip_file.extractall("/unzipped_001") assert_csv_has_accession_id("/unzipped_001/metadata/metadata.csv", "1234_G00DCAFE_001") # The objects chosen for each zip are predictable and consistent. # They are sorted alphanumerically before being sliced into groups to place into each zip - objects = os.listdir("/unzipped_001/objects") + objects = fnmatch.filter(os.listdir("/unzipped_001"), '*tif') assert len(objects) == 10 assert set(filename[:3] for filename in objects) == {"AAA"} - with zipfile.ZipFile("./1234_G00DCAFE_002.zip") as zip_file: + with zipfile.ZipFile("./out/1234_G00DCAFE_002.zip") as zip_file: zip_file.extractall("/unzipped_002") assert_csv_has_accession_id("/unzipped_002/metadata/metadata.csv", "1234_G00DCAFE_002") - objects = os.listdir("/unzipped_002/objects") + objects = fnmatch.filter(os.listdir("/unzipped_002"), '*tif') assert len(objects) == 10 assert set(filename[:3] for filename in objects) == {"BBB"} @@ -99,13 +99,13 @@ populate_source_dir_with_images(fs, "HELLO", 10) populate_source_dir(fs, ("shoot.csv", "HELLO.xml")) list(create_born_digital_zips("/in", "/out", "1234", "G00DCAFE", 10)) - assert fnmatch.filter(os.listdir("."), "*zip") == ["1234_G00DCAFE.zip"] + assert fnmatch.filter(os.listdir("./out/"), "*zip") == ["1234_G00DCAFE.zip"] def test_ignored_files_multiple_zips(fs): populate_source_dir_with_images(fs, "HELLO", 20) populate_source_dir(fs, ("shoot.csv", "HELLO.xml")) list(create_born_digital_zips("/in", "/out", "1234", "G00DCAFE", 5)) - zips = fnmatch.filter(os.listdir("."), "*zip") + zips = fnmatch.filter(os.listdir("./out/"), "*zip") # 20/5 == 4, ceil 22/5 > 4 assert len(zips) == 4 diff --git a/test/test_restore.py b/test/test_restore.py index 7605b73..d17e995 100644 --- a/test/test_restore.py +++ b/test/test_restore.py @@ -2,7 +2,7 @@ import pytest import pyfakefs from transferrer.download import download_shoot_folder -from transferrer.restore import restore_shoot_folder +from restore import restore_shoot_folder def test_ignores_metadata_files(glacier_shoot_bucket, fs): diff --git a/test/test_upload.py b/test/test_upload.py index 146c9f7..fc31938 100644 --- a/test/test_upload.py +++ b/test/test_upload.py @@ -3,16
+3,17 @@ import boto3 from transferrer.upload import upload from transferrer.upload import upload +from moto import mock_aws - -def test_raises_on_missing_zip(moto_s3, target_bucket, fs): +@mock_aws +def test_raises_on_missing_zip(moto_session, target_bucket, fs): with pytest.raises(FileNotFoundError): - upload(moto_s3, "missing.zip") - + upload(moto_session, "missing.zip") -def test_uploads_to_accessions_folder_in_bucket(moto_s3, target_bucket, fs): +@mock_aws +def test_uploads_to_accessions_folder_in_bucket(moto_session, target_bucket, fs): fs.create_file("present.zip") - upload(moto_s3, "present.zip") + upload(moto_session, "present.zip") assert [obj.key for obj in target_bucket.objects.all()] == ["born-digital-accessions/present.zip"]
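Note (not part of the diff): the SNS -> SQS -> Lambda plumbing added above delivers shoot numbers to lambda_handler as SQS records whose bodies are SNS notification envelopes; get_shoot_ids reads the shoot number from each envelope's "Message" field, matching what start_transfers.py publishes. Below is a minimal sketch of that event shape, assuming src/ is on the import path; the shoot numbers are made up for illustration.

# Illustrative only: builds the event shape lambda_handler expects and checks
# that get_shoot_ids extracts the shoot numbers published by start_transfers.py.
import json

from lambda_function import get_shoot_ids  # assumes PYTHONPATH includes src/

event = {
    "Records": [
        # each SQS record body is the JSON of an SNS notification envelope
        {"body": json.dumps({"Message": "CP000179"})},
        {"body": json.dumps({"Message": "CP000181"})},
    ]
}

assert list(get_shoot_ids(event)) == ["CP000179", "CP000181"]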