# Patch overview. These lines sit ahead of the first "diff --git" header.
# The line structure below was rebuilt from a whitespace-collapsed copy, so
# indentation and the placement of blank context lines are inferred from the
# Python syntax and the hunk counts -- TODO confirm against the real files
# before applying.
#
# client/touchable.py
#   * Adds imports of os, find_shoots_query / get_identifiers (from
#     compile_pending_list) and get_es_client (from reporting_client).
#   * Adds get_failed_subshoots(session, subshoots), a generator that:
#       - materialises `subshoots` into a list;
#       - derives an id from each entry by dropping its last four characters
#         and keeping what follows the first '/';
#       - runs a single es.search against the "storage_ingests" index;
#       - yields each entry whose id is not in get_identifiers(hits).
#   * The __main__ block now feeds the result of find_objects(...) through
#     get_failed_subshoots, with the "digitisation-developer" profile for the
#     former and the "platform-developer" profile for the latter.
#   NOTE(review): presumably the four dropped characters are a ".zip" suffix
#   -- confirm against what find_objects yields.
#   NOTE(review): size=1000 looks like a cap on returned hits; with more ids
#   than that, some entries might be reported as failed -- verify.
#
# src/transferrer/transfer.py
#   * Removes leftover merge-conflict markers (<<<<<<< HEAD / ======= /
#     >>>>>>> improve-report), the improve-report side of each conflict, and a
#     few blank lines. No lines are added.
#   NOTE(review): os.getenv returns a str when MAX_SPACE_BYTES is set in the
#   environment and the int default otherwise -- confirm callers cope with both.
diff --git a/client/touchable.py b/client/touchable.py
index 8f7bb5c..84e3a30 100644
--- a/client/touchable.py
+++ b/client/touchable.py
@@ -1,11 +1,31 @@
 import boto3
 import sys
+import os
+from compile_pending_list import find_shoots_query, get_identifiers
+
+from reporting_client import get_es_client
 
 BUCKETS = {
     "staging": "wellcomecollection-archivematica-staging-transfer-source",
     "production": "wellcomecollection-archivematica-transfer-source"
 }
 
+def get_failed_subshoots(session, subshoots):
+    es = get_es_client(session)
+    subshoots = list(subshoots)
+    ids = [s[:-4].partition('/')[2] for s in subshoots]
+
+    response = es.search(
+        index="storage_ingests",
+        size=1000,
+        query=find_shoots_query(ids),
+        source=False,
+        fields=["bag.info.externalIdentifier", "lastModifiedDate"]
+    )
+    succeeded = get_identifiers(response["hits"]["hits"])
+    for pair in zip(subshoots, ids):
+        if pair[1] not in succeeded:
+            yield pair[0]
 
 def find_objects(session, bucket, object_keys):
     bucket = session.resource('s3').Bucket(bucket)
@@ -17,6 +37,8 @@ def find_objects(session, bucket, object_keys):
 
 
 if __name__ == '__main__':
+    objects = find_objects(boto3.Session(profile_name="digitisation-developer"), BUCKETS[sys.argv[1]], sys.stdin.readlines())
     print("\n".join(
-        find_objects(boto3.Session(), BUCKETS[sys.argv[1]], sys.stdin.readlines())
+        get_failed_subshoots(
+            boto3.Session(profile_name="platform-developer"), objects)
     ))
diff --git a/src/transferrer/transfer.py b/src/transferrer/transfer.py
index 4f8fc1d..12bc52a 100644
--- a/src/transferrer/transfer.py
+++ b/src/transferrer/transfer.py
@@ -2,34 +2,18 @@ import tempfile
 
 from transferrer.download import download_shoot
 from transferrer.make_zip import make_zip_from
-<<<<<<< HEAD
 from transferrer.upload import upload, get_target_bucket
-=======
-from transferrer.upload import upload
->>>>>>> improve-report
 import boto3
 import re
 
-
-<<<<<<< HEAD
 re_extract_suffix = re.compile('(_\\d\\d\\d)\\.zip')
 MAX_SPACE_BYTES = os.getenv('MAX_SPACE_BYTES', 10240000000) # maximum setting for Lambda Ephemeral Storage
 TARGET_BUCKET = os.getenv("TARGET_BUCKET")
-=======
-MAX_SPACE_BYTES = 10240000000 # maximum setting for Lambda Ephemeral Storage
-
->>>>>>> improve-report
-
 
 
 def shoot_number_to_accession_id(accession_number, shoot_number):
     """
-<<<<<<< HEAD
     The accession id is simply the shoot_number prefixed with the accession number.
-
-=======
-    The accession id is simply the shoot_number prefixed with the accession number
->>>>>>> improve-report
     >>> shoot_number_to_accession_id("2754", "CP000159")
     '2754_CP000159'
     """
@@ -50,13 +34,8 @@ def transfer_shoot(from_session, to_session, shoot_number, accession_number, max
     tmpfolder = root_dir.name
     source_folder = os.path.join(tmpfolder, "source")
     target_folder = os.path.join(tmpfolder, "target")
-<<<<<<< HEAD
     already_up = [match.group(1) for match in (re_extract_suffix.search(o.key) for o in get_target_bucket(to_session, TARGET_BUCKET).filter(Prefix=f"born-digital-accessions/{accession_id}")) if match]
-
     for files, suffix in download_shoot(from_session, shoot_number, source_folder, max_batch_bytes, ignore=already_up):
-=======
-    for files, suffix in download_shoot(from_session, shoot_number, source_folder, max_batch_bytes):
->>>>>>> improve-report
         upload(
             to_session,
             make_zip_from(files, source_folder, target_folder, accession_id, suffix)