Skip to content

Commit

Permalink
improve retry/touch detection
Browse files Browse the repository at this point in the history
  • Loading branch information
paul-butcher committed Nov 20, 2024
1 parent 0815e50 commit 34a7629
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 22 deletions.
24 changes: 23 additions & 1 deletion client/touchable.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,31 @@
import boto3
import sys
import os
from compile_pending_list import find_shoots_query, get_identifiers

from reporting_client import get_es_client

BUCKETS = {
"staging": "wellcomecollection-archivematica-staging-transfer-source",
"production": "wellcomecollection-archivematica-transfer-source"
}

def get_failed_subshoots(session, subshoots):
    """
    Yield the subshoots that have no matching hit in the storage_ingests index.

    Each subshoot is turned into an identifier by dropping its last four
    characters and everything up to and including the first '/'.
    Those identifiers are searched for in the "storage_ingests" index, and
    every subshoot whose identifier is not among the identifiers extracted
    from the hits by get_identifiers is yielded, in input order.

    :param session: passed to get_es_client to obtain the search client.
    :param subshoots: iterable of strings; it is consumed once and may be a
        one-shot iterator.
    :return: a generator over the original (untrimmed) subshoot strings.
    """
    subshoots = list(subshoots)
    if not subshoots:
        # Nothing to look up, so do not create a client or run a query.
        return

    # The four trimmed characters are presumably a ".zip" extension and the
    # part before the first '/' a key prefix - TODO confirm against caller.
    ids = [s[:-4].partition('/')[2] for s in subshoots]

    es = get_es_client(session)
    # NOTE(review): at most 1000 hits are requested. If more records than
    # that match, identifiers missing from the response would be reported
    # as failed - confirm the input is always small enough.
    response = es.search(
        index="storage_ingests",
        size=1000,
        query=find_shoots_query(ids),
        source=False,
        fields=["bag.info.externalIdentifier", "lastModifiedDate"]
    )
    # Materialise into a set: the membership test below runs once per
    # subshoot, so it must survive repeated use (a one-shot iterator would
    # be exhausted by the first test) and should not rescan a list each time.
    succeeded = set(get_identifiers(response["hits"]["hits"]))
    for subshoot, identifier in zip(subshoots, ids):
        if identifier not in succeeded:
            yield subshoot

def find_objects(session, bucket, object_keys):
bucket = session.resource('s3').Bucket(bucket)
Expand All @@ -17,6 +37,8 @@ def find_objects(session, bucket, object_keys):


if __name__ == '__main__':
objects = find_objects(boto3.Session(profile_name="digitisation-developer"), BUCKETS[sys.argv[1]], sys.stdin.readlines())
print("\n".join(
find_objects(boto3.Session(), BUCKETS[sys.argv[1]], sys.stdin.readlines())
get_failed_subshoots(
boto3.Session(profile_name="platform-developer"), objects)
))
21 changes: 0 additions & 21 deletions src/transferrer/transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,18 @@
import tempfile
from transferrer.download import download_shoot
from transferrer.make_zip import make_zip_from
<<<<<<< HEAD
from transferrer.upload import upload, get_target_bucket
=======
from transferrer.upload import upload
>>>>>>> improve-report
import boto3
import re


<<<<<<< HEAD
re_extract_suffix = re.compile('(_\\d\\d\\d)\\.zip')

MAX_SPACE_BYTES = os.getenv('MAX_SPACE_BYTES', 10240000000) # maximum setting for Lambda Ephemeral Storage
TARGET_BUCKET = os.getenv("TARGET_BUCKET")
=======
MAX_SPACE_BYTES = 10240000000 # maximum setting for Lambda Ephemeral Storage

>>>>>>> improve-report


def shoot_number_to_accession_id(accession_number, shoot_number):
"""
<<<<<<< HEAD
The accession id is simply the shoot_number prefixed with the accession number.
=======
The accession id is simply the shoot_number prefixed with the accession number
>>>>>>> improve-report
>>> shoot_number_to_accession_id("2754", "CP000159")
'2754_CP000159'
"""
Expand All @@ -50,13 +34,8 @@ def transfer_shoot(from_session, to_session, shoot_number, accession_number, max
tmpfolder = root_dir.name
source_folder = os.path.join(tmpfolder, "source")
target_folder = os.path.join(tmpfolder, "target")
<<<<<<< HEAD
already_up = [match.group(1) for match in (re_extract_suffix.search(o.key) for o in get_target_bucket(to_session, TARGET_BUCKET).filter(Prefix=f"born-digital-accessions/{accession_id}")) if match]

for files, suffix in download_shoot(from_session, shoot_number, source_folder, max_batch_bytes, ignore=already_up):
=======
for files, suffix in download_shoot(from_session, shoot_number, source_folder, max_batch_bytes):
>>>>>>> improve-report
upload(
to_session,
make_zip_from(files, source_folder, target_folder, accession_id, suffix)
Expand Down

0 comments on commit 34a7629

Please sign in to comment.