From 78a125db282d3d61e5fae3b0c38ea8e1c161fa54 Mon Sep 17 00:00:00 2001 From: tomcrane Date: Sun, 14 Jan 2024 12:55:16 +0000 Subject: [PATCH] Add Transfer types --- .../Controllers/ImportExportController.cs | 40 +++++++++++++ .../Abstractions/Transfer/BinaryFile.cs | 44 ++++++++++++++ .../Transfer/ContainerDirectory.cs | 17 ++++++ .../Fedora/Storage/StorageTypes.cs | 8 +++ LeedsExperiment/Preservation/ExportResult.cs | 26 +++++++++ LeedsExperiment/Preservation/FedoraWrapper.cs | 2 +- LeedsExperiment/Preservation/IPreservation.cs | 16 ++++++ LeedsExperiment/Preservation/ImportJob.cs | 57 +++++++++++++++++++ .../Preservation/OcflS3StorageMapper.cs | 2 +- 9 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 LeedsExperiment/Fedora/Abstractions/Transfer/BinaryFile.cs create mode 100644 LeedsExperiment/Fedora/Abstractions/Transfer/ContainerDirectory.cs create mode 100644 LeedsExperiment/Fedora/Storage/StorageTypes.cs create mode 100644 LeedsExperiment/Preservation/ExportResult.cs create mode 100644 LeedsExperiment/Preservation/ImportJob.cs diff --git a/LeedsExperiment/Dashboard/Controllers/ImportExportController.cs b/LeedsExperiment/Dashboard/Controllers/ImportExportController.cs index e52d7c4..3e2a3b7 100644 --- a/LeedsExperiment/Dashboard/Controllers/ImportExportController.cs +++ b/LeedsExperiment/Dashboard/Controllers/ImportExportController.cs @@ -23,6 +23,16 @@ public async Task ExportStartAsync( [FromRoute] string path, [FromQuery] string? 
version = null) { + // display a list of what's going to be exported + // add a default destination in the staging bucket + // allow a different destination to be specified (bucket, key root) + + // POST the job to /processexport (not the file list, that gets recalculated) + + // sync wait for response + // in production this will go on a queue and will poll for completion + + // display summary completion, link back to ArchivalGroup Browse head } @@ -33,6 +43,36 @@ public async Task ImportStartAsync( [FromRoute] string path, [FromQuery] string? version = null) { + // work out what can be shared with a create new + // (is it just this with a null path?) + + // get a source bucket and root key from the user + // show a list of the contents but don't diff yet + + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + + // S3 supports SHA256 but not 512 + // So we could lean on S3 to compute the checksums at source + + // can we set a bucket policy that auto-calculates a checksum for every key when added? + // otherwise we'll need to calculate the checksums ourselves + + // ask Leeds about 512 / 256 + + // In order to calculate a diff we can compare checksums. We could compare sizes first to pick up obvious changes + // but we'd need to checksum everything anyway + + // Generate a diff + // files to add + // files to change + // files to rename? - can you even do that... 
+ // files to delete + // This could be optimised in production + + + } + + } } diff --git a/LeedsExperiment/Fedora/Abstractions/Transfer/BinaryFile.cs b/LeedsExperiment/Fedora/Abstractions/Transfer/BinaryFile.cs new file mode 100644 index 0000000..ca3c806 --- /dev/null +++ b/LeedsExperiment/Fedora/Abstractions/Transfer/BinaryFile.cs @@ -0,0 +1,44 @@ +namespace Fedora.Abstractions.Transfer; + +/// +/// Used when importing new files into the repository from a source location (disk or S3) +/// or exporting to a location +/// +public class BinaryFile +{ + /// + /// The repository path (not a full Uri), will end with Slug + /// Only contains permitted characters (e.g., no spaces) + /// + public required string Path { get; set; } + + /// + /// An S3 key, a filesystem path - somewhere accessible to the Preservation API, to import from or export to + /// + public required string Location { get; set; } + + public required string StorageType { get; set; } + + /// + /// Only contains permitted characters (e.g., no spaces) + /// + public string Slug => Path.Split('/')[^1]; + + /// + /// The name of the resource in Fedora (dc:title) + /// + public required string Name { get; set; } + + /// + /// The Original / actual name of the file, rather than the path-safe, reduced character set slug + /// + public required string FileName { get; set; } + + // NB ^^^ for a filename like readme.txt, Slug, Name and FileName will all be the same. + // And in practice, Name and FileName are going to be the same + // But Slug may differ as it always must be in the reduced character set + + public required string ContentType { get; set; } + + public string? 
Digest { get; set; } +} diff --git a/LeedsExperiment/Fedora/Abstractions/Transfer/ContainerDirectory.cs b/LeedsExperiment/Fedora/Abstractions/Transfer/ContainerDirectory.cs new file mode 100644 index 0000000..2351ae0 --- /dev/null +++ b/LeedsExperiment/Fedora/Abstractions/Transfer/ContainerDirectory.cs @@ -0,0 +1,17 @@ +namespace Fedora.Abstractions.Transfer; + +public class ContainerDirectory +{ + /// + /// The repository path (not a full Uri), will end with Slug + /// Only contains permitted characters (e.g., no spaces) + /// + public required string Path { get; set; } + + public string Slug => Path.Split('/')[^1]; + + /// + /// The name of the resource in Fedora (dc:title) + /// + public required string Name { get; set; } +} diff --git a/LeedsExperiment/Fedora/Storage/StorageTypes.cs b/LeedsExperiment/Fedora/Storage/StorageTypes.cs new file mode 100644 index 0000000..89cd3d9 --- /dev/null +++ b/LeedsExperiment/Fedora/Storage/StorageTypes.cs @@ -0,0 +1,8 @@ + +namespace Fedora.Storage; + +public static class StorageTypes +{ + public const string S3 = "S3"; + public const string FileSystem = "FileSystem"; +} diff --git a/LeedsExperiment/Preservation/ExportResult.cs b/LeedsExperiment/Preservation/ExportResult.cs new file mode 100644 index 0000000..c44f003 --- /dev/null +++ b/LeedsExperiment/Preservation/ExportResult.cs @@ -0,0 +1,26 @@ + +using Fedora.Abstractions.Transfer; +using Fedora.Storage; + +namespace Preservation; + +public class ExportResult +{ + /// + /// For info - the path of the source archival group + /// + public required string Path { get; set; } + + /// + /// The version that was exported + /// + public required ObjectVersion Version { get; set; } + public DateTime Start { get; set; } + public DateTime End { get; set; } + + // The root location (S3 Uri, directory path) where the ArchivalGroup has been exported + public required string Source { get; set; } + public required string StorageType { get; set; } + + public List Files { get; set; } = []; 
+} diff --git a/LeedsExperiment/Preservation/FedoraWrapper.cs b/LeedsExperiment/Preservation/FedoraWrapper.cs index 95b6414..f33d7db 100644 --- a/LeedsExperiment/Preservation/FedoraWrapper.cs +++ b/LeedsExperiment/Preservation/FedoraWrapper.cs @@ -604,7 +604,7 @@ private void PopulateOrigins(StorageMap storageMap, Container container) private static void PopulateOrigin(StorageMap storageMap, Binary binary) { - if (storageMap.StorageType == "S3") + if (storageMap.StorageType == StorageTypes.S3) { // This "s3-ness" needs to be inside an abstracted impl binary.Origin = $"s3://{storageMap.Root}/{storageMap.ObjectPath}/{storageMap.Hashes[binary.Digest!]}"; diff --git a/LeedsExperiment/Preservation/IPreservation.cs b/LeedsExperiment/Preservation/IPreservation.cs index 72e4906..467634f 100644 --- a/LeedsExperiment/Preservation/IPreservation.cs +++ b/LeedsExperiment/Preservation/IPreservation.cs @@ -4,7 +4,23 @@ namespace Preservation; public interface IPreservation { + // Getting things from Fedora Task GetResource(string? path); string GetInternalPath(Uri preservationApiUri); Task GetArchivalGroup(string path, string? version); + + + // Interacting with a staging area + Task Export(string path, string? 
version); + + /// + /// Get a diff that can then be executed + /// + /// + /// + /// + Task GetUpdateJob(string path, string source); + + // Create or update the job obtained above - latter requires isUpdate explicitly + Task Import(ImportJob importJob); } diff --git a/LeedsExperiment/Preservation/ImportJob.cs b/LeedsExperiment/Preservation/ImportJob.cs new file mode 100644 index 0000000..66cf3f1 --- /dev/null +++ b/LeedsExperiment/Preservation/ImportJob.cs @@ -0,0 +1,57 @@ +using Fedora.Abstractions.Transfer; +using Fedora.Storage; + +namespace Preservation; + +public class ImportJob +{ + public required string ArchivalGroupPath { get; set; } + // Must be an S3 URI, for now + public required string Source { get; set; } + public required string StorageType { get; set; } + + public Uri? ArchivalGroupUri { get; set; } + + + public DateTime DiffStart { get; set; } + public DateTime DiffEnd { get; set; } + + /// + /// What version at HEAD is this diff based on? + /// When it comes to execute the job, need to make sure it's the same + /// And then execute the update in a transaction. + /// (null if a new object, IsUpdate = false) + /// + public ObjectVersion? DiffVersion { get; set; } + + public List ContainersToAdd { get; set; } = []; + public List FilesToAdd { get; set; } = []; + public List FilesToDelete { get; set; } = []; + public List FilesToPatch { get; set; } = []; + // FilesToRename? + + /// + /// While any required new containers can be created as files are added (create along path), + /// we may end up with containers that have no files in them; these need to be deleted from Fedora. 
+ /// + public List ContainersToDelete { get; set; } = []; + + + public List ContainersAdded { get; set; } = []; + public List FilesAdded { get; set; } = []; + public List FilesDeleted { get; set; } = []; + public List FilesPatched { get; set; } = []; + public List ContainersDeleted { get; set; } = []; + + // Must be explicitly set to true to allow an update of an existing ArchivalGroup + public bool IsUpdate { get; set; } + + + public DateTime Start { get; set; } + public DateTime End { get; set; } + + + public ObjectVersion? NewVersion { get; set; } + + +} diff --git a/LeedsExperiment/Preservation/OcflS3StorageMapper.cs b/LeedsExperiment/Preservation/OcflS3StorageMapper.cs index bc83b20..4e3115a 100644 --- a/LeedsExperiment/Preservation/OcflS3StorageMapper.cs +++ b/LeedsExperiment/Preservation/OcflS3StorageMapper.cs @@ -113,7 +113,7 @@ public async Task GetStorageMap(Uri archivalGroupUri, string? versio ArchivalGroup = archivalGroupUri, Version = objectVersion, HeadVersion = headObjectVersion, - StorageType = "S3", + StorageType = StorageTypes.S3, Root = fedoraAws.Bucket, ObjectPath = agOrigin!, AllVersions = inventoryVersions.ToArray(),