From 52a5a64270467c844932138da408f48ab62081c6 Mon Sep 17 00:00:00 2001
From: tomcrane
Date: Sun, 21 Jan 2024 17:46:57 +0000
Subject: [PATCH] Import Job

---
 .../Views/ImportExport/ImportJob.cshtml       |   6 +-
 .../Controllers/ImportController.cs           | 237 +++++++++++++++++-
 .../Controllers/InfoController.cs             |  30 +++
 .../Controllers/RepositoryController.cs       |  12 +-
 LeedsExperiment/Preservation/Checksum.cs      |   5 +-
 LeedsExperiment/Preservation/FedoraWrapper.cs |   7 +-
 LeedsExperiment/Preservation/IPreservation.cs |   4 +-
 LeedsExperiment/Preservation/ImportSource.cs  |  18 ++
 .../PreservationService.cs                    |   7 +-
 9 files changed, 298 insertions(+), 28 deletions(-)
 create mode 100644 LeedsExperiment/Preservation.API/Controllers/InfoController.cs
 create mode 100644 LeedsExperiment/Preservation/ImportSource.cs

diff --git a/LeedsExperiment/Dashboard/Views/ImportExport/ImportJob.cshtml b/LeedsExperiment/Dashboard/Views/ImportExport/ImportJob.cshtml
index 7013720..4a45f8c 100644
--- a/LeedsExperiment/Dashboard/Views/ImportExport/ImportJob.cshtml
+++ b/LeedsExperiment/Dashboard/Views/ImportExport/ImportJob.cshtml
@@ -102,7 +102,7 @@
- }
+
@@ -110,8 +110,8 @@
-
diff --git a/LeedsExperiment/Preservation.API/Controllers/ImportController.cs b/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
index a6abb3a..14fac4c 100644
--- a/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
+++ b/LeedsExperiment/Preservation.API/Controllers/ImportController.cs
@@ -1,6 +1,12 @@
 using Amazon.S3;
+using Amazon.S3.Model;
+using Amazon.S3.Util;
 using Fedora;
+using Fedora.Abstractions;
+using Fedora.Abstractions.Transfer;
+using Fedora.Storage;
 using Microsoft.AspNetCore.Mvc;
+using Microsoft.AspNetCore.StaticFiles;
 using Microsoft.Extensions.Options;
 
 namespace Preservation.API.Controllers;
@@ -13,6 +19,7 @@ public class ImportController : Controller
     private readonly IFedora fedora;
     private readonly PreservationApiOptions options;
     private IAmazonS3 s3Client;
+    private FileExtensionContentTypeProvider contentTypeProvider = new FileExtensionContentTypeProvider();
 
     public ImportController(
         IStorageMapper storageMapper,
@@ -27,15 +34,167 @@ IAmazonS3 awsS3Client
         this.s3Client = awsS3Client;
     }
 
+    class NameAndParentPath
+    {
+        public string Name { get; set; }
+        public string? ParentPath { get; set; }
+
+        public NameAndParentPath(string path)
+        {
+            var pathParts = path.Split(['/']);
+            Name = pathParts[^1];
+            if (pathParts.Length > 1)
+            {
+                ParentPath = path.Substring(0, path.Length - Name.Length - 1);
+            }
+        }
+    }
+
     [HttpGet(Name = "ImportJob")]
-    [Route("{*path}")]
-    public async Task<ImportJob?> GetImportJob([FromRoute] string path, [FromQuery] string? source)
+    [Route("{*archivalGroupPath}")]
+    public async Task<ImportJob?> GetImportJob([FromRoute] string archivalGroupPath, [FromQuery] string source)
     {
-        // build an import job, with checksums etc, set diffversion, time it.
-        // compare the source with the object and build the diff properties.
-        return null;
+        var agUri = fedora.GetUri(archivalGroupPath);
+        ArchivalGroup? archivalGroup;
+        var diffStart = DateTime.Now;
+        var info = await fedora.GetResourceInfo(agUri);
+        if(info.Exists && info.Type == nameof(ArchivalGroup))
+        {
+            archivalGroup = await fedora.GetPopulatedArchivalGroup(agUri);
+        }
+        else if (info.StatusCode == 404) // HTTP leakage
+        {
+            archivalGroup = null;
+            // it doesn't exist - but we still need to check that:
+            // - it has an immediate parent container
+            // - that container is not itself an archival group or part of one
+            var npp = new NameAndParentPath(archivalGroupPath);
+            if(npp.ParentPath == null)
+            {
+                throw new InvalidOperationException($"No parent object for {archivalGroupPath}");
+            }
+            var parentInfo = await fedora.GetObject(npp.ParentPath);
+            if (parentInfo == null)
+            {
+                throw new InvalidOperationException($"No parent object for {archivalGroupPath}");
+            }
+            if(parentInfo.Type == nameof(ArchivalGroup))
+            {
+                throw new InvalidOperationException($"The parent of {archivalGroupPath} is an Archival Group");
+            }
+            if(parentInfo.PartOf != null)
+            {
+                throw new InvalidOperationException($"{archivalGroupPath} is already part of an Archival Group");
+            }
+            if (parentInfo.Type != nameof(Container))
+            {
+                throw new InvalidOperationException($"The parent of {archivalGroupPath} is not a container");
+            }
+        }
+        else
+        {
+            throw new InvalidOperationException($"Cannot create {archivalGroupPath} for {info.Type}, status: {info.StatusCode}");
+        }
+
+        // This is either an existing Archival Group, or a 404 where the immediate parent is a Container that is not itself part of an Archival Group.
+        // So now evaluate the source:
+        var importSource = await GetImportSource(source, agUri);
+        var importJob = new ImportJob
+        {
+            ArchivalGroupUri = agUri,
+            StorageType = StorageTypes.S3, // all we support for now
+            ArchivalGroupPath = archivalGroupPath,
+            Source = source,
+            DiffStart = diffStart
+        };
+        if(archivalGroup == null)
+        {
+            // This is a new object
+            importJob.ContainersToAdd = importSource.Containers;
+            importJob.FilesToAdd = importSource.Files;
+        }
+        else
+        {
+            importJob.ArchivalGroupName = archivalGroup.Name;
+            importJob.IsUpdate = true;
+            importJob.DiffVersion = archivalGroup.Version;
+            PopulateDiffTasks(archivalGroup, importSource, importJob);
+        }
+        importJob.DiffEnd = DateTime.Now;
+        return importJob;
     }
 
+    private void PopulateDiffTasks(ArchivalGroup archivalGroup, ImportSource importSource, ImportJob importJob)
+    {
+        // What's the best way to diff?
+        // This is very crude and can't spot a container being renamed
+        var allExistingContainers = new List<ContainerDirectory>();
+        var allExistingFiles = new List<BinaryFile>();
+        TraverseContainers(archivalGroup, allExistingContainers, allExistingFiles, archivalGroup);
+
+        importJob.FilesToAdd.AddRange(importSource.Files.Where(
+            importFile => !allExistingFiles.Exists(existingFile => existingFile.Path == importFile.Path)));
+
+        importJob.FilesToDelete.AddRange(allExistingFiles.Where(
+            existingFile => !importSource.Files.Exists(importFile => importFile.Path == existingFile.Path)));
+
+        foreach(var importFile in importSource.Files.Where(
+            importFile => !importJob.FilesToAdd.Exists(f => f.Path == importFile.Path)))
+        {
+            // files not already put in FilesToAdd
+            var existingFile = allExistingFiles.Single(existingFile => existingFile.Path == importFile.Path);
+            if(string.IsNullOrEmpty(existingFile.Digest) || string.IsNullOrEmpty(importFile.Digest))
+            {
+                throw new Exception("Missing digest in diff operation for " + existingFile.Path);
+            }
+            if(existingFile.Digest != importFile.Digest)
+            {
+                importJob.FilesToPatch.Add(importFile);
+            }
+        }
+
+        importJob.ContainersToAdd.AddRange(importSource.Containers.Where(
+            importContainer => !allExistingContainers.Exists(existingContainer => existingContainer.Path == importContainer.Path)));
+
+        importJob.ContainersToDelete.AddRange(allExistingContainers.Where(
+            existingContainer => !importSource.Containers.Exists(importContainer => importContainer.Path == existingContainer.Path)));
+
+        // Later we will also need patch ops on container (for data)
+        // and patch ops on file for metadata as well as digest difference as above.
+    }
+
+    private static void TraverseContainers(
+        ArchivalGroup archivalGroup,
+        List<ContainerDirectory> allExistingContainers,
+        List<BinaryFile> allExistingFiles,
+        Container traverseContainer)
+    {
+        foreach (Container container in traverseContainer.Containers)
+        {
+            allExistingContainers.Add(new ContainerDirectory
+            {
+                Name = container.Name!,
+                Parent = archivalGroup.Location!,
+                Path = container.ObjectPath!.Remove(0, archivalGroup.ObjectPath!.Length + 1)
+            });
+            TraverseContainers(archivalGroup, allExistingContainers, allExistingFiles, container);
+        }
+        foreach (Binary binary in traverseContainer.Binaries)
+        {
+            allExistingFiles.Add(new BinaryFile
+            {
+                Name = binary.Name!,
+                Parent = archivalGroup.Location!,
+                Path = binary.ObjectPath!.Remove(0, archivalGroup.ObjectPath!.Length + 1),
+                ContentType = binary.ContentType ?? string.Empty,
+                StorageType = StorageTypes.S3, // shouldn't have to hard code that here, but Binary doesn't have that prop
+                Digest = binary.Digest,
+                FileName = binary.FileName!,
+                ExternalLocation = binary.Origin ?? string.Empty // we won't use this because it's the destination
+            });
+        }
+    }
+
     [HttpPost(Name = "ExecuteImport")]
     [Route("__import")]
@@ -45,6 +204,72 @@ IAmazonS3 awsS3Client
         // keep a log of the updates (populate the *added props)
         // get the AG again, see the version, validate it's one on etc
         // return the import job
-        return null;
+        throw new NotImplementedException();
+    }
+
+
+    private async Task<ImportSource> GetImportSource(string source, Uri intendedParent)
+    {
+        // Move this behind an interface later
+        // This will currently break if the source is not an s3 Uri to which we have access
+        // but later could be a file path etc, a scratch upload location, whatever
+        var s3Uri = new AmazonS3Uri(source);
+        // we assume this is the root. We also assume that we are not going to hit the AWS limit (1000?)
+        // https://docs.aws.amazon.com/sdkfornet1/latest/apidocs/html/M_Amazon_S3_AmazonS3_ListObjects.htm
+        // ^^ for paging
+        // We can't learn anything about containers this way other than that there are slugs in path
+        // We can't learn anything about intended name (dc:title) from this, but that's OK for now
+        // That kind of data should be in METS files; we can enhance the ImportJob with it later in a real world application
+        var listObjectsReq = new ListObjectsRequest()
+        {
+            BucketName = s3Uri.Bucket,
+            Prefix = $"{s3Uri.Key.TrimEnd('/')}/"
+        };
+        var importSource = new ImportSource();
+        var response = await s3Client.ListObjectsAsync(listObjectsReq);
+        var containerPaths = new HashSet<string>();
+        foreach (S3Object obj in response.S3Objects)
+        {
+            var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
+            var sha512Digest = Checksum.Sha512FromStream(s3Stream);
+            var sourcePath = obj.Key.Remove(0, listObjectsReq.Prefix.Length);
+            var nameAndParentPath = new NameAndParentPath(sourcePath);
+            if(nameAndParentPath.ParentPath != null)
+            {
+                containerPaths.Add(nameAndParentPath.ParentPath);
+            }
+            importSource.Files.Add(new BinaryFile
+            {
+                Name = nameAndParentPath.Name,
+                FileName = nameAndParentPath.Name,
+                Parent = intendedParent,
+                Path = sourcePath,
+                StorageType = StorageTypes.S3,
+                ExternalLocation = $"s3://{obj.BucketName}/{obj.Key}",
+                Digest = sha512Digest,
+                ContentType = GetDefaultContentType(nameAndParentPath.Name) // we may overwrite this later, e.g., from PREMIS data
+            });
+        }
+        foreach(string containerPath in containerPaths)
+        {
+            var nameAndParentPath = new NameAndParentPath(containerPath);
+            importSource.Containers.Add(new ContainerDirectory
+            {
+                Name = nameAndParentPath.Name,
+                Parent = intendedParent,
+                Path = containerPath
+            });
+        }
+        return importSource;
+    }
+
+    private string GetDefaultContentType(string path)
+    {
+        const string DefaultContentType = "application/octet-stream";
+        if (!contentTypeProvider.TryGetContentType(path, out string? contentType))
+        {
+            contentType = DefaultContentType;
+        }
+        return contentType;
     }
 }
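
Note on the paging caveat flagged in GetImportSource above: ListObjects returns at most
1,000 keys per call, so a large source tree would be silently truncated. A minimal sketch
of a page-safe listing loop using the AWS SDK for .NET's ListObjectsV2 types (s3Uri and
s3Client as in the method above; the per-object handling would stay exactly as in the patch):

    using Amazon.S3;
    using Amazon.S3.Model;

    // Follow continuation tokens so prefixes with more than one page
    // (1,000 keys) of objects are fully enumerated.
    var request = new ListObjectsV2Request
    {
        BucketName = s3Uri.Bucket,
        Prefix = $"{s3Uri.Key.TrimEnd('/')}/"
    };
    ListObjectsV2Response response;
    do
    {
        response = await s3Client.ListObjectsV2Async(request);
        foreach (S3Object obj in response.S3Objects)
        {
            // ... same per-object handling as in GetImportSource ...
        }
        request.ContinuationToken = response.NextContinuationToken;
    } while (response.IsTruncated);
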
diff --git a/LeedsExperiment/Preservation.API/Controllers/InfoController.cs b/LeedsExperiment/Preservation.API/Controllers/InfoController.cs
new file mode 100644
index 0000000..f2abcd1
--- /dev/null
+++ b/LeedsExperiment/Preservation.API/Controllers/InfoController.cs
@@ -0,0 +1,30 @@
+using Fedora;
+using Fedora.Abstractions;
+using Microsoft.AspNetCore.Mvc;
+
+namespace Preservation.API.Controllers
+{
+    [Route("api/[controller]")]
+    [ApiController]
+    public class InfoController : Controller
+    {
+        private readonly IFedora fedora;
+
+        public InfoController(IFedora fedora)
+        {
+            this.fedora = fedora;
+        }
+
+
+        [HttpGet(Name = "GetInfo")]
+        [Route("{*path}", Order = 1)]
+        public async Task<ActionResult<ResourceInfo>> Info(
+            [FromRoute] string path)
+        {
+            var uri = fedora.GetUri(path);
+            var info = await fedora.GetResourceInfo(uri);
+            return info;
+        }
+    }
+
+}
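
For reference, resource info now answers at api/info/{path} rather than
api/repository/__info/{path} (see the RepositoryController change in the next file).
A client-side sketch, assuming a project reference for the ResourceInfo type; the base
address and repository path are hypothetical:

    using System.Net.Http.Json;

    // Fetch ResourceInfo from the relocated endpoint.
    using var http = new HttpClient { BaseAddress = new Uri("https://localhost:7169/") };
    var info = await http.GetFromJsonAsync<ResourceInfo>("api/info/path/to/object");
    Console.WriteLine($"{info?.Type}, exists: {info?.Exists}, status: {info?.StatusCode}");
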
diff --git a/LeedsExperiment/Preservation.API/Controllers/RepositoryController.cs b/LeedsExperiment/Preservation.API/Controllers/RepositoryController.cs
index 01c9894..6520f33 100644
--- a/LeedsExperiment/Preservation.API/Controllers/RepositoryController.cs
+++ b/LeedsExperiment/Preservation.API/Controllers/RepositoryController.cs
@@ -15,18 +15,8 @@ public RepositoryController(IFedora fedora)
         this.fedora = fedora;
     }
 
-    [HttpGet(Name = "GetInfo")]
-    [Route("__info/{*path}")]
-    public async Task<ActionResult<ResourceInfo>> Info(
-        [FromRoute] string path)
-    {
-        var uri = fedora.GetUri(path);
-        var info = await fedora.GetResourceInfo(uri);
-        return info;
-    }
-
     [HttpGet(Name = "Browse")]
-    [Route("{*path}")]
+    [Route("{*path}", Order = 2)]
     public async Task<ActionResult<Resource?>> Index(
         [FromRoute] string? path = null)
     {
diff --git a/LeedsExperiment/Preservation/Checksum.cs b/LeedsExperiment/Preservation/Checksum.cs
index 76e3d96..62f27f2 100644
--- a/LeedsExperiment/Preservation/Checksum.cs
+++ b/LeedsExperiment/Preservation/Checksum.cs
@@ -9,7 +9,10 @@ public class Checksum
     {
         try
         {
-            openedStream.Position = 0;
+            if (openedStream.CanSeek)
+            {
+                openedStream.Position = 0;
+            }
             // Compute the hash of the fileStream.
             byte[] hashValue = hashAlgorithm.ComputeHash(openedStream);
             // Write the name and hash value of the file to the console.
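
The CanSeek guard above matters because GetImportSource now hands Checksum a raw S3
response stream, which is forward-only; setting Position on it throws
NotSupportedException. The guarded pattern in isolation, as a self-contained sketch:

    using System.IO;
    using System.Security.Cryptography;

    static byte[] Hash(Stream openedStream, HashAlgorithm hashAlgorithm)
    {
        // Rewind only when the stream supports seeking; network streams
        // such as S3 response streams are forward-only and would throw.
        if (openedStream.CanSeek)
        {
            openedStream.Position = 0;
        }
        return hashAlgorithm.ComputeHash(openedStream);
    }

    // A seekable stream is rewound before hashing; a forward-only stream
    // is hashed from its current position without touching Position.
    using var sha512 = SHA512.Create();
    using var seekable = new MemoryStream(new byte[] { 1, 2, 3 });
    seekable.Position = 3; // deliberately mis-positioned
    Console.WriteLine(Convert.ToHexString(Hash(seekable, sha512)));
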
diff --git a/LeedsExperiment/Preservation/FedoraWrapper.cs b/LeedsExperiment/Preservation/FedoraWrapper.cs
index 2dd05af..a73895d 100644
--- a/LeedsExperiment/Preservation/FedoraWrapper.cs
+++ b/LeedsExperiment/Preservation/FedoraWrapper.cs
@@ -230,6 +230,8 @@ private async void EnsureChecksum(BinaryFile binaryFile)
                 // could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
                 // for now just read it twice.
                 // Later we'll get the sha256 checksum from metadata
+                // Or the MD5 from eTag?
+                // BEWARE that multipart uploads will not have the MD5 as the eTag.
                 break;
             default:
                 throw new InvalidOperationException("Unknown storage type " + binaryFile.StorageType);
@@ -493,6 +495,7 @@ private HttpRequestMessage MakeHttpRequestMessage(Uri uri, HttpMethod method)
 
     public async Task<Resource?> GetObject(Uri uri, Transaction? transaction = null)
     {
         // Make a head request to see what this is
+        var info = await GetResourceInfo(uri);
         if (info.Type == nameof(ArchivalGroup))
         {
@@ -587,11 +590,11 @@ public async Task<ResourceInfo> GetResourceInfo(Uri uri)
         {
             result.Type = nameof(ArchivalGroup);
         }
-        if (headResponse.HasBinaryTypeHeader())
+        else if (headResponse.HasBinaryTypeHeader())
         {
             result.Type = nameof(Binary);
         }
-        if (headResponse.HasBasicContainerTypeHeader())
+        else if (headResponse.HasBasicContainerTypeHeader())
         {
             result.Type = nameof(Container);
         }
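
On the new eTag comment in EnsureChecksum: an S3 ETag equals the content MD5 only for
single-part, non-KMS-encrypted uploads; multipart ETags carry a "-partCount" suffix and
are not content digests. If that shortcut is ever taken, a guard along these lines would
be needed (hypothetical helper, not part of this patch):

    // Returns the hex MD5 if the ETag can be trusted as one, otherwise null.
    // Multipart ETags look like "9b2cf535f27731c974343645a3985328-4".
    static string? Md5FromETag(string? eTag)
    {
        if (string.IsNullOrEmpty(eTag)) return null;
        var trimmed = eTag.Trim('"');
        return trimmed.Contains('-') ? null : trimmed;
    }

    Console.WriteLine(Md5FromETag("\"9b2cf535f27731c974343645a3985328\"") ?? "untrusted");
    Console.WriteLine(Md5FromETag("\"9b2cf535f27731c974343645a3985328-4\"") ?? "untrusted");
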
diff --git a/LeedsExperiment/Preservation/IPreservation.cs b/LeedsExperiment/Preservation/IPreservation.cs
index 48b42aa..65c4616 100644
--- a/LeedsExperiment/Preservation/IPreservation.cs
+++ b/LeedsExperiment/Preservation/IPreservation.cs
@@ -33,10 +33,10 @@ public interface IPreservation
     /// The reason to split is to allow the operator (a human user, or software) to see the diff - to verify that the job is what
     /// was intended or expected.
     /// </summary>
-    /// <param name="path"></param>
+    /// <param name="archivalGroupPath"></param>
     /// <param name="source"></param>
     /// <returns>A partially populated ImportJob</returns>
-    Task<ImportJob?> GetUpdateJob(string path, string source);
+    Task<ImportJob?> GetUpdateJob(string archivalGroupPath, string source);
 
     /// <summary>
     /// "Execute" the update job obtained above.
diff --git a/LeedsExperiment/Preservation/ImportSource.cs b/LeedsExperiment/Preservation/ImportSource.cs
new file mode 100644
index 0000000..4713d0c
--- /dev/null
+++ b/LeedsExperiment/Preservation/ImportSource.cs
@@ -0,0 +1,18 @@
+using Fedora.Abstractions.Transfer;
+using System.Text.Json.Serialization;
+
+namespace Preservation;
+
+public class ImportSource
+{
+    [JsonPropertyName("containers")]
+    [JsonPropertyOrder(1)]
+    public List<ContainerDirectory> Containers { get; set; } = [];
+
+    /// <summary>
+    /// Fedora binaries that need to be created to synchronise the Archival Group object with the source
+    /// </summary>
+    [JsonPropertyName("files")]
+    [JsonPropertyOrder(2)]
+    public List<BinaryFile> Files { get; set; } = [];
+}
diff --git a/LeedsExperiment/PreservationApiClient/PreservationService.cs b/LeedsExperiment/PreservationApiClient/PreservationService.cs
index 477eb3c..85f6d77 100644
--- a/LeedsExperiment/PreservationApiClient/PreservationService.cs
+++ b/LeedsExperiment/PreservationApiClient/PreservationService.cs
@@ -11,6 +11,7 @@ public class PreservationService : IPreservation
 
     // This is horrible and wrong
     private const string repositoryPrefix = "api/repository/";
+    private const string infoPrefix = "api/info/";
    private const string agPrefix = "api/archivalGroup/";
     private const string exportPrefix = "api/export/";
     private const string importPrefix = "api/import/";
@@ -72,7 +73,7 @@ public string GetInternalPath(Uri preservationApiUri)
     public async Task<ResourceInfo> GetResourceInfo(string path)
     {
-        var apiPath = $"{repositoryPrefix}__info/{path.TrimStart('/')}";
+        var apiPath = $"{infoPrefix}{path.TrimStart('/')}";
         var infoApi = new Uri(apiPath, UriKind.Relative);
         var info = await _httpClient.GetFromJsonAsync<ResourceInfo>(infoApi);
         return info!;
     }
@@ -98,9 +99,9 @@ public async Task Export(string path, string? version)
         throw new InvalidOperationException("Could not get an export object back");
     }
 
-    public async Task<ImportJob?> GetUpdateJob(string path, string source)
+    public async Task<ImportJob?> GetUpdateJob(string archivalGroupPath, string source)
     {
-        var apiPath = $"{importPrefix}{path.TrimStart('/')}?source={source}";
+        var apiPath = $"{importPrefix}{archivalGroupPath.TrimStart('/')}?source={source}";
         var importApi = new Uri(apiPath, UriKind.Relative);
         var importJob = await _httpClient.GetFromJsonAsync<ImportJob>(importApi);
         // What's the best way to deal with problems here?
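
Given the attributes on ImportSource above, System.Text.Json will emit "containers"
before "files". A serialization sketch using only members that appear in this patch;
the URI and paths are hypothetical:

    using System.Text.Json;
    using Fedora.Abstractions.Transfer;
    using Preservation;

    var source = new ImportSource();
    source.Containers.Add(new ContainerDirectory
    {
        Name = "images",
        Parent = new Uri("https://fedora.example/objects/ag1"),
        Path = "images"
    });
    // -> {"containers":[{...}],"files":[]} - "containers" first, per JsonPropertyOrder
    Console.WriteLine(JsonSerializer.Serialize(source));
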
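End to end, the split described in IPreservation plays out as: fetch the diff-style
ImportJob, let an operator approve it, then execute. A caller sketch; the paths, bucket
and GetPreservationService factory are hypothetical, and execution itself is still a
NotImplementedException on the server side:

    // Inspect an ImportJob before committing to it.
    IPreservation preservation = GetPreservationService(); // hypothetical factory
    var job = await preservation.GetUpdateJob(
        "path/to/archival-group", "s3://staging-bucket/source-folder");
    Console.WriteLine($"update of existing group: {job?.IsUpdate}");
    Console.WriteLine("files to add/patch/delete: " +
        $"{job?.FilesToAdd.Count}/{job?.FilesToPatch.Count}/{job?.FilesToDelete.Count}");
    // An operator (human or policy) reviews these lists before the job is
    // posted to the __import endpoint.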