Import Job
tomcrane committed Jan 21, 2024
1 parent 8ab9960 commit 52a5a64
Showing 9 changed files with 298 additions and 28 deletions.
6 changes: 3 additions & 3 deletions LeedsExperiment/Dashboard/Views/ImportExport/ImportJob.cshtml
@@ -102,16 +102,16 @@
</tr>
</tbody>
</table>
}


<form method="post" action="/import/@(Model.Path)">

<button type="submit" class="btn btn-primary">Submit</button>

<div class="mb-3">
<label for="importJob" class="form-label">Raw JSON of the importJob</label>
- <textarea class="form-control" id="importJob" rows="3" name="importJobString">
- @JsonSerializer.Serialize(Model.ImportJob)
+ <textarea class="form-control" id="importJob" rows="3" name="importJobString" style="height:50vh; font-family:monospace">
+ @JsonSerializer.Serialize(Model.ImportJob, new JsonSerializerOptions{WriteIndented=true})
</textarea>
</div>
</form>
237 changes: 231 additions & 6 deletions LeedsExperiment/Preservation.API/Controllers/ImportController.cs
@@ -1,6 +1,12 @@
using Amazon.S3;
using Amazon.S3.Model;
using Amazon.S3.Util;
using Fedora;
using Fedora.Abstractions;
using Fedora.Abstractions.Transfer;
using Fedora.Storage;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.StaticFiles;
using Microsoft.Extensions.Options;

namespace Preservation.API.Controllers;
@@ -13,6 +19,7 @@ public class ImportController : Controller
private readonly IFedora fedora;
private readonly PreservationApiOptions options;
private IAmazonS3 s3Client;
private FileExtensionContentTypeProvider contentTypeProvider = new FileExtensionContentTypeProvider();

public ImportController(
IStorageMapper storageMapper,
Expand All @@ -27,15 +34,167 @@ IAmazonS3 awsS3Client
this.s3Client = awsS3Client;
}

class NameAndParentPath
{
public string Name { get; set; }
public string? ParentPath { get; set; }

public NameAndParentPath(string path)
{
var pathParts = path.Split(['/']);
Name = pathParts[^1];
if (pathParts.Length > 1)
{
ParentPath = path.Substring(0, path.Length - Name.Length - 1);
}
}
}
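// Illustrative (hypothetical paths): NameAndParentPath splits on the final '/':
//   new NameAndParentPath("objects/diamond/images/file.tiff") => Name "file.tiff", ParentPath "objects/diamond/images"
//   new NameAndParentPath("objects")                          => Name "objects", ParentPath null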

[HttpGet(Name = "ImportJob")]
- [Route("{*path}")]
- public async Task<ImportJob?> GetImportJob([FromRoute] string path, [FromQuery] string? source)
+ [Route("{*archivalGroupPath}")]
+ public async Task<ImportJob?> GetImportJob([FromRoute] string archivalGroupPath, [FromQuery] string source)
{
// Build an import job with checksums etc., set DiffVersion, and time the diff;
// compare the source with the stored object and populate the diff properties.
- return null;
var agUri = fedora.GetUri(archivalGroupPath);
ArchivalGroup? archivalGroup;
var diffStart = DateTime.Now;
var info = await fedora.GetResourceInfo(agUri);
if(info.Exists && info.Type == nameof(ArchivalGroup))
{
archivalGroup = await fedora.GetPopulatedArchivalGroup(agUri);
}
else if (info.StatusCode == 404) // HTTP leakage
{
archivalGroup = null;
// it doesn't exist - but we still need to check that:
// - it has an immediate parent container
// - that container is not itself an archival group or part of one
var npp = new NameAndParentPath(archivalGroupPath);
if(npp.ParentPath == null)
{
throw new InvalidOperationException($"No parent object for {archivalGroupPath}");
}
var parentInfo = await fedora.GetObject(npp.ParentPath);
if (parentInfo == null)
{
throw new InvalidOperationException($"No parent object for {archivalGroupPath}");
}
if(parentInfo.Type == nameof(ArchivalGroup))
{
throw new InvalidOperationException($"The parent of {archivalGroupPath} is an Archival Group");
}
if(parentInfo.PartOf != null)
{
throw new InvalidOperationException($"{archivalGroupPath} is already part of an Archival Group");
}
if (parentInfo.Type != nameof(Container))
{
throw new InvalidOperationException($"The parent of {archivalGroupPath} is not a container");
}
}
else
{
throw new InvalidOperationException($"Cannot create {archivalGroupPath} for {info.Type}, status: {info.StatusCode}");
}

// This is either an existing Archival Group, or a 404 where the immediate parent is a Container that is not itself part of an Archival Group.
// So now evaluate the source:

var importSource = await GetImportSource(source, agUri);
var importJob = new ImportJob
{
ArchivalGroupUri = agUri,
StorageType = StorageTypes.S3, // all we support for now
ArchivalGroupPath = archivalGroupPath,
Source = source,
DiffStart = diffStart
};
if(archivalGroup == null)
{
// This is a new object
importJob.ContainersToAdd = importSource.Containers;
importJob.FilesToAdd = importSource.Files;
}
else
{
importJob.ArchivalGroupName = archivalGroup.Name;
importJob.IsUpdate = true;
importJob.DiffVersion = archivalGroup.Version;
PopulateDiffTasks(archivalGroup, importSource, importJob);
}
importJob.DiffEnd = DateTime.Now;
return importJob;
}

private void PopulateDiffTasks(ArchivalGroup archivalGroup, ImportSource importSource, ImportJob importJob)
{
// What's the best way to diff?
// This is very crude and can't spot a container being renamed
var allExistingContainers = new List<ContainerDirectory>();
var allExistingFiles = new List<BinaryFile>();
TraverseContainers(archivalGroup, allExistingContainers, allExistingFiles, archivalGroup);

importJob.FilesToAdd.AddRange(importSource.Files.Where(
importFile => !allExistingFiles.Exists(existingFile => existingFile.Path == importFile.Path)));

importJob.FilesToDelete.AddRange(allExistingFiles.Where(
existingFile => !importSource.Files.Exists(importFile => importFile.Path == existingFile.Path)));

foreach(var importFile in importSource.Files.Where(
importFile => !importJob.FilesToAdd.Exists(f => f.Path == importFile.Path)))
{
// files not already put in FilesToAdd
var existingFile = allExistingFiles.Single(existingFile => existingFile.Path == importFile.Path);
if(string.IsNullOrEmpty(existingFile.Digest) || string.IsNullOrEmpty(importFile.Digest))
{
throw new Exception("Missing digest in diff operation for " + existingFile.Path);
}
if(existingFile.Digest != importFile.Digest)
{
importJob.FilesToPatch.Add(importFile);
}
}

importJob.ContainersToAdd.AddRange(importSource.Containers.Where(
importContainer => !allExistingContainers.Exists(existingContainer => existingContainer.Path == importContainer.Path)));

importJob.ContainersToDelete.AddRange(allExistingContainers.Where(
existingContainer => !importSource.Containers.Exists(importContainer => importContainer.Path == existingContainer.Path)));

// Later we will also need patch ops on container (for data)
// and patch ops on file for metadata as well as digest difference as above.
}

private static void TraverseContainers(
ArchivalGroup archivalGroup,
List<ContainerDirectory> allExistingContainers,
List<BinaryFile> allExistingFiles,
Container traverseContainer)
{
foreach (Container container in traverseContainer.Containers)
{
allExistingContainers.Add(new ContainerDirectory
{
Name = container.Name!,
Parent = archivalGroup.Location!,
Path = container.ObjectPath!.Remove(0, archivalGroup.ObjectPath!.Length + 1)
});
TraverseContainers(archivalGroup, allExistingContainers, allExistingFiles, container);
}
foreach (Binary binary in traverseContainer.Binaries)
{
allExistingFiles.Add(new BinaryFile
{
Name = binary.Name!,
Parent = archivalGroup.Location!,
Path = binary.ObjectPath!.Remove(0, archivalGroup.ObjectPath!.Length + 1),
ContentType = binary.ContentType ?? string.Empty,
StorageType = StorageTypes.S3, // shouldn't have to hard code that here, but Binary doesn't have that prop
Digest = binary.Digest,
FileName = binary.FileName!,
ExternalLocation = binary.Origin ?? string.Empty // we won't use this because it's the destination
});
}
}
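// For example (hypothetical paths): with the Archival Group at "objects/diamond",
// a binary whose ObjectPath is "objects/diamond/images/file.tiff" is flattened to
// the relative Path "images/file.tiff" (ObjectPath minus the AG prefix and its slash).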

[HttpPost(Name = "ExecuteImport")]
[Route("__import")]
@@ -45,6 +204,72 @@ IAmazonS3 awsS3Client
// keep a log of the updates (populate the *added props)
// get the AG again, check the version, validate that it's one on from before, etc.
// return the import job
- return null;
+ throw new NotImplementedException();
}


private async Task<ImportSource> GetImportSource(string source, Uri intendedParent)
{
// Move this behind an interface later
// This will currently break if the source is not an s3 Uri to which we have access
// but later could be a file path etc, a scratch upload location, whatever
var s3Uri = new AmazonS3Uri(source);
// we assume this is the root. We also assume that we won't hit the AWS page limit (1000 keys per ListObjects response)
// https://docs.aws.amazon.com/sdkfornet1/latest/apidocs/html/M_Amazon_S3_AmazonS3_ListObjects.htm
// ^^ for paging
// We can't learn anything about containers this way, other than the slugs in the key paths
// We can't learn anything about intended name (dc:title) from this, but that's OK for now
// That kind of data should be in METS files; we can enhance the ImportJob with it later in a real world application
var listObjectsReq = new ListObjectsRequest()
{
BucketName = s3Uri.Bucket,
Prefix = $"{s3Uri.Key.TrimEnd('/')}/"
};
var importSource = new ImportSource();
var response = await s3Client.ListObjectsAsync(listObjectsReq);
var containerPaths = new HashSet<string>();
foreach (S3Object obj in response.S3Objects)
{
var s3Stream = await s3Client!.GetObjectStreamAsync(obj.BucketName, obj.Key, null);
var sha512Digest = Checksum.Sha512FromStream(s3Stream);
var sourcePath = obj.Key.Remove(0, listObjectsReq.Prefix.Length);
var nameAndParentPath = new NameAndParentPath(sourcePath);
if(nameAndParentPath.ParentPath != null)
{
containerPaths.Add(nameAndParentPath.ParentPath);
}
importSource.Files.Add(new BinaryFile
{
Name = nameAndParentPath.Name,
FileName = nameAndParentPath.Name,
Parent = intendedParent,
Path = sourcePath,
StorageType = StorageTypes.S3,
ExternalLocation = $"s3://{obj.BucketName}/{obj.Key}",
Digest = sha512Digest,
ContentType = GetDefaultContentType(nameAndParentPath.Name) // we may overwrite this later, e.g., from PREMIS data
});
}
foreach(string containerPath in containerPaths)
{
var nameAndParentPath = new NameAndParentPath(containerPath);
importSource.Containers.Add(new ContainerDirectory
{
Name = nameAndParentPath.Name,
Parent = intendedParent,
Path = containerPath
});
}
return importSource;
}
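// The listing above fetches a single page; ListObjects returns at most 1000 keys
// per call. A minimal sketch of paging with the V2 API (this helper is a
// hypothetical addition, not part of the commit; it uses only AWSSDK.S3 calls):
private async Task<List<S3Object>> ListAllObjects(string bucket, string prefix)
{
    var all = new List<S3Object>();
    var request = new ListObjectsV2Request { BucketName = bucket, Prefix = prefix };
    ListObjectsV2Response response;
    do
    {
        response = await s3Client.ListObjectsV2Async(request);
        all.AddRange(response.S3Objects);
        // continue from where the previous page ended
        request.ContinuationToken = response.NextContinuationToken;
    } while (response.IsTruncated == true);
    return all;
}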

private string GetDefaultContentType(string path)
{
const string DefaultContentType = "application/octet-stream";
if (!contentTypeProvider.TryGetContentType(path, out string? contentType))
{
contentType = DefaultContentType;
}
return contentType;
}
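// e.g. GetDefaultContentType("report.pdf") => "application/pdf" (provider's default mappings);
//      GetDefaultContentType("file.xyz")   => "application/octet-stream" (no mapping)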
}
30 changes: 30 additions & 0 deletions LeedsExperiment/Preservation.API/Controllers/InfoController.cs
@@ -0,0 +1,30 @@
using Fedora;
using Fedora.Abstractions;
using Microsoft.AspNetCore.Mvc;

namespace Preservation.API.Controllers
{
[Route("api/[controller]")]
[ApiController]
public class InfoController : Controller
{
private readonly IFedora fedora;

public InfoController(IFedora fedora)
{
this.fedora = fedora;
}


[HttpGet(Name = "GetInfo")]
[Route("{*path}", Order = 1)]
public async Task<ActionResult<ResourceInfo?>> Info(
[FromRoute] string path)
{
var uri = fedora.GetUri(path);
var info = await fedora.GetResourceInfo(uri);
return info;
}
}

}
LeedsExperiment/Preservation.API/Controllers/RepositoryController.cs
@@ -15,18 +15,8 @@ public RepositoryController(IFedora fedora)
this.fedora = fedora;
}

- [HttpGet(Name = "GetInfo")]
- [Route("__info/{*path}")]
- public async Task<ActionResult<ResourceInfo?>> Info(
-     [FromRoute] string path)
- {
-     var uri = fedora.GetUri(path);
-     var info = await fedora.GetResourceInfo(uri);
-     return info;
- }

[HttpGet(Name = "Browse")]
- [Route("{*path}")]
+ [Route("{*path}", Order = 2)]
public async Task<ActionResult<Resource?>> Index(
[FromRoute] string? path = null)
{
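(On the Order values: ASP.NET Core attribute routing uses Order to break ties between otherwise ambiguous templates, with lower values matched first, so the new InfoController catch-all at Order 1 is tried before this Browse catch-all at Order 2.)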
5 changes: 4 additions & 1 deletion LeedsExperiment/Preservation/Checksum.cs
@@ -9,7 +9,10 @@ public class Checksum
{
try
{
- openedStream.Position = 0;
+ if (openedStream.CanSeek)
+ {
+     openedStream.Position = 0;
+ }
// Compute the hash of the fileStream.
byte[] hashValue = hashAlgorithm.ComputeHash(openedStream);
// Write the name and hash value of the file to the console.
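For context, the Checksum.Sha512FromStream helper called from GetImportSource above presumably wraps the hashing block in this hunk. A minimal sketch, assuming the method is static, hex-encodes the hash, and returns null on failure; the shape outside the diffed lines is conjecture:

using System.Security.Cryptography;

public static string? Sha512FromStream(Stream openedStream)
{
    using var hashAlgorithm = SHA512.Create();
    try
    {
        if (openedStream.CanSeek)
        {
            openedStream.Position = 0; // rewind only when supported; S3 response streams are not seekable
        }
        // Compute the hash of the stream
        byte[] hashValue = hashAlgorithm.ComputeHash(openedStream);
        return Convert.ToHexString(hashValue).ToLowerInvariant();
    }
    catch (IOException)
    {
        return null;
    }
}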
7 changes: 5 additions & 2 deletions LeedsExperiment/Preservation/FedoraWrapper.cs
@@ -230,6 +230,8 @@ private async void EnsureChecksum(BinaryFile binaryFile)
// could get a byte array here and then pass it along eventually to MakeBinaryPutOrPost
// for now just read it twice.
// Later we'll get the sha256 checksum from metadata
// Or the MD5 from eTag?
// BEWARE that multipart uploads will not have the MD5 as the eTag.
break;
default:
throw new InvalidOperationException("Unknown storage type " + binaryFile.StorageType);
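// On the eTag caveat above: for a multipart upload, S3 sets the eTag to a hash of
// the part hashes plus a "-<partCount>" suffix, not an MD5 of the whole object.
// A minimal, hypothetical check (not part of this commit):
//   static bool ETagIsLikelyMd5(string eTag) => !eTag.Trim('"').Contains('-');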
@@ -493,6 +495,7 @@ private HttpRequestMessage MakeHttpRequestMessage(Uri uri, HttpMethod method)
public async Task<Resource?> GetObject(Uri uri, Transaction? transaction = null)
{
// Make a head request to see what this is

var info = await GetResourceInfo(uri);
if (info.Type == nameof(ArchivalGroup))
{
@@ -587,11 +590,11 @@ public async Task<ResourceInfo> GetResourceInfo(Uri uri)
{
result.Type = nameof(ArchivalGroup);
}
- if (headResponse.HasBinaryTypeHeader())
+ else if (headResponse.HasBinaryTypeHeader())
{
result.Type = nameof(Binary);
}
- if (headResponse.HasBasicContainerTypeHeader())
+ else if (headResponse.HasBasicContainerTypeHeader())
{
result.Type = nameof(Container);
}
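(The switch from independent ifs to else-if matters because a Fedora Archival Group is itself a container: its HEAD response can carry more than one type header, so with separate ifs the later Container match would overwrite the ArchivalGroup result.)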
4 changes: 2 additions & 2 deletions LeedsExperiment/Preservation/IPreservation.cs
@@ -33,10 +33,10 @@ public interface IPreservation
/// The reason to split is to allow the operator (a human user, or software) to see the diff - to verify that the job is what
/// was intended or expected.
/// </summary>
- /// <param name="path"></param>
+ /// <param name="archivalGroupPath"></param>
/// <param name="source"></param>
/// <returns>A partially populated ImportJob</returns>
- Task<ImportJob> GetUpdateJob(string path, string source);
+ Task<ImportJob> GetUpdateJob(string archivalGroupPath, string source);

/// <summary>
/// "Execute" the update job obtained above.