Skip to content

Commit

Permalink
different approach to PutBinary
Browse files Browse the repository at this point in the history
  • Loading branch information
tomcrane committed Jan 15, 2024
1 parent 0843435 commit 271522a
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 107 deletions.
2 changes: 1 addition & 1 deletion LeedsExperiment/Fedora/Abstractions/Transfer/BinaryFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class BinaryFile
/// <summary>
/// An S3 key, a filesystem path - somewhere accessible to the Preservation API, to import from or export to
/// </summary>
public required string Location { get; set; }
public required string ExternalLocation { get; set; }

public required string StorageType { get; set; }

Expand Down
3 changes: 2 additions & 1 deletion LeedsExperiment/Fedora/IFedora.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ public interface IFedora
/// <param name="checksum">An initial checksum, e.g., calculated in browser on upload. This method will still calculate a checksum and compare with what it gets back from Fedora.</param>
/// <returns></returns>
// Task<Binary> AddBinary(Uri parent, FileInfo localFileInfo, string originalName, string contentType, Transaction? transaction = null, string? checksum = null);
Task<Binary> PutBinary(Uri location, FileInfo localFileInfo, string originalName, string contentType, Transaction? transaction = null, string? checksum = null);
// Task<Binary> PutBinary(Uri location, FileInfo localFileInfo, string originalName, string contentType, Transaction? transaction = null, string? checksum = null);
Task<Binary> PutBinary(Uri archivalGroupUri, BinaryFile binaryFile, Transaction? transaction = null);

// Transactions
Task<Transaction> BeginTransaction();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using Fedora;
using Fedora.Storage.Ocfl;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;

namespace Preservation.API.Controllers
Expand Down
85 changes: 37 additions & 48 deletions LeedsExperiment/Preservation/Checksum.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,62 +5,51 @@ namespace Preservation;

public class Checksum
{
/// <summary>
/// Computes a hash of <paramref name="openedStream"/> with the supplied algorithm and
/// returns it as formatted by <see cref="FromByteArray"/>.
/// </summary>
/// <param name="openedStream">An already-open, readable stream. It is not disposed here.</param>
/// <param name="hashAlgorithm">The algorithm instance to use. It is not disposed here.</param>
/// <returns>
/// The formatted hash, or <c>null</c> when an <see cref="IOException"/> or
/// <see cref="UnauthorizedAccessException"/> occurs while reading (the message is written to the console).
/// </returns>
public static string? HashFromStream(Stream openedStream, HashAlgorithm hashAlgorithm)
{
    try
    {
        // Rewind only when the stream supports it. Setting Position on a non-seekable
        // stream (e.g. a network response stream) throws NotSupportedException, which
        // the handlers below do not catch. A non-seekable stream is hashed from its
        // current position.
        if (openedStream.CanSeek)
        {
            openedStream.Position = 0;
        }

        // Compute the hash of the stream and hand the raw bytes to the formatter.
        byte[] hashValue = hashAlgorithm.ComputeHash(openedStream);
        return FromByteArray(hashValue);
    }
    catch (IOException e)
    {
        Console.WriteLine($"I/O Exception: {e.Message}");
    }
    catch (UnauthorizedAccessException e)
    {
        Console.WriteLine($"Access Exception: {e.Message}");
    }
    return null;
}

/// <summary>
/// Computes the SHA-256 hash of a file on disk.
/// </summary>
/// <param name="fileInfo">The file to hash.</param>
/// <returns>The value produced by <see cref="HashFromStream"/> for the file's content.</returns>
public static string? Sha256FromFile(FileInfo fileInfo)
{
    using SHA256 sha256 = SHA256.Create();
    // OpenRead() requests read-only access with read sharing. FileInfo.Open(FileMode.Open)
    // asks for read/write access with no sharing, which fails on read-only files and
    // locks out other readers while hashing.
    using FileStream fileStream = fileInfo.OpenRead();
    return HashFromStream(fileStream, sha256);
}

/// <summary>
/// Computes the SHA-512 hash of a file on disk.
/// </summary>
/// <param name="fileInfo">The file to hash.</param>
/// <returns>The value produced by <see cref="HashFromStream"/> for the file's content.</returns>
public static string? Sha512FromFile(FileInfo fileInfo)
{
    using SHA512 sha512 = SHA512.Create();
    // OpenRead() requests read-only access with read sharing. FileInfo.Open(FileMode.Open)
    // asks for read/write access with no sharing, which fails on read-only files and
    // locks out other readers while hashing.
    using FileStream fileStream = fileInfo.OpenRead();
    return HashFromStream(fileStream, sha512);
}


/// <summary>
/// Hashes an already-open stream with SHA-256 by delegating to <see cref="HashFromStream"/>.
/// </summary>
/// <param name="stream">The stream to hash; the caller keeps ownership of it.</param>
public static string? Sha256FromStream(Stream stream)
{
    using (var hasher = SHA256.Create())
    {
        return HashFromStream(stream, hasher);
    }
}
/// <summary>
/// Hashes an already-open stream with SHA-512 by delegating to <see cref="HashFromStream"/>.
/// </summary>
/// <param name="stream">The stream to hash; the caller keeps ownership of it.</param>
public static string? Sha512FromStream(Stream stream)
{
    using (var hasher = SHA512.Create())
    {
        return HashFromStream(stream, hasher);
    }
}

public static string FromByteArray(byte[] hashValue)
Expand Down
121 changes: 91 additions & 30 deletions LeedsExperiment/Preservation/FedoraWrapper.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
using Fedora;
using Amazon.S3;
using Amazon.S3.Model;
using Amazon.S3.Util;
using Fedora;
using Fedora.Abstractions;
using Fedora.Abstractions.Transfer;
using Fedora.ApiModel;
Expand All @@ -11,7 +14,6 @@
using System.Net.Http.Json;
using System.Text;
using System.Text.Json;
using System.Xml.Linq;

namespace Preservation;

Expand All @@ -24,12 +26,14 @@ public class FedoraWrapper : IFedora
private readonly IMemoryCache cache;
private readonly int fedoraRootSegmentsLength;
private readonly int baseAddressLength;
private IAmazonS3? s3Client; // required for reading binary sources, not for reading fedora. Kinda... supports S3 just as it does FileInfo.

public FedoraWrapper(
HttpClient httpClient,
IStorageMapper storageMapper,
IOptions<PreservationApiOptions> preservationApiOptions,
IMemoryCache memoryCache)
IMemoryCache memoryCache,
IAmazonS3 s3Client)
{
this.httpClient = httpClient;
this.storageMapper = storageMapper;
Expand All @@ -38,6 +42,7 @@ public FedoraWrapper(
fedoraRootSegmentsLength = httpClient.BaseAddress!.Segments.Length;
baseAddressLength = httpClient.BaseAddress!.ToString().Length;
cache = memoryCache;
this.s3Client = s3Client;
}


Expand Down Expand Up @@ -196,26 +201,69 @@ private void AddType(Uri uri, string type, Transaction? transaction)
// The binary resource does not have a dc:title property yet
}

/// <summary>
/// Stores <paramref name="binaryFile"/> beneath <paramref name="archivalGroupUri"/> using an HTTP PUT.
/// </summary>
/// <param name="archivalGroupUri">The URI of the archival group that will contain the binary.</param>
/// <param name="binaryFile">Describes the content to store and where to read it from.</param>
/// <param name="transaction">An optional transaction to perform the requests in.</param>
/// <returns>The binary as built from the metadata fetched back after the PUT.</returns>
public async Task<Binary> PutBinary(Uri archivalGroupUri, BinaryFile binaryFile, Transaction? transaction = null)
{
    // PUT is used so that the target URI is derived from the archival group plus
    // binaryFile.Path, rather than taken from a Location header as it is for POST.
    return await PutOrPostBinary(HttpMethod.Put, archivalGroupUri, binaryFile, transaction);
}

/// <summary>
/// Computes the SHA-512 digest of the content at <c>binaryFile.ExternalLocation</c>, checks it against
/// any digest already present on <paramref name="binaryFile"/>, and stores the computed value in
/// <c>binaryFile.Digest</c>.
/// </summary>
/// <remarks>
/// This method used to be <c>async void</c>. That meant exceptions thrown here never reached the caller,
/// and for S3 sources the method returned at its first await, before <c>Digest</c> had been assigned.
/// The signature stays <c>void</c> so the existing un-awaited call site is correct as written; the work
/// is done in <see cref="EnsureChecksumAsync"/> and this wrapper waits for it to finish.
/// NOTE(review): async callers should prefer <c>await EnsureChecksumAsync(binaryFile)</c> so that no
/// thread is blocked while the S3 object is being fetched.
/// </remarks>
/// <param name="binaryFile">The file description whose digest is verified and populated.</param>
/// <exception cref="InvalidOperationException">
/// The storage type is not recognised, or a supplied digest differs from the computed one.
/// </exception>
private void EnsureChecksum(BinaryFile binaryFile)
{
    // GetAwaiter().GetResult() rethrows the original exception rather than an AggregateException.
    EnsureChecksumAsync(binaryFile).GetAwaiter().GetResult();
}

/// <summary>
/// Awaitable implementation of <see cref="EnsureChecksum"/>.
/// </summary>
private async Task EnsureChecksumAsync(BinaryFile binaryFile)
{
    string? expected;

    switch (binaryFile.StorageType)
    {
        case StorageTypes.FileSystem:
            expected = Checksum.Sha512FromFile(new FileInfo(binaryFile.ExternalLocation));
            break;

        case StorageTypes.S3:
        {
            // TODO - get the SHA256 algorithm from AWS directly rather than compute it here
            // GetObjectAttributesAsync
            // Need to switch Fedora and OCFL to SHA256
            // What does it mean if you switch the default algorithm in Fedora? It's used for OCFL...
            var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);

            // Naive implementation: the object is read here for the checksum and read again
            // when the request body is built. Later the sha256 checksum can come from metadata.
            // ConfigureAwait(false) keeps the blocking wrapper above from deadlocking if a
            // synchronization context is present.
            using (var s3Stream = await s3Client!
                       .GetObjectStreamAsync(s3Uri.Bucket, s3Uri.Key, null)
                       .ConfigureAwait(false))
            {
                expected = Checksum.Sha512FromStream(s3Stream);
            }
            break;
        }

        default:
            throw new InvalidOperationException("Unkown storage type " + binaryFile.StorageType);
    }

    if (binaryFile.Digest != null && binaryFile.Digest != expected)
    {
        throw new InvalidOperationException("Initial checksum doesn't match");
    }
    binaryFile.Digest = expected;
}

/// <summary>
/// Builds the URI of a resource at <paramref name="path"/> beneath <paramref name="archivalGroupUri"/>.
/// </summary>
/// <param name="archivalGroupUri">The URI of the containing archival group.</param>
/// <param name="path">The path of the resource relative to the archival group, e.g. <c>foo/bar.xml</c>.</param>
/// <returns>An absolute URI for the resource.</returns>
private static Uri GetFedoraUriWithinArchivalGroup(Uri archivalGroupUri, string path)
{
    // The archival group URI is not expected to end with a trailing slash, so the
    // two-argument Uri constructor cannot be used directly: new Uri(baseUri, "foo/bar.xml")
    // would replace the last segment of the base instead of appending to it, and
    // "./foo/bar.xml" or "/foo/bar.xml" do not help either.
    // The char overload of EndsWith is an ordinal comparison; the string overload is culture-sensitive.
    if (archivalGroupUri.AbsolutePath.EndsWith('/'))
    {
        // Not expected in practice, but relative resolution is correct in this case.
        return new Uri(archivalGroupUri, path);
    }
    return new Uri($"{archivalGroupUri}/{path}");
}

private async Task<Binary> PutOrPostBinary(HttpMethod httpMethod, Uri archivalGroupUri, BinaryFile binaryFile, Transaction? transaction = null)
{
// FileInfo localFile, string originalName, string contentType, .. , string? checksum = null
// verify that parent is a container first?
EnsureChecksum(binaryFile);
var fedoraLocation = GetFedoraUriWithinArchivalGroup(archivalGroupUri, binaryFile.Path);
var req = await MakeBinaryPutOrPost(httpMethod, fedoraLocation, binaryFile, transaction);
var response = await httpClient.SendAsync(req);
if (httpMethod == HttpMethod.Put && response.StatusCode == HttpStatusCode.Gone)
{
Expand All @@ -225,13 +273,13 @@ private async Task<Binary> PutOrPostBinary(HttpMethod httpMethod, Uri location,
// But we want to reinstate a binary.

// Log or record somehow that this has happened?
var retryReq = MakeBinaryPutOrPost(httpMethod, location, localFile, originalName, contentType, transaction, expected)
var retryReq = (await MakeBinaryPutOrPost(httpMethod, fedoraLocation, binaryFile, transaction))
.OverwriteTombstone();
response = await httpClient.SendAsync(retryReq);
}
response.EnsureSuccessStatusCode();

var resourceLocation = httpMethod == HttpMethod.Post ? response.Headers.Location! : location;
var resourceLocation = httpMethod == HttpMethod.Post ? response.Headers.Location! : fedoraLocation;
var newReq = MakeHttpRequestMessage(resourceLocation.MetadataUri(), HttpMethod.Get)
.InTransaction(transaction)
.ForJsonLd();
Expand All @@ -243,7 +291,7 @@ private async Task<Binary> PutOrPostBinary(HttpMethod httpMethod, Uri location,
// The binary resource does not have a dc:title property yet
var patchReq = MakeHttpRequestMessage(resourceLocation.MetadataUri(), HttpMethod.Patch)
.InTransaction(transaction);
patchReq.AsInsertTitlePatch(originalName);
patchReq.AsInsertTitlePatch(binaryFile.Name);
var patchResponse = await httpClient.SendAsync(patchReq);
patchResponse.EnsureSuccessStatusCode();
// now ask again:
Expand All @@ -254,35 +302,48 @@ private async Task<Binary> PutOrPostBinary(HttpMethod httpMethod, Uri location,
binaryResponse = await MakeFedoraResponse<BinaryMetadataResponse>(afterPatchResponse);
}
Binary binary = MakeBinary(binaryResponse!);
if (binary.Digest != expected)
if (binary.Digest != binaryFile.Digest)
{
throw new InvalidOperationException("Fedora checksum doesn't match");
}
return binary;
}


/// <summary>
/// Builds the PUT or POST request that uploads the content described by <paramref name="binaryFile"/>
/// to <paramref name="location"/>.
/// </summary>
/// <param name="httpMethod">PUT or POST. For POST, a slug taken from <c>binaryFile.Slug</c> is added.</param>
/// <param name="location">The URI the request is sent to.</param>
/// <param name="binaryFile">Supplies the digest, content type, file name and the external source to read.</param>
/// <param name="transaction">An optional transaction to perform the request in.</param>
/// <returns>A request whose body is the full content of the external source, held in memory.</returns>
/// <exception cref="InvalidOperationException">The storage type is not recognised.</exception>
private async Task<HttpRequestMessage> MakeBinaryPutOrPost(HttpMethod httpMethod, Uri location, BinaryFile binaryFile, Transaction? transaction)
{
    var req = MakeHttpRequestMessage(location, httpMethod)
        .InTransaction(transaction)
        .WithDigest(binaryFile.Digest, "sha-512"); // move algorithm choice to config
    if (httpMethod == HttpMethod.Post)
    {
        req.WithSlug(binaryFile.Slug);
    }

    // Need something better than this for large files.
    // How would we transfer a 10GB file for example?
    // Also this is grossly inefficient, we've already read the stream to look at the checksum.
    // We should keep the byte array... but then what if it's huge?
    byte[] payload;
    switch (binaryFile.StorageType)
    {
        case StorageTypes.FileSystem:
            // Async read so no thread is blocked on disk I/O inside this async method.
            payload = await File.ReadAllBytesAsync(binaryFile.ExternalLocation);
            break;

        case StorageTypes.S3:
        {
            var s3Uri = new AmazonS3Uri(binaryFile.ExternalLocation);
            var s3Req = new GetObjectRequest() { BucketName = s3Uri.Bucket, Key = s3Uri.Key };
            // The response owns the underlying network stream; dispose it once the
            // content has been copied. The bytes copied out by ToArray() outlive the buffer.
            using var s3Resp = await s3Client!.GetObjectAsync(s3Req);
            using var ms = new MemoryStream();
            await s3Resp.ResponseStream.CopyToAsync(ms);
            payload = ms.ToArray();
            break;
        }

        default:
            throw new InvalidOperationException("Unkown storage type " + binaryFile.StorageType);
    }

    req.Content = new ByteArrayContent(payload)
        .WithContentType(binaryFile.ContentType);
    // Always state the content disposition, for PUT as well as POST.
    req.Content.WithContentDisposition(binaryFile.FileName);
    return req;
}

Expand Down
Loading

0 comments on commit 271522a

Please sign in to comment.