Skip to content

Commit

Permalink
Handle other filename cases related to link operation (#5238)
Browse files Browse the repository at this point in the history
Co-authored-by: Simon Dumas <simon.dumas@epfl.ch>
  • Loading branch information
imsdu and Simon Dumas authored Nov 19, 2024
1 parent 80b78da commit cb4fa1c
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ch.epfl.bluebrain.nexus.ship.files
import akka.http.scaladsl.model.Uri
import akka.http.scaladsl.model.Uri.Path
import cats.effect.IO
import cats.syntax.all._
import ch.epfl.bluebrain.nexus.delta.kernel.RetryStrategy.logError
import ch.epfl.bluebrain.nexus.delta.kernel.utils.UrlUtils
import ch.epfl.bluebrain.nexus.delta.kernel.{Logger, RetryStrategy}
Expand Down Expand Up @@ -51,6 +52,33 @@ object FileCopier {

}

/**
  * Resolves the key under which the file designated by `path` is stored in the import bucket.
  *
  * The decoded form of the path is probed first (`localDiskPath` when `localOrigin` is set, `UrlUtils.decode`
  * otherwise). When no object exists under the decoded key, the raw path is probed as a fallback, but only if it
  * differs from the decoded key: probing the same key twice would repeat the same S3 requests and log the
  * "folder" message a second time.
  *
  * @param s3StorageClient
  *   the client used to query the import bucket
  * @param importBucket
  *   the bucket in which the file is looked up
  * @param path
  *   the path of the file as recorded in its attributes
  * @param localOrigin
  *   selects how the path is decoded before the first lookup
  * @return
  *   the first probed key for which an object exists, or None when neither candidate is an object (a key that only
  *   lists contents is logged as a folder and also yields None)
  */
def computeOriginKey(
    s3StorageClient: S3StorageClient,
    importBucket: String,
    path: Path,
    localOrigin: Boolean
): IO[Option[String]] = {
  // Some(key) when an object exists under `key`; otherwise None, logging when the key lists contents (a folder).
  def exists(key: String): IO[Option[String]] =
    s3StorageClient.objectExists(importBucket, key).flatMap {
      case true  => IO.some(key)
      case false =>
        s3StorageClient
          .listObjectsV2(importBucket, key)
          .map(_.hasContents)
          .flatMap { isFolder =>
            IO.whenA(isFolder) {
              logger.info(s"'$key' has been found to be a folder, skipping the file copy...")
            }
          }
          .as(Option.empty[String])
    }

  val decodedKey = if (localOrigin) localDiskPath(path) else UrlUtils.decode(path)
  val rawKey     = path.toString()

  exists(decodedKey).flatMap {
    // The raw (still encoded) path is only worth a second lookup when it is a different key.
    case None if rawKey != decodedKey => exists(rawKey)
    case result                       => IO.pure(result)
  }
}

def apply(
s3StorageClient: S3StorageClient,
config: FileProcessingConfig
Expand All @@ -60,17 +88,15 @@ object FileCopier {
val locationGenerator = new S3LocationGenerator(config.prefix.getOrElse(Path.Empty))
(project: ProjectRef, attributes: FileAttributes, localOrigin: Boolean) =>
{
val origin = attributes.path
val path = attributes.path
val patchedFileName = if (attributes.filename.isEmpty) "file" else attributes.filename
val target = locationGenerator.file(project, attributes.uuid, patchedFileName).path
val FIVE_GB = 5_000_000_000L

val originKey = if (localOrigin) localDiskPath(origin) else UrlUtils.decode(origin)
val targetKey = UrlUtils.decode(target)

val targetKey = UrlUtils.decode(target)
val copyOptions = CopyOptions(overwriteTarget = false, attributes.mediaType)

def copy = {
def copy(originKey: String) = {
if (attributes.bytes >= FIVE_GB) {
logger.info(s"Attempting to copy a large file from $importBucket/$originKey to $targetBucket/$targetKey") >>
s3StorageClient.copyObjectMultiPart(importBucket, originKey, targetBucket, targetKey, copyOptions)
Expand All @@ -86,17 +112,12 @@ object FileCopier {
}

for {
isObject <- s3StorageClient.objectExists(importBucket, originKey)
isFolder <-
if (isObject) IO.pure(false) else s3StorageClient.listObjectsV2(importBucket, originKey).map(_.hasContents)
_ <- IO.whenA(isObject) { copy }
_ <- IO.whenA(isFolder) {
logger.info(s"'$originKey' has been found to be a folder, skipping the file copy...")
}
_ <- IO.whenA(!isFolder && !isObject) {
logger.error(s"'$originKey' is neither an object or folder, something is wrong.")
}
} yield if (isObject) FileCopySuccess(target) else FileCopySkipped
originKey <- computeOriginKey(s3StorageClient, importBucket, path, localOrigin)
_ <- originKey.traverse(copy)
_ <- IO.whenA(originKey.isEmpty) {
logger.error(s"$path is neither an object or folder, something is wrong.")
}
} yield if (originKey.isDefined) FileCopySuccess(target) else FileCopySkipped
}.retry(copyRetryStrategy)
}

Expand Down
5 changes: 4 additions & 1 deletion ship/src/test/resources/import/file-import/000000001.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@
{"ordering":5,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/empty-filename","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/empty-filename", "rev": 1, "@type": "FileCreated", "instant": "2023-03-02T10:46:36.969Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "alice"}, "attributes": {"path": "public/sscx/2/b/3/9/7/9/3/0/", "uuid": "2b397930-0f69-4dad-bf6a-51825e940e12", "bytes": 538, "digest": {"@type": "ComputedDigest", "value": "b39a754a0988ca1f62e04a34d70479e9610b87beab91c58766f80c6ef6f93f3d", "algorithm": "SHA-256"}, "origin": "Client", "filename": "", "location": "file:///path/public/sscx/2/b/3/9/7/9/3/0", "mediaType": "text/plain"}, "storageType": "RemoteDiskStorage"},"instant":"2023-03-02T11:46:36.969+01:00"}
{"ordering":6,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/special-chars-filename","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/special-chars-filename", "rev": 1, "@type": "FileCreated", "instant": "2023-03-02T10:46:36.969Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "alice"}, "attributes": {"path": "public/sscx/1/2/3/4/5/6/7/8/special%20%5Bfile%5D.json", "uuid": "12345678-0f69-4dad-bf6a-51825e940e12", "bytes": 538, "digest": {"@type": "ComputedDigest", "value": "b39a754a0988ca1f62e04a34d70479e9610b87beab91c58766f80c6ef6f93f3d", "algorithm": "SHA-256"}, "origin": "Client", "filename": "special [file].json", "location": "file:///path/public/sscx/1/2/3/4/5/6/7/8/special%20%5Bfile%5D.json", "mediaType": "text/plain"}, "storageType": "RemoteDiskStorage"},"instant":"2023-03-02T11:46:36.969+01:00"}
{"ordering":7,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/directory","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/directory", "rev": 1, "@type": "FileCreated", "instant": "2022-09-28T12:59:11.800Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "akkaufma"}, "attributes": {"path": "public/sscx/0/d/8/b/7/b/3/7/test_linking", "uuid": "0d8b7b37-bf62-4576-a5b4-ba2398731b8f", "bytes": 0, "digest": {"@type": "NotComputedDigest"}, "origin": "Storage", "filename": "test_linking", "location": "file:///path/nexus/bbp/atlas/0/d/8/b/7/b/3/7/test_linking", "mediaType": "application/x-directory"}, "storageType": "RemoteDiskStorage"},"instant":"2022-09-28T14:59:11.8+02:00"}
{"ordering":8,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/local-plus","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/local-plus", "rev": 1, "@type": "FileCreated", "instant": "2022-10-14T12:59:11.800Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "akkaufma"}, "attributes": {"path": "95b0ee1e-a6a5-43e9-85fb-938b3c38dfc0/9/f/0/3/2/4/f/e/0925_Rhi13.3.13%20cell%201+2%20(superficial).asc", "uuid": "9f0324fe-9aac-4d34-84f0-5a44cd278197", "bytes": 0, "digest": {"@type": "NotComputedDigest"}, "origin": "Storage", "filename": "0925_Rhi13.3.13 cell 1+2 (superficial).asc", "location": "file:///path/nexus/public/sscx/9/f/0/3/2/4/f/e", "mediaType": "application/octet-stream"}, "storageType": "DiskStorage"},"instant":"2022-10-14T14:59:11.8+02:00"}
{"ordering":8,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/local-plus","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/local-plus", "rev": 1, "@type": "FileCreated", "instant": "2022-10-14T12:59:11.800Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "akkaufma"}, "attributes": {"path": "95b0ee1e-a6a5-43e9-85fb-938b3c38dfc0/9/f/0/3/2/4/f/e/0925_Rhi13.3.13%20cell%201+2%20(superficial).asc", "uuid": "9f0324fe-9aac-4d34-84f0-5a44cd278197", "bytes": 0, "digest": {"@type": "NotComputedDigest"}, "origin": "Storage", "filename": "0925_Rhi13.3.13 cell 1+2 (superficial).asc", "location": "file:///path/nexus/public/sscx/9/f/0/3/2/4/f/e", "mediaType": "application/octet-stream"}, "storageType": "DiskStorage"},"instant":"2022-10-14T14:59:11.8+02:00"}
{"ordering":9,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/local-space","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/local-space", "rev": 1, "@type": "FileCreated", "instant": "2022-10-15T12:59:11.800Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "akkaufma"}, "attributes": {"path": "public/sscx/9/2/f/f/f/2/1/6/566647353__Square%20-%200.5ms%20Subthreshold__stimulus__1.png", "uuid": "92fff216-73a1-48a6-acc8-3dadf17c26bb", "bytes": 0, "digest": {"@type": "NotComputedDigest"}, "origin": "Storage", "filename": "566647353__Square - 0.5ms Subthreshold__stimulus__1.png", "location": "file:///path/nexus/public/sscx/9/2/f/f/f/2/1/6", "mediaType": "application/octet-stream"}, "storageType": "DiskStorage"},"instant":"2022-10-14T14:59:11.8+02:00"}
{"ordering":10,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/local-equals","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/local-equals", "rev": 1, "@type": "FileCreated", "instant": "2022-10-15T12:59:11.800Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "akkaufma"}, "attributes": {"path": "public/sscx/9/b/1/9/8/5/f/1/emodel=dSTUT_L5BP__etype=dSTUT__mtype=L5BP__species=mouse__brain_region=Primary%20somatosensory%20area__iteration=final_emodel__seed=3__traces.pdf", "uuid": "9b1985f1-e75d-42d2-b101-e527241e2797", "bytes": 0, "digest": {"@type": "NotComputedDigest"}, "origin": "Storage", "filename": "emodel=dSTUT_L5BP__etype=dSTUT__mtype=L5BP__species=mouse__brain_region=Primary somatosensory area__iteration=final_emodel__seed=3__traces.pdf", "location": "file:///path/nexus/public/sscx/9/b/1/9/8/5/f/1", "mediaType": "application/octet-stream"}, "storageType": "DiskStorage"},"instant":"2022-10-14T14:59:11.8+02:00"}
{"ordering":11,"type":"file" ,"org":"public","project":"sscx","id":"https://bbp.epfl.ch/neurosciencegraph/data/special-chars-2","rev":1,"value":{"id": "https://bbp.epfl.ch/neurosciencegraph/data/special-chars-2", "rev": 1, "@type": "FileCreated", "instant": "2023-03-02T10:46:36.969Z", "project": "public/sscx", "storage": "https://bbp.epfl.ch/neurosciencegraph/data/storage?rev=1", "subject": {"@type": "User", "realm": "bbp", "subject": "alice"}, "attributes": {"path": "15849bfc-f2ef-4ddd-89cb-b4658eb1f4ab/5/4/6/7/8/a/0/9/%20P(marker_cre)_overlapping.csv", "uuid": "54678a09-571b-465d-93e7-12137d6ad434", "bytes": 4267, "digest": {"@type": "ComputedDigest", "value": "b39a754a0988ca1f62e04a34d70479e9610b87beab91c58766f80c6ef6f93f3d", "algorithm": "SHA-256"}, "origin": "Client", "filename": " P(marker_cre)_overlapping.csv", "location": "file:///path/public/sscx/15849bfc-f2ef-4ddd-89cb-b4658eb1f4ab/5/4/6/7/8/a/0/9/%20P(marker_cre)_overlapping.csv", "mediaType": "text/plain"}, "storageType": "RemoteDiskStorage"},"instant":"2023-03-02T11:46:36.969+01:00"}
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ class RunShipSuite
// Directory, should be skipped
directoryId = iri"https://bbp.epfl.ch/neurosciencegraph/data/directory"
_ <- checkFor("file", directoryId, xas).assertEquals(0)
// Summary S3 check, 5 objects should have been imported in total
_ <- s3Client.listObjectsV2(targetBucket).map(_.keyCount().intValue()).assertEquals(5)
// Summary S3 check, 8 objects should have been imported in total
_ <- s3Client.listObjectsV2(targetBucket).map(_.keyCount().intValue()).assertEquals(8)
// Summary report check, only the directory event should have been skipped
_ = assertEquals(report.progress(Files.entityType).success, 6L)
_ = assertEquals(report.progress(Files.entityType).success, 9L)
_ = assertEquals(report.progress(Files.entityType).dropped, 1L)
} yield ()
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package ch.epfl.bluebrain.nexus.ship.files

import akka.http.scaladsl.model.Uri
import cats.effect.IO
import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.StoragesConfig.S3StorageConfig
import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.operations.s3.client.S3StorageClient
import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.operations.s3.{LocalStackS3StorageClient, S3Helpers}
import ch.epfl.bluebrain.nexus.ship.files.FileCopier.localDiskPath
import ch.epfl.bluebrain.nexus.testkit.mu.NexusSuite
import io.laserdisc.pure.s3.tagless.S3AsyncClientOp
import munit.AnyFixture

/**
  * Exercises the path-to-key resolution of [[FileCopier]]: the pure `localDiskPath` decoding and the S3-backed
  * `computeOriginKey` lookup, using the S3 client supplied by the `localStackS3Client` fixture.
  */
class FileCopierSuite extends NexusSuite with LocalStackS3StorageClient.Fixture with S3Helpers {

  override def munitFixtures: Seq[AnyFixture[_]] = List(localStackS3Client)
  implicit private lazy val (s3Client: S3StorageClient, underlying: S3AsyncClientOp[IO], _: S3StorageConfig) =
    localStackS3Client()

  private val fileContents = "file content"

  // Stores an object under `storedKey` in the bucket handed over by `givenAnS3Bucket`, then asserts what
  // `computeOriginKey` resolves for `rawPath`. The location is threaded through so failures point at the test.
  private def assertOriginKey(rawPath: String, storedKey: String, localOrigin: Boolean, expected: Option[String])(
      implicit loc: munit.Location
  ) =
    givenAnS3Bucket { bucket =>
      val path = Uri.Path(rawPath)
      givenAFileInABucket(bucket, storedKey, fileContents) { _ =>
        FileCopier.computeOriginKey(s3Client, bucket, path, localOrigin).assertEquals(expected)
      }
    }

  test("Correctly decode a local path") {
    assertEquals(
      localDiskPath(Uri.Path("org/proj/9/f/0/3/2/4/f/e/0925_Rhi13.3.13%20cell%201+2%20(superficial).asc")),
      "org/proj/9/f/0/3/2/4/f/e/0925_Rhi13.3.13 cell 1+2 (superficial).asc"
    )
  }

  test("Correctly decode another local path with a space") {
    assertEquals(
      localDiskPath(
        Uri.Path("org/proj/9/2/f/f/f/2/1/6/566647353__Square%20-%200.5ms%20Subthreshold__stimulus__1.png")
      ),
      "org/proj/9/2/f/f/f/2/1/6/566647353__Square - 0.5ms Subthreshold__stimulus__1.png"
    )
  }

  test("Should find a file from a decoded path") {
    val storedKey =
      "bbp/ncmv3/1/a/e/4/1/a/b/6/EMC__emodel=L5_TPC:B_cAC__ttype=182_L45 IT CTX__species=mouse__brain_region=SS__iteration=mettypesv12_1.json"
    assertOriginKey(
      rawPath =
        "bbp/ncmv3/1/a/e/4/1/a/b/6/EMC__emodel=L5_TPC:B_cAC__ttype=182_L45%20IT%20CTX__species=mouse__brain_region=SS__iteration=mettypesv12_1.json",
      storedKey = storedKey,
      localOrigin = false,
      expected = Some(storedKey)
    )
  }

  test("Should handle correctly path from local origin") {
    // The path carries no percent-encoding, so the stored key is the path itself.
    val storedKey = "27554ab5-20f4-4973-91f6-0b2d990cea69/b/e/4/9/b/5/5/b/Log2(CPM(Exon+intron)+1)_Visp_Pyr.csv"
    assertOriginKey(rawPath = storedKey, storedKey = storedKey, localOrigin = true, expected = Some(storedKey))
  }

  test("Should handle correctly another path from local origin") {
    val storedKey =
      "95b0ee1e-a6a5-43e9-85fb-938b3c38dfc0/9/f/0/3/2/4/f/e/0925_Rhi13.3.13 cell 1+2 (superficial).asc"
    assertOriginKey(
      rawPath =
        "95b0ee1e-a6a5-43e9-85fb-938b3c38dfc0/9/f/0/3/2/4/f/e/0925_Rhi13.3.13%20cell%201+2%20(superficial).asc",
      storedKey = storedKey,
      localOrigin = true,
      expected = Some(storedKey)
    )
  }

  test("Should handle correctly yet another path from local origin") {
    val storedKey =
      "c1220611-7415-4476-baee-36e75f87bdeb/6/7/5/f/f/c/8/f/AIBS_morpho+ephys_data(for_Rat_coclustering).csv"
    assertOriginKey(rawPath = storedKey, storedKey = storedKey, localOrigin = true, expected = Some(storedKey))
  }

  test("Should fallback to the encoded path if the decoded is not found") {
    // The object is stored under the still-encoded name, so only the raw path matches.
    val storedKey = "15849bfc-f2ef-4ddd-89cb-b4658eb1f4ab/5/4/6/7/8/a/0/9/%20P(marker_cre)_overlapping.csv"
    assertOriginKey(rawPath = storedKey, storedKey = storedKey, localOrigin = false, expected = Some(storedKey))
  }

  test("Should skip directories") {
    // Only an object nested below the decoded key exists, so the key itself is not an object.
    assertOriginKey(
      rawPath =
        "bbp/ncmv3/1/a/e/4/1/a/b/6/EMC__emodel=L5_TPC:B_cAC__ttype=182_L45%20IT%20CTX__species=mouse__brain_region=SS__iteration=mettypesv12_1.json",
      storedKey =
        "bbp/ncmv3/1/a/e/4/1/a/b/6/EMC__emodel=L5_TPC:B_cAC__ttype=182_L45 IT CTX__species=mouse__brain_region=SS__iteration=mettypesv12_1.json/pouet.json",
      localOrigin = false,
      expected = None
    )
  }

}
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package ch.epfl.bluebrain.nexus.ship.files

import akka.http.scaladsl.model.{ContentTypes, MediaTypes, Uri}
import akka.http.scaladsl.model.{ContentTypes, MediaTypes}
import ch.epfl.bluebrain.nexus.delta.kernel.http.MediaTypeDetectorConfig
import ch.epfl.bluebrain.nexus.ship.files.FileCopier.localDiskPath
import ch.epfl.bluebrain.nexus.testkit.mu.NexusSuite

class FileProcessorSuite extends NexusSuite {
Expand All @@ -12,13 +11,6 @@ class FileProcessorSuite extends NexusSuite {
"pdf" -> MediaTypes.`application/pdf`
)

test("Correctly decode a local path") {
val encoded = Uri.Path("org/proj/9/f/0/3/2/4/f/e/0925_Rhi13.3.13%20cell%201+2%20(superficial).asc")
val obtained = localDiskPath(encoded)
val expected = "org/proj/9/f/0/3/2/4/f/e/0925_Rhi13.3.13 cell 1+2 (superficial).asc"
assertEquals(obtained, expected)
}

test("Return a new content type matching the config") {
assertEquals(
FileProcessor.patchMediaType("file.json", None),
Expand Down

0 comments on commit cb4fa1c

Please sign in to comment.