From cc7af981358d19c9939da81f015e92b0d19f3fcb Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 9 Aug 2024 17:40:50 +0200 Subject: [PATCH] Patch content type during import (#5090) Co-authored-by: Simon Dumas --- ship/src/main/resources/ship-default.conf | 6 ++++ .../ship/config/FileProcessingConfig.scala | 4 ++- .../nexus/ship/files/FileProcessor.scala | 28 +++++++++++++++---- .../ship/resources/DistributionPatcher.scala | 16 +++++++---- .../ship/config/ShipConfigFixtures.scala | 4 ++- .../nexus/ship/files/FileProcessorSuite.scala | 28 +++++++++++++++++++ .../resources/DistributionPatcherSuite.scala | 27 ++++++++++++++++-- 7 files changed, 98 insertions(+), 15 deletions(-) create mode 100644 ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessorSuite.scala diff --git a/ship/src/main/resources/ship-default.conf b/ship/src/main/resources/ship-default.conf index 57ed78cdd5..427fc4efba 100644 --- a/ship/src/main/resources/ship-default.conf +++ b/ship/src/main/resources/ship-default.conf @@ -172,6 +172,12 @@ ship { target-bucket = "nexus-delta-production" # To skip file events to make the batch run faster and focus on other events skip-file-events = false + # Allows to define default media types for the given file extensions + media-type-detector { + extensions { + #extension = "application/custom" + } + } } } } \ No newline at end of file diff --git a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/config/FileProcessingConfig.scala b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/config/FileProcessingConfig.scala index 6e9ffb6545..5043e5b4ce 100644 --- a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/config/FileProcessingConfig.scala +++ b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/config/FileProcessingConfig.scala @@ -1,6 +1,7 @@ package ch.epfl.bluebrain.nexus.ship.config import akka.http.scaladsl.model.Uri.Path +import ch.epfl.bluebrain.nexus.delta.kernel.http.MediaTypeDetectorConfig import pureconfig.ConfigReader import ch.epfl.bluebrain.nexus.delta.sdk.instances._ import pureconfig.generic.semiauto.deriveReader @@ -9,7 +10,8 @@ final case class FileProcessingConfig( importBucket: String, targetBucket: String, prefix: Option[Path], - skipFileEvents: Boolean + skipFileEvents: Boolean, + mediaTypeDetector: MediaTypeDetectorConfig ) object FileProcessingConfig { diff --git a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessor.scala b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessor.scala index a079539147..cea35f89a9 100644 --- a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessor.scala +++ b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessor.scala @@ -1,13 +1,16 @@ package ch.epfl.bluebrain.nexus.ship.files +import akka.http.scaladsl.model.{ContentType, HttpCharsets} import cats.effect.IO import ch.epfl.bluebrain.nexus.delta.kernel.Logger +import ch.epfl.bluebrain.nexus.delta.kernel.http.MediaTypeDetectorConfig +import ch.epfl.bluebrain.nexus.delta.kernel.utils.FileUtils import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.Files import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.Files.definition import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.FileCommand.CancelEvent import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.FileEvent._ import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.FileRejection.{FileNotFound, IncorrectRev, ResourceAlreadyExists} -import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.{FileAttributes, FileCustomMetadata, FileEvent, FileId, FileLinkRequest} +import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model._ import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.operations.s3.client.S3StorageClient import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.{FetchStorage, StorageResource} import ch.epfl.bluebrain.nexus.delta.rdf.jsonld.api.JsonLdApi @@ -22,7 +25,7 @@ import ch.epfl.bluebrain.nexus.ship._ import ch.epfl.bluebrain.nexus.ship.acls.AclWiring.alwaysAuthorize import ch.epfl.bluebrain.nexus.ship.config.InputConfig import ch.epfl.bluebrain.nexus.ship.files.FileCopier.CopyResult.{CopySkipped, CopySuccess} -import ch.epfl.bluebrain.nexus.ship.files.FileProcessor.logger +import ch.epfl.bluebrain.nexus.ship.files.FileProcessor.{logger, patchMediaType} import ch.epfl.bluebrain.nexus.ship.files.FileWiring._ import ch.epfl.bluebrain.nexus.ship.storages.StorageWiring import io.circe.Decoder @@ -32,7 +35,8 @@ class FileProcessor private ( projectMapper: ProjectMapper, fileCopier: FileCopier, clock: EventClock -) extends EventProcessor[FileEvent] { +)(implicit mediaTypeDetector: MediaTypeDetectorConfig) + extends EventProcessor[FileEvent] { override def resourceType: EntityType = Files.entityType @@ -67,7 +71,8 @@ class FileProcessor private ( val customMetadata = Some(getCustomMetadata(attrs)) fileCopier.copyFile(e.project, attrs).flatMap { case CopySuccess(newPath) => - val linkRequest = FileLinkRequest(newPath, attrs.mediaType, customMetadata) + val newMediaType = patchMediaType(attrs.filename, attrs.mediaType) + val linkRequest = FileLinkRequest(newPath, newMediaType, customMetadata) files .linkFile(Some(event.id), project, None, linkRequest, e.tag) .as(ImportStatus.Success) @@ -78,7 +83,8 @@ class FileProcessor private ( val customMetadata = Some(getCustomMetadata(attrs)) fileCopier.copyFile(e.project, attrs).flatMap { case CopySuccess(newPath) => - val linkRequest = FileLinkRequest(newPath, attrs.mediaType, customMetadata) + val newMediaType = patchMediaType(attrs.filename, attrs.mediaType) + val linkRequest = FileLinkRequest(newPath, newMediaType, customMetadata) files .updateLinkedFile(fileId, None, cRev, linkRequest, e.tag) .as(ImportStatus.Success) @@ -117,6 +123,16 @@ object FileProcessor { private val logger = Logger[FileProcessor] + def patchMediaType( + filename: String, + original: Option[ContentType] + )(implicit mediaTypeDetector: MediaTypeDetectorConfig): Option[ContentType] = + FileUtils + .extension(filename) + .flatMap(mediaTypeDetector.find) + .map(ContentType(_, () => HttpCharsets.`UTF-8`)) + .orElse(original) + private val noop = new EventProcessor[FileEvent] { override def resourceType: EntityType = Files.entityType @@ -157,7 +173,7 @@ object FileProcessor { linkOperationOnly(s3Client) )(FailingUUID) - new FileProcessor(files, projectMapper, fileCopier, clock) + new FileProcessor(files, projectMapper, fileCopier, clock)(config.files.mediaTypeDetector) } } diff --git a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcher.scala b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcher.scala index 36ccc1e2e7..b551ffc252 100644 --- a/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcher.scala +++ b/ship/src/main/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcher.scala @@ -1,6 +1,6 @@ package ch.epfl.bluebrain.nexus.ship.resources -import akka.http.scaladsl.model.Uri +import akka.http.scaladsl.model.{ContentType, Uri} import cats.effect.IO import cats.syntax.all._ import ch.epfl.bluebrain.nexus.delta.kernel.Logger @@ -56,6 +56,7 @@ final class DistributionPatcher( IO.pure( setLocation(attributes.location.toString()) .andThen(setContentSize(attributes.bytes)) + .andThen(setEncodingFormat(attributes.mediaType)) .andThen(setDigest(attributes.digest)) ) case Left(e) => @@ -87,12 +88,17 @@ final class DistributionPatcher( } } - private def setContentUrl(newContentUrl: String) = root.contentUrl.string.replace(newContentUrl) - private def setLocation(newLocation: String) = (json: Json) => + private def setContentUrl(newContentUrl: String) = root.contentUrl.string.replace(newContentUrl) + private def setLocation(newLocation: String) = (json: Json) => json.deepMerge(Json.obj("atLocation" := Json.obj("location" := newLocation))) - private def setContentSize(newSize: Long) = (json: Json) => + private def setContentSize(newSize: Long) = (json: Json) => json.deepMerge(Json.obj("contentSize" := Json.obj("unitCode" := "bytes", "value" := newSize))) - private def setDigest(digest: Digest) = (json: Json) => json.deepMerge(Json.obj("digest" := digest)) + private def setEncodingFormat(contentType: Option[ContentType]) = (json: Json) => + contentType.fold(json) { ct => + json.deepMerge(Json.obj("encodingFormat" := ct.mediaType.value)) + } + + private def setDigest(digest: Digest) = (json: Json) => json.deepMerge(Json.obj("digest" := digest)) private def toS3Location: Json => Json = root.atLocation.store.json.replace(targetStorage) diff --git a/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/config/ShipConfigFixtures.scala b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/config/ShipConfigFixtures.scala index 9ba8e24eb2..0f06eb42f1 100644 --- a/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/config/ShipConfigFixtures.scala +++ b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/config/ShipConfigFixtures.scala @@ -2,6 +2,7 @@ package ch.epfl.bluebrain.nexus.ship.config import akka.http.scaladsl.model.Uri import ch.epfl.bluebrain.nexus.delta.kernel.Secret +import ch.epfl.bluebrain.nexus.delta.kernel.http.MediaTypeDetectorConfig import ch.epfl.bluebrain.nexus.delta.plugins.storage.files import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.StoragesConfig.S3StorageConfig import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.{permissions, StorageFixtures, StoragesConfig} @@ -61,7 +62,8 @@ trait ShipConfigFixtures extends ConfigFixtures with StorageFixtures with Classp importBucket, targetBucket, Some(Uri.Path("/prefix")), - skipFileEvents = false + skipFileEvents = false, + MediaTypeDetectorConfig() ), IriPatcherConfig(enabled = false, iri"https://bbp.epfl.ch/", iri"https:/openbrainplatform.com/"), Set.empty diff --git a/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessorSuite.scala b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessorSuite.scala new file mode 100644 index 0000000000..769ab4ceec --- /dev/null +++ b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/files/FileProcessorSuite.scala @@ -0,0 +1,28 @@ +package ch.epfl.bluebrain.nexus.ship.files + +import akka.http.scaladsl.model.{ContentTypes, MediaTypes} +import ch.epfl.bluebrain.nexus.delta.kernel.http.MediaTypeDetectorConfig +import ch.epfl.bluebrain.nexus.testkit.mu.NexusSuite + +class FileProcessorSuite extends NexusSuite { + + implicit private val mediaTypeDetector: MediaTypeDetectorConfig = MediaTypeDetectorConfig( + "json" -> MediaTypes.`application/json`, + "pdf" -> MediaTypes.`application/pdf` + ) + + test("Return a new content type matching the config") { + assertEquals( + FileProcessor.patchMediaType("file.json", None), + Some(ContentTypes.`application/json`) + ) + } + + test("Return the original content type") { + assertEquals( + FileProcessor.patchMediaType("file.", Some(ContentTypes.`text/csv(UTF-8)`)), + Some(ContentTypes.`text/csv(UTF-8)`) + ) + } + +} diff --git a/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcherSuite.scala b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcherSuite.scala index 14b0a5d930..9290e85a58 100644 --- a/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcherSuite.scala +++ b/ship/src/test/scala/ch/epfl/bluebrain/nexus/ship/resources/DistributionPatcherSuite.scala @@ -1,6 +1,6 @@ package ch.epfl.bluebrain.nexus.ship.resources -import akka.http.scaladsl.model.Uri +import akka.http.scaladsl.model.{ContentTypes, Uri} import cats.effect.IO import ch.epfl.bluebrain.nexus.delta.plugins.storage.FileSelf import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.FileAttributes.FileAttributesOrigin @@ -72,7 +72,7 @@ class DistributionPatcherSuite extends NexusSuite { location, path, "file.txt", - None, + Some(ContentTypes.`text/plain(UTF-8)`), Map.empty, None, None, @@ -197,6 +197,21 @@ class DistributionPatcherSuite extends NexusSuite { .assertEquals("/actual/path/file.txt") } + test("Patch a encoding format based on what the file says") { + val input = + json"""{ + "distribution": { + "contentUrl": "${sourceFileSelf(projectWithMapping, resource1)}", + "encodingFormat": "text/csv" + } + }""" + + patcher + .patchAll(input) + .map(distributionEncodingFormat) + .assertEquals("text/plain") + } + test("Patch a file location based on what the resource says when no existing location present") { val input = json"""{ @@ -290,4 +305,12 @@ class DistributionPatcherSuite extends NexusSuite { .getOrElse(fail("contentUrl was not present")) } + private def distributionEncodingFormat(json: Json): String = { + json.hcursor + .downField("distribution") + .downField("encodingFormat") + .as[String] + .getOrElse(fail("encodingFormat was not present")) + } + }