first attempt at single buffer export

elFarto · Nov 11, 2023 · f276397 · f276397
1 parent ea6d862
commit f276397
Show file tree

Hide file tree

Showing 5 changed files with 171 additions and 22 deletions.
diff --git a/src/direct/direct-export-buf.c b/src/direct/direct-export-buf.c
@@ -186,25 +186,75 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, const NVSurface
     p = fmtInfo->plane;
 
     LOG("Allocating BackingImages: %p %dx%d", backingImage, surface->width, surface->height);
+    uint32_t totalSize = 0;
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
-        alloc_image(&drv->driverContext, surface->width >> p[i].ss.x, surface->height >> p[i].ss.y,
-            p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]);
+        driverImages[i].offset = totalSize;
+
+        totalSize += calculate_image_size(surface->width >> p[i].ss.x, surface->height >> p[i].ss.y,
+            p[i].channelCount, 8 * fmtInfo->bppc, &driverImages[i].pitch);
+
+        totalSize = ROUND_UP(totalSize, 64);
     }
 
-    LOG("Importing images");
+    backingImage->totalSize = totalSize;
+
+    //alloc memory
+    // alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd);
+    int memFd = 0, memFd2 = 0, drmFd = 0;
+    bool ret = alloc_buffer(&drv->driverContext, totalSize, driverImages[0].pitch, &memFd, &memFd2, &drmFd);
+    LOG("Allocate Buffer: %d %d %d %d", ret, memFd, memFd2, drmFd);
+
+    for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
+        driverImages[i].width = surface->width >> p[i].ss.x;
+        driverImages[i].height = surface->height >> p[i].ss.y;
+        driverImages[i].drmFd = drmFd;
+        driverImages[i].mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, drv->driverContext.sector_layout, drv->driverContext.page_kind_generation, drv->driverContext.generic_page_kind, 4);
+        //driverImages[i].memorySize = calculate_image_size(surface->width >> p[i].ss.x, surface->height >> p[i].ss.y, p[i].channelCount, 8 * fmtInfo->bppc, NULL);
+        driverImages[i].fourcc = p[i].fourcc;
+    }
+
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {
+        .type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+        .handle.fd = memFd,
+        .flags     = 0,
+        .size      = totalSize
+    };
+
+    LOG("importing memory to CUDA: %d bytes", totalSize);
+    CHECK_CUDA_RESULT_RETURN(drv->cu->cuImportExternalMemory(&backingImage->extMem, &extMemDesc), false);
+
+    close(memFd);
+    close(memFd2);
+
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
-        if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i]))
-            goto bail;
+        CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = {
+            .arrayDesc = {
+                .Width = driverImages[i].width,
+                .Height = driverImages[i].height,
+                .Depth = 0,
+                .Format = fmtInfo->bppc == 1 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16,
+                .NumChannels = p[i].channelCount,
+                .Flags = 0
+            },
+            .numLevels = 1,
+            .offset = driverImages[i].offset
+        };
+
+        //create a mipmap array from the imported memory
+        CHECK_CUDA_RESULT_RETURN(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&backingImage->cudaImages[i].mipmapArray, backingImage->extMem, &mipmapArrayDesc), false);
+
+        //create an array from the mipmap array
+        CHECK_CUDA_RESULT_RETURN(drv->cu->cuMipmappedArrayGetLevel(&backingImage->arrays[i], backingImage->cudaImages[i].mipmapArray, 0), false);
     }
 
     backingImage->width = surface->width;
     backingImage->height = surface->height;
 
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
-        backingImage->fds[i] = driverImages[i].drmFd;
+        backingImage->fds[i] = drmFd;
         backingImage->strides[i] = driverImages[i].pitch;
         backingImage->mods[i] = driverImages[i].mods;
-        backingImage->size[i] = driverImages[i].memorySize;
+        backingImage->offsets[i] = driverImages[i].offset;
     }
 
     return backingImage;
@@ -247,8 +297,8 @@ static void destroyBackingImage(NVDriver *drv, BackingImage *img) {
         }
 
         CHECK_CUDA_RESULT(drv->cu->cuMipmappedArrayDestroy(img->cudaImages[i].mipmapArray));
-        CHECK_CUDA_RESULT(drv->cu->cuDestroyExternalMemory(img->cudaImages[i].extMem));
     }
+    CHECK_CUDA_RESULT(drv->cu->cuDestroyExternalMemory(img->extMem));
 
     memset(img, 0, sizeof(BackingImage));
     free(img);
@@ -301,7 +351,7 @@ static bool copyFrameToSurface(NVDriver *drv, CUdeviceptr ptr, NVSurface *surfac
         } else {
             CHECK_CUDA_RESULT(drv->cu->cuMemcpy2DAsync(&cpy, 0));
         }
-        y += surface->height >> p->ss.y;
+        y += cpy.Height;
     }
 
     //notify anyone waiting for us to be resolved
@@ -357,16 +407,15 @@ static bool direct_fillExportDescriptor(NVDriver *drv, NVSurface *surface, VADRM
     desc->height = surface->height;
 
     desc->num_layers = fmtInfo->numPlanes;
-    desc->num_objects = fmtInfo->numPlanes;
+    desc->num_objects = 1;
+    desc->objects[0].fd = dup(img->fds[0]);
+    desc->objects[0].size = img->totalSize;
+    desc->objects[0].drm_format_modifier = img->mods[0];
 
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
-        desc->objects[i].fd = dup(img->fds[i]);
-        desc->objects[i].size = img->size[i];
-        desc->objects[i].drm_format_modifier = img->mods[i];
-
         desc->layers[i].drm_format = fmtInfo->plane[i].fourcc;
         desc->layers[i].num_planes = 1;
-        desc->layers[i].object_index[0] = i;
+        desc->layers[i].object_index[0] = 0;
         desc->layers[i].offset[0] = img->offsets[i];
         desc->layers[i].pitch[0] = img->strides[i];
     }

diff --git a/src/direct/nv-driver.c b/src/direct/nv-driver.c
@@ -432,7 +432,8 @@ bool alloc_memory(NVDriverContext *context, uint32_t size, int *fd) {
     return false;
 }
 
-bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) {
+uint32_t calculate_image_size(uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t* widthInBytesOut)
+{
     uint32_t gobWidthInBytes = 64;
     uint32_t gobHeightInBytes = 8;
 
@@ -449,12 +450,107 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
     //These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure
     uint32_t widthInBytes = ROUND_UP(width * bytesPerPixel, gobWidthInBytes << log2GobsPerBlockX);
     uint32_t alignedHeight = ROUND_UP(height, gobHeightInBytes << log2GobsPerBlockY);
-
     uint32_t imageSizeInBytes = widthInBytes * alignedHeight;
-    uint32_t size = imageSizeInBytes;
 
     LOG("Aligned image size: %dx%d = %d", widthInBytes, alignedHeight, imageSizeInBytes);
 
+    if (widthInBytesOut != NULL) {
+        *widthInBytesOut = widthInBytes;
+    }
+    return imageSizeInBytes;
+}
+
+/* Allocates `size` bytes via alloc_memory() and exports them as a block-linear dma-buf; on success the caller owns
+ * *fd1, *fd2 (dup of fd1, imported into NVKMS) and *drmFd. On failure all fds opened here are closed and false is returned. */
+bool alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd) {
+    int memFd = -1;
+    int memFd2 = -1;
+    if (!alloc_memory(context, size, &memFd)) {
+        LOG("alloc_memory failed");
+        return false;
+    }
+    //now export the dma-buf
+    uint32_t pitchInBlocks = widthInBytes / 64; //TODO replace with better constants
+
+    //duplicate the fd so we don't invalidate it by importing it
+    memFd2 = dup(memFd);
+    if (memFd2 == -1) {
+        LOG("dup failed");
+        goto err;
+    }
+
+    struct NvKmsKapiPrivImportMemoryParams nvkmsParams = {
+        .memFd = memFd2,
+        .surfaceParams = {
+            .layout = NvKmsSurfaceMemoryLayoutBlockLinear,
+            .blockLinear = {
+                .genericMemory = 0,
+                .pitchInBlocks = pitchInBlocks,
+                .log2GobsPerBlock.x = 0,
+                .log2GobsPerBlock.y = 4, //TODO replace with better constants
+                .log2GobsPerBlock.z = 0,
+            }
+        }
+    };
+
+    struct drm_nvidia_gem_import_nvkms_memory_params params = {
+        .mem_size = size,
+        .nvkms_params_ptr = (uint64_t) &nvkmsParams,
+        .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
+    };
+    int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno);
+        goto err;
+    }
+
+    //export dma-buf
+    struct drm_prime_handle prime_handle = {
+        .handle = params.handle
+    };
+    struct drm_gem_close gem_close = {
+        .handle = params.handle
+    };
+    drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno);
+        //the imported GEM handle is still open at this point; release it (best effort) before bailing out
+        ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+        goto err;
+    }
+
+    drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno);
+        close(prime_handle.fd);
+        goto err;
+    }
+
+    *fd1 = memFd;
+    *fd2 = memFd2;
+    *drmFd = prime_handle.fd;
+    return true;
+
+err:
+    if (memFd2 >= 0) { //fd 0 is a valid descriptor, so test against the -1 sentinel rather than > 0
+        close(memFd2);
+    }
+    if (memFd >= 0) {
+        close(memFd);
+    }
+    return false;
+}
+
+bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) {
+    uint32_t gobWidthInBytes = 64;
+
+    uint32_t log2GobsPerBlockX = 0; //TODO not sure if these are the correct numbers to start with, but they're the largest ones i've seen used
+    uint32_t log2GobsPerBlockY = height < 88 ? 3 : 4; //TODO 88 is a guess, 80px high needs 3, 112px needs 4, 96px needs 4, 88px needs 4
+    uint32_t log2GobsPerBlockZ = 0;
+
+    uint32_t widthInBytes = 0;
+    uint32_t size = calculate_image_size(width, height, channels, bitsPerChannel, &widthInBytes);
+
     //this gets us some memory, and the fd to import into cuda
     int memFd = -1;
     bool ret = alloc_memory(context, size, &memFd);
@@ -489,7 +585,7 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
     };
 
     struct drm_nvidia_gem_import_nvkms_memory_params params = {
-        .mem_size = imageSizeInBytes,
+        .mem_size = size,
         .nvkms_params_ptr = (uint64_t) &nvkmsParams,
         .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
     };
@@ -526,10 +622,10 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
     image->mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY);
     image->offset = 0;
     image->pitch = widthInBytes;
-    image->memorySize = imageSizeInBytes;
+    image->memorySize = size;
     image->fourcc = fourcc;
 
-    LOG("created image: %dx%d %lx %d %x", width, height, image->mods, widthInBytes, imageSizeInBytes);
+    LOG("created image: %dx%d %lx %d %x", width, height, image->mods, widthInBytes, size);
 
     return true;
 

diff --git a/src/direct/nv-driver.h b/src/direct/nv-driver.h
@@ -42,5 +42,7 @@ bool free_nvdriver(NVDriverContext *context);
 bool get_device_uuid(NVDriverContext *context, char uuid[16]);
 bool alloc_memory(NVDriverContext *context, uint32_t size, int *fd);
 bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bytesPerChannel, uint32_t fourcc, NVDriverImage *image);
+bool alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd);
+uint32_t calculate_image_size(uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t* widthInBytesOut);
 
 #endif
diff --git a/src/vabackend.c b/src/vabackend.c
@@ -2039,7 +2039,7 @@ static VAStatus nvExportSurfaceHandle(
 
     drv->backend->fillExportDescriptor(drv, surface, ptr);
 
-    LOG("Exporting with %d %d %d %d %" PRIx64 " %d %d %" PRIx64, ptr->width, ptr->height, ptr->layers[0].offset[0],
+    LOG("Exporting with w:%d h:%d o:%d p:%d m:%" PRIx64 " o:%d p:%d m:%" PRIx64, ptr->width, ptr->height, ptr->layers[0].offset[0],
                                                                  ptr->layers[0].pitch[0], ptr->objects[0].drm_format_modifier,
                                                                  ptr->layers[1].offset[0], ptr->layers[1].pitch[0],
                                                                  ptr->objects[1].drm_format_modifier);

diff --git a/src/vabackend.h b/src/vabackend.h
@@ -107,6 +107,8 @@ typedef struct _BackingImage {
     //direct backend only
     NVCudaImage cudaImages[3];
     NVFormat    format;
+    uint32_t    totalSize;
+    CUexternalMemory extMem;
 } BackingImage;
 
 struct _NVDriver;