Skip to content

Commit

Permalink
first attempt at single buffer export
Browse files Browse the repository at this point in the history
  • Loading branch information
elFarto committed Nov 11, 2023
1 parent ea6d862 commit f276397
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 22 deletions.
79 changes: 64 additions & 15 deletions src/direct/direct-export-buf.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,25 +186,75 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, const NVSurface
p = fmtInfo->plane;

LOG("Allocating BackingImages: %p %dx%d", backingImage, surface->width, surface->height);
uint32_t totalSize = 0;
for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
alloc_image(&drv->driverContext, surface->width >> p[i].ss.x, surface->height >> p[i].ss.y,
p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]);
driverImages[i].offset = totalSize;

totalSize += calculate_image_size(surface->width >> p[i].ss.x, surface->height >> p[i].ss.y,
p[i].channelCount, 8 * fmtInfo->bppc, &driverImages[i].pitch);

totalSize = ROUND_UP(totalSize, 64);
}

LOG("Importing images");
backingImage->totalSize = totalSize;

//alloc memory
// alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd);
int memFd = 0, memFd2 = 0, drmFd = 0;
bool ret = alloc_buffer(&drv->driverContext, totalSize, driverImages[0].pitch, &memFd, &memFd2, &drmFd);
LOG("Allocate Buffer: %d %d %d %d", ret, memFd, memFd2, drmFd);

for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
driverImages[i].width = surface->width >> p[i].ss.x;
driverImages[i].height = surface->height >> p[i].ss.y;
driverImages[i].drmFd = drmFd;
driverImages[i].mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, drv->driverContext.sector_layout, drv->driverContext.page_kind_generation, drv->driverContext.generic_page_kind, 4);
//driverImages[i].memorySize = calculate_image_size(surface->width >> p[i].ss.x, surface->height >> p[i].ss.y, p[i].channelCount, 8 * fmtInfo->bppc, NULL);
driverImages[i].fourcc = p[i].fourcc;
}

CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {
.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
.handle.fd = memFd,
.flags = 0,
.size = totalSize
};

LOG("importing memory to CUDA: %d bytes", totalSize);
CHECK_CUDA_RESULT_RETURN(drv->cu->cuImportExternalMemory(&backingImage->extMem, &extMemDesc), false);

close(memFd);
close(memFd2);

for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i]))
goto bail;
CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = {
.arrayDesc = {
.Width = driverImages[i].width,
.Height = driverImages[i].height,
.Depth = 0,
.Format = fmtInfo->bppc == 1 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16,
.NumChannels = p[i].channelCount,
.Flags = 0
},
.numLevels = 1,
.offset = driverImages[i].offset
};

//create a mipmap array from the imported memory
CHECK_CUDA_RESULT_RETURN(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&backingImage->cudaImages[i].mipmapArray, backingImage->extMem, &mipmapArrayDesc), false);

//create an array from the mipmap array
CHECK_CUDA_RESULT_RETURN(drv->cu->cuMipmappedArrayGetLevel(&backingImage->arrays[i], backingImage->cudaImages[i].mipmapArray, 0), false);
}

backingImage->width = surface->width;
backingImage->height = surface->height;

for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
backingImage->fds[i] = driverImages[i].drmFd;
backingImage->fds[i] = drmFd;
backingImage->strides[i] = driverImages[i].pitch;
backingImage->mods[i] = driverImages[i].mods;
backingImage->size[i] = driverImages[i].memorySize;
backingImage->offsets[i] = driverImages[i].offset;
}

return backingImage;
Expand Down Expand Up @@ -247,8 +297,8 @@ static void destroyBackingImage(NVDriver *drv, BackingImage *img) {
}

CHECK_CUDA_RESULT(drv->cu->cuMipmappedArrayDestroy(img->cudaImages[i].mipmapArray));
CHECK_CUDA_RESULT(drv->cu->cuDestroyExternalMemory(img->cudaImages[i].extMem));
}
CHECK_CUDA_RESULT(drv->cu->cuDestroyExternalMemory(img->extMem));

memset(img, 0, sizeof(BackingImage));
free(img);
Expand Down Expand Up @@ -301,7 +351,7 @@ static bool copyFrameToSurface(NVDriver *drv, CUdeviceptr ptr, NVSurface *surfac
} else {
CHECK_CUDA_RESULT(drv->cu->cuMemcpy2DAsync(&cpy, 0));
}
y += surface->height >> p->ss.y;
y += cpy.Height;
}

//notify anyone waiting for us to be resolved
Expand Down Expand Up @@ -357,16 +407,15 @@ static bool direct_fillExportDescriptor(NVDriver *drv, NVSurface *surface, VADRM
desc->height = surface->height;

desc->num_layers = fmtInfo->numPlanes;
desc->num_objects = fmtInfo->numPlanes;
desc->num_objects = 1;
desc->objects[0].fd = dup(img->fds[0]);
desc->objects[0].size = img->totalSize;
desc->objects[0].drm_format_modifier = img->mods[0];

for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
desc->objects[i].fd = dup(img->fds[i]);
desc->objects[i].size = img->size[i];
desc->objects[i].drm_format_modifier = img->mods[i];

desc->layers[i].drm_format = fmtInfo->plane[i].fourcc;
desc->layers[i].num_planes = 1;
desc->layers[i].object_index[0] = i;
desc->layers[i].object_index[0] = 0;
desc->layers[i].offset[0] = img->offsets[i];
desc->layers[i].pitch[0] = img->strides[i];
}
Expand Down
108 changes: 102 additions & 6 deletions src/direct/nv-driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,8 @@ bool alloc_memory(NVDriverContext *context, uint32_t size, int *fd) {
return false;
}

bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) {
uint32_t calculate_image_size(uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t* widthInBytesOut)
{
uint32_t gobWidthInBytes = 64;
uint32_t gobHeightInBytes = 8;

Expand All @@ -449,12 +450,107 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
//These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure
uint32_t widthInBytes = ROUND_UP(width * bytesPerPixel, gobWidthInBytes << log2GobsPerBlockX);
uint32_t alignedHeight = ROUND_UP(height, gobHeightInBytes << log2GobsPerBlockY);

uint32_t imageSizeInBytes = widthInBytes * alignedHeight;
uint32_t size = imageSizeInBytes;

LOG("Aligned image size: %dx%d = %d", widthInBytes, alignedHeight, imageSizeInBytes);

if (widthInBytesOut != NULL) {
*widthInBytesOut = widthInBytes;
}
return imageSizeInBytes;
}

/**
 * Allocates a block of driver memory and exports it as a dma-buf.
 *
 * The memory is obtained with alloc_memory(). A duplicate of that fd is then
 * imported into the DRM device as a block-linear NVKMS surface, the resulting
 * GEM handle is exported as a PRIME fd, and the GEM handle is closed again.
 *
 * @param context       driver context; its drmFd and driverMajorVersion are used
 * @param size          size of the allocation in bytes
 * @param widthInBytes  row width in bytes; divided by 64 to get the pitch in blocks
 * @param fd1           on success receives the fd returned by alloc_memory()
 * @param fd2           on success receives the dup()'d fd that was passed to the import ioctl
 * @param drmFd         on success receives the exported PRIME fd
 * @return true on success. On failure false is returned, the output parameters
 *         are left untouched and every fd opened by this function is closed.
 *
 * Ownership: on success the caller owns all three returned fds.
 */
bool alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd) {
    int memFd = -1;
    int memFd2 = -1;

    if (!alloc_memory(context, size, &memFd)) {
        LOG("alloc_memory failed");
        return false;
    }

    //now export the dma-buf
    uint32_t pitchInBlocks = widthInBytes / 64; //TODO replace with better constants

    //duplicate the fd so we don't invalidate it by importing it
    memFd2 = dup(memFd);
    if (memFd2 == -1) {
        LOG("dup failed");
        goto err;
    }

    struct NvKmsKapiPrivImportMemoryParams nvkmsParams = {
        .memFd = memFd2,
        .surfaceParams = {
            .layout = NvKmsSurfaceMemoryLayoutBlockLinear,
            .blockLinear = {
                .genericMemory = 0,
                .pitchInBlocks = pitchInBlocks,
                .log2GobsPerBlock.x = 0,
                .log2GobsPerBlock.y = 4, //TODO replace with better constants
                .log2GobsPerBlock.z = 0,
            }
        }
    };

    struct drm_nvidia_gem_import_nvkms_memory_params params = {
        .mem_size = size,
        //go through uintptr_t so the pointer-to-integer conversion is well defined on every ABI
        .nvkms_params_ptr = (uint64_t) (uintptr_t) &nvkmsParams,
        .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
    };
    int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params);
    if (drmret != 0) {
        LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno);
        goto err;
    }

    //the GEM handle is only needed long enough to export the dma-buf
    struct drm_gem_close gem_close = {
        .handle = params.handle
    };

    //export dma-buf
    struct drm_prime_handle prime_handle = {
        .handle = params.handle
    };
    drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
    if (drmret != 0) {
        LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno);
        //best effort: don't leak the GEM handle created by the import above
        (void) ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
        goto err;
    }

    drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
    if (drmret != 0) {
        LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno);
        goto prime_err;
    }

    *fd1 = memFd;
    *fd2 = memFd2;
    *drmFd = prime_handle.fd;
    return true;

prime_err:
    //only reached after a successful PRIME export, so the fd is valid (0 included)
    if (prime_handle.fd >= 0) {
        close(prime_handle.fd);
    }

err:
    //release the duplicate as well as the original, fd 0 is a valid descriptor
    if (memFd2 >= 0) {
        close(memFd2);
    }
    if (memFd >= 0) {
        close(memFd);
    }

    return false;
}

bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) {
uint32_t gobWidthInBytes = 64;

uint32_t log2GobsPerBlockX = 0; //TODO not sure if these are the correct numbers to start with, but they're the largest ones i've seen used
uint32_t log2GobsPerBlockY = height < 88 ? 3 : 4; //TODO 88 is a guess, 80px high needs 3, 112px needs 4, 96px needs 4, 88px needs 4
uint32_t log2GobsPerBlockZ = 0;

uint32_t widthInBytes = 0;
uint32_t size = calculate_image_size(width, height, channels, bitsPerChannel, &widthInBytes);

//this gets us some memory, and the fd to import into cuda
int memFd = -1;
bool ret = alloc_memory(context, size, &memFd);
Expand Down Expand Up @@ -489,7 +585,7 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
};

struct drm_nvidia_gem_import_nvkms_memory_params params = {
.mem_size = imageSizeInBytes,
.mem_size = size,
.nvkms_params_ptr = (uint64_t) &nvkmsParams,
.nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
};
Expand Down Expand Up @@ -526,10 +622,10 @@ bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint
image->mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY);
image->offset = 0;
image->pitch = widthInBytes;
image->memorySize = imageSizeInBytes;
image->memorySize = size;
image->fourcc = fourcc;

LOG("created image: %dx%d %lx %d %x", width, height, image->mods, widthInBytes, imageSizeInBytes);
LOG("created image: %dx%d %lx %d %x", width, height, image->mods, widthInBytes, size);

return true;

Expand Down
2 changes: 2 additions & 0 deletions src/direct/nv-driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,7 @@ bool free_nvdriver(NVDriverContext *context);
bool get_device_uuid(NVDriverContext *context, char uuid[16]);
bool alloc_memory(NVDriverContext *context, uint32_t size, int *fd);
bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bytesPerChannel, uint32_t fourcc, NVDriverImage *image);
bool alloc_buffer(NVDriverContext *context, uint32_t size, uint32_t widthInBytes, int *fd1, int *fd2, int *drmFd);
uint32_t calculate_image_size(uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t* widthInBytesOut);

#endif
2 changes: 1 addition & 1 deletion src/vabackend.c
Original file line number Diff line number Diff line change
Expand Up @@ -2039,7 +2039,7 @@ static VAStatus nvExportSurfaceHandle(

drv->backend->fillExportDescriptor(drv, surface, ptr);

LOG("Exporting with %d %d %d %d %" PRIx64 " %d %d %" PRIx64, ptr->width, ptr->height, ptr->layers[0].offset[0],
LOG("Exporting with w:%d h:%d o:%d p:%d m:%" PRIx64 " o:%d p:%d m:%" PRIx64, ptr->width, ptr->height, ptr->layers[0].offset[0],
ptr->layers[0].pitch[0], ptr->objects[0].drm_format_modifier,
ptr->layers[1].offset[0], ptr->layers[1].pitch[0],
ptr->objects[1].drm_format_modifier);
Expand Down
2 changes: 2 additions & 0 deletions src/vabackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ typedef struct _BackingImage {
//direct backend only
NVCudaImage cudaImages[3];
NVFormat format;
uint32_t totalSize;
CUexternalMemory extMem;
} BackingImage;

struct _NVDriver;
Expand Down

0 comments on commit f276397

Please sign in to comment.