Skip to content
This repository has been archived by the owner on Jun 10, 2024. It is now read-only.

Commit

Permalink
Merge branch 'on_gpu_buffer'
Browse files Browse the repository at this point in the history
  • Loading branch information
rarzumanyan committed Oct 5, 2021
2 parents bbd8baa + dc64956 commit 5742166
Show file tree
Hide file tree
Showing 6 changed files with 392 additions and 3 deletions.
30 changes: 30 additions & 0 deletions PyNvCodec/TC/inc/MemoryInterfaces.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,36 @@ class DllExport Buffer final : public Token {
#endif
};

/* RAII wrapper around a raw CUDA device memory allocation.
 * Device memory is allocated in the given CUDA context on construction and
 * released in the destructor. Instances are non-copyable; use Clone() to get
 * a deep device-to-device copy.
 */
class DllExport CudaBuffer final : public Token {
public:
  CudaBuffer() = delete;
  CudaBuffer(const CudaBuffer &other) = delete;
  // const& for consistency with the deleted copy constructor; either way all
  // assignment is rejected.
  CudaBuffer &operator=(const CudaBuffer &other) = delete;

  /* Factory: allocates elemSize * numElems bytes of device memory within the
   * given context. Throws std::bad_alloc if the allocation fails. */
  static CudaBuffer *Make(size_t elemSize, size_t numElems, CUcontext context);
  // Deep copy of the device memory; returns nullptr if the copy fails.
  CudaBuffer *Clone();

  // Total allocation size in bytes.
  size_t GetRawMemSize() const { return elem_size * num_elems; }
  size_t GetNumElems() const { return num_elems; }
  size_t GetElemSize() const { return elem_size; }
  // Raw device pointer to the allocation (0 if nothing is allocated).
  CUdeviceptr GpuMem() { return gpuMem; }
  ~CudaBuffer();

private:
  CudaBuffer(size_t elemSize, size_t numElems, CUcontext context);
  bool Allocate();
  void Deallocate();

  CUdeviceptr gpuMem = 0UL;
  CUcontext ctx = nullptr;
  size_t elem_size = 0U;
  size_t num_elems = 0U;

#ifdef TRACK_TOKEN_ALLOCATIONS
  // Leak-tracking id assigned by the allocation registry.
  uint64_t id = 0U;
#endif
};

/* RAII-style CUDA Context (un)lock;
*/
class DllExport CudaCtxPush final {
Expand Down
39 changes: 39 additions & 0 deletions PyNvCodec/TC/inc/Tasks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,26 @@ class DllExport CudaUploadFrame final : public Task {
struct CudaUploadFrame_Impl *pImpl = nullptr;
};

/* Task that copies a host-side Buffer (input 0) into an on-GPU CudaBuffer
 * via an async HtoD copy. Output 0 is the device CudaBuffer, which is owned
 * by this task and reused across Run() calls.
 */
class DllExport UploadBuffer final : public Task {
public:
UploadBuffer() = delete;
UploadBuffer(const UploadBuffer &other) = delete;
UploadBuffer &operator=(const UploadBuffer &other) = delete;

TaskExecStatus Run() final;
// Size in bytes of the device-side destination buffer — defined elsewhere.
size_t GetUploadSize() const;
~UploadBuffer() final;
// Factory: the device buffer of elem_size * num_elems bytes is allocated
// eagerly in cuContext.
static UploadBuffer *Make(CUstream cuStream, CUcontext cuContext,
uint32_t elem_size, uint32_t num_elems);

private:
UploadBuffer(CUstream cuStream, CUcontext cuContext,
uint32_t elem_size, uint32_t num_elems);
static const uint32_t numInputs = 1U;
static const uint32_t numOutputs = 1U;
// Pimpl holding the CUDA stream/context and the device buffer.
struct UploadBuffer_Impl *pImpl = nullptr;
};

class DllExport CudaDownloadSurface final : public Task {
public:
CudaDownloadSurface() = delete;
Expand All @@ -163,6 +183,25 @@ class DllExport CudaDownloadSurface final : public Task {
struct CudaDownloadSurface_Impl *pImpl = nullptr;
};

/* Task that copies an on-GPU CudaBuffer (input 0) into a host-side Buffer
 * via an async DtoH copy. Output 0 is the host Buffer, which is owned by
 * this task and reused across Run() calls.
 */
class DllExport DownloadCudaBuffer final : public Task {
public:
DownloadCudaBuffer() = delete;
DownloadCudaBuffer(const DownloadCudaBuffer &other) = delete;
DownloadCudaBuffer &operator=(const DownloadCudaBuffer &other) = delete;

~DownloadCudaBuffer() final;
TaskExecStatus Run() final;
// Factory: the host buffer of elem_size * num_elems bytes is allocated
// eagerly.
static DownloadCudaBuffer *Make(CUstream cuStream, CUcontext cuContext,
uint32_t elem_size, uint32_t num_elems);

private:
DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext,
uint32_t elem_size, uint32_t num_elems);
static const uint32_t numInputs = 1U;
static const uint32_t numOutputs = 1U;
// Pimpl holding the CUDA stream/context and the host destination buffer.
struct DownloadCudaBuffer_Impl *pImpl = nullptr;
};

class DllExport DemuxFrame final : public Task {
public:
DemuxFrame() = delete;
Expand Down
68 changes: 66 additions & 2 deletions PyNvCodec/TC/src/MemoryInterfaces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@ struct AllocRegister {
}
};

AllocRegister BuffersRegister, HWSurfaceRegister;
AllocRegister BuffersRegister, HWSurfaceRegister, CudaBuffersRegiser;

bool CheckAllocationCounters() {
auto numLeakedBuffers = BuffersRegister.GetSize();
auto numLeakedSurfaces = HWSurfaceRegister.GetSize();
auto numLeakedCudaBuffers = CudaBuffersRegiser.GetSize();

if (numLeakedBuffers) {
cerr << "Leaked buffers (id : size): " << endl;
Expand All @@ -101,7 +102,15 @@ bool CheckAllocationCounters() {
}
}

return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces);
if (numLeakedCudaBuffers) {
cerr << "Leaked CUDA buffers (id : size): " << endl;
for (auto i = 0; i < numLeakedCudaBuffers; i++) {
auto pNote = CudaBuffersRegiser.GetNoteByIndex(i);
cerr << "\t" << pNote->id << "\t: " << pNote->size << endl;
}
}

return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces) && (0U == numLeakedCudaBuffers);
}

} // namespace VPF
Expand Down Expand Up @@ -263,6 +272,61 @@ Buffer *Buffer::MakeOwnMem(size_t bufferSize, const void *pCopyFrom,
return new Buffer(bufferSize, pCopyFrom, ctx);
}

// Factory method: constructs a CudaBuffer owning elemSize * numElems bytes of
// device memory inside |context|. Throws std::bad_alloc on allocation failure.
CudaBuffer *CudaBuffer::Make(size_t elemSize, size_t numElems,
                             CUcontext context) {
  auto pNewBuffer = new CudaBuffer(elemSize, numElems, context);
  return pNewBuffer;
}

/* Deep copy: allocates a new CudaBuffer of identical geometry in the same
 * context and copies the device memory contents into it.
 * Returns nullptr if the device-to-device copy fails.
 */
CudaBuffer *CudaBuffer::Clone() {
  auto pCopy = CudaBuffer::Make(elem_size, num_elems, ctx);

  /* cuMemcpyDtoD needs a current CUDA context; push ours for the duration of
   * the copy, the same way Allocate() does for cuMemAlloc. */
  CudaCtxPush lock(ctx);
  if (CUDA_SUCCESS !=
      cuMemcpyDtoD(pCopy->GpuMem(), GpuMem(), GetRawMemSize())) {
    delete pCopy;
    return nullptr;
  }

  return pCopy;
}

// Releases the device memory allocation (if any).
CudaBuffer::~CudaBuffer() {
Deallocate();
}

// Private constructor: records the buffer geometry and owning context, then
// allocates the device memory. Throws std::bad_alloc if allocation fails.
CudaBuffer::CudaBuffer(size_t elemSize, size_t numElems, CUcontext context)
    : ctx(context), elem_size(elemSize), num_elems(numElems) {
  if (!Allocate()) {
    throw bad_alloc();
  }
}

// Allocates GetRawMemSize() bytes of device memory within this buffer's
// context. Returns false for a zero-sized request or a null result; throws
// via ThrowOnCudaError if cuMemAlloc reports an error.
bool CudaBuffer::Allocate() {
  if (0U == GetRawMemSize()) {
    return false;
  }

  CudaCtxPush lock(ctx);
  ThrowOnCudaError(cuMemAlloc(&gpuMem, GetRawMemSize()), __LINE__);

  if (0U == gpuMem) {
    return false;
  }

#ifdef TRACK_TOKEN_ALLOCATIONS
  // Register the allocation so leaks are reported at shutdown.
  id = CudaBuffersRegiser.AddNote(GetRawMemSize());
#endif
  return true;
}

void CudaBuffer::Deallocate() {
ThrowOnCudaError(cuMemFree(gpuMem), __LINE__);
gpuMem = 0U;

#ifdef TRACK_TOKEN_ALLOCATIONS
AllocInfo info(id, GetRawMemSize());
CudaBuffersRegiser.DeleteNote(info);
#endif
}

// Defaulted: members keep their in-class initializers (declared elsewhere).
SurfacePlane::SurfacePlane() = default;

SurfacePlane &SurfacePlane::operator=(const SurfacePlane &other) {
Expand Down
117 changes: 117 additions & 0 deletions PyNvCodec/TC/src/Tasks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,65 @@ TaskExecStatus CudaUploadFrame::Run() {
return TASK_EXEC_SUCCESS;
}

namespace VPF {
/* Private implementation state for the UploadBuffer task: the CUDA stream and
 * context used for the async copy, plus the device-side CudaBuffer the task
 * uploads into and hands out as its output. */
struct UploadBuffer_Impl {
CUstream cuStream;
CUcontext cuContext;
// Owned device buffer; allocated in the ctor, reused across Run() calls.
CudaBuffer *pBuffer = nullptr;

UploadBuffer_Impl() = delete;
UploadBuffer_Impl(const UploadBuffer_Impl &other) = delete;
UploadBuffer_Impl &operator=(const UploadBuffer_Impl &other) = delete;

// Eagerly allocates the device buffer; CudaBuffer::Make throws bad_alloc on
// allocation failure.
UploadBuffer_Impl(CUstream stream, CUcontext context,
uint32_t elem_size, uint32_t num_elems)
: cuStream(stream), cuContext(context) {
pBuffer = CudaBuffer::Make(elem_size, num_elems, context);
}

~UploadBuffer_Impl() { delete pBuffer; }
};
} // namespace VPF

// Factory for UploadBuffer tasks; the device-side destination buffer is
// allocated eagerly by the impl.
UploadBuffer *UploadBuffer::Make(CUstream cuStream, CUcontext cuContext,
                                 uint32_t elem_size, uint32_t num_elems) {
  auto pNewTask = new UploadBuffer(cuStream, cuContext, elem_size, num_elems);
  return pNewTask;
}

// Constructs the task and its impl; the impl allocates the device-side
// destination buffer in cuContext.
UploadBuffer::UploadBuffer(CUstream cuStream, CUcontext cuContext,
                           uint32_t elem_size, uint32_t num_elems)
    : Task("UploadBuffer", UploadBuffer::numInputs, UploadBuffer::numOutputs,
           cuda_stream_sync, (void *)cuStream) {
  pImpl = new UploadBuffer_Impl(cuStream, cuContext, elem_size, num_elems);
}

// Frees the impl and, with it, the owned device buffer.
UploadBuffer::~UploadBuffer() { delete pImpl; }

/* Copies the host-side input Buffer into the owned device CudaBuffer with an
 * async HtoD copy on the task's stream, then publishes the device buffer as
 * output 0. Returns TASK_EXEC_FAIL if there is no input, the input is too
 * small, or the copy cannot be issued. */
TaskExecStatus UploadBuffer::Run() {
  NvtxMark tick(__FUNCTION__);
  if (!GetInput()) {
    return TASK_EXEC_FAIL;
  }

  ClearOutputs();

  auto stream = pImpl->cuStream;
  auto context = pImpl->cuContext;
  auto pBuffer = pImpl->pBuffer;
  auto pHostInput = (Buffer *)GetInput();

  /* The copy below transfers the full device buffer size; guard against
   * reading past the end of a smaller host buffer. */
  if (pHostInput->GetRawMemSize() < pBuffer->GetRawMemSize()) {
    return TASK_EXEC_FAIL;
  }

  auto pSrcHost = pHostInput->GetDataAs<void>();

  CudaCtxPush lock(context);
  if (CUDA_SUCCESS != cuMemcpyHtoDAsync(pBuffer->GpuMem(),
                                        (const void *)pSrcHost,
                                        pBuffer->GetRawMemSize(), stream)) {
    return TASK_EXEC_FAIL;
  }

  SetOutput(pBuffer, 0);
  return TASK_EXEC_SUCCESS;
}

namespace VPF {
struct CudaDownloadSurface_Impl {
CUstream cuStream;
Expand Down Expand Up @@ -538,6 +597,25 @@ struct CudaDownloadSurface_Impl {

~CudaDownloadSurface_Impl() { delete pHostFrame; }
};

/* Private implementation state for the DownloadCudaBuffer task: the CUDA
 * stream and context used for the async copy, plus the host-side Buffer the
 * task downloads into and hands out as its output. */
struct DownloadCudaBuffer_Impl {
CUstream cuStream;
CUcontext cuContext;
// Owned host buffer; allocated in the ctor, reused across Run() calls.
Buffer *pHostBuffer = nullptr;

DownloadCudaBuffer_Impl() = delete;
DownloadCudaBuffer_Impl(const DownloadCudaBuffer_Impl &other) = delete;
DownloadCudaBuffer_Impl &
operator=(const DownloadCudaBuffer_Impl &other) = delete;

// Eagerly allocates elem_size * num_elems bytes of host memory.
DownloadCudaBuffer_Impl(CUstream stream, CUcontext context, uint32_t elem_size,
uint32_t num_elems)
: cuStream(stream), cuContext(context) {
pHostBuffer = Buffer::MakeOwnMem(elem_size * num_elems, context);
}

~DownloadCudaBuffer_Impl() { delete pHostBuffer; }
};
} // namespace VPF

CudaDownloadSurface *CudaDownloadSurface::Make(CUstream cuStream,
Expand Down Expand Up @@ -601,6 +679,45 @@ TaskExecStatus CudaDownloadSurface::Run() {
return TASK_EXEC_SUCCESS;
}

// Factory for DownloadCudaBuffer tasks; the host-side destination buffer is
// allocated eagerly by the impl.
DownloadCudaBuffer *DownloadCudaBuffer::Make(CUstream cuStream,
                                             CUcontext cuContext,
                                             uint32_t elem_size,
                                             uint32_t num_elems) {
  auto pNewTask =
      new DownloadCudaBuffer(cuStream, cuContext, elem_size, num_elems);
  return pNewTask;
}

// Constructs the task and its impl; the impl allocates the host-side
// destination buffer.
DownloadCudaBuffer::DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext,
                                       uint32_t elem_size, uint32_t num_elems)
    : Task("DownloadCudaBuffer", DownloadCudaBuffer::numInputs,
           DownloadCudaBuffer::numOutputs, cuda_stream_sync,
           (void *)cuStream) {
  pImpl =
      new DownloadCudaBuffer_Impl(cuStream, cuContext, elem_size, num_elems);
}

// Frees the impl and, with it, the owned host buffer.
DownloadCudaBuffer::~DownloadCudaBuffer() { delete pImpl; }

/* Copies the input CudaBuffer into the owned host Buffer with an async DtoH
 * copy on the task's stream, then publishes the host buffer as output 0.
 * Returns TASK_EXEC_FAIL if there is no input, the input is larger than the
 * host buffer, or the copy cannot be issued. */
TaskExecStatus DownloadCudaBuffer::Run() {
  NvtxMark tick(__FUNCTION__);

  if (!GetInput()) {
    return TASK_EXEC_FAIL;
  }

  ClearOutputs();

  auto stream = pImpl->cuStream;
  auto context = pImpl->cuContext;
  auto pCudaBuffer = (CudaBuffer *)GetInput();
  auto pHostBuffer = pImpl->pHostBuffer;

  /* The copy below transfers the full device buffer size; guard against
   * writing past the end of a smaller host buffer. */
  if (pCudaBuffer->GetRawMemSize() > pHostBuffer->GetRawMemSize()) {
    return TASK_EXEC_FAIL;
  }

  auto pDstHost = pHostBuffer->GetDataAs<void>();

  CudaCtxPush lock(context);
  if (CUDA_SUCCESS != cuMemcpyDtoHAsync(pDstHost, pCudaBuffer->GpuMem(),
                                        pCudaBuffer->GetRawMemSize(),
                                        stream)) {
    return TASK_EXEC_FAIL;
  }

  SetOutput(pImpl->pHostBuffer, 0);
  return TASK_EXEC_SUCCESS;
}

namespace VPF {
struct DemuxFrame_Impl {
size_t videoBytes = 0U;
Expand Down
35 changes: 35 additions & 0 deletions PyNvCodec/inc/PyNvCodec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,23 @@ class PyFrameUploader {
std::shared_ptr<Surface> UploadSingleFrame(py::array_t<float> &frame);
};

/* Python-facing wrapper around the UploadBuffer task: uploads a numpy uint8
 * array into an on-GPU CudaBuffer. */
class PyBufferUploader {
std::unique_ptr<UploadBuffer> uploader;
uint32_t elem_size, num_elems;

public:
// Uses the context/stream associated with gpu_ID — constructor body defined
// elsewhere; TODO confirm which context it picks.
PyBufferUploader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID);

PyBufferUploader(uint32_t elemSize, uint32_t numElems, CUcontext ctx,
CUstream str);

// Convenience overload taking raw context/stream handles as integers
// (as exposed to Python).
PyBufferUploader(uint32_t elemSize, uint32_t numElems,
size_t ctx, size_t str) :
PyBufferUploader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {}

// Uploads one numpy buffer; the returned CudaBuffer is produced by the
// underlying UploadBuffer task.
std::shared_ptr<CudaBuffer> UploadSingleBuffer(py::array_t<uint8_t> &buffer);
};

class PySurfaceDownloader {
std::unique_ptr<CudaDownloadSurface> upDownloader;
uint32_t surfaceWidth, surfaceHeight;
Expand All @@ -108,6 +125,24 @@ class PySurfaceDownloader {
py::array_t<float> &frame);
};

/* Python-facing wrapper around the DownloadCudaBuffer task: downloads an
 * on-GPU CudaBuffer into a numpy uint8 array. */
class PyCudaBufferDownloader {
std::unique_ptr<DownloadCudaBuffer> upDownloader;
uint32_t elem_size, num_elems;

public:
// Uses the context/stream associated with gpu_ID — constructor body defined
// elsewhere; TODO confirm which context it picks.
PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID);

PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, CUcontext ctx,
CUstream str);

// Convenience overload taking raw context/stream handles as integers
// (as exposed to Python).
PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems,
size_t ctx, size_t str) :
PyCudaBufferDownloader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {}

// Downloads one CudaBuffer into np_array; returns true on success.
bool DownloadSingleCudaBuffer(std::shared_ptr<CudaBuffer> buffer,
py::array_t<uint8_t> &np_array);
};

class PySurfaceConverter {
std::unique_ptr<ConvertSurface> upConverter;
std::unique_ptr<Buffer> upCtxBuffer;
Expand Down
Loading

0 comments on commit 5742166

Please sign in to comment.