Skip to content

Commit

Permalink
OptiX testrender overhaul (take two) (#1897)
Browse files Browse the repository at this point in the history
This PR is a continuation of #1829, updated to include the recently added triangle mesh support. It enables full path tracing support for the OptiX backend in testrender. We have tried to share code between the CPU and OptiX backends where practical. There is more sharing in this PR than there was in #1829, which should reduce the maintenance burden a bit.
ID-based dispatch

Virtual function calls aren't well supported in OptiX, so rather than using regular C++ polymorphism to invoke the sample(), eval(), and get_albedo() functions for each of the BSDF sub-types, we manually invoke the correct function based on the closure ID (which we have added as a member of the BSDF class).

```
#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast<const BSDF_TYPE*>(bsdf)

OSL_HOSTDEVICE Color3
CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const
{
    Color3 albedo(0);
    switch (bsdf->id) {
    case DIFFUSE_ID:
        albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo);
        break;
    case TRANSPARENT_ID:
    case MX_TRANSPARENT_ID:
        albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo);
        break;
```

Iterative closure evaluation

Another key change is the non-recursive closure evaluation. We apply the same style of iterative tree traversal used in the previous OptiX version of process_closure() to the shared implementations of process_closure(), evaluate_layer_opacity(), process_medium_closure(), and process_background_closure().
Background sampling

We've included support for background closures. This includes an OptiX implementation of the Background::prepare() function. We've broken that function into three phases, where phases 1 and 3 are parallelized across a warp and phase 2 is executed on a single thread. This offers a decent speedup over a fully single-threaded implementation, without the added complexity of parallelizing the cumulative sums computed in phase 2.

```
    // from background.h
    
    // Runs the three preparation phases in order. Every caller takes part
    // in phases 1 and 3; only the caller with idx == 0 executes phase 2.
    template<typename F>
    OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
    {
        prepare_cuda_01(stride, idx, cb);
        if (idx == 0)
            prepare_cuda_02();
        prepare_cuda_03(stride, idx);
    }
```

Tests

I have enabled the render-* tests for OptiX mode. I've added alternative reference images, since the GPU output exceeds the difference threshold on many of the tests. But in most cases the difference between the CPU and GPU output is very small.

---------

Signed-off-by: Tim Grant <tgrant@nvidia.com>
  • Loading branch information
tgrant-nv authored Nov 13, 2024
1 parent 0d3e9d2 commit bda7495
Show file tree
Hide file tree
Showing 91 changed files with 2,293 additions and 1,446 deletions.
3 changes: 2 additions & 1 deletion src/cmake/testing.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ macro ( TESTSUITE )
AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME"
AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION")
# Unoptimized
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY")
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY"
AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY")
add_one_testsuite ("${_testname}.optix" "${_testsrcdir}"
ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 )
endif ()
Expand Down
6 changes: 5 additions & 1 deletion src/include/OSL/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,11 @@
/// to use regular assert() for this purpose if you need to eliminate the
/// dependency on this header from a particular place (and don't mind that
/// assert won't format identically on all platforms).
#ifndef NDEBUG
///
/// These macros are no-ops when compiling for CUDA because they were found
/// to cause strange issues in device code (e.g., function bodies being
/// eliminated when OSL_DASSERT is used).
#if !defined(NDEBUG) && !defined(__CUDACC__)
# define OSL_DASSERT OSL_ASSERT
# define OSL_DASSERT_MSG OSL_ASSERT_MSG
#else
Expand Down
20 changes: 12 additions & 8 deletions src/testrender/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
list (APPEND testrender_srcs optixraytracer.cpp)
set (testrender_cuda_srcs
cuda/optix_raytracer.cu
cuda/wrapper.cu
)

set (testrender_rend_lib_srcs
Expand All @@ -25,17 +24,22 @@ if (OSL_USE_OPTIX)
)

# We need to make sure that the PTX files are regenerated whenever these
# headers change.
# files change.
set (testrender_cuda_headers
cuda/rend_lib.h
render_params.h)

set ( extra_cuda_headers
render_params.h )
background.h
optics.h
render_params.h
raytracer.h
sampling.h
shading.h
shading.cpp
simpleraytracer.cpp
)

# Generate PTX for all of the CUDA files
foreach (cudasrc ${testrender_cuda_srcs})
NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" )
NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
list (APPEND ptx_list ${ptx_generated})
endforeach ()

Expand All @@ -55,7 +59,7 @@ if (OSL_USE_OPTIX)
list (APPEND ptx_list ${rend_lib_ptx})

add_custom_target (testrender_ptx ALL
DEPENDS ${ptx_list}
DEPENDS ${ptx_list} ${testrender_cuda_headers}
SOURCES ${testrender_cuda_srcs} )

# Install the PTX files in a fixed location so that they can be
Expand Down
154 changes: 141 additions & 13 deletions src/testrender/background.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,48 @@

OSL_NAMESPACE_ENTER


// std::upper_bound is not supported in device code, so define a version of it here.
// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
//
// Binary search over the 'count' elements starting at 'data', which must be
// ordered so that the search is meaningful. Returns a pointer to the first
// element that compares greater than 'value' (using operator<), or
// 'data + count' when no such element exists.
template<typename T>
inline OSL_HOSTDEVICE const T*
upper_bound(const T* data, int count, const T value)
{
    const T* lo = data;
    for (int remaining = count; remaining != 0;) {
        const int half = remaining / 2;
        const T* mid   = lo + half;
        if (value < *mid) {
            // The answer is at or before 'mid': keep only the lower half.
            remaining = half;
        } else {
            // '*mid' is not greater than 'value': discard it and everything
            // before it.
            lo = mid + 1;
            remaining -= half + 1;
        }
    }
    return lo;
}


struct Background {
// Construct an empty Background: the three table pointers start out null.
OSL_HOSTDEVICE
Background() : values(nullptr), rows(nullptr), cols(nullptr) {}

// Free the three tables. This only happens when not compiling with CUDA;
// under __CUDACC__ the destructor body is empty (there the pointers are
// installed through set_variables(), so the storage is presumably owned by
// the caller -- confirm).
// NOTE(review): no copy/move control is visible in this view. Confirm that a
// Background is never copied on the host, otherwise the arrays would be
// freed twice.
OSL_HOSTDEVICE
~Background()
{
#if !defined(__CUDACC__)
    delete[] cols;
    delete[] rows;
    delete[] values;
#endif
}

template<typename F, typename T> void prepare(int resolution, F cb, T* data)
{
// These values are set via set_variables() in CUDA
res = resolution;
if (res < 32)
res = 32; // validate
Expand All @@ -29,6 +60,7 @@ struct Background {
values = new Vec3[res * res];
rows = new float[res];
cols = new float[res * res];

for (int y = 0, i = 0; y < res; y++) {
for (int x = 0; x < res; x++, i++) {
values[i] = cb(map(x + 0.5f, y + 0.5f), data);
Expand All @@ -43,8 +75,9 @@ struct Background {
cols[i - res + x] /= cols[i - 1];
}
// normalize the pdf across all scanlines
for (int y = 0; y < res; y++)
for (int y = 0; y < res; y++) {
rows[y] /= rows[res - 1];
}

// both eval and sample below return a "weight" that is
// value[i] / row*col_pdf, so might as well bake it into the table
Expand All @@ -65,6 +98,7 @@ struct Background {
#endif
}

OSL_HOSTDEVICE
Vec3 eval(const Vec3& dir, float& pdf) const
{
// map from sphere to unit-square
Expand All @@ -90,6 +124,7 @@ struct Background {
return values[i];
}

OSL_HOSTDEVICE
Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
{
float row_pdf, col_pdf;
Expand All @@ -101,8 +136,98 @@ struct Background {
return values[y * res + x];
}

#ifdef __CUDACC__
// (CUDA builds only) Attach externally provided tables to this Background
// and derive the cached scale factors from the resolution.
//
//   values_in, rows_in, cols_in -- storage for the 'values', 'rows' and
//                                  'cols' tables; the pointers are stored
//                                  as-is, nothing is copied or allocated.
//   res_in                      -- table resolution. prepare() raises
//                                  resolutions below 32 to 32; this path
//                                  only asserts that res_in is at least 32.
OSL_HOSTDEVICE
void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
                   int res_in)
{
    // Resolution and the two factors derived from it:
    // invres = 1 / res, invjacobian = res * res / (4 * pi).
    res         = res_in;
    invres      = __frcp_rn(res);
    invjacobian = __fdiv_rn(res * res, float(4 * M_PI));

    // Table storage.
    values = values_in;
    rows   = rows_in;
    cols   = cols_in;

    assert(res >= 32);
}

// Build the sampling tables in three phases.
//
//   stride -- step between the table entries handled by one caller
//   idx    -- this caller's starting offset (also selects who runs phase 2)
//   cb     -- callback invoked with the direction of each table entry; its
//             result is stored in 'values'
template<typename F>
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
{
    // N.B. This needs to run on a single-warp launch, since there is no
    // synchronization across warps in OptiX.

    // Phase 1: every caller fills in its share of 'values'.
    prepare_cuda_01(stride, idx, cb);

    // Phase 2: a single caller accumulates 'cols' and 'rows'.
    const bool runs_serial_phase = (idx == 0);
    if (runs_serial_phase) {
        prepare_cuda_02();
    }

    // Phase 3: every caller normalizes and finalizes its share.
    prepare_cuda_03(stride, idx);
}

// Pre-compute the 'values' table in parallel.
//
// For every scanline y, this caller evaluates the entries at columns
// idx, idx + stride, idx + 2 * stride, ... and stores cb(direction) for each,
// where the direction comes from map() at the entry's centre
// (x + 0.5, y + 0.5).
//
//   stride -- step between the columns handled by this caller
//   idx    -- first column handled by this caller
//   cb     -- callback invoked with the mapped direction of an entry
template<typename F>
OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
{
    for (int y = 0; y < res; y++) {
        Vec3* row_values = values + y * res;
        // x < res already keeps the write inside the current scanline, so no
        // separate check against the end of the row is needed.
        for (int x = idx; x < res; x += stride) {
            row_values[x] = cb(map(x + 0.5f, y + 0.5f));
        }
    }
}

// Compute 'cols' and 'rows' using a single thread.
//
// For each scanline, 'cols' receives the running sum of the largest
// component of every 'values' entry, and 'rows' receives the running sum of
// the scanline totals. Each scanline of 'cols' is then divided by its own
// total, unless that total is not positive. 'rows' is left un-normalized.
OSL_HOSTDEVICE void prepare_cuda_02()
{
    for (int y = 0; y < res; y++) {
        const int row_first = y * res;
        const int row_last  = row_first + res - 1;

        // Running sum along this scanline.
        for (int x = 0; x < res; x++) {
            const int i         = row_first + x;
            const float peak    = std::max(std::max(values[i].x, values[i].y),
                                           values[i].z);
            const float running = (x > 0) ? cols[i - 1] : 0.0f;
            cols[i]             = peak + running;
        }

        // The last entry of the scanline now holds the scanline total.
        const float row_total    = cols[row_last];
        const float rows_running = (y > 0) ? rows[y - 1] : 0.0f;
        rows[y]                  = row_total + rows_running;

        // normalize the pdf for this scanline (if it was non-zero)
        if (row_total > 0) {
            for (int i = row_first; i <= row_last; i++) {
                cols[i] = __fdiv_rn(cols[i], row_total);
            }
        }
    }
}

// Normalize the row PDFs and finalize the 'values' table.
//
// First loop: this caller divides rows[y] by the last entry of 'rows' for
// y = idx, idx + stride, ... Second loop: for every scanline, this caller
// divides the 'values' entries at columns idx, idx + stride, ... by
// row_pdf * col_pdf * invjacobian.
//
//   stride -- step between the rows / columns handled by this caller
//   idx    -- first row / column handled by this caller
//
// NOTE(review): the second loop reads every rows[y], including entries the
// first loop writes from other callers, and it reads 'cols' written by
// prepare_cuda_02(). No synchronization is visible in this block, so
// correctness presumably relies on the single-warp launch requirement stated
// in prepare_cuda() -- confirm.
OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
{
    // Nothing to do for an empty table (and rows[res - 1] would be out of
    // bounds).
    if (res <= 0)
        return;

    // normalize the pdf across all scanlines
    // Read the total once, before the loop: the loop overwrites rows[res - 1]
    // itself, so the divisor must not be re-read from that slot.
    const float rows_total = rows[res - 1];
    for (int y = idx; y < res; y += stride) {
        rows[y] = __fdiv_rn(rows[y], rows_total);
    }

    // both eval and sample below return a "weight" that is
    // value[i] / row*col_pdf, so might as well bake it into the table
    for (int y = 0; y < res; y++) {
        const float row_pdf   = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
        const float* row_cols = cols + y * res;
        Vec3* row_values      = values + y * res;
        // x < res already keeps the access inside the current scanline, so
        // no separate check against the end of the row is needed.
        for (int x = idx; x < res; x += stride) {
            const float col_pdf = row_cols[x]
                                  - (x > 0 ? row_cols[x - 1] : 0.0f);
            const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
                                            invjacobian);
            row_values[x].x = __fdiv_rn(row_values[x].x, divisor);
            row_values[x].y = __fdiv_rn(row_values[x].y, divisor);
            row_values[x].z = __fdiv_rn(row_values[x].z, divisor);
        }
    }
}
#endif

private:
Dual2<Vec3> map(float x, float y) const
OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
{
// pixel coordinates of entry (x,y)
Dual2<float> u = Dual2<float>(x, 1, 0) * invres;
Expand All @@ -115,14 +240,16 @@ struct Background {
return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
}

static float sample_cdf(const float* data, unsigned int n, float x,
unsigned int* idx, float* pdf)
static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
float x, unsigned int* idx,
float* pdf)
{
OSL_DASSERT(x >= 0);
OSL_DASSERT(x < 1);
*idx = std::upper_bound(data, data + n, x) - data;
OSL_DASSERT(x >= 0.0f);
OSL_DASSERT(x < 1.0f);
*idx = OSL::upper_bound(data, n, x) - data;
OSL_DASSERT(*idx < n);
OSL_DASSERT(x < data[*idx]);

float scaled_sample;
if (*idx == 0) {
*pdf = data[0];
Expand All @@ -137,12 +264,13 @@ struct Background {
return std::min(scaled_sample, 0.99999994f);
}

Vec3* values; // actual map
float* rows; // probability of choosing a given row 'y'
float* cols; // probability of choosing a given column 'x', given that we've chosen row 'y'
int res; // resolution in pixels of the precomputed table
float invres; // 1 / resolution
float invjacobian;
Vec3* values = nullptr; // actual map
float* rows = nullptr; // probability of choosing a given row 'y'
float* cols
= nullptr; // probability of choosing a given column 'x', given that we've chosen row 'y'
int res = -1; // resolution in pixels of the precomputed table
float invres = 0.0f; // 1 / resolution
float invjacobian = 0.0f;
};

OSL_NAMESPACE_EXIT
Loading

0 comments on commit bda7495

Please sign in to comment.