OptiX testrender overhaul (take two) (#1897)

This PR is a continuation of #1829, updated to include the recently added triangle mesh support. It enables full path tracing support for the OptiX backend in testrender. We have tried to share code between the CPU and OptiX backends where practical. There is more sharing in this PR than there was in #1829, which should reduce the maintenance burden a bit.

### ID-based dispatch

Virtual function calls aren't well supported in OptiX, so rather than using regular C++ polymorphism to invoke the sample(), eval(), and get_albedo() functions for each of the BSDF sub-types, we manually invoke the correct function based on the closure ID (which we have added as a member of the BSDF class).

```
#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast<const BSDF_TYPE*>(bsdf)

OSL_HOSTDEVICE Color3
CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const
{
    Color3 albedo(0);
    switch (bsdf->id) {
    case DIFFUSE_ID:
        albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo);
        break;
    case TRANSPARENT_ID:
    case MX_TRANSPARENT_ID:
        albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo);
        break;
```

### Iterative closure evaluation

Another key change is the non-recursive closure evaluation. We apply the same style of iterative tree traversal used in the previous OptiX version of process_closure() to the shared implementations of process_closure(), evaluate_layer_opacity(), process_medium_closure(), and process_background_closure().

### Background sampling

We've included support for background closures. This includes an OptiX implementation of the Background::prepare() function. We've broken that function into three phases, where phases 1 and 3 are parallelized across a warp and phase 2 is executed on a single thread. This offers a decent speedup over a single-threaded implementation without the complexity of a more sophisticated implementation.
```
// from background.h
template<typename F>
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
{
    prepare_cuda_01(stride, idx, cb);
    if (idx == 0)
        prepare_cuda_02();
    prepare_cuda_03(stride, idx);
}
```

### Tests

I have enabled the render-* tests for OptiX mode. I've added alternative reference images, since the GPU output exceeds the difference threshold on many of the tests. But in most cases the difference between the CPU and GPU output is very small.

---------

Signed-off-by: Tim Grant <tgrant@nvidia.com>
AcademySoftwareFoundation · Nov 13, 2024 · bda7495 · bda7495
1 parent 0d3e9d2
commit bda7495
Show file tree

Hide file tree

Showing 91 changed files with 2,293 additions and 1,446 deletions.
diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
@@ -179,7 +179,8 @@ macro ( TESTSUITE )
             AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME"
             AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION")
             # Unoptimized
-            if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY")
+            if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY"
+                AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY")
                 add_one_testsuite ("${_testname}.optix" "${_testsrcdir}"
                                    ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 )
             endif ()

diff --git a/src/include/OSL/platform.h b/src/include/OSL/platform.h
@@ -481,7 +481,11 @@
 /// to use regular assert() for this purpose if you need to eliminate the
 /// dependency on this header from a particular place (and don't mind that
 /// assert won't format identically on all platforms).
-#ifndef NDEBUG
+///
+/// These macros are no-ops when compiling for CUDA because they were found
+/// to cause strange issues in device code (e.g., function bodies being
+/// eliminated when OSL_DASSERT is used).
+#if !defined(NDEBUG) && !defined(__CUDACC__)
 #    define OSL_DASSERT OSL_ASSERT
 #    define OSL_DASSERT_MSG OSL_ASSERT_MSG
 #else

diff --git a/src/testrender/CMakeLists.txt b/src/testrender/CMakeLists.txt
@@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
     list (APPEND testrender_srcs optixraytracer.cpp)
     set (testrender_cuda_srcs
         cuda/optix_raytracer.cu
-        cuda/wrapper.cu
         )
 
     set (testrender_rend_lib_srcs
@@ -25,17 +24,22 @@ if (OSL_USE_OPTIX)
         )
 
     # We need to make sure that the PTX files are regenerated whenever these
-    # headers change.
+    # files change.
     set (testrender_cuda_headers
         cuda/rend_lib.h
-        render_params.h)
-
-    set ( extra_cuda_headers
-        render_params.h )
+        background.h
+        optics.h
+        render_params.h
+        raytracer.h
+        sampling.h
+        shading.h
+        shading.cpp
+        simpleraytracer.cpp
+        )
 
     # Generate PTX for all of the CUDA files
     foreach (cudasrc ${testrender_cuda_srcs})
-        NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" )
+        NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
         list (APPEND ptx_list ${ptx_generated})
     endforeach ()
 
@@ -55,7 +59,7 @@ if (OSL_USE_OPTIX)
     list (APPEND ptx_list ${rend_lib_ptx})
 
     add_custom_target (testrender_ptx ALL
-        DEPENDS ${ptx_list}
+        DEPENDS ${ptx_list} ${testrender_cuda_headers}
         SOURCES ${testrender_cuda_srcs} )
 
     # Install the PTX files in a fixed location so that they can be

diff --git a/src/testrender/background.h b/src/testrender/background.h
@@ -10,17 +10,48 @@
 
 OSL_NAMESPACE_ENTER
 
+
+// std::upper_bound is not supported in device code, so define a version of it here. It binary-searches the `count` elements starting at `data` and returns a pointer to the first element greater than `value`, or `data + count` if there is none. Like std::upper_bound, the result is only meaningful when the elements are sorted ascending.
+// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
+template<typename T>
+inline OSL_HOSTDEVICE const T*
+upper_bound(const T* data, int count, const T value)
+{
+    const T* first = data;  // start of the range that can still contain the answer
+    const T value_ = value;
+    int len        = count;  // number of elements left in that range
+    while (len != 0) {
+        int l2     = len / 2;
+        const T* m = first;
+        m += l2;  // m is the midpoint of [first, first + len)
+        if (value_ < *m)  // *m is greater than value: the answer is at or before m
+            len = l2;
+        else {  // *m is not greater than value: the answer lies strictly after m
+            first = ++m;
+            len -= l2 + 1;
+        }
+    }
+    return first;  // first element > value, or data + count when no element qualifies
+}
+
+
 struct Background {
+    OSL_HOSTDEVICE
     Background() : values(0), rows(0), cols(0) {}
+
+    OSL_HOSTDEVICE
     ~Background()
     {
+#ifndef __CUDACC__
         delete[] values;
         delete[] rows;
         delete[] cols;
+#endif
     }
 
     template<typename F, typename T> void prepare(int resolution, F cb, T* data)
     {
+        // These values are set via set_variables() in CUDA
         res = resolution;
         if (res < 32)
             res = 32;  // validate
@@ -29,6 +60,7 @@ struct Background {
         values      = new Vec3[res * res];
         rows        = new float[res];
         cols        = new float[res * res];
+
         for (int y = 0, i = 0; y < res; y++) {
             for (int x = 0; x < res; x++, i++) {
                 values[i] = cb(map(x + 0.5f, y + 0.5f), data);
@@ -43,8 +75,9 @@ struct Background {
                     cols[i - res + x] /= cols[i - 1];
         }
         // normalize the pdf across all scanlines
-        for (int y = 0; y < res; y++)
+        for (int y = 0; y < res; y++) {
             rows[y] /= rows[res - 1];
+        }
 
         // both eval and sample below return a "weight" that is
         // value[i] / row*col_pdf, so might as well bake it into the table
@@ -65,6 +98,7 @@ struct Background {
 #endif
     }
 
+    OSL_HOSTDEVICE
     Vec3 eval(const Vec3& dir, float& pdf) const
     {
         // map from sphere to unit-square
@@ -90,6 +124,7 @@ struct Background {
         return values[i];
     }
 
+    OSL_HOSTDEVICE
     Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
     {
         float row_pdf, col_pdf;
@@ -101,8 +136,98 @@ struct Background {
         return values[y * res + x];
     }
 
+#ifdef __CUDACC__
+    OSL_HOSTDEVICE  // CUDA-only setup: adopts caller-provided tables instead of allocating them as the host prepare() does
+    void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
+                       int res_in)
+    {
+        values      = values_in;  // pointers are stored as-is; the destructor does not delete them when compiled with __CUDACC__
+        rows        = rows_in;
+        cols        = cols_in;
+        res         = res_in;
+        invres      = __frcp_rn(res);  // 1 / res
+        invjacobian = __fdiv_rn(res * res, float(4 * M_PI));  // res^2 / (4 * pi)
+        assert(res >= 32);  // host prepare() clamps res up to 32; here the caller must already satisfy it. NOTE(review): the check runs after the derived values above are computed
+    }
+
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)  // builds the tables in three phases; idx is the calling thread's starting offset and stride its step (presumably the number of participating threads -- confirm against the launch code)
+    {
+        // N.B. This needs to run on a single-warp launch, since there is no
+        // synchronization across warps in OptiX. NOTE(review): no explicit barrier between the three phases is visible in this function either; presumably it relies on the threads of the warp staying in step -- confirm.
+        prepare_cuda_01(stride, idx, cb);  // phase 1: every thread fills its share of 'values'
+        if (idx == 0)
+            prepare_cuda_02();  // phase 2: only the thread with idx 0 builds 'cols' and 'rows'
+        prepare_cuda_03(stride, idx);  // phase 3: every thread normalizes its share of 'rows' and rescales 'values'
+    }
+
+    // Pre-compute the 'values' table in parallel: for every row y, this thread evaluates cb at columns idx, idx + stride, idx + 2 * stride, ...
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
+    {
+        for (int y = 0; y < res; y++) {
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;  // flat index of (x, y); advanced in step with x below
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)  // NOTE(review): i == row_start + x and x < res here, so this guard appears never to trigger
+                    continue;
+                values[i] = cb(map(x + 0.5f, y + 0.5f));  // sample at the pixel center; unlike the host prepare(), cb takes no extra data argument
+            }
+        }
+    }
+
+    // Compute 'cols' and 'rows' using a single thread: each is a running sum, so every entry depends on the one before it
+    OSL_HOSTDEVICE void prepare_cuda_02()
+    {
+        for (int y = 0, i = 0; y < res; y++) {
+            for (int x = 0; x < res; x++, i++) {
+                cols[i] = std::max(std::max(values[i].x, values[i].y),  // largest of the three components of this entry ...
+                                   values[i].z)
+                          + ((x > 0) ? cols[i - 1] : 0.0f);  // ... accumulated along the row
+            }
+            rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f);  // i is now one past the row's end, so cols[i - 1] is this row's total; rows[] accumulates the row totals
+            // normalize the pdf for this scanline (if it was non-zero)
+            if (cols[i - 1] > 0) {
+                for (int x = 0; x < res; x++) {
+                    cols[i - res + x] = __fdiv_rn(cols[i - res + x],  // i - res is the flat index of this row's first entry
+                                                  cols[i - 1]);  // the last entry is divided by itself on the final iteration, after all earlier ones have used it
+                }
+            }
+        }
+    }
+
+    // Normalize the row PDFs and finalize the 'values' table; each thread handles rows/columns idx, idx + stride, idx + 2 * stride, ...
+    OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
+    {
+        // normalize the pdf across all scanlines
+        for (int y = idx; y < res; y += stride) {
+            rows[y] = __fdiv_rn(rows[y], rows[res - 1]);  // NOTE(review): rows[res - 1] is itself overwritten by the thread that handles y == res - 1 while other threads read it as the divisor -- confirm the ordering this relies on
+        }
+
+        // both eval and sample below return a "weight" that is
+        // value[i] / row*col_pdf, so might as well bake it into the table
+        for (int y = 0; y < res; y++) {
+            float row_pdf       = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);  // difference of adjacent running sums; NOTE(review): reads rows[] entries normalized by other threads in the loop above, with no barrier visible here -- confirm
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;  // flat index of (x, y); advanced in step with x below
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)  // NOTE(review): i == row_start + x and x < res here, so this guard appears never to trigger
+                    continue;
+                float col_pdf       = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);  // difference of adjacent running sums within the row
+                const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
+                                                invjacobian);  // row_pdf * col_pdf * invjacobian; NOTE(review): nothing here guards against a zero divisor -- confirm entries with zero pdf are acceptable
+                values[i].x         = __fdiv_rn(values[i].x, divisor);
+                values[i].y         = __fdiv_rn(values[i].y, divisor);
+                values[i].z         = __fdiv_rn(values[i].z, divisor);
+            }
+        }
+    }
+#endif
+
 private:
-    Dual2<Vec3> map(float x, float y) const
+    OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
     {
         // pixel coordinates of entry (x,y)
         Dual2<float> u     = Dual2<float>(x, 1, 0) * invres;
@@ -115,14 +240,16 @@ struct Background {
         return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
     }
 
-    static float sample_cdf(const float* data, unsigned int n, float x,
-                            unsigned int* idx, float* pdf)
+    static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
+                                           float x, unsigned int* idx,
+                                           float* pdf)
     {
-        OSL_DASSERT(x >= 0);
-        OSL_DASSERT(x < 1);
-        *idx = std::upper_bound(data, data + n, x) - data;
+        OSL_DASSERT(x >= 0.0f);
+        OSL_DASSERT(x < 1.0f);
+        *idx = OSL::upper_bound(data, n, x) - data;
         OSL_DASSERT(*idx < n);
         OSL_DASSERT(x < data[*idx]);
+
         float scaled_sample;
         if (*idx == 0) {
             *pdf          = data[0];
@@ -137,12 +264,13 @@ struct Background {
         return std::min(scaled_sample, 0.99999994f);
     }
 
-    Vec3* values;  // actual map
-    float* rows;   // probability of choosing a given row 'y'
-    float* cols;  // probability of choosing a given column 'x', given that we've chosen row 'y'
-    int res;       // resolution in pixels of the precomputed table
-    float invres;  // 1 / resolution
-    float invjacobian;
+    Vec3* values = nullptr;  // actual map
+    float* rows  = nullptr;  // probability of choosing a given row 'y'
+    float* cols
+        = nullptr;  // probability of choosing a given column 'x', given that we've chosen row 'y'
+    int res           = -1;    // resolution in pixels of the precomputed table
+    float invres      = 0.0f;  // 1 / resolution
+    float invjacobian = 0.0f;
 };
 
 OSL_NAMESPACE_EXIT