gccでのビルドを修正。

rigaya · Jun 8, 2024 · a01f71d · a01f71d
1 parent d330038
commit a01f71d
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 58 deletions.
diff --git a/NVEncCore/NVEncFilterDenoiseFFT3D.cu b/NVEncCore/NVEncFilterDenoiseFFT3D.cu
@@ -40,7 +40,7 @@
 
 #define FFT_M_PI (3.14159265358979323846f)
 
-static __device__ constexpr int log2u(int n) {
+static __device__ __forceinline__ constexpr int log2u(int n) {
     int x = -1;
     while (n > 0) {
         x++;
@@ -51,7 +51,7 @@ static __device__ constexpr int log2u(int n) {
 
 // Rearrange the bits of an int into reverse order
 template<int N>
-static __device__ constexpr int bitreverse(int x) {
+static __device__ __forceinline__ constexpr int bitreverse(int x) {
     int y = 0;
     for (int i = 0; i < N; i++) {
         y = (y << 1) + (x & 1);
@@ -61,23 +61,23 @@ static __device__ constexpr int bitreverse(int x) {
 }
 
 // Returns the twiddle factor exp(i * theta) with theta = -/+ 2*pi*k/N:
 // the sign is negative when `forward` is true and positive otherwise.
 template<typename T, bool forward>
-static __device__ const complex<T> fw(const int k, const int N) {
+static __device__ __forceinline__ const complex<T> fw(const int k, const int N) {
     // Reference formula (forward case): cexp<T>(complex<T>(0.0f, -2.0f * FFT_M_PI * k / (float)N));
     const float theta = ((forward) ? -2.0f : +2.0f) * FFT_M_PI * k / (float)N;
     return complex<T>(std::cos(theta), std::sin(theta)); // (real, imaginary) = (cos(theta), sin(theta))
 }
 
 // Returns c0 + W * c1, where W = fw<T, forward>(k, N) (the additive half of an FFT butterfly).
 template<typename T, bool forward>
-static __device__ complex<T> fft_calc0(complex<T> c0, complex<T> c1, const int k, const int N) {
+static __device__ __forceinline__ complex<T> fft_calc0(complex<T> c0, complex<T> c1, const int k, const int N) {
     return c0 + fw<T, forward>(k, N) * c1;
 }
 // Returns c0 - W * c1, where W = fw<T, forward>(k, N) (the subtractive half of an FFT butterfly).
 template<typename T, bool forward>
-static __device__ complex<T> fft_calc1(complex<T> c0, complex<T> c1, const int k, const int N) {
+static __device__ __forceinline__ complex<T> fft_calc1(complex<T> c0, complex<T> c1, const int k, const int N) {
     return c0 - fw<T, forward>(k, N) * c1;
 }
 
 template<typename T, int N, int step>
-static __device__ void fftpermute(complex<T> *data) {
+static __device__ __forceinline__ void fftpermute(complex<T> *data) {
     complex<T> work[N];
     #pragma unroll
     for (int i = 0; i < N; i++) {
@@ -94,7 +94,7 @@ static __device__ void fftpermute(complex<T> *data) {
 }
 
 template<typename T, int N, bool forward, int step>
-static __device__ void fft(complex<T> *data) {
+static __device__ __forceinline__ void fft(complex<T> *data) {
     if (N >= 4) {
         fft<T, N / 2, forward, step>(data);
         fft<T, N / 2, forward, step>(data + (N / 2) * step);
@@ -113,34 +113,34 @@ static __device__ void fft(complex<T> *data) {
 }
 
 // Multiplies N elements of `data`, spaced `step` entries apart, by 1/N.
 // NOTE(review): presumably applied after an inverse FFT to undo the factor of N - confirm against callers.
 template<typename T, int N, int step>
-static __device__ void ifft_normalize(complex<T> *data) {
+static __device__ __forceinline__ void ifft_normalize(complex<T> *data) {
     const float invN = 1.0f / (float)N;
     #pragma unroll
     for (int i = 0; i < N; i++) {
         data[i * step] *= invN; // strided access: logical element i is stored at index i * step
     }
 }
 
 // Explicit specializations of fft for N == 1, covering float2 and __half2, both directions
 // (forward == true / false) and step values 1, 9, 17, 33 and 65. Each one is a no-op.
 // The change shown here removes the `static` storage-class specifier from every
 // specialization and adds __forceinline__.
 // NOTE(review): presumably these terminate the fft<N> -> fft<N / 2> template recursion; the
 // full body of the primary template is not visible in this hunk - confirm.
-template<> static __device__ void fft<float2,  1, true,   1>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, true,   1>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, true,   9>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, true,   9>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, true,  17>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, true,  17>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, true,  33>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, true,  33>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, true,  65>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, true,  65>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, false,  1>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, false,  1>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, false,  9>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, false,  9>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, false, 17>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, false, 17>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, false, 33>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, false, 33>(complex<__half2> *data) { return; }
-template<> static __device__ void fft<float2,  1, false, 65>(complex<float2> *data) { return; }
-template<> static __device__ void fft<__half2, 1, false, 65>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, true,   1>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, true,   1>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, true,   9>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, true,   9>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, true,  17>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, true,  17>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, true,  33>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, true,  33>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, true,  65>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, true,  65>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, false,  1>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, false,  1>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, false,  9>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, false,  9>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, false, 17>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, false, 17>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, false, 33>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, false, 33>(complex<__half2> *data) { return; }
+template<> __device__ __forceinline__ void fft<float2,  1, false, 65>(complex<float2> *data) { return; }
+template<> __device__ __forceinline__ void fft<__half2, 1, false, 65>(complex<__half2> *data) { return; }
 
 template<typename T, int N, bool forward, int step>
 static __device__ void dft(complex<T> *data) {

diff --git a/NVEncCore/rgy_cuda_util_kernel.h b/NVEncCore/rgy_cuda_util_kernel.h
@@ -1039,15 +1039,26 @@ static __device__ half8& operator-=(half8& a, half8 b) {
     return a;
 }
 
+static __host__ __device__ __forceinline__ void set_complex(__half2& val, float real, float img) { // __half2 overload: packs (real, img) into val
+#if ENABLE_CUDA_FP16_DEVICE
+    // To turn a half2 into a constant efficiently, convert the __half2 to uint32_t before assigning it
+    __half2 c_h2 = __half2(__half(real), __half(img));
+    uint32_t c_uint = (*(uint32_t *)(&c_h2)); // reinterpret the half2 bit pattern as an integer
+    val = *(__half2 *)(&c_uint);
+#endif // when ENABLE_CUDA_FP16_DEVICE is 0 the body is empty and val is left unmodified
+}
+
+static __host__ __device__ __forceinline__ void set_complex(float2& val, float real, float img) { // float2 overload
+    val.x = real; // x holds the real part
+    val.y = img; // y holds the imaginary part
+}
+
 template<typename T>
 struct __align__(sizeof(T)) complex {
     T v;
     __host__ __device__ complex() {};
     __host__ __device__ complex(float real, float img) {
-        T val;
-        val.x = real;
-        val.y = img;
-        v = val;
+        set_complex(v, real, img);
     }
     __host__ __device__ complex(T val) {
         v = val;
@@ -1080,39 +1091,29 @@ struct __align__(sizeof(T)) complex {
     }
 };
 
-template<>
-__host__ __device__ complex<__half2>::complex(float real, float img) {
-#if ENABLE_CUDA_FP16_DEVICE
-    // To turn a half2 into a constant efficiently, convert the __half2 to uint32_t before assigning it
-    __half2 c_h2 = __half2(__half(real), __half(img));
-    uint32_t c_uint = (*(uint32_t *)(&c_h2));
-    v = *(__half2 *)(&c_uint);
-#endif
-};
-
 // Multiplies the storage a.v of a complex value by the scalar b and returns the result by value.
 template<typename T>
-static __device__ complex<T> operator*(complex<T> a, float b) {
+__device__ __forceinline__ complex<T> operator*(complex<T> a, float b) {
     a.v *= b; // relies on an operator*= for (T, float) that is defined outside this view
     return a;
 }
 
 // __half2 specialization of complex * scalar: converts b with __float2half2_rn and multiplies a.v by it.
 template<>
-static __device__ complex<__half2> operator*(complex<__half2> a, float b) {
+__device__ __forceinline__ complex<__half2> operator*(complex<__half2> a, float b) {
 #if ENABLE_CUDA_FP16_DEVICE
     __half2 bh2 = __float2half2_rn(b); // b converted to half and placed in both halves
     a.v *= bh2;
 #endif
     return a; // returned unchanged when ENABLE_CUDA_FP16_DEVICE is 0
 }
 // Complex product, with x as the real part and y as the imaginary part:
 // (a.x + i*a.y) * (b.x + i*b.y) = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x).
 template<typename T>
-static __device__ complex<T> operator*(const complex<T>& a, const complex<T>& b) {
+__device__ __forceinline__ complex<T> operator*(const complex<T>& a, const complex<T>& b) {
     complex<T> result;
     result.v.x = (a.v.x * b.v.x) - (a.v.y * b.v.y); // real part
     result.v.y = (a.v.x * b.v.y) + (a.v.y * b.v.x); // imaginary part
     return result;
 }
 template<>
-static __device__ complex<__half2> operator*(const complex<__half2>& a, const complex<__half2>& b) {
+__device__ __forceinline__ complex<__half2> operator*(const complex<__half2>& a, const complex<__half2>& b) {
     complex<__half2> result;
 #if ENABLE_CUDA_FP16_DEVICE
     if (true) { // こちらのほうがPRMT命令が減って若干高速
@@ -1135,13 +1136,13 @@ static __device__ complex<__half2> operator*(const complex<__half2>& a, const co
 }
 
 // In-place multiplication of the storage a.v by the scalar b; returns a reference to a.
 template<typename T>
-static __device__ complex<T>& operator*=(complex<T>& a, float b) {
+__device__ __forceinline__ complex<T>& operator*=(complex<T>& a, float b) {
     a.v *= b; // relies on an operator*= for (T, float) that is defined outside this view
     return a;
 }
 
 template<>
-static __device__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
+__device__ __forceinline__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
 #if ENABLE_CUDA_FP16_DEVICE
     __half2 bh2 = __float2half2_rn(b);
     a.v *= bh2;
@@ -1150,15 +1151,15 @@ static __device__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
 }
 
 // In-place complex product a = a * b; returns a reference to a.
 template<typename T>
-static __device__ complex<T>& operator*=(complex<T>& a, complex<T> b) {
+__device__ __forceinline__ complex<T>& operator*=(complex<T>& a, complex<T> b) {
     complex<T> result; // temporary, so both components are computed from the original a
     result.v.x = (a.v.x * b.v.x) - (a.v.y * b.v.y); // real part
     result.v.y = (a.v.x * b.v.y) + (a.v.y * b.v.x); // imaginary part
     a = result;
     return a;
 }
 template<>
-static __device__ complex<__half2>& operator*=(complex<__half2>& a, complex<__half2> b) {
+__device__ __forceinline__ complex<__half2>& operator*=(complex<__half2>& a, complex<__half2> b) {
 #if ENABLE_CUDA_FP16_DEVICE
     __half2 a_x = __half2(a.v.x, a.v.x);
     __half2 a_y = __half2(a.v.y, a.v.y);
@@ -1169,49 +1170,49 @@ static __device__ complex<__half2>& operator*=(complex<__half2>& a, complex<__ha
 }
 
 // Applies `a.v += b` to a copy of a and returns the copy.
 // NOTE(review): the result depends on the (T, float) operator+= defined outside this view; if that
 // operator adds b to both components, this is not the usual "add a real scalar" - confirm.
 template<typename T>
-static __device__ complex<T> operator+(complex<T> a, float b) {
+__device__ __forceinline__ complex<T> operator+(complex<T> a, float b) {
     a.v += b;
     return a;
 }
 
 // Complex addition: adds the storage of b to a copy of a and returns the copy.
 template<typename T>
-static __device__ complex<T> operator+(complex<T> a, complex<T> b) {
+__device__ __forceinline__ complex<T> operator+(complex<T> a, complex<T> b) {
     a.v += b.v; // relies on operator+= for the storage type T, defined outside this view
     return a;
 }
 
 // Applies `a.v += b` in place and returns a reference to a.
 // NOTE(review): the result depends on the (T, float) operator+= defined outside this view - confirm
 // whether b is added to one or both components.
 template<typename T>
-static __device__ complex<T>& operator+=(complex<T>& a, float b) {
+__device__ __forceinline__ complex<T>& operator+=(complex<T>& a, float b) {
     a.v += b;
     return a;
 }
 
 // In-place complex addition: adds the storage of b to a and returns a reference to a.
 template<typename T>
-static __device__ complex<T>& operator+=(complex<T>& a, complex<T> b) {
+__device__ __forceinline__ complex<T>& operator+=(complex<T>& a, complex<T> b) {
     a.v += b.v; // relies on operator+= for the storage type T, defined outside this view
     return a;
 }
 
 // Applies `a.v -= b` to a copy of a and returns the copy.
 // NOTE(review): the result depends on the (T, float) operator-= defined outside this view; if that
 // operator subtracts b from both components, this is not the usual "subtract a real scalar" - confirm.
 template<typename T>
-static __device__ complex<T> operator-(complex<T> a, float b) {
+__device__ __forceinline__ complex<T> operator-(complex<T> a, float b) {
     a.v -= b;
     return a;
 }
 
 // Complex subtraction: subtracts the storage of b from a copy of a and returns the copy.
 template<typename T>
-static __device__ complex<T> operator-(complex<T> a, complex<T> b) {
+__device__ __forceinline__ complex<T> operator-(complex<T> a, complex<T> b) {
     a.v -= b.v; // relies on operator-= for the storage type T, defined outside this view
     return a;
 }
 
 // Applies `a.v -= b` in place and returns a reference to a.
 // NOTE(review): the result depends on the (T, float) operator-= defined outside this view - confirm
 // whether b is subtracted from one or both components.
 template<typename T>
-static __device__ complex<T>& operator-=(complex<T>& a, float b) {
+__device__ __forceinline__ complex<T>& operator-=(complex<T>& a, float b) {
     a.v -= b;
     return a;
 }
 
 // In-place complex subtraction: subtracts the storage of b from a and returns a reference to a.
 template<typename T>
-static __device__ complex<T>& operator-=(complex<T>& a, complex<T> b) {
+__device__ __forceinline__ complex<T>& operator-=(complex<T>& a, complex<T> b) {
     a.v -= b.v; // relies on operator-= for the storage type T, defined outside this view
     return a;
 }