Skip to content

Commit

Permalink
gccでのビルドを修正。
Browse files Browse the repository at this point in the history
  • Loading branch information
rigaya committed Jun 8, 2024
1 parent d330038 commit a01f71d
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 58 deletions.
56 changes: 28 additions & 28 deletions NVEncCore/NVEncFilterDenoiseFFT3D.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

#define FFT_M_PI (3.14159265358979323846f)

static __device__ constexpr int log2u(int n) {
static __device__ __forceinline__ constexpr int log2u(int n) {
int x = -1;
while (n > 0) {
x++;
Expand All @@ -51,7 +51,7 @@ static __device__ constexpr int log2u(int n) {

// intのbitを逆順に並び替える
template<int N>
static __device__ constexpr int bitreverse(int x) {
static __device__ __forceinline__ constexpr int bitreverse(int x) {
int y = 0;
for (int i = 0; i < N; i++) {
y = (y << 1) + (x & 1);
Expand All @@ -61,23 +61,23 @@ static __device__ constexpr int bitreverse(int x) {
}

// Twiddle factor used by the FFT butterflies below.
// Returns the complex value (cos(theta), sin(theta)) where
//   theta = -2*pi*k/N when `forward` is true,
//   theta = +2*pi*k/N when `forward` is false.
// theta is always computed in float, regardless of T.
// NOTE(review): this is a diff view, so the signature appears twice; the line
// without __forceinline__ is presumably the pre-commit form and the line with
// it the post-commit form.
template<typename T, bool forward>
static __device__ const complex<T> fw(const int k, const int N) {
static __device__ __forceinline__ const complex<T> fw(const int k, const int N) {
// cexp<T>(complex<T>(0.0f, -2.0f * FFT_M_PI * k / (float)N));
const float theta = ((forward) ? -2.0f : +2.0f) * FFT_M_PI * k / (float)N;
return complex<T>(std::cos(theta), std::sin(theta));
}

// "Plus" half of a butterfly: returns c0 + w * c1, where w is the twiddle
// factor fw<T, forward>(k, N).
// NOTE(review): diff view — old and new signature lines are both shown.
template<typename T, bool forward>
static __device__ complex<T> fft_calc0(complex<T> c0, complex<T> c1, const int k, const int N) {
static __device__ __forceinline__ complex<T> fft_calc0(complex<T> c0, complex<T> c1, const int k, const int N) {
return c0 + fw<T, forward>(k, N) * c1;
}
// "Minus" half of a butterfly: returns c0 - w * c1, where w is the twiddle
// factor fw<T, forward>(k, N).
// NOTE(review): diff view — old and new signature lines are both shown.
template<typename T, bool forward>
static __device__ complex<T> fft_calc1(complex<T> c0, complex<T> c1, const int k, const int N) {
static __device__ __forceinline__ complex<T> fft_calc1(complex<T> c0, complex<T> c1, const int k, const int N) {
return c0 - fw<T, forward>(k, N) * c1;
}

template<typename T, int N, int step>
static __device__ void fftpermute(complex<T> *data) {
static __device__ __forceinline__ void fftpermute(complex<T> *data) {
complex<T> work[N];
#pragma unroll
for (int i = 0; i < N; i++) {
Expand All @@ -94,7 +94,7 @@ static __device__ void fftpermute(complex<T> *data) {
}

template<typename T, int N, bool forward, int step>
static __device__ void fft(complex<T> *data) {
static __device__ __forceinline__ void fft(complex<T> *data) {
if (N >= 4) {
fft<T, N / 2, forward, step>(data);
fft<T, N / 2, forward, step>(data + (N / 2) * step);
Expand All @@ -113,34 +113,34 @@ static __device__ void fft(complex<T> *data) {
}

// Scales N elements of `data` in place by 1/N. Elements are addressed as
// data[i * step] for i in [0, N), so `step` is the element stride.
// The name suggests this is the normalization applied after an inverse FFT —
// TODO confirm against the callers, which are not visible here.
// NOTE(review): diff view — old and new signature lines are both shown.
template<typename T, int N, int step>
static __device__ void ifft_normalize(complex<T> *data) {
static __device__ __forceinline__ void ifft_normalize(complex<T> *data) {
// The reciprocal is computed once in float and applied through complex<T>::operator*=.
const float invN = 1.0f / (float)N;
#pragma unroll
for (int i = 0; i < N; i++) {
data[i * step] *= invN;
}
}

// Explicit specializations of fft<T, N, forward, step> for N == 1.
// Each one is a no-op (the body is a bare `return`). They are provided for
// T in {float2, __half2}, forward in {true, false} and step in {1, 9, 17, 33, 65}.
// Presumably they terminate the template recursion of the generic fft, which
// instantiates fft<T, N / 2, ...> — verify against the full generic definition.
//
// First group: the pre-commit form, declared with `static` on the explicit
// specialization.
template<> static __device__ void fft<float2, 1, true, 1>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, true, 1>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, true, 9>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, true, 9>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, true, 17>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, true, 17>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, true, 33>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, true, 33>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, true, 65>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, true, 65>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, false, 1>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, false, 1>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, false, 9>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, false, 9>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, false, 17>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, false, 17>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, false, 33>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, false, 33>(complex<__half2> *data) { return; }
template<> static __device__ void fft<float2, 1, false, 65>(complex<float2> *data) { return; }
template<> static __device__ void fft<__half2, 1, false, 65>(complex<__half2> *data) { return; }
// Second group: the post-commit form. `static` is dropped and __forceinline__
// is added. NOTE(review): dropping the storage-class specifier from the
// explicit specializations is presumably the gcc build fix named in the
// commit title — confirm.
template<> __device__ __forceinline__ void fft<float2, 1, true, 1>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, true, 1>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, true, 9>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, true, 9>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, true, 17>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, true, 17>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, true, 33>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, true, 33>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, true, 65>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, true, 65>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, false, 1>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, false, 1>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, false, 9>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, false, 9>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, false, 17>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, false, 17>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, false, 33>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, false, 33>(complex<__half2> *data) { return; }
template<> __device__ __forceinline__ void fft<float2, 1, false, 65>(complex<float2> *data) { return; }
template<> __device__ __forceinline__ void fft<__half2, 1, false, 65>(complex<__half2> *data) { return; }

template<typename T, int N, bool forward, int step>
static __device__ void dft(complex<T> *data) {
Expand Down
61 changes: 31 additions & 30 deletions NVEncCore/rgy_cuda_util_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -1039,15 +1039,26 @@ static __device__ half8& operator-=(half8& a, half8 b) {
return a;
}

// Stores the pair (real, img) into a __half2, converting each float to __half.
// The body is compiled only when ENABLE_CUDA_FP16_DEVICE is non-zero; when it
// is zero the function does nothing and `val` is left unmodified.
static __host__ __device__ __forceinline__ void set_complex(__half2& val, float real, float img) {
#if ENABLE_CUDA_FP16_DEVICE
// To get the half2 value turned into a constant efficiently, reinterpret the
// __half2 as a uint32_t first and assign from that.
__half2 c_h2 = __half2(__half(real), __half(img));
uint32_t c_uint = (*(uint32_t *)(&c_h2));
val = *(__half2 *)(&c_uint);
#endif
}

// float2 overload of set_complex: writes the real part into val.x and the
// imaginary part into val.y. No conversion is needed, both are plain floats.
static __host__ __device__ __forceinline__ void set_complex(float2& val, float real, float img) {
    // float2 is a plain {x, y} aggregate, so both components are set in one assignment.
    val = float2{ real, img };
}

template<typename T>
struct __align__(sizeof(T)) complex {
T v;
__host__ __device__ complex() {};
__host__ __device__ complex(float real, float img) {
T val;
val.x = real;
val.y = img;
v = val;
set_complex(v, real, img);
}
__host__ __device__ complex(T val) {
v = val;
Expand Down Expand Up @@ -1080,39 +1091,29 @@ struct __align__(sizeof(T)) complex {
}
};

// Explicit specialization of the (real, img) constructor for complex<__half2>.
// NOTE(review): in this diff view these appear to be the lines the commit
// removed, with the same logic now living in set_complex(__half2&, float, float).
// When ENABLE_CUDA_FP16_DEVICE is zero the body is empty and `v` is not assigned.
template<>
__host__ __device__ complex<__half2>::complex(float real, float img) {
#if ENABLE_CUDA_FP16_DEVICE
// To get the half2 value turned into a constant efficiently, reinterpret the
// __half2 as a uint32_t first and assign from that.
__half2 c_h2 = __half2(__half(real), __half(img));
uint32_t c_uint = (*(uint32_t *)(&c_h2));
v = *(__half2 *)(&c_uint);
#endif
};

// complex * scalar: multiplies the underlying vector a.v by the float b using
// T's own operator*=, and returns the result. `a` is taken by value, so the
// caller's operand is not modified.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator*(complex<T> a, float b) {
__device__ __forceinline__ complex<T> operator*(complex<T> a, float b) {
a.v *= b;
return a;
}

// complex<__half2> * scalar: specialization of the scalar multiply.
// The float b is converted with __float2half2_rn and the resulting __half2 is
// multiplied into a.v. When ENABLE_CUDA_FP16_DEVICE is zero the multiply is
// compiled out and `a` is returned unchanged.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<>
static __device__ complex<__half2> operator*(complex<__half2> a, float b) {
__device__ __forceinline__ complex<__half2> operator*(complex<__half2> a, float b) {
#if ENABLE_CUDA_FP16_DEVICE
__half2 bh2 = __float2half2_rn(b);
a.v *= bh2;
#endif
return a;
}
// complex * complex: standard complex product, with .x as the real part and
// .y as the imaginary part:
//   real = a.x * b.x - a.y * b.y
//   imag = a.x * b.y + a.y * b.x
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator*(const complex<T>& a, const complex<T>& b) {
__device__ __forceinline__ complex<T> operator*(const complex<T>& a, const complex<T>& b) {
complex<T> result;
result.v.x = (a.v.x * b.v.x) - (a.v.y * b.v.y);
result.v.y = (a.v.x * b.v.y) + (a.v.y * b.v.x);
return result;
}
template<>
static __device__ complex<__half2> operator*(const complex<__half2>& a, const complex<__half2>& b) {
__device__ __forceinline__ complex<__half2> operator*(const complex<__half2>& a, const complex<__half2>& b) {
complex<__half2> result;
#if ENABLE_CUDA_FP16_DEVICE
if (true) { // こちらのほうがPRMT命令が減って若干高速
Expand All @@ -1135,13 +1136,13 @@ static __device__ complex<__half2> operator*(const complex<__half2>& a, const co
}

// complex *= scalar: multiplies a.v in place by the float b using T's own
// operator*=, and returns a reference to `a`.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator*=(complex<T>& a, float b) {
__device__ __forceinline__ complex<T>& operator*=(complex<T>& a, float b) {
a.v *= b;
return a;
}

template<>
static __device__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
__device__ __forceinline__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
#if ENABLE_CUDA_FP16_DEVICE
__half2 bh2 = __float2half2_rn(b);
a.v *= bh2;
Expand All @@ -1150,15 +1151,15 @@ static __device__ complex<__half2>& operator*=(complex<__half2>& a, float b) {
}

// complex *= complex: in-place complex product. The product is built in a
// temporary so that both components are computed from the original value of
// `a` before `a` is overwritten. Returns a reference to `a`.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator*=(complex<T>& a, complex<T> b) {
__device__ __forceinline__ complex<T>& operator*=(complex<T>& a, complex<T> b) {
complex<T> result;
result.v.x = (a.v.x * b.v.x) - (a.v.y * b.v.y);
result.v.y = (a.v.x * b.v.y) + (a.v.y * b.v.x);
a = result;
return a;
}
template<>
static __device__ complex<__half2>& operator*=(complex<__half2>& a, complex<__half2> b) {
__device__ __forceinline__ complex<__half2>& operator*=(complex<__half2>& a, complex<__half2> b) {
#if ENABLE_CUDA_FP16_DEVICE
__half2 a_x = __half2(a.v.x, a.v.x);
__half2 a_y = __half2(a.v.y, a.v.y);
Expand All @@ -1169,49 +1170,49 @@ static __device__ complex<__half2>& operator*=(complex<__half2>& a, complex<__ha
}

// complex + scalar: applies T's `+= float` to the underlying vector a.v and
// returns the result. `a` is taken by value, so the caller's operand is not
// modified.
// NOTE(review): whether b is added to both components or only to the real
// part depends on T's operator+=(float), which is not visible here — confirm.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator+(complex<T> a, float b) {
__device__ __forceinline__ complex<T> operator+(complex<T> a, float b) {
a.v += b;
return a;
}

// complex + complex: adds the underlying vectors (a.v += b.v) and returns the
// sum. Both operands are taken by value.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator+(complex<T> a, complex<T> b) {
__device__ __forceinline__ complex<T> operator+(complex<T> a, complex<T> b) {
a.v += b.v;
return a;
}

// complex += scalar: applies T's `+= float` to a.v in place and returns a
// reference to `a`.
// NOTE(review): the per-component effect depends on T's operator+=(float),
// which is not visible here — confirm.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator+=(complex<T>& a, float b) {
__device__ __forceinline__ complex<T>& operator+=(complex<T>& a, float b) {
a.v += b;
return a;
}

// complex += complex: adds b.v into a.v in place and returns a reference to `a`.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator+=(complex<T>& a, complex<T> b) {
__device__ __forceinline__ complex<T>& operator+=(complex<T>& a, complex<T> b) {
a.v += b.v;
return a;
}

// complex - scalar: applies T's `-= float` to the underlying vector a.v and
// returns the result. `a` is taken by value, so the caller's operand is not
// modified.
// NOTE(review): the per-component effect depends on T's operator-=(float),
// which is not visible here — confirm.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator-(complex<T> a, float b) {
__device__ __forceinline__ complex<T> operator-(complex<T> a, float b) {
a.v -= b;
return a;
}

// complex - complex: subtracts the underlying vectors (a.v -= b.v) and
// returns the difference. Both operands are taken by value.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T> operator-(complex<T> a, complex<T> b) {
__device__ __forceinline__ complex<T> operator-(complex<T> a, complex<T> b) {
a.v -= b.v;
return a;
}

// complex -= scalar: applies T's `-= float` to a.v in place and returns a
// reference to `a`.
// NOTE(review): the per-component effect depends on T's operator-=(float),
// which is not visible here — confirm.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator-=(complex<T>& a, float b) {
__device__ __forceinline__ complex<T>& operator-=(complex<T>& a, float b) {
a.v -= b;
return a;
}

// complex -= complex: subtracts b.v from a.v in place and returns a reference
// to `a`.
// NOTE(review): diff view — old (static) and new (__forceinline__) signature
// lines are both shown.
template<typename T>
static __device__ complex<T>& operator-=(complex<T>& a, complex<T> b) {
__device__ __forceinline__ complex<T>& operator-=(complex<T>& a, complex<T> b) {
a.v -= b.v;
return a;
}
Expand Down

0 comments on commit a01f71d

Please sign in to comment.