You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have a simple CUDA program here that performs a simple reduction:
#include <stdio.h>
#include <cuda_runtime.h>
// CUDA kernel for performing reduction (sum) of an array
// Block-wise sum reduction: each block reduces its slice of g_input in shared
// memory and writes a single partial sum to g_output[blockIdx.x].
// Preconditions: blockDim.x is a power of two, and the launch supplies
// blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__ void reduceSum(int *g_input, int *g_output, int n) {
    extern __shared__ int partial[];

    const unsigned int lane = threadIdx.x;
    const unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage one element per thread into shared memory; threads past the end
    // of the array contribute the additive identity.
    partial[lane] = (gid < n) ? g_input[gid] : 0;
    __syncthreads();

    // Tree reduction: halve the number of active threads each pass.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (lane < stride)
            partial[lane] += partial[lane + stride];
        __syncthreads();  // outside the if — every thread must reach the barrier
    }

    // Lane 0 publishes this block's result.
    if (lane == 0)
        g_output[blockIdx.x] = partial[0];
}
// Abort with a readable message on any failed CUDA runtime call.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            return 1;                                                        \
        }                                                                    \
    } while (0)

int main() {
    const int n = 1024;
    const size_t size = n * sizeof(int);

    const int threadsPerBlock = 256;  // power of two, as the kernel requires
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // Host buffers. The kernel produces ONE partial sum PER BLOCK, so the
    // output buffer must hold blocksPerGrid ints.
    // BUG FIX: the original allocated sizeof(int) for the output on both host
    // and device; with 4 blocks the kernel wrote out of bounds on the device,
    // and the host read only the first partial sum instead of the total.
    int *h_input  = (int*)malloc(size);
    int *h_output = (int*)malloc(blocksPerGrid * sizeof(int));
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Fill with 1 so the expected total is exactly n.
    for (int i = 0; i < n; i++) {
        h_input[i] = 1;
    }

    // Device buffers, sized to match the host buffers.
    int *d_input = NULL, *d_output = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_input, size));
    CUDA_CHECK(cudaMalloc((void **)&d_output, blocksPerGrid * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice));

    // Launch with blockDim.x * sizeof(int) bytes of dynamic shared memory.
    reduceSum<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(
        d_input, d_output, n);
    CUDA_CHECK(cudaGetLastError());        // catch bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());   // catch asynchronous kernel faults

    // Bring back every per-block partial sum and finish the reduction on the
    // host (blocksPerGrid is small, so a host-side loop is the simplest way).
    CUDA_CHECK(cudaMemcpy(h_output, d_output, blocksPerGrid * sizeof(int),
                          cudaMemcpyDeviceToHost));
    int total = 0;
    for (int b = 0; b < blocksPerGrid; b++) {
        total += h_output[b];
    }
    printf("Sum is %d\n", total);

    // Cleanup.
    free(h_input);
    free(h_output);
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    return 0;
}
Then I compile it using this command (which I believe is the correct one to generate OpenMP code with all the optimizations enabled):
When I try to run the executable ./simple it shows that:
Sum is 0
but in fact, according to the source code, the sum should be 256, and my CUDA GPU run confirms this. Moreover, when I remove the optimization options "-scal-rep=0 -raise-scf-to-affine --inner-serialize= -O2", running the executable results in a segmentation fault.
I was wondering what I am doing wrong here. Is the CUDA-to-OpenMP flow no longer properly supported?
The text was updated successfully, but these errors were encountered:
I have a simple CUDA program here that performs a simple reduction:
Then I compile it using this command (which I believe is the correct one to generate OpenMP code with all the optimizations enabled):
When I try to run the executable ./simple it shows that:
but in fact, according to the source code, the sum should be 256, and my CUDA GPU run confirms this. Moreover, when I remove the optimization options "-scal-rep=0 -raise-scf-to-affine --inner-serialize= -O2", running the executable results in a segmentation fault.
I was wondering what I am doing wrong here. Is the CUDA-to-OpenMP flow no longer properly supported?
The text was updated successfully, but these errors were encountered: