// examples/cuda/vector_addition.cu
// from https://github.com/olcf-tutorials/vector_addition_cuda/blob/master/vector_addition.cu
//
// Adds two vectors of N doubles element-wise on the GPU and verifies the
// result on the host.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Size of array
#define N 1048576

// Evaluate a CUDA runtime call and terminate with file/line context if it
// does not return cudaSuccess. Unchecked failures otherwise surface later as
// unrelated-looking errors or silently wrong results.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_err_ = (call);                                      \
        if (cuda_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_err_));                          \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Kernel: c[id] = a[id] + b[id] for every id in [0, N).
//
// Expects a 1D grid of 1D blocks; each thread handles at most one element,
// so the launch must supply at least N threads in total. Threads whose
// global index is >= N do nothing, which makes a partially filled last
// block safe.
__global__ void add_vectors(double *a, double *b, double *c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < N) c[id] = a[id] + b[id];
}

// Main program: allocates host and device buffers, runs the kernel once,
// checks every element of the result against 3.0, and prints a summary.
// Returns 0 on success; exits with a non-zero status on any failure.
int main()
{
    // Number of bytes to allocate for N doubles
    const size_t bytes = N * sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL)
    {
        fprintf(stderr, "Error: failed to allocate host memory\n");
        free(A);
        free(B);
        free(C);
        return 1;
    }

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    // Fill host arrays A and B
    // NOTE(review): the loop body was lost in the garbled source; 1.0 and 2.0
    // are chosen so that every sum equals the 3.0 checked below -- confirm.
    for (int i = 0; i < N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Set execution configuration parameters
    //      thr_per_blk: number of CUDA threads per grid block
    //      blk_in_grid: number of blocks in grid
    // NOTE(review): the original values were lost in the garbled source;
    // 256 threads per block matches the upstream tutorial -- confirm.
    int thr_per_blk = 256;
    // Integer ceil-division so the grid covers all N elements even when N is
    // not a multiple of the block size.
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;

    // Launch kernel
    add_vectors<<<blk_in_grid, thr_per_blk>>>(d_A, d_B, d_C);
    // A launch does not return a status; fetch launch-configuration errors
    // explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy data from device array d_C to host array C
    // (blocking copy; errors raised while the kernel ran are reported here)
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify results
    double tolerance = 1.0e-14;
    for (int i = 0; i < N; i++)
    {
        if (fabs(C[i] - 3.0) > tolerance)
        {
            // C[i] is a double, so it must be printed with %f (not %d).
            printf("\nError: value of C[%d] = %f instead of 3.0\n\n", i, C[i]);
            exit(1);
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N                 = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid    = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}