-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cu
50 lines (42 loc) · 1.5 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/******************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
******************************************************************************/
#define BLOCK_SIZE 512
/**
 * Block-level sum reduction.
 *
 * Each block loads a segment of 2*blockDim.x consecutive floats from `in`
 * into shared memory (zero-padding past `size`), reduces them with a
 * shared-memory tree, and thread 0 writes the block's partial sum to
 * out[blockIdx.x].
 *
 * Preconditions:
 *  - Launch with blockDim.x == BLOCK_SIZE (a power of two); the shared
 *    array is statically sized to 2*BLOCK_SIZE.
 *  - gridDim.x >= ceil(size / (2*blockDim.x)) so the whole input is covered.
 *  - `out` has room for gridDim.x partial sums; the host (or a second
 *    kernel pass) finishes the reduction across blocks.
 */
__global__ void reduction(float *out, float *in, unsigned size)
{
    __shared__ float partialSum[2 * BLOCK_SIZE];

    unsigned int t = threadIdx.x;
    // Each block consumes a 2*blockDim.x-wide segment of the input.
    unsigned int start = 2 * blockIdx.x * blockDim.x;

    // Each thread loads two elements, zero-padding past the end of the
    // input so the tree below always sums exactly 2*blockDim.x values.
    partialSum[t] = (start + t < size) ? in[start + t] : 0.0f;
    partialSum[blockDim.x + t] =
        (start + blockDim.x + t < size) ? in[start + blockDim.x + t] : 0.0f;

    // Sequential-addressing reduction tree: each step halves the active
    // range, with active threads contiguous (t < stride). This avoids the
    // per-iteration modulo and the heavy warp divergence of the
    // interleaved (t % stride == 0) formulation while computing the same
    // sum. The barrier sits outside the divergent branch so every thread
    // in the block reaches it.
    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1)
    {
        __syncthreads();
        if (t < stride)
            partialSum[t] += partialSum[t + stride];
    }
    __syncthreads();

    // Only one thread publishes the result; having every thread store to
    // out[blockIdx.x] (as before) is a same-value write race on global
    // memory and wastes bandwidth.
    if (t == 0)
        out[blockIdx.x] = partialSum[0];
}