[nvptx-run] Add --verbose/-v #27

Open · wants to merge 2 commits into master

nvptx-run.c: 119 changes (103 additions & 16 deletions)

@@ -24,6 +24,45 @@

#include "version.h"

static int verbose = 0;

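/* Print VAL to F in human-readable form, scaled by factors of 1024 to
   the largest unit that fits; e.g. 1536 is printed as "1.50 KiB".  */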
static void
print_hr (FILE *f, size_t val)
{
const char * units[] = { "B", "KiB", "MiB", "GiB" };
unsigned count = 0;
size_t rem = 0;
/* Stop scaling at the last unit so units[count] stays in bounds. */
while (val >= 1024
&& count < sizeof (units) / sizeof (units[0]) - 1)
{
rem = val % 1024;
val = val / 1024;
count++;
}
/* Scale remainder to double in [0, 1). */
double fraction = (double)rem / 1024;
/* Scale remainder to int in [0, 100). */
unsigned int int_fraction = (unsigned)(fraction * 100);
fprintf (f, "%zu.%02u %s", val, int_fraction, units[count]);
}

static void
print_val (FILE *f, const char * p, size_t val)
{
fprintf (f, "%s: %zu (", p, val);
print_hr (f, val);
fprintf (f, ")\n");
}

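/* Like print_val, but print only in verbose mode.  */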
static void
report_val (FILE *f, const char * p, size_t val)
{
if (!verbose)
return;

print_val (f, p, val);
}

/* On systems where installed NVIDIA driver is newer than CUDA Toolkit,
libcuda.so may have these functions even though <cuda.h> does not. */

@@ -147,6 +186,7 @@ static const struct option long_options[] =
{ "debuginfo", no_argument, 0, 'G' },
{ "help", no_argument, 0, 'h' },
{ "version", no_argument, 0, 'V' },
{ "verbose", no_argument, 0, 'v' },
{ 0, 0, 0, 0 }
};

@@ -155,7 +195,7 @@ main (argc, char **argv)
{
int o;
long stack_size = 0, heap_size = 256 * 1024 * 1024, num_lanes = 1;
while ((o = getopt_long (argc, argv, "S:H:L:O:gGhV", long_options, 0)) != -1)
while ((o = getopt_long (argc, argv, "+S:H:L:O:gGhVv", long_options, 0)) != -1)
{
switch (o)
{
@@ -195,6 +235,7 @@ Options:\n\
-O, --optlevel N Pass PTX JIT option to set optimization level N\n\
-g, --lineinfo Pass PTX JIT option to generate line information\n\
-G, --debuginfo Pass PTX JIT option to generate debug information\n\
-v, --verbose Run in verbose mode\n\
--help Print this help and exit\n\
--version Print version number and exit\n\
\n\
@@ -211,6 +252,9 @@ the GNU General Public License version 3 or later.\n\
This program has absolutely no warranty.\n",
PKGVERSION, NVPTX_TOOLS_VERSION, "2015");
exit (0);
case 'v':
verbose = 1;
break;
default:
break;
}
@@ -235,6 +279,27 @@ This program has absolutely no warranty.\n",
r = cuCtxCreate (&ctx, 0, dev);
fatal_unless_success (r, "cuCtxCreate failed");

size_t mem;
if (!stack_size || verbose)
{
r = cuDeviceTotalMem (&mem, dev);
fatal_unless_success (r, "could not get available memory");
report_val (stderr, "Total device memory", mem);
}

size_t free_mem;
size_t dummy;
Comment on lines +289 to +291 (Member):

Should dummy move inside the if (verbose)?
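
One way to read that suggestion, as a sketch (not the PR author's code): since the value stored in dummy is never reused across blocks, each verbose-only block could declare its own local, e.g.:

    if (verbose)
      {
        /* cuMemGetInfo requires an out-parameter for total memory;
           the value itself is unused here.  */
        size_t dummy;

        /* Set stack size limit to 0 to get more accurate free_mem.  */
        r = cuCtxSetLimit (CU_LIMIT_STACK_SIZE, 0);
        fatal_unless_success (r, "could not set stack limit");

        r = cuMemGetInfo (&free_mem, &dummy);
        fatal_unless_success (r, "could not get free memory");
        report_val (stderr, "Initial free device memory", free_mem);
      }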

if (verbose)
{
/* Set stack size limit to 0 to get more accurate free_mem. */
r = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 0);
Comment on lines +294 to +295 (Member):

From cuCtxSetLimit (https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g0651954dfb9788173e60a9af7201e65a), I can't easily tell the rationale here.

So, should we add more commentary for this, or point to an external URL if that makes sense?
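
If the rationale is the one it appears to be (an assumption on my part, not verified against the driver documentation), a comment along these lines might be enough:

    /* Assumption: the driver reserves device memory for thread stacks
       according to the current CU_LIMIT_STACK_SIZE.  Dropping the limit
       to 0 first releases that reservation, so the cuMemGetInfo call
       below reports what is genuinely available to the program.  */
    r = cuCtxSetLimit (CU_LIMIT_STACK_SIZE, 0);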

fatal_unless_success (r, "could not set stack limit");

r = cuMemGetInfo (&free_mem, &dummy);
Comment (Member):

Actually, doesn't dummy here (when given a better name) make the earlier cuDeviceTotalMem call obsolete?

Or is the distinction between the total memory available for allocation by the CUDA context and the total memory available on the device intentional?
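
If the two totals are in fact the same quantity, the second out-parameter of cuMemGetInfo could replace the earlier cuDeviceTotalMem call, roughly like this (a sketch; whether the context total equals the device total is exactly the open question):

    size_t free_mem, total_mem;
    r = cuMemGetInfo (&free_mem, &total_mem);
    fatal_unless_success (r, "could not get free memory");
    report_val (stderr, "Total device memory", total_mem);
    report_val (stderr, "Initial free device memory", free_mem);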

fatal_unless_success (r, "could not get free memory");
report_val (stderr, "Initial free device memory", free_mem);
}

CUdeviceptr d_retval;
r = cuMemAlloc(&d_retval, sizeof (int));
fatal_unless_success (r, "cuMemAlloc failed");
@@ -263,29 +328,34 @@ This program has absolutely no warranty.\n",
}
}

#if 0
/* Default seems to be 1 KiB stack, 8 MiB heap. */
size_t stack, heap;
cuCtxGetLimit (&stack, CU_LIMIT_STACK_SIZE);
cuCtxGetLimit (&heap, CU_LIMIT_MALLOC_HEAP_SIZE);
printf ("stack %ld heap %ld\n", stack, heap);
#endif
if (verbose)
{
size_t free_mem_update;
r = cuMemGetInfo (&free_mem_update, &dummy);
fatal_unless_success (r, "could not get free memory");
report_val (stderr, "Program args reservation (effective)",
free_mem - free_mem_update);
Comment on lines +333 to +337 (Member):

Doesn't this difference computation implicitly assume that nothing else is using the GPU concurrently? (Which is a wrong assumption?) Or, does every process/CUDA context always have available all the GPU memory -- I don't remember the details, and have not yet looked that up.

free_mem = free_mem_update;
}

if (!stack_size)
int sm_count, thread_max;
if (!stack_size || verbose)
{
/* It appears that CUDA driver sometimes accounts memory as if stacks
were reserved for the maximum number of threads the device can host,
even if only a few are launched. Compute the default accordingly. */
int sm_count, thread_max;
r = cuDeviceGetAttribute (&sm_count,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
fatal_unless_success (r, "could not get SM count");

r = cuDeviceGetAttribute
(&thread_max, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
fatal_unless_success (r, "could not get max threads per SM count");
size_t mem;
r = cuDeviceTotalMem (&mem, dev);
fatal_unless_success (r, "could not get available memory");
}

if (!stack_size)
{
/* It appears that CUDA driver sometimes accounts memory as if stacks
were reserved for the maximum number of threads the device can host,
even if only a few are launched. Compute the default accordingly. */

/* Subtract heap size and a 128 MiB extra. */
mem -= heap_size + 128 * 1024 * 1024;
mem /= sm_count * thread_max;
@@ -295,10 +365,27 @@ This program has absolutely no warranty.\n",
/* Round down to 8-byte boundary. */
stack_size = mem & -8u;
}

r = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, stack_size);
fatal_unless_success (r, "could not set stack limit");
report_val (stderr, "Set stack size limit", stack_size);

if (verbose)
{
report_val (stderr, "Stack size limit reservation (estimated)",
stack_size * sm_count * thread_max);
size_t free_mem_update;
r = cuMemGetInfo (&free_mem_update, &dummy);
fatal_unless_success (r, "could not get free memory");
report_val (stderr, "Stack size limit reservation (effective)",
free_mem - free_mem_update);
Comment on lines +377 to +381 (Member):

Same concern as above.

free_mem = free_mem_update;
report_val (stderr, "Free device memory", free_mem);
}

r = cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, heap_size);
fatal_unless_success (r, "could not set heap limit");
report_val (stderr, "Set heap size limit", heap_size);

CUmodule hModule = 0;
CUfunction hKernel = 0;