-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[nvptx-run] Add --verbose/-v #27
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,45 @@ | |
|
||
#include "version.h" | ||
|
||
static int verbose = 0; | ||
|
||
static void | ||
print_hr (FILE *f, size_t val) | ||
{ | ||
const char * units[] = { "B", "KiB", "MiB", "GiB" }; | ||
unsigned count = 0; | ||
size_t rem = 0; | ||
while (val >= 1024 | ||
&& count < sizeof (units) / sizeof (units[0])) | ||
{ | ||
rem = val % 1024; | ||
val = val / 1024; | ||
count++; | ||
} | ||
/* Scale remainder to double in [0, 1]. */ | ||
double fraction = (double)rem / 1024; | ||
/* Scale remainder to int in [0, 100]. */ | ||
unsigned int int_fraction = (unsigned)(fraction * 100); | ||
fprintf (f, "%zu.%02u %s", val, int_fraction, units[count]); | ||
} | ||
|
||
static void | ||
print_val (FILE *f, const char * p, size_t val) | ||
{ | ||
fprintf (f, "%s: %zu (", p, val); | ||
print_hr (f, val); | ||
fprintf (f, ")\n"); | ||
} | ||
|
||
static void | ||
report_val (FILE *f, const char * p, size_t val) | ||
{ | ||
if (!verbose) | ||
return; | ||
|
||
print_val (f, p, val); | ||
} | ||
|
||
/* On systems where installed NVIDIA driver is newer than CUDA Toolkit, | ||
libcuda.so may have these functions even though <cuda.h> does not. */ | ||
|
||
|
@@ -147,6 +186,7 @@ static const struct option long_options[] = | |
{ "debuginfo", no_argument, 0, 'G' }, | ||
{ "help", no_argument, 0, 'h' }, | ||
{ "version", no_argument, 0, 'V' }, | ||
{ "verbose", no_argument, 0, 'v' }, | ||
{ 0, 0, 0, 0 } | ||
}; | ||
|
||
|
@@ -155,7 +195,7 @@ main (int argc, char **argv) | |
{ | ||
int o; | ||
long stack_size = 0, heap_size = 256 * 1024 * 1024, num_lanes = 1; | ||
while ((o = getopt_long (argc, argv, "S:H:L:O:gGhV", long_options, 0)) != -1) | ||
while ((o = getopt_long (argc, argv, "+S:H:L:O:gGhVv", long_options, 0)) != -1) | ||
{ | ||
switch (o) | ||
{ | ||
|
@@ -195,6 +235,7 @@ Options:\n\ | |
-O, --optlevel N Pass PTX JIT option to set optimization level N\n\ | ||
-g, --lineinfo Pass PTX JIT option to generate line information\n\ | ||
-G, --debuginfo Pass PTX JIT option to generate debug information\n\ | ||
-v, --verbose Run in verbose mode\n\ | ||
--help Print this help and exit\n\ | ||
--version Print version number and exit\n\ | ||
\n\ | ||
|
@@ -211,6 +252,9 @@ the GNU General Public License version 3 or later.\n\ | |
This program has absolutely no warranty.\n", | ||
PKGVERSION, NVPTX_TOOLS_VERSION, "2015"); | ||
exit (0); | ||
case 'v': | ||
verbose = 1; | ||
break; | ||
default: | ||
break; | ||
} | ||
|
@@ -235,6 +279,27 @@ This program has absolutely no warranty.\n", | |
r = cuCtxCreate (&ctx, 0, dev); | ||
fatal_unless_success (r, "cuCtxCreate failed"); | ||
|
||
size_t mem; | ||
if (!stack_size || verbose) | ||
{ | ||
r = cuDeviceTotalMem (&mem, dev); | ||
fatal_unless_success (r, "could not get available memory"); | ||
report_val (stderr, "Total device memory", mem); | ||
} | ||
|
||
size_t free_mem; | ||
size_t dummy; | ||
if (verbose) | ||
{ | ||
/* Set stack size limit to 0 to get more accurate free_mem. */ | ||
r = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 0); | ||
Comment on lines
+294
to
+295
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From So, should we add more commentary for this, or point to an external URL if that makes sense? |
||
fatal_unless_success (r, "could not set stack limit"); | ||
|
||
r = cuMemGetInfo (&free_mem, &dummy); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, doesn't
Or, is total amount of memory available for allocation by the CUDA context vs. total amount of memory available on the device intentional? |
||
fatal_unless_success (r, "could not get free memory"); | ||
report_val (stderr, "Initial free device memory", free_mem); | ||
} | ||
|
||
CUdeviceptr d_retval; | ||
r = cuMemAlloc(&d_retval, sizeof (int)); | ||
fatal_unless_success (r, "cuMemAlloc failed"); | ||
|
@@ -263,29 +328,34 @@ This program has absolutely no warranty.\n", | |
} | ||
} | ||
|
||
#if 0 | ||
/* Default seems to be 1 KiB stack, 8 MiB heap. */ | ||
size_t stack, heap; | ||
cuCtxGetLimit (&stack, CU_LIMIT_STACK_SIZE); | ||
cuCtxGetLimit (&heap, CU_LIMIT_MALLOC_HEAP_SIZE); | ||
printf ("stack %ld heap %ld\n", stack, heap); | ||
#endif | ||
if (verbose) | ||
{ | ||
size_t free_mem_update; | ||
r = cuMemGetInfo (&free_mem_update, &dummy); | ||
fatal_unless_success (r, "could not get free memory"); | ||
report_val (stderr, "Program args reservation (effective)", | ||
free_mem - free_mem_update); | ||
Comment on lines
+333
to
+337
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this difference computation implicitly assume that nothing else is using the GPU concurrently? (Which is a wrong assumption?) Or, does every process/CUDA context always have available all the GPU memory -- I don't remember the details, and have not yet looked that up. |
||
free_mem = free_mem_update; | ||
} | ||
|
||
if (!stack_size) | ||
int sm_count, thread_max; | ||
if (!stack_size || verbose) | ||
{ | ||
/* It appears that CUDA driver sometimes accounts memory as if stacks | ||
were reserved for the maximum number of threads the device can host, | ||
even if only a few are launched. Compute the default accordingly. */ | ||
int sm_count, thread_max; | ||
r = cuDeviceGetAttribute (&sm_count, | ||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); | ||
fatal_unless_success (r, "could not get SM count"); | ||
|
||
r = cuDeviceGetAttribute | ||
(&thread_max, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); | ||
fatal_unless_success (r, "could not get max threads per SM count"); | ||
size_t mem; | ||
r = cuDeviceTotalMem (&mem, dev); | ||
fatal_unless_success (r, "could not get available memory"); | ||
} | ||
|
||
if (!stack_size) | ||
{ | ||
/* It appears that CUDA driver sometimes accounts memory as if stacks | ||
were reserved for the maximum number of threads the device can host, | ||
even if only a few are launched. Compute the default accordingly. */ | ||
|
||
/* Subtract heap size and a 128 MiB extra. */ | ||
mem -= heap_size + 128 * 1024 * 1024; | ||
mem /= sm_count * thread_max; | ||
|
@@ -295,10 +365,27 @@ This program has absolutely no warranty.\n", | |
/* Round down to 8-byte boundary. */ | ||
stack_size = mem & -8u; | ||
} | ||
|
||
r = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, stack_size); | ||
fatal_unless_success (r, "could not set stack limit"); | ||
report_val (stderr, "Set stack size limit", stack_size); | ||
|
||
if (verbose) | ||
{ | ||
report_val (stderr, "Stack size limit reservation (estimated)", | ||
stack_size * sm_count * thread_max); | ||
size_t free_mem_update; | ||
r = cuMemGetInfo (&free_mem_update, &dummy); | ||
fatal_unless_success (r, "could not get free memory"); | ||
report_val (stderr, "Stack size limit reservation (effective)", | ||
free_mem - free_mem_update); | ||
Comment on lines
+377
to
+381
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same concern as above. |
||
free_mem = free_mem_update; | ||
report_val (stderr, "Free device memory", free_mem); | ||
} | ||
|
||
r = cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, heap_size); | ||
fatal_unless_success (r, "could not set heap limit"); | ||
report_val (stderr, "Set heap size limit", heap_size); | ||
|
||
CUmodule hModule = 0; | ||
CUfunction hKernel = 0; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should
dummy
move inside theif (verbose)
?