diff --git a/intercept/src/controls.h b/intercept/src/controls.h index 70476a53..0035b770 100644 --- a/intercept/src/controls.h +++ b/intercept/src/controls.h @@ -44,6 +44,7 @@ CLI_CONTROL( bool, AppendPid, false, "If s CLI_CONTROL( bool, KernelNameHashTracking, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will append the program and build option hashes to the kernel name in logs and reports." ) CLI_CONTROL( cl_uint, LongKernelNameCutoff, UINT_MAX, "If an OpenCL application uses kernels with very long names, the Intercept Layer for OpenCL Applications can substitute a \"short\" kernel identifier for a \"long\" kernel name in logs and reports. This control defines how long a kernel name must be (in characters) before it is replaced by a \"short\" kernel identifier." ) CLI_CONTROL( bool, DemangleKernelNames, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will track kernel names that are demangled according to C++ ABI rules. This setting requires compiler support for demangling and may not be available in all configurations." ) +CLI_CONTROL( bool, DetectNaNs, false, "If set, the Intercept Layer for OpenCL Applications will detect NaNs appearing in fp buffers and fp images and log the first kernel (name and enqueue counter) which caused these NaNs." ) CLI_CONTROL_SEPARATOR( Reporting Controls: ) CLI_CONTROL( bool, ReportToStderr, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will emit reports to stderr." ) diff --git a/intercept/src/dispatch.cpp b/intercept/src/dispatch.cpp index 7cf543bf..5ef8bb17 100644 --- a/intercept/src/dispatch.cpp +++ b/intercept/src/dispatch.cpp @@ -4817,8 +4817,9 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueNDRangeKernel)( INCREMENT_ENQUEUE_COUNTER(); DUMP_BUFFERS_BEFORE_ENQUEUE( kernel, command_queue ); - DUMP_REPLAYABLE_KERNEL( kernel, command_queue, work_dim, global_work_offset, global_work_size, local_work_size ); DUMP_IMAGES_BEFORE_ENQUEUE( kernel, command_queue ); + DUMP_REPLAYABLE_KERNEL( kernel, command_queue, work_dim, global_work_offset, global_work_size, local_work_size ); + CHECK_FOR_NANS( "Before", kernel, command_queue, work_dim, global_work_size ); CHECK_AUBCAPTURE_START_KERNEL( kernel, work_dim, @@ -4926,6 +4927,7 @@ CL_API_ENTRY cl_int CL_API_CALL CLIRN(clEnqueueNDRangeKernel)( DUMP_BUFFERS_AFTER_ENQUEUE( kernel, command_queue ); DUMP_IMAGES_AFTER_ENQUEUE( kernel, command_queue ); + CHECK_FOR_NANS( "After", kernel, command_queue, work_dim, global_work_size ); FINISH_OR_FLUSH_AFTER_ENQUEUE( command_queue ); CHECK_AUBCAPTURE_STOP( command_queue ); diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index ca1370bc..1fbc56a8 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -12,6 +12,7 @@ #include #include #include // strdate +#include #include "common.h" #include "demangle.h" @@ -7455,13 +7456,18 @@ void CLIntercept::dumpKernelInfo( for ( cl_uint idx = 0; idx != numArgs; ++idx ) { size_t argNameSize = 0; - dispatch().clGetKernelArgInfo(kernel, idx, CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, &argNameSize); + int error = dispatch().clGetKernelArgInfo(kernel, idx, CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, &argNameSize); + if ( error != CL_SUCCESS || argNameSize == 0 ) + { + log( "Note: Kernel Argument info not available for replaying.\n" ); + return; + } std::string argName(argNameSize, ' '); - int error = dispatch().clGetKernelArgInfo(kernel, idx, CL_KERNEL_ARG_TYPE_NAME, argNameSize, &argName, nullptr); - if ( error == CL_KERNEL_ARG_INFO_NOT_AVAILABLE ) + error = dispatch().clGetKernelArgInfo(kernel, idx, CL_KERNEL_ARG_TYPE_NAME, argNameSize, &argName[0], nullptr); + if ( error != CL_SUCCESS ) { - log("Note: Kernel Argument info not available for replaying.\n"); + log( "Note: Kernel Argument info not available for replaying.\n" ); return; } outputArgTypes << argName << '\n'; @@ -7519,6 +7525,232 @@ void CLIntercept::dumpArgumentsForKernel( } } +void CLIntercept::detectNaNs( + std::string when, + cl_kernel kernel, + const uint64_t enqueueCounter, + cl_command_queue command_queue, + size_t work_dim, + size_t const* gws ) +{ + std::lock_guard lock(m_Mutex); + + cl_platform_id platform = getPlatform(kernel); + + std::vector transferBuf; + std::string fileNamePrefix = ""; + OS().GetDumpDirectoryName( sc_DumpDirectoryName, fileNamePrefix ); + fileNamePrefix += "/NaN_checker_" + std::to_string(m_ProcessId) + ".txt"; + + std::ofstream output{fileNamePrefix, std::ios_base::app}; + + // for all buffers & images + // See if we have NaNs + CKernelArgMemMap& kernelArgMemMap = m_KernelArgMap[ kernel ]; + CKernelArgMemMap::iterator idx = kernelArgMemMap.begin(); + while( idx != kernelArgMemMap.end() ) + { + cl_uint arg_index = (*idx).first; + void* allocation = (void*)(*idx).second; + cl_mem memobj = (cl_mem)allocation; + ++idx; + + // check if we have a buffer + if( ( m_USMAllocInfoMap.find( allocation ) != m_USMAllocInfoMap.end() ) || + ( m_SVMAllocInfoMap.find( allocation ) != m_SVMAllocInfoMap.end() ) || + ( m_BufferInfoMap.find( memobj ) != m_BufferInfoMap.end() ) ) + { + void* bufferPtr = nullptr; + size_t bufferSize = 0; + + // check if the type is a floating point number + size_t argNameSize = 0; + dispatch().clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, &argNameSize); + + std::string argType(' ', static_cast(argNameSize)); + dispatch().clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_TYPE_NAME, argNameSize, &argType[0], nullptr); + + if( argType.find("float") != std::string::npos && + argType.find("double") != std::string::npos ) + continue; + + if( m_USMAllocInfoMap.find( allocation ) != m_USMAllocInfoMap.end() ) + { + size_t size = m_USMAllocInfoMap[ allocation ]; + + if( dispatchX(platform).clEnqueueMemcpyINTEL == NULL ) + { + getExtensionFunctionAddress( + platform, + "clEnqueueMemcpyINTEL" ); + } + if( transferBuf.size() < size ) + { + transferBuf.resize(size); + } + + const auto& dispatchX = this->dispatchX(platform); + if( dispatchX.clEnqueueMemcpyINTEL && + transferBuf.size() >= size ) + { + cl_int error = dispatchX.clEnqueueMemcpyINTEL( + command_queue, + CL_TRUE, + transferBuf.data(), + allocation, + size, + 0, + nullptr, + nullptr ); + + if( error == CL_SUCCESS ) + { + bufferPtr = transferBuf.data(); + bufferSize = size; + } + } + } + else if( m_SVMAllocInfoMap.find( allocation ) != m_SVMAllocInfoMap.end() ) + { + size_t size = m_SVMAllocInfoMap[ allocation ]; + cl_int error = dispatch().clEnqueueSVMMap( + command_queue, + CL_TRUE, + CL_MAP_READ, + allocation, + size, + 0, + nullptr, + nullptr ); + if( error == CL_SUCCESS ) + { + bufferPtr = allocation; + bufferSize = size; + dispatch().clEnqueueSVMUnmap( + command_queue, + allocation, + 0, + nullptr, + nullptr ); + } + } + else if( m_BufferInfoMap.find( memobj ) != m_BufferInfoMap.end() ) + { + size_t size = m_BufferInfoMap[ memobj ]; + + cl_int error = CL_SUCCESS; + bufferPtr = dispatch().clEnqueueMapBuffer( + command_queue, + memobj, + CL_TRUE, + CL_MAP_READ, + 0, + size, + 0, + nullptr, + nullptr, + &error ); + if( error == CL_SUCCESS ) + { + bufferSize = size; + dispatch().clEnqueueUnmapMemObject( + command_queue, + memobj, + bufferPtr, + 0, + nullptr, + nullptr ); + } + } + if (bufferPtr == nullptr) + continue; + + bool foundNaN = false; + if( argType.find("float") != std::string::npos ) + { + for( unsigned idx = 0; idx < bufferSize / sizeof(float); idx += sizeof(float) ) + { + if( std::isnan( reinterpret_cast(bufferPtr)[idx] )) + { + foundNaN = true; + break; + } + } + } else + { + for( unsigned idx = 0; idx < bufferSize / sizeof(double); idx += sizeof(double) ) + { + if( std::isnan( reinterpret_cast(bufferPtr)[idx] )) + { + foundNaN = true; + break; + } + } + } + if( foundNaN ) + { + std::string tmp = when + + " kernel: " + getShortKernelName( kernel ) + + ", EnqueueCtr: " + std::to_string( enqueueCounter ) + + ", arg_index: " + std::to_string( arg_index ) + + ", data type: " + argType.c_str() + + ", has a NaN.\n"; + output << tmp; + } + continue; + } + // Not a buffer, should be an image + if( m_ImageInfoMap.find( memobj ) != m_ImageInfoMap.end() ) + { + const SImageInfo& info = m_ImageInfoMap[ memobj ]; + if( info.Format.image_channel_data_type == CL_FLOAT ) + { + size_t size = + info.Region[0] * + info.Region[1] * + info.Region[2] * + info.ElementSize; + std::vector readImageData(size); + + if( readImageData.data() ) + { + size_t origin[3] = { 0, 0, 0 }; + cl_int error = dispatch().clEnqueueReadImage( + command_queue, + memobj, + CL_TRUE, + origin, + info.Region, + 0, + 0, + readImageData.data(), + 0, + nullptr, + nullptr ); + + if( error == CL_SUCCESS ) + { + for( unsigned idx = 0; idx < size / sizeof( float ); idx += sizeof( float ) ) + { + if( std::isnan( reinterpret_cast(readImageData.data())[idx] )) + { + std::string tmp = when + + " kernel: " + getShortKernelName( kernel ) + + ", EnqueueCtr: " + std::to_string( enqueueCounter ) + + ", arg_index: " + std::to_string( arg_index ) + + ", data type: CL_FLOAT" + + ", has a NaN.\n"; + output << tmp; + break; + } + } + } + } + } + } + } +} + /////////////////////////////////////////////////////////////////////////////// // void CLIntercept::dumpBuffersForKernel( diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index f9f610bc..b0900312 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -954,6 +954,13 @@ class CLIntercept cl_kernel kernel, cl_uint arg_index, std::string const& sampler ); + void detectNaNs( + std::string when, + cl_kernel kernel, + uint64_t enqueueCounter, + cl_command_queue queue, + size_t work_dim, + size_t const* gws ); private: static const char* sc_URL; @@ -1251,6 +1258,8 @@ class CLIntercept typedef std::map CSamplerKernelArgMap; CSamplerKernelArgMap m_samplerKernelArgMap; + std::vector m_NaNInfoVector; + struct SMapPointerInfo { cl_map_flags Flags; @@ -2265,6 +2274,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) ) \ { \ pIntercept->addBuffer( _buffer ); \ @@ -2275,7 +2285,8 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpImagesBeforeEnqueue || \ pIntercept->config().DumpImagesAfterEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ - ( pIntercept->config().DumpReplayKernelName != "" ) ) ) \ + ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs ) ) \ { \ pIntercept->addImage( _image ); \ } @@ -2290,6 +2301,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpReplayKernelName != "" ) || \ pIntercept->config().DumpBuffersAfterEnqueue || \ pIntercept->config().DumpImagesBeforeEnqueue || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpImagesAfterEnqueue ) ) \ { \ pIntercept->checkRemoveMemObj( _memobj ); \ @@ -2318,6 +2330,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) ) \ { \ pIntercept->addSVMAllocation( svmPtr, size ); \ @@ -2328,6 +2341,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpBuffersBeforeEnqueue || \ pIntercept->config().DumpBuffersAfterEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ + pIntercept->config().DetectNaNs || \ ( pIntercept->config().DumpReplayKernelName != "" ) ) ) \ { \ pIntercept->removeSVMAllocation( svmPtr ); \ @@ -2338,6 +2352,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) ) \ { \ pIntercept->addUSMAllocation( usmPtr, size ); \ @@ -2348,6 +2363,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) ( pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) ) \ { \ pIntercept->removeUSMAllocation( usmPtr ); \ @@ -2369,6 +2385,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) } \ if( ( pIntercept->config().DumpBuffersBeforeEnqueue || \ pIntercept->config().DumpBuffersAfterEnqueue || \ + pIntercept->config().DetectNaNs || \ (pIntercept->config().DumpReplayKernelEnqueue != -1) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ pIntercept->config().DumpImagesBeforeEnqueue || \ @@ -2381,6 +2398,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) } \ if ( pIntercept->config().DumpBuffersBeforeEnqueue || \ pIntercept->config().DumpBuffersAfterEnqueue || \ + pIntercept->config().DetectNaNs || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ pIntercept->config().DumpImagesBeforeEnqueue || \ @@ -2393,6 +2411,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) if( pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) \ { \ pIntercept->setKernelArgSVMPointer( kernel, arg_index, arg_value ); \ @@ -2402,6 +2421,7 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) if( pIntercept->config().DumpBuffersBeforeEnqueue || \ ( pIntercept->config().DumpReplayKernelEnqueue != -1 ) || \ ( pIntercept->config().DumpReplayKernelName != "" ) || \ + pIntercept->config().DetectNaNs || \ pIntercept->config().DumpBuffersAfterEnqueue ) \ { \ pIntercept->setKernelArgUSMPointer( kernel, arg_index, arg_value ); \ @@ -2503,6 +2523,12 @@ inline bool CLIntercept::checkDumpByName( cl_kernel kernel ) pIntercept->dumpArgumentsForKernel(kernel, enqueueCounter, pIntercept->config().DumpReplayKernelName != ""); \ } +#define CHECK_FOR_NANS( when, kernel, command_queue, work_dim, gws ) \ + if ( pIntercept->config().DetectNaNs ) \ + { \ + pIntercept->detectNaNs( when, kernel, enqueueCounter, command_queue, work_dim, gws ); \ + } + #define DUMP_IMAGES_BEFORE_ENQUEUE( kernel, command_queue ) \ if( pIntercept->config().DumpImagesBeforeEnqueue && \ pIntercept->checkDumpImageEnqueueLimits( enqueueCounter ) && \ @@ -2858,6 +2884,13 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( "-cl-intel-greater-than-4GB-buffer-required", \ _options, \ _newOptions ); \ + } \ + if( pIntercept->config().DetectNaNs) \ + { \ + pIntercept->appendBuildOptions( \ + "-cl-kernel-arg-info", \ + _options, \ + _newOptions); \ } #define DUMP_OUTPUT_PROGRAM_BINARIES( program ) \