Skip to content

Commit

Permalink
Merge pull request #284 from MiloLurati/hip-local-memory-error-handleing
Browse files Browse the repository at this point in the history
Hip local memory error handling
  • Loading branch information
benvanwerkhoven authored Dec 13, 2024
2 parents 083a3ee + cbdd0a8 commit ac05da3
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 8 deletions.
25 changes: 21 additions & 4 deletions kernel_tuner/backends/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,23 @@ def compile(self, kernel_instance):
if platform.system() == "Darwin":
lib_extension = ".dylib"

subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
subprocess.check_call(
subprocess.run(
[self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)

subprocess.run(
[self.compiler, filename + ".o"]
+ compiler_options
+ ["-shared", "-o", filename + lib_extension]
+ lib_args
+ lib_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True
)

self.lib = np.ctypeslib.load_library(filename, ".")
Expand Down Expand Up @@ -396,10 +407,16 @@ def memcpy_htod(self, dest, src):

def cleanup_lib(self):
    """Unload the shared library loaded by a previous compile, if any.

    Safe to call repeatedly: once the library has been released (or was
    never loaded) the method is a no-op. Libraries built with OpenMP or
    OpenACC are deliberately left loaded, because dlclose-ing them is a
    well-known cause of core dumps.
    """
    if self.lib is None:
        # nothing loaded (or already cleaned up) -- nothing to do
        return

    if self.using_openmp or self.using_openacc:
        # skip dlclose: unloading OpenMP-enabled shared objects crashes
        return

    logging.debug("unloading shared library")
    try:
        _ctypes.dlclose(self.lib._handle)
    finally:
        # drop the reference even if dlclose raised, so a later call
        # does not attempt to close the same handle twice
        self.lib = None

units = {}
6 changes: 4 additions & 2 deletions kernel_tuner/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose):
shared_mem_error_messages = [
"uses too much shared data",
"local memory limit exceeded",
r"local memory \(\d+\) exceeds limit \(\d+\)",
]
if any(msg in str(e) for msg in shared_mem_error_messages):
error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
logging.debug(
"compile_kernel failed due to kernel using too much shared memory"
)
Expand Down Expand Up @@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
)

# check for templated kernel
if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
kernel_string, name = wrap_templated_kernel(kernel_string, name)

# Preprocess GPU arguments. Require for handling `Tunable` arguments
Expand Down
4 changes: 2 additions & 2 deletions test/test_compiler_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,11 @@ def test_compile_detects_device_code(npct, subprocess):
cfunc = CompilerFunctions()
cfunc.compile(kernel_instance)

print(subprocess.check_call.call_args_list)
print(subprocess.run.call_args_list)

# assert the filename suffix used for source compilation is .cu
dot_cu_used = False
for call in subprocess.check_call.call_args_list:
for call in subprocess.run.call_args_list:
args, kwargs = call
args = args[0]
print(args)
Expand Down

0 comments on commit ac05da3

Please sign in to comment.