diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json
index 2b709384..6deb66e5 100644
--- a/dev/.documenter-siteinfo.json
+++ b/dev/.documenter-siteinfo.json
@@ -1 +1 @@
-{"documenter":{"julia_version":"1.10.5","generation_timestamp":"2024-09-24T11:16:52","documenter_version":"1.7.0"}}
\ No newline at end of file
+{"documenter":{"julia_version":"1.10.5","generation_timestamp":"2024-09-24T22:48:24","documenter_version":"1.7.0"}}
\ No newline at end of file
diff --git a/dev/api/index.html b/dev/api/index.html
index 32da3ed2..78101b59 100644
--- a/dev/api/index.html
+++ b/dev/api/index.html
@@ -3,4 +3,4 @@
   function gtag(){dataLayer.push(arguments);}
   gtag('js', new Date());
   gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash});
-</script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li><a class="tocitem" href="../devices/">Devices</a></li><li><a class="tocitem" href="../streams/">Streams</a></li><li><a class="tocitem" href="../kernel_programming/">Kernel Programming</a></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li><a class="tocitem" href="../execution_control/">Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li class="is-active"><a class="tocitem" href>API Reference</a><ul class="internal"><li><a class="tocitem" href="#Indexing"><span>Indexing</span></a></li><li><a class="tocitem" href="#Synchronization"><span>Synchronization</span></a></li></ul></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a 
class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li class="is-active"><a href>API Reference</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>API Reference</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/api.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="AMDGPU-API-Reference"><a class="docs-heading-anchor" href="#AMDGPU-API-Reference">AMDGPU API Reference</a><a id="AMDGPU-API-Reference-1"></a><a class="docs-heading-anchor-permalink" href="#AMDGPU-API-Reference" title="Permalink"></a></h1><h2 id="Indexing"><a class="docs-heading-anchor" href="#Indexing">Indexing</a><a id="Indexing-1"></a><a class="docs-heading-anchor-permalink" href="#Indexing" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workitemIdx" href="#AMDGPU.Device.workitemIdx"><code>AMDGPU.Device.workitemIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code 
class="language-julia hljs">workitemIdx()::ROCDim3</code></pre><p>Returns the work item index within the work group. See also: <a href="#AMDGPU.Device.threadIdx"><code>threadIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L116-L121">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workgroupIdx" href="#AMDGPU.Device.workgroupIdx"><code>AMDGPU.Device.workgroupIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">workgroupIdx()::ROCDim3</code></pre><p>Returns the work group index. See also: <a href="#AMDGPU.Device.blockIdx"><code>blockIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L124-L129">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workgroupDim" href="#AMDGPU.Device.workgroupDim"><code>AMDGPU.Device.workgroupDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">workgroupDim()::ROCDim3</code></pre><p>Returns the size of each workgroup in workitems. 
See also: <a href="#AMDGPU.Device.blockDim"><code>blockDim</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L132-L137">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.gridItemDim" href="#AMDGPU.Device.gridItemDim"><code>AMDGPU.Device.gridItemDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gridItemDim()::ROCDim3</code></pre><p>Returns the size of the grid in workitems. This behaviour is different from CUDA where <code>gridDim</code> gives the size of the grid in blocks.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L140-L145">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.gridGroupDim" href="#AMDGPU.Device.gridGroupDim"><code>AMDGPU.Device.gridGroupDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gridGroupDim()::ROCDim3</code></pre><p>Returns the size of the grid in workgroups. 
This is equivalent to CUDA&#39;s <code>gridDim</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L148-L153">source</a></section></article><p>Use these functions for compatibility with CUDA.jl.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.threadIdx" href="#AMDGPU.Device.threadIdx"><code>AMDGPU.Device.threadIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">threadIdx()::ROCDim3</code></pre><p>Returns the thread index within the block. See also: <a href="#AMDGPU.Device.workitemIdx"><code>workitemIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L158-L163">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.blockIdx" href="#AMDGPU.Device.blockIdx"><code>AMDGPU.Device.blockIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">blockIdx()::ROCDim3</code></pre><p>Returns the block index within the grid. 
See also: <a href="#AMDGPU.Device.workgroupIdx"><code>workgroupIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L166-L171">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.blockDim" href="#AMDGPU.Device.blockDim"><code>AMDGPU.Device.blockDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">blockDim()::ROCDim3</code></pre><p>Returns the dimensions of the block. See also: <a href="#AMDGPU.Device.workgroupDim"><code>workgroupDim</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/indexing.jl#L174-L179">source</a></section></article><h2 id="Synchronization"><a class="docs-heading-anchor" href="#Synchronization">Synchronization</a><a id="Synchronization-1"></a><a class="docs-heading-anchor-permalink" href="#Synchronization" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup" href="#AMDGPU.Device.sync_workgroup"><code>AMDGPU.Device.sync_workgroup</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup()</code></pre><p>Waits until all wavefronts in a workgroup have reached this call.</p></div><a class="docs-sourcelink" 
target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/synchronization.jl#L1-L5">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_count" href="#AMDGPU.Device.sync_workgroup_count"><code>AMDGPU.Device.sync_workgroup_count</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_count(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/synchronization.jl#L9-L15">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_and" href="#AMDGPU.Device.sync_workgroup_and"><code>AMDGPU.Device.sync_workgroup_and</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_and(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of 
them.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/synchronization.jl#L22-L28">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_or" href="#AMDGPU.Device.sync_workgroup_or"><code>AMDGPU.Device.sync_workgroup_or</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_or(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/synchronization.jl#L35-L41">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../logging/">« Logging</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option 
value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+</script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li><a class="tocitem" href="../devices/">Devices</a></li><li><a class="tocitem" href="../streams/">Streams</a></li><li><a class="tocitem" href="../kernel_programming/">Kernel Programming</a></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li><a class="tocitem" href="../execution_control/">Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li class="is-active"><a class="tocitem" href>API Reference</a><ul class="internal"><li><a class="tocitem" href="#Indexing"><span>Indexing</span></a></li><li><a class="tocitem" href="#Synchronization"><span>Synchronization</span></a></li></ul></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a 
class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li class="is-active"><a href>API Reference</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>API Reference</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/api.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="AMDGPU-API-Reference"><a class="docs-heading-anchor" href="#AMDGPU-API-Reference">AMDGPU API Reference</a><a id="AMDGPU-API-Reference-1"></a><a class="docs-heading-anchor-permalink" href="#AMDGPU-API-Reference" title="Permalink"></a></h1><h2 id="Indexing"><a class="docs-heading-anchor" href="#Indexing">Indexing</a><a id="Indexing-1"></a><a class="docs-heading-anchor-permalink" href="#Indexing" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workitemIdx" href="#AMDGPU.Device.workitemIdx"><code>AMDGPU.Device.workitemIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code 
class="language-julia hljs">workitemIdx()::ROCDim3</code></pre><p>Returns the work item index within the work group. See also: <a href="#AMDGPU.Device.threadIdx"><code>threadIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L116-L121">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workgroupIdx" href="#AMDGPU.Device.workgroupIdx"><code>AMDGPU.Device.workgroupIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">workgroupIdx()::ROCDim3</code></pre><p>Returns the work group index. See also: <a href="#AMDGPU.Device.blockIdx"><code>blockIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L124-L129">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.workgroupDim" href="#AMDGPU.Device.workgroupDim"><code>AMDGPU.Device.workgroupDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">workgroupDim()::ROCDim3</code></pre><p>Returns the size of each workgroup in workitems. 
See also: <a href="#AMDGPU.Device.blockDim"><code>blockDim</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L132-L137">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.gridItemDim" href="#AMDGPU.Device.gridItemDim"><code>AMDGPU.Device.gridItemDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gridItemDim()::ROCDim3</code></pre><p>Returns the size of the grid in workitems. This behaviour is different from CUDA where <code>gridDim</code> gives the size of the grid in blocks.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L140-L145">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.gridGroupDim" href="#AMDGPU.Device.gridGroupDim"><code>AMDGPU.Device.gridGroupDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gridGroupDim()::ROCDim3</code></pre><p>Returns the size of the grid in workgroups. 
This is equivalent to CUDA&#39;s <code>gridDim</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L148-L153">source</a></section></article><p>Use these functions for compatibility with CUDA.jl.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.threadIdx" href="#AMDGPU.Device.threadIdx"><code>AMDGPU.Device.threadIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">threadIdx()::ROCDim3</code></pre><p>Returns the thread index within the block. See also: <a href="#AMDGPU.Device.workitemIdx"><code>workitemIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L158-L163">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.blockIdx" href="#AMDGPU.Device.blockIdx"><code>AMDGPU.Device.blockIdx</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">blockIdx()::ROCDim3</code></pre><p>Returns the block index within the grid. 
See also: <a href="#AMDGPU.Device.workgroupIdx"><code>workgroupIdx</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L166-L171">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.blockDim" href="#AMDGPU.Device.blockDim"><code>AMDGPU.Device.blockDim</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">blockDim()::ROCDim3</code></pre><p>Returns the dimensions of the block. See also: <a href="#AMDGPU.Device.workgroupDim"><code>workgroupDim</code></a></p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/indexing.jl#L174-L179">source</a></section></article><h2 id="Synchronization"><a class="docs-heading-anchor" href="#Synchronization">Synchronization</a><a id="Synchronization-1"></a><a class="docs-heading-anchor-permalink" href="#Synchronization" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup" href="#AMDGPU.Device.sync_workgroup"><code>AMDGPU.Device.sync_workgroup</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup()</code></pre><p>Waits until all wavefronts in a workgroup have reached this call.</p></div><a class="docs-sourcelink" 
target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/synchronization.jl#L1-L5">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_count" href="#AMDGPU.Device.sync_workgroup_count"><code>AMDGPU.Device.sync_workgroup_count</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_count(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/synchronization.jl#L9-L15">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_and" href="#AMDGPU.Device.sync_workgroup_and"><code>AMDGPU.Device.sync_workgroup_and</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_and(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of 
them.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/synchronization.jl#L22-L28">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.sync_workgroup_or" href="#AMDGPU.Device.sync_workgroup_or"><code>AMDGPU.Device.sync_workgroup_or</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">sync_workgroup_or(predicate::Cint)::Cint</code></pre><p>Identical to <code>sync_workgroup</code>, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/synchronization.jl#L35-L41">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../logging/">« Logging</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option 
value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/devices/index.html b/dev/devices/index.html
index 2eae16c5..808e6071 100644
--- a/dev/devices/index.html
+++ b/dev/devices/index.html
@@ -6,5 +6,5 @@
 </script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li class="is-active"><a class="tocitem" href>Devices</a><ul class="internal"><li><a class="tocitem" href="#Device-Properties"><span>Device Properties</span></a></li></ul></li><li><a class="tocitem" href="../streams/">Streams</a></li><li><a class="tocitem" href="../kernel_programming/">Kernel Programming</a></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li><a class="tocitem" href="../execution_control/">Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li><a class="tocitem" href="../api/">API Reference</a></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" 
id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li class="is-active"><a href>Devices</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>Devices</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/devices.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="Devices"><a class="docs-heading-anchor" href="#Devices">Devices</a><a id="Devices-1"></a><a class="docs-heading-anchor-permalink" href="#Devices" title="Permalink"></a></h1><p>In AMDGPU, all GPU devices are auto-detected by the runtime, if they&#39;re supported.</p><p>AMDGPU maintains a global default device. The default device is relevant for all kernel and GPUArray operations. If one is not specified via <code>@roc</code> or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch.</p><p>The device bound to a current Julia task is accessible via <a href="#AMDGPU.device"><code>AMDGPU.device</code></a> method. The list of available devices can be queried with <a href="#AMDGPU.HIP.devices"><code>AMDGPU.devices</code></a> method.</p><p>If you have a <code>HIPDevice</code> object, you can also switch the device with <a href="#AMDGPU.device!"><code>AMDGPU.device!</code></a>. 
This will switch it <strong>only within the task it is called from</strong>.</p><pre><code class="language-julia hljs">xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.
 
 AMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device.
-xd2 = AMDPGU.ones(Float32, 16) # On second device.</code></pre><p>Additionally, devices have an associated numeric ID. This value is bounded between <code>1</code> and <code>length(AMDGPU.devices())</code>, and device <code>1</code> is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with <a href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> and changed with <a href="#AMDGPU.device_id!"><code>AMDGPU.device_id!</code></a>.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.devices" href="#AMDGPU.HIP.devices"><code>AMDGPU.HIP.devices</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">devices()</code></pre><p>Get list of all devices.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L107-L111">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device" href="#AMDGPU.device"><code>AMDGPU.device</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device()::HIPDevice</code></pre><p>Get currently active device. 
This device is used when launching kernels via <code>@roc</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L29-L34">source</a></section><section><div><pre><code class="language-julia hljs">device(A::ROCArray) -&gt; HIPDevice</code></pre><p>Return the device associated with the array <code>A</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/array.jl#L26-L30">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device!" href="#AMDGPU.device!"><code>AMDGPU.device!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device!(device::HIPDevice)</code></pre><p>Switch current device being used. This switches only for a task inside which it is called.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L37-L42">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device_id" href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id() -&gt; Int
-device_id(device::HIPDevice) -&gt; Int</code></pre><p>Returns the numerical device ID for <code>device</code> or for the current <code>AMDGPU.device()</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/highlevel.jl#L1-L6">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device_id!" href="#AMDGPU.device_id!"><code>AMDGPU.device_id!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id!(idx::Integer)</code></pre><p>Sets the current device to <code>AMDGPU.devices()[idx]</code>. See <a href="#AMDGPU.device_id"><code>device_id</code></a> for details on the numbering semantics.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/highlevel.jl#L10-L15">source</a></section></article><h2 id="Device-Properties"><a class="docs-heading-anchor" href="#Device-Properties">Device Properties</a><a id="Device-Properties-1"></a><a class="docs-heading-anchor-permalink" href="#Device-Properties" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.name" href="#AMDGPU.HIP.name"><code>AMDGPU.HIP.name</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">name(dev::HIPDevice)::String</code></pre><p>Get name of the 
device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L70-L74">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.wavefrontsize" href="#AMDGPU.HIP.wavefrontsize"><code>AMDGPU.HIP.wavefrontsize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">wavefrontsize(d::HIPDevice)::Cint</code></pre><p>Get size of the wavefront. AMD GPUs support either 32 or 64.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L32-L36">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.gcn_arch" href="#AMDGPU.HIP.gcn_arch"><code>AMDGPU.HIP.gcn_arch</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gcn_arch(d::HIPDevice)::String</code></pre><p>Get GCN architecture for the device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L39-L43">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.device_id" 
href="#AMDGPU.HIP.device_id"><code>AMDGPU.HIP.device_id</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id(d::HIPDevice)</code></pre><p>Zero-based device ID as expected by HIP functions. Differs from <a href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> method by <code>1</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L24-L29">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.properties" href="#AMDGPU.HIP.properties"><code>AMDGPU.HIP.properties</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">properties(dev::HIPDevice)::hipDeviceProp_t</code></pre><p>Get all properties for the device. 
See HIP documentation for <code>hipDeviceProp_t</code> for the meaning of each field.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/device.jl#L81-L86">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../quickstart/">« Quick Start</a><a class="docs-footer-nextpage" href="../streams/">Streams »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+xd2 = AMDGPU.ones(Float32, 16) # On second device.</code></pre><p>Additionally, devices have an associated numeric ID. This value is bounded between <code>1</code> and <code>length(AMDGPU.devices())</code>, and device <code>1</code> is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with <a href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> and changed with <a href="#AMDGPU.device_id!"><code>AMDGPU.device_id!</code></a>.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.devices" href="#AMDGPU.HIP.devices"><code>AMDGPU.HIP.devices</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">devices()</code></pre><p>Get list of all devices.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L107-L111">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device" href="#AMDGPU.device"><code>AMDGPU.device</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device()::HIPDevice</code></pre><p>Get currently active device. 
This device is used when launching kernels via <code>@roc</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L29-L34">source</a></section><section><div><pre><code class="language-julia hljs">device(A::ROCArray) -&gt; HIPDevice</code></pre><p>Return the device associated with the array <code>A</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/array.jl#L26-L30">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device!" href="#AMDGPU.device!"><code>AMDGPU.device!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device!(device::HIPDevice)</code></pre><p>Switch current device being used. This switches only for a task inside which it is called.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L37-L42">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device_id" href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id() -&gt; Int
+device_id(device::HIPDevice) -&gt; Int</code></pre><p>Returns the numerical device ID for <code>device</code> or for the current <code>AMDGPU.device()</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/highlevel.jl#L1-L6">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.device_id!" href="#AMDGPU.device_id!"><code>AMDGPU.device_id!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id!(idx::Integer)</code></pre><p>Sets the current device to <code>AMDGPU.devices()[idx]</code>. See <a href="#AMDGPU.device_id"><code>device_id</code></a> for details on the numbering semantics.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/highlevel.jl#L10-L15">source</a></section></article><h2 id="Device-Properties"><a class="docs-heading-anchor" href="#Device-Properties">Device Properties</a><a id="Device-Properties-1"></a><a class="docs-heading-anchor-permalink" href="#Device-Properties" title="Permalink"></a></h2><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.name" href="#AMDGPU.HIP.name"><code>AMDGPU.HIP.name</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">name(dev::HIPDevice)::String</code></pre><p>Get name of the 
device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L70-L74">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.wavefrontsize" href="#AMDGPU.HIP.wavefrontsize"><code>AMDGPU.HIP.wavefrontsize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">wavefrontsize(d::HIPDevice)::Cint</code></pre><p>Get size of the wavefront. AMD GPUs support either 32 or 64.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L32-L36">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.gcn_arch" href="#AMDGPU.HIP.gcn_arch"><code>AMDGPU.HIP.gcn_arch</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">gcn_arch(d::HIPDevice)::String</code></pre><p>Get GCN architecture for the device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L39-L43">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.device_id" 
href="#AMDGPU.HIP.device_id"><code>AMDGPU.HIP.device_id</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">device_id(d::HIPDevice)</code></pre><p>Zero-based device ID as expected by HIP functions. Differs from <a href="#AMDGPU.device_id"><code>AMDGPU.device_id</code></a> method by <code>1</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L24-L29">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.properties" href="#AMDGPU.HIP.properties"><code>AMDGPU.HIP.properties</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">properties(dev::HIPDevice)::hipDeviceProp_t</code></pre><p>Get all properties for the device. 
See HIP documentation for <code>hipDeviceProp_t</code> for the meaning of each field.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/device.jl#L81-L86">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../quickstart/">« Quick Start</a><a class="docs-footer-nextpage" href="../streams/">Streams »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/exceptions/index.html b/dev/exceptions/index.html
index d40aa583..454d840a 100644
--- a/dev/exceptions/index.html
+++ b/dev/exceptions/index.html
@@ -25,4 +25,4 @@
  [4] synchronize()
    @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154
  [5] top-level scope
-   @ REPL[5]:1</code></pre><p>Kernel-thrown exceptions are thrown during the host synchronization <a href="../streams/#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> or on the next kernel launch.</p><p>Kernels that hit an exception will write its information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../kernel_programming/">« Kernel Programming</a><a class="docs-footer-nextpage" href="../profiling/">Profiling »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. 
Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+   @ REPL[5]:1</code></pre><p>Kernel-thrown exceptions are thrown during the host synchronization <a href="../streams/#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> or on the next kernel launch.</p><p>Kernels that hit an exception will write its information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../kernel_programming/">« Kernel Programming</a><a class="docs-footer-nextpage" href="../profiling/">Profiling »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. 
Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/execution_control/index.html b/dev/execution_control/index.html
index f7250226..25769ece 100644
--- a/dev/execution_control/index.html
+++ b/dev/execution_control/index.html
@@ -3,4 +3,4 @@
   function gtag(){dataLayer.push(arguments);}
   gtag('js', new Date());
   gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash});
-</script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li><a class="tocitem" href="../devices/">Devices</a></li><li><a class="tocitem" href="../streams/">Streams</a></li><li><a class="tocitem" href="../kernel_programming/">Kernel Programming</a></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li class="is-active"><a class="tocitem" href>Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li><a class="tocitem" href="../api/">API Reference</a></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li><a 
class="is-disabled">Intrinsics</a></li><li class="is-active"><a href>Execution Control</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>Execution Control</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/execution_control.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="Execution-Control-and-Intrinsics"><a class="docs-heading-anchor" href="#Execution-Control-and-Intrinsics">Execution Control and Intrinsics</a><a id="Execution-Control-and-Intrinsics-1"></a><a class="docs-heading-anchor-permalink" href="#Execution-Control-and-Intrinsics" title="Permalink"></a></h1><p>GPU execution is similar to CPU execution in some ways, although there are many differences. AMD GPUs have Compute Units (CUs), which can be thought of like CPU cores. Those CUs have (on pre-Navi architectures) 64 &quot;shader processors&quot;, which are essentially the same as CPU SIMD lanes. The lanes in a CU operate in lockstep just like CPU SIMD lanes, and have execution masks and various kinds of SIMD instructions available. CUs execute wavefronts, which are pieces of work split off from a single kernel launch. A single CU can run one out of many wavefronts (one is chosen by the CU scheduler each cycle), which allows for very efficient parallel and concurrent execution on the device. 
Each wavefront runs independently of the other wavefronts, only stopping to synchronize with other wavefronts or terminate when specified by the program.</p><p>We can control wavefront execution through a variety of intrinsics provided by ROCm. For example, the <code>endpgm()</code> intrinsic stops the current wavefront&#39;s execution, and is also automatically inserted by the compiler at the end of each kernel (except in certain unique cases).</p><p><code>signal_completion(x)</code> signals the &quot;kernel doorbell&quot; with the value <code>x</code>, which is the signal checked by the CPU <code>wait</code> call to determine when the kernel has completed. This doorbell is set to <code>0</code> automatically by GPU hardware once the kernel is complete.</p><p><code>sendmsg(x,y=0)</code> and <code>sendmsghalt(x,y=0)</code> can be used to signal special conditions to the scheduler/hardware, such as making requests to stop wavefront generation, or halt all running wavefronts. Check the ISA manual for details!</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../hostcall/">« Host-Call</a><a class="docs-footer-nextpage" href="../printing/">Printing »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option 
value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+</script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li><a class="tocitem" href="../devices/">Devices</a></li><li><a class="tocitem" href="../streams/">Streams</a></li><li><a class="tocitem" href="../kernel_programming/">Kernel Programming</a></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li class="is-active"><a class="tocitem" href>Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li><a class="tocitem" href="../api/">API Reference</a></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li><a 
class="is-disabled">Intrinsics</a></li><li class="is-active"><a href>Execution Control</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>Execution Control</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/execution_control.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="Execution-Control-and-Intrinsics"><a class="docs-heading-anchor" href="#Execution-Control-and-Intrinsics">Execution Control and Intrinsics</a><a id="Execution-Control-and-Intrinsics-1"></a><a class="docs-heading-anchor-permalink" href="#Execution-Control-and-Intrinsics" title="Permalink"></a></h1><p>GPU execution is similar to CPU execution in some ways, although there are many differences. AMD GPUs have Compute Units (CUs), which can be thought of like CPU cores. Those CUs have (on pre-Navi architectures) 64 &quot;shader processors&quot;, which are essentially the same as CPU SIMD lanes. The lanes in a CU operate in lockstep just like CPU SIMD lanes, and have execution masks and various kinds of SIMD instructions available. CUs execute wavefronts, which are pieces of work split off from a single kernel launch. A single CU can run one out of many wavefronts (one is chosen by the CU scheduler each cycle), which allows for very efficient parallel and concurrent execution on the device. 
Each wavefront runs independently of the other wavefronts, only stopping to synchronize with other wavefronts or terminate when specified by the program.</p><p>We can control wavefront execution through a variety of intrinsics provided by ROCm. For example, the <code>endpgm()</code> intrinsic stops the current wavefront&#39;s execution, and is also automatically inserted by the compiler at the end of each kernel (except in certain unique cases).</p><p><code>signal_completion(x)</code> signals the &quot;kernel doorbell&quot; with the value <code>x</code>, which is the signal checked by the CPU <code>wait</code> call to determine when the kernel has completed. This doorbell is set to <code>0</code> automatically by GPU hardware once the kernel is complete.</p><p><code>sendmsg(x,y=0)</code> and <code>sendmsghalt(x,y=0)</code> can be used to signal special conditions to the scheduler/hardware, such as making requests to stop wavefront generation, or halt all running wavefronts. Check the ISA manual for details!</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../hostcall/">« Host-Call</a><a class="docs-footer-nextpage" href="../printing/">Printing »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option 
value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/hostcall/index.html b/dev/hostcall/index.html
index 48a36c23..ae21e42c 100644
--- a/dev/hostcall/index.html
+++ b/dev/hostcall/index.html
@@ -17,4 +17,4 @@
 AMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall.
 AMDGPU.Device.free!(hc) # Free hostcall buffers.
 
-@assert Array(y)[1] ≈ 42f0</code></pre><p>In this example, <code>HostCallHolder</code> is used to create and launch <code>HostCall</code>. <code>HostCallHolder</code> contains the <code>HostCall</code> structure itself that is passed to kernel, a task that is spawned on creation and some additional info for controlling the lifetime of the task.</p><p>First argument is a function we want to execute when we call the hostcall. In this case we add <code>42f0</code> to input argument <code>x</code> and return the result.</p><p>Second and third arguments are the return type <code>Float32</code> and the tuple of types of input arguments <code>Tuple{Float32}</code>.</p><p><code>hostcall!</code> is used to execute the function on the host, wait on the result, and obtain the return values. At the moment, it is performed once per workgroup.</p><h2 id="Continuous-Host-Call"><a class="docs-heading-anchor" href="#Continuous-Host-Call">Continuous Host-Call</a><a id="Continuous-Host-Call-1"></a><a class="docs-heading-anchor-permalink" href="#Continuous-Host-Call" title="Permalink"></a></h2><p>By default, hostcalls can be used only once. 
After executing the function on the host, the task finishes and exits.</p><p>However, if you need your hostcall to live indefinitely, pass <code>continuous=true</code> keyword argument to <code>HostCallHolder(...; continuous=true)</code>.</p><p>To then stop the hostcall, call <code>Device.non_continuous!(hc)</code> or <code>Device.finish!(hc)</code> on the <code>HostCallHolder</code>.</p><p>The difference between them is that <code>non_continuous!</code> will allow calling hostcall one more time before exiting, while <code>finish!</code> will exit immediately.</p><p><code>finish!</code> can be used on any <code>HostCallHolder</code> to force-exit the running hostcall task.</p><h2 id="Free-hostcall-buffers"><a class="docs-heading-anchor" href="#Free-hostcall-buffers">Free hostcall buffers</a><a id="Free-hostcall-buffers-1"></a><a class="docs-heading-anchor-permalink" href="#Free-hostcall-buffers" title="Permalink"></a></h2><p>For custom hostcalls it is important to call <code>AMDGPU.Device.free!</code> once kernel has finished to free buffers that hostcall used in the process.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../memory/">« Memory</a><a class="docs-footer-nextpage" href="../execution_control/">Execution Control »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option 
value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+@assert Array(y)[1] ≈ 42f0</code></pre><p>In this example, <code>HostCallHolder</code> is used to create and launch <code>HostCall</code>. <code>HostCallHolder</code> contains the <code>HostCall</code> structure itself that is passed to kernel, a task that is spawned on creation and some additional info for controlling the lifetime of the task.</p><p>First argument is a function we want to execute when we call the hostcall. In this case we add <code>42f0</code> to input argument <code>x</code> and return the result.</p><p>Second and third arguments are the return type <code>Float32</code> and the tuple of types of input arguments <code>Tuple{Float32}</code>.</p><p><code>hostcall!</code> is used to execute the function on the host, wait on the result, and obtain the return values. At the moment, it is performed once per workgroup.</p><h2 id="Continuous-Host-Call"><a class="docs-heading-anchor" href="#Continuous-Host-Call">Continuous Host-Call</a><a id="Continuous-Host-Call-1"></a><a class="docs-heading-anchor-permalink" href="#Continuous-Host-Call" title="Permalink"></a></h2><p>By default, hostcalls can be used only once. 
After executing the function on the host, the task finishes and exits.</p><p>However, if you need your hostcall to live indefinitely, pass <code>continuous=true</code> keyword argument to <code>HostCallHolder(...; continuous=true)</code>.</p><p>To then stop the hostcall, call <code>Device.non_continuous!(hc)</code> or <code>Device.finish!(hc)</code> on the <code>HostCallHolder</code>.</p><p>The difference between them is that <code>non_continuous!</code> will allow calling hostcall one more time before exiting, while <code>finish!</code> will exit immediately.</p><p><code>finish!</code> can be used on any <code>HostCallHolder</code> to force-exit the running hostcall task.</p><h2 id="Free-hostcall-buffers"><a class="docs-heading-anchor" href="#Free-hostcall-buffers">Free hostcall buffers</a><a id="Free-hostcall-buffers-1"></a><a class="docs-heading-anchor-permalink" href="#Free-hostcall-buffers" title="Permalink"></a></h2><p>For custom hostcalls it is important to call <code>AMDGPU.Device.free!</code> once kernel has finished to free buffers that hostcall used in the process.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../memory/">« Memory</a><a class="docs-footer-nextpage" href="../execution_control/">Execution Control »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option 
value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/index.html b/dev/index.html
index a80e7085..2764358a 100644
--- a/dev/index.html
+++ b/dev/index.html
@@ -17,4 +17,4 @@
 # Default is &quot;none&quot;, which does not apply any limitation.
 hard_memory_limit = &quot;none&quot;
 # Notice a space between the value and percentage sign.
-# hard_memory_limit = &quot;80 %&quot;</code></pre></article><nav class="docs-footer"><a class="docs-footer-nextpage" href="quickstart/">Quick Start »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+# hard_memory_limit = &quot;80 %&quot;</code></pre></article><nav class="docs-footer"><a class="docs-footer-nextpage" href="quickstart/">Quick Start »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/kernel_programming/index.html b/dev/kernel_programming/index.html
index 1f5579e8..8fbea865 100644
--- a/dev/kernel_programming/index.html
+++ b/dev/kernel_programming/index.html
@@ -6,7 +6,7 @@
 </script><script data-outdated-warner src="../assets/warner.js"></script><link href="https://cdnjs.cloudflare.com/ajax/libs/lato-font/3.0.0/css/lato-font.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/juliamono/0.050/juliamono.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/fontawesome.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/solid.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/brands.min.css" rel="stylesheet" type="text/css"/><link href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.8/katex.min.css" rel="stylesheet" type="text/css"/><script>documenterBaseURL=".."</script><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" data-main="../assets/documenter.js"></script><script src="../search_index.js"></script><script src="../siteinfo.js"></script><script src="../../versions.js"></script><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-mocha.css" data-theme-name="catppuccin-mocha"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-macchiato.css" data-theme-name="catppuccin-macchiato"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-frappe.css" data-theme-name="catppuccin-frappe"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/catppuccin-latte.css" data-theme-name="catppuccin-latte"/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-dark.css" data-theme-name="documenter-dark" data-theme-primary-dark/><link class="docs-theme-link" rel="stylesheet" type="text/css" href="../assets/themes/documenter-light.css" data-theme-name="documenter-light" 
data-theme-primary/><script src="../assets/themeswap.js"></script><link href="../assets/favicon.ico" rel="icon" type="image/x-icon"/></head><body><div id="documenter"><nav class="docs-sidebar"><a class="docs-logo" href="../"><img src="../assets/logo.png" alt="AMDGPU.jl logo"/></a><div class="docs-package-name"><span class="docs-autofit"><a href="../">AMDGPU.jl</a></span></div><button class="docs-search-query input is-rounded is-small is-clickable my-2 mx-auto py-1 px-2" id="documenter-search-query">Search docs (Ctrl + /)</button><ul class="docs-menu"><li><a class="tocitem" href="../">Home</a></li><li><a class="tocitem" href="../quickstart/">Quick Start</a></li><li><a class="tocitem" href="../devices/">Devices</a></li><li><a class="tocitem" href="../streams/">Streams</a></li><li class="is-active"><a class="tocitem" href>Kernel Programming</a><ul class="internal"><li><a class="tocitem" href="#Launch-Configuration"><span>Launch Configuration</span></a></li><li><a class="tocitem" href="#Atomics"><span>Atomics</span></a></li><li><a class="tocitem" href="#Device-Intrinsics"><span>Device Intrinsics</span></a></li></ul></li><li><a class="tocitem" href="../exceptions/">Exceptions</a></li><li><a class="tocitem" href="../profiling/">Profiling</a></li><li><a class="tocitem" href="../memory/">Memory</a></li><li><a class="tocitem" href="../hostcall/">Host-Call</a></li><li><span class="tocitem">Intrinsics</span><ul><li><a class="tocitem" href="../execution_control/">Execution Control</a></li></ul></li><li><a class="tocitem" href="../printing/">Printing</a></li><li><a class="tocitem" href="../logging/">Logging</a></li><li><a class="tocitem" href="../api/">API Reference</a></li></ul><div class="docs-version-selector field has-addons"><div class="control"><span class="docs-label button is-static is-size-7">Version</span></div><div class="docs-selector control is-expanded"><div class="select is-fullwidth is-size-7"><select 
id="documenter-version-selector"></select></div></div></div></nav><div class="docs-main"><header class="docs-navbar"><a class="docs-sidebar-button docs-navbar-link fa-solid fa-bars is-hidden-desktop" id="documenter-sidebar-button" href="#"></a><nav class="breadcrumb"><ul class="is-hidden-mobile"><li class="is-active"><a href>Kernel Programming</a></li></ul><ul class="is-hidden-tablet"><li class="is-active"><a href>Kernel Programming</a></li></ul></nav><div class="docs-right"><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl" title="View the repository on GitHub"><span class="docs-icon fa-brands"></span><span class="docs-label is-hidden-touch">GitHub</span></a><a class="docs-navbar-link" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/master/docs/src/kernel_programming.md" title="Edit source on GitHub"><span class="docs-icon fa-solid"></span></a><a class="docs-settings-button docs-navbar-link fa-solid fa-gear" id="documenter-settings-button" href="#" title="Settings"></a><a class="docs-article-toggle-button fa-solid fa-chevron-up" id="documenter-article-toggle-button" href="javascript:;" title="Collapse all docstrings"></a></div></header><article class="content" id="documenter-page"><h1 id="Kernel-Programming"><a class="docs-heading-anchor" href="#Kernel-Programming">Kernel Programming</a><a id="Kernel-Programming-1"></a><a class="docs-heading-anchor-permalink" href="#Kernel-Programming" title="Permalink"></a></h1><h2 id="Launch-Configuration"><a class="docs-heading-anchor" href="#Launch-Configuration">Launch Configuration</a><a id="Launch-Configuration-1"></a><a class="docs-heading-anchor-permalink" href="#Launch-Configuration" title="Permalink"></a></h2><p>While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of wavefronts at one time.</p><p>To alleviate this, the compiler calculates the &quot;occupancy&quot; of each compiled kernel (which is the 
number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, based on the kernel&#39;s &quot;occupancy&quot; values.</p><p>The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice to say, it&#39;s important to know the occupancy of kernels if you want the best performance.</p><p>Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the <code>launch_configuration</code> function:</p><pre><code class="language-julia hljs">kernel = @roc launch=false mykernel(args...)
 occupancy = AMDGPU.launch_configuration(kernel)
 @show occupancy.gridsize
-@show occupancy.groupsize</code></pre><p>Specifically, <code>launch_configuration</code> calculates the occupancy of <code>mykernel(args...)</code>, and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:</p><pre><code class="language-julia hljs">@roc groupsize=occupancy.groupsize mykernel(args...)</code></pre><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.@roc" href="#AMDGPU.@roc"><code>AMDGPU.@roc</code></a> — <span class="docstring-category">Macro</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">@roc [kwargs...] func(args...)</code></pre><p>High-level interface for launching kernels on GPU. Upon a first call it will be compiled, subsequent calls will re-use the compiled object.</p><p>Several keyword arguments are supported:</p><ul><li><code>launch::Bool = true</code>: whether to launch the kernel.   
If <code>false</code>, then returns a compiled kernel which can be launched by   calling it and passing arguments.</li><li>Arguments that influence kernel compilation, see   <a href="#AMDGPU.Compiler.hipfunction"><code>AMDGPU.Compiler.hipfunction</code></a>.</li><li>Arguments that influence kernel launch, see <a href="#AMDGPU.Runtime.HIPKernel"><code>AMDGPU.Runtime.HIPKernel</code></a>.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/highlevel.jl#L97-L111">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Runtime.HIPKernel" href="#AMDGPU.Runtime.HIPKernel"><code>AMDGPU.Runtime.HIPKernel</code></a> — <span class="docstring-category">Type</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)</code></pre><p>Launch compiled HIPKernel by passing arguments to it.</p><p>The following kwargs are supported:</p><ul><li><code>gridsize::ROCDim = 1</code>: Size of the grid.</li><li><code>groupsize::ROCDim = 1</code>:  Size of the workgroup.</li><li><code>shmem::Integer = 0</code>:   Amount of dynamically-allocated shared memory in bytes.</li><li><code>stream::HIP.HIPStream = AMDGPU.stream()</code>:   Stream on which to launch the kernel.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/runtime/hip-execution.jl#L1-L13">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" 
id="AMDGPU.Compiler.hipfunction" href="#AMDGPU.Compiler.hipfunction"><code>AMDGPU.Compiler.hipfunction</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">hipfunction(f::F, tt::TT = Tuple{}; kwargs...)</code></pre><p>Compile Julia function <code>f</code> to a HIP kernel given a tuple of argument&#39;s types <code>tt</code> that it accepts.</p><p>The following kwargs are supported:</p><ul><li><code>name::Union{String, Nothing} = nothing</code>:   A unique name to give a compiled kernel.</li><li><code>unsafe_fp_atomics::Bool = true</code>:   Whether to use &#39;unsafe&#39; floating-point atomics.   AMD GPU devices support fast atomic read-modify-write (RMW)   operations on floating-point values.   On single- or double-precision floating-point values this may generate   a hardware RMW instruction that is faster than emulating   the atomic operation using an atomic compare-and-swap (CAS) loop.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/compiler/codegen.jl#L136-L153">source</a></section></article><h2 id="Atomics"><a class="docs-heading-anchor" href="#Atomics">Atomics</a><a id="Atomics-1"></a><a class="docs-heading-anchor-permalink" href="#Atomics" title="Permalink"></a></h2><p>AMDGPU.jl relies on <a href="https://github.com/JuliaConcurrent/Atomix.jl">Atomix.jl</a> for atomics.</p><p>Example of a kernel that computes atomic max:</p><pre><code class="language-julia hljs">using AMDGPU
+@show occupancy.groupsize</code></pre><p>Specifically, <code>launch_configuration</code> calculates the occupancy of <code>mykernel(args...)</code>, and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:</p><pre><code class="language-julia hljs">@roc groupsize=occupancy.groupsize mykernel(args...)</code></pre><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.@roc" href="#AMDGPU.@roc"><code>AMDGPU.@roc</code></a> — <span class="docstring-category">Macro</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">@roc [kwargs...] func(args...)</code></pre><p>High-level interface for launching kernels on GPU. Upon a first call it will be compiled, subsequent calls will re-use the compiled object.</p><p>Several keyword arguments are supported:</p><ul><li><code>launch::Bool = true</code>: whether to launch the kernel.   
If <code>false</code>, then returns a compiled kernel which can be launched by   calling it and passing arguments.</li><li>Arguments that influence kernel compilation, see   <a href="#AMDGPU.Compiler.hipfunction"><code>AMDGPU.Compiler.hipfunction</code></a>.</li><li>Arguments that influence kernel launch, see <a href="#AMDGPU.Runtime.HIPKernel"><code>AMDGPU.Runtime.HIPKernel</code></a>.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/highlevel.jl#L97-L111">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Runtime.HIPKernel" href="#AMDGPU.Runtime.HIPKernel"><code>AMDGPU.Runtime.HIPKernel</code></a> — <span class="docstring-category">Type</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)</code></pre><p>Launch compiled HIPKernel by passing arguments to it.</p><p>The following kwargs are supported:</p><ul><li><code>gridsize::ROCDim = 1</code>: Size of the grid.</li><li><code>groupsize::ROCDim = 1</code>:  Size of the workgroup.</li><li><code>shmem::Integer = 0</code>:   Amount of dynamically-allocated shared memory in bytes.</li><li><code>stream::HIP.HIPStream = AMDGPU.stream()</code>:   Stream on which to launch the kernel.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/runtime/hip-execution.jl#L1-L13">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" 
id="AMDGPU.Compiler.hipfunction" href="#AMDGPU.Compiler.hipfunction"><code>AMDGPU.Compiler.hipfunction</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">hipfunction(f::F, tt::TT = Tuple{}; kwargs...)</code></pre><p>Compile Julia function <code>f</code> to a HIP kernel given a tuple of argument&#39;s types <code>tt</code> that it accepts.</p><p>The following kwargs are supported:</p><ul><li><code>name::Union{String, Nothing} = nothing</code>:   A unique name to give a compiled kernel.</li><li><code>unsafe_fp_atomics::Bool = true</code>:   Whether to use &#39;unsafe&#39; floating-point atomics.   AMD GPU devices support fast atomic read-modify-write (RMW)   operations on floating-point values.   On single- or double-precision floating-point values this may generate   a hardware RMW instruction that is faster than emulating   the atomic operation using an atomic compare-and-swap (CAS) loop.</li></ul></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/compiler/codegen.jl#L136-L153">source</a></section></article><h2 id="Atomics"><a class="docs-heading-anchor" href="#Atomics">Atomics</a><a id="Atomics-1"></a><a class="docs-heading-anchor-permalink" href="#Atomics" title="Permalink"></a></h2><p>AMDGPU.jl relies on <a href="https://github.com/JuliaConcurrent/Atomix.jl">Atomix.jl</a> for atomics.</p><p>Example of a kernel that computes atomic max:</p><pre><code class="language-julia hljs">using AMDGPU
 
 function ker_atomic_max!(target, source, indices)
     i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x
@@ -20,7 +20,7 @@
 source = ROCArray(rand(UInt32, n))
 indices = ROCArray(rand(1:bins, n))
 target = ROCArray(zeros(UInt32, bins))
-@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)</code></pre><h2 id="Device-Intrinsics"><a class="docs-heading-anchor" href="#Device-Intrinsics">Device Intrinsics</a><a id="Device-Intrinsics-1"></a><a class="docs-heading-anchor-permalink" href="#Device-Intrinsics" title="Permalink"></a></h2><h3 id="Wavefront-Level-Primitives"><a class="docs-heading-anchor" href="#Wavefront-Level-Primitives">Wavefront-Level Primitives</a><a id="Wavefront-Level-Primitives-1"></a><a class="docs-heading-anchor-permalink" href="#Wavefront-Level-Primitives" title="Permalink"></a></h3><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.wavefrontsize" href="#AMDGPU.Device.wavefrontsize"><code>AMDGPU.Device.wavefrontsize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">wavefrontsize()::Cuint</code></pre><p>Get the wavefront size of the device that executes current kernel.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L79-L83">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.activelane" href="#AMDGPU.Device.activelane"><code>AMDGPU.Device.activelane</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">activelane()::Cuint</code></pre><p>Get id of the current lane within a 
wavefront/warp.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)</code></pre><h2 id="Device-Intrinsics"><a class="docs-heading-anchor" href="#Device-Intrinsics">Device Intrinsics</a><a id="Device-Intrinsics-1"></a><a class="docs-heading-anchor-permalink" href="#Device-Intrinsics" title="Permalink"></a></h2><h3 id="Wavefront-Level-Primitives"><a class="docs-heading-anchor" href="#Wavefront-Level-Primitives">Wavefront-Level Primitives</a><a id="Wavefront-Level-Primitives-1"></a><a class="docs-heading-anchor-permalink" href="#Wavefront-Level-Primitives" title="Permalink"></a></h3><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.wavefrontsize" href="#AMDGPU.Device.wavefrontsize"><code>AMDGPU.Device.wavefrontsize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">wavefrontsize()::Cuint</code></pre><p>Get the wavefront size of the device that executes current kernel.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L79-L83">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.activelane" href="#AMDGPU.Device.activelane"><code>AMDGPU.Device.activelane</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">activelane()::Cuint</code></pre><p>Get id of the current lane within a 
wavefront/warp.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = i
            return
@@ -33,7 +33,7 @@
 
 julia&gt; Array(x)
 1×8 Matrix{Int32}:
- 0  1  2  3  4  5  6  7</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L86-L107">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.ballot" href="#AMDGPU.Device.ballot"><code>AMDGPU.Device.ballot</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">ballot(predicate::Bool)::UInt64</code></pre><p>Return a value whose <code>N</code>th bit is set if and only if <code>predicate</code> evaluates to <code>true</code> for the <code>N</code>th lane and the lane is active.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 0  1  2  3  4  5  6  7</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L86-L107">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.ballot" href="#AMDGPU.Device.ballot"><code>AMDGPU.Device.ballot</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">ballot(predicate::Bool)::UInt64</code></pre><p>Return a value whose <code>N</code>th bit is set if and only if <code>predicate</code> evaluates to <code>true</code> for the <code>N</code>th lane and the lane is active.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            x[1] = AMDGPU.Device.ballot(true)
            return
        end
@@ -45,7 +45,7 @@
 
 julia&gt; x
 1-element ROCArray{UInt64, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 0x00000000ffffffff</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L110-L131">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.ballot_sync" href="#AMDGPU.Device.ballot_sync"><code>AMDGPU.Device.ballot_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">ballot_sync(mask::UInt64, predicate::Bool)::UInt64</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return an integer whose Nth bit is set if and only if <code>predicate</code> is <code>true</code> for the Nth thread of the wavefront and the Nth thread is active.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 0x00000000ffffffff</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L110-L131">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.ballot_sync" href="#AMDGPU.Device.ballot_sync"><code>AMDGPU.Device.ballot_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">ballot_sync(mask::UInt64, predicate::Bool)::UInt64</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return an integer whose Nth bit is set if and only if <code>predicate</code> is <code>true</code> for the Nth thread of the wavefront and the Nth thread is active.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -60,7 +60,7 @@
 julia&gt; @roc groupsize=32 ker!(x);
 
 julia&gt; bitstring(Array(x)[1])
-&quot;0000000000000000000000000000000001010101010101010101010101010101&quot;</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L16-L41">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.activemask" href="#AMDGPU.Device.activemask"><code>AMDGPU.Device.activemask</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">activemask()::UInt64</code></pre><p>Get the mask of all active lanes in a warp.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L140-L144">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.bpermute" href="#AMDGPU.Device.bpermute"><code>AMDGPU.Device.bpermute</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">bpermute(addr::Integer, val::Cint)::Cint</code></pre><p>Read data stored in <code>val</code> from the lane VGPR (vector general purpose register) given by <code>addr</code>.</p><p>The permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. 
Hence, the value in the <code>addr</code> VGPR should be <code>desired_lane_id * 4</code>, since VGPR values are 4 bytes wide.</p><p>Example below shifts all values in the wavefront by 1 to the &quot;left&quot;.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+&quot;0000000000000000000000000000000001010101010101010101010101010101&quot;</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L16-L41">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.activemask" href="#AMDGPU.Device.activemask"><code>AMDGPU.Device.activemask</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">activemask()::UInt64</code></pre><p>Get the mask of all active lanes in a warp.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L140-L144">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.bpermute" href="#AMDGPU.Device.bpermute"><code>AMDGPU.Device.bpermute</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">bpermute(addr::Integer, val::Cint)::Cint</code></pre><p>Read data stored in <code>val</code> from the lane VGPR (vector general purpose register) given by <code>addr</code>.</p><p>The permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. 
Hence, the value in the <code>addr</code> VGPR should be <code>desired_lane_id * 4</code>, since VGPR values are 4 bytes wide.</p><p>Example below shifts all values in the wavefront by 1 to the &quot;left&quot;.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i::Cint = AMDGPU.Device.activelane()
            # `addr` points to the next immediate lane.
            addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide
@@ -76,7 +76,7 @@
 
 julia&gt; x
 1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  4  5  6  7  0</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L147-L179">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.permute" href="#AMDGPU.Device.permute"><code>AMDGPU.Device.permute</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">permute(addr::Integer, val::Cint)::Cint</code></pre><p>Put data stored in <code>val</code> to the lane VGPR (vector general purpose register) given by <code>addr</code>.</p><p>Example below shifts all values in the wavefront by 1 to the &quot;right&quot;.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 1  2  3  4  5  6  7  0</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L147-L179">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.permute" href="#AMDGPU.Device.permute"><code>AMDGPU.Device.permute</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">permute(addr::Integer, val::Cint)::Cint</code></pre><p>Put data stored in <code>val</code> to the lane VGPR (vector general purpose register) given by <code>addr</code>.</p><p>Example below shifts all values in the wavefront by 1 to the &quot;right&quot;.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i::Cint = AMDGPU.Device.activelane()
            # `addr` points to the next immediate lane.
            addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide
@@ -92,7 +92,7 @@
 
 julia&gt; x
 1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 7  0  1  2  3  4  5  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L183-L210">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl" href="#AMDGPU.Device.shfl"><code>AMDGPU.Device.shfl</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl(val, lane, width = wavefrontsize())</code></pre><p>Read data stored in <code>val</code> from a <code>lane</code> (this is a more high-level op than <a href="#AMDGPU.Device.bpermute"><code>bpermute</code></a>).</p><p>If <code>lane</code> is outside the range <code>[0:width - 1]</code>, the value returned corresponds to the value held by the <code>lane modulo width</code> (within the same subsection).</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 7  0  1  2  3  4  5  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L183-L210">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl" href="#AMDGPU.Device.shfl"><code>AMDGPU.Device.shfl</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl(val, lane, width = wavefrontsize())</code></pre><p>Read data stored in <code>val</code> from a <code>lane</code> (this is a more high-level op than <a href="#AMDGPU.Device.bpermute"><code>bpermute</code></a>).</p><p>If <code>lane</code> is outside the range <code>[0:width - 1]</code>, the value returned corresponds to the value held by the <code>lane modulo width</code> (within the same subsection).</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i::UInt32 = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl(i, i + 1)
            return
@@ -118,7 +118,7 @@
 
 julia&gt; Int.(x)
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  0  5  6  7  4</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L264-L309">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_sync" href="#AMDGPU.Device.shfl_sync"><code>AMDGPU.Device.shfl_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> ID.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L103-L108">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_up" href="#AMDGPU.Device.shfl_up"><code>AMDGPU.Device.shfl_up</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_up(val, δ, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, accepts <code>δ</code> that is subtracted from the current lane ID. I.e. read from a lane with lower ID relative to the caller.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 1  2  3  0  5  6  7  4</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L264-L309">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_sync" href="#AMDGPU.Device.shfl_sync"><code>AMDGPU.Device.shfl_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> ID.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L103-L108">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_up" href="#AMDGPU.Device.shfl_up"><code>AMDGPU.Device.shfl_up</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_up(val, δ, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, accepts <code>δ</code> that is subtracted from the current lane ID. I.e. read from a lane with lower ID relative to the caller.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_up(i, 1)
            return
@@ -131,7 +131,7 @@
 
 julia&gt; x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 0  0  1  2  3  4  5  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L312-L335">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_up_sync" href="#AMDGPU.Device.shfl_up_sync"><code>AMDGPU.Device.shfl_up_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> with lower ID relative to the caller.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L114-L119">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_down" href="#AMDGPU.Device.shfl_down"><code>AMDGPU.Device.shfl_down</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_down(val, δ, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, accepts <code>δ</code> that is added to the current lane ID. I.e. 
read from a lane with higher ID relative to the caller.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 0  0  1  2  3  4  5  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L312-L335">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_up_sync" href="#AMDGPU.Device.shfl_up_sync"><code>AMDGPU.Device.shfl_up_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> with lower ID relative to the caller.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L114-L119">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_down" href="#AMDGPU.Device.shfl_down"><code>AMDGPU.Device.shfl_down</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_down(val, δ, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, accepts <code>δ</code> that is added to the current lane ID. I.e. 
read from a lane with higher ID relative to the caller.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_down(i, 1, 8)
            return
@@ -144,7 +144,7 @@
 
 julia&gt; x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  4  5  6  7  7</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L338-L361">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_down_sync" href="#AMDGPU.Device.shfl_down_sync"><code>AMDGPU.Device.shfl_down_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> with higher ID relative to the caller.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L125-L130">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_xor" href="#AMDGPU.Device.shfl_xor"><code>AMDGPU.Device.shfl_xor</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_xor(val, lane_mask, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, performs bitwise XOR of the caller&#39;s lane ID with the <code>lane_mask</code>.</p><pre><code class="language-julia-repl hljs">julia&gt; 
function ker!(x)
+ 1  2  3  4  5  6  7  7</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L338-L361">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_down_sync" href="#AMDGPU.Device.shfl_down_sync"><code>AMDGPU.Device.shfl_down_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a <code>lane</code> with higher ID relative to the caller.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L125-L130">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_xor" href="#AMDGPU.Device.shfl_xor"><code>AMDGPU.Device.shfl_xor</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_xor(val, lane_mask, width = wavefrontsize())</code></pre><p>Same as <a href="#AMDGPU.Device.shfl"><code>shfl</code></a>, but instead of specifying lane ID, performs bitwise XOR of the caller&#39;s lane ID with the <code>lane_mask</code>.</p><pre><code class="language-julia-repl hljs">julia&gt; 
function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_xor(i, 1)
            return
@@ -157,7 +157,7 @@
 
 julia&gt; x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  0  3  2  5  4  7  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront.jl#L365-L387">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_xor_sync" href="#AMDGPU.Device.shfl_xor_sync"><code>AMDGPU.Device.shfl_xor_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a lane according to a bitwise XOR of the caller&#39;s lane ID with the <code>lane_mask</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L136-L142">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.any_sync" href="#AMDGPU.Device.any_sync"><code>AMDGPU.Device.any_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">any_sync(mask::UInt64, predicate::Bool)::Bool</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return non-zero if and only if <code>predicate</code> evaluates to non-zero for any of them.</p><pre><code 
class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 1  0  3  2  5  4  7  6</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront.jl#L365-L387">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.shfl_xor_sync" href="#AMDGPU.Device.shfl_xor_sync"><code>AMDGPU.Device.shfl_xor_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())</code></pre><p>Synchronize threads according to a <code>mask</code> and read data stored in <code>val</code> from a lane according to a bitwise XOR of the caller&#39;s lane ID with the <code>lane_mask</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L136-L142">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.any_sync" href="#AMDGPU.Device.any_sync"><code>AMDGPU.Device.any_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">any_sync(mask::UInt64, predicate::Bool)::Bool</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return non-zero if and only if <code>predicate</code> evaluates to non-zero for any of them.</p><pre><code 
class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -173,7 +173,7 @@
 
 julia&gt; x
 1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L47-L72">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.all_sync" href="#AMDGPU.Device.all_sync"><code>AMDGPU.Device.all_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">all_sync(mask::UInt64, predicate::Bool)::Bool</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return non-zero if and only if <code>predicate</code> evaluates to non-zero for all of them.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
+ 1</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L47-L72">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.Device.all_sync" href="#AMDGPU.Device.all_sync"><code>AMDGPU.Device.all_sync</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">all_sync(mask::UInt64, predicate::Bool)::Bool</code></pre><p>Evaluate <code>predicate</code> for all non-exited threads in <code>mask</code> and return non-zero if and only if <code>predicate</code> evaluates to non-zero for all of them.</p><pre><code class="language-julia-repl hljs">julia&gt; function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -189,4 +189,4 @@
 
 julia&gt; x
 1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/device/gcn/wavefront_sync.jl#L75-L100">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../streams/">« Streams</a><a class="docs-footer-nextpage" href="../exceptions/">Exceptions »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+ 1</code></pre></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/device/gcn/wavefront_sync.jl#L75-L100">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../streams/">« Streams</a><a class="docs-footer-nextpage" href="../exceptions/">Exceptions »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/logging/index.html b/dev/logging/index.html
index 01dcbdf9..911ece88 100644
--- a/dev/logging/index.html
+++ b/dev/logging/index.html
@@ -9,4 +9,4 @@
     fill!(B, 1f0)
     C = Array(B)
 end
-@show logs[1]</code></pre></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../printing/">« Printing</a><a class="docs-footer-nextpage" href="../api/">API Reference »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+@show logs[1]</code></pre></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../printing/">« Printing</a><a class="docs-footer-nextpage" href="../api/">API Reference »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/memory/index.html b/dev/memory/index.html
index bcc7a217..87c62b0d 100644
--- a/dev/memory/index.html
+++ b/dev/memory/index.html
@@ -62,4 +62,4 @@
 xd * xd
 
 # Freeing is a no-op for `xd`, since `xd` does not own the underlying memory.
-AMDGPU.unsafe_free!(xd) # No-op.</code></pre><p>Notice mandatory <code>; lock=false</code> keyword, this is to be able to differentiate between host &amp; device pointers.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../profiling/">« Profiling</a><a class="docs-footer-nextpage" href="../hostcall/">Host-Call »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+AMDGPU.unsafe_free!(xd) # No-op.</code></pre><p>Notice mandatory <code>; lock=false</code> keyword, this is to be able to differentiate between host &amp; device pointers.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../profiling/">« Profiling</a><a class="docs-footer-nextpage" href="../hostcall/">Host-Call »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/printing/index.html b/dev/printing/index.html
index 12ebc2e2..5dbe23e0 100644
--- a/dev/printing/index.html
+++ b/dev/printing/index.html
@@ -38,4 +38,4 @@
 My index is 1
 
 # :grid
-My index is 1</code></pre><h2 id="Differences-to-@cuprintf"><a class="docs-heading-anchor" href="#Differences-to-@cuprintf">Differences to <code>@cuprintf</code></a><a id="Differences-to-@cuprintf-1"></a><a class="docs-heading-anchor-permalink" href="#Differences-to-@cuprintf" title="Permalink"></a></h2><p>Similar to CUDA&#39;s <code>@cuprintf</code>, <code>@rocprintf</code> is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to <code>@cuprintf</code>, we use AMDGPU&#39;s hostcall and Julia&#39;s <code>Printf</code> stdlib to implement this. This means that anything that <code>Printf</code> can print, so can <code>@rocprintf</code> (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../execution_control/">« Execution Control</a><a class="docs-footer-nextpage" href="../logging/">Logging »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option 
value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+My index is 1</code></pre><h2 id="Differences-to-@cuprintf"><a class="docs-heading-anchor" href="#Differences-to-@cuprintf">Differences to <code>@cuprintf</code></a><a id="Differences-to-@cuprintf-1"></a><a class="docs-heading-anchor-permalink" href="#Differences-to-@cuprintf" title="Permalink"></a></h2><p>Similar to CUDA&#39;s <code>@cuprintf</code>, <code>@rocprintf</code> is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to <code>@cuprintf</code>, we use AMDGPU&#39;s hostcall and Julia&#39;s <code>Printf</code> stdlib to implement this. This means that anything that <code>Printf</code> can print, so can <code>@rocprintf</code> (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../execution_control/">« Execution Control</a><a class="docs-footer-nextpage" href="../logging/">Logging »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option 
value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/profiling/index.html b/dev/profiling/index.html
index b6235352..2d78020f 100644
--- a/dev/profiling/index.html
+++ b/dev/profiling/index.html
@@ -34,4 +34,4 @@
         @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)
     end
     AMDGPU.synchronize()
-    ...</code></pre><p>Running profiling again and visualizing results we now see that kernel launches are adjacent to each other and that the average wall duration is lower.</p><table><tr><th style="text-align: center">Zoomed out</th><th style="text-align: center">Zoomed in</th></tr><tr><td style="text-align: center"><img src="../assets/profiling-2.png" alt="image"/></td><td style="text-align: center"><img src="../assets/profiling-3.png" alt="image"/></td></tr></table><h2 id="Debugging"><a class="docs-heading-anchor" href="#Debugging">Debugging</a><a id="Debugging-1"></a><a class="docs-heading-anchor-permalink" href="#Debugging" title="Permalink"></a></h2><p>Use <code>HIP_LAUNCH_BLOCKING=1</code> to synchronize immediately after launching GPU kernels. This will allow to pinpoint exact kernel that caused the exception.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../exceptions/">« Exceptions</a><a class="docs-footer-nextpage" href="../memory/">Memory »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option 
value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+    ...</code></pre><p>Running profiling again and visualizing results we now see that kernel launches are adjacent to each other and that the average wall duration is lower.</p><table><tr><th style="text-align: center">Zoomed out</th><th style="text-align: center">Zoomed in</th></tr><tr><td style="text-align: center"><img src="../assets/profiling-2.png" alt="image"/></td><td style="text-align: center"><img src="../assets/profiling-3.png" alt="image"/></td></tr></table><h2 id="Debugging"><a class="docs-heading-anchor" href="#Debugging">Debugging</a><a id="Debugging-1"></a><a class="docs-heading-anchor-permalink" href="#Debugging" title="Permalink"></a></h2><p>Use <code>HIP_LAUNCH_BLOCKING=1</code> to synchronize immediately after launching GPU kernels. This makes it possible to pinpoint the exact kernel that caused the exception.</p></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../exceptions/">« Exceptions</a><a class="docs-footer-nextpage" href="../memory/">Memory »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option 
value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/quickstart/index.html b/dev/quickstart/index.html
index 9e8a2618..ccf97fc0 100644
--- a/dev/quickstart/index.html
+++ b/dev/quickstart/index.html
@@ -28,4 +28,4 @@
 julia&gt; @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d);
 
 julia&gt; Array(c_d) ≈ c
-true</code></pre><p>The easiest way to launch a GPU kernel is with the <code>@roc</code> macro, specifying <code>groupsize</code> and <code>gridsize</code> to cover full array, and calling it like a regular function.</p><p>Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with <a href="../streams/#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a>). However, GPU &lt;-&gt; CPU transfers synchronize implicitly.</p><p>The grid is the domain over which the <em>entire</em> kernel executes over. The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.</p><p>Like OpenCL, AMDGPU has the concept of &quot;workitems&quot;, &quot;workgroups&quot;, and the &quot;grid&quot;. A workitem is a single thread of execution, capable of performing arithmentic operations. Workitems are grouped into &quot;wavefronts&quot; (&quot;warps&quot; in CUDA) which share the same compute unit, and execute the same instructions simulatenously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.</p><p>Notice how we explicitly specify that this function does not return a value by adding the <code>return</code> statement. This is necessary for all GPU kernels and we can enforce it by adding a <code>return</code>, <code>return nothing</code>, or even <code>nothing</code> at the end of the kernel. 
If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a <code>Float64</code>, which will cause a compilation failure as kernels cannot return values.</p><h2 id="Naming-conventions"><a class="docs-heading-anchor" href="#Naming-conventions">Naming conventions</a><a id="Naming-conventions-1"></a><a class="docs-heading-anchor-permalink" href="#Naming-conventions" title="Permalink"></a></h2><p>Throughout this example we use terms like &quot;work group&quot; and &quot;work item&quot;. These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.</p><p>NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.</p><p>As a quick summary, here is a mapping of the most common terms:</p><table><tr><th style="text-align: center">AMDGPU</th><th style="text-align: center">CUDA</th></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workitemIdx"><code>workitemIdx</code></a></td><td style="text-align: center"><code>threadIdx</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workgroupIdx"><code>workgroupIdx</code></a></td><td style="text-align: center"><code>blockIdx</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workgroupDim"><code>workgroupDim</code></a></td><td style="text-align: center"><code>blockDim</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.gridItemDim"><code>gridItemDim</code></a></td><td style="text-align: center">No equivalent</td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.gridGroupDim"><code>gridGroupDim</code></a></td><td style="text-align: center"><code>gridDim</code></td></tr><tr><td style="text-align: center"><code>groupsize</code></td><td style="text-align: center"><code>threads</code></td></tr><tr><td 
style="text-align: center"><code>gridsize</code></td><td style="text-align: center"><code>blocks</code></td></tr><tr><td style="text-align: center"><code>stream</code></td><td style="text-align: center"><code>stream</code></td></tr></table></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../">« Home</a><a class="docs-footer-nextpage" href="../devices/">Devices »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+true</code></pre><p>The easiest way to launch a GPU kernel is with the <code>@roc</code> macro, specifying <code>groupsize</code> and <code>gridsize</code> to cover the full array, and calling it like a regular function.</p><p>Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with <a href="../streams/#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a>). However, GPU &lt;-&gt; CPU transfers synchronize implicitly.</p><p>The grid is the domain over which the <em>entire</em> kernel executes. The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.</p><p>Like OpenCL, AMDGPU has the concept of &quot;workitems&quot;, &quot;workgroups&quot;, and the &quot;grid&quot;. A workitem is a single thread of execution, capable of performing arithmetic operations. Workitems are grouped into &quot;wavefronts&quot; (&quot;warps&quot; in CUDA) which share the same compute unit, and execute the same instructions simultaneously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.</p><p>Notice how we explicitly specify that this function does not return a value by adding the <code>return</code> statement. This is necessary for all GPU kernels and we can enforce it by adding a <code>return</code>, <code>return nothing</code>, or even <code>nothing</code> at the end of the kernel. 
If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a <code>Float64</code>, which will cause a compilation failure as kernels cannot return values.</p><h2 id="Naming-conventions"><a class="docs-heading-anchor" href="#Naming-conventions">Naming conventions</a><a id="Naming-conventions-1"></a><a class="docs-heading-anchor-permalink" href="#Naming-conventions" title="Permalink"></a></h2><p>Throughout this example we use terms like &quot;work group&quot; and &quot;work item&quot;. These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.</p><p>NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.</p><p>As a quick summary, here is a mapping of the most common terms:</p><table><tr><th style="text-align: center">AMDGPU</th><th style="text-align: center">CUDA</th></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workitemIdx"><code>workitemIdx</code></a></td><td style="text-align: center"><code>threadIdx</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workgroupIdx"><code>workgroupIdx</code></a></td><td style="text-align: center"><code>blockIdx</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.workgroupDim"><code>workgroupDim</code></a></td><td style="text-align: center"><code>blockDim</code></td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.gridItemDim"><code>gridItemDim</code></a></td><td style="text-align: center">No equivalent</td></tr><tr><td style="text-align: center"><a href="../api/#AMDGPU.Device.gridGroupDim"><code>gridGroupDim</code></a></td><td style="text-align: center"><code>gridDim</code></td></tr><tr><td style="text-align: center"><code>groupsize</code></td><td style="text-align: center"><code>threads</code></td></tr><tr><td 
style="text-align: center"><code>gridsize</code></td><td style="text-align: center"><code>blocks</code></td></tr><tr><td style="text-align: center"><code>stream</code></td><td style="text-align: center"><code>stream</code></td></tr></table></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../">« Home</a><a class="docs-footer-nextpage" href="../devices/">Devices »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
diff --git a/dev/streams/index.html b/dev/streams/index.html
index fa0d4a92..e4ba1534 100644
--- a/dev/streams/index.html
+++ b/dev/streams/index.html
@@ -9,6 +9,6 @@
 x = AMDGPU.stream!(() -&gt; AMDGPU.ones(Float32, 16), stream)</code></pre><ul><li>Using <code>stream</code> argument to <code>@roc</code> macro:</li></ul><pre><code class="language-julia hljs">stream = AMDGPU.HIPStream()
 @roc stream=stream kernel(...)</code></pre><p>Streams also have an inherent priority, which allows control of kernel submission latency and on-device scheduling preference with respect to kernels submitted on other streams. There are three priorities: normal (the default), low, and high priority.</p><p>Priority of the default <code>stream</code> can be set with <a href="#AMDGPU.priority!"><code>AMDGPU.priority!</code></a>. Alternatively, it can be set at stream creation time:</p><pre><code class="language-julia hljs">low_prio = HIPStream(:low)
 high_prio = HIPStream(:high)
-normal_prio = HIPStream(:normal) # or just omit &quot;priority&quot;</code></pre><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.stream" href="#AMDGPU.stream"><code>AMDGPU.stream</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">stream()::HIPStream</code></pre><p>Get the HIP stream that should be used as the default one for the currently executing task.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L70-L75">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.stream!" 
href="#AMDGPU.stream!"><code>AMDGPU.stream!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">stream!(s::HIPStream)</code></pre><p>Change the default stream to be used <strong>within the same Julia task</strong>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L78-L82">source</a></section><section><div><pre><code class="language-julia hljs">stream!(f::Base.Callable, stream::HIPStream)</code></pre><p>Change the default stream to be used <strong>within the same Julia task</strong>, execute <code>f</code> and revert to the original stream.</p><p><strong>Returns:</strong></p><p>Return value of the function <code>f</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L89-L98">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.priority!" href="#AMDGPU.priority!"><code>AMDGPU.priority!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">priority!(p::Symbol)</code></pre><p>Change the priority of the default stream. 
Accepted values are <code>:normal</code> (the default), <code>:low</code> and <code>:high</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L142-L147">source</a></section><section><div><pre><code class="language-julia hljs">priority!(f::Base.Callable, priority::Symbol)</code></pre><p>Chnage the priority of default stream, execute <code>f</code> and revert to the original priority. Accepted values are <code>:normal</code> (the default), <code>:low</code> and <code>:high</code>.</p><p><strong>Returns:</strong></p><p>Return value of the function <code>f</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/tls.jl#L156-L166">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.HIPStream" href="#AMDGPU.HIP.HIPStream"><code>AMDGPU.HIP.HIPStream</code></a> — <span class="docstring-category">Type</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">HIPStream(priority::Symbol = :normal)</code></pre><p><strong>Arguments:</strong></p><ul><li><code>priority::Symbol</code>: Priority of the stream: <code>:normal</code>, <code>:high</code> or <code>:low</code>.</li></ul><p>Create HIPStream with given priority. 
Device is the default device that&#39;s currently in use.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/stream.jl#L13-L22">source</a></section><section><div><pre><code class="language-julia hljs">HIPStream(stream::hipStream_t)</code></pre><p>Create HIPStream from <code>hipStream_t</code> handle. Device is the default device that&#39;s currently in use.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/stream.jl#L42-L47">source</a></section></article><h2 id="Synchronization"><a class="docs-heading-anchor" href="#Synchronization">Synchronization</a><a id="Synchronization-1"></a><a class="docs-heading-anchor-permalink" href="#Synchronization" title="Permalink"></a></h2><p>AMDGPU.jl by default uses non-blocking stream synchronization with <a href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> to work correctly with TLS and <a href="../hostcall/#Hostcall">Hostcall</a>.</p><p>Users, however, can switch to a blocking synchronization globally with <code>nonblocking_synchronization</code> <a href="https://github.com/JuliaPackaging/Preferences.jl">preference</a> or with fine-grained <code>AMDGPU.synchronize(; blocking=true)</code>. Blocking synchronization might offer slightly lower latency.</p><p>You can also perform synchronization of the expression with <a href="#AMDGPU.@sync"><code>AMDGPU.@sync</code></a> macro, which will execute given expression and synchronize afterwards (using <a href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> under the hood).</p><pre><code class="language-julia hljs">AMDGPU.@sync begin
+normal_prio = HIPStream(:normal) # or just omit &quot;priority&quot;</code></pre><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.stream" href="#AMDGPU.stream"><code>AMDGPU.stream</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">stream()::HIPStream</code></pre><p>Get the HIP stream that should be used as the default one for the currently executing task.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L70-L75">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.stream!" 
href="#AMDGPU.stream!"><code>AMDGPU.stream!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">stream!(s::HIPStream)</code></pre><p>Change the default stream to be used <strong>within the same Julia task</strong>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L78-L82">source</a></section><section><div><pre><code class="language-julia hljs">stream!(f::Base.Callable, stream::HIPStream)</code></pre><p>Change the default stream to be used <strong>within the same Julia task</strong>, execute <code>f</code> and revert to the original stream.</p><p><strong>Returns:</strong></p><p>Return value of the function <code>f</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L89-L98">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.priority!" href="#AMDGPU.priority!"><code>AMDGPU.priority!</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">priority!(p::Symbol)</code></pre><p>Change the priority of the default stream. 
Accepted values are <code>:normal</code> (the default), <code>:low</code> and <code>:high</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L142-L147">source</a></section><section><div><pre><code class="language-julia hljs">priority!(f::Base.Callable, priority::Symbol)</code></pre><p>Change the priority of the default stream, execute <code>f</code> and revert to the original priority. Accepted values are <code>:normal</code> (the default), <code>:low</code> and <code>:high</code>.</p><p><strong>Returns:</strong></p><p>Return value of the function <code>f</code>.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/tls.jl#L156-L166">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.HIPStream" href="#AMDGPU.HIP.HIPStream"><code>AMDGPU.HIP.HIPStream</code></a> — <span class="docstring-category">Type</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">HIPStream(priority::Symbol = :normal)</code></pre><p><strong>Arguments:</strong></p><ul><li><code>priority::Symbol</code>: Priority of the stream: <code>:normal</code>, <code>:high</code> or <code>:low</code>.</li></ul><p>Create HIPStream with given priority. 
Device is the default device that&#39;s currently in use.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/stream.jl#L13-L22">source</a></section><section><div><pre><code class="language-julia hljs">HIPStream(stream::hipStream_t)</code></pre><p>Create HIPStream from <code>hipStream_t</code> handle. Device is the default device that&#39;s currently in use.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/stream.jl#L42-L47">source</a></section></article><h2 id="Synchronization"><a class="docs-heading-anchor" href="#Synchronization">Synchronization</a><a id="Synchronization-1"></a><a class="docs-heading-anchor-permalink" href="#Synchronization" title="Permalink"></a></h2><p>AMDGPU.jl by default uses non-blocking stream synchronization with <a href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> to work correctly with TLS and <a href="../hostcall/#Hostcall">Hostcall</a>.</p><p>Users, however, can switch to a blocking synchronization globally with <code>nonblocking_synchronization</code> <a href="https://github.com/JuliaPackaging/Preferences.jl">preference</a> or with fine-grained <code>AMDGPU.synchronize(; blocking=true)</code>. Blocking synchronization might offer slightly lower latency.</p><p>You can also perform synchronization of the expression with <a href="#AMDGPU.@sync"><code>AMDGPU.@sync</code></a> macro, which will execute given expression and synchronize afterwards (using <a href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> under the hood).</p><pre><code class="language-julia hljs">AMDGPU.@sync begin
     @roc ...
-end</code></pre><p>Finally, you can perform full device synchronization with <a href="#AMDGPU.HIP.device_synchronize"><code>AMDGPU.device_synchronize</code></a>.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.synchronize" href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">synchronize(stream::HIPStream = stream(); blocking::Bool = false)</code></pre><p>Wait until all kernels executing on <code>stream</code> have completed.</p><p>If there are running HostCalls, then <code>blocking</code> <strong>must</strong> be <code>false</code>. Additionally, if you want to stop host calls afterwards, then provide <code>stop_hostcalls=true</code> keyword argument.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/highlevel.jl#L27-L35">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.@sync" href="#AMDGPU.@sync"><code>AMDGPU.@sync</code></a> — <span class="docstring-category">Macro</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">@sync ex</code></pre><p>Run expression <code>ex</code> on currently active stream and synchronize the GPU on that stream afterwards.</p><p>See also: <a href="#AMDGPU.synchronize"><code>synchronize</code></a>.</p></div><a class="docs-sourcelink" target="_blank" 
href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/highlevel.jl#L65-L72">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.device_synchronize" href="#AMDGPU.HIP.device_synchronize"><code>AMDGPU.HIP.device_synchronize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><p>Blocks until all kernels on all streams have completed. Uses currently active device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/1b2e742425aad9babc458b6daccf5dc267e8c530/src/hip/HIP.jl#L78-L81">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../devices/">« Devices</a><a class="docs-footer-nextpage" href="../kernel_programming/">Kernel Programming »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option 
value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 11:16">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>
+end</code></pre><p>Finally, you can perform full device synchronization with <a href="#AMDGPU.HIP.device_synchronize"><code>AMDGPU.device_synchronize</code></a>.</p><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.synchronize" href="#AMDGPU.synchronize"><code>AMDGPU.synchronize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">synchronize(stream::HIPStream = stream(); blocking::Bool = false)</code></pre><p>Wait until all kernels executing on <code>stream</code> have completed.</p><p>If there are running HostCalls, then <code>blocking</code> <strong>must</strong> be <code>false</code>. Additionally, if you want to stop host calls afterwards, then provide <code>stop_hostcalls=true</code> keyword argument.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/highlevel.jl#L27-L35">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.@sync" href="#AMDGPU.@sync"><code>AMDGPU.@sync</code></a> — <span class="docstring-category">Macro</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><pre><code class="language-julia hljs">@sync ex</code></pre><p>Run expression <code>ex</code> on currently active stream and synchronize the GPU on that stream afterwards.</p><p>See also: <a href="#AMDGPU.synchronize"><code>synchronize</code></a>.</p></div><a class="docs-sourcelink" target="_blank" 
href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/highlevel.jl#L65-L72">source</a></section></article><article class="docstring"><header><a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring"></a><a class="docstring-binding" id="AMDGPU.HIP.device_synchronize" href="#AMDGPU.HIP.device_synchronize"><code>AMDGPU.HIP.device_synchronize</code></a> — <span class="docstring-category">Function</span><span class="is-flex-grow-1 docstring-article-toggle-button" title="Collapse docstring"></span></header><section><div><p>Blocks until all kernels on all streams have completed. Uses currently active device.</p></div><a class="docs-sourcelink" target="_blank" href="https://github.com/JuliaGPU/AMDGPU.jl/blob/8a23d0dbbc2ee299d358a1d966cdcb7f300c05f8/src/hip/HIP.jl#L78-L81">source</a></section></article></article><nav class="docs-footer"><a class="docs-footer-prevpage" href="../devices/">« Devices</a><a class="docs-footer-nextpage" href="../kernel_programming/">Kernel Programming »</a><div class="flexbox-break"></div><p class="footer-message">Powered by <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> and the <a href="https://julialang.org/">Julia Programming Language</a>.</p></nav></div><div class="modal" id="documenter-settings"><div class="modal-background"></div><div class="modal-card"><header class="modal-card-head"><p class="modal-card-title">Settings</p><button class="delete"></button></header><section class="modal-card-body"><p><label class="label">Theme</label><div class="select"><select id="documenter-themepicker"><option value="auto">Automatic (OS)</option><option value="documenter-light">documenter-light</option><option value="documenter-dark">documenter-dark</option><option value="catppuccin-latte">catppuccin-latte</option><option value="catppuccin-frappe">catppuccin-frappe</option><option 
value="catppuccin-macchiato">catppuccin-macchiato</option><option value="catppuccin-mocha">catppuccin-mocha</option></select></div></p><hr/><p>This document was generated with <a href="https://github.com/JuliaDocs/Documenter.jl">Documenter.jl</a> version 1.7.0 on <span class="colophon-date" title="Tuesday 24 September 2024 22:48">Tuesday 24 September 2024</span>. Using Julia version 1.10.5.</p></section><footer class="modal-card-foot"></footer></div></div></div></body></html>

AMDGPU	CUDA
`workitemIdx`	`threadIdx`
`workgroupIdx`	`blockIdx`
`workgroupDim`	`blockDim`
`gridItemDim`	No equivalent
`gridGroupDim`	`gridDim`
`groupsize`	`threads`
`gridsize`	`blocks`
`stream`	`stream`