diff --git a/latest/.buildinfo b/latest/.buildinfo index 2e08383a8..fc2ec2aff 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: c3d758f43fb7e62bf2a1b966e9ea4853 +config: 0c64f1e50062f38d9437c83151ccb10e tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_sources/dev-environment.rst.txt b/latest/_sources/dev-environment.rst.txt index 24b37e567..6c36101ac 100644 --- a/latest/_sources/dev-environment.rst.txt +++ b/latest/_sources/dev-environment.rst.txt @@ -27,8 +27,8 @@ Steps with :bash:`sudo` access (e.g. on a local device): * After installation, restart your shell. #. Install the required Python versions: * On some systems, additional packages may be needed to build Python versions. For example on Ubuntu: :bash:`sudo apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev lzma`. - * Install the Python versions with: :bash:`pyenv install 3.8 3.9 3.10 3.11`. The reason we're installing all these versions as opposed to just one, is so we can test against all supported Python versions. -#. Set the Python versions so they can be found: :bash:`pyenv local 3.8 3.9 3.10 3.11` (replace :bash:`local` with :bash:`global` when not using the virtualenv). + * Install the Python versions with: :bash:`pyenv install 3.9 3.10 3.11 3.12`. The reason we're installing all these versions as opposed to just one, is so we can test against all supported Python versions. +#. Set the Python versions so they can be found: :bash:`pyenv local 3.9 3.10 3.11 3.12` (replace :bash:`local` with :bash:`global` when not using the virtualenv). #. Setup a local virtual environment in the folder: :bash:`pyenv virtualenv 3.11 kerneltuner` (or whatever environment name and Python version you prefer). #. `Install Poetry `__. * Use :bash:`curl -sSL https://install.python-poetry.org | python3 -` to install Poetry. diff --git a/latest/_static/basic.css b/latest/_static/basic.css index cfc60b86c..f316efcb4 100644 --- a/latest/_static/basic.css +++ b/latest/_static/basic.css @@ -4,7 +4,7 @@ * * Sphinx stylesheet -- basic theme. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -237,6 +237,10 @@ a.headerlink { visibility: hidden; } +a:visited { + color: #551A8B; +} + h1:hover > a.headerlink, h2:hover > a.headerlink, h3:hover > a.headerlink, diff --git a/latest/_static/doctools.js b/latest/_static/doctools.js index d06a71d75..4d67807d1 100644 --- a/latest/_static/doctools.js +++ b/latest/_static/doctools.js @@ -4,7 +4,7 @@ * * Base JavaScript utilities for all Sphinx HTML documentation. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ diff --git a/latest/_static/documentation_options.js b/latest/_static/documentation_options.js index a7f754b66..529239f07 100644 --- a/latest/_static/documentation_options.js +++ b/latest/_static/documentation_options.js @@ -1,5 +1,4 @@ -var DOCUMENTATION_OPTIONS = { - URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), +const DOCUMENTATION_OPTIONS = { VERSION: '1.0', LANGUAGE: 'en', COLLAPSE_INDEX: false, diff --git a/latest/_static/language_data.js b/latest/_static/language_data.js index 250f5665f..367b8ed81 100644 --- a/latest/_static/language_data.js +++ b/latest/_static/language_data.js @@ -5,7 +5,7 @@ * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -13,7 +13,7 @@ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; -/* Non-minified version is copied as a separate JS file, is available */ +/* Non-minified version is copied as a separate JS file, if available */ /** * Porter Stemmer diff --git a/latest/_static/searchtools.js b/latest/_static/searchtools.js index 97d56a74d..92da3f8b2 100644 --- a/latest/_static/searchtools.js +++ b/latest/_static/searchtools.js @@ -4,7 +4,7 @@ * * Sphinx JavaScript utilities for the full-text search. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ @@ -57,12 +57,12 @@ const _removeChildren = (element) => { const _escapeRegExp = (string) => string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string -const _displayItem = (item, searchTerms) => { +const _displayItem = (item, searchTerms, highlightTerms) => { const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; - const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; const [docName, title, anchor, descr, score, _filename] = item; @@ -75,28 +75,35 @@ const _displayItem = (item, searchTerms) => { if (dirname.match(/\/index\/$/)) dirname = dirname.substring(0, dirname.length - 6); else if (dirname === "index/") dirname = ""; - requestUrl = docUrlRoot + dirname; + requestUrl = contentRoot + dirname; linkUrl = requestUrl; } else { // normal html builders - requestUrl = docUrlRoot + docName + docFileSuffix; + requestUrl = contentRoot + docName + docFileSuffix; linkUrl = docName + docLinkSuffix; } let linkEl = listItem.appendChild(document.createElement("a")); linkEl.href = linkUrl + anchor; linkEl.dataset.score = score; linkEl.innerHTML = title; - if (descr) + if (descr) { listItem.appendChild(document.createElement("span")).innerHTML = " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } else if (showSearchSummary) fetch(requestUrl) .then((responseData) => responseData.text()) .then((data) => { if (data) listItem.appendChild( - Search.makeSearchSummary(data, searchTerms) + Search.makeSearchSummary(data, searchTerms, anchor) ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); }); Search.output.appendChild(listItem); }; @@ -109,26 +116,43 @@ const _finishSearch = (resultCount) => { ); else Search.status.innerText = _( - `Search finished, found ${resultCount} page(s) matching the search query.` - ); + "Search finished, found ${resultCount} page(s) matching the search query." + ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( results, resultCount, - searchTerms + searchTerms, + highlightTerms, ) => { // results left, load the summary and display it // this is intended to be dynamic (don't sub resultsCount) if (results.length) { - _displayItem(results.pop(), searchTerms); + _displayItem(results.pop(), searchTerms, highlightTerms); setTimeout( - () => _displayNextItem(results, resultCount, searchTerms), + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), 5 ); } // search finished, update title and status message else _finishSearch(resultCount); }; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
+const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; /** * Default splitQuery function. Can be overridden in ``sphinx.search`` with a @@ -152,13 +176,26 @@ const Search = { _queued_query: null, _pulse_status: -1, - htmlToText: (htmlString) => { + htmlToText: (htmlString, anchor) => { const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); - htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + for (const removalQuery of [".headerlinks", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content const docContent = htmlElement.querySelector('[role="main"]'); - if (docContent !== undefined) return docContent.textContent; + if (docContent) return docContent.textContent; + console.warn( - "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." ); return ""; }, @@ -231,16 +268,7 @@ const Search = { else Search.deferQuery(query); }, - /** - * execute search (requires search index to be loaded) - */ - query: (query) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - const allTitles = Search._index.alltitles; - const indexEntries = Search._index.indexentries; - + _parseQuery: (query) => { // stem the search terms and add them to the correct list const stemmer = new Stemmer(); const searchTerms = new Set(); @@ -276,16 +304,32 @@ const Search = { // console.info("required: ", [...searchTerms]); // console.info("excluded: ", [...excludedTerms]); - // array of [docname, title, anchor, descr, score, filename] - let results = []; + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename]. 
+ const normalResults = []; + const nonMainIndexResults = []; + _removeChildren(document.getElementById("search-progress")); - const queryLower = query.toLowerCase(); + const queryLower = query.toLowerCase().trim(); for (const [title, foundTitles] of Object.entries(allTitles)) { - if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { for (const [file, id] of foundTitles) { let score = Math.round(100 * queryLower.length / title.length) - results.push([ + normalResults.push([ docNames[file], titles[file] !== title ? `${titles[file]} > ${title}` : title, id !== null ? "#" + id : "", @@ -300,46 +344,47 @@ const Search = { // search for explicit entries in index directives for (const [entry, foundEntries] of Object.entries(indexEntries)) { if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { - for (const [file, id] of foundEntries) { - let score = Math.round(100 * queryLower.length / entry.length) - results.push([ + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ docNames[file], titles[file], id ? "#" + id : "", null, score, filenames[file], - ]); + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } } } } // lookup as object objectTerms.forEach((term) => - results.push(...Search.performObjectSearch(term, objectTerms)) + normalResults.push(...Search.performObjectSearch(term, objectTerms)) ); // lookup as search terms in fulltext - results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); // let the scorer override scores with a custom scoring function - if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); - - // now sort the results by score (in opposite order of appearance, since the - // display function below uses pop() to retrieve items) and then - // alphabetically - results.sort((a, b) => { - const leftScore = a[4]; - const rightScore = b[4]; - if (leftScore === rightScore) { - // same score: sort alphabetically - const leftTitle = a[1].toLowerCase(); - const rightTitle = b[1].toLowerCase(); - if (leftTitle === rightTitle) return 0; - return leftTitle > rightTitle ? -1 : 1; // inverted is intentional - } - return leftScore > rightScore ? 1 : -1; - }); + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. 
+ let results = [...nonMainIndexResults, ...normalResults]; // remove duplicate search results // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept @@ -353,14 +398,19 @@ const Search = { return acc; }, []); - results = results.reverse(); + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); // for debugging //Search.lastresults = results.slice(); // a copy // console.info("search results:", Search.lastresults); // print the results - _displayNextItem(results, results.length, searchTerms); + _displayNextItem(results, results.length, searchTerms, highlightTerms); }, /** @@ -458,14 +508,18 @@ const Search = { // add support for partial matches if (word.length > 2) { const escapedWord = _escapeRegExp(word); - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord) && !terms[word]) - arr.push({ files: terms[term], score: Scorer.partialTerm }); - }); - Object.keys(titleTerms).forEach((term) => { - if (term.match(escapedWord) && !titleTerms[word]) - arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); - }); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } } // no match but word was a required one @@ -488,9 +542,8 @@ const Search = { // create the mapping files.forEach((file) => { - if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) - fileMap.get(file).push(word); - else fileMap.set(file, [word]); + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); }); }); @@ -541,8 +594,8 @@ const Search = { * search summary for a given text. keywords is a list * of stemmed words. */ - makeSearchSummary: (htmlText, keywords) => { - const text = Search.htmlToText(htmlText); + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); if (text === "") return null; const textLower = text.toLowerCase(); diff --git a/latest/_static/sphinx_highlight.js b/latest/_static/sphinx_highlight.js index aae669d7e..8a96c69a1 100644 --- a/latest/_static/sphinx_highlight.js +++ b/latest/_static/sphinx_highlight.js @@ -29,14 +29,19 @@ const _highlight = (node, addItems, text, className) => { } span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); parent.insertBefore( span, parent.insertBefore( - document.createTextNode(val.substr(pos + text.length)), + rest, node.nextSibling ) ); node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. + */ + _highlight(rest, addItems, text, className); if (isInSVG) { const rect = document.createElementNS( @@ -140,5 +145,10 @@ const SphinxHighlight = { }, }; -_ready(SphinxHighlight.highlightSearchWords); -_ready(SphinxHighlight.initEscapeListener); +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. 
+ * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/latest/backends.html b/latest/backends.html index 142065a4a..048514784 100644 --- a/latest/backends.html +++ b/latest/backends.html @@ -1,19 +1,21 @@ - + - + Backends — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -108,13 +110,13 @@
-

Backends

+

Backends

Kernel Tuner implements multiple backends for CUDA, one for OpenCL, one for HIP, and a generic Compiler backend.

Selecting a backend is automatic in most cases, based on the kernel’s programming language, but sometimes you’ll want to choose a backend explicitly.
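For illustration, a backend can be chosen explicitly through the lang= argument of tune_kernel. The sketch below is hedged: the kernel, data sizes, and parameter values are made up, and it assumes that lang="cupy" selects the CuPy backend; check the backend documentation for the exact value each backend expects.

import numpy as np
import kernel_tuner

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

n = np.int32(10_000_000)
a = np.random.randn(n).astype(np.float32)
b = np.random.randn(n).astype(np.float32)
c = np.zeros_like(a)

# lang= overrides the automatic, language-based backend selection
results, env = kernel_tuner.tune_kernel(
    "vector_add", kernel_string, n, [c, a, b, n],
    {"block_size_x": [64, 128, 256, 512]},
    lang="cupy",
)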

-

CUDA Backends

+

CUDA Backends

PyCUDA is the default CUDA backend in Kernel Tuner. It is comparable in feature completeness with CuPy. Because the HIP kernel language is identical to the CUDA kernel language, HIP is included here as well. To use HIP on NVIDIA GPUs, see https://github.com/jatinx/hip-on-nv.

@@ -131,7 +133,7 @@

CUDA Backends -Backend feature support +Backend feature support

Feature

PyCUDA

@@ -195,7 +197,7 @@

CUDA Backends -Backend usage and compiler +Backend usage and compiler

Feature

PyCUDA

diff --git a/latest/cache_files.html b/latest/cache_files.html index 97ea0ec99..163c9834c 100644 --- a/latest/cache_files.html +++ b/latest/cache_files.html @@ -1,19 +1,21 @@ - + - + Cache files — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -103,7 +105,7 @@
-

Cache files

+

Cache files

A very useful feature of Kernel Tuner is the ability to store benchmarking results in a cache file during tuning. You can enable cache files by passing any filename to the cache= optional argument of tune_kernel.
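As a hedged sketch (the kernel and tunable parameters are illustrative), enabling a cache file only requires the extra cache= argument; rerunning the same script with the same cache file lets Kernel Tuner skip configurations that were already benchmarked.

import numpy as np
import kernel_tuner

kernel_string = """
__global__ void scale(float *x, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= factor;
    }
}
"""

n = np.int32(1_000_000)
x = np.random.randn(n).astype(np.float32)

kernel_tuner.tune_kernel(
    "scale", kernel_string, n, [x, np.float32(2.0), n],
    {"block_size_x": [32, 64, 128, 256]},
    cache="scale_cache.json",  # results are appended to this file while tuning runs
)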

The benchmark results of individual kernel configurations are appended to the cache file as Kernel Tuner is running. This also allows Kernel Tuner diff --git a/latest/contents.html b/latest/contents.html index 4158bfc20..f4637c5de 100644 --- a/latest/contents.html +++ b/latest/contents.html @@ -1,20 +1,24 @@ - + - + The Kernel Tuner documentation — Kernel Tuner 1.0 documentation - - + + + + - - - + + + + + @@ -102,7 +106,7 @@

-

The Kernel Tuner documentation

+

The Kernel Tuner documentation

Kernel Tuner

    diff --git a/latest/contributing.html b/latest/contributing.html index c52be4f0a..39d5826e8 100644 --- a/latest/contributing.html +++ b/latest/contributing.html @@ -1,19 +1,21 @@ - + - + Contribution guide — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -110,10 +112,10 @@
    -

    Contribution guide

    +

    Contribution guide

    Thank you for considering contributing to Kernel Tuner!

    -

    Reporting Issues

    +

    Reporting Issues

    Not all contributions are code; creating an issue also helps us improve. When you create an issue about a problem, please ensure the following:

    • Describe what you expected to happen.

    • @@ -123,7 +125,7 @@

      Reporting Issues -

      Contributing Code

      +

      Contributing Code

      To contribute code to Kernel Tuner, please select an issue to work on or create a new issue to propose a change or addition. For significant changes, first create an issue and discuss the proposed changes. Then fork the repository, create a branch (one per change or addition), and open a pull request.

      Kernel Tuner follows the Google Python style guide, with Sphinxdoc docstrings for module public functions.

      Before creating a pull request please ensure the following:

      @@ -137,7 +139,7 @@

      Contributing Codedesign documentation, or discuss it in the issue regarding your additions.

    -

    Simple development setup

    +

    Simple development setup

    For small changes to the code you can set up a quick development environment with the following steps:

    • git clone git@github.com:KernelTuner/kernel_tuner.git

    • diff --git a/latest/convolution.html b/latest/convolution.html index 434d19f33..daff8a342 100644 --- a/latest/convolution.html +++ b/latest/convolution.html @@ -1,20 +1,22 @@ - + - + Convolution — Kernel Tuner 1.0 documentation - - - + + + + + - - - + + + @@ -112,13 +114,13 @@
      -

      Convolution

      +

      Convolution

      This guide is meant to get you started with writing your tests and tuning scripts using Kernel Tuner. We’ll use a simple 2D Convolution kernel as the example, but as you will find out shortly, much of the script code that you write with Kernel Tuner can be reused for testing and tuning other kernels.

      Note: If you are reading this guide on the Kernel Tuner documentation pages, you can also run it as a Jupyter Notebook. Just clone the Kernel Tuner GitHub repository, install with pip install .[tutorial,cuda], and you’re ready to go! You can start the guide by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

      -

      2D Convolution example

      +

      2D Convolution example

      Convolution operations are essential to signal and image processing applications and are the main operation in convolutional neural networks used for deep learning. A convolution operation computes the linear combination of the weights in a convolution filter and a range of pixels from the input image for each output pixel. A 2D convolution of an input image \(I\) of size \((w\times h)\) and a convolution filter \(F\) of size \((F_w\times F_h)\) computes an output image \(O\) of size \(((w-F_w)\times (h-F_h))\): \begin{equation}\nonumber O(x,y) = \sum\limits_{j=0}^{F_h} \sum\limits_{i=0}^{F_w} I(x+i,y+j)\times F(i,j) @@ -154,7 +156,7 @@
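To make the summation concrete, here is a plain NumPy sketch of the same operation, usable as a host-side reference; the row-major (y, x) array layout is an assumption of this sketch, not something prescribed by the text above.

import numpy as np

def convolve_2d(image, filt):
    # Direct implementation of O(x, y) = sum_j sum_i I(x+i, y+j) * F(i, j)
    h, w = image.shape
    fh, fw = filt.shape
    out = np.zeros((h - fh, w - fw), dtype=image.dtype)
    for y in range(h - fh):
        for x in range(w - fw):
            out[y, x] = np.sum(image[y:y + fh, x:x + fw] * filt)
    return out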

      2D Convolution example

      -

      Implement a test

      +

      Implement a test

      We will start with using Kernel Tuner’s run_kernel function to call our naive 2D convolution kernel. But first we will have to create some input data, which we will do as follows:
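The cell below is a hedged reconstruction rather than the notebook’s exact code: the kernel name, source file, argument order, and problem sizes are placeholders for whatever the naive convolution kernel actually defines.

import numpy as np
from kernel_tuner import run_kernel

width, height = 4096, 4096
filter_size = 17

# flat arrays, as GPU kernels typically expect; the input includes a border
input_image = np.random.randn((height + filter_size) * (width + filter_size)).astype(np.float32)
conv_filter = np.random.randn(filter_size * filter_size).astype(np.float32)
output_image = np.zeros(height * width, dtype=np.float32)

params = {"block_size_x": 16, "block_size_y": 16}

result = run_kernel("convolution_kernel", "convolution_naive.cu",
                    (width, height), [output_image, input_image, conv_filter],
                    params)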

      [ ]:
      @@ -266,7 +268,7 @@ 

      Implement a test -

      Tuning 2D Convolution

      +

      Tuning 2D Convolution

      In many cases there are more tunable parameters than just the thread block dimensions. We have included a highly-optimized 2D Convolution kernel that contains many parametrized code optimizations. It’s a bit long to include here, so instead we just point to the file; you may need to adjust the path a little depending on where you’ve stored the Kernel Tuner’s source code and where this notebook is executing.
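A hedged sketch of what such a tuning call can look like; the file name convolution.cu, the data setup, and the parameter values are illustrative.

import numpy as np
from kernel_tuner import tune_kernel

width, height = 4096, 4096
filter_size = 17
input_image = np.random.randn((height + filter_size) * (width + filter_size)).astype(np.float32)
conv_filter = np.random.randn(filter_size * filter_size).astype(np.float32)
output_image = np.zeros(height * width, dtype=np.float32)

tune_params = {
    "block_size_x": [16, 32, 48, 64, 128],
    "block_size_y": [1, 2, 4, 8, 16],
}

results, env = tune_kernel("convolution_kernel", "convolution.cu",
                           (width, height),
                           [output_image, input_image, conv_filter],
                           tune_params)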

      [ ]:
      @@ -311,7 +313,7 @@ 

      Tuning 2D Convolution

      -

      More tunable parameters

      +

      More tunable parameters

      I promised that we would use more tunable parameters than just the thread block dimensions. Our 2D Convolution kernel also supports tiling factors in the x and y dimensions. Tiling factors indicate that the amount of work performed by each thread block in a particular dimension is increased by a certain factor.
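As a sketch, tiling factors are added to the search space simply as extra entries in tune_params; the names tile_size_x and tile_size_y are an assumption about what the kernel code exposes, and every combination with the block sizes becomes a candidate configuration.

tune_params = {
    "block_size_x": [16, 32, 64, 128],
    "block_size_y": [1, 2, 4, 8, 16],
    "tile_size_x": [1, 2, 4],   # assumed tiling parameter names
    "tile_size_y": [1, 2, 4],
}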

      [ ]:
      diff --git a/latest/correctness.html b/latest/correctness.html
      index ed27c4544..8bc96a552 100644
      --- a/latest/correctness.html
      +++ b/latest/correctness.html
      @@ -1,19 +1,21 @@
       
      -
      +
       
      -  
      +  
       
         
         Correctness Verification — Kernel Tuner 1.0 documentation
      -      
      -      
      +      
      +      
      +
      +  
         
         
      -        
      -        
      -        
      +        
      +        
      +        
               
           
           
      @@ -103,7 +105,7 @@
                  
      -

      Correctness Verification

      +

      Correctness Verification

      Whenever you optimize a program for performance, it is very important to ensure that the program still produces the correct output. What good is a program that is fast but not correct?
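One way to do this in Kernel Tuner is the answer= argument of tune_kernel: a list with the expected value for each kernel argument, where None marks arguments that should not be checked. The sketch below is illustrative; the kernel, data, and tolerance are made up.

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

n = np.int32(1_000_000)
a = np.random.randn(n).astype(np.float32)
b = np.random.randn(n).astype(np.float32)
c = np.zeros_like(a)

reference = [a + b, None, None, None]  # expected output for c, other arguments unchecked

tune_kernel("vector_add", kernel_string, n, [c, a, b, n],
            {"block_size_x": [64, 128, 256]},
            answer=reference, atol=1e-6)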

      diff --git a/latest/design.html b/latest/design.html index 46d143adf..1edc725af 100644 --- a/latest/design.html +++ b/latest/design.html @@ -1,19 +1,21 @@ - + - + Design documentation — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -216,7 +218,7 @@
      -

      Design documentation

      +

      Design documentation

      This section provides detailed information about the design and internals of the Kernel Tuner. This information is mostly relevant for developers.

      The Kernel Tuner is designed to be extensible and support @@ -260,72 +262,72 @@ discussed above. For the documentation of the user API see the API Documentation.

      -

      Strategies

      +

      Strategies

      Strategies are explained in Optimization strategies.

      Many of the strategies use helper functions that are collected in kernel_tuner.strategies.common.

      -

      kernel_tuner.strategies.common

      +

      kernel_tuner.strategies.common

      -kernel_tuner.strategies.common.get_options(strategy_options, options)
      +kernel_tuner.strategies.common.get_options(strategy_options, options)

      Get the strategy-specific options or their defaults from user-supplied strategy_options.

      -kernel_tuner.strategies.common.get_strategy_docstring(name, strategy_options)
      +kernel_tuner.strategies.common.get_strategy_docstring(name, strategy_options)

      Generate docstring for a ‘tune’ method of a strategy.

      -kernel_tuner.strategies.common.make_strategy_options_doc(strategy_options)
      +kernel_tuner.strategies.common.make_strategy_options_doc(strategy_options)

      Generate documentation for the supported strategy options and their defaults.

      -kernel_tuner.strategies.common.scale_from_params(params, tune_params, eps)
      +kernel_tuner.strategies.common.scale_from_params(params, tune_params, eps)

      Helper func to do the inverse of the ‘unscale’ function.

      -kernel_tuner.strategies.common.setup_method_arguments(method, bounds)
      +kernel_tuner.strategies.common.setup_method_arguments(method, bounds)

      Prepare method specific arguments.

      -kernel_tuner.strategies.common.setup_method_options(method, tuning_options)
      +kernel_tuner.strategies.common.setup_method_options(method, tuning_options)

      Prepare method specific options.

      -kernel_tuner.strategies.common.snap_to_nearest_config(x, tune_params)
      +kernel_tuner.strategies.common.snap_to_nearest_config(x, tune_params)

      Helper func that for each param selects the closest actual value.

      -kernel_tuner.strategies.common.unscale_and_snap_to_nearest(x, tune_params, eps)
      +kernel_tuner.strategies.common.unscale_and_snap_to_nearest(x, tune_params, eps)

      Helper func that snaps a scaled variable to the nearest config.

      -

      Runners

      +

      Runners

      -

      kernel_tuner.runners.sequential.SequentialRunner

      +

      kernel_tuner.runners.sequential.SequentialRunner

      -class kernel_tuner.runners.sequential.SequentialRunner(kernel_source, kernel_options, device_options, iterations, observers)
      +class kernel_tuner.runners.sequential.SequentialRunner(kernel_source, kernel_options, device_options, iterations, observers)

      SequentialRunner is used for tuning with a single process/thread.

      -__init__(kernel_source, kernel_options, device_options, iterations, observers)
      +__init__(kernel_source, kernel_options, device_options, iterations, observers)

      Instantiate the SequentialRunner.

      Parameters:
      @@ -343,7 +345,7 @@

      kernel_tuner.runners.sequential.SequentialRunner
      -run(parameter_space, tuning_options)
      +run(parameter_space, tuning_options)

      Iterate through the entire parameter space using a single Python process.

      Parameters:
      @@ -367,14 +369,14 @@

      kernel_tuner.runners.sequential.SequentialRunner -

      kernel_tuner.runners.sequential.SimulationRunner

      +

      kernel_tuner.runners.sequential.SimulationRunner

      -class kernel_tuner.runners.simulation.SimulationRunner(kernel_source, kernel_options, device_options, iterations, observers)
      +class kernel_tuner.runners.simulation.SimulationRunner(kernel_source, kernel_options, device_options, iterations, observers)

      SimulationRunner is used for tuning with a single process/thread.

      -__init__(kernel_source, kernel_options, device_options, iterations, observers)
      +__init__(kernel_source, kernel_options, device_options, iterations, observers)

      Instantiate the SimulationRunner.

      Parameters:
      @@ -392,7 +394,7 @@

      kernel_tuner.runners.sequential.SimulationRunner
      -run(parameter_space, tuning_options)
      +run(parameter_space, tuning_options)

      Iterate through the entire parameter space using a single Python process.

      Parameters:
      @@ -417,16 +419,16 @@

      kernel_tuner.runners.sequential.SimulationRunner -

      Device Interfaces

      +

      Device Interfaces

      -

      kernel_tuner.core.DeviceInterface

      +

      kernel_tuner.core.DeviceInterface

      -class kernel_tuner.core.DeviceInterface(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)
      +class kernel_tuner.core.DeviceInterface(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)

      Class that offers a High-Level Device Interface to the rest of the Kernel Tuner

      -__init__(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)
      +__init__(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)

      Instantiate the DeviceInterface, based on language in kernel source

      Parameters:
      @@ -450,91 +452,91 @@

      kernel_tuner.core.DeviceInterface
      -benchmark(func, gpu_args, instance, verbose, objective, skip_nvml_setting=False)
      +benchmark(func, gpu_args, instance, verbose, objective, skip_nvml_setting=False)

      Benchmark the kernel instance.

      -benchmark_continuous(func, gpu_args, threads, grid, result, duration)
      +benchmark_continuous(func, gpu_args, threads, grid, result, duration)

      Benchmark continuously for at least ‘duration’ seconds

      -benchmark_default(func, gpu_args, threads, grid, result)
      +benchmark_default(func, gpu_args, threads, grid, result)

      Benchmark one kernel execution at a time.

      -check_kernel_output(func, gpu_args, instance, answer, atol, verify, verbose)
      +check_kernel_output(func, gpu_args, instance, answer, atol, verify, verbose)

      runs the kernel once and checks the result against answer

      -compile_kernel(instance, verbose)
      +compile_kernel(instance, verbose)

      compile the kernel for this specific instance

      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      adds constant memory arguments to the most recently compiled module

      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      adds shared memory arguments to the most recently compiled module

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      adds texture memory arguments to the most recently compiled module

      -create_kernel_instance(kernel_source, kernel_options, params, verbose)
      +create_kernel_instance(kernel_source, kernel_options, params, verbose)

      create kernel instance from kernel source, parameters, problem size, grid divisors, and so on

      -get_environment()
      +get_environment()

      Return dictionary with information about the environment

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      perform a device to host memory copy

      -static preprocess_gpu_arguments(old_arguments, params)
      +static preprocess_gpu_arguments(old_arguments, params)

      Get a flat list of arguments based on the configuration given by params

      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      ready argument list to be passed to the kernel, allocates gpu mem if necessary

      -run_kernel(func, gpu_args, instance)
      +run_kernel(func, gpu_args, instance)

      Run a compiled kernel instance on a device

      -set_nvml_parameters(instance)
      +set_nvml_parameters(instance)

      Set the NVML parameters. Avoids setting time leaking into benchmark time.

      @@ -542,14 +544,14 @@

      kernel_tuner.core.DeviceInterface -

      kernel_tuner.backends.pycuda.PyCudaFunctions

      +

      kernel_tuner.backends.pycuda.PyCudaFunctions

      -class kernel_tuner.backends.pycuda.PyCudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.pycuda.PyCudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the CUDA functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate PyCudaFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -567,7 +569,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -588,7 +590,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -603,13 +605,13 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -621,13 +623,13 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -641,7 +643,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -655,7 +657,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -670,7 +672,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -689,7 +691,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -709,19 +711,19 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -729,14 +731,14 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions -

      kernel_tuner.backends.cupy.CupyFunctions

      +

      kernel_tuner.backends.cupy.CupyFunctions

      -class kernel_tuner.backends.cupy.CupyFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.cupy.CupyFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the Cupy functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate CupyFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -754,7 +756,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -775,7 +777,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -790,13 +792,13 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -808,13 +810,13 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -828,7 +830,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -842,7 +844,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -857,7 +859,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -876,7 +878,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -896,19 +898,19 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -916,14 +918,14 @@

      kernel_tuner.backends.cupy.CupyFunctions -

      kernel_tuner.backends.nvcuda.CudaFunctions

      +

      kernel_tuner.backends.nvcuda.CudaFunctions

      -class kernel_tuner.backends.nvcuda.CudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.nvcuda.CudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the Cuda functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate CudaFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -943,7 +945,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -964,7 +966,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -979,13 +981,13 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -997,13 +999,13 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -static memcpy_dtoh(dest, src)
      +static memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1017,7 +1019,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -static memcpy_htod(dest, src)
      +static memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1031,7 +1033,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -static memset(allocation, value, size)
      +static memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1046,7 +1048,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -1065,7 +1067,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -1085,19 +1087,19 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -static synchronize()
      +static synchronize()

      Halts execution until device has finished its tasks.

      @@ -1105,14 +1107,14 @@

      kernel_tuner.backends.nvcuda.CudaFunctions -

      kernel_tuner.backends.opencl.OpenCLFunctions

      +

      kernel_tuner.backends.opencl.OpenCLFunctions

      -class kernel_tuner.backends.opencl.OpenCLFunctions(device=0, platform=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.opencl.OpenCLFunctions(device=0, platform=0, iterations=7, compiler_options=None, observers=None)

Class that groups the OpenCL functions and maintains some state about the device.

      -__init__(device=0, platform=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, platform=0, iterations=7, compiler_options=None, observers=None)

      Creates OpenCL device context and reads device properties.

      Parameters:
      @@ -1126,7 +1128,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the OpenCL compiler to compile the kernel, return the device function.

      Parameters:
      @@ -1147,31 +1149,31 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      This method must implement the allocation and copy of constant memory to the GPU.

      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      This method must implement the dynamic allocation of shared memory on the GPU.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      This method must implement the allocation and copy of texture memory to the GPU.

      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1185,7 +1187,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1199,7 +1201,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -memset(buffer, value, size)
      +memset(buffer, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1214,7 +1216,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -1233,7 +1235,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -run_kernel(func, gpu_args, threads, grid)
      +run_kernel(func, gpu_args, threads, grid)

      Runs the OpenCL kernel passed as ‘func’.

      Parameters:
      @@ -1253,21 +1255,21 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      In OpenCL the event is created when the kernel is launched

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      In OpenCL the event is created when the kernel is launched

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -1275,14 +1277,14 @@

      kernel_tuner.backends.opencl.OpenCLFunctions -

      kernel_tuner.backends.compiler.CompilerFunctions

      +

      kernel_tuner.backends.compiler.CompilerFunctions

      -class kernel_tuner.backends.compiler.CompilerFunctions(iterations=7, compiler_options=None, compiler=None, observers=None)
      +class kernel_tuner.backends.compiler.CompilerFunctions(iterations=7, compiler_options=None, compiler=None, observers=None)

      Class that groups the code for running and compiling C functions

      -__init__(iterations=7, compiler_options=None, compiler=None, observers=None)
      +__init__(iterations=7, compiler_options=None, compiler=None, observers=None)

      instantiate CFunctions object used for interacting with C code

      Parameters:
      @@ -1293,13 +1295,13 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -cleanup_lib()
      +cleanup_lib()

      unload the previously loaded shared library

      -compile(kernel_instance)
      +compile(kernel_instance)

      call the C compiler to compile the kernel, return the function

      Parameters:
      @@ -1317,14 +1319,14 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise

      C backend does not support asynchronous launches

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      a simple memcpy copying from an Argument to a numpy array

      Parameters:
      @@ -1338,7 +1340,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      a simple memcpy copying from a numpy array to an Argument

      Parameters:
      @@ -1352,7 +1354,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      set the memory in allocation to the value in value

      Parameters:
      @@ -1367,7 +1369,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      ready argument list to be passed to the C function

      Parameters:
      @@ -1386,7 +1388,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -run_kernel(func, c_args, threads, grid, stream=None)
      +run_kernel(func, c_args, threads, grid, stream=None)

      runs the kernel once, returns whatever the kernel returns

      Parameters:
      @@ -1414,21 +1416,21 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement

      C backend does not use events

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement

      C backend does not use events

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks

      C backend does not support asynchronous launches

      @@ -1437,14 +1439,14 @@

      kernel_tuner.backends.compiler.CompilerFunctions -

      kernel_tuner.backends.hip.HipFunctions

      +

      kernel_tuner.backends.hip.HipFunctions

      -class kernel_tuner.backends.hip.HipFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.hip.HipFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the HIP functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate HipFunctions object used for interacting with the HIP device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -1462,7 +1464,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the HIP compiler to compile the kernel, return the function.

      Parameters:
      @@ -1480,7 +1482,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -1495,25 +1497,25 @@

      kernel_tuner.backends.hip.HipFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Copy texture memory arguments. Not yet implemented.

      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1527,7 +1529,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1541,7 +1543,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1556,7 +1558,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the HIP function.

      Parameters:
      @@ -1575,7 +1577,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the HIP kernel passed as ‘func’.

      Parameters:
      @@ -1595,19 +1597,19 @@

      kernel_tuner.backends.hip.HipFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -1616,32 +1618,32 @@

      kernel_tuner.backends.hip.HipFunctions -

      Util Functions

      +

      Util Functions

      -

      kernel_tuner.util

      +

      kernel_tuner.util

      Module for kernel tuner utility functions.

      -class kernel_tuner.util.CompilationFailedConfig
      +class kernel_tuner.util.CompilationFailedConfig
      -class kernel_tuner.util.ErrorConfig
      +class kernel_tuner.util.ErrorConfig
      -class kernel_tuner.util.InvalidConfig
      +class kernel_tuner.util.InvalidConfig
      -class kernel_tuner.util.NpEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
      +class kernel_tuner.util.NpEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)

      Class we use for dumping Numpy objects to JSON.

      -default(obj)
      +default(obj)

      Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).

      @@ -1664,138 +1666,138 @@

      Util Functions
      -class kernel_tuner.util.RuntimeFailedConfig
      +class kernel_tuner.util.RuntimeFailedConfig

      -exception kernel_tuner.util.SkippableFailure
      +exception kernel_tuner.util.SkippableFailure

      Exception used to raise when compiling or launching a kernel fails for a reason that can be expected.

      -exception kernel_tuner.util.StopCriterionReached
      +exception kernel_tuner.util.StopCriterionReached

      Exception thrown when a stop criterion has been reached.

      -kernel_tuner.util.check_argument_list(kernel_name, kernel_string, args)
      +kernel_tuner.util.check_argument_list(kernel_name, kernel_string, args)

Raise an exception if the kernel arguments do not match the host arguments.

      -kernel_tuner.util.check_argument_type(dtype, kernel_argument)
      +kernel_tuner.util.check_argument_type(dtype, kernel_argument)

      Check if the numpy.dtype matches the type used in the code.

      -kernel_tuner.util.check_restriction(restrict, params: dict) bool
      +kernel_tuner.util.check_restriction(restrict, params: dict) bool

      Check whether a configuration meets a search space restriction.

      -kernel_tuner.util.check_restrictions(restrictions, params: dict, verbose: bool) bool
      +kernel_tuner.util.check_restrictions(restrictions, params: dict, verbose: bool) bool

      Check whether a configuration meets the search space restrictions.
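For context, restrictions are usually written as boolean expressions over the tunable parameters and passed to tune_kernel via its restrictions argument; a small hypothetical sketch:

# configurations for which any expression evaluates to False are skipped
tune_params = {"block_size_x": [16, 32, 64, 128, 256],
               "block_size_y": [1, 2, 4, 8, 16]}
restrictions = ["block_size_x * block_size_y <= 1024",
                "block_size_x >= block_size_y"]
# these would be passed as tune_kernel(..., restrictions=restrictions)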

      -kernel_tuner.util.check_stop_criterion(to)
      +kernel_tuner.util.check_stop_criterion(to)

      Checks if max_fevals is reached or time limit is exceeded.

      -kernel_tuner.util.check_thread_block_dimensions(params, max_threads, block_size_names=None)
      +kernel_tuner.util.check_thread_block_dimensions(params, max_threads, block_size_names=None)

      Check on maximum thread block dimensions.

      -kernel_tuner.util.check_tune_params_list(tune_params, observers, simulation_mode=False)
      +kernel_tuner.util.check_tune_params_list(tune_params, observers, simulation_mode=False)

      Raise an exception if a tune parameter has a forbidden name.

      -kernel_tuner.util.compile_restrictions(restrictions: list, tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Union[str, constraint.constraints.Constraint, function], list[str]]]
      +kernel_tuner.util.compile_restrictions(restrictions: list, tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[str | Constraint | LambdaType, list[str]]]

      Parses restrictions from a list of strings into a list of strings, Functions, or Constraints (if try_to_constraint) and parameters used, or a single Function if monolithic is true.

      -kernel_tuner.util.config_valid(config, tuning_options, max_threads)
      +kernel_tuner.util.config_valid(config, tuning_options, max_threads)

      Combines restrictions and a check on the max thread block dimension to check config validity.

      -kernel_tuner.util.convert_constraint_restriction(restrict: Constraint)
      +kernel_tuner.util.convert_constraint_restriction(restrict: Constraint)

      Convert the python-constraint to a function for backwards compatibility.

-kernel_tuner.util.correct_open_cache(cache, open_cache=True)
+kernel_tuner.util.correct_open_cache(cache, open_cache=True)

-if cache file was not properly closed, pretend it was properly closed
+If cache file was not properly closed, pretend it was properly closed.

      -kernel_tuner.util.cuda_error_check(error)
      +kernel_tuner.util.cuda_error_check(error)

      Checking the status of CUDA calls using the NVIDIA cuda-python backend.

      -kernel_tuner.util.delete_temp_file(filename)
      +kernel_tuner.util.delete_temp_file(filename)

Delete a temporary file; don’t complain if it no longer exists.

      -kernel_tuner.util.detect_language(kernel_string)
      +kernel_tuner.util.detect_language(kernel_string)

      Attempt to detect language from the kernel_string.

      -kernel_tuner.util.dump_cache(obj: str, tuning_options)
      +kernel_tuner.util.dump_cache(obj: str, tuning_options)

Dumps a string in the cache; this omits the several checks of store_cache() to speed up the process (with great power comes great responsibility!).

      -kernel_tuner.util.get_best_config(results, objective, objective_higher_is_better=False)
      +kernel_tuner.util.get_best_config(results, objective, objective_higher_is_better=False)

      Returns the best configuration from a list of results according to some objective.

      -kernel_tuner.util.get_config_string(params, keys=None, units=None)
      +kernel_tuner.util.get_config_string(params, keys=None, units=None)

      Return a compact string representation of a measurement.

      -kernel_tuner.util.get_grid_dimensions(current_problem_size, params, grid_div, block_size_names)
      +kernel_tuner.util.get_grid_dimensions(current_problem_size, params, grid_div, block_size_names)

      Compute grid dims based on problem sizes and listed grid divisors.

      -kernel_tuner.util.get_instance_string(params)
      +kernel_tuner.util.get_instance_string(params)

Combine the parameters into a string, mostly used for debug output; use of a dict is advised.

      -kernel_tuner.util.get_kernel_string(kernel_source, params=None)
      +kernel_tuner.util.get_kernel_string(kernel_source, params=None)

      Retrieve the kernel source and return as a string.

      This function processes the passed kernel_source argument, which could be a function, a string with a filename, or just a string with code already.

      @@ -1825,43 +1827,43 @@

      Util Functions
      -kernel_tuner.util.get_problem_size(problem_size, params)
      +kernel_tuner.util.get_problem_size(problem_size, params)

      Compute current problem size.

      -kernel_tuner.util.get_smem_args(smem_args, params)
      +kernel_tuner.util.get_smem_args(smem_args, params)

      Return a dict with kernel instance specific size.

      -kernel_tuner.util.get_temp_filename(suffix=None)
      +kernel_tuner.util.get_temp_filename(suffix=None)

      Return a string in the form of temp_X, where X is a large integer.

      -kernel_tuner.util.get_thread_block_dimensions(params, block_size_names=None)
      +kernel_tuner.util.get_thread_block_dimensions(params, block_size_names=None)

      Thread block size from tuning params, currently using convention.

      -kernel_tuner.util.get_total_timings(results, env, overhead_time)
      +kernel_tuner.util.get_total_timings(results, env, overhead_time)

      Sum all timings and put their totals in the env.

      -kernel_tuner.util.looks_like_a_filename(kernel_source)
      +kernel_tuner.util.looks_like_a_filename(kernel_source)

      Attempt to detect whether source code or a filename was passed.

      -kernel_tuner.util.normalize_verify_function(v)
      +kernel_tuner.util.normalize_verify_function(v)

      Normalize a user-specified verify function.

The user-specified function has two required positional arguments (answer, result_host), and an optional keyword (or keyword-only) argument atol. We normalize it to always accept
@@ -1871,13 +1873,13 @@

      Util Functions
      -kernel_tuner.util.parse_restrictions(restrictions: list[str], tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Union[constraint.constraints.Constraint, str], list[str]]]
      +kernel_tuner.util.parse_restrictions(restrictions: list[str], tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Constraint | str, list[str]]]

      Parses restrictions from a list of strings into compilable functions and constraints, or a single compilable function (if monolithic is True). Returns a list of tuples of (strings or constraints) and parameters.

      -kernel_tuner.util.prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines)
      +kernel_tuner.util.prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines)

      Prepare kernel string for compilation.

      Prepends the kernel with a series of C preprocessor defines specific to this kernel instance:
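The full list of defines falls outside this hunk; as a rough illustration of the idea (exact names and values depend on the kernel instance), the effect is comparable to:

# illustrative sketch only: each tunable parameter of the current
# configuration becomes a C preprocessor define prepended to the source
kernel_string = "__global__ void vector_add(float *c, float *a, float *b, int n) { /* ... */ }"
params = {"block_size_x": 128}
defines = "".join(f"#define {name} {value}\n" for name, value in params.items())
kernel_with_defines = defines + kernel_string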

      @@ -1916,19 +1918,19 @@

      Util Functions
      -kernel_tuner.util.print_config(config, tuning_options, runner)
      +kernel_tuner.util.print_config(config, tuning_options, runner)

      Print the configuration string with tunable parameters and benchmark results.

      -kernel_tuner.util.print_config_output(tune_params, params, quiet, metrics, units)
      +kernel_tuner.util.print_config_output(tune_params, params, quiet, metrics, units)

      Print the configuration string with tunable parameters and benchmark results.

      -kernel_tuner.util.process_cache(cache, kernel_options, tuning_options, runner)
      +kernel_tuner.util.process_cache(cache, kernel_options, tuning_options, runner)

      Cache file for storing tuned configurations.

      the cache file is stored using JSON and uses the following format:

      { device_name: "name of device"
      @@ -1950,7 +1952,7 @@ 

      Util Functions
      -kernel_tuner.util.process_metrics(params, metrics)
      +kernel_tuner.util.process_metrics(params, metrics)

      Process user-defined metrics for derived benchmark results.

Metrics must be a dictionary to support composable metrics. The dictionary keys describe the name given to this user-defined metric and will be used as the key in the results dictionaries
@@ -1983,43 +1985,43 @@

      Util Functions
      -kernel_tuner.util.read_cache(cache, open_cache=True)
      +kernel_tuner.util.read_cache(cache, open_cache=True)

      Read the cachefile into a dictionary, if open_cache=True prepare the cachefile for appending.

      -kernel_tuner.util.read_file(filename)
      +kernel_tuner.util.read_file(filename)

      Return the contents of the file named filename or None if file not found.

      -kernel_tuner.util.replace_param_occurrences(string: str, params: dict)
      +kernel_tuner.util.replace_param_occurrences(string: str, params: dict)

      Replace occurrences of the tuning params with their current value.

      -kernel_tuner.util.setup_block_and_grid(problem_size, grid_div, params, block_size_names=None)
      +kernel_tuner.util.setup_block_and_grid(problem_size, grid_div, params, block_size_names=None)

      Compute problem size, thread block and grid dimensions for this kernel.

      -kernel_tuner.util.store_cache(key, params, tuning_options)
      +kernel_tuner.util.store_cache(key, params, tuning_options)

      Stores a new entry (key, params) to the cachefile.

      -kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc(compute_capability: str) str
      +kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc(compute_capability: str) str

Returns a valid Compute Capability for NVRTC --gpu-architecture=, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.

      -kernel_tuner.util.write_file(filename, string)
      +kernel_tuner.util.write_file(filename, string)

      Dump the contents of string to a file called filename.

diff --git a/latest/dev-environment.html b/latest/dev-environment.html
index e27f6de7c..4cc3263db 100644
--- a/latest/dev-environment.html
+++ b/latest/dev-environment.html
@@ -1,19 +1,21 @@
Development environment — Kernel Tuner 1.0 documentation
@@ -110,12 +112,12 @@
      -

      Development environment

      +

      Development environment

      The following steps help you set up a full development environment. These steps are only needed for core developers of Kernel Tuner who need to test against multiple Python versions or change dependencies of Kernel Tuner.

      For small changes to the code, please see the simplified instructions in the Simple development setup.

      -

      Local setup

      +

      Local setup

      Steps with sudo access (e.g. on a local device):

      1. Clone the git repository to the desired location: git clone https://github.com/KernelTuner/kernel_tuner.git, and cd to it.

      2. @@ -138,12 +140,12 @@

        Local setup
        Install the required Python versions:
        • On some systems, additional packages may be needed to build Python versions. For example on Ubuntu: sudo apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev lzma.

        • -
        • Install the Python versions with: pyenv install 3.8 3.9 3.10 3.11. The reason we’re installing all these versions as opposed to just one, is so we can test against all supported Python versions.

        • +
        • Install the Python versions with: pyenv install 3.9 3.10 3.11 3.12. The reason we’re installing all these versions as opposed to just one, is so we can test against all supported Python versions.

      -
    • Set the Python versions so they can be found: pyenv local 3.8 3.9 3.10 3.11 (replace local with global when not using the virtualenv).

    • +
    • Set the Python versions so they can be found: pyenv local 3.9 3.10 3.11 3.12 (replace local with global when not using the virtualenv).

    • Setup a local virtual environment in the folder: pyenv virtualenv 3.11 kerneltuner (or whatever environment name and Python version you prefer).

    • Install Poetry.
        @@ -184,7 +186,7 @@

        Local setup -

        Cluster setup

        +

        Cluster setup

        Steps without sudo access (e.g. on a cluster):

        1. Clone the git repository to the desired location: git clone https://github.com/KernelTuner/kernel_tuner.git.

        2. @@ -263,7 +265,7 @@

          Cluster setup -

          Running tests

          +

          Running tests

          To run the tests you can use nox (to run against all supported Python versions in isolated environments) and pytest (to run against the local Python version, see below) in the top-level directory. For full coverage, make Nox use the additional tests (such as cupy and cuda-python) with nox -- additional-tests.

The Nox isolated environments can take up to 1 gigabyte in size, so users tight on disk space can run nox with the small-disk option. This removes the other environment caches before each session is run (note that this will take longer to run). A better option would be to change the location environments are stored in with envdir in the noxsettings.toml file.

          @@ -284,7 +286,7 @@

          Running tests -

          Building documentation

          +

          Building documentation

Documentation is located in the doc/ directory. This is where you can type make html to generate the html pages in the doc/build/html directory. The source files used for building the documentation are located in
diff --git a/latest/diffusion.html b/latest/diffusion.html
index 0aec5079d..ae5e821ff 100644
--- a/latest/diffusion.html
+++ b/latest/diffusion.html
@@ -1,20 +1,22 @@
Diffusion — Kernel Tuner 1.0 documentation
@@ -115,7 +117,7 @@

          -

          Diffusion

          +

          Diffusion

          This guide is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this guide, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create a CUDA kernel that performs the same computation, but on the GPU. Once we have a CUDA kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our CUDA kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -123,7 +125,7 @@

Diffusion
GitHub repository. Install using pip install .[tutorial,cuda] and you’re ready to go! You can start the guide by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -159,7 +161,7 @@

          Diffusion

          -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
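The notebook cell itself is not part of this diff; as a rough sketch (assuming a NumPy-style implementation, which may differ from the guide’s own code), one explicit time step could look like:

import numpy as np

def diffuse_step(field, dt=0.225, dx=1.0, D=1.0):
    # one explicit finite-difference step of the 2D diffusion equation,
    # using the standard 5-point Laplacian stencil on the interior points
    new = field.copy()
    new[1:-1, 1:-1] = field[1:-1, 1:-1] + dt * D / dx**2 * (
        field[2:, 1:-1] + field[:-2, 1:-1] +
        field[1:-1, 2:] + field[1:-1, :-2] - 4.0 * field[1:-1, 1:-1])
    return new

# example: a sheet at one degree everywhere with a single hotspot
field = np.ones((256, 256), dtype=np.float32)
field[128, 128] = 1e3
field = diffuse_step(field)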
          @@ -262,7 +264,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this guide is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this guide, we’ll use the CUDA programming model to implement our kernels.

          If you prefer OpenCL over CUDA, don’t worry. Everything in this guide applies as much to OpenCL as it does to CUDA. But we will use CUDA for our examples, and CUDA terminology in the text.

          @@ -391,7 +393,7 @@

Computing on the GPU
Also, if you think the Python boilerplate code to call a GPU kernel was a bit messy, we’ve got good news for you! From now on, we’ll only use the Kernel Tuner to compile and benchmark GPU kernels, which we can do with much cleaner Python code.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.
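As a sketch of what such a tuning call looks like (the kernel below is a stand-in, not the guide’s own diffusion kernel, and the parameter values are only examples):

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void diffuse_kernel(float *u_new, float *u, int nx, int ny) {
    int x = blockIdx.x * block_size_x + threadIdx.x;
    int y = blockIdx.y * block_size_y + threadIdx.y;
    if (x > 0 && x < nx-1 && y > 0 && y < ny-1) {
        int i = y * nx + x;
        u_new[i] = u[i] + 0.225f * (u[i-1] + u[i+1] + u[i-nx] + u[i+nx] - 4.0f * u[i]);
    }
}
"""

nx, ny = np.int32(4096), np.int32(4096)
u = np.random.randn(int(nx) * int(ny)).astype(np.float32)
u_new = np.zeros_like(u)
args = [u_new, u, nx, ny]

# every combination of these values is compiled and benchmarked
tune_params = {"block_size_x": [16, 32, 48, 64, 128],
               "block_size_y": [1, 2, 4, 8, 16]}

results, env = tune_kernel("diffuse_kernel", kernel_string, (4096, 4096), args, tune_params)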

          @@ -487,7 +489,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using Shared Memory

          +

          Using Shared Memory

Shared memory is a special type of memory available in CUDA. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -570,7 +572,7 @@

          Using Shared Memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -876,7 +878,7 @@

          Tiling GPU Code -

          Storing the results

          +

          Storing the results

While it’s nice that the Kernel Tuner prints the tuning results to stdout, it’s not that great if we’d have to parse what is printed to get the results. That is why tune_kernel() returns a data structure that holds all the results. We’ve actually already used this data in the above bit of Python code.

tune_kernel returns a list of dictionaries, where each benchmarked kernel is represented by a dictionary containing the tunable parameters for that particular kernel configuration and one more entry called ‘time’. The list-of-dictionaries format is very flexible and can easily be converted to other formats that are easy to parse, like json or csv, for further analysis.

          You can execute the following code block to store the tuning results to both a json and a csv file (if you have Pandas installed).
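That cell is outside this diff hunk; a sketch of what such a block can look like (filenames are illustrative):

import json
import pandas as pd
from kernel_tuner.util import NpEncoder

# "results" is the list of dictionaries returned by tune_kernel above
with open("diffuse_results.json", "w") as fh:
    json.dump(results, fh, cls=NpEncoder, indent=2)

# a list of flat dictionaries maps directly onto a DataFrame
pd.DataFrame(results).to_csv("diffuse_results.csv", index=False)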

diff --git a/latest/diffusion_opencl.html b/latest/diffusion_opencl.html
index 286e59da2..b72ce8782 100644
--- a/latest/diffusion_opencl.html
+++ b/latest/diffusion_opencl.html
@@ -1,20 +1,22 @@
Tutorial: From physics to tuned GPU kernels — Kernel Tuner 1.0 documentation
@@ -104,7 +106,7 @@
          -

          Tutorial: From physics to tuned GPU kernels

          +

          Tutorial: From physics to tuned GPU kernels

          This tutorial is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this tutorial, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create an OpenCL kernel that performs the same computation, but on the GPU. Once we have a OpenCL kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our OpenCL kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -112,7 +114,7 @@

Tutorial: From physics to tuned GPU kernels
GitHub repository. Install using pip install .[tutorial,opencl] and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -148,7 +150,7 @@

          Diffusion -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
          @@ -249,7 +251,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this tutorial is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this tutorial, we’ll use the OpenCL programming model to implement our kernels.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.

          @@ -453,7 +455,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using Shared (local) Memory

          +

          Using Shared (local) Memory

Shared (or local) memory is a special type of memory available in OpenCL. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -535,7 +537,7 @@

          Using Shared (local) Memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -615,7 +617,7 @@

          Tiling GPU Code +
          @@ -727,13 +729,6 @@

          Tiling GPU Code -
          -
          -
          -
           block_size_x=48, block_size_y=4, tile_size_x=4, tile_size_y=4, time=0.6813376
           block_size_x=48, block_size_y=8, tile_size_x=1, tile_size_y=1, time=1.1493952
           block_size_x=48, block_size_y=8, tile_size_x=1, tile_size_y=2, time=0.8444928
          @@ -848,7 +843,7 @@ 

          Tiling GPU Code -

          Storing the results

          +

          Storing the results

While it’s nice that the Kernel Tuner prints the tuning results to stdout, it’s not that great if we’d have to parse what is printed to get the results. That is why tune_kernel() returns a data structure that holds all the results. We’ve actually already used this data in the above bit of Python code.

tune_kernel returns a list of dictionaries, where each benchmarked kernel is represented by a dictionary containing the tunable parameters for that particular kernel configuration and one more entry called ‘time’. The list-of-dictionaries format is very flexible and can easily be converted to other formats that are easy to parse, like json or csv, for further analysis.

          You can execute the following code block to store the tuning results to both a json and a csv file (if you have Pandas installed).

diff --git a/latest/diffusion_use_optparam.html b/latest/diffusion_use_optparam.html
index a3d9ccc0d..4e7f19d2e 100644
--- a/latest/diffusion_use_optparam.html
+++ b/latest/diffusion_use_optparam.html
@@ -1,20 +1,22 @@
Tutorial: From physics to tuned GPU kernels — Kernel Tuner 1.0 documentation
@@ -104,7 +106,7 @@
          -

          Tutorial: From physics to tuned GPU kernels

          +

          Tutorial: From physics to tuned GPU kernels

          This tutorial is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this tutorial, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create a CUDA kernel that performs the same computation, but on the GPU. Once we have a CUDA kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our CUDA kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -112,7 +114,7 @@

Tutorial: From physics to tuned GPU kernels
GitHub repository. Install the Kernel Tuner and Jupyter Notebooks and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -148,7 +150,7 @@

          Diffusion -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
          @@ -257,7 +259,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this tutorial is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this tutorial, we’ll use the CUDA programming model to implement our kernels.

          If you prefer OpenCL over CUDA, don’t worry. Everything in this tutorial applies as much to OpenCL as it does to CUDA. But we will use CUDA for our examples, and CUDA terminology in the text.

          @@ -361,7 +363,7 @@

Computing on the GPU
Also, if you think the Python boilerplate code to call a GPU kernel was a bit messy, we’ve got good news for you! From now on, we’ll only use the Kernel Tuner to compile and benchmark GPU kernels, which we can do with much cleaner Python code.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.

          @@ -456,7 +458,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using shared memory

          +

          Using shared memory

Shared memory is a special type of memory available in CUDA. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -546,7 +548,7 @@

          Using shared memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -621,7 +623,7 @@

          Tiling GPU Code +
          @@ -726,13 +728,6 @@

          Tiling GPU Code -
          -
          -
          -
           block_size_x=48, block_size_y=4, tile_size_x=1, tile_size_y=2, time=0.593977594376
           block_size_x=48, block_size_y=4, tile_size_x=1, tile_size_y=4, time=0.49723520875
           block_size_x=48, block_size_y=4, tile_size_x=2, tile_size_y=1, time=0.583270406723
          @@ -828,10 +823,10 @@ 

          Tiling GPU Code -

          Using the best parameters in a production run

          +

          Using the best parameters in a production run

          Now that we have determined which parameters are the best for our problems we can use them to simulate the heat diffusion problem. There are several ways to do so depending on the host language you wish to use.

          -

          Python run

          +

          Python run

To use the optimized parameters in a Python run, we simply have to modify the kernel code to specify which value to use for the block and tile size. There are of course many different ways to achieve this. In simple cases one can define a dictionary of values and replace the strings block_size_i and tile_size_j by their values.
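A small sketch of that approach (parameter values are examples; kernel_string is the CUDA source defined earlier in this tutorial):

from kernel_tuner.util import replace_param_occurrences

# best configuration found by the tuner (values here are only an example)
best_params = {"block_size_x": 32, "block_size_y": 4, "tile_size_x": 4, "tile_size_y": 4}

# every occurrence of a parameter name in the source is replaced by its value
fixed_kernel = replace_param_occurrences(kernel_string, best_params)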

          [18]:
          @@ -927,7 +922,7 @@ 

          Python run -

          C run

          +

          C run

If you wish to incorporate the optimized parameters in the kernel and use it in a C run, you can use an ifndef statement at the beginning of the kernel, as demonstrated in the pseudo code below.

          [ ]:
diff --git a/latest/examples.html b/latest/examples.html
index cde01499e..ed6eab402 100644
--- a/latest/examples.html
+++ b/latest/examples.html
@@ -1,19 +1,21 @@
Kernel Tuner Examples — Kernel Tuner 1.0 documentation
          @@ -122,7 +124,7 @@
             
          -

          Kernel Tuner Examples

          +

          Kernel Tuner Examples

Most of the examples show how to use Kernel Tuner to tune a CUDA, OpenCL, or C kernel, while demonstrating a particular use case of Kernel Tuner.

Except for test_vector_add.py and
@@ -136,7 +138,7 @@

          Below we list the example applications and the features they illustrate.

          -

          Vector Add

          +

          Vector Add

          [CUDA] [CUDA-C++] [OpenCL] [C] [Fortran] [OpenACC-C++] [OpenACC-Fortran]
          • use Kernel Tuner to tune a simple kernel

          • @@ -145,7 +147,7 @@

            Vector Add -

            Stencil

            +

            Stencil

            [CUDA] [OpenCL]
            • use a 2-dimensional problem domain with 2-dimensional thread blocks in a simple and clean example

            • @@ -154,7 +156,7 @@

              Stencil -

              Matrix Multiplication

              +

              Matrix Multiplication

              [CUDA] [OpenCL]
              • pass a filename instead of a string with code

              • @@ -167,12 +169,12 @@

                Matrix Multiplication

          -

          Convolution

          +

          Convolution

          There are several different examples centered around the convolution kernel [CUDA] [OpenCL]

          -

          convolution.py

          +

          convolution.py

          [CUDA] [OpenCL]
          • use tunable parameters for tuning for multiple input sizes

          • @@ -183,7 +185,7 @@

            convolution.py -

            sepconv.py

            +

            sepconv.py

            [CUDA] [OpenCL]
            • use the convolution kernel for separable filters

            • @@ -193,7 +195,7 @@

              sepconv.py -

              convolution_correct.py

              +

              convolution_correct.py

              [CUDA] [OpenCL]
              • use run_kernel to compute a reference answer

              • @@ -203,7 +205,7 @@

                convolution_correct.py

          -

          convolution_streams.py

          +

          convolution_streams.py

          [CUDA]
          • allocate page-locked host memory from Python

          • @@ -217,7 +219,7 @@

            convolution_streams.py

          -

          Reduction

          +

          Reduction

          [CUDA] [OpenCL]
          • use vector types and shuffle instructions (shuffle is only available in CUDA)

          • @@ -230,7 +232,7 @@

            Reduction -

            Sparse Matrix Vector Multiplication

            +

            Sparse Matrix Vector Multiplication

            [CUDA]
            • use scipy to compute a reference answer and verify all benchmarked kernels

            • @@ -240,7 +242,7 @@

              Sparse Matrix Vector Multiplication -

              Point-in-Polygon

              +

              Point-in-Polygon

              [CUDA]
              • overlap transfers with device mapped host memory

              • @@ -250,7 +252,7 @@

                Point-in-Polygon -

                ExpDist

                +

                ExpDist

                [CUDA]
                • in-thread block 2D reduction using CUB library

                • @@ -261,7 +263,7 @@

                  ExpDist -

                  Code Generator

                  +

                  Code Generator

                  [CUDA] [OpenCL]
                  • use a Python function as a code generator

diff --git a/latest/genindex.html b/latest/genindex.html
index 88f5ac7cc..8b6a2852b 100644
--- a/latest/genindex.html
+++ b/latest/genindex.html
@@ -1,18 +1,20 @@
Index — Kernel Tuner 1.0 documentation
diff --git a/latest/grid3d.html b/latest/grid3d.html
index fdeed281d..1156437b5 100644
--- a/latest/grid3d.html
+++ b/latest/grid3d.html
@@ -1,20 +1,22 @@
3D Grid on GPU with Kernel Tuner — Kernel Tuner 1.0 documentation
@@ -104,13 +106,13 @@
                    -

                    3D Grid on GPU with Kernel Tuner

                    +

                    3D Grid on GPU with Kernel Tuner

In this tutorial we are going to see how to map a series of Gaussian functions, each located at a different point on a 3D grid. We are going to optimize the GPU code and compare its performance with the CPU implementation.

                    Note: If you are reading this tutorial on the Kernel Tuner’s documentation pages, note that you can actually run this tutorial as a Jupyter Notebook. Just clone the Kernel Tuner’s GitHub repository. Install the Kernel Tuner and Jupyter Notebooks and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

                    -

                    Let’s start on the CPU

                    +

                    Let’s start on the CPU

                    Before delving into the GPU implementation, let’s start with a simple CPU implementation of the problem. The problem at hand is to compute the values of the following function

\begin{equation} \nonumber f = \sum_{i=1}^{N}\exp\left(-\beta \sqrt{(x-x_i)^2+(y-y_i)^2+(z-z_i)^2}\right) \end{equation}
@@ -175,7 +177,7 @@

Let’s start on the CPU
Depending on your hardware it might take a few seconds for the calculations above to finish.

                    -

                    Let’s move to the GPU

                    +

                    Let’s move to the GPU

Let’s now see how that will look on the GPU. We first write a kernel that does the same calculation as the above function. As you can see below, the variables block_size_x, block_size_y and block_size_z are not yet defined here. These variables are used to set the number of threads per thread block on the GPU and are the main parameters that we will optimize in this tutorial. During tuning, Kernel Tuner will automatically insert #define statements for these parameters at the top of the kernel code. So for now we don’t have to specify their values.

                    The dimensions of the problem nx, ny, and nz, are the number of grid points in the x, y, and z dimensions. We can again use Kernel Tuner to insert these parameters into the code.

                    @@ -221,7 +223,7 @@

                    Let’s move to the GPU

                    -

                    Tune the kernel

                    +

                    Tune the kernel

                    We can now use the tuner to optimize the thread block dimensions on our GPU. To do so we define the tunable parameters of our kernel using the tune_params dictionary, which assigns to each block size the values we want the tuner to explore. We also use the tunable parameters to insert the domain dimensions nx, ny, and nz.

                    We also define a list containing the arguments of the CUDA function (AddGrid) above. Since we only want to optimize the performance of the kernel we only consider here one center in the middle of the grid. Note that Kernel Tuner needs either numpy.ndarray or numpy.scalar as arguments of the kernel. Hence we need to be specific on the types of the Gaussians positions.
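As an illustration of the first part of that cell (the value ranges and domain size below are made up, not the notebook’s own):

from collections import OrderedDict

tune_params = OrderedDict()
tune_params["block_size_x"] = [2, 4, 8, 16, 32]
tune_params["block_size_y"] = [2, 4, 8, 16, 32]
tune_params["block_size_z"] = [2, 4, 8, 16, 32]

# the fixed domain dimensions are inserted into the code the same way,
# by giving each of them a single value
tune_params["nx"] = [128]
tune_params["ny"] = [128]
tune_params["nz"] = [128]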

                    @@ -366,7 +368,7 @@

                    Tune the kernel -

                    Using the optimized parameters

                    +

                    Using the optimized parameters

Now that we have determined which parameters are best suited for our application, we can specify them in our kernel and run it. In our case, the optimal thread block size determined by the tuner was block_size_x = 4, block_size_y = 2, block_size_z = 16. We therefore use these parameters here to define the block size. The grid size is simply obtained by dividing the dimension of the problem by the corresponding block size.

                    [6]:
diff --git a/latest/hostcode.html b/latest/hostcode.html
index 44ed78b16..7d2bec503 100644
--- a/latest/hostcode.html
+++ b/latest/hostcode.html
@@ -1,19 +1,21 @@
Tuning Host Code — Kernel Tuner 1.0 documentation
                    @@ -106,7 +108,7 @@
                                
                    -

                    Tuning Host Code

                    +

                    Tuning Host Code

With the Kernel Tuner it is also possible to tune the host code of your GPU programs, or even just any C function for that matter. Tuning host code can be useful when it contains parameters that have impact on the performance of the kernel on the GPU, such as the number of streams to use when executing a kernel across multiple streams. Another example is when you want to include the data transfers between
@@ -132,7 +134,7 @@

Tuning Host Code
C vector add example we are using the omp_get_wtime() function from OpenMP to measure time on the CPU.

                    -

                    Tuning the number of streams

                    +

                    Tuning the number of streams

                    The following describes the example in examples/cuda/convolution_streams.py. In this example, the same convolution kernel is used as with correctness checking and convolution application example.

What is different is that we also supply the host code, which you can find in examples/cuda/convolution_streams.cu. It is a bit
diff --git a/latest/index.html b/latest/index.html
index a325f19de..0d428b2d6 100644
--- a/latest/index.html
+++ b/latest/index.html
@@ -1,19 +1,21 @@
The Kernel Tuner documentation — Kernel Tuner 1.0 documentation
@@ -108,7 +110,7 @@

                    -

                    The Kernel Tuner documentation

                    +

                    The Kernel Tuner documentation

                    Kernel Tuner is a software development tool for the creation of highly-optimized and tuned GPU applications.

                    The Kernel Tuner documentation pages are mostly about Kernel Tuner itself, but there are a number of related repositories that are considered part of the Kernel Tuner family:

                    @@ -120,7 +122,7 @@

                    The Kernel Tuner documentation -

                    Quick install

                    +

                    Quick install

                    The easiest way to install the Kernel Tuner is using pip:

                    To tune CUDA kernels:

                    -

                    Example usage

                    +

                    Example usage

                    The following shows a simple example for tuning a CUDA kernel:

                    kernel_string = """
                     __global__ void vector_add(float *c, float *a, float *b, int n) {
                    @@ -172,7 +174,7 @@ 

                    Example usage -

                    Citation

                    +

                    Citation

                    If you use Kernel Tuner in research or research software, please cite the most relevant among the following publications:

                    The first paper on Kernel Tuner, please note that the capabilities of Kernel Tuner have significantly expanded since the first publication:

                    @article{kerneltuner,
diff --git a/latest/install.html b/latest/install.html
index 90e904efd..7e4aabd58 100644
--- a/latest/install.html
+++ b/latest/install.html
@@ -1,19 +1,21 @@
Installation — Kernel Tuner 1.0 documentation
                    @@ -115,23 +117,23 @@
                       
                    -

                    Installation

                    +

                    Installation

                    The Kernel Tuner requires several packages to be installed. First of all, you need a working Python version, several Python packages, and optionally CUDA and/or OpenCL installations. All of this is explained in detail in this guide.

                    For comprehensive step-by-step instructions on setting up a development environment, see Development Environment.

                    -

                    Python

                    +

                    Python

                    You need a Python installation. We recommend using Python 3 and installing it with Miniconda. Linux users could type the following to download and install Python 3 using Miniconda:

                    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
                     bash Miniconda3-latest-Linux-x86_64.sh
                     
                    -

                    You are of course also free to use your own Python installation, and the Kernel Tuner is developed to be fully compatible with Python 3.8 and newer.

                    +

                    You are of course also free to use your own Python installation, and the Kernel Tuner is developed to be fully compatible with Python 3.9 and newer.

                    -

                    Installing Python Packages

                    +

                    Installing Python Packages

                    Note that when you are using a native Python installation, the pip command used to install Kernel Tuner and its dependencies requires sudo rights for system wide installation.

                    @@ -147,7 +149,7 @@

                    Installing Python Packages -

                    CUDA and PyCUDA

                    +

                    CUDA and PyCUDA

                    Installing CUDA and PyCUDA is optional, because you may want to only use Kernel Tuner for tuning OpenCL or C kernels.

If you want to use the Kernel Tuner to tune
@@ -173,12 +175,12 @@

Installing Python Packages
https://wiki.tiker.net/PyCuda/Installation)

                    -

                    Other CUDA Backends

                    +

                    Other CUDA Backends

Kernel Tuner can also be used with CuPy (https://cupy.dev/) or Nvidia’s CUDA Python bindings (https://nvidia.github.io/cuda-python/). Please see the installation instructions of those projects for how to install the required Python packages.

                    Please refer to the documentation on backends on how to use and select these backends.

                    -

                    OpenCL and PyOpenCL

                    +

                    OpenCL and PyOpenCL

Before we can install PyOpenCL you’ll need an OpenCL compiler. There are several OpenCL compilers available depending on the OpenCL platform you want your code to run on.

                    @@ -203,7 +205,7 @@

OpenCL and PyOpenCL
https://wiki.tiker.net/PyOpenCL/Installation)

                    -

                    HIP and PyHIP

                    +

                    HIP and PyHIP

                    Before we can install PyHIP, you’ll need to have the HIP runtime and compiler installed on your system. The HIP compiler is included as part of the ROCm software stack. Here is AMD’s installation guide:

                      @@ -223,7 +225,7 @@

                      HIP and PyHIP -

                      Installing the git version

                      +

                      Installing the git version

                      You can also install from the git repository. This way you also get the examples. Please note that this will install all required dependencies in the current environment. For step-by-step instructions on setting up a development environment, see Development Environment.

                      @@ -253,7 +255,7 @@

                      Installing the git version -

                      Dependencies for the guides

                      +

                      Dependencies for the guides

Some additional Python packages are required to run the Jupyter notebook guides. These packages are commonly used and chances are that you already have these installed.

However, to install Kernel Tuner along with the dependencies to run the guides,
diff --git a/latest/matrix_multiplication.html b/latest/matrix_multiplication.html
index 58a3f275c..74cccfffe 100644
--- a/latest/matrix_multiplication.html
+++ b/latest/matrix_multiplication.html
@@ -1,20 +1,22 @@
Matrix multiplication — Kernel Tuner 1.0 documentation
@@ -112,7 +114,7 @@

                      -

                      Matrix multiplication

                      +

                      Matrix multiplication

                      This guide demonstrates how to use Kernel Tuner to test and tune kernels, using matrix multiplication as an example.

                      Matrix multiplication is one of the most well-known and widely-used linear algebra operations, and is frequently used to demonstrate the high-performance computing capabilities of GPUs. As such, matrix multiplication presents a familiar starting point for many GPU programmers.

                      @@ -120,7 +122,7 @@

                      Matrix multiplication

                      Make sure to execute all the code cells you come across in this tutorial by selecting them and pressing shift+enter.

                      -

                      Naive CUDA kernel

                      +

                      Naive CUDA kernel

                      We’ll start with a very simple kernel for performing a matrix multiplication in CUDA. The idea is that this kernel is executed with one thread per element in the output matrix. As such, each thread \((i,j)\) iterates over the entire row \(i\) in matrix \(A\), and column \(j\) in matrix \(B\).

To keep the code clean and simple, we’ll assume that we only work with square matrices. Execute the following cell to write our naive matrix multiplication kernel to a file named “matmul_naive.cu” by pressing shift+enter.

                      @@ -151,7 +153,7 @@

                      Naive CUDA kernel -

                      Tuning a naive kernel

                      +

                      Tuning a naive kernel

                      Now we will have a look at how to use Kernel Tuner to find the best performing combination of tunable parameters for our naive matrix multiplication kernel. We’ll go over the process of creating an auto-tuning script step-by-step.

                      Because the tuner will need to execute the kernel, we start with creating some input data.
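As a sketch of this step, the setup could look like the following. The matrix size, the kernel name matmul_kernel, the argument order, and the parameter values are illustrative assumptions, not necessarily the tutorial’s exact choices.

import numpy as np
import kernel_tuner

# Square matrices: problem_size gives the x and y dimensions of the output.
problem_size = (512, 512)
A = np.random.randn(*problem_size).astype(np.float32)
B = np.random.randn(*problem_size).astype(np.float32)
C = np.zeros_like(A)
# The argument list must match the kernel's parameter order.
args = [C, A, B]

# Tune the thread block dimensions of the naive kernel written above.
tune_params = {"block_size_x": [16, 32, 64], "block_size_y": [8, 16, 32]}

results, env = kernel_tuner.tune_kernel(
    "matmul_kernel", "matmul_naive.cu", problem_size, args, tune_params)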

                      -

                      Using shared memory

                      +

                      Using shared memory

                      We can increase the utilization of memory bandwidth with a technique called cache-blocking or loop-tiling. To this end, we define two square data structures in shared memory, which will be used for storing square parts of matrix \(A\) and \(B\). The threads in a thread block will collaboratively fill these two submatrices, and then proceed to perform all the computations that need this data, before moving to the next blocked iteration.

                      The code required to do this is a little bit more complex:

                      @@ -289,7 +291,7 @@

                      Using shared memory -

                      Increase work per thread

                      +

                      Increase work per thread

                      A commonly used code optimization in GPU programming is to increase the amount of work performed by each thread. This optimization has several benefits. It increases data reuse within the thread block and reduces the number of redundant instructions executed by distinct threads. This code optimization is typically called 1xN Tiling or thread-block-merge. We will use two different forms of 1xN tiling in this example:

                      First of all, in the x-direction we will use tiling in a way that is similar to the convolution example (used as part of the ‘Getting Started’ tutorial). The area of output data that is processed by a single thread block is increased by a factor of N, and as such shared memory usage also increases by a factor \(N\). This means that the number of thread blocks needed to execute the kernel for this problem size is also reduced by a factor of \(N\). While this may reduce occupancy due to increased shared memory and register usage, this optimization drastically reduces the number of redundant instructions that were previously distributed across multiple thread blocks.
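Continuing the sketch above, tiling factors are typically added as extra tunable parameters, and grid divisor lists tell Kernel Tuner to shrink the grid accordingly. The file name and value lists below are assumptions for illustration only.

# Each thread block now produces a tile_size_x by tile_size_y larger part of C,
# so the grid is divided by block size times tile size in each dimension.
tune_params["tile_size_x"] = [1, 2, 4]
tune_params["tile_size_y"] = [1, 2, 4]

results, env = kernel_tuner.tune_kernel(
    "matmul_kernel", "matmul_tiled.cu", problem_size, args, tune_params,
    grid_div_x=["block_size_x", "tile_size_x"],
    grid_div_y=["block_size_y", "tile_size_y"])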

diff --git a/latest/metrics.html b/latest/metrics.html
index 7ac260443..2dd3335df 100644
--- a/latest/metrics.html
+++ b/latest/metrics.html
@@ -1,19 +1,21 @@
Metrics and Objectives — Kernel Tuner 1.0 documentation
@@ -107,10 +109,10 @@
                      -

                      Metrics and Objectives

                      +

                      Metrics and Objectives

                      Metrics and custom tuning objectives are two related features that are explained on this page.

                      -

                      Metrics

                      +

                      Metrics

User-defined metrics serve as an easy way for the user to define their own derived results based on the measurements reported by Kernel Tuner, and possibly any additional observers. This allows the user, for example, to implement performance metrics, such as performance in floating point operations per second (e.g. GFLOP/s), or other metrics that might be more specific to the
@@ -131,7 +133,7 @@
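As a sketch of how such a metric is defined and passed via the metrics= argument of tune_kernel(): the kernel, file name, and flop count below are illustrative assumptions, and the objective arguments at the end are explained under Tuning Objectives further down this page.

from collections import OrderedDict
import numpy as np
import kernel_tuner

size = 10000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

metrics = OrderedDict()
# p is a dict holding the tunable parameters and measured results of one
# configuration; "time" is the benchmarked execution time in milliseconds.
metrics["GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1e3)

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [128, 256, 512]}, metrics=metrics,
    objective="GFLOP/s", objective_higher_is_better=True)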

                      Metrics and Objectives

                      -

                      Tuning Objectives

                      +

                      Tuning Objectives

Users can specify tuning objectives other than the default optimization objective, which is kernel execution time. When using an optimization strategy other than exhaustive search (brute force), this objective is used to guide the optimization through the parameter space. The tuning objective is specified using the objective= optional parameter of tune_kernel() and

diff --git a/latest/observers.html b/latest/observers.html
index 6f60bacb7..1ed7175a2 100644
--- a/latest/observers.html
+++ b/latest/observers.html
@@ -1,19 +1,21 @@
Observers — Kernel Tuner 1.0 documentation
@@ -128,7 +130,7 @@

                      -

                      Observers

                      +

                      Observers

To facilitate measurements of quantities other than kernel execution time, and to make it easy for the user to control exactly what is being measured by Kernel Tuner, we have introduced the Observers feature. In the layered software architecture of Kernel Tuner, observers act as programmable hooks to allow the
@@ -144,35 +146,35 @@
function, the state of GPU memory, or any other information in the GPU runtime.

                      -class kernel_tuner.observers.BenchmarkObserver
                      +class kernel_tuner.observers.BenchmarkObserver

                      Base class for Benchmark Observers

                      -after_finish()
                      +after_finish()

                      after finish is called once every iteration after the kernel has finished execution

                      -after_start()
                      +after_start()

                      after start is called every iteration directly after the kernel was launched

                      -before_start()
                      +before_start()

                      before start is called every iteration before the kernel starts

                      -during()
                      +during()

                      during is called as often as possible while the kernel is running

                      -abstract get_results()
                      +abstract get_results()

                      get_results should return a dict with results that adds to the benchmarking data

                      get_results is called only once per benchmarking of a single kernel configuration and generally returns averaged values over multiple iterations.

                      @@ -180,14 +182,14 @@
                      -register_configuration(params)
                      +register_configuration(params)

                      Called once before benchmarking of a single kernel configuration. The params argument is a dict that stores the configuration parameters.

                      -register_device(dev)
                      +register_device(dev)

                      Sets self.dev, for inspection by the observer at various points during benchmarking
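To make the interface above concrete, here is a minimal sketch of a custom observer; what it measures, host-side wall-clock time per iteration, is just an illustrative choice and not an observer shipped with Kernel Tuner.

import time
from kernel_tuner.observers import BenchmarkObserver

class WallClockObserver(BenchmarkObserver):
    """Illustrative observer recording host-side wall-clock time per iteration."""

    def __init__(self):
        self.times = []

    def before_start(self):
        # called every iteration before the kernel starts
        self.start = time.perf_counter()

    def after_finish(self):
        # called every iteration after the kernel has finished execution
        self.times.append(time.perf_counter() - self.start)

    def get_results(self):
        # called once per kernel configuration; return averaged values
        result = {"host_time": sum(self.times) / len(self.times)}
        self.times = []
        return result

# used by passing an instance to tune_kernel, e.g. observers=[WallClockObserver()]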

                      @@ -207,7 +209,7 @@

                    -

                    PowerSensorObserver

                    +

                    PowerSensorObserver

PowerSensor2 is a custom-built power measurement device for PCIe devices that intercepts the device power with current sensors and transmits the data to the host over a USB connection. The main advantage of using PowerSensor2 over the GPU’s built-in power sensor is that PowerSensor2 reports
@@ -221,7 +223,7 @@

                    PowerSensorObserver
                    -class kernel_tuner.observers.powersensor.PowerSensorObserver(observables=None, device=None)
                    +class kernel_tuner.observers.powersensor.PowerSensorObserver(observables=None, device=None)

Observer that uses an external PowerSensor2 device to accurately measure power

                    Requires PowerSensor2 hardware and powersensor Python bindings.

                    @@ -238,7 +240,7 @@

                    PowerSensorObserver -

                    NVMLObserver

                    +

                    NVMLObserver

Kernel Tuner also implements an NVMLObserver, which allows the user to observe the power usage, energy consumption, core and memory frequencies, core voltage and temperature for all kernel configurations during benchmarking as reported by the NVIDIA Management Library (NVML). To facilitate the interaction with
@@ -254,7 +256,7 @@

                    NVMLObserver
                    -class kernel_tuner.observers.nvml.NVMLObserver(observables, device=0, save_all=False, nvidia_smi_fallback=None, use_locked_clocks=False, continous_duration=1)
                    +class kernel_tuner.observers.nvml.NVMLObserver(observables, device=0, save_all=False, nvidia_smi_fallback=None, use_locked_clocks=False, continous_duration=1)

                    Observer that uses NVML to monitor power, energy, clock frequencies, voltages and temperature.

                    The NVMLObserver can also be used to tune application-specific clock frequencies or power limits in combination with other parameters.
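A sketch of typical usage follows; the observable names are assumptions based on common examples, and the full list of supported observables is given in the class documentation.

import numpy as np
import kernel_tuner
from kernel_tuner.observers.nvml import NVMLObserver

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# Observe energy and temperature on GPU 0; observable names are assumptions.
nvmlobserver = NVMLObserver(["nvml_energy", "temperature"], device=0)

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [128, 256, 512]}, observers=[nvmlobserver])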

                    @@ -281,7 +283,7 @@

                    NVMLObserver -

                    Tuning execution parameters with NVML

                    +

                    Tuning execution parameters with NVML

When you are using the NVMLObserver, Kernel Tuner can use its interface to NVML to enable tuning of execution parameters, such as power limits or memory and core clock frequencies. Using application-specific clock frequencies is one of the most common approaches to tuning energy efficiency on
@@ -302,13 +304,13 @@
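Continuing the observer sketch above, this is roughly what tuning an execution parameter could look like. The special parameter names nvml_gr_clock and the clock values shown are assumptions about the NVML-backed tunables this page refers to, and the values must match frequencies actually supported by the device.

# Tune an application-specific core clock alongside a regular parameter.
# The parameter name nvml_gr_clock and the clock values (in MHz) are
# illustrative assumptions; check the NVML documentation and your device.
tune_params = {
    "block_size_x": [128, 256, 512],
    "nvml_gr_clock": [1110, 1350, 1590],
}

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    tune_params, observers=[nvmlobserver])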

                    Tuning execution parameters with NVML -

                    PMTObserver

                    +

                    PMTObserver

The PMTObserver can be used to measure power and energy on various platforms including Nvidia Jetson, Nvidia NVML, the RAPL interface, AMD ROCm, and Xilinx. It requires PMT to be installed, as well as PMT’s Python interface. More information about PMT can be found here: https://git.astron.nl/RD/pmt/

                    -class kernel_tuner.observers.pmt.PMTObserver(observable=None)
                    +class kernel_tuner.observers.pmt.PMTObserver(observable=None)

                    Observer that uses the PMT library to measure power

                    Parameters:
                    diff --git a/latest/optimization.html b/latest/optimization.html index 448d0729e..2d45eb803 100644 --- a/latest/optimization.html +++ b/latest/optimization.html @@ -1,19 +1,21 @@ - + - + Optimization strategies — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -182,7 +184,7 @@
                    -

                    Optimization strategies

                    +

                    Optimization strategies

                    Kernel Tuner supports many optimization strategies that accelerate the auto-tuning search process. By default, Kernel Tuner uses ‘brute force’ tuning, which means that Kernel Tuner will try all possible combinations of all values of all tunable parameters. Even with simple kernels this form of tuning can become prohibitively slow and a waste of time and energy.

                    @@ -225,11 +227,11 @@

                    Below all the strategies are listed with their strategy-specific options that can be passed in a dictionary to the strategy_options= argument of tune_kernel().
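For example, a strategy and its options are selected as in the sketch below; the chosen strategy and option values are only an illustration.

import numpy as np
import kernel_tuner

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# Sketch: use a genetic algorithm instead of brute force and stop after at
# most 100 evaluated configurations via the max_fevals option.
results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [32, 64, 128, 256, 512, 1024]},
    strategy="genetic_algorithm", strategy_options={"max_fevals": 100})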

                    -

                    kernel_tuner.strategies.basinhopping

                    +

                    kernel_tuner.strategies.basinhopping

                    The strategy that uses the basinhopping global optimization method.

                    -kernel_tuner.strategies.basinhopping.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.basinhopping.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This basin hopping strategy supports the following strategy_options:

                    @@ -259,29 +261,29 @@
                    -

                    kernel_tuner.strategies.bayes_opt

                    +

                    kernel_tuner.strategies.bayes_opt

                    Bayesian Optimization implementation from the thesis by Willemsen.

                    -kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts(tune_params: dict, eps: float) Tuple[dict, dict]
                    +kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts(tune_params: dict, eps: float) Tuple[dict, dict]

                    Generates normalization and denormalization dictionaries.

                    -kernel_tuner.strategies.bayes_opt.normalize_parameter_space(param_space: list, tune_params: dict, normalized: dict) list
                    +kernel_tuner.strategies.bayes_opt.normalize_parameter_space(param_space: list, tune_params: dict, normalized: dict) list

                    Normalize the parameter space given a normalization dictionary.

                    -kernel_tuner.strategies.bayes_opt.prune_parameter_space(parameter_space, tuning_options, tune_params, normalize_dict)
                    +kernel_tuner.strategies.bayes_opt.prune_parameter_space(parameter_space, tuning_options, tune_params, normalize_dict)

                    Pruning of the parameter space to remove dimensions that have a constant parameter.

                    -kernel_tuner.strategies.bayes_opt.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.bayes_opt.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space.

                    Params runner:
                    @@ -303,11 +305,11 @@
                    -

                    kernel_tuner.strategies.brute_force

                    +

                    kernel_tuner.strategies.brute_force

                    The default strategy that iterates through the whole parameter space

                    -kernel_tuner.strategies.brute_force.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.brute_force.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Brute Force strategy supports the following strategy_options:

                    @@ -331,11 +333,11 @@
                    -

                    kernel_tuner.strategies.diff_evo

                    +

                    kernel_tuner.strategies.diff_evo

                    The differential evolution strategy that optimizes the search through the parameter space.

                    -kernel_tuner.strategies.diff_evo.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.diff_evo.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Differential Evolution strategy supports the following strategy_options:

                    @@ -366,11 +368,11 @@
                    -

                    kernel_tuner.strategies.dual_annealing

                    +

                    kernel_tuner.strategies.dual_annealing

                    The strategy that uses the dual annealing optimization method.

                    -kernel_tuner.strategies.dual_annealing.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.dual_annealing.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Dual Annealing strategy supports the following strategy_options:

                    @@ -399,27 +401,27 @@
                    -

                    kernel_tuner.strategies.firefly_algorithm

                    +

                    kernel_tuner.strategies.firefly_algorithm

                    The strategy that uses the firefly algorithm for optimization.

                    -class kernel_tuner.strategies.firefly_algorithm.Firefly(bounds)
                    +class kernel_tuner.strategies.firefly_algorithm.Firefly(bounds)

                    Firefly object for use in the Firefly Algorithm.

                    -compute_intensity(fun)
                    +compute_intensity(fun)

                    Evaluate cost function and compute intensity at this position.

                    -distance_to(other)
                    +distance_to(other)

Return Euclidean distance between self and other Firefly.

                    -move_towards(other, beta, alpha)
                    +move_towards(other, beta, alpha)

                    Move firefly towards another given beta and alpha values.

                    @@ -427,7 +429,7 @@
                    -kernel_tuner.strategies.firefly_algorithm.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.firefly_algorithm.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This firefly algorithm strategy supports the following strategy_options:

                    @@ -460,11 +462,11 @@
                    -

                    kernel_tuner.strategies.genetic_algorithm

                    +

                    kernel_tuner.strategies.genetic_algorithm

                    A simple genetic algorithm for parameter search.

                    -kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover(dna1, dna2)

                    Disruptive uniform crossover.

uniformly crossover genes between dna1 and dna2, with children guaranteed to be different from parents,
@@ -473,19 +475,19 @@

                    -kernel_tuner.strategies.genetic_algorithm.mutate(dna, mutation_chance, searchspace: Searchspace, cache=True)
                    +kernel_tuner.strategies.genetic_algorithm.mutate(dna, mutation_chance, searchspace: Searchspace, cache=True)

                    Mutate DNA with 1/mutation_chance chance.

                    -kernel_tuner.strategies.genetic_algorithm.single_point_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.single_point_crossover(dna1, dna2)

                    Crossover dna1 and dna2 at a random index.

                    -kernel_tuner.strategies.genetic_algorithm.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.genetic_algorithm.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Genetic Algorithm strategy supports the following strategy_options:

                    @@ -517,29 +519,29 @@
                    -kernel_tuner.strategies.genetic_algorithm.two_point_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.two_point_crossover(dna1, dna2)

                    Crossover dna1 and dna2 at 2 random indices.

                    -kernel_tuner.strategies.genetic_algorithm.uniform_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.uniform_crossover(dna1, dna2)

                    Randomly crossover genes between dna1 and dna2.

                    -kernel_tuner.strategies.genetic_algorithm.weighted_choice(population, n)
                    +kernel_tuner.strategies.genetic_algorithm.weighted_choice(population, n)

Randomly select n unique individuals from a weighted population; fitness determines the probability of being selected.

                    -

                    kernel_tuner.strategies.greedy_ils

                    +

                    kernel_tuner.strategies.greedy_ils

                    A simple greedy iterative local search algorithm for parameter search.

                    -kernel_tuner.strategies.greedy_ils.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.greedy_ils.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Greedy Iterative Local Search (ILS) strategy supports the following strategy_options:

                    @@ -571,11 +573,11 @@
                    -

                    kernel_tuner.strategies.greedy_mls

                    +

                    kernel_tuner.strategies.greedy_mls

                    A greedy multi-start local search algorithm for parameter search.

                    -kernel_tuner.strategies.greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Greedy Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -607,11 +609,11 @@
                    -

                    kernel_tuner.strategies.minimize

                    +

                    kernel_tuner.strategies.minimize

                    The strategy that uses a minimizer method for searching through the parameter space.

                    -kernel_tuner.strategies.minimize.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.minimize.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Minimize strategy supports the following strategy_options:

                    @@ -640,11 +642,11 @@
                    -

                    kernel_tuner.strategies.mls

                    +

                    kernel_tuner.strategies.mls

                    The strategy that uses multi-start local search.

                    -kernel_tuner.strategies.mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -676,11 +678,11 @@
                    -

                    kernel_tuner.strategies.ordered_greedy_mls

                    +

                    kernel_tuner.strategies.ordered_greedy_mls

                    A greedy multi-start local search algorithm for parameter search that traverses variables in order.

                    -kernel_tuner.strategies.ordered_greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.ordered_greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Ordered Greedy Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -712,11 +714,11 @@
                    -

                    kernel_tuner.strategies.pso

                    +

                    kernel_tuner.strategies.pso

                    The strategy that uses particle swarm optimization.

                    -kernel_tuner.strategies.pso.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.pso.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Particle Swarm Optimization (PSO) strategy supports the following strategy_options:

                    @@ -749,11 +751,11 @@
                    -

                    kernel_tuner.strategies.random_sample

                    +

                    kernel_tuner.strategies.random_sample

                    Iterate over a random sample of the parameter space.

                    -kernel_tuner.strategies.random_sample.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.random_sample.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Random Sampling strategy supports the following strategy_options:

                    @@ -782,23 +784,23 @@
                    -

                    kernel_tuner.strategies.simulated_annealing

                    +

                    kernel_tuner.strategies.simulated_annealing

The strategy that uses simulated annealing.

                    -kernel_tuner.strategies.simulated_annealing.acceptance_prob(old_cost, new_cost, T, tuning_options)
                    +kernel_tuner.strategies.simulated_annealing.acceptance_prob(old_cost, new_cost, T, tuning_options)

                    Annealing equation, with modifications to work towards a lower value.

                    -kernel_tuner.strategies.simulated_annealing.neighbor(pos, searchspace: Searchspace)
                    +kernel_tuner.strategies.simulated_annealing.neighbor(pos, searchspace: Searchspace)

                    Return a random neighbor of pos.

                    -kernel_tuner.strategies.simulated_annealing.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.simulated_annealing.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Simulated Annealing strategy supports the following strategy_options:

diff --git a/latest/py-modindex.html b/latest/py-modindex.html
index 8ce00ed8f..64c70e244 100644
--- a/latest/py-modindex.html
+++ b/latest/py-modindex.html
@@ -1,18 +1,20 @@
Python Module Index — Kernel Tuner 1.0 documentation

diff --git a/latest/quickstart.html b/latest/quickstart.html
index b042cac82..69501d8a3 100644
--- a/latest/quickstart.html
+++ b/latest/quickstart.html
@@ -1,19 +1,21 @@
Getting Started — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@
                    -

                    Getting Started

                    +

                    Getting Started

                    So you have installed Kernel Tuner! That’s great! But now you’d like to get started tuning some GPU code.

                    Let’s say we have a simple CUDA kernel stored in a file called vector_add_kernel.cu:

                    __global__ void vector_add(float * c, float * a, float * b, int n) {
                    diff --git a/latest/search.html b/latest/search.html
                    index d3cb971fc..8b964308a 100644
                    --- a/latest/search.html
                    +++ b/latest/search.html
                    @@ -1,19 +1,21 @@
                     
Search — Kernel Tuner 1.0 documentation
                    diff --git a/latest/searchindex.js b/latest/searchindex.js
                    index 793ac2038..78e74701f 100644
                    --- a/latest/searchindex.js
                    +++ b/latest/searchindex.js
                    @@ -1 +1 @@
"known": [7, 16], "m": [7, 8, 9, 10, 12], "disabl": 7, "sync": [7, 21], "dry": 7, "node": [7, 19], "noxset": 7, "toml": 7, "venvbackend": 7, "anaconda": 7, "venv": 7, "Be": [7, 8, 9, 10], "envdir": 7, "diskquota": 7, "isol": [7, 22], "coverag": 7, "gigabyt": 7, "tight": 7, "diskspac": 7, "remov": [7, 19], "ran": 7, "command": [7, 15], "involv": 7, "especi": 7, "break": [7, 22], "hold": [7, 8, 9, 16, 20, 21, 23], "invok": 7, "tab": 7, "studio": 7, "integr": [7, 22], "commit": 7, "brows": 7, "pandoc": 7, "mac": 7, "onlin": 7, "built": [7, 18, 19, 21, 23], "action": 7, "master": 7, "latest": [7, 15], "stabl": 7, "publish": [7, 14], "autom": 7, "whole": [8, 9, 10, 16, 19], "model": [8, 9, 10, 14], "physic": 8, "numer": [8, 9, 10], "introduc": [8, 9, 10, 16, 18], "redistribut": [8, 9, 10], "region": [8, 9, 10], "concentr": [8, 9, 10], "bulk": [8, 9, 10], "motion": [8, 9, 10], "concept": [8, 9, 10], "wide": [8, 9, 10, 15, 16], "chemistri": [8, 9, 10], "biologi": [8, 9, 10], "suppos": [8, 9, 10], "metal": [8, 9, 10], "sheet": [8, 9, 10], "temperatur": [8, 9, 10, 18, 19, 24], "equal": [8, 9, 10, 16, 23], "degre": [8, 9, 10], "everywher": [8, 9, 10], "heat": [8, 9, 10], "thousand": [8, 9, 10], "instant": [8, 9, 10, 12], "hotspot": [8, 9, 10], "cooler": [8, 9, 10], "area": [8, 9, 10, 16], "melt": [8, 9, 10], "loss": [8, 9, 10], "radiat": [8, 9, 10], "frac": [8, 9, 10], "d": [8, 9, 10, 12, 19, 20], "spatial": [8, 9, 10], "descret": [8, 9, 10], "2d": [8, 9, 10, 11], "quantiti": [8, 9, 10, 17, 18, 23], "nx": [8, 9, 10, 12], "equi": [8, 9, 10], "distant": [8, 9, 10], "direct": [8, 9, 10, 13, 16, 17, 23], "ny": [8, 9, 10, 12], "distanc": [8, 9, 10, 19], "delta": [8, 9, 10], "central": [8, 9, 10], "approxim": [8, 9, 10], "x_i": [8, 9, 10, 12], "x_": [8, 9, 10], "approx": [8, 9, 10], "u_": [8, 9, 10], "2u_": [8, 9, 10], "y_": [8, 9, 10], "estim": [8, 9, 10], "next": [8, 9, 10, 16, 21], "formula": [8, 9, 10], "4u_": [8, 9, 10], "simplic": [8, 9, 10, 12], "assumpt": [8, 9, 10], "boundari": [8, 9, 10], "condit": [8, 9, 10, 16], "dt": [8, 9, 10], "225": [8, 9, 10], "initi": [8, 9, 10, 21], "hot": [8, 9, 10], "plot": [8, 9, 10], "color": [8, 9, 10], "matplotlib": [8, 9, 10, 15], "pyplot": [8, 9, 10], "inlin": [8, 9, 10], "get_initial_condit": [8, 9, 10], "ones": [8, 9, 10, 24], "randint": [8, 9, 10], "1000": [8, 9, 10, 12], "2000": [8, 9, 10], "fig": [8, 9, 10], "ax1": [8, 9, 10], "ax2": [8, 9, 10], "subplot": [8, 9, 10], "imshow": [8, 9, 10], "lt": [8, 9, 10], "axesimag": [8, 9, 10], "0x2aaab952f240": 8, "gt": [8, 9, 10], "later": [8, 9, 10, 12, 23], "field_copi": [8, 9], "4164": 8, "018869400024": 8, "0x2aab1c98b3c8": 8, "worri": [8, 10], "terminologi": [8, 10], "text": [8, 10, 16], "5": [8, 9, 10, 12, 19], "225f": [8, 9, 10], "diffuse_kernel": [8, 9, 10], "u_new": [8, 9, 10], "0f": [8, 9, 10], "togeth": [8, 9, 10, 15, 23], "impact": [8, 9, 10, 13], "fix": [8, 9, 10, 19, 23], "unrol": [8, 9, 10, 11, 16, 24], "loop": [8, 9, 10, 11, 16, 24], "drv": 8, "sourcemodul": [8, 10, 12], "init": 8, "make_context": 8, "devprop": 8, "k": [8, 9, 10, 12, 14, 16, 20], "get_devic": 8, "get_attribut": 8, "cc": 8, "compute_capability_major": 8, "compute_capability_minor": 8, "u_old": [8, 10], "mem_alloc": 8, "nbyte": 8, "block_size_str": [8, 10], "arch": 8, "sm_": 8, "get_funct": [8, 10, 12], "boilerpl": [8, 9, 10], "moment": [8, 9, 10, 23], "serv": [8, 9, 10, 17, 19], "guess": [8, 9, 10], "pair": [8, 9, 10], "500": [8, 9, 10], "time_sinc": 8, "zeros_lik": [8, 12, 14, 16, 20, 22], "set_titl": [8, 9, 10], "53": [8, 9, 10], 
"423038482666016": 8, "0x2aaabbdcb2e8": 8, "faster": [8, 9, 10, 16], "cleanup": 8, "pop": 8, "think": [8, 9, 10], "messi": [8, 9, 10], "got": [8, 9, 10], "cleaner": [8, 9, 10], "plai": [8, 9, 10], "difficult": [8, 9, 10, 21, 22], "rather": [8, 9, 10, 23], "underutil": [8, 9, 10], "purpos": [8, 9, 10, 13, 16, 23, 24], "feel": [8, 9, 10], "48": [8, 9, 10], "care": [8, 9, 10], "appropi": [8, 9, 10], "fly": [8, 9, 10], "12": [8, 9, 10], "13": [8, 9, 10], "geforc": [8, 9, 10, 12], "gtx": [8, 9, 10, 12], "titan": [8, 9, 10], "22305920124": 8, "779033613205": 8, "824838399887": 8, "900499212742": 8, "999763202667": 8, "727967989445": 8, "752479994297": 8, "797900807858": 8, "876627194881": 8, "93347837925": 8, "766662418842": 8, "803033602238": 8, "853574407101": 8, "971545600891": 8, "763775992393": 8, "791257584095": 8, "848044800758": 8, "922745585442": 8, "792595207691": 8, "822137594223": 8, "893279993534": 8, "millisecond": [8, 9, 10], "matter": [8, 9, 10, 13], "analyz": [8, 9, 10], "seem": [8, 9, 10], "vari": [8, 9, 10, 12, 16, 17], "addtion": [8, 9, 10], "among": [8, 9, 10, 14, 19], "128x32": [8, 9, 10], "likewis": [8, 9, 10], "becom": [8, 9, 10, 18, 19], "affect": [8, 9, 10, 16], "within": [8, 9, 10, 12, 16, 19, 23], "exchang": [8, 9, 10], "fact": [8, 9, 10, 13], "commun": [8, 9, 10], "idea": [8, 9, 10, 13, 16, 24], "l2": [8, 9, 10], "closer": [8, 9, 10], "multiprocessor": [8, 9, 10], "l1": [8, 9, 10], "fine": [8, 9, 10], "grain": [8, 9, 10], "manag": [8, 9, 10, 16, 18], "cost": [8, 9, 10, 19], "overhead": [8, 9, 10, 16], "degrad": [8, 9, 10], "intermedi": [8, 9, 10], "mind": [8, 9, 10], "14": [8, 9, 10], "tx": [8, 9, 10, 16], "ty": [8, 9, 10, 16], "bx": [8, 9, 10, 12], "__shared__": [8, 10, 16], "sh_u": [8, 9, 10], "pragma": [8, 9, 10, 16], "__syncthread": [8, 9, 10, 16], "75041918755": 8, "18713598251": 8, "09015038013": 8, "06844799519": 8, "09730558395": 8, "14420480728": 8, "05957758427": 8, "07508480549": 8, "0731967926": 8, "14729599953": 8, "08389122486": 8, "10700161457": 8, "10125439167": 8, "31661438942": 8, "0629119873": 8, "04807043076": 8, "054880023": 8, "12033278942": 8, "06672639847": 8, "05816960335": 8, "12000002861": 8, "merg": [8, 9, 10, 16], "half": [8, 9, 10], "doubl": [8, 9, 10, 21, 22], "cover": [8, 9, 10, 19], "beyond": [8, 9, 10, 23], "reduc": [8, 9, 10, 16], "condens": [8, 9, 10], "keep": [8, 9, 10, 16, 21], "importantli": [8, 9, 10], "worst": [8, 9, 10], "15": [8, 9, 10, 22], "tj": [8, 9, 10], "ti": [8, 9, 10, 12], "somehow": [8, 9, 10], "insid": [8, 9, 10, 13, 16, 22, 23], "round": [8, 9, 10, 23], "arithmet": [8, 9, 10, 23], "evalu": [8, 9, 10, 16, 19, 23], "759308815": 8, "29789438248": 8, "06983039379": 8, "2634239912": 8, "997139203548": 8, "843692803383": 8, "05549435616": 8, "862348806858": 8, "750636804104": 8, "19084160328": 8, "876377594471": 8, "714169609547": 8, "875001597404": 8, "691116797924": 8, "575859189034": 8, "759679996967": 8, "622867202759": 8, "650336003304": 8, "09794559479": 8, "826515209675": 8, "692665600777": 8, "78363519907": 8, "646092808247": 8, "554745602608": 8, "716115188599": 8, "581280004978": 8, "662566399574": 8, "07386879921": 8, "833420813084": 8, "705055999756": 8, "840755212307": 8, "652575993538": 8, "569388794899": 8, "689356791973": 8, "597267186642": 8, "675232005119": 8, "10033922195": 8, "860332798958": 8, "731891202927": 8, "867276787758": 8, "68781440258": 8, "595276796818": 8, "735436797142": 8, "60216319561": 8, "852166390419": 8, "15089921951": 8, "852575981617": 8, "705932807922": 8, "888671982288": 8, 
"673248004913": 8, "563417613506": 8, "761139214039": 8, "621254396439": 8, "676595199108": 8, "06709122658": 8, "804953610897": 8, "685670387745": 8, "801798415184": 8, "632006394863": 8, "542387211323": 8, "722668802738": 8, "578745603561": 8, "618598401546": 8, "08220798969": 8, "821881604195": 8, "687955200672": 8, "77759360075": 8, "618003201485": 8, "539891195297": 8, "705900788307": 8, "568556785583": 8, "624492788315": 8, "0799423933": 8, "832300806046": 8, "70140799284": 8, "835481595993": 8, "638348805904": 8, "550105595589": 8, "667251205444": 8, "576044797897": 8, "732409596443": 8, "15916161537": 8, "869497597218": 8, "733248019218": 8, "890803205967": 8, "677363204956": 8, "577215993404": 8, "730982398987": 8, "58035838604": 8, "10066559315": 8, "837804794312": 8, "691385602951": 8, "851040017605": 8, "666656005383": 8, "560505592823": 8, "771103990078": 8, "626163220406": 8, "694451200962": 8, "11514236927": 8, "837299215794": 8, "703302407265": 8, "806828796864": 8, "648620784283": 8, "562521612644": 8, "760915207863": 8, "605760002136": 8, "690009605885": 8, "10740480423": 8, "841631996632": 8, "700883197784": 8, "838195204735": 8, "649779188633": 8, "56585599184": 8, "7168192029": 8, "59088640213": 8, "69627519846": 8, "3269824028": 8, "02665598392": 8, "840908801556": 8, "03752319813": 8, "788345599174": 8, "662041604519": 8, "85437438488": 8, "680422389507": 8, "0759360075": 8, "801996803284": 8, "666003203392": 8, "808000004292": 8, "643359994888": 8, "544691193104": 8, "741964805126": 8, "60942081213": 8, "681350398064": 8, "05262081623": 8, "792108798027": 8, "66344319582": 8, "768064010143": 8, "625260794163": 8, "540352010727": 8, "721862399578": 8, "579411196709": 8, "626976013184": 8, "06332798004": 8, "808211183548": 8, "679372787476": 8, "803718411922": 8, "627136015892": 8, "538227200508": 8, "682188808918": 8, "573836791515": 8, "725548803806": 8, "13023357391": 8, "843411195278": 8, "713843202591": 8, "85886080265": 8, "657920002937": 8, "565254402161": 8, "697094392776": 8, "579904007912": 8, "07484800816": 8, "801119995117": 8, "667347204685": 8, "799059200287": 8, "643820810318": 8, "542937588692": 8, "740518403053": 8, "615148806572": 8, "731334400177": 8, "07002239227": 8, "805299210548": 8, "675923216343": 8, "782060790062": 8, "631142401695": 8, "540383994579": 8, "723999989033": 8, "578681600094": 8, "726335990429": 8, "13297917843": 8, "844428789616": 8, "710278391838": 8, "835494399071": 8, "637958395481": 8, "567417597771": 8, "699366402626": 8, "588492810726": 8, "tri": [8, 9, 10, 19], "grow": [8, 9, 10], "quickli": [8, 9, 10], "went": [8, 9, 10, 12], "72": [8, 9, 10], "26": [8, 9, 10], "32x2": [8, 9, 10], "64x4": [8, 9, 10], "four": [8, 9, 10], "best_tim": [8, 9], "min": [8, 9], "05": [8, 9], "join": [8, 9], "nice": [8, 9], "stdout": [8, 9], "why": [8, 9, 13, 17], "easili": [8, 9, 18], "easi": [8, 9, 17, 18, 23], "csv": [8, 9, 11], "analysi": [8, 9, 14], "panda": [8, 9, 11, 15], "18": [8, 9, 10], "fp": [8, 9], "datafram": [8, 9], "df": [8, 9], "to_csv": [8, 9], "0x2aab1de088d0": 9, "01": 9, "sy": 9, "140": 9, "wall": 9, "98": 9, "__kernel": 9, "get_group_id": 9, "get_local_id": 9, "cl": 9, "ctx": 9, "create_some_context": 9, "mf": 9, "mem_flag": 9, "a_h": 9, "a_d": 9, "read_writ": 9, "copy_host_ptr": 9, "hostbuf": 9, "b_d": 9, "kernel_src": 9, "prg": 9, "queue": 9, "commandqueu": 9, "run_gpu": 9, "444": 9, "154": 9, "598": 9, "985": 9, "enqueue_copi": 9, "1748096": 9, "7284544": 9, "7707904": 9, "8573184": 9, "8380288": 9, "686528": 9, "69648": 
9, "7461632": 9, "818304": 9, "771072": 9, "7190464": 9, "7522432": 9, "7982208": 9, "9624512": 9, "7214464": 9, "7453312": 9, "8028416": 9, "8922624": 9, "747328": 9, "7860736": 9, "8637184": 9, "__local": 9, "barrier": 9, "clk_local_mem_f": 9, "8449472": 9, "1912576": 9, "1035136": 9, "0927808": 9, "1140736": 9, "1790336": 9, "0808192": 9, "0809792": 9, "0836928": 9, "1545856": 9, "1249984": 9, "1264": 9, "1230336": 9, "4015104": 9, "0873216": 9, "0626496": 9, "0692224": 9, "140192": 9, "0801344": 9, "0688128": 9, "1428928": 9, "8844544": 9, "3245952": 9, "0911808": 9, "3039616": 9, "0079296": 9, "84848": 9, "0708288": 9, "857728": 9, "7561792": 9, "231072": 9, "8774336": 9, "7087296": 9, "8772672": 9, "6911872": 9, "5715968": 9, "7584896": 9, "6292032": 9, "6498688": 9, "1145664": 9, "8252928": 9, "6757568": 9, "7881152": 9, "6237696": 9, "544224": 9, "6951168": 9, "5648128": 9, "6452736": 9, "1065792": 9, "8313792": 9, "6905984": 9, "8302656": 9, "6367488": 9, "5478592": 9, "6660672": 9, "5719744": 9, "6551744": 9, "1384064": 9, "8531072": 9, "7078976": 9, "8516672": 9, "6677696": 9, "5685632": 9, "7074048": 9, "5753152": 9, "8228864": 9, "2124736": 9, "8633344": 9, "6921216": 9, "8896384": 9, "6659904": 9, "5582144": 9, "7522624": 9, "6081536": 9, "6664448": 9, "1095936": 9, "8063424": 9, "6717888": 9, "7982848": 9, "6263552": 9, "5289728": 9, "7008832": 9, "567456": 9, "5968704": 9, "1018432": 9, "8117248": 9, "6724736": 9, "7728576": 9, "6038336": 9, "5172352": 9, "6796352": 9, "5470016": 9, "5968448": 9, "1107712": 9, "8237248": 9, "6810944": 9, "821952": 9, "620352": 9, "5230208": 9, "6415552": 9, "5476864": 9, "7168192": 9, "1942016": 9, "8626304": 9, "7099712": 9, "9123328": 9, "6608448": 9, "5631168": 9, "7113024": 9, "556576": 9, "1583104": 9, "8384832": 9, "67856": 9, "845856": 9, "6581248": 9, "54944": 9, "7520064": 9, "6076224": 9, "6842112": 9, "1547072": 9, "8422016": 9, "6895552": 9, "8037312": 9, "6387072": 9, "5383296": 9, "7326656": 9, "5863488": 9, "6813376": 9, "1493952": 9, "8444928": 9, "6929216": 9, "832768": 9, "6389312": 9, "5412672": 9, "698336": 9, "5717568": 9, "676096": 9, "4303104": 9, "0341696": 9, "8365184": 9, "0398656": 9, "7786496": 9, "648928": 9, "8479232": 9, "6508544": 9, "1219392": 9, "7994048": 9, "6492288": 9, "8068416": 9, "6343168": 9, "5235328": 9, "7268928": 9, "5898432": 9, "6633536": 9, "0849664": 9, "7869632": 9, "6458624": 9, "7611968": 9, "613088": 9, "50912": 9, "6972928": 9, "5620608": 9, "601856": 9, "095232": 9, "7967488": 9, "6601472": 9, "7952896": 9, "6047296": 9, "5108224": 9, "6607744": 9, "5492416": 9, "7091136": 9, "171552": 9, "8473408": 9, "6962112": 9, "8663936": 9, "6466816": 9, "5475584": 9, "6754048": 9, "5591744": 9, "108896": 9, "7907264": 9, "6459328": 9, "7965888": 9, "6250816": 9, "5188416": 9, "721408": 9, "5920832": 9, "7068608": 9, "0909248": 9, "7930752": 9, "6524544": 9, "7745216": 9, "6146176": 9, "5116928": 9, "6975872": 9, "5548416": 9, "7075136": 9, "174624": 9, "8384512": 9, "69104": 9, "8335488": 9, "6264192": 9, "5445248": 9, "6719104": 9, "5592064": 9, "19": [9, 10], "solv": 10, "0x7f888f8cd7b8": 10, "4152": 10, "086019515991": 10, "0x7f8865b51f28": 10, "gpuarrai": [10, 12], "tool": [10, 12, 14], "autoinit": [10, 12], "to_gpu": [10, 12], "mod": [10, 12], "t0": [10, 12], "ona": 10, "33": 10, "46109390258789": 10, "0x7f8858b873c8": 10, "1080": [10, 12], "916985595226": 10, "489004802704": 10, "500524806976": 10, "513356792927": 10, "545715200901": 10, "486515200138": 10, "449055999517": 10, 
"44974719882": 10, "457427197695": 10, "492915201187": 10, "464863997698": 10, "466118401289": 10, "475264000893": 10, "513632011414": 10, "458412796259": 10, "457715201378": 10, "461017608643": 10, "475987195969": 10, "460032004118": 10, "457779198885": 10, "462649595737": 10, "kernel_string_shar": 10, "22673916817": 10, "826361596584": 10, "793516802788": 10, "782112002373": 10, "776639997959": 10, "795135998726": 10, "722777605057": 10, "762777590752": 10, "75422719717": 10, "804876792431": 10, "778656005859": 10, "769734406471": 10, "782495999336": 10, "932281601429": 10, "734028804302": 10, "721625590324": 10, "736511993408": 10, "800019192696": 10, "724966406822": 10, "722969603539": 10, "759430396557": 10, "kernel_string_til": 10, "22200961113": 10, "91601279974": 10, "752838408947": 10, "873651194572": 10, "69833599329": 10, "586931192875": 10, "516473591328": 10, "411392003298": 10, "384262400866": 10, "82159358263": 10, "632607996464": 10, "506457602978": 10, "618758392334": 10, "500288009644": 10, "429862397909": 10, "44995200038": 10, "366150397062": 10, "342201602459": 10, "793542397022": 10, "58026239872": 10, "494163197279": 10, "546316814423": 10, "467059195042": 10, "404249596596": 10, "440895992517": 10, "341376006603": 10, "339692795277": 10, "783923208714": 10, "597920000553": 10, "50277120471": 10, "615475213528": 10, "470937597752": 10, "418393599987": 10, "443519997597": 10, "343961596489": 10, "342540800571": 10, "780352008343": 10, "611705589294": 10, "515667212009": 10, "622534394264": 10, "502195191383": 10, "437388807535": 10, "45568639636": 10, "359289598465": 10, "426995199919": 10, "788947200775": 10, "616556799412": 10, "496121603251": 10, "629164803028": 10, "474841600657": 10, "407667201757": 10, "47406719923": 10, "371507203579": 10, "352531200647": 10, "72023679018": 10, "574816000462": 10, "481817597151": 10, "580928003788": 10, "455724793673": 10, "394975996017": 10, "464659202099": 10, "357107198238": 10, "324083191156": 10, "759910392761": 10, "569177603722": 10, "481279999018": 10, "528115200996": 10, "441734397411": 10, "393126398325": 10, "455404800177": 10, "350457596779": 10, "322547197342": 10, "754201591015": 10, "579827189445": 10, "491852802038": 10, "582751989365": 10, "451283198595": 10, "391807991266": 10, "456275194883": 10, "356716805696": 10, "362937599421": 10, "809894394875": 10, "60433280468": 10, "507142400742": 10, "655827200413": 10, "474092799425": 10, "408166396618": 10, "480531209707": 10, "346707201004": 10, "780134403706": 10, "601049602032": 10, "493900799751": 10, "620384001732": 10, "494553589821": 10, "425414395332": 10, "467033600807": 10, "375468802452": 10, "346079999208": 10, "771052801609": 10, "593977594376": 10, "49723520875": 10, "583270406723": 10, "478079998493": 10, "416320002079": 10, "443942397833": 10, "359744000435": 10, "343545603752": 10, "780960011482": 10, "598758399487": 10, "498617601395": 10, "57678719759": 10, "46561280489": 10, "41324160099": 10, "431225597858": 10, "351263999939": 10, "34440960288": 10, "933260798454": 10, "715257608891": 10, "586604809761": 10, "711615991592": 10, "558771193027": 10, "466284793615": 10, "44043520093": 10, "361823999882": 10, "731839990616": 10, "57044479847": 10, "470220798254": 10, "608800005913": 10, "472665601969": 10, "416352003813": 10, "481376004219": 10, "380812799931": 10, "351923197508": 10, "719257593155": 10, "55171200037": 10, "466758400202": 10, "568435204029": 10, "459654402733": 10, "394380801916": 10, "463052803278": 10, "36409599781": 10, 
"328998398781": 10, "73579518795": 10, "564575994015": 10, "472236800194": 10, "549024009705": 10, "438406395912": 10, "389945602417": 10, "455193603039": 10, "364051198959": 10, "375519996881": 10, "798195195198": 10, "588998401165": 10, "49552000761": 10, "595462405682": 10, "460972803831": 10, "400672000647": 10, "465132802725": 10, "364627194405": 10, "729363203049": 10, "558815991879": 10, "466655993462": 10, "600819194317": 10, "460281592607": 10, "404908800125": 10, "478739196062": 10, "386668801308": 10, "385510402918": 10, "720915210247": 10, "550668799877": 10, "466937589645": 10, "564921605587": 10, "447974395752": 10, "394271999598": 10, "46233600378": 10, "365190398693": 10, "387827193737": 10, "762003195286": 10, "579007995129": 10, "486649608612": 10, "557331204414": 10, "443033593893": 10, "396070402861": 10, "457075202465": 10, "369555193186": 10, "wish": 10, "modifi": [10, 18], "tile_size_j": 10, "fixed_param": [10, 12], "ceil": [10, 12], "zip": [10, 12], "transfer": [10, 11, 13], "20": [10, 19], "21": 10, "618": 10, "2231903076172": 10, "0x7f887c3d2358": 10, "incorpor": 10, "ifndef": 10, "kerenel": 10, "psedo": 10, "endif": 10, "bypass": 10, "usecas": 11, "test_vector_add": 11, "test_vector_add_parameter": 11, "highlight": 11, "contact": 11, "illustr": 11, "openacc": 11, "dimension": [11, 12, 23], "clean": [11, 16], "center": [11, 12], "lock": [11, 18], "overlap": [11, 13], "shuffl": 11, "pipelin": 11, "consist": [11, 16, 23], "scipi": 11, "algorithm": [11, 14, 19, 23], "cub": 11, "gaussian": 12, "delv": 12, "hand": [12, 16], "sum_": 12, "exp": 12, "beta": [12, 19], "sqrt": 12, "y_i": 12, "z_i": 12, "vector": [12, 13, 20], "coordin": 12, "linalg": 12, "la": 12, "compute_grid": 12, "xgrid": 12, "ygrid": 12, "zgrid": 12, "x0": 12, "y0": 12, "z0": 12, "themselv": 12, "meshgrid": 12, "send": 12, "interv": 12, "256": [12, 14, 20], "suffici": [12, 17], "100": [12, 19, 23], "randomli": [12, 19], "distribut": [12, 16], "linspac": 12, "cpu_grid": 12, "npt": 12, "rand": 12, "xyz": [12, 23], "52320": 12, "160627": 12, "might": [12, 17], "nz": 12, "bz": 12, "kernel_cod": 12, "math": 12, "__host__": 12, "__device__": [12, 22], "b": [12, 14, 16, 19, 20, 22], "addgrid": 12, "xvect": 12, "yvect": 12, "zvect": 12, "dx": 12, "dy": 12, "dz": 12, "assign": 12, "explor": 12, "middl": 12, "henc": [12, 21], "mention": 12, "56833920479": 12, "80796158314": 12, "940044796467": 12, "855628800392": 12, "855359995365": 12, "16174077988": 12, "11877760887": 12, "01592960358": 12, "849273598194": 12, "849235200882": 12, "19029750824": 12, "16199679375": 12, "40401918888": 12, "39618558884": 12, "39508478642": 12, "31647996902": 12, "31470079422": 12, "50787198544": 12, "53760001659": 12, "56709756851": 12, "34500494003": 12, "25130877495": 12, "50662400723": 12, "55267841816": 12, "17987194061": 12, "12309756279": 12, "01125121117": 12, "849631989002": 12, "853708791733": 12, "17051515579": 12, "15584001541": 12, "40074241161": 12, "39547519684": 12, "39331197739": 12, "30295038223": 12, "28725762367": 12, "39589118958": 12, "38867840767": 12, "37724158764": 12, "34344320297": 12, "26213116646": 12, "38793599606": 12, "3775359869": 12, "74003200531": 12, "13276162148": 12, "37233917713": 12, "18835201263": 12, "15777277946": 12, "40247042179": 12, "39366400242": 12, "39439997673": 12, "23719043732": 12, "28542718887": 12, "39207677841": 12, "38956804276": 12, "3778496027": 12, "29814395905": 12, "26398081779": 12, "38625922203": 12, "3754431963": 12, "72981758118": 12, "12483196259": 12, 
"37322881222": 12, "61618566513": 12, "2194111824": 12, "17600002289": 12, "27082881927": 12, "38787200451": 12, "3835711956": 12, "37543039322": 12, "30227203369": 12, "23127679825": 12, "38627202511": 12, "37677440643": 12, "64358406067": 12, "12255358696": 12, "37474560738": 12, "61655673981": 12, "19179515839": 12, "99912958145": 12, "213971138": 12, "16430072784": 12, "38772480488": 12, "3735104084": 12, "54432649612": 12, "05524477959": 12, "36935677528": 12, "42449922562": 12, "10455036163": 12, "67516155243": 12, "programmat": 12, "30": 12, "minimum": 12, "84": 12, "suit": [12, 23], "grid_dim": 12, "associ": 12, "substitut": 12, "ourselv": 12, "extract": 12, "manual": [12, 15], "exlicitli": 12, "accur": [12, 18], "xgpu": 12, "ygpu": 12, "zgpu": 12, "grid_gpu": 12, "80": 12, "133200": 12, "lower": [12, 18, 19], "roughli": [12, 16], "40000": 12, "across": [13, 16], "qualiti": 13, "itself": [13, 14, 23], "precis": 13, "plain": 13, "omp_get_wtim": 13, "openmp": 13, "convolution_stream": 13, "complex": [13, 16], "behind": 13, "spread": 13, "back": [13, 23], "split": 13, "chunk": 13, "slightli": [13, 16, 22], "account": [13, 16], "border": [13, 23], "latter": 13, "cudastreamwaitev": 13, "num_stream": 13, "clarifi": 13, "fit": [13, 19], "choic": [13, 15], "grid_size_x": 13, "grid_size_i": 13, "cudamemcpytosymbol": 13, "upload": 13, "yourself": [13, 23], "spent": [13, 23], "relat": [14, 17, 24], "famili": 14, "launcher": 14, "kt": [14, 21], "easiest": 14, "toolkit": [14, 15], "intend": 14, "Or": [14, 15], "vector_add": [14, 19, 20, 22], "10000000": 14, "512": [14, 20], "research": 14, "cite": 14, "paper": 14, "significantli": [14, 16, 18], "articl": [14, 20], "author": 14, "ben": 14, "van": 14, "werkhoven": 14, "titl": 14, "auto": [14, 16, 18, 19, 22, 23, 24], "journal": 14, "year": 14, "2019": 14, "volum": 14, "90": 14, "347": 14, "358": 14, "url": 14, "www": 14, "sciencedirect": 14, "scienc": 14, "pii": 14, "s0167739x18313359": 14, "doi": 14, "1016": 14, "2018": 14, "08": 14, "004": 14, "referenc": 14, "bayesian": [14, 19, 23], "willemsen2021bayesian": 14, "willemsen": [14, 19], "flori": 14, "jan": 14, "nieuwpoort": 14, "rob": 14, "workshop": 14, "pmb": 14, "supercomput": 14, "sc21": 14, "2021": 14, "arxiv": 14, "ab": 14, "2111": 14, "14991": 14, "difficulti": 14, "schoonhoven2022benchmark": 14, "schoonhoven": 14, "richard": 14, "batenburg": 14, "joost": 14, "ieee": 14, "transact": 14, "evolutionari": 14, "2022": 14, "consumpt": [14, 16, 18], "schoonhoven2022go": 14, "veenboer": 14, "bram": 14, "green": 14, "effici": [14, 16, 18], "steer": 14, "sc22": 14, "2211": 14, "07260": 14, "comprehens": 15, "recommend": [15, 21], "download": 15, "repo": 15, "continuum": 15, "io": 15, "miniconda3": 15, "x86_64": 15, "sh": 15, "newer": [15, 18], "nativ": 15, "prefix": 15, "home": 15, "pythonpath": 15, "bind": [15, 18], "older": 15, "troubl": 15, "retri": 15, "wiki": 15, "tiker": 15, "net": 15, "amd": [15, 18], "app": 15, "sdk": 15, "intel": 15, "appl": 15, "beignet": 15, "stack": 15, "altern": [15, 23], "navig": 15, "benvanwerkhoven": 15, "differenti": [15, 19, 23], "chanc": [15, 19, 22], "algebra": 16, "frequent": 16, "programm": [16, 18], "row": 16, "column": 16, "squar": 16, "matric": 16, "matmul_na": 16, "width": 16, "matmul_kernel": 16, "height": 16, "Of": 16, "solut": [16, 18], "realiti": 16, "contant": 16, "denot": [16, 20, 23], "sensibl": 16, "pick": 16, "word": 16, "warpsiz": 16, "namelijk": 16, "stand": 16, "briefli": 16, "figur": 16, "fifth": 16, "fourth": 16, "dramat": 16, "profil": 16, 
"pretti": 16, "opportun": 16, "realiz": 16, "collabor": 16, "bandwidth": 16, "techniqu": 16, "submatric": 16, "proce": 16, "matmul_shar": 16, "sa": 16, "sb": 16, "kb": 16, "outer": 16, "inner": 16, "race": 16, "drastic": 16, "due": [16, 22, 23], "fortun": 16, "benefit": 16, "redund": 16, "distinct": 16, "1xn": 16, "usag": [16, 18], "occup": 16, "goe": 16, "down": 16, "matmul": 16, "newli": 16, "coupl": 16, "respect": [16, 18], "independ": 16, "yield": 16, "discontinu": 16, "room": 16, "impos": 16, "report": [17, 18, 23, 24], "possibli": [17, 23], "_flop": 17, "total_flop": 17, "ps_energi": [17, 18, 24], "occur": [17, 23], "exhaust": 17, "brute": [17, 19, 20], "forc": [17, 19, 20, 22], "maxim": [17, 23], "boolean": [17, 18, 23], "facilit": 18, "layer": 18, "act": 18, "hook": 18, "pattern": 18, "subscrib": 18, "benchmarkobserv": 18, "overwritten": [18, 23], "extend": 18, "mandatori": 18, "get_result": 18, "aggreg": 18, "after_finish": 18, "after_start": 18, "before_start": 18, "register_configur": 18, "register_devic": 18, "variou": [18, 20], "registerobserv": 18, "track": 18, "num_reg": 18, "current_modul": 18, "powersensor2": 18, "pcie": 18, "intercept": 18, "sensor": 18, "transmit": 18, "usb": 18, "connect": 18, "advantag": 18, "instantan": 18, "frequenc": 18, "khz": 18, "pybind11": 18, "powersensor": [18, 24], "ps_power": [18, 24], "joul": [18, 24], "watt": [18, 24], "ttyacm0": 18, "voltag": 18, "thin": 18, "wrapper": [18, 22], "intricaci": 18, "friendli": 18, "mode": 18, "repeatedli": 18, "downsid": 18, "approach": 18, "save_al": 18, "nvidia_smi_fallback": 18, "use_locked_clock": 18, "continous_dur": 18, "monitor": 18, "clock": [18, 24], "power_read": [18, 24], "nvml_power": [18, 24], "nvml_energi": [18, 24], "core_freq": [18, 24], "mem_freq": [18, 24], "gr_voltag": 18, "ordin": 18, "identifi": 18, "smi": 18, "root": 18, "opt": 18, "amper": 18, "continuous_dur": 18, "common": [18, 22], "cap": 18, "popular": 18, "nvml_gr_clock": [18, 24], "nvml_mem_clock": [18, 24], "nvml_pwr_limit": [18, 24], "graphic": [18, 24], "jetson": 18, "rapl": 18, "xilinx": 18, "pmt": 18, "astron": 18, "nl": 18, "rd": 18, "meter": 18, "arduino": 18, "_energi": 18, "_power": 18, "acceler": 19, "prohibit": 19, "slow": 19, "wast": 19, "basin": [19, 23], "hop": [19, 23], "dual": [19, 23], "anneal": [19, 23], "evolut": [19, 23], "firefli": [19, 23], "genet": [19, 23], "greedi": [19, 23], "multi": [19, 23], "particl": [19, 23], "swarm": [19, 23], "mechan": 19, "overrid": 19, "time_limit": [19, 23], "uniqu": [19, 23], "count": 19, "searchspac": 19, "runner": 19, "nelder": 19, "mead": 19, "powel": 19, "cg": 19, "bfg": 19, "l": 19, "tnc": 19, "cobyla": 19, "slsqp": 19, "reject": 19, "thesi": 19, "generate_normalized_param_dict": 19, "denorm": 19, "normalize_parameter_spac": 19, "param_spac": 19, "prune_parameter_spac": 19, "normalize_dict": 19, "prune": 19, "hyperparamet": 19, "popul": 19, "best1bin": 19, "best1exp": 19, "rand1exp": 19, "randtobest1exp": 19, "best2exp": 19, "rand2exp": 19, "randtobest1bin": 19, "best2bin": 19, "rand2bin": 19, "rand1bin": 19, "popsiz": 19, "maxit": 19, "constr": 19, "compute_intens": 19, "fun": 19, "intens": 19, "distance_to": 19, "euclidian": 19, "move_toward": 19, "alpha": 19, "toward": 19, "b0": 19, "attract": 19, "gamma": 19, "light": 19, "absorpt": 19, "coeffici": 19, "disruptive_uniform_crossov": 19, "dna1": 19, "dna2": 19, "disrupt": 19, "uniform": 19, "crossov": 19, "uniformli": 19, "gene": 19, "children": 19, "guarante": 19, "parent": 19, "mutat": 19, "dna": 19, "mutation_ch": 
19, "single_point_crossov": 19, "single_point": 19, "two_point": 19, "disruptive_uniform": 19, "two_point_crossov": 19, "uniform_crossov": 19, "weighted_choic": 19, "probabl": [19, 23], "il": 19, "neighbor": 19, "ham": 19, "adjac": 19, "greedy": 19, "soon": 19, "no_improv": 19, "exce": 19, "50": 19, "random_walk": 19, "hillclimb": 19, "travers": 19, "inertia": 19, "c1": 19, "cognit": 19, "c2": 19, "social": 19, "fraction": 19, "acceptance_prob": 19, "old_cost": 19, "new_cost": 19, "modif": [19, 21], "po": 19, "t_min": 19, "001": 19, "995": 19, "vector_add_kernel": 20, "wise": 20, "1000000": [20, 22], "recogn": 20, "alright": 20, "portabl": 21, "stick": 21, "pointer": 21, "primit": 21, "lead": 21, "ineffici": 21, "situat": 21, "scientif": 21, "sens": 21, "experiment": 21, "pack": 21, "consult": 21, "create_receive_spec_struct": 21, "0l": 21, "pad": 21, "8byte": 21, "packstr": 21, "iiiiiiiiiiippi": 21, "fffi": 21, "nsampl": 21, "nsamplesiq": 21, "nslowtimesampl": 21, "nchannel": 21, "ntx": 21, "nrepeat": 21, "nfasttimesampl": 21, "rfsize": 21, "mnrow": 21, "mnrowsiq": 21, "nactivechannel": 21, "isiq": 21, "fsiq": 21, "fc": 21, "nbuffer": 21, "frombuff": 21, "len": 21, "receive_spec": 21, "bf": 21, "rf": 21, "recon": 21, "length": 21, "slight": 21, "matlab": 22, "typenam": 22, "my_typ": 22, "regardless": 22, "demot": 22, "rewrit": 22, "real": 22, "risk": 22, "seper": 22, "grid_div_z": 23, "06": 23, "log": 23, "auxilliari": 23, "safer": 23, "notat": 23, "divison": 23, "treat": 23, "warp": 23, "empti": 23, "kepler": 23, "plu": 23, "filter_mod": 23, "address_mod": 23, "clamp": 23, "mirror": 23, "axi": 23, "normalized_coordin": 23, "emtpi": 23, "get_local_s": 23, "satisfi": 23, "000001": 23, "ref": 23, "basinhop": 23, "bayes_opt": 23, "diff_evo": 23, "firefly_algorithm": 23, "genetic_algorithm": 23, "greedy_il": 23, "greedy_ml": 23, "ml": 23, "ordered_greedy_ml": 23, "pso": 23, "simulated_ann": 23, "sort": 23, "resourc": 23, "persist": 23, "consol": 23, "info": 23, "summar": 23, "store_result": 23, "results_filenam": 23, "typicali": 23, "percentag": 23, "create_device_target": 23, "header_filenam": 23, "target": 23, "dtarget_gpu": 23, "name_of_gpu": 23, "chosen": 23, "block_size_": 24, "grid_size_": 24, "compiler_opt_": 24, "loop_unroll_factor_": 24, "nvml_": 24, "nvmlobserv": 24}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", 
"copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner": [[23, 2, 1, "", "create_device_targets"], [23, 2, 1, "", "run_kernel"], [23, 2, 1, "", "store_results"], [23, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[18, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[18, 1, 1, "", "after_finish"], [18, 1, 1, "", "after_start"], [18, 1, 1, "", "before_start"], [18, 1, 1, "", "during"], [18, 1, 1, "", "get_results"], [18, 1, 1, "", "register_configuration"], [18, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[18, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[18, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[18, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 
0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[19, 3, 0, "-", "basinhopping"], [19, 3, 0, "-", "bayes_opt"], [19, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [19, 3, 0, "-", "diff_evo"], [19, 3, 0, "-", "dual_annealing"], [19, 3, 0, "-", "firefly_algorithm"], [19, 3, 0, "-", "genetic_algorithm"], [19, 3, 0, "-", "greedy_ils"], [19, 3, 0, "-", "greedy_mls"], [19, 3, 0, "-", "minimize"], [19, 3, 0, "-", "mls"], [19, 3, 0, "-", "ordered_greedy_mls"], [19, 3, 0, "-", "pso"], [19, 3, 0, "-", "random_sample"], [19, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[19, 2, 1, "", "generate_normalized_param_dicts"], [19, 2, 1, "", "normalize_parameter_space"], [19, 2, 1, "", "prune_parameter_space"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[19, 0, 1, "", "Firefly"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[19, 1, 1, "", "compute_intensity"], [19, 1, 1, "", "distance_to"], [19, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[19, 2, 1, "", "disruptive_uniform_crossover"], [19, 2, 1, "", "mutate"], [19, 2, 1, "", "single_point_crossover"], [19, 2, 1, "", "tune"], [19, 2, 1, "", "two_point_crossover"], [19, 2, 1, "", "uniform_crossover"], [19, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[19, 2, 1, "", "acceptance_prob"], [19, 2, 1, "", "neighbor"], [19, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restriction"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", 
"delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 15, 22], "cuda": [0, 15, 16], "featur": [0, 2], "support": 0, "usag": [0, 14], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 14], "kernel": [2, 8, 9, 10, 11, 12, 14, 16, 22], "tuner": [2, 8, 9, 10, 11, 12, 14], "document": [2, 6, 7, 14, 23], "guid": [2, 3, 15], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 8, 9, 10, 11, 13], "simpl": 3, "develop": [3, 7], "setup": [3, 7], "convolut": [4, 11], "2d": 4, "exampl": [4, 11, 14, 22], "implement": [4, 8, 9, 10], "test": [4, 7], "tune": [4, 8, 9, 10, 12, 13, 16, 17, 18], "more": 4, "tunabl": 4, "paramet": [4, 10, 12, 18, 24], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 19], "kernel_tun": [6, 19], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 15], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 15], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 15], "hipfunct": 6, "util": 6, "function": 6, "environ": 7, "local": [7, 9], "cluster": 7, "run": [7, 10], "build": 7, "diffus": [8, 9, 10], "python": [8, 9, 10, 15], "comput": [8, 9, 10], "gpu": [8, 9, 10, 12], "auto": [8, 9, 10], "us": [8, 9, 10, 12, 16, 21], "share": [8, 9, 10, 16], "memori": [8, 9, 10, 16], "tile": [8, 9, 10], "store": [8, 9], "result": [8, 9], "tutori": [9, 10], "from": [9, 10], "physic": [9, 10], "best": 10, "product": 10, "c": 10, "vector": 11, "add": 11, "stencil": 11, "matrix": [11, 16], "multipl": [11, 16], "py": 11, "sepconv": 11, "convolution_correct": 11, "convolution_stream": 11, "reduct": 11, "spars": 11, "point": 11, "polygon": 11, "expdist": 11, "gener": 11, "3d": 12, "grid": 12, "let": 12, "": 12, "start": [12, 20], "cpu": 12, "move": 12, "optim": [12, 19], "host": 13, "number": 13, "stream": 13, "quick": 14, "instal": [14, 15], "citat": 14, "packag": 15, "other": 15, "pyopencl": 15, "pyhip": 15, "git": 15, "version": 15, "depend": 15, "naiv": 16, "increas": 16, "work": 16, "per": 16, "thread": 16, "metric": 17, "object": 17, "observ": 18, "powersensorobserv": 18, "nvmlobserv": 18, "execut": 18, "nvml": 
18, "pmtobserv": 18, "basinhop": 19, "bayes_opt": 19, "brute_forc": 19, "diff_evo": 19, "dual_ann": 19, "firefly_algorithm": 19, "genetic_algorithm": 19, "greedy_il": 19, "greedy_ml": 19, "minim": 19, "ml": 19, "ordered_greedy_ml": 19, "pso": 19, "random_sampl": 19, "simulated_ann": 19, "get": 20, "struct": 21, "templat": 22, "select": 22, "api": 23, "vocabulari": 24}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [14, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Simple development setup": [[3, "simple-development-setup"]], "Convolution": [[4, "Convolution"], [11, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Development environment": [[7, "development-environment"]], "Local setup": [[7, "local-setup"]], "Cluster setup": [[7, "cluster-setup"]], "Running tests": [[7, "running-tests"]], "Building documentation": [[7, "building-documentation"]], "Diffusion": [[8, "Diffusion"], [8, "id1"], [9, "Diffusion"], [10, "Diffusion"]], "Python implementation": [[8, "Python-implementation"], [9, "Python-implementation"], [10, "Python-implementation"]], "Computing on the GPU": [[8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"], [10, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[8, 
"Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"], [10, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[8, "Using-Shared-Memory"]], "Tiling GPU Code": [[8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"], [10, "Tiling-GPU-Code"]], "Storing the results": [[8, "Storing-the-results"], [9, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[9, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [10, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[9, "Using-Shared-(local)-Memory"]], "Using shared memory": [[10, "Using-shared-memory"], [16, "Using-shared-memory"]], "Using the best parameters in a production run": [[10, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[10, "Python-run"]], "C run": [[10, "C-run"]], "Kernel Tuner Examples": [[11, "kernel-tuner-examples"]], "Vector Add": [[11, "vector-add"]], "Stencil": [[11, "stencil"]], "Matrix Multiplication": [[11, "matrix-multiplication"]], "convolution.py": [[11, "convolution-py"]], "sepconv.py": [[11, "sepconv-py"]], "convolution_correct.py": [[11, "convolution-correct-py"]], "convolution_streams.py": [[11, "convolution-streams-py"]], "Reduction": [[11, "reduction"]], "Sparse Matrix Vector Multiplication": [[11, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[11, "point-in-polygon"]], "ExpDist": [[11, "expdist"]], "Code Generator": [[11, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[12, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[12, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[12, "Let's-move-to-the-GPU"]], "Tune the kernel": [[12, "Tune-the-kernel"]], "Using the optimized parameters": [[12, "Using-the-optimized-parameters"]], "Tuning Host Code": [[13, "tuning-host-code"]], "Tuning the number of streams": [[13, "tuning-the-number-of-streams"]], "Quick install": [[14, "quick-install"]], "Example usage": [[14, "example-usage"]], "Citation": [[14, "citation"]], "Installation": [[15, "installation"]], "Python": [[15, "python"]], "Installing Python Packages": [[15, "installing-python-packages"]], "CUDA and PyCUDA": [[15, "cuda-and-pycuda"]], "Other CUDA Backends": [[15, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[15, "opencl-and-pyopencl"]], "HIP and PyHIP": [[15, "hip-and-pyhip"]], "Installing the git version": [[15, "installing-the-git-version"]], "Dependencies for the guides": [[15, "dependencies-for-the-guides"]], "Matrix multiplication": [[16, "Matrix-multiplication"]], "Naive CUDA kernel": [[16, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[16, "Tuning-a-naive-kernel"]], "Increase work per thread": [[16, "Increase-work-per-thread"]], "Metrics and Objectives": [[17, "metrics-and-objectives"]], "Metrics": [[17, "metrics"]], "Tuning Objectives": [[17, "tuning-objectives"]], "Observers": [[18, "observers"]], "PowerSensorObserver": [[18, "powersensorobserver"]], "NVMLObserver": [[18, "nvmlobserver"]], "Tuning execution parameters with NVML": [[18, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[18, "pmtobserver"]], "Optimization strategies": [[19, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], 
"kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[20, "getting-started"]], "Using structs": [[21, "using-structs"]], "Templated kernels": [[22, "templated-kernels"]], "Example": [[22, "example"]], "Selecting a backend": [[22, "selecting-a-backend"]], "API Documentation": [[23, "api-documentation"]], "Parameter Vocabulary": [[24, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() 
(kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restriction"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], 
"copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() 
(in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions 
method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [19, "module-kernel_tuner.strategies.basinhopping"], [19, "module-kernel_tuner.strategies.bayes_opt"], [19, "module-kernel_tuner.strategies.brute_force"], [19, "module-kernel_tuner.strategies.diff_evo"], [19, "module-kernel_tuner.strategies.dual_annealing"], [19, "module-kernel_tuner.strategies.firefly_algorithm"], [19, "module-kernel_tuner.strategies.genetic_algorithm"], [19, "module-kernel_tuner.strategies.greedy_ils"], [19, "module-kernel_tuner.strategies.greedy_mls"], [19, "module-kernel_tuner.strategies.minimize"], [19, "module-kernel_tuner.strategies.mls"], [19, "module-kernel_tuner.strategies.ordered_greedy_mls"], [19, "module-kernel_tuner.strategies.pso"], [19, "module-kernel_tuner.strategies.random_sample"], [19, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, 
"kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() 
(kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[18, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[18, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[18, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[18, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[19, 
"kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[19, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[19, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[19, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in 
module kernel_tuner.strategies.dual_annealing)": [[19, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[19, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[19, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[19, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[19, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[19, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[19, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[19, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[23, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[23, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.tune_kernel"]]}})
                    \ No newline at end of file
                    +Search.setIndex({"alltitles": {"2D Convolution example": [[4, "2D-Convolution-example"]], "3D Grid on GPU with Kernel Tuner": [[12, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "API Documentation": [[23, "api-documentation"]], "Auto-Tuning with the Kernel Tuner": [[8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"], [10, "Auto-Tuning-with-the-Kernel-Tuner"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Backends": [[0, "backends"]], "Building documentation": [[7, "building-documentation"]], "C run": [[10, "C-run"]], "CUDA Backends": [[0, "cuda-backends"]], "CUDA and PyCUDA": [[15, "cuda-and-pycuda"]], "Cache files": [[1, "cache-files"]], "Citation": [[14, "citation"]], "Cluster setup": [[7, "cluster-setup"]], "Code Generator": [[11, "code-generator"]], "Computing on the GPU": [[8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"], [10, "Computing-on-the-GPU"]], "Contributing Code": [[3, "contributing-code"]], "Contribution guide": [[3, "contribution-guide"]], "Convolution": [[4, "Convolution"], [11, "convolution"]], "Correctness Verification": [[5, "correctness-verification"]], "Dependencies for the guides": [[15, "dependencies-for-the-guides"]], "Design documentation": [[6, "design-documentation"]], "Development environment": [[7, "development-environment"]], "Device Interfaces": [[6, "device-interfaces"]], "Diffusion": [[8, "Diffusion"], [8, "id1"], [9, "Diffusion"], [10, "Diffusion"]], "Example": [[22, "example"]], "Example usage": [[14, "example-usage"]], "ExpDist": [[11, "expdist"]], "Features": [[2, null]], "Getting Started": [[20, "getting-started"]], "Guides": [[2, null]], "HIP and PyHIP": [[15, "hip-and-pyhip"]], "Implement a test": [[4, "Implement-a-test"]], "Increase work per thread": [[16, "Increase-work-per-thread"]], "Installation": [[15, "installation"]], "Installing Python Packages": [[15, "installing-python-packages"]], "Installing the git version": [[15, "installing-the-git-version"]], "Kernel Tuner": [[2, null]], "Kernel Tuner Examples": [[11, "kernel-tuner-examples"]], "Let\u2019s move to the GPU": [[12, "Let's-move-to-the-GPU"]], "Let\u2019s start on the CPU": [[12, "Let's-start-on-the-CPU"]], "Local setup": [[7, "local-setup"]], "Matrix Multiplication": [[11, "matrix-multiplication"]], "Matrix multiplication": [[16, "Matrix-multiplication"]], "Metrics": [[17, "metrics"]], "Metrics and Objectives": [[17, "metrics-and-objectives"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "NVMLObserver": [[18, "nvmlobserver"]], "Naive CUDA kernel": [[16, "Naive-CUDA-kernel"]], "Observers": [[18, "observers"]], "OpenCL and PyOpenCL": [[15, "opencl-and-pyopencl"]], "Optimization strategies": [[19, "optimization-strategies"]], "Other CUDA Backends": [[15, "other-cuda-backends"]], "PMTObserver": [[18, "pmtobserver"]], "Parameter Vocabulary": [[24, "parameter-vocabulary"]], "Point-in-Polygon": [[11, "point-in-polygon"]], "PowerSensorObserver": [[18, "powersensorobserver"]], "Python": [[15, "python"]], "Python implementation": [[8, "Python-implementation"], [9, "Python-implementation"], [10, "Python-implementation"]], "Python run": [[10, "Python-run"]], "Quick install": [[14, "quick-install"]], "Reduction": [[11, "reduction"]], "Reference": [[2, null]], "Reporting Issues": [[3, "reporting-issues"]], "Runners": [[6, "runners"]], "Running tests": [[7, "running-tests"]], "Selecting a backend": [[22, "selecting-a-backend"]], "Simple development setup": [[3, "simple-development-setup"]], 
"Sparse Matrix Vector Multiplication": [[11, "sparse-matrix-vector-multiplication"]], "Stencil": [[11, "stencil"]], "Storing the results": [[8, "Storing-the-results"], [9, "Storing-the-results"]], "Strategies": [[6, "strategies"]], "Templated kernels": [[22, "templated-kernels"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [14, "the-kernel-tuner-documentation"]], "Tiling GPU Code": [[8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"], [10, "Tiling-GPU-Code"]], "Tune the kernel": [[12, "Tune-the-kernel"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "Tuning Host Code": [[13, "tuning-host-code"]], "Tuning Objectives": [[17, "tuning-objectives"]], "Tuning a naive kernel": [[16, "Tuning-a-naive-kernel"]], "Tuning execution parameters with NVML": [[18, "tuning-execution-parameters-with-nvml"]], "Tuning the number of streams": [[13, "tuning-the-number-of-streams"]], "Tutorial: From physics to tuned GPU kernels": [[9, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [10, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[9, "Using-Shared-(local)-Memory"]], "Using Shared Memory": [[8, "Using-Shared-Memory"]], "Using shared memory": [[10, "Using-shared-memory"], [16, "Using-shared-memory"]], "Using structs": [[21, "using-structs"]], "Using the best parameters in a production run": [[10, "Using-the-best-parameters-in-a-production-run"]], "Using the optimized parameters": [[12, "Using-the-optimized-parameters"]], "Util Functions": [[6, "util-functions"]], "Vector Add": [[11, "vector-add"]], "convolution.py": [[11, "convolution-py"]], "convolution_correct.py": [[11, "convolution-correct-py"]], "convolution_streams.py": [[11, "convolution-streams-py"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, 
"module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "sepconv.py": [[11, "sepconv-py"]]}, "docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "dev-environment", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "envversion": {"nbsphinx": 4, "sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "dev-environment.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "indexentries": {"__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__", false]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__", false]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__", false]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__", false]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__", false]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__", false]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__", false]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__", false]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__", false]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.acceptance_prob", false]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_finish", false]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_start", false]], 
"before_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.before_start", false]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark", false]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous", false]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default", false]], "benchmarkobserver (class in kernel_tuner.observers)": [[18, "kernel_tuner.observers.BenchmarkObserver", false]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list", false]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type", false]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output", false]], "check_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restriction", false]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions", false]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion", false]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions", false]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list", false]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib", false]], "compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig", false]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile", false]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile", false]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile", false]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile", false]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile", false]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile", false]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel", false]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions", false]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions", false]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity", false]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid", false]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction", false]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, 
"kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args", false]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache", false]], "create_device_targets() (in module kernel_tuner)": [[23, "kernel_tuner.create_device_targets", false]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance", false]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check", false]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions", false]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions", false]], 
"default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default", false]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file", false]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language", false]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface", false]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover", false]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to", false]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache", false]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.during", false]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig", false]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly", false]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts", false]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config", false]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string", false]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment", false]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions", false]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string", false]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string", false]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options", false]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size", false]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.get_results", false]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args", false]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring", false]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename", false]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions", false]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings", false]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions", false]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig", false]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, 
"kernel_tuner.backends.hip.HipFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished", false]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping", false]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt", false]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force", false]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common", false]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo", false]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing", false]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm", false]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm", false]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils", false]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls", false]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize", false]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls", false]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls", false]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso", false]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample", false]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing", false]], "kernel_tuner.util": [[6, "module-kernel_tuner.util", false]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename", false]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc", false]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh", false]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, 
"kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod", false]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset", false]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset", false]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset", false]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset", false]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset", false]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset", false]], "module": [[6, "module-kernel_tuner.strategies.common", false], [6, "module-kernel_tuner.util", false], [19, "module-kernel_tuner.strategies.basinhopping", false], [19, "module-kernel_tuner.strategies.bayes_opt", false], [19, "module-kernel_tuner.strategies.brute_force", false], [19, "module-kernel_tuner.strategies.diff_evo", false], [19, "module-kernel_tuner.strategies.dual_annealing", false], [19, "module-kernel_tuner.strategies.firefly_algorithm", false], [19, "module-kernel_tuner.strategies.genetic_algorithm", false], [19, "module-kernel_tuner.strategies.greedy_ils", false], [19, "module-kernel_tuner.strategies.greedy_mls", false], [19, "module-kernel_tuner.strategies.minimize", false], [19, "module-kernel_tuner.strategies.mls", false], [19, "module-kernel_tuner.strategies.ordered_greedy_mls", false], [19, "module-kernel_tuner.strategies.pso", false], [19, "module-kernel_tuner.strategies.random_sample", false], [19, "module-kernel_tuner.strategies.simulated_annealing", false]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards", false]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.mutate", false]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.neighbor", false]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space", false]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function", false]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder", false]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[18, "kernel_tuner.observers.nvml.NVMLObserver", false]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, 
"kernel_tuner.backends.opencl.OpenCLFunctions", false]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions", false]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[18, "kernel_tuner.observers.pmt.PMTObserver", false]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[18, "kernel_tuner.observers.powersensor.PowerSensorObserver", false]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string", false]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments", false]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config", false]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output", false]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache", false]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics", false]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.prune_parameter_space", false]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions", false]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache", false]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file", false]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list", false]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_configuration", false]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_device", false]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences", false]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run", false]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run", false]], "run_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.run_kernel", false]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, 
"kernel_tuner.backends.compiler.CompilerFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel", false]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig", false]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params", false]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner", false]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters", false]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid", false]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments", false]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options", false]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner", false]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover", false]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure", false]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config", false]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event", false]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event", false]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event", false]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event", false]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event", false]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event", false]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event", 
false]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event", false]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached", false]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache", false]], "store_results() (in module kernel_tuner)": [[23, "kernel_tuner.store_results", false]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize", false]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc", false]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[19, "kernel_tuner.strategies.basinhopping.tune", false]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.tune", false]], "tune() (in module kernel_tuner.strategies.brute_force)": [[19, "kernel_tuner.strategies.brute_force.tune", false]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[19, "kernel_tuner.strategies.diff_evo.tune", false]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[19, "kernel_tuner.strategies.dual_annealing.tune", false]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.tune", false]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.tune", false]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[19, "kernel_tuner.strategies.greedy_ils.tune", false]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[19, "kernel_tuner.strategies.greedy_mls.tune", false]], "tune() (in module kernel_tuner.strategies.minimize)": [[19, "kernel_tuner.strategies.minimize.tune", false]], "tune() (in module kernel_tuner.strategies.mls)": [[19, "kernel_tuner.strategies.mls.tune", false]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[19, "kernel_tuner.strategies.ordered_greedy_mls.tune", false]], "tune() (in module kernel_tuner.strategies.pso)": [[19, "kernel_tuner.strategies.pso.tune", false]], "tune() (in module kernel_tuner.strategies.random_sample)": [[19, "kernel_tuner.strategies.random_sample.tune", false]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.tune", false]], "tune_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.tune_kernel", false]], 
"two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover", false]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover", false]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest", false]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.weighted_choice", false]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file", false]]}, "objects": {"kernel_tuner": [[23, 2, 1, "", "create_device_targets"], [23, 2, 1, "", "run_kernel"], [23, 2, 1, "", "store_results"], [23, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", 
"run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner.observers": [[18, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[18, 1, 1, "", "after_finish"], [18, 1, 1, "", "after_start"], [18, 1, 1, "", "before_start"], [18, 1, 1, "", "during"], [18, 1, 1, "", "get_results"], [18, 1, 1, "", "register_configuration"], [18, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[18, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[18, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[18, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[19, 3, 0, "-", "basinhopping"], [19, 3, 0, "-", "bayes_opt"], [19, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [19, 3, 0, "-", "diff_evo"], [19, 3, 0, "-", "dual_annealing"], [19, 3, 0, "-", "firefly_algorithm"], [19, 3, 0, "-", "genetic_algorithm"], [19, 3, 0, "-", "greedy_ils"], [19, 3, 0, "-", "greedy_mls"], [19, 3, 0, "-", "minimize"], [19, 3, 0, "-", "mls"], [19, 3, 0, "-", "ordered_greedy_mls"], [19, 3, 0, "-", "pso"], [19, 3, 0, "-", "random_sample"], [19, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[19, 2, 1, "", "generate_normalized_param_dicts"], [19, 2, 1, "", "normalize_parameter_space"], [19, 2, 1, "", "prune_parameter_space"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[19, 2, 1, "", "tune"]], 
"kernel_tuner.strategies.dual_annealing": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[19, 0, 1, "", "Firefly"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[19, 1, 1, "", "compute_intensity"], [19, 1, 1, "", "distance_to"], [19, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[19, 2, 1, "", "disruptive_uniform_crossover"], [19, 2, 1, "", "mutate"], [19, 2, 1, "", "single_point_crossover"], [19, 2, 1, "", "tune"], [19, 2, 1, "", "two_point_crossover"], [19, 2, 1, "", "uniform_crossover"], [19, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[19, 2, 1, "", "acceptance_prob"], [19, 2, 1, "", "neighbor"], [19, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restriction"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "terms": {"": [0, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23], "0": [4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 21, 23], "000001": 23, "001": 19, "004": 14, "0079296": 9, "01": 9, "01125121117": 12, 
"01592960358": 12, "018869400024": 8, "02665598392": 8, "0341696": 9, "03752319813": 8, "0398656": 9, "04807043076": 8, "05": [8, 9], "05262081623": 8, "054880023": 8, "05524477959": 12, "05549435616": 8, "05816960335": 8, "05957758427": 8, "06": 23, "0626496": 9, "0629119873": 8, "06332798004": 8, "06672639847": 8, "06709122658": 8, "06844799519": 8, "0688128": 9, "0692224": 9, "06983039379": 8, "07002239227": 8, "0708288": 9, "07260": 14, "0731967926": 8, "07386879921": 8, "07484800816": 8, "07508480549": 8, "0759360075": 8, "0799423933": 8, "08": 14, "0801344": 9, "0808192": 9, "0809792": 9, "08220798969": 8, "0836928": 9, "08389122486": 8, "0849664": 9, "086019515991": 10, "0873216": 9, "09015038013": 8, "0909248": 9, "0911808": 9, "0927808": 9, "095232": 9, "09730558395": 8, "09794559479": 8, "0f": [8, 9, 10], "0l": 21, "0x2aaab952f240": 8, "0x2aaabbdcb2e8": 8, "0x2aab1c98b3c8": 8, "0x2aab1de088d0": 9, "0x7f8858b873c8": 10, "0x7f8865b51f28": 10, "0x7f887c3d2358": 10, "0x7f888f8cd7b8": 10, "1": [4, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 23], "10": [7, 8, 9, 10, 14, 19], "100": [12, 19, 23], "1000": [8, 9, 10, 12], "10000": 6, "1000000": [20, 22], "10000000": 14, "10033922195": 8, "10066559315": 8, "10125439167": 8, "1016": 14, "1018432": 9, "1024": [4, 8, 9, 10, 20], "1035136": 9, "10455036163": 12, "1065792": 9, "10700161457": 8, "10740480423": 8, "1080": [10, 12], "108896": 9, "1095936": 9, "11": [7, 8, 9, 10], "1107712": 9, "1140736": 9, "1145664": 9, "11514236927": 8, "11877760887": 12, "12": [7, 8, 9, 10], "12000002861": 8, "12033278942": 8, "1219392": 9, "12255358696": 12, "1230336": 9, "12309756279": 12, "12483196259": 12, "1249984": 9, "1264": 9, "128": [4, 8, 9, 10, 14, 20, 22], "128x32": [8, 9, 10], "13": [8, 9, 10], "13023357391": 8, "13276162148": 12, "13297917843": 8, "133200": 12, "134233": 6, "1384064": 9, "14": [8, 9, 10], "140": 9, "140192": 9, "1428928": 9, "14420480728": 8, "14729599953": 8, "1493952": 9, "14991": 14, "15": [8, 9, 10, 22], "15089921951": 8, "154": 9, "1545856": 9, "1547072": 9, "15584001541": 12, "15777277946": 12, "1583104": 9, "15916161537": 8, "16": [4, 5, 8, 9, 10, 12, 13, 16], "160627": 12, "16174077988": 12, "16199679375": 12, "16430072784": 12, "17": [4, 5, 8, 9, 10, 13], "17051515579": 12, "171552": 9, "174624": 9, "1748096": 9, "17600002289": 12, "1790336": 9, "17987194061": 12, "18": [8, 9, 10], "18713598251": 8, "18835201263": 12, "19": [9, 10], "19029750824": 12, "19084160328": 8, "1912576": 9, "19179515839": 12, "1942016": 9, "1e": [5, 23], "1e3": [4, 8, 9, 10, 16, 17], "1e9": [4, 16], "1xn": 16, "2": [4, 5, 7, 8, 9, 10, 11, 12, 13, 16, 18, 19, 23], "20": [10, 19], "2000": [8, 9, 10], "2018": 14, "2019": 14, "2021": 14, "2022": 14, "2048": 4, "21": 10, "2111": 14, "2124736": 9, "213971138": 12, "2194111824": 12, "2211": 14, "22200961113": 10, "22305920124": 8, "2231903076172": 10, "225": [8, 9, 10], "225f": [8, 9, 10], "22673916817": 10, "231072": 9, "23127679825": 12, "234342": 6, "23719043732": 12, "25130877495": 12, "256": [12, 14, 20], "26": [8, 9, 10], "26213116646": 12, "2634239912": 8, "26398081779": 12, "27082881927": 12, "28542718887": 12, "28725762367": 12, "29789438248": 8, "29814395905": 12, "2d": [8, 9, 10, 11], "2u_": [8, 9, 10], "3": [5, 7, 8, 9, 10, 12, 13, 15, 16, 19, 23], "30": 12, "30227203369": 12, "30295038223": 12, "3039616": 9, "31470079422": 12, "31647996902": 12, "31661438942": 8, "32": [4, 6, 8, 9, 10, 12, 14, 16, 20, 23], "322547197342": 10, "324083191156": 10, "3245952": 9, "3269824028": 8, "328998398781": 10, 
"32x2": [8, 9, 10], "33": 10, "339692795277": 10, "341376006603": 10, "342201602459": 10, "342540800571": 10, "34344320297": 12, "343545603752": 10, "343961596489": 10, "34440960288": 10, "34500494003": 12, "346079999208": 10, "346707201004": 10, "347": 14, "350457596779": 10, "351263999939": 10, "351923197508": 10, "352531200647": 10, "356716805696": 10, "357107198238": 10, "358": 14, "359289598465": 10, "359744000435": 10, "361823999882": 10, "362937599421": 10, "364051198959": 10, "36409599781": 10, "364627194405": 10, "365190398693": 10, "366150397062": 10, "36935677528": 12, "369555193186": 10, "371507203579": 10, "37233917713": 12, "37322881222": 12, "3735104084": 12, "37474560738": 12, "37543039322": 12, "3754431963": 12, "375468802452": 10, "375519996881": 10, "37677440643": 12, "37724158764": 12, "3775359869": 12, "3778496027": 12, "380812799931": 10, "3835711956": 12, "384262400866": 10, "385510402918": 10, "38625922203": 12, "38627202511": 12, "386668801308": 10, "38772480488": 12, "387827193737": 10, "38787200451": 12, "38793599606": 12, "38867840767": 12, "38956804276": 12, "389945602417": 10, "391807991266": 10, "39207677841": 12, "393126398325": 10, "39331197739": 12, "39366400242": 12, "394271999598": 10, "394380801916": 10, "39439997673": 12, "394975996017": 10, "39508478642": 12, "39547519684": 12, "39589118958": 12, "396070402861": 10, "39618558884": 12, "4": [4, 8, 9, 10, 12, 16, 18], "40000": 12, "400672000647": 10, "40074241161": 12, "4015104": 9, "40247042179": 12, "40401918888": 12, "404249596596": 10, "404908800125": 10, "407667201757": 10, "408166396618": 10, "4096": [4, 5, 8, 9, 10, 13, 16], "411392003298": 10, "41324160099": 10, "4152": 10, "416320002079": 10, "416352003813": 10, "4164": 8, "418393599987": 10, "42": 4, "423038482666016": 8, "42449922562": 12, "425414395332": 10, "426995199919": 10, "429862397909": 10, "4303104": 9, "431225597858": 10, "437388807535": 10, "438406395912": 10, "44043520093": 10, "440895992517": 10, "441734397411": 10, "443033593893": 10, "443519997597": 10, "443942397833": 10, "444": 9, "447974395752": 10, "449055999517": 10, "44974719882": 10, "44995200038": 10, "451283198595": 10, "455193603039": 10, "455404800177": 10, "45568639636": 10, "455724793673": 10, "456275194883": 10, "457075202465": 10, "457427197695": 10, "457715201378": 10, "457779198885": 10, "458412796259": 10, "459654402733": 10, "460032004118": 10, "460281592607": 10, "460972803831": 10, "461017608643": 10, "46109390258789": 10, "46233600378": 10, "462649595737": 10, "463052803278": 10, "464659202099": 10, "464863997698": 10, "465132802725": 10, "46561280489": 10, "466118401289": 10, "466284793615": 10, "466655993462": 10, "466758400202": 10, "466937589645": 10, "467033600807": 10, "467059195042": 10, "470220798254": 10, "470937597752": 10, "472236800194": 10, "472665601969": 10, "47406719923": 10, "474092799425": 10, "474841600657": 10, "475264000893": 10, "475987195969": 10, "478079998493": 10, "478739196062": 10, "48": [8, 9, 10], "480531209707": 10, "481279999018": 10, "481376004219": 10, "481817597151": 10, "486515200138": 10, "486649608612": 10, "489004802704": 10, "491852802038": 10, "492915201187": 10, "493900799751": 10, "494163197279": 10, "494553589821": 10, "49552000761": 10, "496121603251": 10, "49723520875": 10, "498617601395": 10, "4u_": [8, 9, 10], "5": [8, 9, 10, 12, 19], "50": 19, "500": [8, 9, 10], "500288009644": 10, "500524806976": 10, "502195191383": 10, "50277120471": 10, "506457602978": 10, "50662400723": 12, "507142400742": 10, 
"50787198544": 12, "50912": 9, "5108224": 9, "5116928": 9, "512": [14, 20], "513356792927": 10, "513632011414": 10, "515667212009": 10, "516473591328": 10, "5172352": 9, "5188416": 9, "5230208": 9, "52320": 12, "5235328": 9, "528115200996": 10, "5289728": 9, "53": [8, 9, 10], "53760001659": 12, "538227200508": 8, "5383296": 9, "539891195297": 8, "540352010727": 8, "540383994579": 8, "5412672": 9, "542387211323": 8, "542937588692": 8, "544224": 9, "54432649612": 12, "5445248": 9, "544691193104": 8, "545715200901": 10, "546316814423": 10, "5470016": 9, "5475584": 9, "5476864": 9, "5478592": 9, "549024009705": 10, "5492416": 9, "54944": 9, "550105595589": 8, "550668799877": 10, "55171200037": 10, "55267841816": 12, "554745602608": 8, "5548416": 9, "556576": 9, "557331204414": 10, "5582144": 9, "558771193027": 10, "558815991879": 10, "5591744": 9, "5592064": 9, "560505592823": 8, "5620608": 9, "562521612644": 8, "5631168": 9, "563417613506": 8, "564575994015": 10, "5648128": 9, "564921605587": 10, "565254402161": 8, "56585599184": 8, "56709756851": 12, "567417597771": 8, "567456": 9, "56833920479": 12, "568435204029": 10, "568556785583": 8, "5685632": 9, "569177603722": 10, "569388794899": 8, "57044479847": 10, "5715968": 9, "5717568": 9, "5719744": 9, "573836791515": 8, "574816000462": 10, "5753152": 9, "575859189034": 8, "576044797897": 8, "57678719759": 10, "577215993404": 8, "578681600094": 8, "578745603561": 8, "579007995129": 10, "579411196709": 8, "579827189445": 10, "579904007912": 8, "58026239872": 10, "58035838604": 8, "580928003788": 10, "581280004978": 8, "582751989365": 10, "583270406723": 10, "5863488": 9, "586604809761": 10, "586931192875": 10, "588492810726": 8, "588998401165": 10, "5898432": 9, "59088640213": 8, "5920832": 9, "593977594376": 10, "595276796818": 8, "595462405682": 10, "5968448": 9, "5968704": 9, "597267186642": 8, "597920000553": 10, "598": 9, "598758399487": 10, "6": [5, 8, 9, 10, 12, 13, 23], "600819194317": 10, "601049602032": 10, "601856": 9, "60216319561": 8, "6038336": 9, "60433280468": 10, "6047296": 9, "605760002136": 8, "6076224": 9, "6081536": 9, "608800005913": 10, "60942081213": 8, "611705589294": 10, "613088": 9, "6146176": 9, "615148806572": 8, "615475213528": 10, "61618566513": 12, "61655673981": 12, "616556799412": 10, "618": 10, "618003201485": 8, "618598401546": 8, "618758392334": 10, "620352": 9, "620384001732": 10, "621254396439": 8, "622534394264": 10, "622867202759": 8, "6237696": 9, "624492788315": 8, "6250816": 9, "625260794163": 8, "626163220406": 8, "6263552": 9, "6264192": 9, "626976013184": 8, "627136015892": 8, "629164803028": 10, "6292032": 9, "631142401695": 8, "632006394863": 8, "632607996464": 10, "6343168": 9, "6367488": 9, "637958395481": 8, "638348805904": 8, "6387072": 9, "6389312": 9, "64": [4, 8, 9, 10, 14, 16, 20, 22], "6415552": 9, "643359994888": 8, "64358406067": 12, "643820810318": 8, "6452736": 9, "6458624": 9, "6459328": 9, "646092808247": 8, "6466816": 9, "648620784283": 8, "648928": 9, "6492288": 9, "649779188633": 8, "6498688": 9, "64x4": [8, 9, 10], "650336003304": 8, "6508544": 9, "6524544": 9, "652575993538": 8, "6551744": 9, "655827200413": 10, "657920002937": 8, "6581248": 9, "6601472": 9, "6607744": 9, "6608448": 9, "662041604519": 8, "662566399574": 8, "6633536": 9, "66344319582": 8, "6659904": 9, "666003203392": 8, "6660672": 9, "6664448": 9, "666656005383": 8, "667251205444": 8, "667347204685": 8, "6677696": 9, "6717888": 9, "6719104": 9, "6724736": 9, "673248004913": 8, "67516155243": 12, 
"675232005119": 8, "6754048": 9, "6757568": 9, "675923216343": 8, "676096": 9, "676595199108": 8, "677363204956": 8, "67856": 9, "679372787476": 8, "6796352": 9, "680422389507": 8, "6810944": 9, "6813376": 9, "681350398064": 8, "682188808918": 8, "6842112": 9, "685670387745": 8, "686528": 9, "68781440258": 8, "687955200672": 8, "689356791973": 8, "6895552": 9, "690009605885": 8, "6905984": 9, "69104": 9, "691116797924": 8, "6911872": 9, "691385602951": 8, "6921216": 9, "692665600777": 8, "6929216": 9, "694451200962": 8, "6951168": 9, "6962112": 9, "69627519846": 8, "69648": 9, "697094392776": 8, "6972928": 9, "6975872": 9, "69833599329": 10, "698336": 9, "699366402626": 8, "7": [6, 8, 9, 10, 12, 23], "700883197784": 8, "7008832": 9, "70140799284": 8, "703302407265": 8, "705055999756": 8, "705900788307": 8, "705932807922": 8, "7068608": 9, "7074048": 9, "7075136": 9, "7078976": 9, "7087296": 9, "7091136": 9, "7099712": 9, "710278391838": 8, "7113024": 9, "711615991592": 10, "713843202591": 8, "714169609547": 8, "715257608891": 10, "716115188599": 8, "7168192": 9, "7168192029": 8, "7190464": 9, "719257593155": 10, "72": [8, 9, 10], "72023679018": 10, "720915210247": 10, "721408": 9, "7214464": 9, "721625590324": 10, "721862399578": 8, "722668802738": 8, "722777605057": 10, "722969603539": 10, "723999989033": 8, "724966406822": 10, "725548803806": 8, "726335990429": 8, "7268928": 9, "727967989445": 8, "7284544": 9, "729363203049": 10, "72981758118": 12, "730982398987": 8, "731334400177": 8, "731839990616": 10, "731891202927": 8, "732409596443": 8, "7326656": 9, "733248019218": 8, "734028804302": 10, "735436797142": 8, "73579518795": 10, "736511993408": 10, "74003200531": 12, "740518403053": 8, "741964805126": 8, "7453312": 9, "7461632": 9, "747328": 9, "75041918755": 8, "750636804104": 8, "7520064": 9, "7522432": 9, "7522624": 9, "752479994297": 8, "752838408947": 10, "754201591015": 10, "75422719717": 10, "7561792": 9, "7584896": 9, "759308815": 8, "759430396557": 10, "759679996967": 8, "759910392761": 10, "760915207863": 8, "761139214039": 8, "7611968": 9, "762003195286": 10, "762777590752": 10, "763775992393": 8, "766662418842": 8, "768064010143": 8, "769734406471": 10, "7707904": 9, "771052801609": 10, "771072": 9, "771103990078": 8, "7728576": 9, "7745216": 9, "776639997959": 10, "77759360075": 8, "7786496": 9, "778656005859": 10, "779033613205": 8, "780134403706": 10, "780352008343": 10, "780960011482": 10, "782060790062": 8, "782112002373": 10, "782495999336": 10, "78363519907": 8, "783923208714": 10, "7860736": 9, "7869632": 9, "7881152": 9, "788345599174": 8, "788947200775": 10, "7907264": 9, "791257584095": 8, "792108798027": 8, "792595207691": 8, "7930752": 9, "793516802788": 10, "793542397022": 10, "795135998726": 10, "7952896": 9, "7965888": 9, "7967488": 9, "797900807858": 8, "798195195198": 10, "7982208": 9, "7982848": 9, "799059200287": 8, "7994048": 9, "8": [4, 6, 8, 9, 10, 12, 16, 18], "80": 12, "800019192696": 10, "801119995117": 8, "801798415184": 8, "801996803284": 8, "8028416": 9, "803033602238": 8, "803718411922": 8, "8037312": 9, "804876792431": 10, "804953610897": 8, "805299210548": 8, "8063424": 9, "806828796864": 8, "8068416": 9, "80796158314": 12, "808000004292": 8, "808211183548": 8, "809894394875": 10, "8117248": 9, "818304": 9, "82159358263": 10, "821881604195": 8, "821952": 9, "822137594223": 8, "8228864": 9, "8237248": 9, "824838399887": 8, "8252928": 9, "826361596584": 10, "826515209675": 8, "8302656": 9, "8313792": 9, "832300806046": 8, "832768": 9, 
"833420813084": 8, "8335488": 9, "835481595993": 8, "835494399071": 8, "8365184": 9, "837299215794": 8, "837804794312": 8, "8380288": 9, "838195204735": 8, "8384512": 9, "8384832": 9, "84": 12, "840755212307": 8, "840908801556": 8, "841631996632": 8, "8422016": 9, "843411195278": 8, "843692803383": 8, "844428789616": 8, "8444928": 9, "8449472": 9, "845856": 9, "8473408": 9, "8479232": 9, "848044800758": 8, "84848": 9, "849235200882": 12, "849273598194": 12, "849631989002": 12, "851040017605": 8, "8516672": 9, "852166390419": 8, "852575981617": 8, "8531072": 9, "853574407101": 8, "853708791733": 12, "85437438488": 8, "855359995365": 12, "855628800392": 12, "8573184": 9, "857728": 9, "85886080265": 8, "860332798958": 8, "862348806858": 8, "8626304": 9, "8633344": 9, "8637184": 9, "8663936": 9, "867276787758": 8, "869497597218": 8, "873651194572": 10, "875001597404": 8, "876377594471": 8, "876627194881": 8, "8772672": 9, "8774336": 9, "8844544": 9, "888671982288": 8, "8896384": 9, "890803205967": 8, "8922624": 9, "893279993534": 8, "8byte": 21, "9": [4, 5, 7, 8, 9, 10, 13, 15], "90": 14, "900499212742": 8, "9123328": 9, "91601279974": 10, "916985595226": 10, "922745585442": 8, "932281601429": 10, "933260798454": 10, "93347837925": 8, "940044796467": 12, "9624512": 9, "971545600891": 8, "98": 9, "985": 9, "995": 19, "997139203548": 8, "99912958145": 12, "999763202667": 8, "A": [1, 4, 6, 7, 14, 15, 16, 18, 19, 23], "And": [4, 8, 9, 10, 19, 22, 23], "As": [0, 1, 4, 8, 9, 10, 12, 15, 16, 18], "At": [6, 12, 23], "Be": [7, 8, 9, 10], "But": [4, 8, 9, 10, 12, 20], "By": [6, 13, 16, 19, 23], "For": [3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 18, 20, 21, 23], "If": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 23], "In": [4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 24], "It": [0, 4, 6, 7, 8, 9, 10, 13, 15, 16, 18, 22, 23], "Not": [3, 6], "Of": 16, "On": [7, 8, 9, 10, 23], "One": [6, 8, 9, 10, 18, 21], "Or": [14, 15], "That": [4, 8, 9, 10, 13, 16, 17, 20], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23], "Then": [3, 8, 9, 10, 12, 14, 15, 22], "There": [5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 23, 24], "These": [3, 7, 8, 9, 10, 12, 15, 16, 18, 22, 23], "To": [0, 3, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23], "With": [0, 12, 13], "_": [5, 8, 9, 10], "__device__": [12, 22], "__global__": [4, 8, 10, 12, 14, 16, 20, 22], "__host__": 12, "__init__": 6, "__kernel": 9, "__local": 9, "__shared__": [8, 10, 16], "__syncthread": [8, 9, 10, 16], "_energi": 18, "_flop": 17, "_funcptr": 6, "_power": 18, "a_d": 9, "a_h": 9, "ab": 14, "abil": 1, "abl": [4, 6, 8, 9, 10], "about": [3, 4, 6, 8, 9, 10, 14, 16, 18, 19, 20, 23], "abov": [4, 6, 8, 9, 10, 12, 15, 16, 20, 21], "abruptli": 6, "absolut": [5, 23], "absorpt": 19, "abstract": [6, 18], "acceler": 19, "accept": [5, 6, 19, 23], "acceptance_prob": 19, "access": [4, 7, 8, 9, 10, 12, 18, 21], "accord": [6, 23], "account": [13, 16], "accur": [12, 18], "achiev": [5, 10], "across": [13, 16], "act": 18, "action": 7, "activ": 7, "actual": [3, 4, 5, 6, 8, 9, 10, 12, 16, 22], "ad": [7, 8, 9, 10, 13, 23], "add": [4, 6, 7, 8, 9, 10, 13, 16, 18, 19], "addgrid": 12, "addit": [3, 4, 7, 8, 9, 10, 15, 17, 20], "address_mod": 23, "addtion": [8, 9, 10], "adjac": 19, "adjust": [4, 7], "advanc": [6, 22, 23], "advantag": 18, "advis": 6, "affect": [8, 9, 10, 16], "after": [4, 5, 6, 7, 8, 9, 10, 13, 15, 16, 18, 23], "after_finish": 18, "after_start": 18, "again": [4, 7, 8, 9, 10, 12, 16], "against": [5, 6, 7], 
"aggreg": 18, "algebra": 16, "algorithm": [11, 14, 19, 23], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23], "allclos": [5, 23], "alloc": [4, 6, 8, 9, 10, 11, 13, 23], "allow": [1, 4, 5, 6, 7, 8, 9, 10, 16, 17, 18, 19, 22, 23], "allow_nan": 6, "almost": [5, 8, 9, 10, 18], "along": [4, 6, 15, 20, 24], "alpha": 19, "alreadi": [4, 6, 7, 8, 9, 10, 15, 16, 23], "alright": 20, "also": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "altern": [15, 23], "although": 5, "alwai": [4, 6, 8, 9, 10], "amd": [15, 18], "among": [8, 9, 10, 14, 19], "amount": [4, 8, 9, 10, 16, 17, 23], "amper": 18, "an": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "anaconda": 7, "analysi": [8, 9, 14], "analyz": [8, 9, 10], "ani": [1, 3, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 19, 21, 22, 23, 24], "anneal": [19, 23], "anoth": [0, 8, 9, 10, 13, 16, 17, 19, 23], "answer": [4, 5, 6, 8, 9, 10, 11, 23], "anyth": 4, "api": [2, 4, 6], "app": 15, "append": [1, 6, 15, 23], "appl": 15, "appli": [7, 8, 9, 10], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 21, 22, 23], "approach": 18, "appropi": [8, 9, 10], "approx": [8, 9, 10], "approxim": [8, 9, 10], "apt": 7, "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "arbitrari": 6, "arch": 8, "architectur": [6, 18], "arduino": 18, "area": [8, 9, 10, 16], "arg": [5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 21, 22], "argument": [0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23], "arithmet": [8, 9, 10, 23], "around": [4, 11], "arrai": [0, 4, 5, 6, 8, 9, 10, 12, 13, 20, 21, 23], "articl": [14, 20], "arxiv": 14, "assign": 12, "associ": 12, "assum": [4, 6, 8, 9, 10, 16, 23], "assumpt": [8, 9, 10], "astron": 18, "astyp": [4, 5, 8, 9, 10, 12, 13, 14, 16, 20, 22], "asynchron": 6, "atol": [5, 6, 23], "attempt": [6, 22], "attract": 19, "author": 14, "auto": [14, 16, 18, 19, 22, 23, 24], "auto_activate_bas": 7, "autoinit": [10, 12], "autom": 7, "automat": [0, 4, 7, 8, 9, 10, 12, 13, 16, 22, 23], "auxilliari": 23, "avail": [4, 7, 8, 9, 10, 11, 12, 15, 18], "averag": [6, 8, 9, 10, 13, 18], "avoid": [4, 6, 16, 24], "ax1": [8, 9, 10], "ax2": [8, 9, 10], "axesimag": [8, 9, 10], "axi": 23, "b": [12, 14, 16, 19, 20, 22], "b0": 19, "b_d": 9, "back": [13, 23], "backend": [2, 7, 13, 18], "backward": 6, "bandwidth": 16, "barrier": 9, "base": [0, 6, 7, 17, 18, 22, 23], "bash": [7, 15], "bash_profil": 7, "bashrc": 7, "basic": [4, 6, 8, 9, 10, 20], "basin": [19, 23], "basinhop": 23, "batenburg": 14, "bayes_opt": 23, "bayesian": [14, 19, 23], "becaus": [0, 4, 5, 8, 9, 10, 13, 15, 16, 17, 22, 24], "becom": [8, 9, 10, 18, 19], "been": [4, 6, 7, 8, 9, 10, 13, 16, 19], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 19, 23], "before_start": 18, "begin": [4, 8, 9, 10, 12], "behavior": [4, 16, 18, 23], "behaviour": 6, "behind": 13, "beignet": 15, "being": [6, 8, 9, 10, 16, 18, 19, 23], "below": [0, 7, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21], "ben": 14, "benchmark": [0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 24], "benchmark_continu": 6, "benchmark_default": 6, "benchmarkobserv": 18, "benefit": 16, "benvanwerkhoven": 15, "best": [6, 8, 9, 12, 16, 19, 22, 23, 24], "best1bin": 19, "best1exp": 19, "best2bin": 19, "best2exp": 19, "best_tim": [8, 9], "beta": [12, 19], "better": [3, 7, 8, 9, 10], "between": [0, 8, 9, 10, 13, 15, 16, 17, 19, 23], "beyond": [8, 9, 10, 23], "bf": 21, "bfg": 19, "bind": [15, 18], "biologi": [8, 9, 10], "bit": [4, 6, 8, 9, 
10, 12, 13, 16], "block": [0, 4, 6, 8, 9, 10, 11, 12, 15, 16, 17, 20, 23, 24], "block_size_": 24, "block_size_i": [4, 5, 8, 9, 10, 12, 13, 16, 23], "block_size_nam": [4, 6, 23], "block_size_str": [8, 10], "block_size_x": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 20, 22, 23], "block_size_z": [4, 8, 9, 10, 12, 23], "blockdim": [4, 20, 23], "blockidx": [4, 8, 9, 10, 12, 14, 16, 20, 22], "boilerpl": [8, 9, 10], "bool": [6, 21, 23], "boolean": [17, 18, 23], "border": [13, 23], "border_s": 4, "both": [7, 8, 9, 10, 11, 16], "bottom": 6, "bound": [4, 6, 16, 19], "boundari": [8, 9, 10], "bracket": 6, "bram": 14, "branch": [3, 7], "break": [7, 22], "brew": 7, "briefli": 16, "brows": 7, "brute": [17, 19, 20], "brute_forc": [6, 23], "buffer": [6, 9, 21], "build": [3, 6, 8, 9, 10], "built": [7, 18, 19, 21, 23], "bulk": [8, 9, 10], "bx": [8, 9, 10, 12], "bypass": 10, "byte": [6, 21, 23], "bz": 12, "c": [0, 3, 4, 6, 7, 11, 13, 14, 15, 16, 20, 22, 23], "c1": 19, "c2": 19, "c_arg": 6, "cach": [2, 6, 7, 8, 9, 10, 15, 16, 19, 23], "cachefil": [6, 23], "calcul": [6, 12], "call": [1, 4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23], "callabl": [5, 6, 23], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "cannot": [0, 7, 8, 9, 10, 18], "cap": 18, "capabl": [6, 7, 8, 9, 14, 16, 23], "care": [8, 9, 10], "cartesian": [4, 12], "case": [0, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 20, 21, 23], "cat": 7, "caus": [0, 8, 9, 10], "cc": 8, "cd": [3, 7, 15], "ceil": [10, 12], "cell": [4, 8, 9, 10, 12, 16], "center": [11, 12], "central": [8, 9, 10], "certain": [4, 6, 8, 9, 10, 11, 18, 24], "cfunction": 6, "cg": 19, "chanc": [15, 19, 22], "chang": [3, 7, 12, 18, 23], "changelog": 3, "check": [0, 5, 6, 7, 8, 9, 10, 13, 16], "check_argument_list": 6, "check_argument_typ": 6, "check_circular": 6, "check_kernel_output": 6, "check_restrict": 6, "check_stop_criterion": 6, "check_thread_block_dimens": 6, "check_tune_params_list": 6, "chemistri": [8, 9, 10], "children": 19, "choic": [13, 15], "choos": [0, 8, 9, 10, 16, 19, 23], "chosen": 23, "chunk": 13, "circumst": 4, "cite": 14, "cl": 9, "clamp": 23, "clarifi": 13, "class": [6, 18, 19], "clean": [11, 16], "cleaner": [8, 9, 10], "cleanup": 8, "cleanup_lib": 6, "clk_local_mem_f": 9, "clock": [18, 24], "clone": [3, 4, 7, 8, 9, 10, 12, 15, 16], "close": [6, 8, 9, 10], "closer": [8, 9, 10], "closest": 6, "cmem_arg": [5, 6, 23], "cobyla": 19, "code": [0, 2, 4, 6, 7, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "coeffici": 19, "cognit": 19, "collabor": 16, "collect": [4, 6, 8, 9, 10, 12, 16, 18, 21], "color": [8, 9, 10], "column": 16, "com": [0, 3, 6, 7, 14, 15], "combin": [4, 6, 7, 8, 9, 10, 11, 12, 16, 18, 19, 20, 23], "come": [1, 6, 8, 9, 10, 16, 18, 22], "command": [7, 15], "commandqueu": 9, "commit": 7, "common": [18, 22], "commonli": [4, 8, 9, 10, 15, 16], "commun": [8, 9, 10], "compact": 6, "compar": [0, 4, 5, 8, 9, 10, 12, 16, 17, 18], "comparison": [5, 14], "compat": [6, 7, 15], "compil": [3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 22, 23, 24], "compilationfailedconfig": 6, "compile_kernel": 6, "compile_restrict": 6, "compiler_opt": [6, 23], "compiler_opt_": 24, "complain": 6, "complet": [0, 1, 4], "complex": [13, 16], "compos": [4, 6, 16, 17], "comprehens": 15, "comput": [4, 5, 6, 11, 12, 13, 14, 16, 19, 23], "compute_cap": 6, "compute_capability_major": 8, "compute_capability_minor": 8, "compute_grid": 12, "compute_intens": 19, "concentr": [8, 9, 10], "concept": [8, 9, 10], "conda": 7, "condarc": 7, "condens": [8, 9, 10], 
"condit": [8, 9, 10, 16], "config": [6, 7], "config_valid": 6, "configur": [1, 4, 6, 8, 9, 10, 11, 12, 16, 17, 18, 19, 23], "confus": 4, "connect": 18, "consid": [3, 12, 14, 16, 23], "consist": [11, 16, 23], "consol": 23, "constant": [0, 4, 6, 8, 9, 10, 11, 13, 16, 19, 23], "constr": 19, "constraint": 6, "construct": [5, 16], "consult": 21, "consumpt": [14, 16, 18], "contact": 11, "contain": [0, 1, 4, 6, 8, 9, 10, 12, 13, 16, 18, 19, 22, 23], "contant": 16, "content": [4, 6, 23], "context": [6, 8, 10, 12], "continous_dur": 18, "continu": [4, 6, 7, 8, 9, 10, 15, 18, 19, 23], "continuous_dur": 18, "continuum": 15, "contrast": 4, "contribut": [2, 7], "control": [0, 8, 9, 10, 18, 19, 23], "conv_filt": 4, "conveni": [7, 8, 9, 10, 13, 23], "convent": [6, 13, 23], "convert": [6, 8, 9], "convert_constraint_restrict": 6, "convolut": [2, 5, 13, 16], "convolution_correct": 5, "convolution_kernel": [4, 5], "convolution_na": [4, 5], "convolution_stream": 13, "cooler": [8, 9, 10], "coordin": 12, "copi": [6, 8, 9, 10, 13, 20, 23], "copy_constant_memory_arg": 6, "copy_host_ptr": 9, "copy_shared_memory_arg": 6, "copy_texture_memory_arg": 6, "core": [7, 18], "core_freq": [18, 24], "correct": [2, 7, 13, 21, 23], "correct_open_cach": 6, "correctli": [7, 16], "correspond": [4, 7, 8, 9, 10, 12, 18, 19, 20], "correspondingli": 4, "cost": [8, 9, 10, 19], "could": [4, 5, 6, 8, 9, 10, 13, 15, 16, 18, 19, 22, 23], "count": 19, "counter": [7, 18], "coupl": 16, "cours": [4, 8, 9, 10, 15, 16], "cover": [8, 9, 10, 19], "coverag": 7, "cpath": 7, "cpu": [5, 9, 10, 13], "cpu_grid": 12, "cpu_result": 5, "creat": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 16, 18, 20, 21, 23], "create_device_target": 23, "create_kernel_inst": 6, "create_receive_spec_struct": 21, "create_some_context": 9, "creation": [4, 14, 19], "criterion": [6, 19], "crossov": 19, "csv": [8, 9, 11], "ctx": 9, "ctype": 6, "cu": [4, 5, 13, 16, 20, 22], "cub": 11, "cuda": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 20, 21, 22, 23], "cuda11x": 7, "cuda12x": 7, "cuda_error_check": 6, "cudamemcpytosymbol": 13, "cudastreamwaitev": 13, "cudeviceptr": 6, "cufunct": 6, "cupi": [0, 7, 13, 15, 18, 22, 23], "curl": [7, 15], "current": [4, 5, 6, 7, 8, 9, 10, 15, 16, 18, 19, 23], "current_modul": 18, "current_problem_s": 6, "custom": [5, 11, 17, 18, 21], "d": [8, 9, 10, 12, 19, 20], "d_filter": 5, "dashboard": [1, 14], "data": [4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 20, 21, 23], "datafram": [8, 9], "date": 3, "debug": 6, "decreas": [4, 16], "deep": 4, "def": [5, 6, 8, 9, 10, 12, 18, 21], "default": [0, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 19, 22, 23], "defin": [4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 22, 23], "definit": [4, 12, 23], "degrad": [8, 9, 10], "degre": [8, 9, 10], "delet": 6, "delete_temp_fil": 6, "delta": [8, 9, 10], "delv": 12, "demonstr": [5, 10, 11, 16], "demot": 22, "denorm": 19, "denot": [16, 20, 23], "depend": [3, 4, 5, 7, 10, 11, 12, 14, 17, 23], "deriv": [4, 6, 8, 9, 10, 17], "descret": [8, 9, 10], "describ": [3, 4, 6, 7, 13, 18, 21], "design": [2, 3, 8, 9, 10, 18], "desir": 7, "dest": 6, "detail": [0, 6, 15, 23], "detect": [6, 19, 22, 23], "detect_languag": 6, "determin": [4, 8, 9, 10, 12, 18, 19], "dev": [7, 15, 18], "develop": [2, 6, 11, 14, 15], "devic": [4, 5, 7, 8, 9, 10, 11, 13, 18, 22, 23], "device_nam": [6, 23], "device_opt": 6, "devicealloc": 6, "devprop": 8, "df": [8, 9], "dict": [4, 5, 6, 10, 13, 14, 18, 19, 20, 22, 23], "dictionari": [4, 6, 8, 9, 10, 12, 16, 18, 19, 20, 23], "did": [4, 8, 9, 10, 16], "diff_evo": 23, "differ": [0, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 16, 17, 18, 19, 23], "differenti": [15, 19, 23], "difficult": [8, 9, 10, 21, 22], "difficulti": 14, "diffus": 2, "diffuse_kernel": [8, 9, 10], "dim": 6, "dimens": [4, 6, 8, 9, 10, 11, 12, 13, 16, 17, 19, 20, 23, 24], "dimension": [11, 12, 23], "dir": [7, 15], "direct": [8, 9, 10, 13, 16, 17, 23], "directli": [6, 8, 9, 10, 13, 16, 18, 22, 23], "directori": [4, 7, 8, 9, 10, 12, 15, 16], "disabl": 7, "discontinu": 16, "discuss": [3, 6], "disk": 7, "diskquota": 7, "diskspac": 7, "displai": 4, "disrupt": 19, "disruptive_uniform": 19, "disruptive_uniform_crossov": 19, "distanc": [8, 9, 10, 19], "distance_to": 19, "distant": [8, 9, 10], "distinct": 16, "distribut": [12, 16], "divid": [4, 8, 9, 10, 12, 13, 16, 23], "divison": 23, "divisor": [4, 6, 8, 9, 10, 16, 23], "dna": 19, "dna1": 19, "dna2": 19, "do": [3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 23], "doc": [3, 4, 6, 7, 8, 9, 10, 12, 15, 16], "docstr": [3, 6], "document": [3, 4, 5, 8, 9, 10, 12, 15, 16, 21, 24], "doe": [5, 6, 7, 8, 9, 10, 12, 13, 16, 18, 22, 23], "doi": 14, "domain": [4, 8, 9, 10, 11, 12, 23], "don": [6, 7, 8, 10, 12, 13, 23], "done": [0, 4, 15, 17, 18], "doubl": [8, 9, 10, 21, 22], "doubt": 3, "down": 16, "download": 15, "downsid": 18, "dramat": 16, "drastic": 16, "driver": [6, 7, 8, 10, 12], "drv": 8, "dry": 7, "dt": [8, 9, 10], "dtarget_gpu": 23, "dtype": [6, 21], "dual": [19, 23], "due": [16, 22, 23], "dump": [6, 8, 9], "dump_cach": 6, "durat": [6, 18], "dure": [0, 1, 6, 8, 9, 10, 12, 18, 23], "dx": 12, "dy": 12, "dynam": [0, 6, 23], "dz": 12, "e": [3, 7, 15, 17, 18, 19, 23], "each": [4, 5, 6, 7, 8, 9, 12, 16, 18, 19, 23], "earlier": [6, 8, 9, 10, 12], "easi": [8, 9, 17, 18, 23], "easiest": 14, "easili": [8, 9, 18], "effect": [4, 7, 8, 9, 10, 23], "effici": [14, 16, 18], "either": [6, 12, 19, 22, 23], "element": [5, 8, 9, 10, 16, 17, 20, 21, 23], "ellipsi": 4, "els": 6, "elsewher": 7, "empti": 23, "emtpi": 23, "enabl": [1, 18, 19, 21, 22], "end": [4, 6, 7, 8, 9, 10, 12, 16, 18, 19, 21], "endif": 10, "energi": [7, 14, 18, 19, 24], "enough": [3, 4, 5, 16], "enqueue_copi": 9, "ensur": [3, 5, 8, 9, 10, 13, 15, 18, 21], "ensure_ascii": 6, "enter": [4, 7, 8, 9, 10, 12, 16], "entir": [0, 6, 7, 8, 9, 10, 16, 19, 23], "entri": [3, 6, 8, 9], "env": [4, 6, 19, 20, 23], "envdir": 7, "environ": [1, 2, 3, 4, 6, 15, 19, 23], "envs_dir": 7, "ep": [6, 19], "equal": [8, 9, 10, 16, 23], "equat": [4, 8, 9, 10, 12, 19], "equi": [8, 9, 10], "error": [3, 4, 5, 6, 13, 16, 22], "errorconfig": 6, "especi": 7, "essenti": [4, 7], "estim": [8, 9, 10], "euclidian": 19, "evalu": [8, 9, 10, 16, 19, 23], "even": [1, 7, 8, 9, 10, 13, 16, 19], "event": [6, 8, 13, 18], "everi": [4, 5, 8, 9, 10, 11, 18, 20], "everyth": [4, 6, 8, 9, 10], "everywher": [8, 9, 10], "evolut": [19, 23], "evolutionari": 14, "exactli": [4, 6, 8, 9, 10, 16, 18], "exampl": [2, 3, 5, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 20, 21, 23], "exce": 19, "exceed": 6, "except": [6, 7, 11], "exchang": [8, 9, 10], "execut": [4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 19, 23], "exhaust": 17, "exist": [1, 6, 23], "exit": 7, "exlicitli": 12, "exp": 12, "expand": [4, 14, 16, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 10, 16, 18, 23], "experi": 4, "experiment": 21, "explain": [4, 6, 8, 9, 10, 13, 15, 16, 17, 20, 22, 23], "explor": 12, "export": 7, "expos": 6, "express": [6, 8, 9, 10, 11, 13, 16, 23], "extend": 18, "extens": 6, "extern": [0, 18, 22], "extra": [7, 15, 22], "extract": 12, "f": [4, 5, 12, 13, 21], "f_h": 4, "f_w": 4, "facilit": 18, "fact": [8, 9, 10, 13], "factor": [4, 8, 9, 10, 
11, 12, 16, 24], "fail": [4, 6, 15, 23], "fals": [6, 7, 18, 19, 23], "famili": 14, "familiar": [4, 16], "far": [4, 8, 9, 10, 16, 20], "fast": [5, 8, 9, 10], "faster": [8, 9, 10, 16], "fc": 21, "featur": [1, 4, 5, 11, 15, 17, 18, 20, 22, 23], "feel": [8, 9, 10], "few": [4, 8, 9, 10, 12, 13, 22], "fewer": [4, 8, 9, 10], "fffi": 21, "field": [5, 8, 9, 10], "field_copi": [8, 9], "fifth": 16, "fig": [8, 9, 10], "figur": 16, "file": [0, 2, 4, 6, 7, 8, 9, 11, 13, 16, 19, 20, 22, 23], "filenam": [1, 4, 6, 11, 16, 20, 23], "fill": [6, 16], "filter": [4, 5, 11, 13], "filter_height": 4, "filter_heigth": 4, "filter_mod": 23, "filter_s": 4, "filter_width": 4, "final": [4, 5, 8, 9, 10, 12], "find": [4, 13, 16, 19, 23], "fine": [8, 9, 10], "finish": [4, 6, 9, 12, 13, 18], "firefli": [19, 23], "firefly_algorithm": 23, "first": [3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23], "first_kernel": 5, "fit": [13, 19], "five": [4, 6, 20], "fix": [8, 9, 10, 19, 23], "fixed_param": [10, 12], "flat": 6, "flexibl": [5, 8, 9, 16], "float": [4, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23], "float32": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 20, 22, 23], "flori": 14, "fly": [8, 9, 10], "folder": 7, "follow": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 22, 23], "forbidden": 6, "forc": [17, 19, 20, 22], "foreseen": 6, "forg": 7, "forget": [7, 12], "fork": 3, "form": [6, 16, 18, 19], "format": [6, 8, 9, 21], "formula": [8, 9, 10], "fortran": [6, 11, 22], "fortun": 16, "found": [4, 6, 7, 14, 18, 19], "four": [8, 9, 10], "fourth": 16, "fp": [8, 9], "frac": [8, 9, 10], "fraction": 19, "free": [4, 8, 9, 10, 13, 15, 16], "freeli": 4, "frequenc": 18, "frequent": 16, "friendli": 18, "from": [1, 4, 5, 6, 7, 8, 11, 12, 13, 15, 16, 18, 19, 21, 22, 23], "frombuff": 21, "fsiq": 21, "full": [1, 3, 6, 7, 18, 20], "fulli": [0, 7, 15], "fun": 19, "func": [6, 18, 23], "function": [3, 4, 5, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23], "further": [0, 8, 9, 10, 15, 16], "futur": [6, 14, 23, 24], "g": [7, 15, 17, 18], "gamma": 19, "gaussian": 12, "gcc": 6, "geforc": [8, 9, 10, 12], "gene": 19, "gener": [0, 4, 6, 7, 8, 9, 10, 14, 16, 18, 19, 21, 23, 24], "generate_normalized_param_dict": 19, "genet": [19, 23], "genetic_algorithm": 23, "get": [2, 4, 6, 8, 9, 10, 12, 15, 16], "get_attribut": 8, "get_best_config": 6, "get_config_str": 6, "get_devic": 8, "get_environ": 6, "get_funct": [8, 10, 12], "get_grid_dimens": 6, "get_group_id": 9, "get_initial_condit": [8, 9, 10], "get_instance_str": 6, "get_kernel_str": [6, 8, 9, 10], "get_local_id": 9, "get_local_s": 23, "get_opt": 6, "get_problem_s": 6, "get_result": 18, "get_smem_arg": 6, "get_strategy_docstr": 6, "get_temp_filenam": 6, "get_thread_block_dimens": 6, "get_total_tim": 6, "gflop": [4, 6, 11, 16, 17], "giga": [4, 16], "gigabyt": 7, "git": [3, 7, 18], "github": [0, 3, 4, 7, 8, 9, 10, 12, 15, 16], "give": [0, 8, 9, 10, 19], "given": [6, 8, 9, 10, 12, 18, 19, 23], "global": [6, 7, 8, 9, 10, 19], "go": [4, 7, 8, 9, 10, 12, 14, 15, 16, 20], "goe": 16, "good": [5, 8, 9, 10, 24], "googl": 3, "got": [8, 9, 10], "gpu": [0, 3, 4, 5, 6, 7, 11, 13, 14, 16, 18, 20, 21, 23, 24], "gpu_arg": 6, "gpu_result": [5, 8, 10], "gpuarrai": [10, 12], "gr_voltag": 18, "gracefulli": 7, "grain": [8, 9, 10], "graphic": [18, 24], "great": [6, 8, 9, 10, 20], "greedi": [19, 23], "greedy": 19, "greedy_il": 23, "greedy_ml": 23, "green": 14, "grep": 7, "grid": [4, 6, 8, 9, 10, 11, 13, 16, 23, 24], "grid_dim": 12, "grid_div": 6, "grid_div_i": [4, 5, 8, 9, 10, 13, 16, 23], "grid_div_x": [4, 5, 
8, 9, 10, 13, 16, 23], "grid_div_z": 23, "grid_gpu": 12, "grid_size_": 24, "grid_size_i": 13, "grid_size_x": 13, "group": [6, 8, 9, 10, 23], "group__opt": 6, "grow": [8, 9, 10], "gt": [8, 9, 10], "gtx": [8, 9, 10, 12], "guarante": 19, "guess": [8, 9, 10], "guid": [4, 8, 16, 17, 20], "h": [4, 12, 23], "ha": [0, 4, 6, 7, 8, 9, 10, 13, 16, 18, 19, 23], "had": [1, 4], "half": [8, 9, 10], "halt": [6, 13], "ham": 19, "hand": [12, 16], "handl": [0, 13, 23], "happen": [0, 1, 3, 4, 16, 20], "hardwar": [3, 7, 8, 9, 10, 12, 18, 19, 20], "have": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 22, 23, 24], "haven": [4, 15], "header": [0, 23], "header_filenam": 23, "heat": [8, 9, 10], "height": 16, "help": [3, 7, 22], "helper": [6, 18], "henc": [12, 21], "here": [0, 4, 11, 12, 13, 15, 16, 18, 23], "high": [6, 8, 9, 10, 14, 16, 18], "highli": [4, 14, 16], "highlight": 11, "hillclimb": 19, "hip": [0, 3, 7, 14, 23], "hiprtc": 0, "hold": [7, 8, 9, 16, 20, 21, 23], "home": 15, "hook": 18, "hop": [19, 23], "host": [2, 6, 7, 9, 10, 11, 18, 21, 22, 23], "hostbuf": 9, "hot": [8, 9, 10], "hotspot": [8, 9, 10], "how": [0, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 20, 21, 22, 23], "howev": [4, 5, 8, 9, 10, 13, 15, 16, 18, 21, 22, 23], "hpc": 1, "html": [3, 6, 7], "http": [0, 6, 7, 14, 15, 18], "hyperparamet": 19, "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "i_like_convolut": 4, "id": [6, 7, 18], "idea": [8, 9, 10, 13, 16, 24], "ident": 0, "identifi": 18, "ieee": 14, "ifndef": 10, "ignor": [4, 6, 8, 9, 10, 23], "iiiiiiiiiiippi": 21, "il": 19, "illeg": 4, "illustr": 11, "imag": [4, 8, 9, 10], "image_height": 4, "image_width": 4, "impact": [8, 9, 10, 13], "implement": [0, 5, 6, 11, 12, 17, 18, 19, 23], "import": [0, 4, 5, 8, 9, 10, 12, 15, 16, 17, 20, 21, 22], "importantli": [8, 9, 10], "impos": 16, "improv": [3, 6, 8, 9, 10, 16, 19, 23], "imshow": [8, 9, 10], "includ": [0, 3, 4, 5, 8, 9, 10, 12, 13, 15, 16, 18, 22, 23], "incorpor": 10, "increas": [4, 8, 9, 10, 18], "indent": 6, "independ": 16, "index": [6, 19], "indic": [4, 19, 24], "individu": [1, 18, 19], "ineffici": 21, "inertia": 19, "influenc": 4, "info": 23, "inform": [4, 6, 7, 8, 9, 10, 14, 18, 19, 20, 23, 24], "init": 8, "initi": [8, 9, 10, 21], "inlin": [8, 9, 10], "inner": 16, "input": [0, 4, 5, 8, 9, 10, 11, 13, 16, 17, 20, 21, 23], "input_imag": 4, "input_s": [4, 5, 13], "input_width": 4, "insert": [4, 5, 6, 10, 12, 13, 16, 20, 22, 23, 24], "insid": [8, 9, 10, 13, 16, 22, 23], "inspect": [6, 7, 18], "instal": [2, 3, 4, 7, 8, 9, 10, 12, 13, 16, 18, 20], "instanc": [5, 6, 8, 9, 10, 13, 18, 23], "instant": [8, 9, 10, 12], "instantan": 18, "instanti": [6, 22], "instead": [4, 6, 11, 16, 23], "instruct": [3, 7, 8, 9, 10, 11, 15, 16], "int": [4, 6, 8, 9, 10, 12, 14, 16, 20, 22, 23], "int32": [6, 14, 20, 22, 23], "integ": [6, 18, 21, 23], "integr": [7, 22], "intel": 15, "intend": 14, "intens": 19, "interact": [6, 18], "intercept": 18, "interest": [4, 11, 21], "interfac": [0, 4, 5, 13, 15, 18, 19, 21, 23], "intermedi": [8, 9, 10], "intern": [6, 14, 19, 22], "interpret": 4, "interv": 12, "intricaci": 18, "introduc": [8, 9, 10, 16, 18], "introduct": 2, "invalidconfig": 6, "invers": 6, "invok": 7, "involv": 7, "io": 15, "isclos": 5, "isiq": 21, "isol": [7, 22], "issu": [0, 7, 21], "item": [6, 8, 9, 10, 12], "iter": [6, 8, 9, 10, 12, 16, 18, 19, 20, 23], "iterfac": 6, "its": [5, 6, 8, 9, 10, 12, 14, 15, 16, 17, 18, 23], "itself": [13, 14, 23], "j": [4, 8, 9, 10, 14, 16], "jan": 14, "jatinx": [0, 15], 
"jetson": 18, "job": 1, "join": [8, 9], "joost": 14, "joul": [18, 24], "journal": 14, "json": [6, 8, 9, 11, 23], "jsonencod": 6, "jupyt": [4, 8, 9, 10, 12, 15, 16], "just": [4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16], "k": [8, 9, 10, 12, 14, 16, 20], "kb": 16, "keep": [8, 9, 10, 16, 21], "kei": [6, 8, 9, 10, 16, 19, 20, 23], "kepler": 23, "kerenel": 10, "kernel": [0, 1, 3, 4, 5, 6, 7, 13, 15, 17, 18, 19, 20, 21, 23, 24], "kernel_argu": 6, "kernel_cod": 12, "kernel_finish": 6, "kernel_inst": 6, "kernel_nam": [4, 6, 13, 21, 22, 23], "kernel_opt": 6, "kernel_sourc": [4, 6, 21, 23], "kernel_src": 9, "kernel_str": [4, 5, 6, 8, 9, 10, 13, 14, 19, 23], "kernel_string_shar": 10, "kernel_string_til": 10, "kernel_tun": [3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 20, 21, 22, 23, 24], "kernelinst": 6, "kernelsourc": 6, "kerneltun": [3, 7, 14], "keyr": 7, "keyword": 6, "khz": 18, "know": [4, 8, 9, 10, 16, 17], "known": [7, 16], "kt": [14, 21], "l": 19, "l1": [8, 9, 10], "l2": [8, 9, 10], "la": 12, "lambda": [4, 6, 8, 9, 16, 17, 23], "lambdatyp": 6, "lang": [0, 6, 11, 13, 22, 23], "languag": [0, 6, 10, 13, 16, 21, 23], "larg": [6, 8, 9, 10, 12, 23], "larger": [3, 8, 9, 10, 13, 19, 22], "last": [6, 7, 21], "later": [8, 9, 10, 12, 23], "latest": [7, 15], "latter": 13, "launch": [6, 8, 9, 10, 13, 18, 23], "launcher": 14, "layer": 18, "ld_libary_path": 7, "lead": 21, "leak": 6, "learn": 4, "least": [6, 7], "leav": 7, "left": [6, 7, 8, 9, 10, 12, 17], "len": 21, "length": 21, "let": [4, 6, 8, 9, 10, 20, 22], "level": [6, 7, 18], "libbz2": 7, "libffi": 7, "libgdbm": 7, "liblzma": 7, "libncurses5": 7, "libncursesw5": 7, "libnss3": 7, "librari": [6, 11, 18, 21], "libreadlin": 7, "libsqlite3": 7, "libssl": 7, "light": 19, "like": [4, 6, 7, 8, 9, 10, 11, 12, 16, 19, 20, 21, 22, 23], "likewis": [8, 9, 10], "limit": [0, 4, 6, 7, 8, 9, 10, 11, 16, 18, 19, 22, 23, 24], "limits_": 4, "linalg": 12, "line": [4, 7, 8, 9, 10], "linear": [4, 16, 23], "linkag": [0, 22], "linspac": 12, "linux": [7, 15], "list": [0, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23], "littl": [4, 8, 9, 10, 16], "ll": [0, 4, 8, 9, 10, 15, 16], "llvm": 7, "load": [6, 7], "local": [3, 19, 23], "locat": [5, 7, 12, 18], "lock": [11, 18], "log": 23, "long": [4, 8, 9, 10, 12, 13, 16, 21], "longer": [4, 6, 7, 17], "look": [3, 4, 6, 8, 9, 10, 12, 15, 16, 22], "looks_like_a_filenam": 6, "lookup": 6, "loop": [8, 9, 10, 11, 16, 24], "loop_unroll_factor_": 24, "loss": [8, 9, 10], "lot": [4, 8, 9, 10, 16, 18, 20, 21, 23], "low": [6, 8, 9, 10, 16], "lower": [12, 18, 19], "lt": [8, 9, 10], "lzma": 7, "m": [7, 8, 9, 10, 12], "mac": 7, "maco": 7, "macro": 6, "made": 6, "mai": [0, 4, 5, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 20, 21, 23], "main": [4, 6, 12, 18, 20], "maintain": 6, "make": [3, 4, 7, 8, 9, 10, 12, 14, 15, 16, 18, 21, 22], "make_context": 8, "make_strategy_options_doc": 6, "mamba": 7, "manag": [8, 9, 10, 16, 18], "mandatori": 18, "mani": [4, 6, 8, 9, 10, 16, 17, 18, 19, 23], "manual": [12, 15], "map": [5, 11, 12], "mark": 6, "master": 7, "match": [4, 5, 6], "math": 12, "matlab": 22, "matmul": 16, "matmul_kernel": 16, "matmul_na": 16, "matmul_shar": 16, "matplotlib": [8, 9, 10, 15], "matric": 16, "matrix": 2, "matter": [8, 9, 10, 13], "max": 6, "max_fev": [6, 19, 23], "max_thread": 6, "maxim": [17, 23], "maximum": [5, 6, 12, 19, 23], "maxit": 19, "md": 3, "mead": 19, "mean": [4, 13, 16, 17, 19, 21, 22, 24], "meant": 4, "measur": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 23, 24], "mechan": 19, "meet": 6, "melt": [8, 9, 10], "mem": 6, 
"mem_alloc": 8, "mem_flag": 9, "mem_freq": [18, 24], "memcpi": [6, 13], "memcpy_dtoh": [6, 8], "memcpy_htod": [6, 8], "memori": [0, 4, 6, 11, 13, 18, 21, 23, 24], "memset": 6, "mention": 12, "merg": [8, 9, 10, 16], "meshgrid": 12, "messi": [8, 9, 10], "metal": [8, 9, 10], "meter": 18, "method": [6, 8, 9, 10, 13, 16, 18, 19], "metric": [2, 4, 6, 11, 16, 23], "mf": 9, "middl": 12, "might": [12, 17], "millisecond": [8, 9, 10], "mimick": 4, "min": [8, 9], "mind": [8, 9, 10], "miniconda": [7, 15], "miniconda3": 15, "minim": [3, 7, 17, 22, 23], "minimum": 12, "mirror": 23, "miss": [6, 7, 23], "ml": 23, "mnrow": 21, "mnrowsiq": 21, "mod": [10, 12], "mode": 18, "model": [8, 9, 10, 14], "modif": [19, 21], "modifi": [10, 18], "modul": [3, 6, 7, 13, 18], "moment": [8, 9, 10, 23], "monitor": 18, "monolith": 6, "more": [0, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20, 21, 23], "mostli": [6, 14, 23], "motion": [8, 9, 10], "move": [4, 6, 8, 13, 16, 19, 23], "move_toward": 19, "much": [4, 8, 9, 10, 12, 18, 22, 23], "multi": [19, 23], "multipl": [0, 2, 6, 7, 13, 18, 22, 23], "multiprocessor": [8, 9, 10], "must": [6, 17, 23], "mutat": 19, "mutation_ch": 19, "my_typ": 22, "n": [5, 7, 8, 9, 10, 12, 13, 14, 16, 19, 20, 22], "nactivechannel": 21, "naiv": [4, 5, 8, 9, 10], "name": [4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 19, 20, 23, 24], "name_of_gpu": 23, "namelijk": 16, "nativ": 15, "navig": 15, "nbuffer": 21, "nbyte": 8, "nchannel": 21, "ndarrai": [6, 12], "ndrang": 6, "nearest": [6, 23], "necessari": [5, 6, 7, 8, 9, 10, 23], "necessarili": [5, 13], "need": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 22, 23], "neighbor": 19, "nelder": 19, "net": 15, "network": 4, "neural": 4, "new": [1, 3, 6, 7, 8, 9, 10, 19, 23], "new_cost": 19, "newer": [15, 18], "newli": 16, "next": [8, 9, 10, 16, 21], "nfasttimesampl": 21, "nice": [8, 9], "nieuwpoort": 14, "nl": 18, "no_improv": 19, "node": [7, 19], "non": [5, 7], "none": [5, 6, 18, 19, 23], "nonumb": [4, 12], "normal": [6, 19, 23], "normalize_dict": 19, "normalize_parameter_spac": 19, "normalize_verify_funct": 6, "normalized_coordin": 23, "notat": 23, "note": [4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 21, 23], "notebook": [4, 8, 9, 10, 12, 15, 16], "notic": [4, 8, 9, 10], "now": [4, 6, 8, 9, 10, 12, 13, 16, 20], "nox": [3, 7], "noxset": 7, "np": [4, 6, 12, 16, 20, 21], "npencod": 6, "npt": 12, "nrepeat": 21, "nsampl": 21, "nsamplesiq": 21, "nslowtimesampl": 21, "ntx": 21, "num_reg": 18, "num_stream": 13, "number": [1, 4, 5, 6, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 24], "numer": [8, 9, 10], "numpi": [0, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 16, 20, 21, 22, 23], "nv": 0, "nvcc": [0, 6], "nvcuda": 0, "nvidia": [0, 6, 7, 15, 16, 18, 22], "nvidia_smi_fallback": 18, "nvml": [6, 24], "nvml_": 24, "nvml_energi": [18, 24], "nvml_gr_clock": [18, 24], "nvml_mem_clock": [18, 24], "nvml_power": [18, 24], "nvml_pwr_limit": [18, 24], "nvmlobserv": 24, "nvrtc": [0, 6, 22], "nx": [8, 9, 10, 12], "ny": [8, 9, 10, 12], "nz": 12, "o": [4, 6], "obj": 6, "object": [2, 4, 5, 6, 8, 9, 10, 19, 23], "objective_higher_is_bett": [6, 17, 23], "observ": [0, 2, 6, 17, 23, 24], "obtain": [4, 8, 9, 10, 12, 18], "occup": 16, "occur": [17, 23], "occurr": 6, "offer": 6, "often": [1, 8, 9, 10, 18], "old": 4, "old_argu": 6, "old_cost": 19, "older": 15, "omit": 6, "omp_get_wtim": 13, "ona": 10, "onc": [5, 6, 8, 9, 10, 12, 18, 23], "one": [0, 3, 4, 6, 7, 8, 9, 10, 12, 15, 16, 18, 19, 23], "ones": [8, 9, 10, 24], "onli": [0, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 19, 21, 23], "onlin": 7, "open": [5, 7, 8, 9, 13, 16], "open_cach": 6, "openacc": 11, "opencl": [0, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 23], "openmp": 13, "openssl": 7, "oper": [4, 8, 9, 10, 12, 13, 16, 17], "opportun": 16, "oppos": 7, "opt": 18, "optim": [1, 2, 4, 5, 6, 8, 9, 10, 13, 14, 16, 17, 18, 23], "option": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 22, 23, 24], "order": [4, 5, 6, 8, 9, 10, 12, 13, 16, 17, 19, 20, 23], "ordered_greedy_ml": 23, "ordereddict": [4, 8, 9, 10, 12, 16, 17], "ordin": 18, "org": [7, 14, 15], "other": [0, 1, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 19, 23, 24], "otherwis": [6, 7, 16, 23], "our": [4, 8, 9, 10, 12, 16, 20, 21], "ourselv": 12, "out": [1, 4, 5, 7, 12, 15, 16], "outer": 16, "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 20, 23, 24], "output_imag": 4, "output_s": 4, "over": [0, 6, 8, 9, 10, 15, 16, 18, 19], "overhead": [8, 9, 10, 16], "overhead_tim": 6, "overlap": [11, 13], "overrid": 19, "overwritten": [18, 23], "own": [4, 10, 13, 15, 17, 18], "p": [4, 6, 16, 17, 21, 23], "pack": 21, "packag": [0, 7], "packstr": 21, "pad": 21, "page": [4, 7, 8, 9, 10, 11, 12, 14, 16, 17], "pair": [8, 9, 10], "panda": [8, 9, 11, 15], "pandoc": 7, "paper": 14, "parallel": [4, 8, 9, 10], "param": [4, 5, 6, 7, 18, 19, 23], "param_spac": 19, "paramet": [2, 5, 6, 8, 9, 11, 13, 16, 17, 19, 20, 21, 22, 23], "parameter_spac": [6, 19], "parametr": 4, "parent": 19, "pars": [6, 8, 9], "parse_restrict": 6, "part": [7, 8, 9, 10, 14, 15, 16, 17, 21, 23], "partial": [5, 8, 9, 10, 11], "particl": [19, 23], "particular": [4, 6, 8, 9, 11, 13, 16, 18, 21], "particularli": [4, 7, 17], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 22, 23], "path": [4, 7, 18], "pattern": 18, "pcie": 18, "per": [3, 4, 6, 8, 9, 10, 12, 17, 18, 23], "percentag": 23, "perform": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23], "persist": 23, "physic": 8, "pick": 16, "pii": 14, "pionter": 6, "pip": [3, 4, 7, 8, 9, 14, 15, 16], "pip_cache_dir": 7, "pipelin": 11, "pixel": 4, "place": [4, 8, 9, 10, 18, 19, 20, 23], "plai": [8, 9, 10], "plain": 13, "platform": [6, 14, 15, 18, 23], "pleas": [1, 3, 4, 7, 11, 14, 15, 18, 20, 21, 23], "plot": [8, 9, 10], "plu": 23, "plugin": 7, "pmb": 14, "pmt": 18, "po": 19, "poetri": [7, 15], "point": [4, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 20, 23], "pointer": 21, "pop": 8, "popsiz": 19, "popul": 19, "popular": 18, "portabl": 21, "posit": [5, 6, 12, 19, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 23], "possibli": [17, 23], "powel": 19, "power": [6, 16, 18, 24], "power_read": [18, 24], "powersensor": [18, 24], "powersensor2": 18, "pragma": [8, 9, 10, 16], "precis": 13, "precomput": 5, "prefer": [4, 6, 7, 8, 10, 18, 23], "prefix": 15, "prepar": [6, 7, 8, 9, 10], "prepare_kernel_str": 6, "prepend": [6, 10], "preprocess_gpu_argu": 6, "preprocessor": [4, 6, 23], "present": [0, 7, 16], "press": [4, 8, 9, 10, 12, 16], "pretend": 6, "pretti": 16, "previou": [1, 7, 8, 9, 10, 19, 23], "previous": [6, 8, 9, 10, 16], "prg": 9, "primit": 21, "print": [3, 4, 6, 8, 9, 10, 12, 16, 23], "print_config": 6, "print_config_output": 6, "privileg": [7, 18], "probabl": [19, 23], "problem": [3, 4, 6, 8, 9, 10, 11, 12, 13, 16, 23], "problem_s": [4, 5, 6, 8, 9, 10, 12, 13, 16, 20, 21, 23, 24], "problemat": 4, "proc": 7, "proce": 16, "process": [4, 6, 7, 8, 9, 10, 16, 17, 18, 19, 22], "process_cach": 6, "process_metr": 6, "prod": [4, 5, 13], "produc": [3, 5], "product": [4, 8, 9, 23], 
"profil": 16, "program": [0, 5, 7, 8, 9, 10, 13, 16, 21, 22], "programm": [16, 18], "programmat": 12, "prohibit": 19, "project": [7, 15], "promis": 4, "properli": 6, "properti": [6, 16, 23], "propos": 3, "provid": [5, 6, 7, 8, 9, 10, 13, 22, 23], "prune": 19, "prune_parameter_spac": 19, "ps_energi": [17, 18, 24], "ps_power": [18, 24], "psedo": 10, "pso": 23, "ptr": 6, "public": [3, 14], "publish": [7, 14], "pull": 3, "purpos": [8, 9, 10, 13, 16, 23, 24], "put": [3, 6, 7, 8, 9, 10], "py": [5, 13, 15], "pybind11": 18, "pycuda": [0, 7, 8, 10, 12, 13, 18, 22], "pyenv": 7, "pyhip": [0, 6], "pyopencl": [6, 7, 9, 18], "pyplot": [8, 9, 10], "pytest": [3, 7], "python": [0, 3, 4, 6, 7, 11, 12, 13, 16, 18, 20, 21, 22, 23], "python3": [7, 15], "pythonpath": 15, "qualiti": 13, "quantiti": [8, 9, 10, 17, 18, 23], "queue": 9, "quick": [3, 8, 9, 10], "quickli": [8, 9, 10], "quiet": [6, 23], "quit": [1, 8, 9, 10, 12, 16, 22], "quota": 7, "r": [3, 5, 13], "race": 16, "radiat": [8, 9, 10], "rais": 6, "ran": 7, "rand": 12, "rand1bin": 19, "rand1exp": 19, "rand2bin": 19, "rand2exp": 19, "randint": [8, 9, 10], "randn": [4, 5, 13, 14, 16, 20, 22], "random": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 22, 23], "random_sampl": [6, 23], "random_walk": 19, "randomli": [12, 19], "randtobest1bin": 19, "randtobest1exp": 19, "rang": [4, 5, 8, 9, 10, 12, 13, 22], "rapl": 18, "rather": [8, 9, 10, 23], "rawkernel": 6, "rd": 18, "re": [4, 7, 8, 9, 10, 12, 16], "reach": 6, "read": [4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 18, 23], "read_cach": 6, "read_fil": 6, "read_writ": 9, "readi": [4, 6, 8, 9, 10, 12, 16], "ready_argument_list": 6, "real": 22, "realiti": 16, "realiz": 16, "realli": [4, 8, 9, 10, 15], "reason": [4, 6, 7, 21, 23], "receive_spec": 21, "recent": [6, 15, 18], "recogn": 20, "recommend": [15, 21], "recon": 21, "record": [4, 6, 8, 18, 23], "redistribut": [8, 9, 10], "reduc": [8, 9, 10, 16], "reduct": [5, 17, 23], "redund": 16, "ref": 23, "refer": [4, 5, 6, 8, 9, 10, 11, 13, 15, 18, 23], "referenc": 14, "reflect": [5, 18], "regard": [3, 6, 19], "regardless": 22, "region": [8, 9, 10], "regist": [4, 8, 9, 10, 16, 18], "register_configur": 18, "register_devic": 18, "registerobserv": 18, "regular": [6, 10, 18], "reject": 19, "relat": [14, 17, 24], "releas": [6, 7], "relev": [6, 14, 18], "rememb": [4, 7, 8, 9, 10, 16], "remov": [7, 19], "repeatedli": 18, "replac": [4, 5, 6, 7, 8, 9, 10, 12, 16, 23], "replace_param_occurr": 6, "repo": 15, "report": [17, 18, 23, 24], "repositori": [3, 4, 7, 8, 9, 10, 12, 14, 15, 16], "repres": [6, 8, 9, 10], "represent": [6, 21], "reproduc": 3, "request": [3, 18, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 22], "requirements_test": 3, "research": 14, "reserv": [1, 9, 24], "resourc": 23, "respect": [16, 18], "respons": 6, "rest": [6, 8, 9, 10], "restart": [1, 7, 8, 9, 10, 19], "restrict": [6, 7, 11, 16, 22, 23], "result": [1, 3, 4, 5, 6, 10, 12, 16, 17, 18, 19, 20, 23, 24], "result_host": 6, "results_filenam": 23, "retri": 15, "retriev": [4, 6, 23], "return": [4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 23], "reus": [4, 8, 9, 10, 16], "rewrit": 22, "rf": 21, "rfsize": 21, "richard": 14, "right": [4, 8, 9, 10, 12, 15], "risk": 22, "rmprofilingadminonli": 7, "rob": 14, "robust": 6, "rocm": [7, 15, 18], "room": 16, "root": 18, "roughli": [12, 16], "round": [8, 9, 10, 23], "row": 16, "run": [1, 3, 4, 5, 6, 8, 9, 12, 13, 15, 16, 18, 19, 23], "run_gpu": 9, "run_kernel": [4, 5, 6, 11, 23], "runner": 19, "runtim": [4, 6, 8, 9, 10, 14, 15, 18, 22], "runtimefailedconfig": 6, 
"s0167739x18313359": 14, "sa": 16, "safer": 23, "sai": [6, 8, 9, 10, 20, 22], "same": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 18, 20, 23], "sampl": [6, 19, 23], "satisfi": 23, "save": [7, 8, 9], "save_al": 18, "sb": 16, "sc21": 14, "sc22": 14, "scalar": [4, 8, 9, 10, 12, 23], "scale": 6, "scale_from_param": 6, "schoonhoven": 14, "schoonhoven2022benchmark": 14, "schoonhoven2022go": 14, "scienc": 14, "sciencedirect": 14, "scientif": 21, "scipi": 11, "script": [4, 6, 16, 21, 22], "sdk": 15, "search": [1, 4, 6, 11, 14, 16, 17, 19, 23], "searchspac": 19, "second": [4, 5, 6, 8, 9, 10, 12, 16, 17, 18, 19, 23], "secondli": [4, 16], "section": [6, 8, 9, 10], "see": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 20, 22, 23], "seem": [8, 9, 10], "seemingli": 7, "seen": [4, 6, 7, 16], "select": [0, 3, 4, 6, 8, 9, 10, 12, 15, 16, 18, 19, 23], "self": [6, 7, 18, 19], "semant": 5, "send": 12, "sens": 21, "sensibl": 16, "sensor": 18, "separ": [6, 11, 13, 22], "seper": 22, "seri": [6, 12], "serializ": 6, "serv": [8, 9, 10, 17, 19], "session": [1, 6, 7, 19], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 20, 22, 23, 24], "set_nvml_paramet": 6, "set_titl": [8, 9, 10], "setup": [4, 8, 9, 10, 13, 15, 18, 21], "setup_block_and_grid": 6, "setup_method_argu": 6, "setup_method_opt": 6, "sever": [6, 8, 9, 10, 11, 12, 15, 16, 22, 23], "sh": 15, "sh_u": [8, 9, 10], "share": [0, 4, 6, 23], "sheet": [8, 9, 10], "shell": 7, "shift": [4, 8, 9, 10, 12, 16], "shortli": 4, "should": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13, 16, 17, 18, 20, 23], "show": [4, 8, 9, 10, 11, 14, 17, 21], "shown": [4, 6, 18], "shuffl": 11, "signal": [4, 24], "signatur": [0, 4, 6], "signific": 3, "significantli": [14, 16, 18], "silent": 4, "similar": [6, 13, 16, 23], "similarli": 4, "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21], "simpli": [4, 5, 6, 8, 9, 10, 12, 19, 20, 23], "simplic": [8, 9, 10, 12], "simplifi": [7, 8, 9, 10], "simul": [1, 6, 10, 14, 19, 21, 23], "simulated_ann": 23, "simulation_mod": [6, 23], "sinc": [4, 10, 12, 14, 16, 22], "singl": [4, 5, 6, 8, 9, 10, 13, 16, 18, 22, 23], "single_point": 19, "single_point_crossov": 19, "situat": 21, "size": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 22, 23], "skip": [3, 4, 7, 8, 9, 10, 23], "skip_nvml_set": 6, "skipkei": 6, "skippablefailur": 6, "slight": 21, "slightli": [13, 16, 22], "slow": 19, "slsqp": 19, "sm_": 8, "small": [3, 4, 7, 8, 9, 10, 16], "smem_arg": [6, 23], "smi": 18, "snap": 6, "snap_to_nearest_config": 6, "snippet": 5, "so": [4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 19, 20, 22, 23], "social": 19, "softwar": [4, 8, 9, 10, 14, 15, 18, 19, 20], "solut": [16, 18], "solv": 10, "some": [4, 6, 7, 8, 9, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23], "somehow": [8, 9, 10], "someth": [1, 4, 8, 9, 10, 16], "sometim": [0, 7, 8, 9, 10, 21], "somewher": 4, "soon": 19, "sort": 23, "sort_kei": 6, "sourc": [4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 22, 23], "sourcemodul": [8, 10, 12], "space": [1, 4, 5, 6, 7, 12, 13, 16, 17, 19, 23], "spatial": [8, 9, 10], "special": [4, 8, 9, 10, 18, 20, 24], "specif": [0, 4, 6, 8, 9, 10, 11, 12, 17, 18, 19, 23], "specifi": [4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24], "speed": 6, "spent": [13, 23], "sphinxdoc": 3, "split": 13, "spread": 13, "sqrt": 12, "squar": 16, "src": 6, "ssl": [7, 15], "stabl": 7, "stack": 15, "stai": 4, "stand": 16, "start": [1, 2, 4, 5, 6, 8, 9, 10, 13, 15, 16, 18, 19, 23], "start_ev": 6, "state": [6, 8, 9, 10, 18, 23], "statement": [4, 10, 12, 16, 22], "static": 6, "statu": 6, "stdout": [8, 9], 
"steer": 14, "step": [3, 7, 8, 9, 10, 15, 16, 17, 19, 22], "stick": 21, "still": [1, 3, 5, 16], "stop": [6, 19], "stop_ev": 6, "stopcriterionreach": 6, "store": [1, 4, 6, 7, 10, 16, 18, 20, 23], "store_cach": 6, "store_result": 23, "str": [6, 8, 9, 10, 12], "strategi": [1, 2, 4, 14, 17, 23], "strategy_opt": [6, 19, 23], "stream": [1, 6, 8, 9, 10], "string": [4, 6, 8, 9, 10, 11, 16, 17, 18, 20, 21, 23], "struct": 2, "structur": [4, 6, 8, 9, 16, 20], "studio": 7, "style": 3, "subclass": 6, "submatric": 16, "subplot": [8, 9, 10], "subscrib": 18, "substitut": 12, "sudo": [7, 15], "suffici": [12, 17], "suffix": [6, 23], "suit": [12, 23], "sum": [4, 5, 6, 16], "sum_": 12, "sum_float": 5, "sum_x": 5, "summar": 23, "supercomput": 14, "suppli": [6, 13, 16, 19, 22, 23], "support": [4, 6, 7, 8, 9, 10, 13, 15, 18, 19, 22, 23, 24], "suppos": [8, 9, 10], "sure": [4, 7, 8, 9, 10, 14, 15, 16], "swarm": [19, 23], "sy": 9, "symbol": [6, 23], "sync": [7, 21], "synchron": [6, 8, 10, 12, 16, 17], "system": [7, 14, 15, 18], "t": [4, 6, 7, 8, 9, 10, 12, 13, 15, 19, 22, 23], "t0": [10, 12], "t_min": 19, "tab": 7, "tabl": 0, "take": [4, 6, 7, 8, 9, 10, 12, 16, 18, 19, 20, 22, 23], "target": 23, "task": 6, "techniqu": 16, "tell": [4, 8, 9, 10, 11, 13, 16, 20, 21], "temp_x": 6, "temperatur": [8, 9, 10, 18, 19, 24], "templat": [0, 2, 12], "temporari": 6, "term": 4, "termin": [1, 15], "terminologi": [8, 10], "test": [3, 8, 9, 10, 11, 15, 16, 18, 23], "test_vector_add": 11, "test_vector_add_parameter": 11, "texmem_arg": [6, 23], "text": [8, 10, 16], "textur": [0, 6, 23], "than": [4, 8, 9, 10, 12, 17, 18, 19, 23, 24], "thank": 3, "thei": [1, 6, 7, 8, 9, 10, 11, 16, 17], "them": [4, 7, 10, 12, 13, 16], "themselv": 12, "therefor": [4, 5, 8, 9, 10, 12, 13, 16], "thesi": 19, "thi": [0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "thin": 18, "thing": [4, 13, 16], "think": [8, 9, 10], "third": [5, 16], "those": [4, 7, 11, 15, 18], "thousand": [8, 9, 10], "thread": [4, 6, 8, 9, 10, 11, 12, 17, 18, 20, 23, 24], "threadidx": [4, 8, 9, 10, 12, 14, 16, 20, 22], "three": [4, 5, 16], "through": [6, 7, 8, 9, 10, 12, 14, 17, 18, 19, 23], "thrown": 6, "ti": [8, 9, 10, 12], "tight": 7, "tiker": 15, "tile": [4, 11, 16], "tile_size_i": [4, 5, 8, 9, 10, 13, 16, 23], "tile_size_j": 10, "tile_size_x": [4, 5, 8, 9, 10, 13, 16], "time": [4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 19, 22, 23, 24], "time_limit": [19, 23], "time_sinc": 8, "titan": [8, 9, 10], "titl": 14, "tj": [8, 9, 10], "tk": 7, "tnc": 19, "to_csv": [8, 9], "to_gpu": [10, 12], "to_valid_nvrtc_gpu_arch_cc": 6, "togeth": [8, 9, 10, 15, 23], "token": 4, "toler": 5, "toml": 7, "too": [4, 8, 9, 10, 12, 13, 16, 23], "took": [4, 8, 10, 19, 20, 23], "tool": [10, 12, 14], "toolkit": [14, 15], "top": [6, 7, 12, 18, 23], "total": [4, 6, 8, 9, 10, 16, 17, 20], "total_flop": 17, "toward": 19, "track": 18, "tradit": 7, "transact": 14, "transfer": [10, 11, 13], "transmit": 18, "travers": 19, "treat": 23, "tri": [8, 9, 10, 19], "troubl": 15, "true": [1, 4, 5, 6, 8, 9, 10, 13, 16, 18, 19, 23], "trust": [5, 19], "trusti": 4, "try": [4, 6, 8, 9, 10, 15, 16, 19, 23], "try_to_constraint": 6, "ttyacm0": 18, "tunabl": [6, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 22, 23, 24], "tune": [1, 2, 5, 6, 11, 14, 15, 19, 20, 22, 23, 24], "tune_kernel": [1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 19, 20, 21, 22, 23], "tune_param": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 21, 22, 23], "tune_params_kei": 6, "tuner": [0, 1, 3, 4, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 
"tuning_opt": [6, 19], "tupl": [6, 10, 12, 19, 23], "turn": 6, "tutori": [4, 8, 12, 14, 15, 16], "two": [4, 6, 8, 9, 10, 11, 16, 17, 19, 23], "two_point": 19, "two_point_crossov": 19, "tx": [8, 9, 10, 16], "txt": 3, "ty": [8, 9, 10, 16], "type": [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23], "typeerror": 6, "typenam": 22, "typic": [6, 15, 16, 23], "typicali": 23, "u": [3, 4, 8, 9, 10], "u_": [8, 9, 10], "u_new": [8, 9, 10], "u_old": [8, 10], "ubuntu": 7, "undefin": [4, 6, 8, 9, 10, 16], "under": [4, 7, 14, 23], "understand": 4, "underutil": [8, 9, 10], "uniform": 19, "uniform_crossov": 19, "uniformli": 19, "uniqu": [19, 23], "unit": [3, 6], "unless": 4, "unload": [6, 7], "unrol": [8, 9, 10, 11, 16, 24], "unscal": 6, "unscale_and_snap_to_nearest": 6, "unsign": [6, 9], "until": [6, 13], "up": [3, 4, 6, 7, 8, 9, 10, 15, 16, 20, 23], "updat": [6, 7], "upgrad": 7, "upload": 13, "url": 14, "us": [0, 1, 2, 4, 5, 6, 7, 11, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24], "usag": [16, 18], "usb": 18, "use_locked_clock": 18, "usecas": 11, "user": [0, 4, 5, 6, 7, 9, 11, 15, 16, 17, 18, 19, 22, 23], "usual": [7, 18], "util": [7, 16], "v": [3, 6, 8, 9, 10, 12], "valid": [6, 11, 16, 23], "valu": [4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 23], "van": 14, "vari": [8, 9, 10, 12, 16, 17], "variabl": [6, 7, 12, 15, 19, 23], "variou": [18, 20], "ve": [4, 8, 9, 10, 15, 16], "vector": [12, 13, 20], "vector_add": [14, 19, 20, 22], "vector_add_kernel": 20, "veenboer": 14, "venv": 7, "venvbackend": 7, "verbos": [4, 5, 6, 8, 9, 10, 13, 23], "veri": [1, 5, 8, 9, 10, 13, 15, 16, 18, 21, 22], "verif": [0, 2, 11, 23], "verifi": [5, 6, 7, 11, 23], "verify_partial_reduc": 5, "version": [3, 4, 7, 16, 18, 23], "via": [7, 19], "virtual": [7, 15], "virtualenv": 7, "visual": [1, 7, 16], "vocabulari": [2, 18, 20], "void": [4, 8, 9, 10, 12, 14, 16, 20, 21, 22], "voltag": 18, "volum": 14, "w": [4, 8, 9, 17, 19], "wa": [4, 6, 8, 9, 10, 18, 23], "wai": [4, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 23], "wall": 9, "want": [0, 5, 10, 12, 13, 15, 16, 18, 20, 23, 24], "warp": 23, "warpsiz": 16, "wast": 19, "watt": [18, 24], "we": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 21, 22], "weight": [4, 19], "weighted_choic": 19, "weird": 7, "well": [0, 8, 9, 10, 12, 16, 18, 23], "went": [8, 9, 10, 12], "were": [4, 8, 9, 10, 12, 16, 23], "werkhoven": 14, "wget": [7, 15], "what": [3, 4, 5, 6, 8, 9, 10, 13, 16, 18, 20, 21, 22, 23, 24], "whatev": [6, 7, 13, 19], "when": [0, 1, 3, 4, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24], "whenev": 5, "where": [3, 4, 5, 6, 7, 8, 9, 10, 16, 17, 18, 21, 22, 23], "whether": [6, 17, 19, 23], "which": [0, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24], "while": [0, 1, 4, 6, 8, 9, 10, 11, 16, 18, 19], "who": 7, "whole": [8, 9, 10, 16, 19], "whose": [5, 23], "why": [8, 9, 13, 17], "wide": [8, 9, 10, 15, 16], "width": 16, "wiki": 15, "willemsen": [14, 19], "willemsen2021bayesian": 14, "wise": 20, "wish": 10, "within": [8, 9, 10, 12, 16, 19, 23], "without": [7, 8, 9, 10, 12, 13, 18, 19], "won": 4, "word": 16, "work": [1, 3, 4, 6, 8, 9, 10, 15, 17, 19, 22, 23], "workshop": 14, "worri": [8, 10], "worst": [8, 9, 10], "would": [4, 7, 8, 9, 10, 22], "wrap": [0, 6, 20, 22, 23], "wrapper": [18, 22], "write": [4, 11, 12, 16, 22, 23], "write_fil": 6, "writefil": [4, 16], "written": [3, 22], "wrote": 4, "www": 14, "x": [4, 5, 6, 8, 9, 10, 12, 14, 16, 20, 22, 23], "x0": 12, "x1": 6, "x2": 6, "x86_64": 15, "x_": [8, 9, 10], "x_i": [8, 9, 10, 12], "xdg_cache_hom": 7, "xgpu": 
12, "xgrid": 12, "xilinx": 18, "xn": 6, "xvect": 12, "xyz": [12, 23], "xz": 7, "y": [4, 6, 7, 8, 9, 10, 12, 13, 16, 23], "y0": 12, "y1": 6, "y2": 6, "y_": [8, 9, 10], "y_i": 12, "year": 14, "yet": [4, 6, 12, 13, 20], "ygpu": 12, "ygrid": 12, "yield": 16, "yn": 6, "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24], "your": [3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 18, 21, 23], "yourself": [13, 23], "yvect": 12, "z": [4, 6, 12, 23], "z0": 12, "z_i": 12, "zero": [4, 5, 12, 13, 16], "zeros_lik": [8, 12, 14, 16, 20, 22], "zgpu": 12, "zgrid": 12, "zip": [10, 12], "zlib1g": 7, "zvect": 12}, "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Development environment", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "titleterms": {"": 12, "2d": 4, "3d": 12, "The": [2, 14], "add": 11, "api": 23, "auto": [8, 9, 10], "backend": [0, 6, 15, 22], "basinhop": 19, "bayes_opt": 19, "best": 10, "brute_forc": 19, "build": 7, "c": 10, "cach": 1, "citat": 14, "cluster": 7, "code": [3, 8, 9, 10, 11, 13], "common": 6, "compil": [0, 6], "compilerfunct": 6, "comput": [8, 9, 10], "contribut": 3, "convolut": [4, 11], "convolution_correct": 11, "convolution_stream": 11, "core": 6, "correct": 5, "cpu": 12, "cuda": [0, 15, 16], "cudafunct": 6, "cupi": 6, "cupyfunct": 6, "depend": 15, "design": 6, "develop": [3, 7], "devic": 6, "deviceinterfac": 6, "diff_evo": 19, "diffus": [8, 9, 10], "document": [2, 6, 7, 14, 23], "dual_ann": 19, "environ": 7, "exampl": [4, 11, 14, 22], "execut": 18, "expdist": 11, "featur": [0, 2], "file": 1, "firefly_algorithm": 19, "from": [9, 10], "function": 6, "gener": 11, "genetic_algorithm": 19, "get": 20, "git": 15, "gpu": [8, 9, 10, 12], "greedy_il": 19, "greedy_ml": 19, "grid": 12, "guid": [2, 3, 15], "hip": [6, 15], "hipfunct": 6, "host": 13, "implement": [4, 8, 9, 10], "increas": 16, "instal": [14, 15], "interfac": 6, "issu": 3, "kernel": [2, 8, 9, 10, 11, 12, 14, 16, 22], "kernel_tun": [6, 19], "let": 12, "local": [7, 9], "matrix": [11, 16], "memori": [8, 9, 10, 16], "metric": 17, "minim": 19, "ml": 19, "more": 4, "move": 12, "multipl": [11, 16], "naiv": 16, "number": 13, "nvcuda": 6, "nvml": 18, "nvmlobserv": 18, "object": 17, "observ": 18, "opencl": [6, 15], "openclfunct": 6, "optim": [12, 19], "ordered_greedy_ml": 19, "other": 15, "packag": 15, "paramet": [4, 10, 12, 18, 24], "per": 16, "physic": [9, 10], "pmtobserv": 18, "point": 11, "polygon": 11, "powersensorobserv": 18, "product": 10, "pso": 19, "py": 11, "pycuda": [6, 15], "pycudafunct": 6, "pyhip": 15, "pyopencl": 15, "python": [8, 9, 10, 15], "quick": 14, "random_sampl": 19, "reduct": 11, "refer": 2, "report": 3, "result": [8, 9], "run": [7, 10], "runner": 6, "select": 22, "sepconv": 11, "sequenti": 6, "sequentialrunn": 6, "setup": [3, 7], "share": [8, 9, 10, 16], "simpl": 3, "simulated_ann": 19, "simulationrunn": 6, "spars": 11, "start": [12, 20], "stencil": 11, "store": [8, 9], "strategi": [6, 19], "stream": 13, "struct": 21, "support": 0, "templat": 22, "test": [4, 7], "thread": 16, "tile": [8, 9, 10], 
"tunabl": 4, "tune": [4, 8, 9, 10, 12, 13, 16, 17, 18], "tuner": [2, 8, 9, 10, 11, 12, 14], "tutori": [9, 10], "us": [8, 9, 10, 12, 16, 21], "usag": [0, 14], "util": 6, "vector": 11, "verif": 5, "version": 15, "vocabulari": 24, "work": 16}})
                    \ No newline at end of file
                    diff --git a/latest/structs.html b/latest/structs.html
                    index b1c6dc75b..69667a9e3 100644
                    --- a/latest/structs.html
                    +++ b/latest/structs.html
@@ -1,19 +1,21 @@
  Using structs — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@
Using structs

One of the issues with calling GPU kernels from Python is the use of custom data types in kernel arguments. Because your GPU code may be used by host programs written in any language, it is recommended, for portability, to keep the interface of your kernels as simple as possible. This means sticking to plain pointers to primitive types such as int, float, and double. For performance reasons, it is also recommended not to use arrays of structs as kernel arguments, as these are very likely to lead to inefficient memory accesses on the GPU.
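To make this recommendation concrete, here is a small hypothetical sketch (the kernel and all names are invented for illustration) that flattens what could have been a struct argument into primitive arguments on the Python side, following Kernel Tuner's convention of numpy arrays for pointers and numpy scalars for values:

import numpy as np

# Instead of passing a custom type such as
#   struct Params { float factor; int n; };
# the individual fields are passed as primitive arguments.
kernel_string = """
__global__ void scale(float *x, float factor, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        x[i] *= factor;
    }
}
"""

x = np.random.randn(10000).astype(np.float32)
factor = np.float32(2.0)   # scalar arguments as numpy scalars
n = np.int32(x.size)

# this list can be passed as the arguments of tune_kernel or run_kernel
args = [x, factor, n]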

diff --git a/latest/templates.html b/latest/templates.html
index c0ae2ddcc..6106c0838 100644
--- a/latest/templates.html
+++ b/latest/templates.html
@@ -1,19 +1,21 @@
  Templated kernels — Kernel Tuner 1.0 documentation
@@ -107,12 +109,12 @@
Templated kernels

                    It is quite common in CUDA programming to write kernels that use C++ templates. This can be very useful when writing code that can work for several types, for example floats and doubles. However, the use of C++ templates makes it slightly more difficult to directly integrate the CUDA kernel into applications that are not written in C++, for example Matlab, Fortran, or Python. And since Kernel Tuner is written in Python, we needed to take a few extra steps to provide support for templated CUDA kernels. Let’s first look at an example of what it’s like to tune a templated kernel with Kernel Tuner.

Example

                    Say we have a templated CUDA kernel in a file called vector_add.cu:

template<typename T>
__global__ void vector_add(T *c, T *a, T *b, int n) {
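A complete version of such a kernel, together with one way it might be tuned, could look roughly like the following sketch. It assumes that the instantiated name (here vector_add<float>) is passed as the kernel name, that lang="cupy" selects a backend that can compile C++ templates, and that block_size_x is used as in the other examples in these docs:

import numpy as np
from kernel_tuner import tune_kernel

# sketch of the full templated kernel; block_size_x is assumed to be
# inserted by Kernel Tuner as a preprocessor definition, as in the
# non-templated vector_add examples
kernel_string = """
template<typename T>
__global__ void vector_add(T *c, T *a, T *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

tune_params = {"block_size_x": [32, 64, 128, 256, 512]}

# assumption: passing the instantiated name and lang="cupy" lets the
# backend compile the template directly
results, env = tune_kernel("vector_add<float>", kernel_string, size,
                           [c, a, b, n], tune_params, lang="cupy")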
                    @@ -153,7 +155,7 @@ 

Selecting a backend

Kernel Tuner supports multiple backends; for CUDA these are based on PyCUDA and CuPy. The following explains how to enable tuning of templated kernels with either backend.

The PyCUDA backend is the default backend in Kernel Tuner and is selected if the user does not supply the ‘lang’ option and CUDA code is detected in the kernel source, or when lang is set to “CUDA” by the user. PyCUDA requires CUDA kernels to have extern "C" linkage, which means that C++ templated kernels are not supported. To support templated kernels despite this limitation, Kernel Tuner attempts to wrap the templated CUDA kernel by inserting a compile-time template instantiation statement and a wrapper kernel that calls the instantiated templated kernel.

diff --git a/latest/user-api.html b/latest/user-api.html
index 528d736bd..f4a287f7c 100644
--- a/latest/user-api.html
+++ b/latest/user-api.html
@@ -1,19 +1,21 @@
  API Documentation — Kernel Tuner 1.0 documentation
@@ -111,11 +113,11 @@

API Documentation

                    This file provides all the details you need about how to call the Kernel Tuner’s functions, including all the optional arguments.

kernel_tuner.tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params, grid_div_x=None, grid_div_y=None, grid_div_z=None, restrictions=None, answer=None, atol=1e-06, verify=None, verbose=False, lang=None, device=0, platform=0, smem_args=None, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None, defines=None, log=None, iterations=7, block_size_names=None, quiet=False, strategy=None, strategy_options=None, cache=None, metrics=None, simulation_mode=False, observers=None, objective=None, objective_higher_is_better=None)

                    Tune a CUDA kernel given a set of tunable parameters
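For orientation, a minimal call might look like the following sketch, which tunes the thread block size of a simple vector-add kernel and uses the optional answer and atol arguments from the signature above to verify the output (sizes and values are illustrative):

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

tune_params = {"block_size_x": [32, 64, 128, 256, 512]}

# answer entries correspond to the kernel arguments; None means "do not check"
answer = [a + b, None, None, None]

results, env = tune_kernel("vector_add", kernel_string, size,
                           [c, a, b, n], tune_params,
                           answer=answer, atol=1e-6)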

                    Parameters:
                    @@ -365,7 +367,7 @@
kernel_tuner.run_kernel(kernel_name, kernel_source, problem_size, arguments, params, grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None, device=0, platform=0, smem_args=None, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None, defines=None, block_size_names=None, quiet=False, log=None)

                    Compile and run a single kernel

Compiles and runs a single kernel once, given a specific instance of the kernel's tuning parameters. However, instead of measuring execution time, run_kernel returns the output of the kernel.

@@ -517,7 +519,7 @@
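In practice the difference with tune_kernel is that params maps every tunable parameter to a single value; a sketch, under the assumption that the returned list mirrors the argument list with the post-execution contents:

import numpy as np
from kernel_tuner import run_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 10000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# a single configuration instead of lists of values
params = {"block_size_x": 256}

output = run_kernel("vector_add", kernel_string, size, [c, a, b, n], params)

# assumption: output mirrors the argument list, so output[0] holds c
assert np.allclose(output[0], a + b)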

kernel_tuner.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective=None, objective_higher_is_better=None)

                    stores tuning results to a JSON file

Stores the best kernel configurations (the top 3% by default) in a JSON file. The results are stored for a specific device (retrieved using env[‘device_name’]).

@@ -550,7 +552,7 @@
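A sketch of how this might be used right after a tuning run, reusing the results and env returned by tune_kernel; the filename is made up, and kernel_string, size, args, and tune_params are assumed to be defined as in the tune_kernel example above:

from kernel_tuner import tune_kernel, store_results

# kernel_string, size, args (= [c, a, b, n]) and tune_params as defined above
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params)

# keep only the best configurations (top 3% by default) for the device
# recorded in env["device_name"]
store_results("vector_add_results.json", "vector_add", kernel_string,
              tune_params, size, results, env, top=3)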

kernel_tuner.create_device_targets(header_filename, results_filename, objective=None, objective_higher_is_better=None)

                    create a header with device targets
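A minimal sketch, assuming a results file produced by store_results as in the example above (both filenames are illustrative); the full description follows below:

from kernel_tuner import create_device_targets

# generate a header with, per device found in the results file, the
# parameter definitions of the best-performing configuration
create_device_targets("vector_add_targets.h", "vector_add_results.json")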

This function generates a header file with device targets for compiling a kernel with different parameters on different devices. The tuning

diff --git a/latest/vocabulary.html b/latest/vocabulary.html
index 985e4fdf1..22083195f 100644
--- a/latest/vocabulary.html
+++ b/latest/vocabulary.html
@@ -1,19 +1,21 @@
  Parameter Vocabulary — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@

Parameter Vocabulary

There are certain tunable parameters that have a special meaning in Kernel Tuner. This document specifies which parameters are special and what their uses are when auto-tuning GPU kernels.

                    In general, it is best to avoid using these parameter names for purposes other than the ones indicated in this document.
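The thread block dimensions are a typical example: a tunable parameter named block_size_x (or one of the alternative names supplied via block_size_names) is used by Kernel Tuner to set the thread block size, in addition to being inserted into the kernel source as a preprocessor definition. A small sketch with arbitrary values:

# block_size_x is one of the special names and controls the thread block size;
# tile_size_x is an ordinary, user-chosen parameter with no special meaning
tune_params = {
    "block_size_x": [32, 64, 128, 256],
    "tile_size_x": [1, 2, 4],
}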