diff --git a/latest/.buildinfo b/latest/.buildinfo index 2e08383a8..fc2ec2aff 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: c3d758f43fb7e62bf2a1b966e9ea4853 +config: 0c64f1e50062f38d9437c83151ccb10e tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_sources/dev-environment.rst.txt b/latest/_sources/dev-environment.rst.txt index 24b37e567..6c36101ac 100644 --- a/latest/_sources/dev-environment.rst.txt +++ b/latest/_sources/dev-environment.rst.txt @@ -27,8 +27,8 @@ Steps with :bash:`sudo` access (e.g. on a local device): * After installation, restart your shell. #. Install the required Python versions: * On some systems, additional packages may be needed to build Python versions. For example on Ubuntu: :bash:`sudo apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev lzma`. - * Install the Python versions with: :bash:`pyenv install 3.8 3.9 3.10 3.11`. The reason we're installing all these versions as opposed to just one, is so we can test against all supported Python versions. -#. Set the Python versions so they can be found: :bash:`pyenv local 3.8 3.9 3.10 3.11` (replace :bash:`local` with :bash:`global` when not using the virtualenv). + * Install the Python versions with: :bash:`pyenv install 3.9 3.10 3.11 3.12`. The reason we're installing all these versions as opposed to just one, is so we can test against all supported Python versions. +#. Set the Python versions so they can be found: :bash:`pyenv local 3.9 3.10 3.11 3.12` (replace :bash:`local` with :bash:`global` when not using the virtualenv). #. Setup a local virtual environment in the folder: :bash:`pyenv virtualenv 3.11 kerneltuner` (or whatever environment name and Python version you prefer). #. `Install Poetry `__. * Use :bash:`curl -sSL https://install.python-poetry.org | python3 -` to install Poetry. diff --git a/latest/_static/basic.css b/latest/_static/basic.css index cfc60b86c..f316efcb4 100644 --- a/latest/_static/basic.css +++ b/latest/_static/basic.css @@ -4,7 +4,7 @@ * * Sphinx stylesheet -- basic theme. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -237,6 +237,10 @@ a.headerlink { visibility: hidden; } +a:visited { + color: #551A8B; +} + h1:hover > a.headerlink, h2:hover > a.headerlink, h3:hover > a.headerlink, diff --git a/latest/_static/doctools.js b/latest/_static/doctools.js index d06a71d75..4d67807d1 100644 --- a/latest/_static/doctools.js +++ b/latest/_static/doctools.js @@ -4,7 +4,7 @@ * * Base JavaScript utilities for all Sphinx HTML documentation. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ diff --git a/latest/_static/documentation_options.js b/latest/_static/documentation_options.js index a7f754b66..529239f07 100644 --- a/latest/_static/documentation_options.js +++ b/latest/_static/documentation_options.js @@ -1,5 +1,4 @@ -var DOCUMENTATION_OPTIONS = { - URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), +const DOCUMENTATION_OPTIONS = { VERSION: '1.0', LANGUAGE: 'en', COLLAPSE_INDEX: false, diff --git a/latest/_static/language_data.js b/latest/_static/language_data.js index 250f5665f..367b8ed81 100644 --- a/latest/_static/language_data.js +++ b/latest/_static/language_data.js @@ -5,7 +5,7 @@ * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -13,7 +13,7 @@ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; -/* Non-minified version is copied as a separate JS file, is available */ +/* Non-minified version is copied as a separate JS file, if available */ /** * Porter Stemmer diff --git a/latest/_static/searchtools.js b/latest/_static/searchtools.js index 97d56a74d..92da3f8b2 100644 --- a/latest/_static/searchtools.js +++ b/latest/_static/searchtools.js @@ -4,7 +4,7 @@ * * Sphinx JavaScript utilities for the full-text search. * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ @@ -57,12 +57,12 @@ const _removeChildren = (element) => { const _escapeRegExp = (string) => string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string -const _displayItem = (item, searchTerms) => { +const _displayItem = (item, searchTerms, highlightTerms) => { const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; - const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; const [docName, title, anchor, descr, score, _filename] = item; @@ -75,28 +75,35 @@ const _displayItem = (item, searchTerms) => { if (dirname.match(/\/index\/$/)) dirname = dirname.substring(0, dirname.length - 6); else if (dirname === "index/") dirname = ""; - requestUrl = docUrlRoot + dirname; + requestUrl = contentRoot + dirname; linkUrl = requestUrl; } else { // normal html builders - requestUrl = docUrlRoot + docName + docFileSuffix; + requestUrl = contentRoot + docName + docFileSuffix; linkUrl = docName + docLinkSuffix; } let linkEl = listItem.appendChild(document.createElement("a")); linkEl.href = linkUrl + anchor; linkEl.dataset.score = score; linkEl.innerHTML = title; - if (descr) + if (descr) { listItem.appendChild(document.createElement("span")).innerHTML = " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } else if (showSearchSummary) fetch(requestUrl) .then((responseData) => responseData.text()) .then((data) => { if (data) listItem.appendChild( - Search.makeSearchSummary(data, searchTerms) + Search.makeSearchSummary(data, searchTerms, anchor) ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); }); Search.output.appendChild(listItem); }; @@ -109,26 +116,43 @@ const _finishSearch = (resultCount) => { ); else Search.status.innerText = _( - `Search finished, found ${resultCount} page(s) matching the search query.` - ); + "Search finished, found ${resultCount} page(s) matching the search query." + ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( results, resultCount, - searchTerms + searchTerms, + highlightTerms, ) => { // results left, load the summary and display it // this is intended to be dynamic (don't sub resultsCount) if (results.length) { - _displayItem(results.pop(), searchTerms); + _displayItem(results.pop(), searchTerms, highlightTerms); setTimeout( - () => _displayNextItem(results, resultCount, searchTerms), + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), 5 ); } // search finished, update title and status message else _finishSearch(resultCount); }; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
+const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; /** * Default splitQuery function. Can be overridden in ``sphinx.search`` with a @@ -152,13 +176,26 @@ const Search = { _queued_query: null, _pulse_status: -1, - htmlToText: (htmlString) => { + htmlToText: (htmlString, anchor) => { const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); - htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + for (const removalQuery of [".headerlinks", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content const docContent = htmlElement.querySelector('[role="main"]'); - if (docContent !== undefined) return docContent.textContent; + if (docContent) return docContent.textContent; + console.warn( - "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." ); return ""; }, @@ -231,16 +268,7 @@ const Search = { else Search.deferQuery(query); }, - /** - * execute search (requires search index to be loaded) - */ - query: (query) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - const allTitles = Search._index.alltitles; - const indexEntries = Search._index.indexentries; - + _parseQuery: (query) => { // stem the search terms and add them to the correct list const stemmer = new Stemmer(); const searchTerms = new Set(); @@ -276,16 +304,32 @@ const Search = { // console.info("required: ", [...searchTerms]); // console.info("excluded: ", [...excludedTerms]); - // array of [docname, title, anchor, descr, score, filename] - let results = []; + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename]. 
+ const normalResults = []; + const nonMainIndexResults = []; + _removeChildren(document.getElementById("search-progress")); - const queryLower = query.toLowerCase(); + const queryLower = query.toLowerCase().trim(); for (const [title, foundTitles] of Object.entries(allTitles)) { - if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { for (const [file, id] of foundTitles) { let score = Math.round(100 * queryLower.length / title.length) - results.push([ + normalResults.push([ docNames[file], titles[file] !== title ? `${titles[file]} > ${title}` : title, id !== null ? "#" + id : "", @@ -300,46 +344,47 @@ const Search = { // search for explicit entries in index directives for (const [entry, foundEntries] of Object.entries(indexEntries)) { if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { - for (const [file, id] of foundEntries) { - let score = Math.round(100 * queryLower.length / entry.length) - results.push([ + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ docNames[file], titles[file], id ? "#" + id : "", null, score, filenames[file], - ]); + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } } } } // lookup as object objectTerms.forEach((term) => - results.push(...Search.performObjectSearch(term, objectTerms)) + normalResults.push(...Search.performObjectSearch(term, objectTerms)) ); // lookup as search terms in fulltext - results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); // let the scorer override scores with a custom scoring function - if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); - - // now sort the results by score (in opposite order of appearance, since the - // display function below uses pop() to retrieve items) and then - // alphabetically - results.sort((a, b) => { - const leftScore = a[4]; - const rightScore = b[4]; - if (leftScore === rightScore) { - // same score: sort alphabetically - const leftTitle = a[1].toLowerCase(); - const rightTitle = b[1].toLowerCase(); - if (leftTitle === rightTitle) return 0; - return leftTitle > rightTitle ? -1 : 1; // inverted is intentional - } - return leftScore > rightScore ? 1 : -1; - }); + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. 
+ let results = [...nonMainIndexResults, ...normalResults]; // remove duplicate search results // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept @@ -353,14 +398,19 @@ const Search = { return acc; }, []); - results = results.reverse(); + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); // for debugging //Search.lastresults = results.slice(); // a copy // console.info("search results:", Search.lastresults); // print the results - _displayNextItem(results, results.length, searchTerms); + _displayNextItem(results, results.length, searchTerms, highlightTerms); }, /** @@ -458,14 +508,18 @@ const Search = { // add support for partial matches if (word.length > 2) { const escapedWord = _escapeRegExp(word); - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord) && !terms[word]) - arr.push({ files: terms[term], score: Scorer.partialTerm }); - }); - Object.keys(titleTerms).forEach((term) => { - if (term.match(escapedWord) && !titleTerms[word]) - arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); - }); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } } // no match but word was a required one @@ -488,9 +542,8 @@ const Search = { // create the mapping files.forEach((file) => { - if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) - fileMap.get(file).push(word); - else fileMap.set(file, [word]); + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); }); }); @@ -541,8 +594,8 @@ const Search = { * search summary for a given text. keywords is a list * of stemmed words. */ - makeSearchSummary: (htmlText, keywords) => { - const text = Search.htmlToText(htmlText); + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); if (text === "") return null; const textLower = text.toLowerCase(); diff --git a/latest/_static/sphinx_highlight.js b/latest/_static/sphinx_highlight.js index aae669d7e..8a96c69a1 100644 --- a/latest/_static/sphinx_highlight.js +++ b/latest/_static/sphinx_highlight.js @@ -29,14 +29,19 @@ const _highlight = (node, addItems, text, className) => { } span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); parent.insertBefore( span, parent.insertBefore( - document.createTextNode(val.substr(pos + text.length)), + rest, node.nextSibling ) ); node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. + */ + _highlight(rest, addItems, text, className); if (isInSVG) { const rect = document.createElementNS( @@ -140,5 +145,10 @@ const SphinxHighlight = { }, }; -_ready(SphinxHighlight.highlightSearchWords); -_ready(SphinxHighlight.initEscapeListener); +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. 
+ * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/latest/backends.html b/latest/backends.html index 142065a4a..048514784 100644 --- a/latest/backends.html +++ b/latest/backends.html @@ -1,19 +1,21 @@ - + - + Backends — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -108,13 +110,13 @@
-

Backends

+

Backends

Kernel Tuner implements multiple backends for CUDA, one for OpenCL, one for HIP, and a generic Compiler backend.

Selecting a backend is automatic in most cases, based on the kernel’s programming language, but sometimes you’ll want to choose a backend explicitly.
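For illustration, a backend can be chosen explicitly through the lang= argument of tune_kernel. The sketch below is hedged: the kernel, data sizes, and parameter values are made up, and it assumes that lang="cupy" selects the CuPy backend; check the backend documentation for the exact value each backend expects.

import numpy as np
import kernel_tuner

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

n = np.int32(10_000_000)
a = np.random.randn(n).astype(np.float32)
b = np.random.randn(n).astype(np.float32)
c = np.zeros_like(a)

# lang= overrides the automatic, language-based backend selection
results, env = kernel_tuner.tune_kernel(
    "vector_add", kernel_string, n, [c, a, b, n],
    {"block_size_x": [64, 128, 256, 512]},
    lang="cupy",
)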

-

CUDA Backends

+

CUDA Backends

PyCUDA is the default CUDA backend in Kernel Tuner. It is comparable in feature completeness with CuPy. Because the HIP kernel language is identical to the CUDA kernel language, HIP is included here as well. To use HIP on NVIDIA GPUs, see https://github.com/jatinx/hip-on-nv.

@@ -131,7 +133,7 @@

CUDA Backends -Backend feature support +Backend feature support

Feature

PyCUDA

@@ -195,7 +197,7 @@

CUDA Backends -Backend usage and compiler +Backend usage and compiler

Feature

PyCUDA

diff --git a/latest/cache_files.html b/latest/cache_files.html index 97ea0ec99..163c9834c 100644 --- a/latest/cache_files.html +++ b/latest/cache_files.html @@ -1,19 +1,21 @@ - + - + Cache files — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -103,7 +105,7 @@
-

Cache files

+

Cache files

A very useful feature of Kernel Tuner is the ability to store benchmarking results in a cache file during tuning. You can enable cache files by passing any filename to the cache= optional argument of tune_kernel.
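As a hedged sketch (the kernel and tunable parameters are illustrative), enabling a cache file only requires the extra cache= argument; rerunning the same script with the same cache file lets Kernel Tuner skip configurations that were already benchmarked.

import numpy as np
import kernel_tuner

kernel_string = """
__global__ void scale(float *x, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= factor;
    }
}
"""

n = np.int32(1_000_000)
x = np.random.randn(n).astype(np.float32)

kernel_tuner.tune_kernel(
    "scale", kernel_string, n, [x, np.float32(2.0), n],
    {"block_size_x": [32, 64, 128, 256]},
    cache="scale_cache.json",  # results are appended to this file while tuning runs
)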

The benchmark results of individual kernel configurations are appended to the cache file as Kernel Tuner is running. This also allows Kernel Tuner diff --git a/latest/contents.html b/latest/contents.html index 4158bfc20..f4637c5de 100644 --- a/latest/contents.html +++ b/latest/contents.html @@ -1,20 +1,24 @@ - + - + The Kernel Tuner documentation — Kernel Tuner 1.0 documentation - - + + + + - - - + + + + + @@ -102,7 +106,7 @@

-

The Kernel Tuner documentation

+

The Kernel Tuner documentation

Kernel Tuner

    diff --git a/latest/contributing.html b/latest/contributing.html index c52be4f0a..39d5826e8 100644 --- a/latest/contributing.html +++ b/latest/contributing.html @@ -1,19 +1,21 @@ - + - + Contribution guide — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -110,10 +112,10 @@
    -

    Contribution guide

    +

    Contribution guide

    Thank you for considering contributing to Kernel Tuner!

    -

    Reporting Issues

    +

    Reporting Issues

    Not all contributions are code; creating an issue also helps us improve. When you create an issue about a problem, please ensure the following:

    • Describe what you expected to happen.

    • @@ -123,7 +125,7 @@

      Reporting Issues -

      Contributing Code

      +

      Contributing Code

      To contribute code to Kernel Tuner, please select an issue to work on or create a new issue to propose a change or addition. For significant changes, first create an issue and discuss the proposed changes. Then fork the repository, create a branch (one per change or addition), and open a pull request.

      Kernel Tuner follows the Google Python style guide, with Sphinxdoc docstrings for module public functions.

      Before creating a pull request please ensure the following:

      @@ -137,7 +139,7 @@

      Contributing Codedesign documentation, or discuss it in the issue regarding your additions.

    -

    Simple development setup

    +

    Simple development setup

    For small changes to the code you can set up a quick development environment with the following steps:

    • git clone git@github.com:KernelTuner/kernel_tuner.git

    • diff --git a/latest/convolution.html b/latest/convolution.html index 434d19f33..daff8a342 100644 --- a/latest/convolution.html +++ b/latest/convolution.html @@ -1,20 +1,22 @@ - + - + Convolution — Kernel Tuner 1.0 documentation - - - + + + + + - - - + + + @@ -112,13 +114,13 @@
      -

      Convolution

      +

      Convolution

      This guide is meant to get you started with writing your tests and tuning scripts using Kernel Tuner. We’ll use a simple 2D Convolution kernel as the example, but as you will find out shortly, much of the script code that you write with Kernel Tuner can be reused for testing and tuning other kernels.

      Note: If you are reading this guide on the Kernel Tuner documentation pages, you can also run it as a Jupyter Notebook. Just clone the Kernel Tuner GitHub repository, install with pip install .[tutorial,cuda], and you’re ready to go! You can start the guide by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

      -

      2D Convolution example

      +

      2D Convolution example

      Convolution operations are essential to signal and image processing applications and are the main operation in convolutional neural networks used for deep learning. A convolution operation computes the linear combination of the weights in a convolution filter and a range of pixels from the input image for each output pixel. A 2D convolution of an input image \(I\) of size \((w\times h)\) and a convolution filter \(F\) of size \((F_w\times F_h)\) computes an output image \(O\) of size \(((w-F_w)\times (h-F_h))\): \begin{equation}\nonumber O(x,y) = \sum\limits_{j=0}^{F_h} \sum\limits_{i=0}^{F_w} I(x+i,y+j)\times F(i,j) @@ -154,7 +156,7 @@
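To make the summation concrete, here is a plain NumPy sketch of the same operation, usable as a host-side reference; the row-major (y, x) array layout is an assumption of this sketch, not something prescribed by the text above.

import numpy as np

def convolve_2d(image, filt):
    # Direct implementation of O(x, y) = sum_j sum_i I(x+i, y+j) * F(i, j)
    h, w = image.shape
    fh, fw = filt.shape
    out = np.zeros((h - fh, w - fw), dtype=image.dtype)
    for y in range(h - fh):
        for x in range(w - fw):
            out[y, x] = np.sum(image[y:y + fh, x:x + fw] * filt)
    return out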

      2D Convolution example

      -

      Implement a test

      +

      Implement a test

      We will start with using Kernel Tuner’s run_kernel function to call our naive 2D convolution kernel. But first we will have to create some input data, which we will do as follows:
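The cell below is a hedged reconstruction rather than the notebook’s exact code: the kernel name, source file, argument order, and problem sizes are placeholders for whatever the naive convolution kernel actually defines.

import numpy as np
from kernel_tuner import run_kernel

width, height = 4096, 4096
filter_size = 17

# flat arrays, as GPU kernels typically expect; the input includes a border
input_image = np.random.randn((height + filter_size) * (width + filter_size)).astype(np.float32)
conv_filter = np.random.randn(filter_size * filter_size).astype(np.float32)
output_image = np.zeros(height * width, dtype=np.float32)

params = {"block_size_x": 16, "block_size_y": 16}

result = run_kernel("convolution_kernel", "convolution_naive.cu",
                    (width, height), [output_image, input_image, conv_filter],
                    params)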

      [ ]:
      @@ -266,7 +268,7 @@ 

      Implement a test -

      Tuning 2D Convolution

      +

      Tuning 2D Convolution

      In many cases there are more tunable parameters than just the thread block dimensions. We have included a highly-optimized 2D Convolution kernel that contains many parametrized code optimizations. It’s a bit long to include here, so instead we just point to the file; you may need to adjust the path a little depending on where you’ve stored the Kernel Tuner’s source code and where this notebook is executing.
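A hedged sketch of what such a tuning call can look like; the file name convolution.cu, the data setup, and the parameter values are illustrative.

import numpy as np
from kernel_tuner import tune_kernel

width, height = 4096, 4096
filter_size = 17
input_image = np.random.randn((height + filter_size) * (width + filter_size)).astype(np.float32)
conv_filter = np.random.randn(filter_size * filter_size).astype(np.float32)
output_image = np.zeros(height * width, dtype=np.float32)

tune_params = {
    "block_size_x": [16, 32, 48, 64, 128],
    "block_size_y": [1, 2, 4, 8, 16],
}

results, env = tune_kernel("convolution_kernel", "convolution.cu",
                           (width, height),
                           [output_image, input_image, conv_filter],
                           tune_params)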

      [ ]:
      @@ -311,7 +313,7 @@ 

      Tuning 2D Convolution

      -

      More tunable parameters

      +

      More tunable parameters

      I promised that we would use more tunable parameters than just the thread block dimensions. Our 2D Convolution kernel also supports tiling factors in the x and y dimensions. Tiling factors indicate that the amount of work performed by each thread block in a particular dimension is increased by a certain factor.
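As a sketch, tiling factors are added to the search space simply as extra entries in tune_params; the names tile_size_x and tile_size_y are an assumption about what the kernel code exposes, and every combination with the block sizes becomes a candidate configuration.

tune_params = {
    "block_size_x": [16, 32, 64, 128],
    "block_size_y": [1, 2, 4, 8, 16],
    "tile_size_x": [1, 2, 4],   # assumed tiling parameter names
    "tile_size_y": [1, 2, 4],
}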

      [ ]:
      diff --git a/latest/correctness.html b/latest/correctness.html
      index ed27c4544..8bc96a552 100644
      --- a/latest/correctness.html
      +++ b/latest/correctness.html
      @@ -1,19 +1,21 @@
       
      -
      +
       
      -  
      +  
       
         
         Correctness Verification — Kernel Tuner 1.0 documentation
      -      
      -      
      +      
      +      
      +
      +  
         
         
      -        
      -        
      -        
      +        
      +        
      +        
               
           
           
      @@ -103,7 +105,7 @@
                  
      -

      Correctness Verification

      +

      Correctness Verification

      Whenever you optimize a program for performance, it is very important to ensure that the program still produces the correct output. What good is a program that is fast but not correct?
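One way to do this in Kernel Tuner is the answer= argument of tune_kernel: a list with the expected value for each kernel argument, where None marks arguments that should not be checked. The sketch below is illustrative; the kernel, data, and tolerance are made up.

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

n = np.int32(1_000_000)
a = np.random.randn(n).astype(np.float32)
b = np.random.randn(n).astype(np.float32)
c = np.zeros_like(a)

reference = [a + b, None, None, None]  # expected output for c, other arguments unchecked

tune_kernel("vector_add", kernel_string, n, [c, a, b, n],
            {"block_size_x": [64, 128, 256]},
            answer=reference, atol=1e-6)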

      diff --git a/latest/design.html b/latest/design.html index 46d143adf..1edc725af 100644 --- a/latest/design.html +++ b/latest/design.html @@ -1,19 +1,21 @@ - + - + Design documentation — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -216,7 +218,7 @@
      -

      Design documentation

      +

      Design documentation

      This section provides detailed information about the design and internals of the Kernel Tuner. This information is mostly relevant for developers.

      The Kernel Tuner is designed to be extensible and support @@ -260,72 +262,72 @@ discussed above. For the documentation of the user API see the API Documentation.

      -

      Strategies

      +

      Strategies

      Strategies are explained in Optimization strategies.

      Many of the strategies use helper functions that are collected in kernel_tuner.strategies.common.

      -

      kernel_tuner.strategies.common

      +

      kernel_tuner.strategies.common

      -kernel_tuner.strategies.common.get_options(strategy_options, options)
      +kernel_tuner.strategies.common.get_options(strategy_options, options)

      Get the strategy-specific options or their defaults from user-supplied strategy_options.

      -kernel_tuner.strategies.common.get_strategy_docstring(name, strategy_options)
      +kernel_tuner.strategies.common.get_strategy_docstring(name, strategy_options)

      Generate docstring for a ‘tune’ method of a strategy.

      -kernel_tuner.strategies.common.make_strategy_options_doc(strategy_options)
      +kernel_tuner.strategies.common.make_strategy_options_doc(strategy_options)

      Generate documentation for the supported strategy options and their defaults.

      -kernel_tuner.strategies.common.scale_from_params(params, tune_params, eps)
      +kernel_tuner.strategies.common.scale_from_params(params, tune_params, eps)

      Helper func to do the inverse of the ‘unscale’ function.

      -kernel_tuner.strategies.common.setup_method_arguments(method, bounds)
      +kernel_tuner.strategies.common.setup_method_arguments(method, bounds)

      Prepare method specific arguments.

      -kernel_tuner.strategies.common.setup_method_options(method, tuning_options)
      +kernel_tuner.strategies.common.setup_method_options(method, tuning_options)

      Prepare method specific options.

      -kernel_tuner.strategies.common.snap_to_nearest_config(x, tune_params)
      +kernel_tuner.strategies.common.snap_to_nearest_config(x, tune_params)

      Helper func that for each param selects the closest actual value.

      -kernel_tuner.strategies.common.unscale_and_snap_to_nearest(x, tune_params, eps)
      +kernel_tuner.strategies.common.unscale_and_snap_to_nearest(x, tune_params, eps)

      Helper func that snaps a scaled variable to the nearest config.

      -

      Runners

      +

      Runners

      -

      kernel_tuner.runners.sequential.SequentialRunner

      +

      kernel_tuner.runners.sequential.SequentialRunner

      -class kernel_tuner.runners.sequential.SequentialRunner(kernel_source, kernel_options, device_options, iterations, observers)
      +class kernel_tuner.runners.sequential.SequentialRunner(kernel_source, kernel_options, device_options, iterations, observers)

      SequentialRunner is used for tuning with a single process/thread.

      -__init__(kernel_source, kernel_options, device_options, iterations, observers)
      +__init__(kernel_source, kernel_options, device_options, iterations, observers)

      Instantiate the SequentialRunner.

      Parameters:
      @@ -343,7 +345,7 @@

      kernel_tuner.runners.sequential.SequentialRunner
      -run(parameter_space, tuning_options)
      +run(parameter_space, tuning_options)

      Iterate through the entire parameter space using a single Python process.

      Parameters:
      @@ -367,14 +369,14 @@

      kernel_tuner.runners.sequential.SequentialRunner -

      kernel_tuner.runners.sequential.SimulationRunner

      +

      kernel_tuner.runners.sequential.SimulationRunner

      -class kernel_tuner.runners.simulation.SimulationRunner(kernel_source, kernel_options, device_options, iterations, observers)
      +class kernel_tuner.runners.simulation.SimulationRunner(kernel_source, kernel_options, device_options, iterations, observers)

      SimulationRunner is used for tuning with a single process/thread.

      -__init__(kernel_source, kernel_options, device_options, iterations, observers)
      +__init__(kernel_source, kernel_options, device_options, iterations, observers)

      Instantiate the SimulationRunner.

      Parameters:
      @@ -392,7 +394,7 @@

      kernel_tuner.runners.sequential.SimulationRunner
      -run(parameter_space, tuning_options)
      +run(parameter_space, tuning_options)

      Iterate through the entire parameter space using a single Python process.

      Parameters:
      @@ -417,16 +419,16 @@

      kernel_tuner.runners.sequential.SimulationRunner -

      Device Interfaces

      +

      Device Interfaces

      -

      kernel_tuner.core.DeviceInterface

      +

      kernel_tuner.core.DeviceInterface

      -class kernel_tuner.core.DeviceInterface(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)
      +class kernel_tuner.core.DeviceInterface(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)

      Class that offers a High-Level Device Interface to the rest of the Kernel Tuner

      -__init__(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)
      +__init__(kernel_source, device=0, platform=0, quiet=False, compiler=None, compiler_options=None, iterations=7, observers=None)

      Instantiate the DeviceInterface, based on language in kernel source

      Parameters:
      @@ -450,91 +452,91 @@

      kernel_tuner.core.DeviceInterface
      -benchmark(func, gpu_args, instance, verbose, objective, skip_nvml_setting=False)
      +benchmark(func, gpu_args, instance, verbose, objective, skip_nvml_setting=False)

      Benchmark the kernel instance.

      -benchmark_continuous(func, gpu_args, threads, grid, result, duration)
      +benchmark_continuous(func, gpu_args, threads, grid, result, duration)

      Benchmark continuously for at least ‘duration’ seconds

      -benchmark_default(func, gpu_args, threads, grid, result)
      +benchmark_default(func, gpu_args, threads, grid, result)

      Benchmark one kernel execution at a time.

      -check_kernel_output(func, gpu_args, instance, answer, atol, verify, verbose)
      +check_kernel_output(func, gpu_args, instance, answer, atol, verify, verbose)

      runs the kernel once and checks the result against answer

      -compile_kernel(instance, verbose)
      +compile_kernel(instance, verbose)

      compile the kernel for this specific instance

      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      adds constant memory arguments to the most recently compiled module

      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      adds shared memory arguments to the most recently compiled module

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      adds texture memory arguments to the most recently compiled module

      -create_kernel_instance(kernel_source, kernel_options, params, verbose)
      +create_kernel_instance(kernel_source, kernel_options, params, verbose)

      create kernel instance from kernel source, parameters, problem size, grid divisors, and so on

      -get_environment()
      +get_environment()

      Return dictionary with information about the environment

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      perform a device to host memory copy

      -static preprocess_gpu_arguments(old_arguments, params)
      +static preprocess_gpu_arguments(old_arguments, params)

      Get a flat list of arguments based on the configuration given by params

      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      ready argument list to be passed to the kernel, allocates gpu mem if necessary

      -run_kernel(func, gpu_args, instance)
      +run_kernel(func, gpu_args, instance)

      Run a compiled kernel instance on a device

      -set_nvml_parameters(instance)
      +set_nvml_parameters(instance)

      Set the NVML parameters. Avoids setting time leaking into benchmark time.

      @@ -542,14 +544,14 @@

      kernel_tuner.core.DeviceInterface -

      kernel_tuner.backends.pycuda.PyCudaFunctions

      +

      kernel_tuner.backends.pycuda.PyCudaFunctions

      -class kernel_tuner.backends.pycuda.PyCudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.pycuda.PyCudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the CUDA functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate PyCudaFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -567,7 +569,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -588,7 +590,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -603,13 +605,13 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -621,13 +623,13 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -641,7 +643,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -655,7 +657,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -670,7 +672,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -689,7 +691,7 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -709,19 +711,19 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -729,14 +731,14 @@

      kernel_tuner.backends.pycuda.PyCudaFunctions -

      kernel_tuner.backends.cupy.CupyFunctions

      +

      kernel_tuner.backends.cupy.CupyFunctions

      -class kernel_tuner.backends.cupy.CupyFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.cupy.CupyFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the Cupy functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate CupyFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -754,7 +756,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -775,7 +777,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -790,13 +792,13 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -808,13 +810,13 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -828,7 +830,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -842,7 +844,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -857,7 +859,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -876,7 +878,7 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -896,19 +898,19 @@

      kernel_tuner.backends.cupy.CupyFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -916,14 +918,14 @@

      kernel_tuner.backends.cupy.CupyFunctions -

      kernel_tuner.backends.nvcuda.CudaFunctions

      +

      kernel_tuner.backends.nvcuda.CudaFunctions

      -class kernel_tuner.backends.nvcuda.CudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.nvcuda.CudaFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the Cuda functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate CudaFunctions object used for interacting with the CUDA device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -943,7 +945,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the CUDA compiler to compile the kernel, return the device function.

      Parameters:
      @@ -964,7 +966,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -979,13 +981,13 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Adds texture memory arguments to the most recently compiled module.

      Parameters:
      @@ -997,13 +999,13 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -static memcpy_dtoh(dest, src)
      +static memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1017,7 +1019,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -static memcpy_htod(dest, src)
      +static memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1031,7 +1033,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -static memset(allocation, value, size)
      +static memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1046,7 +1048,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -1065,7 +1067,7 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the CUDA kernel passed as ‘func’.

      Parameters:
      @@ -1085,19 +1087,19 @@

      kernel_tuner.backends.nvcuda.CudaFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -static synchronize()
      +static synchronize()

      Halts execution until device has finished its tasks.

      @@ -1105,14 +1107,14 @@

      kernel_tuner.backends.nvcuda.CudaFunctions -

      kernel_tuner.backends.opencl.OpenCLFunctions

      +

      kernel_tuner.backends.opencl.OpenCLFunctions

      -class kernel_tuner.backends.opencl.OpenCLFunctions(device=0, platform=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.opencl.OpenCLFunctions(device=0, platform=0, iterations=7, compiler_options=None, observers=None)

Class that groups the OpenCL functions and maintains some state about the device.

      -__init__(device=0, platform=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, platform=0, iterations=7, compiler_options=None, observers=None)

      Creates OpenCL device context and reads device properties.

      Parameters:
      @@ -1126,7 +1128,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the OpenCL compiler to compile the kernel, return the device function.

      Parameters:
      @@ -1147,31 +1149,31 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      This method must implement the allocation and copy of constant memory to the GPU.

      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      This method must implement the dynamic allocation of shared memory on the GPU.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      This method must implement the allocation and copy of texture memory to the GPU.

      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1185,7 +1187,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1199,7 +1201,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -memset(buffer, value, size)
      +memset(buffer, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1214,7 +1216,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the kernel, allocates gpu mem.

      Parameters:
      @@ -1233,7 +1235,7 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -run_kernel(func, gpu_args, threads, grid)
      +run_kernel(func, gpu_args, threads, grid)

      Runs the OpenCL kernel passed as ‘func’.

      Parameters:
      @@ -1253,21 +1255,21 @@

      kernel_tuner.backends.opencl.OpenCLFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      In OpenCL the event is created when the kernel is launched

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      In OpenCL the event is created when the kernel is launched

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -1275,14 +1277,14 @@

      kernel_tuner.backends.opencl.OpenCLFunctions -

      kernel_tuner.backends.compiler.CompilerFunctions

      +

      kernel_tuner.backends.compiler.CompilerFunctions

      -class kernel_tuner.backends.compiler.CompilerFunctions(iterations=7, compiler_options=None, compiler=None, observers=None)
      +class kernel_tuner.backends.compiler.CompilerFunctions(iterations=7, compiler_options=None, compiler=None, observers=None)

      Class that groups the code for running and compiling C functions

      -__init__(iterations=7, compiler_options=None, compiler=None, observers=None)
      +__init__(iterations=7, compiler_options=None, compiler=None, observers=None)

      instantiate CFunctions object used for interacting with C code

      Parameters:
      @@ -1293,13 +1295,13 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -cleanup_lib()
      +cleanup_lib()

      unload the previously loaded shared library

      -compile(kernel_instance)
      +compile(kernel_instance)

      call the C compiler to compile the kernel, return the function

      Parameters:
      @@ -1317,14 +1319,14 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise

      C backend does not support asynchronous launches

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      a simple memcpy copying from an Argument to a numpy array

      Parameters:
      @@ -1338,7 +1340,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      a simple memcpy copying from a numpy array to an Argument

      Parameters:
      @@ -1352,7 +1354,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      set the memory in allocation to the value in value

      Parameters:
      @@ -1367,7 +1369,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      ready argument list to be passed to the C function

      Parameters:
      @@ -1386,7 +1388,7 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -run_kernel(func, c_args, threads, grid, stream=None)
      +run_kernel(func, c_args, threads, grid, stream=None)

      runs the kernel once, returns whatever the kernel returns

      Parameters:
      @@ -1414,21 +1416,21 @@

      kernel_tuner.backends.compiler.CompilerFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement

      C backend does not use events

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement

      C backend does not use events

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks

      C backend does not support asynchronous launches

      @@ -1437,14 +1439,14 @@

      kernel_tuner.backends.compiler.CompilerFunctions -

      kernel_tuner.backends.hip.HipFunctions

      +

      kernel_tuner.backends.hip.HipFunctions

      -class kernel_tuner.backends.hip.HipFunctions(device=0, iterations=7, compiler_options=None, observers=None)
      +class kernel_tuner.backends.hip.HipFunctions(device=0, iterations=7, compiler_options=None, observers=None)

Class that groups the HIP functions and maintains state about the device.

      -__init__(device=0, iterations=7, compiler_options=None, observers=None)
      +__init__(device=0, iterations=7, compiler_options=None, observers=None)

      Instantiate HipFunctions object used for interacting with the HIP device.

      Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -1462,7 +1464,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -compile(kernel_instance)
      +compile(kernel_instance)

      Call the HIP compiler to compile the kernel, return the function.

      Parameters:
      @@ -1480,7 +1482,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -copy_constant_memory_args(cmem_args)
      +copy_constant_memory_args(cmem_args)

      Adds constant memory arguments to the most recently compiled module.

      Parameters:
      @@ -1495,25 +1497,25 @@

      kernel_tuner.backends.hip.HipFunctions
      -copy_shared_memory_args(smem_args)
      +copy_shared_memory_args(smem_args)

      Add shared memory arguments to the kernel.

      -copy_texture_memory_args(texmem_args)
      +copy_texture_memory_args(texmem_args)

      Copy texture memory arguments. Not yet implemented.

      -kernel_finished()
      +kernel_finished()

      Returns True if the kernel has finished, False otherwise.

      -memcpy_dtoh(dest, src)
      +memcpy_dtoh(dest, src)

      Perform a device to host memory copy.

      Parameters:
      @@ -1527,7 +1529,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -memcpy_htod(dest, src)
      +memcpy_htod(dest, src)

      Perform a host to device memory copy.

      Parameters:
      @@ -1541,7 +1543,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -memset(allocation, value, size)
      +memset(allocation, value, size)

      Set the memory in allocation to the value in value.

      Parameters:
      @@ -1556,7 +1558,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -ready_argument_list(arguments)
      +ready_argument_list(arguments)

      Ready argument list to be passed to the HIP function.

      Parameters:
      @@ -1575,7 +1577,7 @@

      kernel_tuner.backends.hip.HipFunctions
      -run_kernel(func, gpu_args, threads, grid, stream=None)
      +run_kernel(func, gpu_args, threads, grid, stream=None)

      Runs the HIP kernel passed as ‘func’.

      Parameters:
      @@ -1595,19 +1597,19 @@

      kernel_tuner.backends.hip.HipFunctions
      -start_event()
      +start_event()

      Records the event that marks the start of a measurement.

      -stop_event()
      +stop_event()

      Records the event that marks the end of a measurement.

      -synchronize()
      +synchronize()

      Halts execution until device has finished its tasks.

      @@ -1616,32 +1618,32 @@

      kernel_tuner.backends.hip.HipFunctions -

      Util Functions

      +

      Util Functions

      -

      kernel_tuner.util

      +

      kernel_tuner.util

      Module for kernel tuner utility functions.

      -class kernel_tuner.util.CompilationFailedConfig
      +class kernel_tuner.util.CompilationFailedConfig
      -class kernel_tuner.util.ErrorConfig
      +class kernel_tuner.util.ErrorConfig
      -class kernel_tuner.util.InvalidConfig
      +class kernel_tuner.util.InvalidConfig
      -class kernel_tuner.util.NpEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
      +class kernel_tuner.util.NpEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)

      Class we use for dumping Numpy objects to JSON.

      -default(obj)
      +default(obj)

      Implement this method in a subclass such that it returns a serializable object for o, or calls the base implementation (to raise a TypeError).

      @@ -1664,138 +1666,138 @@

      Util Functions
      -class kernel_tuner.util.RuntimeFailedConfig
      +class kernel_tuner.util.RuntimeFailedConfig

      -exception kernel_tuner.util.SkippableFailure
      +exception kernel_tuner.util.SkippableFailure

      Exception used to raise when compiling or launching a kernel fails for a reason that can be expected.

      -exception kernel_tuner.util.StopCriterionReached
      +exception kernel_tuner.util.StopCriterionReached

      Exception thrown when a stop criterion has been reached.

      -kernel_tuner.util.check_argument_list(kernel_name, kernel_string, args)
      +kernel_tuner.util.check_argument_list(kernel_name, kernel_string, args)

Raise an exception if the kernel arguments do not match the host arguments.

      -kernel_tuner.util.check_argument_type(dtype, kernel_argument)
      +kernel_tuner.util.check_argument_type(dtype, kernel_argument)

      Check if the numpy.dtype matches the type used in the code.

      -kernel_tuner.util.check_restriction(restrict, params: dict) bool
      +kernel_tuner.util.check_restriction(restrict, params: dict) bool

      Check whether a configuration meets a search space restriction.

      -kernel_tuner.util.check_restrictions(restrictions, params: dict, verbose: bool) bool
      +kernel_tuner.util.check_restrictions(restrictions, params: dict, verbose: bool) bool

      Check whether a configuration meets the search space restrictions.
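For context, restrictions are usually written as boolean expressions over the tunable parameters and passed to tune_kernel via its restrictions argument; a small hypothetical sketch:

# configurations for which any expression evaluates to False are skipped
tune_params = {"block_size_x": [16, 32, 64, 128, 256],
               "block_size_y": [1, 2, 4, 8, 16]}
restrictions = ["block_size_x * block_size_y <= 1024",
                "block_size_x >= block_size_y"]
# these would be passed as tune_kernel(..., restrictions=restrictions)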

      -kernel_tuner.util.check_stop_criterion(to)
      +kernel_tuner.util.check_stop_criterion(to)

      Checks if max_fevals is reached or time limit is exceeded.

      -kernel_tuner.util.check_thread_block_dimensions(params, max_threads, block_size_names=None)
      +kernel_tuner.util.check_thread_block_dimensions(params, max_threads, block_size_names=None)

      Check on maximum thread block dimensions.

      -kernel_tuner.util.check_tune_params_list(tune_params, observers, simulation_mode=False)
      +kernel_tuner.util.check_tune_params_list(tune_params, observers, simulation_mode=False)

      Raise an exception if a tune parameter has a forbidden name.

      -kernel_tuner.util.compile_restrictions(restrictions: list, tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Union[str, constraint.constraints.Constraint, function], list[str]]]
      +kernel_tuner.util.compile_restrictions(restrictions: list, tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[str | Constraint | LambdaType, list[str]]]

      Parses restrictions from a list of strings into a list of strings, Functions, or Constraints (if try_to_constraint) and parameters used, or a single Function if monolithic is true.

      -kernel_tuner.util.config_valid(config, tuning_options, max_threads)
      +kernel_tuner.util.config_valid(config, tuning_options, max_threads)

      Combines restrictions and a check on the max thread block dimension to check config validity.

      -kernel_tuner.util.convert_constraint_restriction(restrict: Constraint)
      +kernel_tuner.util.convert_constraint_restriction(restrict: Constraint)

      Convert the python-constraint to a function for backwards compatibility.

-kernel_tuner.util.correct_open_cache(cache, open_cache=True)
+kernel_tuner.util.correct_open_cache(cache, open_cache=True)

-if cache file was not properly closed, pretend it was properly closed
+If cache file was not properly closed, pretend it was properly closed.

      -kernel_tuner.util.cuda_error_check(error)
      +kernel_tuner.util.cuda_error_check(error)

      Checking the status of CUDA calls using the NVIDIA cuda-python backend.

      -kernel_tuner.util.delete_temp_file(filename)
      +kernel_tuner.util.delete_temp_file(filename)

Delete a temporary file; don’t complain if it no longer exists.

      -kernel_tuner.util.detect_language(kernel_string)
      +kernel_tuner.util.detect_language(kernel_string)

      Attempt to detect language from the kernel_string.

      -kernel_tuner.util.dump_cache(obj: str, tuning_options)
      +kernel_tuner.util.dump_cache(obj: str, tuning_options)

Dumps a string in the cache; this omits the several checks of store_cache() to speed up the process (with great power comes great responsibility!).

      -kernel_tuner.util.get_best_config(results, objective, objective_higher_is_better=False)
      +kernel_tuner.util.get_best_config(results, objective, objective_higher_is_better=False)

      Returns the best configuration from a list of results according to some objective.

      -kernel_tuner.util.get_config_string(params, keys=None, units=None)
      +kernel_tuner.util.get_config_string(params, keys=None, units=None)

      Return a compact string representation of a measurement.

      -kernel_tuner.util.get_grid_dimensions(current_problem_size, params, grid_div, block_size_names)
      +kernel_tuner.util.get_grid_dimensions(current_problem_size, params, grid_div, block_size_names)

      Compute grid dims based on problem sizes and listed grid divisors.

      -kernel_tuner.util.get_instance_string(params)
      +kernel_tuner.util.get_instance_string(params)

Combine the parameters into a string, mostly used for debug output; use of a dict is advised.

      -kernel_tuner.util.get_kernel_string(kernel_source, params=None)
      +kernel_tuner.util.get_kernel_string(kernel_source, params=None)

      Retrieve the kernel source and return as a string.

      This function processes the passed kernel_source argument, which could be a function, a string with a filename, or just a string with code already.

      @@ -1825,43 +1827,43 @@

      Util Functions
      -kernel_tuner.util.get_problem_size(problem_size, params)
      +kernel_tuner.util.get_problem_size(problem_size, params)

      Compute current problem size.

      -kernel_tuner.util.get_smem_args(smem_args, params)
      +kernel_tuner.util.get_smem_args(smem_args, params)

      Return a dict with kernel instance specific size.

      -kernel_tuner.util.get_temp_filename(suffix=None)
      +kernel_tuner.util.get_temp_filename(suffix=None)

      Return a string in the form of temp_X, where X is a large integer.

      -kernel_tuner.util.get_thread_block_dimensions(params, block_size_names=None)
      +kernel_tuner.util.get_thread_block_dimensions(params, block_size_names=None)

      Thread block size from tuning params, currently using convention.

      -kernel_tuner.util.get_total_timings(results, env, overhead_time)
      +kernel_tuner.util.get_total_timings(results, env, overhead_time)

      Sum all timings and put their totals in the env.

      -kernel_tuner.util.looks_like_a_filename(kernel_source)
      +kernel_tuner.util.looks_like_a_filename(kernel_source)

      Attempt to detect whether source code or a filename was passed.

      -kernel_tuner.util.normalize_verify_function(v)
      +kernel_tuner.util.normalize_verify_function(v)

      Normalize a user-specified verify function.

The user-specified function has two required positional arguments (answer, result_host), and an optional keyword (or keyword-only) argument atol. We normalize it to always accept
@@ -1871,13 +1873,13 @@

      Util Functions
      -kernel_tuner.util.parse_restrictions(restrictions: list[str], tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Union[constraint.constraints.Constraint, str], list[str]]]
      +kernel_tuner.util.parse_restrictions(restrictions: list[str], tune_params: dict, monolithic=False, try_to_constraint=True) list[tuple[Constraint | str, list[str]]]

      Parses restrictions from a list of strings into compilable functions and constraints, or a single compilable function (if monolithic is True). Returns a list of tuples of (strings or constraints) and parameters.

      -kernel_tuner.util.prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines)
      +kernel_tuner.util.prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines)

      Prepare kernel string for compilation.

      Prepends the kernel with a series of C preprocessor defines specific to this kernel instance:
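The full list of defines falls outside this hunk; as a rough illustration of the idea (exact names and values depend on the kernel instance), the effect is comparable to:

# illustrative sketch only: each tunable parameter of the current
# configuration becomes a C preprocessor define prepended to the source
kernel_string = "__global__ void vector_add(float *c, float *a, float *b, int n) { /* ... */ }"
params = {"block_size_x": 128}
defines = "".join(f"#define {name} {value}\n" for name, value in params.items())
kernel_with_defines = defines + kernel_string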

      @@ -1916,19 +1918,19 @@

      Util Functions
      -kernel_tuner.util.print_config(config, tuning_options, runner)
      +kernel_tuner.util.print_config(config, tuning_options, runner)

      Print the configuration string with tunable parameters and benchmark results.

      -kernel_tuner.util.print_config_output(tune_params, params, quiet, metrics, units)
      +kernel_tuner.util.print_config_output(tune_params, params, quiet, metrics, units)

      Print the configuration string with tunable parameters and benchmark results.

      -kernel_tuner.util.process_cache(cache, kernel_options, tuning_options, runner)
      +kernel_tuner.util.process_cache(cache, kernel_options, tuning_options, runner)

      Cache file for storing tuned configurations.

      the cache file is stored using JSON and uses the following format:

      { device_name: "name of device"
      @@ -1950,7 +1952,7 @@ 

      Util Functions
      -kernel_tuner.util.process_metrics(params, metrics)
      +kernel_tuner.util.process_metrics(params, metrics)

      Process user-defined metrics for derived benchmark results.

Metrics must be a dictionary to support composable metrics. The dictionary keys describe the name given to this user-defined metric and will be used as the key in the results dictionaries
@@ -1983,43 +1985,43 @@

      Util Functions
      -kernel_tuner.util.read_cache(cache, open_cache=True)
      +kernel_tuner.util.read_cache(cache, open_cache=True)

      Read the cachefile into a dictionary, if open_cache=True prepare the cachefile for appending.

      -kernel_tuner.util.read_file(filename)
      +kernel_tuner.util.read_file(filename)

      Return the contents of the file named filename or None if file not found.

      -kernel_tuner.util.replace_param_occurrences(string: str, params: dict)
      +kernel_tuner.util.replace_param_occurrences(string: str, params: dict)

      Replace occurrences of the tuning params with their current value.

      -kernel_tuner.util.setup_block_and_grid(problem_size, grid_div, params, block_size_names=None)
      +kernel_tuner.util.setup_block_and_grid(problem_size, grid_div, params, block_size_names=None)

      Compute problem size, thread block and grid dimensions for this kernel.

      -kernel_tuner.util.store_cache(key, params, tuning_options)
      +kernel_tuner.util.store_cache(key, params, tuning_options)

      Stores a new entry (key, params) to the cachefile.

      -kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc(compute_capability: str) str
      +kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc(compute_capability: str) str

Returns a valid Compute Capability for NVRTC --gpu-architecture=, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.

      -kernel_tuner.util.write_file(filename, string)
      +kernel_tuner.util.write_file(filename, string)

      Dump the contents of string to a file called filename.

diff --git a/latest/dev-environment.html b/latest/dev-environment.html
index e27f6de7c..4cc3263db 100644
--- a/latest/dev-environment.html
+++ b/latest/dev-environment.html
@@ -1,19 +1,21 @@
Development environment — Kernel Tuner 1.0 documentation
@@ -110,12 +112,12 @@
      -

      Development environment

      +

      Development environment

      The following steps help you set up a full development environment. These steps are only needed for core developers of Kernel Tuner who need to test against multiple Python versions or change dependencies of Kernel Tuner.

      For small changes to the code, please see the simplified instructions in the Simple development setup.

      -

      Local setup

      +

      Local setup

      Steps with sudo access (e.g. on a local device):

      1. Clone the git repository to the desired location: git clone https://github.com/KernelTuner/kernel_tuner.git, and cd to it.

      2. @@ -138,12 +140,12 @@

        Local setup
        Install the required Python versions:
        • On some systems, additional packages may be needed to build Python versions. For example on Ubuntu: sudo apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev lzma.

        • -
        • Install the Python versions with: pyenv install 3.8 3.9 3.10 3.11. The reason we’re installing all these versions as opposed to just one, is so we can test against all supported Python versions.

        • +
        • Install the Python versions with: pyenv install 3.9 3.10 3.11 3.12. The reason we’re installing all these versions as opposed to just one, is so we can test against all supported Python versions.

      -
    • Set the Python versions so they can be found: pyenv local 3.8 3.9 3.10 3.11 (replace local with global when not using the virtualenv).

    • +
    • Set the Python versions so they can be found: pyenv local 3.9 3.10 3.11 3.12 (replace local with global when not using the virtualenv).

    • Setup a local virtual environment in the folder: pyenv virtualenv 3.11 kerneltuner (or whatever environment name and Python version you prefer).

    • Install Poetry.
        @@ -184,7 +186,7 @@

        Local setup -

        Cluster setup

        +

        Cluster setup

        Steps without sudo access (e.g. on a cluster):

        1. Clone the git repository to the desired location: git clone https://github.com/KernelTuner/kernel_tuner.git.

        2. @@ -263,7 +265,7 @@

          Cluster setup -

          Running tests

          +

          Running tests

          To run the tests you can use nox (to run against all supported Python versions in isolated environments) and pytest (to run against the local Python version, see below) in the top-level directory. For full coverage, make Nox use the additional tests (such as cupy and cuda-python) with nox -- additional-tests.

The Nox isolated environments can take up to 1 gigabyte in size, so users tight on disk space can run nox with the small-disk option. This removes the other environment caches before each session is run (note that this will take longer to run). A better option would be to change the location environments are stored in with envdir in the noxsettings.toml file.

          @@ -284,7 +286,7 @@

          Running tests -

          Building documentation

          +

          Building documentation

Documentation is located in the doc/ directory. This is where you can type make html to generate the html pages in the doc/build/html directory. The source files used for building the documentation are located in
diff --git a/latest/diffusion.html b/latest/diffusion.html
index 0aec5079d..ae5e821ff 100644
--- a/latest/diffusion.html
+++ b/latest/diffusion.html
@@ -1,20 +1,22 @@
Diffusion — Kernel Tuner 1.0 documentation
@@ -115,7 +117,7 @@

          -

          Diffusion

          +

          Diffusion

          This guide is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this guide, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create a CUDA kernel that performs the same computation, but on the GPU. Once we have a CUDA kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our CUDA kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -123,7 +125,7 @@

Diffusion
GitHub repository. Install using pip install .[tutorial,cuda] and you’re ready to go! You can start the guide by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -159,7 +161,7 @@

          Diffusion

          -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
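The notebook cell itself is not part of this diff; as a rough sketch (assuming a NumPy-style implementation, which may differ from the guide’s own code), one explicit time step could look like:

import numpy as np

def diffuse_step(field, dt=0.225, dx=1.0, D=1.0):
    # one explicit finite-difference step of the 2D diffusion equation,
    # using the standard 5-point Laplacian stencil on the interior points
    new = field.copy()
    new[1:-1, 1:-1] = field[1:-1, 1:-1] + dt * D / dx**2 * (
        field[2:, 1:-1] + field[:-2, 1:-1] +
        field[1:-1, 2:] + field[1:-1, :-2] - 4.0 * field[1:-1, 1:-1])
    return new

# example: a sheet at one degree everywhere with a single hotspot
field = np.ones((256, 256), dtype=np.float32)
field[128, 128] = 1e3
field = diffuse_step(field)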
          @@ -262,7 +264,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this guide is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this guide, we’ll use the CUDA programming model to implement our kernels.

          If you prefer OpenCL over CUDA, don’t worry. Everything in this guide applies as much to OpenCL as it does to CUDA. But we will use CUDA for our examples, and CUDA terminology in the text.

          @@ -391,7 +393,7 @@

Computing on the GPU
Also, if you think the Python boilerplate code to call a GPU kernel was a bit messy, we’ve got good news for you! From now on, we’ll only use the Kernel Tuner to compile and benchmark GPU kernels, which we can do with much cleaner Python code.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.
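As a sketch of what such a tuning call looks like (the kernel below is a stand-in, not the guide’s own diffusion kernel, and the parameter values are only examples):

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void diffuse_kernel(float *u_new, float *u, int nx, int ny) {
    int x = blockIdx.x * block_size_x + threadIdx.x;
    int y = blockIdx.y * block_size_y + threadIdx.y;
    if (x > 0 && x < nx-1 && y > 0 && y < ny-1) {
        int i = y * nx + x;
        u_new[i] = u[i] + 0.225f * (u[i-1] + u[i+1] + u[i-nx] + u[i+nx] - 4.0f * u[i]);
    }
}
"""

nx, ny = np.int32(4096), np.int32(4096)
u = np.random.randn(int(nx) * int(ny)).astype(np.float32)
u_new = np.zeros_like(u)
args = [u_new, u, nx, ny]

# every combination of these values is compiled and benchmarked
tune_params = {"block_size_x": [16, 32, 48, 64, 128],
               "block_size_y": [1, 2, 4, 8, 16]}

results, env = tune_kernel("diffuse_kernel", kernel_string, (4096, 4096), args, tune_params)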

          @@ -487,7 +489,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using Shared Memory

          +

          Using Shared Memory

Shared memory is a special type of memory available in CUDA. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -570,7 +572,7 @@

          Using Shared Memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -876,7 +878,7 @@

          Tiling GPU Code -

          Storing the results

          +

          Storing the results

While it’s nice that the Kernel Tuner prints the tuning results to stdout, it’s not that great if we’d have to parse what is printed to get the results. That is why tune_kernel() returns a data structure that holds all the results. We’ve actually already used this data in the above bit of Python code.

tune_kernel returns a list of dictionaries, where each benchmarked kernel is represented by a dictionary containing the tunable parameters for that particular kernel configuration and one more entry called ‘time’. The list-of-dictionaries format is very flexible and can easily be converted to other formats that are easy to parse, like json or csv, for further analysis.

          You can execute the following code block to store the tuning results to both a json and a csv file (if you have Pandas installed).
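That cell is outside this diff hunk; a sketch of what such a block can look like (filenames are illustrative):

import json
import pandas as pd
from kernel_tuner.util import NpEncoder

# "results" is the list of dictionaries returned by tune_kernel above
with open("diffuse_results.json", "w") as fh:
    json.dump(results, fh, cls=NpEncoder, indent=2)

# a list of flat dictionaries maps directly onto a DataFrame
pd.DataFrame(results).to_csv("diffuse_results.csv", index=False)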

diff --git a/latest/diffusion_opencl.html b/latest/diffusion_opencl.html
index 286e59da2..b72ce8782 100644
--- a/latest/diffusion_opencl.html
+++ b/latest/diffusion_opencl.html
@@ -1,20 +1,22 @@
Tutorial: From physics to tuned GPU kernels — Kernel Tuner 1.0 documentation
@@ -104,7 +106,7 @@
          -

          Tutorial: From physics to tuned GPU kernels

          +

          Tutorial: From physics to tuned GPU kernels

          This tutorial is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this tutorial, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create an OpenCL kernel that performs the same computation, but on the GPU. Once we have a OpenCL kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our OpenCL kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -112,7 +114,7 @@

Tutorial: From physics to tuned GPU kernels
GitHub repository. Install using pip install .[tutorial,opencl] and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -148,7 +150,7 @@

          Diffusion -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
          @@ -249,7 +251,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this tutorial is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this tutorial, we’ll use the OpenCL programming model to implement our kernels.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.

          @@ -453,7 +455,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using Shared (local) Memory

          +

          Using Shared (local) Memory

Shared (or local) memory is a special type of memory available in OpenCL. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -535,7 +537,7 @@

          Using Shared (local) Memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -615,7 +617,7 @@

          Tiling GPU Code +
          @@ -727,13 +729,6 @@

          Tiling GPU Code -
          -
          -
          -
           block_size_x=48, block_size_y=4, tile_size_x=4, tile_size_y=4, time=0.6813376
           block_size_x=48, block_size_y=8, tile_size_x=1, tile_size_y=1, time=1.1493952
           block_size_x=48, block_size_y=8, tile_size_x=1, tile_size_y=2, time=0.8444928
          @@ -848,7 +843,7 @@ 

          Tiling GPU Code -

          Storing the results

          +

          Storing the results

While it’s nice that the Kernel Tuner prints the tuning results to stdout, it’s not that great if we’d have to parse what is printed to get the results. That is why tune_kernel() returns a data structure that holds all the results. We’ve actually already used this data in the above bit of Python code.

tune_kernel returns a list of dictionaries, where each benchmarked kernel is represented by a dictionary containing the tunable parameters for that particular kernel configuration and one more entry called ‘time’. The list-of-dictionaries format is very flexible and can easily be converted to other formats that are easy to parse, like json or csv, for further analysis.

          You can execute the following code block to store the tuning results to both a json and a csv file (if you have Pandas installed).

diff --git a/latest/diffusion_use_optparam.html b/latest/diffusion_use_optparam.html
index a3d9ccc0d..4e7f19d2e 100644
--- a/latest/diffusion_use_optparam.html
+++ b/latest/diffusion_use_optparam.html
@@ -1,20 +1,22 @@
Tutorial: From physics to tuned GPU kernels — Kernel Tuner 1.0 documentation
@@ -104,7 +106,7 @@
          -

          Tutorial: From physics to tuned GPU kernels

          +

          Tutorial: From physics to tuned GPU kernels

          This tutorial is designed to show you the whole process starting from modeling a physical process to a Python implementation to creating optimized and auto-tuned GPU application using Kernel Tuner.

          In this tutorial, we will use diffusion as an example application.

          We start with modeling the physical process of diffusion, for which we create a simple numerical implementation in Python. Then we create a CUDA kernel that performs the same computation, but on the GPU. Once we have a CUDA kernel, we start using the Kernel Tuner for auto-tuning our GPU application. And finally, we’ll introduce a few code optimizations to our CUDA kernel that will improve performance, but also add more parameters to tune on using the Kernel Tuner.

          @@ -112,7 +114,7 @@

Tutorial: From physics to tuned GPU kernels
GitHub repository. Install the Kernel Tuner and Jupyter Notebooks and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

          -

          Diffusion

          +

          Diffusion

          Put simply, diffusion is the redistribution of something from a region of high concentration to a region of low concentration without bulk motion. The concept of diffusion is widely used in many fields, including physics, chemistry, biology, and many more.

Suppose that we take a metal sheet, in which the temperature is exactly equal to one degree everywhere in the sheet. Now if we were to heat a number of points on the sheet to a very high temperature, say a thousand degrees, in an instant by some method, we could see the heat diffuse from these hotspots to the cooler areas. We are assuming that the metal does not melt. In addition, we will ignore any heat loss from radiation or other causes in this example.

          We can use the diffusion equation to model how the heat diffuses through our metal sheet:

          @@ -148,7 +150,7 @@

          Diffusion -

          Python implementation

          +

          Python implementation

          We can create a Python function that implements the numerical approximation defined in the above equation. For simplicity we’ll use the assumption of a free boundary condition.

          [2]:
          @@ -257,7 +259,7 @@ 

          Python implementation

          -

          Computing on the GPU

          +

          Computing on the GPU

          The next step in this tutorial is to implement a GPU kernel that will allow us to run our problem on the GPU. We store the kernel code in a Python string, because we can directly compile and run the kernel from Python. In this tutorial, we’ll use the CUDA programming model to implement our kernels.

          If you prefer OpenCL over CUDA, don’t worry. Everything in this tutorial applies as much to OpenCL as it does to CUDA. But we will use CUDA for our examples, and CUDA terminology in the text.

          @@ -361,7 +363,7 @@

Computing on the GPU
Also, if you think the Python boilerplate code to call a GPU kernel was a bit messy, we’ve got good news for you! From now on, we’ll only use the Kernel Tuner to compile and benchmark GPU kernels, which we can do with much cleaner Python code.

          -

          Auto-Tuning with the Kernel Tuner

          +

          Auto-Tuning with the Kernel Tuner

          Remember that previously we’ve set the thread block dimensions to 16 by 16. But how do we actually know if that is the best performing setting? That is where auto-tuning comes into play. Basically, it is very difficult to provide an answer through performance modeling and as such, we’d rather use the Kernel Tuner to compile and benchmark all possible kernel configurations.

          But before we continue, we’ll increase the problem size, because the GPU is very likely underutilized.

          @@ -456,7 +458,7 @@

          Auto-Tuning with the Kernel Tuner -

          Using shared memory

          +

          Using shared memory

Shared memory is a special type of memory available in CUDA. Shared memory can be used by threads within the same thread block to exchange and share values. It is, in fact, one of the very few ways for threads to communicate on the GPU.

The idea is that we’ll try to improve the performance of our kernel by using shared memory as a software-controlled cache. There are already caches on the GPU, but most GPUs only cache accesses to global memory in L2. Shared memory is closer to the multiprocessors where the thread blocks are executed, comparable to an L1 cache.

However, because there are also hardware caches, the performance improvement from this step is not expected to be that great. The more fine-grained control that we get by using a software-managed cache, rather than a hardware-implemented cache, comes at the cost of some instruction overhead. In fact, performance is quite likely to degrade a little. However, this intermediate step is necessary for the next optimization step we have in mind.

          @@ -546,7 +548,7 @@

          Using shared memory -

          Tiling GPU Code

          +

          Tiling GPU Code

One very useful code optimization is called tiling, sometimes also called thread-block-merge. You can look at it in this way: currently we have many thread blocks that together work on the entire domain. If we were to use only half of the number of thread blocks, every thread block would need to double the amount of work it performs to cover the entire domain. However, the threads may be able to reuse part of the data and computation that is required to process a single output element for every element beyond the first.

This is a code optimization because effectively we are reducing the total number of instructions executed by all threads in all thread blocks. So in a way, we are condensing the total instruction stream while keeping all the really necessary compute instructions. More importantly, we are increasing data reuse, where previously these values would have been reused from the cache or, in the worst case, from GPU memory.

          @@ -621,7 +623,7 @@

          Tiling GPU Code +
          @@ -726,13 +728,6 @@

          Tiling GPU Code -
          -
          -
          -
           block_size_x=48, block_size_y=4, tile_size_x=1, tile_size_y=2, time=0.593977594376
           block_size_x=48, block_size_y=4, tile_size_x=1, tile_size_y=4, time=0.49723520875
           block_size_x=48, block_size_y=4, tile_size_x=2, tile_size_y=1, time=0.583270406723
          @@ -828,10 +823,10 @@ 

          Tiling GPU Code -

          Using the best parameters in a production run

          +

          Using the best parameters in a production run

          Now that we have determined which parameters are the best for our problems we can use them to simulate the heat diffusion problem. There are several ways to do so depending on the host language you wish to use.

          -

          Python run

          +

          Python run

To use the optimized parameters in a Python run, we simply have to modify the kernel code to specify which value to use for the block and tile size. There are of course many different ways to achieve this. In simple cases one can define a dictionary of values and replace the strings block_size_i and tile_size_j by their values.
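A small sketch of that approach (parameter values are examples; kernel_string is the CUDA source defined earlier in this tutorial):

from kernel_tuner.util import replace_param_occurrences

# best configuration found by the tuner (values here are only an example)
best_params = {"block_size_x": 32, "block_size_y": 4, "tile_size_x": 4, "tile_size_y": 4}

# every occurrence of a parameter name in the source is replaced by its value
fixed_kernel = replace_param_occurrences(kernel_string, best_params)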

          [18]:
          @@ -927,7 +922,7 @@ 

          Python run -

          C run

          +

          C run

If you wish to incorporate the optimized parameters in the kernel and use it in a C run, you can use an ifndef statement at the beginning of the kernel, as demonstrated in the pseudo code below.

          [ ]:
diff --git a/latest/examples.html b/latest/examples.html
index cde01499e..ed6eab402 100644
--- a/latest/examples.html
+++ b/latest/examples.html
@@ -1,19 +1,21 @@
Kernel Tuner Examples — Kernel Tuner 1.0 documentation
          @@ -122,7 +124,7 @@
             
          -

          Kernel Tuner Examples

          +

          Kernel Tuner Examples

Most of the examples show how to use Kernel Tuner to tune a CUDA, OpenCL, or C kernel, while demonstrating a particular use case of Kernel Tuner.

Except for test_vector_add.py and
@@ -136,7 +138,7 @@

          Below we list the example applications and the features they illustrate.

          -

          Vector Add

          +

          Vector Add

          [CUDA] [CUDA-C++] [OpenCL] [C] [Fortran] [OpenACC-C++] [OpenACC-Fortran]
          • use Kernel Tuner to tune a simple kernel

          • @@ -145,7 +147,7 @@

            Vector Add -

            Stencil

            +

            Stencil

            [CUDA] [OpenCL]
            • use a 2-dimensional problem domain with 2-dimensional thread blocks in a simple and clean example

            • @@ -154,7 +156,7 @@

              Stencil -

              Matrix Multiplication

              +

              Matrix Multiplication

              [CUDA] [OpenCL]
              • pass a filename instead of a string with code

              • @@ -167,12 +169,12 @@

                Matrix Multiplication

          -

          Convolution

          +

          Convolution

          There are several different examples centered around the convolution kernel [CUDA] [OpenCL]

          -

          convolution.py

          +

          convolution.py

          [CUDA] [OpenCL]
          • use tunable parameters for tuning for multiple input sizes

          • @@ -183,7 +185,7 @@

            convolution.py -

            sepconv.py

            +

            sepconv.py

            [CUDA] [OpenCL]
            • use the convolution kernel for separable filters

            • @@ -193,7 +195,7 @@

              sepconv.py -

              convolution_correct.py

              +

              convolution_correct.py

              [CUDA] [OpenCL]
              • use run_kernel to compute a reference answer

              • @@ -203,7 +205,7 @@

                convolution_correct.py

          -

          convolution_streams.py

          +

          convolution_streams.py

          [CUDA]
          • allocate page-locked host memory from Python

          • @@ -217,7 +219,7 @@

            convolution_streams.py

          -

          Reduction

          +

          Reduction

          [CUDA] [OpenCL]
          • use vector types and shuffle instructions (shuffle is only available in CUDA)

          • @@ -230,7 +232,7 @@

            Reduction -

            Sparse Matrix Vector Multiplication

            +

            Sparse Matrix Vector Multiplication

            [CUDA]
            • use scipy to compute a reference answer and verify all benchmarked kernels

            • @@ -240,7 +242,7 @@

              Sparse Matrix Vector Multiplication -

              Point-in-Polygon

              +

              Point-in-Polygon

              [CUDA]
              • overlap transfers with device mapped host memory

              • @@ -250,7 +252,7 @@

                Point-in-Polygon -

                ExpDist

                +

                ExpDist

                [CUDA]
                • in-thread block 2D reduction using CUB library

                • @@ -261,7 +263,7 @@

                  ExpDist -

                  Code Generator

                  +

                  Code Generator

                  [CUDA] [OpenCL]
                  • use a Python function as a code generator

diff --git a/latest/genindex.html b/latest/genindex.html
index 88f5ac7cc..8b6a2852b 100644
--- a/latest/genindex.html
+++ b/latest/genindex.html
@@ -1,18 +1,20 @@
Index — Kernel Tuner 1.0 documentation
diff --git a/latest/grid3d.html b/latest/grid3d.html
index fdeed281d..1156437b5 100644
--- a/latest/grid3d.html
+++ b/latest/grid3d.html
@@ -1,20 +1,22 @@
3D Grid on GPU with Kernel Tuner — Kernel Tuner 1.0 documentation
@@ -104,13 +106,13 @@
                    -

                    3D Grid on GPU with Kernel Tuner

                    +

                    3D Grid on GPU with Kernel Tuner

In this tutorial we are going to see how to map a series of Gaussian functions, each located at a different point on a 3D grid. We are going to optimize the GPU code and compare its performance with the CPU implementation.

                    Note: If you are reading this tutorial on the Kernel Tuner’s documentation pages, note that you can actually run this tutorial as a Jupyter Notebook. Just clone the Kernel Tuner’s GitHub repository. Install the Kernel Tuner and Jupyter Notebooks and you’re ready to go! You can start the tutorial by typing “jupyter notebook” in the “kernel_tuner/doc/source” directory.

                    -

                    Let’s start on the CPU

                    +

                    Let’s start on the CPU

                    Before delving into the GPU implementation, let’s start with a simple CPU implementation of the problem. The problem at hand is to compute the values of the following function

\begin{equation} \nonumber f = \sum_{i=1}^{N}\exp\left(-\beta \sqrt{(x-x_i)^2+(y-y_i)^2+(z-z_i)^2}\right) \end{equation}
@@ -175,7 +177,7 @@

Let’s start on the CPU
Depending on your hardware it might take a few seconds for the calculations above to finish.

                    -

                    Let’s move to the GPU

                    +

                    Let’s move to the GPU

Let’s now see how that will look on the GPU. We first write a kernel that does the same calculation as the above function. As you can see below, the variables block_size_x, block_size_y and block_size_z are not yet defined here. These variables are used to set the number of threads per thread block on the GPU and are the main parameters that we will optimize in this tutorial. During tuning, Kernel Tuner will automatically insert #define statements for these parameters at the top of the kernel code. So for now we don’t have to specify their values.

                    The dimensions of the problem nx, ny, and nz, are the number of grid points in the x, y, and z dimensions. We can again use Kernel Tuner to insert these parameters into the code.

                    @@ -221,7 +223,7 @@

                    Let’s move to the GPU

                    -

                    Tune the kernel

                    +

                    Tune the kernel

                    We can now use the tuner to optimize the thread block dimensions on our GPU. To do so we define the tunable parameters of our kernel using the tune_params dictionary, which assigns to each block size the values we want the tuner to explore. We also use the tunable parameters to insert the domain dimensions nx, ny, and nz.

                    We also define a list containing the arguments of the CUDA function (AddGrid) above. Since we only want to optimize the performance of the kernel we only consider here one center in the middle of the grid. Note that Kernel Tuner needs either numpy.ndarray or numpy.scalar as arguments of the kernel. Hence we need to be specific on the types of the Gaussians positions.
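As an illustration of the first part of that cell (the value ranges and domain size below are made up, not the notebook’s own):

from collections import OrderedDict

tune_params = OrderedDict()
tune_params["block_size_x"] = [2, 4, 8, 16, 32]
tune_params["block_size_y"] = [2, 4, 8, 16, 32]
tune_params["block_size_z"] = [2, 4, 8, 16, 32]

# the fixed domain dimensions are inserted into the code the same way,
# by giving each of them a single value
tune_params["nx"] = [128]
tune_params["ny"] = [128]
tune_params["nz"] = [128]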

                    @@ -366,7 +368,7 @@

                    Tune the kernel -

                    Using the optimized parameters

                    +

                    Using the optimized parameters

Now that we have determined which parameters are best suited for our application, we can specify them in our kernel and run it. In our case, the optimal thread block size determined by the tuner was block_size_x = 4, block_size_y = 2, block_size_z = 16. We therefore use these parameters here to define the block size. The grid size is simply obtained by dividing the dimension of the problem by the corresponding block size.

                    [6]:
diff --git a/latest/hostcode.html b/latest/hostcode.html
index 44ed78b16..7d2bec503 100644
--- a/latest/hostcode.html
+++ b/latest/hostcode.html
@@ -1,19 +1,21 @@
Tuning Host Code — Kernel Tuner 1.0 documentation
                    @@ -106,7 +108,7 @@
                                
                    -

                    Tuning Host Code

                    +

                    Tuning Host Code

With the Kernel Tuner it is also possible to tune the host code of your GPU programs, or even just any C function for that matter. Tuning host code can be useful when it contains parameters that have impact on the performance of the kernel on the GPU, such as the number of streams to use when executing a kernel across multiple streams. Another example is when you want to include the data transfers between
@@ -132,7 +134,7 @@

Tuning Host Code
C vector add example we are using the omp_get_wtime() function from OpenMP to measure time on the CPU.

                    -

                    Tuning the number of streams

                    +

                    Tuning the number of streams

                    The following describes the example in examples/cuda/convolution_streams.py. In this example, the same convolution kernel is used as with correctness checking and convolution application example.

What is different is that we also supply the host code, which you can find in examples/cuda/convolution_streams.cu. It is a bit
diff --git a/latest/index.html b/latest/index.html
index a325f19de..0d428b2d6 100644
--- a/latest/index.html
+++ b/latest/index.html
@@ -1,19 +1,21 @@
The Kernel Tuner documentation — Kernel Tuner 1.0 documentation
@@ -108,7 +110,7 @@

                    -

                    The Kernel Tuner documentation

                    +

                    The Kernel Tuner documentation

                    Kernel Tuner is a software development tool for the creation of highly-optimized and tuned GPU applications.

                    The Kernel Tuner documentation pages are mostly about Kernel Tuner itself, but there are a number of related repositories that are considered part of the Kernel Tuner family:

                    @@ -120,7 +122,7 @@

                    The Kernel Tuner documentation -

                    Quick install

                    +

                    Quick install

                    The easiest way to install the Kernel Tuner is using pip:

                    To tune CUDA kernels:

                    -

                    Example usage

                    +

                    Example usage

                    The following shows a simple example for tuning a CUDA kernel:

                    kernel_string = """
                     __global__ void vector_add(float *c, float *a, float *b, int n) {
                    @@ -172,7 +174,7 @@ 

                    Example usage -

                    Citation

                    +

                    Citation

                    If you use Kernel Tuner in research or research software, please cite the most relevant among the following publications:

                    The first paper on Kernel Tuner, please note that the capabilities of Kernel Tuner have significantly expanded since the first publication:

                    @article{kerneltuner,
diff --git a/latest/install.html b/latest/install.html
index 90e904efd..7e4aabd58 100644
--- a/latest/install.html
+++ b/latest/install.html
@@ -1,19 +1,21 @@
Installation — Kernel Tuner 1.0 documentation
                    @@ -115,23 +117,23 @@
                       
                    -

                    Installation

                    +

                    Installation

                    The Kernel Tuner requires several packages to be installed. First of all, you need a working Python version, several Python packages, and optionally CUDA and/or OpenCL installations. All of this is explained in detail in this guide.

                    For comprehensive step-by-step instructions on setting up a development environment, see Development Environment.

                    -

                    Python

                    +

                    Python

                    You need a Python installation. We recommend using Python 3 and installing it with Miniconda. Linux users could type the following to download and install Python 3 using Miniconda:

                    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
                     bash Miniconda3-latest-Linux-x86_64.sh
                     
                    -

                    You are of course also free to use your own Python installation, and the Kernel Tuner is developed to be fully compatible with Python 3.8 and newer.

                    +

                    You are of course also free to use your own Python installation, and the Kernel Tuner is developed to be fully compatible with Python 3.9 and newer.

                    -

                    Installing Python Packages

                    +

                    Installing Python Packages

                    Note that when you are using a native Python installation, the pip command used to install Kernel Tuner and its dependencies requires sudo rights for system wide installation.

                    @@ -147,7 +149,7 @@

                    Installing Python Packages -

                    CUDA and PyCUDA

                    +

                    CUDA and PyCUDA

                    Installing CUDA and PyCUDA is optional, because you may want to only use Kernel Tuner for tuning OpenCL or C kernels.

If you want to use the Kernel Tuner to tune
@@ -173,12 +175,12 @@

Installing Python Packages
https://wiki.tiker.net/PyCuda/Installation)

                    -

                    Other CUDA Backends

                    +

                    Other CUDA Backends

Kernel Tuner can also be used with CuPy (https://cupy.dev/) or Nvidia’s CUDA Python bindings (https://nvidia.github.io/cuda-python/). Please see the installation instructions of those projects for how to install the required Python packages.

                    Please refer to the documentation on backends on how to use and select these backends.

                    -

                    OpenCL and PyOpenCL

                    +

                    OpenCL and PyOpenCL

Before we can install PyOpenCL you’ll need an OpenCL compiler. There are several OpenCL compilers available depending on the OpenCL platform you want your code to run on.

                    @@ -203,7 +205,7 @@

OpenCL and PyOpenCL
https://wiki.tiker.net/PyOpenCL/Installation)

                    -

                    HIP and PyHIP

                    +

                    HIP and PyHIP

                    Before we can install PyHIP, you’ll need to have the HIP runtime and compiler installed on your system. The HIP compiler is included as part of the ROCm software stack. Here is AMD’s installation guide:

                      @@ -223,7 +225,7 @@

                      HIP and PyHIP -

                      Installing the git version

                      +

                      Installing the git version

                      You can also install from the git repository. This way you also get the examples. Please note that this will install all required dependencies in the current environment. For step-by-step instructions on setting up a development environment, see Development Environment.

                      @@ -253,7 +255,7 @@

                      Installing the git version -

                      Dependencies for the guides

                      +

                      Dependencies for the guides

Some additional Python packages are required to run the Jupyter notebook guides. These packages are commonly used and chances are that you already have these installed.

However, to install Kernel Tuner along with the dependencies to run the guides,
diff --git a/latest/matrix_multiplication.html b/latest/matrix_multiplication.html
index 58a3f275c..74cccfffe 100644
--- a/latest/matrix_multiplication.html
+++ b/latest/matrix_multiplication.html
@@ -1,20 +1,22 @@
Matrix multiplication — Kernel Tuner 1.0 documentation
@@ -112,7 +114,7 @@

                      -

                      Matrix multiplication

                      +

                      Matrix multiplication

                      This guide demonstrates how to use Kernel Tuner to test and tune kernels, using matrix multiplication as an example.

                      Matrix multiplication is one of the most well-known and widely-used linear algebra operations, and is frequently used to demonstrate the high-performance computing capabilities of GPUs. As such, matrix multiplication presents a familiar starting point for many GPU programmers.

                      @@ -120,7 +122,7 @@

                      Matrix multiplication

                      Make sure to execute all the code cells you come across in this tutorial by selecting them and pressing shift+enter.

                      -

                      Naive CUDA kernel

                      +

                      Naive CUDA kernel

                      We’ll start with a very simple kernel for performing a matrix multiplication in CUDA. The idea is that this kernel is executed with one thread per element in the output matrix. As such, each thread \((i,j)\) iterates over the entire row \(i\) in matrix \(A\), and column \(j\) in matrix \(B\).

To keep the code clean and simple, we’ll assume that we only work with square matrices. Execute the following cell to write our naive matrix multiplication kernel to a file named “matmul_naive.cu” by pressing shift+enter.

                      @@ -151,7 +153,7 @@

                      Naive CUDA kernel -

                      Tuning a naive kernel

                      +

                      Tuning a naive kernel

                      Now we will have a look at how to use Kernel Tuner to find the best performing combination of tunable parameters for our naive matrix multiplication kernel. We’ll go over the process of creating an auto-tuning script step-by-step.

                      Because the tuner will need to execute the kernel, we start with creating some input data.
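As a sketch of this step, the setup could look like the following. The matrix size, the kernel name matmul_kernel, the argument order, and the parameter values are illustrative assumptions, not necessarily the tutorial’s exact choices.

import numpy as np
import kernel_tuner

# Square matrices: problem_size gives the x and y dimensions of the output.
problem_size = (512, 512)
A = np.random.randn(*problem_size).astype(np.float32)
B = np.random.randn(*problem_size).astype(np.float32)
C = np.zeros_like(A)
# The argument list must match the kernel's parameter order.
args = [C, A, B]

# Tune the thread block dimensions of the naive kernel written above.
tune_params = {"block_size_x": [16, 32, 64], "block_size_y": [8, 16, 32]}

results, env = kernel_tuner.tune_kernel(
    "matmul_kernel", "matmul_naive.cu", problem_size, args, tune_params)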

                      -

                      Using shared memory

                      +

                      Using shared memory

                      We can increase the utilization of memory bandwidth with a technique called cache-blocking or loop-tiling. To this end, we define two square data structures in shared memory, which will be used for storing square parts of matrix \(A\) and \(B\). The threads in a thread block will collaboratively fill these two submatrices, and then proceed to perform all the computations that need this data, before moving to the next blocked iteration.

                      The code required to do this is a little bit more complex:

                      @@ -289,7 +291,7 @@

                      Using shared memory -

                      Increase work per thread

                      +

                      Increase work per thread

                      A commonly used code optimization in GPU programming is to increase the amount of work performed by each thread. This optimization has several benefits. It increases data reuse within the thread block and reduces the number of redundant instructions executed by distinct threads. This code optimization is typically called 1xN Tiling or thread-block-merge. We will use two different forms of 1xN tiling in this example:

                      First of all, in the x-direction we will use tiling in a way that is similar to the convolution example (used as part of the ‘Getting Started’ tutorial). The area of output data that is processed by a single thread block is increased by a factor of N, and as such shared memory usage also increases by a factor \(N\). This means that the number of thread blocks needed to execute the kernel for this problem size is also reduced by a factor of \(N\). While this may reduce occupancy due to increased shared memory and register usage, this optimization drastically reduces the number of redundant instructions that were previously distributed across multiple thread blocks.
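Continuing the sketch above, tiling factors are typically added as extra tunable parameters, and grid divisor lists tell Kernel Tuner to shrink the grid accordingly. The file name and value lists below are assumptions for illustration only.

# Each thread block now produces a tile_size_x by tile_size_y larger part of C,
# so the grid is divided by block size times tile size in each dimension.
tune_params["tile_size_x"] = [1, 2, 4]
tune_params["tile_size_y"] = [1, 2, 4]

results, env = kernel_tuner.tune_kernel(
    "matmul_kernel", "matmul_tiled.cu", problem_size, args, tune_params,
    grid_div_x=["block_size_x", "tile_size_x"],
    grid_div_y=["block_size_y", "tile_size_y"])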

diff --git a/latest/metrics.html b/latest/metrics.html
index 7ac260443..2dd3335df 100644
--- a/latest/metrics.html
+++ b/latest/metrics.html
@@ -1,19 +1,21 @@
Metrics and Objectives — Kernel Tuner 1.0 documentation
@@ -107,10 +109,10 @@
                      -

                      Metrics and Objectives

                      +

                      Metrics and Objectives

                      Metrics and custom tuning objectives are two related features that are explained on this page.

                      -

                      Metrics

                      +

                      Metrics

User-defined metrics serve as an easy way for the user to define their own derived results based on the measurements reported by Kernel Tuner, and possibly any additional observers. This allows the user, for example, to implement performance metrics, such as performance in floating point operations per second (e.g. GFLOP/s), or other metrics that might be more specific to the
@@ -131,7 +133,7 @@
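As a sketch of how such a metric is defined and passed via the metrics= argument of tune_kernel(): the kernel, file name, and flop count below are illustrative assumptions, and the objective arguments at the end are explained under Tuning Objectives further down this page.

from collections import OrderedDict
import numpy as np
import kernel_tuner

size = 10000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

metrics = OrderedDict()
# p is a dict holding the tunable parameters and measured results of one
# configuration; "time" is the benchmarked execution time in milliseconds.
metrics["GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1e3)

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [128, 256, 512]}, metrics=metrics,
    objective="GFLOP/s", objective_higher_is_better=True)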

                      Metrics and Objectives

                      -

                      Tuning Objectives

                      +

                      Tuning Objectives

Users can specify tuning objectives other than the default optimization objective, which is kernel execution time. When using an optimization strategy other than exhaustive search (brute force), this objective is used to guide the optimization through the parameter space. The tuning objective is specified using the objective= optional parameter of tune_kernel() and

diff --git a/latest/observers.html b/latest/observers.html
index 6f60bacb7..1ed7175a2 100644
--- a/latest/observers.html
+++ b/latest/observers.html
@@ -1,19 +1,21 @@
Observers — Kernel Tuner 1.0 documentation
@@ -128,7 +130,7 @@

                      -

                      Observers

                      +

                      Observers

To facilitate measurements of quantities other than kernel execution time, and to make it easy for the user to control exactly what is being measured by Kernel Tuner, we have introduced the Observers feature. In the layered software architecture of Kernel Tuner, observers act as programmable hooks to allow the
@@ -144,35 +146,35 @@
function, the state of GPU memory, or any other information in the GPU runtime.

                      -class kernel_tuner.observers.BenchmarkObserver
                      +class kernel_tuner.observers.BenchmarkObserver

                      Base class for Benchmark Observers

                      -after_finish()
                      +after_finish()

                      after finish is called once every iteration after the kernel has finished execution

                      -after_start()
                      +after_start()

                      after start is called every iteration directly after the kernel was launched

                      -before_start()
                      +before_start()

                      before start is called every iteration before the kernel starts

                      -during()
                      +during()

                      during is called as often as possible while the kernel is running

                      -abstract get_results()
                      +abstract get_results()

                      get_results should return a dict with results that adds to the benchmarking data

                      get_results is called only once per benchmarking of a single kernel configuration and generally returns averaged values over multiple iterations.

                      @@ -180,14 +182,14 @@
                      -register_configuration(params)
                      +register_configuration(params)

                      Called once before benchmarking of a single kernel configuration. The params argument is a dict that stores the configuration parameters.

                      -register_device(dev)
                      +register_device(dev)

                      Sets self.dev, for inspection by the observer at various points during benchmarking
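To make the interface above concrete, here is a minimal sketch of a custom observer; what it measures, host-side wall-clock time per iteration, is just an illustrative choice and not an observer shipped with Kernel Tuner.

import time
from kernel_tuner.observers import BenchmarkObserver

class WallClockObserver(BenchmarkObserver):
    """Illustrative observer recording host-side wall-clock time per iteration."""

    def __init__(self):
        self.times = []

    def before_start(self):
        # called every iteration before the kernel starts
        self.start = time.perf_counter()

    def after_finish(self):
        # called every iteration after the kernel has finished execution
        self.times.append(time.perf_counter() - self.start)

    def get_results(self):
        # called once per kernel configuration; return averaged values
        result = {"host_time": sum(self.times) / len(self.times)}
        self.times = []
        return result

# used by passing an instance to tune_kernel, e.g. observers=[WallClockObserver()]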

                      @@ -207,7 +209,7 @@

                    -

                    PowerSensorObserver

                    +

                    PowerSensorObserver

PowerSensor2 is a custom-built power measurement device for PCIe devices that intercepts the device power with current sensors and transmits the data to the host over a USB connection. The main advantage of using PowerSensor2 over the GPU’s built-in power sensor is that PowerSensor2 reports
@@ -221,7 +223,7 @@

                    PowerSensorObserver
                    -class kernel_tuner.observers.powersensor.PowerSensorObserver(observables=None, device=None)
                    +class kernel_tuner.observers.powersensor.PowerSensorObserver(observables=None, device=None)

Observer that uses an external PowerSensor2 device to accurately measure power

                    Requires PowerSensor2 hardware and powersensor Python bindings.

                    @@ -238,7 +240,7 @@

                    PowerSensorObserver -

                    NVMLObserver

                    +

                    NVMLObserver

Kernel Tuner also implements an NVMLObserver, which allows the user to observe the power usage, energy consumption, core and memory frequencies, core voltage and temperature for all kernel configurations during benchmarking as reported by the NVIDIA Management Library (NVML). To facilitate the interaction with
@@ -254,7 +256,7 @@

                    NVMLObserver
                    -class kernel_tuner.observers.nvml.NVMLObserver(observables, device=0, save_all=False, nvidia_smi_fallback=None, use_locked_clocks=False, continous_duration=1)
                    +class kernel_tuner.observers.nvml.NVMLObserver(observables, device=0, save_all=False, nvidia_smi_fallback=None, use_locked_clocks=False, continous_duration=1)

                    Observer that uses NVML to monitor power, energy, clock frequencies, voltages and temperature.

                    The NVMLObserver can also be used to tune application-specific clock frequencies or power limits in combination with other parameters.
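A sketch of typical usage follows; the observable names are assumptions based on common examples, and the full list of supported observables is given in the class documentation.

import numpy as np
import kernel_tuner
from kernel_tuner.observers.nvml import NVMLObserver

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# Observe energy and temperature on GPU 0; observable names are assumptions.
nvmlobserver = NVMLObserver(["nvml_energy", "temperature"], device=0)

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [128, 256, 512]}, observers=[nvmlobserver])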

                    @@ -281,7 +283,7 @@

                    NVMLObserver -

                    Tuning execution parameters with NVML

                    +

                    Tuning execution parameters with NVML

When you are using the NVMLObserver, Kernel Tuner can use its interface to NVML to enable tuning of execution parameters, such as power limits or memory and core clock frequencies. Using application-specific clock frequencies is one of the most common approaches to tuning energy efficiency on
@@ -302,13 +304,13 @@
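Continuing the observer sketch above, this is roughly what tuning an execution parameter could look like. The special parameter names nvml_gr_clock and the clock values shown are assumptions about the NVML-backed tunables this page refers to, and the values must match frequencies actually supported by the device.

# Tune an application-specific core clock alongside a regular parameter.
# The parameter name nvml_gr_clock and the clock values (in MHz) are
# illustrative assumptions; check the NVML documentation and your device.
tune_params = {
    "block_size_x": [128, 256, 512],
    "nvml_gr_clock": [1110, 1350, 1590],
}

results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    tune_params, observers=[nvmlobserver])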

                    Tuning execution parameters with NVML -

                    PMTObserver

                    +

                    PMTObserver

The PMTObserver can be used to measure power and energy on various platforms including Nvidia Jetson, Nvidia NVML, the RAPL interface, AMD ROCm, and Xilinx. It requires PMT to be installed, as well as PMT’s Python interface. More information about PMT can be found here: https://git.astron.nl/RD/pmt/

                    -class kernel_tuner.observers.pmt.PMTObserver(observable=None)
                    +class kernel_tuner.observers.pmt.PMTObserver(observable=None)

                    Observer that uses the PMT library to measure power

                    Parameters:
                    diff --git a/latest/optimization.html b/latest/optimization.html index 448d0729e..2d45eb803 100644 --- a/latest/optimization.html +++ b/latest/optimization.html @@ -1,19 +1,21 @@ - + - + Optimization strategies — Kernel Tuner 1.0 documentation - - + + + + - - - + + + @@ -182,7 +184,7 @@
                    -

                    Optimization strategies

                    +

                    Optimization strategies

                    Kernel Tuner supports many optimization strategies that accelerate the auto-tuning search process. By default, Kernel Tuner uses ‘brute force’ tuning, which means that Kernel Tuner will try all possible combinations of all values of all tunable parameters. Even with simple kernels this form of tuning can become prohibitively slow and a waste of time and energy.

                    @@ -225,11 +227,11 @@

                    Below all the strategies are listed with their strategy-specific options that can be passed in a dictionary to the strategy_options= argument of tune_kernel().
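For example, a strategy and its options are selected as in the sketch below; the chosen strategy and option values are only an illustration.

import numpy as np
import kernel_tuner

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# Sketch: use a genetic algorithm instead of brute force and stop after at
# most 100 evaluated configurations via the max_fevals option.
results, env = kernel_tuner.tune_kernel(
    "vector_add", "vector_add_kernel.cu", size, [c, a, b, n],
    {"block_size_x": [32, 64, 128, 256, 512, 1024]},
    strategy="genetic_algorithm", strategy_options={"max_fevals": 100})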

                    -

                    kernel_tuner.strategies.basinhopping

                    +

                    kernel_tuner.strategies.basinhopping

                    The strategy that uses the basinhopping global optimization method.

                    -kernel_tuner.strategies.basinhopping.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.basinhopping.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This basin hopping strategy supports the following strategy_options:

                    @@ -259,29 +261,29 @@
                    -

                    kernel_tuner.strategies.bayes_opt

                    +

                    kernel_tuner.strategies.bayes_opt

                    Bayesian Optimization implementation from the thesis by Willemsen.

                    -kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts(tune_params: dict, eps: float) Tuple[dict, dict]
                    +kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts(tune_params: dict, eps: float) Tuple[dict, dict]

                    Generates normalization and denormalization dictionaries.

                    -kernel_tuner.strategies.bayes_opt.normalize_parameter_space(param_space: list, tune_params: dict, normalized: dict) list
                    +kernel_tuner.strategies.bayes_opt.normalize_parameter_space(param_space: list, tune_params: dict, normalized: dict) list

                    Normalize the parameter space given a normalization dictionary.

                    -kernel_tuner.strategies.bayes_opt.prune_parameter_space(parameter_space, tuning_options, tune_params, normalize_dict)
                    +kernel_tuner.strategies.bayes_opt.prune_parameter_space(parameter_space, tuning_options, tune_params, normalize_dict)

                    Pruning of the parameter space to remove dimensions that have a constant parameter.

                    -kernel_tuner.strategies.bayes_opt.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.bayes_opt.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space.

                    Params runner:
                    @@ -303,11 +305,11 @@
                    -

                    kernel_tuner.strategies.brute_force

                    +

                    kernel_tuner.strategies.brute_force

                    The default strategy that iterates through the whole parameter space

                    -kernel_tuner.strategies.brute_force.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.brute_force.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Brute Force strategy supports the following strategy_options:

                    @@ -331,11 +333,11 @@
                    -

                    kernel_tuner.strategies.diff_evo

                    +

                    kernel_tuner.strategies.diff_evo

                    The differential evolution strategy that optimizes the search through the parameter space.

                    -kernel_tuner.strategies.diff_evo.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.diff_evo.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Differential Evolution strategy supports the following strategy_options:

                    @@ -366,11 +368,11 @@
                    -

                    kernel_tuner.strategies.dual_annealing

                    +

                    kernel_tuner.strategies.dual_annealing

                    The strategy that uses the dual annealing optimization method.

                    -kernel_tuner.strategies.dual_annealing.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.dual_annealing.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Dual Annealing strategy supports the following strategy_options:

                    @@ -399,27 +401,27 @@
                    -

                    kernel_tuner.strategies.firefly_algorithm

                    +

                    kernel_tuner.strategies.firefly_algorithm

                    The strategy that uses the firefly algorithm for optimization.

                    -class kernel_tuner.strategies.firefly_algorithm.Firefly(bounds)
                    +class kernel_tuner.strategies.firefly_algorithm.Firefly(bounds)

                    Firefly object for use in the Firefly Algorithm.

                    -compute_intensity(fun)
                    +compute_intensity(fun)

                    Evaluate cost function and compute intensity at this position.

                    -distance_to(other)
                    +distance_to(other)

Return Euclidean distance between self and other Firefly.

                    -move_towards(other, beta, alpha)
                    +move_towards(other, beta, alpha)

                    Move firefly towards another given beta and alpha values.

                    @@ -427,7 +429,7 @@
                    -kernel_tuner.strategies.firefly_algorithm.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.firefly_algorithm.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This firefly algorithm strategy supports the following strategy_options:

                    @@ -460,11 +462,11 @@
                    -

                    kernel_tuner.strategies.genetic_algorithm

                    +

                    kernel_tuner.strategies.genetic_algorithm

                    A simple genetic algorithm for parameter search.

                    -kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover(dna1, dna2)

                    Disruptive uniform crossover.

uniformly crossover genes between dna1 and dna2, with children guaranteed to be different from parents,
@@ -473,19 +475,19 @@

                    -kernel_tuner.strategies.genetic_algorithm.mutate(dna, mutation_chance, searchspace: Searchspace, cache=True)
                    +kernel_tuner.strategies.genetic_algorithm.mutate(dna, mutation_chance, searchspace: Searchspace, cache=True)

                    Mutate DNA with 1/mutation_chance chance.

                    -kernel_tuner.strategies.genetic_algorithm.single_point_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.single_point_crossover(dna1, dna2)

                    Crossover dna1 and dna2 at a random index.

                    -kernel_tuner.strategies.genetic_algorithm.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.genetic_algorithm.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Genetic Algorithm strategy supports the following strategy_options:

                    @@ -517,29 +519,29 @@
                    -kernel_tuner.strategies.genetic_algorithm.two_point_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.two_point_crossover(dna1, dna2)

                    Crossover dna1 and dna2 at 2 random indices.

                    -kernel_tuner.strategies.genetic_algorithm.uniform_crossover(dna1, dna2)
                    +kernel_tuner.strategies.genetic_algorithm.uniform_crossover(dna1, dna2)

                    Randomly crossover genes between dna1 and dna2.

                    -kernel_tuner.strategies.genetic_algorithm.weighted_choice(population, n)
                    +kernel_tuner.strategies.genetic_algorithm.weighted_choice(population, n)

Randomly select n unique individuals from a weighted population; fitness determines the probability of being selected.

                    -

                    kernel_tuner.strategies.greedy_ils

                    +

                    kernel_tuner.strategies.greedy_ils

                    A simple greedy iterative local search algorithm for parameter search.

                    -kernel_tuner.strategies.greedy_ils.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.greedy_ils.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Greedy Iterative Local Search (ILS) strategy supports the following strategy_options:

                    @@ -571,11 +573,11 @@
                    -

                    kernel_tuner.strategies.greedy_mls

                    +

                    kernel_tuner.strategies.greedy_mls

                    A greedy multi-start local search algorithm for parameter search.

                    -kernel_tuner.strategies.greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Greedy Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -607,11 +609,11 @@
                    -

                    kernel_tuner.strategies.minimize

                    +

                    kernel_tuner.strategies.minimize

                    The strategy that uses a minimizer method for searching through the parameter space.

                    -kernel_tuner.strategies.minimize.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.minimize.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Minimize strategy supports the following strategy_options:

                    @@ -640,11 +642,11 @@
                    -

                    kernel_tuner.strategies.mls

                    +

                    kernel_tuner.strategies.mls

                    The strategy that uses multi-start local search.

                    -kernel_tuner.strategies.mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -676,11 +678,11 @@
                    -

                    kernel_tuner.strategies.ordered_greedy_mls

                    +

                    kernel_tuner.strategies.ordered_greedy_mls

                    A greedy multi-start local search algorithm for parameter search that traverses variables in order.

                    -kernel_tuner.strategies.ordered_greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.ordered_greedy_mls.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Ordered Greedy Multi-start Local Search (MLS) strategy supports the following strategy_options:

                    @@ -712,11 +714,11 @@
                    -

                    kernel_tuner.strategies.pso

                    +

                    kernel_tuner.strategies.pso

                    The strategy that uses particle swarm optimization.

                    -kernel_tuner.strategies.pso.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.pso.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Particle Swarm Optimization (PSO) strategy supports the following strategy_options:

                    @@ -749,11 +751,11 @@
                    -

                    kernel_tuner.strategies.random_sample

                    +

                    kernel_tuner.strategies.random_sample

                    Iterate over a random sample of the parameter space.

                    -kernel_tuner.strategies.random_sample.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.random_sample.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Random Sampling strategy supports the following strategy_options:

                    @@ -782,23 +784,23 @@
                    -

                    kernel_tuner.strategies.simulated_annealing

                    +

                    kernel_tuner.strategies.simulated_annealing

The strategy that uses simulated annealing.

                    -kernel_tuner.strategies.simulated_annealing.acceptance_prob(old_cost, new_cost, T, tuning_options)
                    +kernel_tuner.strategies.simulated_annealing.acceptance_prob(old_cost, new_cost, T, tuning_options)

                    Annealing equation, with modifications to work towards a lower value.

                    -kernel_tuner.strategies.simulated_annealing.neighbor(pos, searchspace: Searchspace)
                    +kernel_tuner.strategies.simulated_annealing.neighbor(pos, searchspace: Searchspace)

                    Return a random neighbor of pos.

                    -kernel_tuner.strategies.simulated_annealing.tune(searchspace: Searchspace, runner, tuning_options)
                    +kernel_tuner.strategies.simulated_annealing.tune(searchspace: Searchspace, runner, tuning_options)

                    Find the best performing kernel configuration in the parameter space

                    This Simulated Annealing strategy supports the following strategy_options:

diff --git a/latest/py-modindex.html b/latest/py-modindex.html
index 8ce00ed8f..64c70e244 100644
--- a/latest/py-modindex.html
+++ b/latest/py-modindex.html
@@ -1,18 +1,20 @@
Python Module Index — Kernel Tuner 1.0 documentation

diff --git a/latest/quickstart.html b/latest/quickstart.html
index b042cac82..69501d8a3 100644
--- a/latest/quickstart.html
+++ b/latest/quickstart.html
@@ -1,19 +1,21 @@
Getting Started — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@
                    -

                    Getting Started

                    +

                    Getting Started

                    So you have installed Kernel Tuner! That’s great! But now you’d like to get started tuning some GPU code.

                    Let’s say we have a simple CUDA kernel stored in a file called vector_add_kernel.cu:

                    __global__ void vector_add(float * c, float * a, float * b, int n) {
                    diff --git a/latest/search.html b/latest/search.html
                    index d3cb971fc..8b964308a 100644
                    --- a/latest/search.html
                    +++ b/latest/search.html
                    @@ -1,19 +1,21 @@
                     
Search — Kernel Tuner 1.0 documentation
                    diff --git a/latest/searchindex.js b/latest/searchindex.js
                    index 793ac2038..78e74701f 100644
                    --- a/latest/searchindex.js
                    +++ b/latest/searchindex.js
                    @@ -1 +1 @@
"known": [7, 16], "m": [7, 8, 9, 10, 12], "disabl": 7, "sync": [7, 21], "dry": 7, "node": [7, 19], "noxset": 7, "toml": 7, "venvbackend": 7, "anaconda": 7, "venv": 7, "Be": [7, 8, 9, 10], "envdir": 7, "diskquota": 7, "isol": [7, 22], "coverag": 7, "gigabyt": 7, "tight": 7, "diskspac": 7, "remov": [7, 19], "ran": 7, "command": [7, 15], "involv": 7, "especi": 7, "break": [7, 22], "hold": [7, 8, 9, 16, 20, 21, 23], "invok": 7, "tab": 7, "studio": 7, "integr": [7, 22], "commit": 7, "brows": 7, "pandoc": 7, "mac": 7, "onlin": 7, "built": [7, 18, 19, 21, 23], "action": 7, "master": 7, "latest": [7, 15], "stabl": 7, "publish": [7, 14], "autom": 7, "whole": [8, 9, 10, 16, 19], "model": [8, 9, 10, 14], "physic": 8, "numer": [8, 9, 10], "introduc": [8, 9, 10, 16, 18], "redistribut": [8, 9, 10], "region": [8, 9, 10], "concentr": [8, 9, 10], "bulk": [8, 9, 10], "motion": [8, 9, 10], "concept": [8, 9, 10], "wide": [8, 9, 10, 15, 16], "chemistri": [8, 9, 10], "biologi": [8, 9, 10], "suppos": [8, 9, 10], "metal": [8, 9, 10], "sheet": [8, 9, 10], "temperatur": [8, 9, 10, 18, 19, 24], "equal": [8, 9, 10, 16, 23], "degre": [8, 9, 10], "everywher": [8, 9, 10], "heat": [8, 9, 10], "thousand": [8, 9, 10], "instant": [8, 9, 10, 12], "hotspot": [8, 9, 10], "cooler": [8, 9, 10], "area": [8, 9, 10, 16], "melt": [8, 9, 10], "loss": [8, 9, 10], "radiat": [8, 9, 10], "frac": [8, 9, 10], "d": [8, 9, 10, 12, 19, 20], "spatial": [8, 9, 10], "descret": [8, 9, 10], "2d": [8, 9, 10, 11], "quantiti": [8, 9, 10, 17, 18, 23], "nx": [8, 9, 10, 12], "equi": [8, 9, 10], "distant": [8, 9, 10], "direct": [8, 9, 10, 13, 16, 17, 23], "ny": [8, 9, 10, 12], "distanc": [8, 9, 10, 19], "delta": [8, 9, 10], "central": [8, 9, 10], "approxim": [8, 9, 10], "x_i": [8, 9, 10, 12], "x_": [8, 9, 10], "approx": [8, 9, 10], "u_": [8, 9, 10], "2u_": [8, 9, 10], "y_": [8, 9, 10], "estim": [8, 9, 10], "next": [8, 9, 10, 16, 21], "formula": [8, 9, 10], "4u_": [8, 9, 10], "simplic": [8, 9, 10, 12], "assumpt": [8, 9, 10], "boundari": [8, 9, 10], "condit": [8, 9, 10, 16], "dt": [8, 9, 10], "225": [8, 9, 10], "initi": [8, 9, 10, 21], "hot": [8, 9, 10], "plot": [8, 9, 10], "color": [8, 9, 10], "matplotlib": [8, 9, 10, 15], "pyplot": [8, 9, 10], "inlin": [8, 9, 10], "get_initial_condit": [8, 9, 10], "ones": [8, 9, 10, 24], "randint": [8, 9, 10], "1000": [8, 9, 10, 12], "2000": [8, 9, 10], "fig": [8, 9, 10], "ax1": [8, 9, 10], "ax2": [8, 9, 10], "subplot": [8, 9, 10], "imshow": [8, 9, 10], "lt": [8, 9, 10], "axesimag": [8, 9, 10], "0x2aaab952f240": 8, "gt": [8, 9, 10], "later": [8, 9, 10, 12, 23], "field_copi": [8, 9], "4164": 8, "018869400024": 8, "0x2aab1c98b3c8": 8, "worri": [8, 10], "terminologi": [8, 10], "text": [8, 10, 16], "5": [8, 9, 10, 12, 19], "225f": [8, 9, 10], "diffuse_kernel": [8, 9, 10], "u_new": [8, 9, 10], "0f": [8, 9, 10], "togeth": [8, 9, 10, 15, 23], "impact": [8, 9, 10, 13], "fix": [8, 9, 10, 19, 23], "unrol": [8, 9, 10, 11, 16, 24], "loop": [8, 9, 10, 11, 16, 24], "drv": 8, "sourcemodul": [8, 10, 12], "init": 8, "make_context": 8, "devprop": 8, "k": [8, 9, 10, 12, 14, 16, 20], "get_devic": 8, "get_attribut": 8, "cc": 8, "compute_capability_major": 8, "compute_capability_minor": 8, "u_old": [8, 10], "mem_alloc": 8, "nbyte": 8, "block_size_str": [8, 10], "arch": 8, "sm_": 8, "get_funct": [8, 10, 12], "boilerpl": [8, 9, 10], "moment": [8, 9, 10, 23], "serv": [8, 9, 10, 17, 19], "guess": [8, 9, 10], "pair": [8, 9, 10], "500": [8, 9, 10], "time_sinc": 8, "zeros_lik": [8, 12, 14, 16, 20, 22], "set_titl": [8, 9, 10], "53": [8, 9, 10], 
"423038482666016": 8, "0x2aaabbdcb2e8": 8, "faster": [8, 9, 10, 16], "cleanup": 8, "pop": 8, "think": [8, 9, 10], "messi": [8, 9, 10], "got": [8, 9, 10], "cleaner": [8, 9, 10], "plai": [8, 9, 10], "difficult": [8, 9, 10, 21, 22], "rather": [8, 9, 10, 23], "underutil": [8, 9, 10], "purpos": [8, 9, 10, 13, 16, 23, 24], "feel": [8, 9, 10], "48": [8, 9, 10], "care": [8, 9, 10], "appropi": [8, 9, 10], "fly": [8, 9, 10], "12": [8, 9, 10], "13": [8, 9, 10], "geforc": [8, 9, 10, 12], "gtx": [8, 9, 10, 12], "titan": [8, 9, 10], "22305920124": 8, "779033613205": 8, "824838399887": 8, "900499212742": 8, "999763202667": 8, "727967989445": 8, "752479994297": 8, "797900807858": 8, "876627194881": 8, "93347837925": 8, "766662418842": 8, "803033602238": 8, "853574407101": 8, "971545600891": 8, "763775992393": 8, "791257584095": 8, "848044800758": 8, "922745585442": 8, "792595207691": 8, "822137594223": 8, "893279993534": 8, "millisecond": [8, 9, 10], "matter": [8, 9, 10, 13], "analyz": [8, 9, 10], "seem": [8, 9, 10], "vari": [8, 9, 10, 12, 16, 17], "addtion": [8, 9, 10], "among": [8, 9, 10, 14, 19], "128x32": [8, 9, 10], "likewis": [8, 9, 10], "becom": [8, 9, 10, 18, 19], "affect": [8, 9, 10, 16], "within": [8, 9, 10, 12, 16, 19, 23], "exchang": [8, 9, 10], "fact": [8, 9, 10, 13], "commun": [8, 9, 10], "idea": [8, 9, 10, 13, 16, 24], "l2": [8, 9, 10], "closer": [8, 9, 10], "multiprocessor": [8, 9, 10], "l1": [8, 9, 10], "fine": [8, 9, 10], "grain": [8, 9, 10], "manag": [8, 9, 10, 16, 18], "cost": [8, 9, 10, 19], "overhead": [8, 9, 10, 16], "degrad": [8, 9, 10], "intermedi": [8, 9, 10], "mind": [8, 9, 10], "14": [8, 9, 10], "tx": [8, 9, 10, 16], "ty": [8, 9, 10, 16], "bx": [8, 9, 10, 12], "__shared__": [8, 10, 16], "sh_u": [8, 9, 10], "pragma": [8, 9, 10, 16], "__syncthread": [8, 9, 10, 16], "75041918755": 8, "18713598251": 8, "09015038013": 8, "06844799519": 8, "09730558395": 8, "14420480728": 8, "05957758427": 8, "07508480549": 8, "0731967926": 8, "14729599953": 8, "08389122486": 8, "10700161457": 8, "10125439167": 8, "31661438942": 8, "0629119873": 8, "04807043076": 8, "054880023": 8, "12033278942": 8, "06672639847": 8, "05816960335": 8, "12000002861": 8, "merg": [8, 9, 10, 16], "half": [8, 9, 10], "doubl": [8, 9, 10, 21, 22], "cover": [8, 9, 10, 19], "beyond": [8, 9, 10, 23], "reduc": [8, 9, 10, 16], "condens": [8, 9, 10], "keep": [8, 9, 10, 16, 21], "importantli": [8, 9, 10], "worst": [8, 9, 10], "15": [8, 9, 10, 22], "tj": [8, 9, 10], "ti": [8, 9, 10, 12], "somehow": [8, 9, 10], "insid": [8, 9, 10, 13, 16, 22, 23], "round": [8, 9, 10, 23], "arithmet": [8, 9, 10, 23], "evalu": [8, 9, 10, 16, 19, 23], "759308815": 8, "29789438248": 8, "06983039379": 8, "2634239912": 8, "997139203548": 8, "843692803383": 8, "05549435616": 8, "862348806858": 8, "750636804104": 8, "19084160328": 8, "876377594471": 8, "714169609547": 8, "875001597404": 8, "691116797924": 8, "575859189034": 8, "759679996967": 8, "622867202759": 8, "650336003304": 8, "09794559479": 8, "826515209675": 8, "692665600777": 8, "78363519907": 8, "646092808247": 8, "554745602608": 8, "716115188599": 8, "581280004978": 8, "662566399574": 8, "07386879921": 8, "833420813084": 8, "705055999756": 8, "840755212307": 8, "652575993538": 8, "569388794899": 8, "689356791973": 8, "597267186642": 8, "675232005119": 8, "10033922195": 8, "860332798958": 8, "731891202927": 8, "867276787758": 8, "68781440258": 8, "595276796818": 8, "735436797142": 8, "60216319561": 8, "852166390419": 8, "15089921951": 8, "852575981617": 8, "705932807922": 8, "888671982288": 8, 
"673248004913": 8, "563417613506": 8, "761139214039": 8, "621254396439": 8, "676595199108": 8, "06709122658": 8, "804953610897": 8, "685670387745": 8, "801798415184": 8, "632006394863": 8, "542387211323": 8, "722668802738": 8, "578745603561": 8, "618598401546": 8, "08220798969": 8, "821881604195": 8, "687955200672": 8, "77759360075": 8, "618003201485": 8, "539891195297": 8, "705900788307": 8, "568556785583": 8, "624492788315": 8, "0799423933": 8, "832300806046": 8, "70140799284": 8, "835481595993": 8, "638348805904": 8, "550105595589": 8, "667251205444": 8, "576044797897": 8, "732409596443": 8, "15916161537": 8, "869497597218": 8, "733248019218": 8, "890803205967": 8, "677363204956": 8, "577215993404": 8, "730982398987": 8, "58035838604": 8, "10066559315": 8, "837804794312": 8, "691385602951": 8, "851040017605": 8, "666656005383": 8, "560505592823": 8, "771103990078": 8, "626163220406": 8, "694451200962": 8, "11514236927": 8, "837299215794": 8, "703302407265": 8, "806828796864": 8, "648620784283": 8, "562521612644": 8, "760915207863": 8, "605760002136": 8, "690009605885": 8, "10740480423": 8, "841631996632": 8, "700883197784": 8, "838195204735": 8, "649779188633": 8, "56585599184": 8, "7168192029": 8, "59088640213": 8, "69627519846": 8, "3269824028": 8, "02665598392": 8, "840908801556": 8, "03752319813": 8, "788345599174": 8, "662041604519": 8, "85437438488": 8, "680422389507": 8, "0759360075": 8, "801996803284": 8, "666003203392": 8, "808000004292": 8, "643359994888": 8, "544691193104": 8, "741964805126": 8, "60942081213": 8, "681350398064": 8, "05262081623": 8, "792108798027": 8, "66344319582": 8, "768064010143": 8, "625260794163": 8, "540352010727": 8, "721862399578": 8, "579411196709": 8, "626976013184": 8, "06332798004": 8, "808211183548": 8, "679372787476": 8, "803718411922": 8, "627136015892": 8, "538227200508": 8, "682188808918": 8, "573836791515": 8, "725548803806": 8, "13023357391": 8, "843411195278": 8, "713843202591": 8, "85886080265": 8, "657920002937": 8, "565254402161": 8, "697094392776": 8, "579904007912": 8, "07484800816": 8, "801119995117": 8, "667347204685": 8, "799059200287": 8, "643820810318": 8, "542937588692": 8, "740518403053": 8, "615148806572": 8, "731334400177": 8, "07002239227": 8, "805299210548": 8, "675923216343": 8, "782060790062": 8, "631142401695": 8, "540383994579": 8, "723999989033": 8, "578681600094": 8, "726335990429": 8, "13297917843": 8, "844428789616": 8, "710278391838": 8, "835494399071": 8, "637958395481": 8, "567417597771": 8, "699366402626": 8, "588492810726": 8, "tri": [8, 9, 10, 19], "grow": [8, 9, 10], "quickli": [8, 9, 10], "went": [8, 9, 10, 12], "72": [8, 9, 10], "26": [8, 9, 10], "32x2": [8, 9, 10], "64x4": [8, 9, 10], "four": [8, 9, 10], "best_tim": [8, 9], "min": [8, 9], "05": [8, 9], "join": [8, 9], "nice": [8, 9], "stdout": [8, 9], "why": [8, 9, 13, 17], "easili": [8, 9, 18], "easi": [8, 9, 17, 18, 23], "csv": [8, 9, 11], "analysi": [8, 9, 14], "panda": [8, 9, 11, 15], "18": [8, 9, 10], "fp": [8, 9], "datafram": [8, 9], "df": [8, 9], "to_csv": [8, 9], "0x2aab1de088d0": 9, "01": 9, "sy": 9, "140": 9, "wall": 9, "98": 9, "__kernel": 9, "get_group_id": 9, "get_local_id": 9, "cl": 9, "ctx": 9, "create_some_context": 9, "mf": 9, "mem_flag": 9, "a_h": 9, "a_d": 9, "read_writ": 9, "copy_host_ptr": 9, "hostbuf": 9, "b_d": 9, "kernel_src": 9, "prg": 9, "queue": 9, "commandqueu": 9, "run_gpu": 9, "444": 9, "154": 9, "598": 9, "985": 9, "enqueue_copi": 9, "1748096": 9, "7284544": 9, "7707904": 9, "8573184": 9, "8380288": 9, "686528": 9, "69648": 
9, "7461632": 9, "818304": 9, "771072": 9, "7190464": 9, "7522432": 9, "7982208": 9, "9624512": 9, "7214464": 9, "7453312": 9, "8028416": 9, "8922624": 9, "747328": 9, "7860736": 9, "8637184": 9, "__local": 9, "barrier": 9, "clk_local_mem_f": 9, "8449472": 9, "1912576": 9, "1035136": 9, "0927808": 9, "1140736": 9, "1790336": 9, "0808192": 9, "0809792": 9, "0836928": 9, "1545856": 9, "1249984": 9, "1264": 9, "1230336": 9, "4015104": 9, "0873216": 9, "0626496": 9, "0692224": 9, "140192": 9, "0801344": 9, "0688128": 9, "1428928": 9, "8844544": 9, "3245952": 9, "0911808": 9, "3039616": 9, "0079296": 9, "84848": 9, "0708288": 9, "857728": 9, "7561792": 9, "231072": 9, "8774336": 9, "7087296": 9, "8772672": 9, "6911872": 9, "5715968": 9, "7584896": 9, "6292032": 9, "6498688": 9, "1145664": 9, "8252928": 9, "6757568": 9, "7881152": 9, "6237696": 9, "544224": 9, "6951168": 9, "5648128": 9, "6452736": 9, "1065792": 9, "8313792": 9, "6905984": 9, "8302656": 9, "6367488": 9, "5478592": 9, "6660672": 9, "5719744": 9, "6551744": 9, "1384064": 9, "8531072": 9, "7078976": 9, "8516672": 9, "6677696": 9, "5685632": 9, "7074048": 9, "5753152": 9, "8228864": 9, "2124736": 9, "8633344": 9, "6921216": 9, "8896384": 9, "6659904": 9, "5582144": 9, "7522624": 9, "6081536": 9, "6664448": 9, "1095936": 9, "8063424": 9, "6717888": 9, "7982848": 9, "6263552": 9, "5289728": 9, "7008832": 9, "567456": 9, "5968704": 9, "1018432": 9, "8117248": 9, "6724736": 9, "7728576": 9, "6038336": 9, "5172352": 9, "6796352": 9, "5470016": 9, "5968448": 9, "1107712": 9, "8237248": 9, "6810944": 9, "821952": 9, "620352": 9, "5230208": 9, "6415552": 9, "5476864": 9, "7168192": 9, "1942016": 9, "8626304": 9, "7099712": 9, "9123328": 9, "6608448": 9, "5631168": 9, "7113024": 9, "556576": 9, "1583104": 9, "8384832": 9, "67856": 9, "845856": 9, "6581248": 9, "54944": 9, "7520064": 9, "6076224": 9, "6842112": 9, "1547072": 9, "8422016": 9, "6895552": 9, "8037312": 9, "6387072": 9, "5383296": 9, "7326656": 9, "5863488": 9, "6813376": 9, "1493952": 9, "8444928": 9, "6929216": 9, "832768": 9, "6389312": 9, "5412672": 9, "698336": 9, "5717568": 9, "676096": 9, "4303104": 9, "0341696": 9, "8365184": 9, "0398656": 9, "7786496": 9, "648928": 9, "8479232": 9, "6508544": 9, "1219392": 9, "7994048": 9, "6492288": 9, "8068416": 9, "6343168": 9, "5235328": 9, "7268928": 9, "5898432": 9, "6633536": 9, "0849664": 9, "7869632": 9, "6458624": 9, "7611968": 9, "613088": 9, "50912": 9, "6972928": 9, "5620608": 9, "601856": 9, "095232": 9, "7967488": 9, "6601472": 9, "7952896": 9, "6047296": 9, "5108224": 9, "6607744": 9, "5492416": 9, "7091136": 9, "171552": 9, "8473408": 9, "6962112": 9, "8663936": 9, "6466816": 9, "5475584": 9, "6754048": 9, "5591744": 9, "108896": 9, "7907264": 9, "6459328": 9, "7965888": 9, "6250816": 9, "5188416": 9, "721408": 9, "5920832": 9, "7068608": 9, "0909248": 9, "7930752": 9, "6524544": 9, "7745216": 9, "6146176": 9, "5116928": 9, "6975872": 9, "5548416": 9, "7075136": 9, "174624": 9, "8384512": 9, "69104": 9, "8335488": 9, "6264192": 9, "5445248": 9, "6719104": 9, "5592064": 9, "19": [9, 10], "solv": 10, "0x7f888f8cd7b8": 10, "4152": 10, "086019515991": 10, "0x7f8865b51f28": 10, "gpuarrai": [10, 12], "tool": [10, 12, 14], "autoinit": [10, 12], "to_gpu": [10, 12], "mod": [10, 12], "t0": [10, 12], "ona": 10, "33": 10, "46109390258789": 10, "0x7f8858b873c8": 10, "1080": [10, 12], "916985595226": 10, "489004802704": 10, "500524806976": 10, "513356792927": 10, "545715200901": 10, "486515200138": 10, "449055999517": 10, 
"44974719882": 10, "457427197695": 10, "492915201187": 10, "464863997698": 10, "466118401289": 10, "475264000893": 10, "513632011414": 10, "458412796259": 10, "457715201378": 10, "461017608643": 10, "475987195969": 10, "460032004118": 10, "457779198885": 10, "462649595737": 10, "kernel_string_shar": 10, "22673916817": 10, "826361596584": 10, "793516802788": 10, "782112002373": 10, "776639997959": 10, "795135998726": 10, "722777605057": 10, "762777590752": 10, "75422719717": 10, "804876792431": 10, "778656005859": 10, "769734406471": 10, "782495999336": 10, "932281601429": 10, "734028804302": 10, "721625590324": 10, "736511993408": 10, "800019192696": 10, "724966406822": 10, "722969603539": 10, "759430396557": 10, "kernel_string_til": 10, "22200961113": 10, "91601279974": 10, "752838408947": 10, "873651194572": 10, "69833599329": 10, "586931192875": 10, "516473591328": 10, "411392003298": 10, "384262400866": 10, "82159358263": 10, "632607996464": 10, "506457602978": 10, "618758392334": 10, "500288009644": 10, "429862397909": 10, "44995200038": 10, "366150397062": 10, "342201602459": 10, "793542397022": 10, "58026239872": 10, "494163197279": 10, "546316814423": 10, "467059195042": 10, "404249596596": 10, "440895992517": 10, "341376006603": 10, "339692795277": 10, "783923208714": 10, "597920000553": 10, "50277120471": 10, "615475213528": 10, "470937597752": 10, "418393599987": 10, "443519997597": 10, "343961596489": 10, "342540800571": 10, "780352008343": 10, "611705589294": 10, "515667212009": 10, "622534394264": 10, "502195191383": 10, "437388807535": 10, "45568639636": 10, "359289598465": 10, "426995199919": 10, "788947200775": 10, "616556799412": 10, "496121603251": 10, "629164803028": 10, "474841600657": 10, "407667201757": 10, "47406719923": 10, "371507203579": 10, "352531200647": 10, "72023679018": 10, "574816000462": 10, "481817597151": 10, "580928003788": 10, "455724793673": 10, "394975996017": 10, "464659202099": 10, "357107198238": 10, "324083191156": 10, "759910392761": 10, "569177603722": 10, "481279999018": 10, "528115200996": 10, "441734397411": 10, "393126398325": 10, "455404800177": 10, "350457596779": 10, "322547197342": 10, "754201591015": 10, "579827189445": 10, "491852802038": 10, "582751989365": 10, "451283198595": 10, "391807991266": 10, "456275194883": 10, "356716805696": 10, "362937599421": 10, "809894394875": 10, "60433280468": 10, "507142400742": 10, "655827200413": 10, "474092799425": 10, "408166396618": 10, "480531209707": 10, "346707201004": 10, "780134403706": 10, "601049602032": 10, "493900799751": 10, "620384001732": 10, "494553589821": 10, "425414395332": 10, "467033600807": 10, "375468802452": 10, "346079999208": 10, "771052801609": 10, "593977594376": 10, "49723520875": 10, "583270406723": 10, "478079998493": 10, "416320002079": 10, "443942397833": 10, "359744000435": 10, "343545603752": 10, "780960011482": 10, "598758399487": 10, "498617601395": 10, "57678719759": 10, "46561280489": 10, "41324160099": 10, "431225597858": 10, "351263999939": 10, "34440960288": 10, "933260798454": 10, "715257608891": 10, "586604809761": 10, "711615991592": 10, "558771193027": 10, "466284793615": 10, "44043520093": 10, "361823999882": 10, "731839990616": 10, "57044479847": 10, "470220798254": 10, "608800005913": 10, "472665601969": 10, "416352003813": 10, "481376004219": 10, "380812799931": 10, "351923197508": 10, "719257593155": 10, "55171200037": 10, "466758400202": 10, "568435204029": 10, "459654402733": 10, "394380801916": 10, "463052803278": 10, "36409599781": 10, 
"328998398781": 10, "73579518795": 10, "564575994015": 10, "472236800194": 10, "549024009705": 10, "438406395912": 10, "389945602417": 10, "455193603039": 10, "364051198959": 10, "375519996881": 10, "798195195198": 10, "588998401165": 10, "49552000761": 10, "595462405682": 10, "460972803831": 10, "400672000647": 10, "465132802725": 10, "364627194405": 10, "729363203049": 10, "558815991879": 10, "466655993462": 10, "600819194317": 10, "460281592607": 10, "404908800125": 10, "478739196062": 10, "386668801308": 10, "385510402918": 10, "720915210247": 10, "550668799877": 10, "466937589645": 10, "564921605587": 10, "447974395752": 10, "394271999598": 10, "46233600378": 10, "365190398693": 10, "387827193737": 10, "762003195286": 10, "579007995129": 10, "486649608612": 10, "557331204414": 10, "443033593893": 10, "396070402861": 10, "457075202465": 10, "369555193186": 10, "wish": 10, "modifi": [10, 18], "tile_size_j": 10, "fixed_param": [10, 12], "ceil": [10, 12], "zip": [10, 12], "transfer": [10, 11, 13], "20": [10, 19], "21": 10, "618": 10, "2231903076172": 10, "0x7f887c3d2358": 10, "incorpor": 10, "ifndef": 10, "kerenel": 10, "psedo": 10, "endif": 10, "bypass": 10, "usecas": 11, "test_vector_add": 11, "test_vector_add_parameter": 11, "highlight": 11, "contact": 11, "illustr": 11, "openacc": 11, "dimension": [11, 12, 23], "clean": [11, 16], "center": [11, 12], "lock": [11, 18], "overlap": [11, 13], "shuffl": 11, "pipelin": 11, "consist": [11, 16, 23], "scipi": 11, "algorithm": [11, 14, 19, 23], "cub": 11, "gaussian": 12, "delv": 12, "hand": [12, 16], "sum_": 12, "exp": 12, "beta": [12, 19], "sqrt": 12, "y_i": 12, "z_i": 12, "vector": [12, 13, 20], "coordin": 12, "linalg": 12, "la": 12, "compute_grid": 12, "xgrid": 12, "ygrid": 12, "zgrid": 12, "x0": 12, "y0": 12, "z0": 12, "themselv": 12, "meshgrid": 12, "send": 12, "interv": 12, "256": [12, 14, 20], "suffici": [12, 17], "100": [12, 19, 23], "randomli": [12, 19], "distribut": [12, 16], "linspac": 12, "cpu_grid": 12, "npt": 12, "rand": 12, "xyz": [12, 23], "52320": 12, "160627": 12, "might": [12, 17], "nz": 12, "bz": 12, "kernel_cod": 12, "math": 12, "__host__": 12, "__device__": [12, 22], "b": [12, 14, 16, 19, 20, 22], "addgrid": 12, "xvect": 12, "yvect": 12, "zvect": 12, "dx": 12, "dy": 12, "dz": 12, "assign": 12, "explor": 12, "middl": 12, "henc": [12, 21], "mention": 12, "56833920479": 12, "80796158314": 12, "940044796467": 12, "855628800392": 12, "855359995365": 12, "16174077988": 12, "11877760887": 12, "01592960358": 12, "849273598194": 12, "849235200882": 12, "19029750824": 12, "16199679375": 12, "40401918888": 12, "39618558884": 12, "39508478642": 12, "31647996902": 12, "31470079422": 12, "50787198544": 12, "53760001659": 12, "56709756851": 12, "34500494003": 12, "25130877495": 12, "50662400723": 12, "55267841816": 12, "17987194061": 12, "12309756279": 12, "01125121117": 12, "849631989002": 12, "853708791733": 12, "17051515579": 12, "15584001541": 12, "40074241161": 12, "39547519684": 12, "39331197739": 12, "30295038223": 12, "28725762367": 12, "39589118958": 12, "38867840767": 12, "37724158764": 12, "34344320297": 12, "26213116646": 12, "38793599606": 12, "3775359869": 12, "74003200531": 12, "13276162148": 12, "37233917713": 12, "18835201263": 12, "15777277946": 12, "40247042179": 12, "39366400242": 12, "39439997673": 12, "23719043732": 12, "28542718887": 12, "39207677841": 12, "38956804276": 12, "3778496027": 12, "29814395905": 12, "26398081779": 12, "38625922203": 12, "3754431963": 12, "72981758118": 12, "12483196259": 12, 
"37322881222": 12, "61618566513": 12, "2194111824": 12, "17600002289": 12, "27082881927": 12, "38787200451": 12, "3835711956": 12, "37543039322": 12, "30227203369": 12, "23127679825": 12, "38627202511": 12, "37677440643": 12, "64358406067": 12, "12255358696": 12, "37474560738": 12, "61655673981": 12, "19179515839": 12, "99912958145": 12, "213971138": 12, "16430072784": 12, "38772480488": 12, "3735104084": 12, "54432649612": 12, "05524477959": 12, "36935677528": 12, "42449922562": 12, "10455036163": 12, "67516155243": 12, "programmat": 12, "30": 12, "minimum": 12, "84": 12, "suit": [12, 23], "grid_dim": 12, "associ": 12, "substitut": 12, "ourselv": 12, "extract": 12, "manual": [12, 15], "exlicitli": 12, "accur": [12, 18], "xgpu": 12, "ygpu": 12, "zgpu": 12, "grid_gpu": 12, "80": 12, "133200": 12, "lower": [12, 18, 19], "roughli": [12, 16], "40000": 12, "across": [13, 16], "qualiti": 13, "itself": [13, 14, 23], "precis": 13, "plain": 13, "omp_get_wtim": 13, "openmp": 13, "convolution_stream": 13, "complex": [13, 16], "behind": 13, "spread": 13, "back": [13, 23], "split": 13, "chunk": 13, "slightli": [13, 16, 22], "account": [13, 16], "border": [13, 23], "latter": 13, "cudastreamwaitev": 13, "num_stream": 13, "clarifi": 13, "fit": [13, 19], "choic": [13, 15], "grid_size_x": 13, "grid_size_i": 13, "cudamemcpytosymbol": 13, "upload": 13, "yourself": [13, 23], "spent": [13, 23], "relat": [14, 17, 24], "famili": 14, "launcher": 14, "kt": [14, 21], "easiest": 14, "toolkit": [14, 15], "intend": 14, "Or": [14, 15], "vector_add": [14, 19, 20, 22], "10000000": 14, "512": [14, 20], "research": 14, "cite": 14, "paper": 14, "significantli": [14, 16, 18], "articl": [14, 20], "author": 14, "ben": 14, "van": 14, "werkhoven": 14, "titl": 14, "auto": [14, 16, 18, 19, 22, 23, 24], "journal": 14, "year": 14, "2019": 14, "volum": 14, "90": 14, "347": 14, "358": 14, "url": 14, "www": 14, "sciencedirect": 14, "scienc": 14, "pii": 14, "s0167739x18313359": 14, "doi": 14, "1016": 14, "2018": 14, "08": 14, "004": 14, "referenc": 14, "bayesian": [14, 19, 23], "willemsen2021bayesian": 14, "willemsen": [14, 19], "flori": 14, "jan": 14, "nieuwpoort": 14, "rob": 14, "workshop": 14, "pmb": 14, "supercomput": 14, "sc21": 14, "2021": 14, "arxiv": 14, "ab": 14, "2111": 14, "14991": 14, "difficulti": 14, "schoonhoven2022benchmark": 14, "schoonhoven": 14, "richard": 14, "batenburg": 14, "joost": 14, "ieee": 14, "transact": 14, "evolutionari": 14, "2022": 14, "consumpt": [14, 16, 18], "schoonhoven2022go": 14, "veenboer": 14, "bram": 14, "green": 14, "effici": [14, 16, 18], "steer": 14, "sc22": 14, "2211": 14, "07260": 14, "comprehens": 15, "recommend": [15, 21], "download": 15, "repo": 15, "continuum": 15, "io": 15, "miniconda3": 15, "x86_64": 15, "sh": 15, "newer": [15, 18], "nativ": 15, "prefix": 15, "home": 15, "pythonpath": 15, "bind": [15, 18], "older": 15, "troubl": 15, "retri": 15, "wiki": 15, "tiker": 15, "net": 15, "amd": [15, 18], "app": 15, "sdk": 15, "intel": 15, "appl": 15, "beignet": 15, "stack": 15, "altern": [15, 23], "navig": 15, "benvanwerkhoven": 15, "differenti": [15, 19, 23], "chanc": [15, 19, 22], "algebra": 16, "frequent": 16, "programm": [16, 18], "row": 16, "column": 16, "squar": 16, "matric": 16, "matmul_na": 16, "width": 16, "matmul_kernel": 16, "height": 16, "Of": 16, "solut": [16, 18], "realiti": 16, "contant": 16, "denot": [16, 20, 23], "sensibl": 16, "pick": 16, "word": 16, "warpsiz": 16, "namelijk": 16, "stand": 16, "briefli": 16, "figur": 16, "fifth": 16, "fourth": 16, "dramat": 16, "profil": 16, 
"pretti": 16, "opportun": 16, "realiz": 16, "collabor": 16, "bandwidth": 16, "techniqu": 16, "submatric": 16, "proce": 16, "matmul_shar": 16, "sa": 16, "sb": 16, "kb": 16, "outer": 16, "inner": 16, "race": 16, "drastic": 16, "due": [16, 22, 23], "fortun": 16, "benefit": 16, "redund": 16, "distinct": 16, "1xn": 16, "usag": [16, 18], "occup": 16, "goe": 16, "down": 16, "matmul": 16, "newli": 16, "coupl": 16, "respect": [16, 18], "independ": 16, "yield": 16, "discontinu": 16, "room": 16, "impos": 16, "report": [17, 18, 23, 24], "possibli": [17, 23], "_flop": 17, "total_flop": 17, "ps_energi": [17, 18, 24], "occur": [17, 23], "exhaust": 17, "brute": [17, 19, 20], "forc": [17, 19, 20, 22], "maxim": [17, 23], "boolean": [17, 18, 23], "facilit": 18, "layer": 18, "act": 18, "hook": 18, "pattern": 18, "subscrib": 18, "benchmarkobserv": 18, "overwritten": [18, 23], "extend": 18, "mandatori": 18, "get_result": 18, "aggreg": 18, "after_finish": 18, "after_start": 18, "before_start": 18, "register_configur": 18, "register_devic": 18, "variou": [18, 20], "registerobserv": 18, "track": 18, "num_reg": 18, "current_modul": 18, "powersensor2": 18, "pcie": 18, "intercept": 18, "sensor": 18, "transmit": 18, "usb": 18, "connect": 18, "advantag": 18, "instantan": 18, "frequenc": 18, "khz": 18, "pybind11": 18, "powersensor": [18, 24], "ps_power": [18, 24], "joul": [18, 24], "watt": [18, 24], "ttyacm0": 18, "voltag": 18, "thin": 18, "wrapper": [18, 22], "intricaci": 18, "friendli": 18, "mode": 18, "repeatedli": 18, "downsid": 18, "approach": 18, "save_al": 18, "nvidia_smi_fallback": 18, "use_locked_clock": 18, "continous_dur": 18, "monitor": 18, "clock": [18, 24], "power_read": [18, 24], "nvml_power": [18, 24], "nvml_energi": [18, 24], "core_freq": [18, 24], "mem_freq": [18, 24], "gr_voltag": 18, "ordin": 18, "identifi": 18, "smi": 18, "root": 18, "opt": 18, "amper": 18, "continuous_dur": 18, "common": [18, 22], "cap": 18, "popular": 18, "nvml_gr_clock": [18, 24], "nvml_mem_clock": [18, 24], "nvml_pwr_limit": [18, 24], "graphic": [18, 24], "jetson": 18, "rapl": 18, "xilinx": 18, "pmt": 18, "astron": 18, "nl": 18, "rd": 18, "meter": 18, "arduino": 18, "_energi": 18, "_power": 18, "acceler": 19, "prohibit": 19, "slow": 19, "wast": 19, "basin": [19, 23], "hop": [19, 23], "dual": [19, 23], "anneal": [19, 23], "evolut": [19, 23], "firefli": [19, 23], "genet": [19, 23], "greedi": [19, 23], "multi": [19, 23], "particl": [19, 23], "swarm": [19, 23], "mechan": 19, "overrid": 19, "time_limit": [19, 23], "uniqu": [19, 23], "count": 19, "searchspac": 19, "runner": 19, "nelder": 19, "mead": 19, "powel": 19, "cg": 19, "bfg": 19, "l": 19, "tnc": 19, "cobyla": 19, "slsqp": 19, "reject": 19, "thesi": 19, "generate_normalized_param_dict": 19, "denorm": 19, "normalize_parameter_spac": 19, "param_spac": 19, "prune_parameter_spac": 19, "normalize_dict": 19, "prune": 19, "hyperparamet": 19, "popul": 19, "best1bin": 19, "best1exp": 19, "rand1exp": 19, "randtobest1exp": 19, "best2exp": 19, "rand2exp": 19, "randtobest1bin": 19, "best2bin": 19, "rand2bin": 19, "rand1bin": 19, "popsiz": 19, "maxit": 19, "constr": 19, "compute_intens": 19, "fun": 19, "intens": 19, "distance_to": 19, "euclidian": 19, "move_toward": 19, "alpha": 19, "toward": 19, "b0": 19, "attract": 19, "gamma": 19, "light": 19, "absorpt": 19, "coeffici": 19, "disruptive_uniform_crossov": 19, "dna1": 19, "dna2": 19, "disrupt": 19, "uniform": 19, "crossov": 19, "uniformli": 19, "gene": 19, "children": 19, "guarante": 19, "parent": 19, "mutat": 19, "dna": 19, "mutation_ch": 
19, "single_point_crossov": 19, "single_point": 19, "two_point": 19, "disruptive_uniform": 19, "two_point_crossov": 19, "uniform_crossov": 19, "weighted_choic": 19, "probabl": [19, 23], "il": 19, "neighbor": 19, "ham": 19, "adjac": 19, "greedy": 19, "soon": 19, "no_improv": 19, "exce": 19, "50": 19, "random_walk": 19, "hillclimb": 19, "travers": 19, "inertia": 19, "c1": 19, "cognit": 19, "c2": 19, "social": 19, "fraction": 19, "acceptance_prob": 19, "old_cost": 19, "new_cost": 19, "modif": [19, 21], "po": 19, "t_min": 19, "001": 19, "995": 19, "vector_add_kernel": 20, "wise": 20, "1000000": [20, 22], "recogn": 20, "alright": 20, "portabl": 21, "stick": 21, "pointer": 21, "primit": 21, "lead": 21, "ineffici": 21, "situat": 21, "scientif": 21, "sens": 21, "experiment": 21, "pack": 21, "consult": 21, "create_receive_spec_struct": 21, "0l": 21, "pad": 21, "8byte": 21, "packstr": 21, "iiiiiiiiiiippi": 21, "fffi": 21, "nsampl": 21, "nsamplesiq": 21, "nslowtimesampl": 21, "nchannel": 21, "ntx": 21, "nrepeat": 21, "nfasttimesampl": 21, "rfsize": 21, "mnrow": 21, "mnrowsiq": 21, "nactivechannel": 21, "isiq": 21, "fsiq": 21, "fc": 21, "nbuffer": 21, "frombuff": 21, "len": 21, "receive_spec": 21, "bf": 21, "rf": 21, "recon": 21, "length": 21, "slight": 21, "matlab": 22, "typenam": 22, "my_typ": 22, "regardless": 22, "demot": 22, "rewrit": 22, "real": 22, "risk": 22, "seper": 22, "grid_div_z": 23, "06": 23, "log": 23, "auxilliari": 23, "safer": 23, "notat": 23, "divison": 23, "treat": 23, "warp": 23, "empti": 23, "kepler": 23, "plu": 23, "filter_mod": 23, "address_mod": 23, "clamp": 23, "mirror": 23, "axi": 23, "normalized_coordin": 23, "emtpi": 23, "get_local_s": 23, "satisfi": 23, "000001": 23, "ref": 23, "basinhop": 23, "bayes_opt": 23, "diff_evo": 23, "firefly_algorithm": 23, "genetic_algorithm": 23, "greedy_il": 23, "greedy_ml": 23, "ml": 23, "ordered_greedy_ml": 23, "pso": 23, "simulated_ann": 23, "sort": 23, "resourc": 23, "persist": 23, "consol": 23, "info": 23, "summar": 23, "store_result": 23, "results_filenam": 23, "typicali": 23, "percentag": 23, "create_device_target": 23, "header_filenam": 23, "target": 23, "dtarget_gpu": 23, "name_of_gpu": 23, "chosen": 23, "block_size_": 24, "grid_size_": 24, "compiler_opt_": 24, "loop_unroll_factor_": 24, "nvml_": 24, "nvmlobserv": 24}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", 
"copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner": [[23, 2, 1, "", "create_device_targets"], [23, 2, 1, "", "run_kernel"], [23, 2, 1, "", "store_results"], [23, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[18, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[18, 1, 1, "", "after_finish"], [18, 1, 1, "", "after_start"], [18, 1, 1, "", "before_start"], [18, 1, 1, "", "during"], [18, 1, 1, "", "get_results"], [18, 1, 1, "", "register_configuration"], [18, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[18, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[18, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[18, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 
0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[19, 3, 0, "-", "basinhopping"], [19, 3, 0, "-", "bayes_opt"], [19, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [19, 3, 0, "-", "diff_evo"], [19, 3, 0, "-", "dual_annealing"], [19, 3, 0, "-", "firefly_algorithm"], [19, 3, 0, "-", "genetic_algorithm"], [19, 3, 0, "-", "greedy_ils"], [19, 3, 0, "-", "greedy_mls"], [19, 3, 0, "-", "minimize"], [19, 3, 0, "-", "mls"], [19, 3, 0, "-", "ordered_greedy_mls"], [19, 3, 0, "-", "pso"], [19, 3, 0, "-", "random_sample"], [19, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[19, 2, 1, "", "generate_normalized_param_dicts"], [19, 2, 1, "", "normalize_parameter_space"], [19, 2, 1, "", "prune_parameter_space"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[19, 0, 1, "", "Firefly"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[19, 1, 1, "", "compute_intensity"], [19, 1, 1, "", "distance_to"], [19, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[19, 2, 1, "", "disruptive_uniform_crossover"], [19, 2, 1, "", "mutate"], [19, 2, 1, "", "single_point_crossover"], [19, 2, 1, "", "tune"], [19, 2, 1, "", "two_point_crossover"], [19, 2, 1, "", "uniform_crossover"], [19, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[19, 2, 1, "", "acceptance_prob"], [19, 2, 1, "", "neighbor"], [19, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restriction"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", 
"delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 15, 22], "cuda": [0, 15, 16], "featur": [0, 2], "support": 0, "usag": [0, 14], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 14], "kernel": [2, 8, 9, 10, 11, 12, 14, 16, 22], "tuner": [2, 8, 9, 10, 11, 12, 14], "document": [2, 6, 7, 14, 23], "guid": [2, 3, 15], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 8, 9, 10, 11, 13], "simpl": 3, "develop": [3, 7], "setup": [3, 7], "convolut": [4, 11], "2d": 4, "exampl": [4, 11, 14, 22], "implement": [4, 8, 9, 10], "test": [4, 7], "tune": [4, 8, 9, 10, 12, 13, 16, 17, 18], "more": 4, "tunabl": 4, "paramet": [4, 10, 12, 18, 24], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 19], "kernel_tun": [6, 19], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 15], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 15], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 15], "hipfunct": 6, "util": 6, "function": 6, "environ": 7, "local": [7, 9], "cluster": 7, "run": [7, 10], "build": 7, "diffus": [8, 9, 10], "python": [8, 9, 10, 15], "comput": [8, 9, 10], "gpu": [8, 9, 10, 12], "auto": [8, 9, 10], "us": [8, 9, 10, 12, 16, 21], "share": [8, 9, 10, 16], "memori": [8, 9, 10, 16], "tile": [8, 9, 10], "store": [8, 9], "result": [8, 9], "tutori": [9, 10], "from": [9, 10], "physic": [9, 10], "best": 10, "product": 10, "c": 10, "vector": 11, "add": 11, "stencil": 11, "matrix": [11, 16], "multipl": [11, 16], "py": 11, "sepconv": 11, "convolution_correct": 11, "convolution_stream": 11, "reduct": 11, "spars": 11, "point": 11, "polygon": 11, "expdist": 11, "gener": 11, "3d": 12, "grid": 12, "let": 12, "": 12, "start": [12, 20], "cpu": 12, "move": 12, "optim": [12, 19], "host": 13, "number": 13, "stream": 13, "quick": 14, "instal": [14, 15], "citat": 14, "packag": 15, "other": 15, "pyopencl": 15, "pyhip": 15, "git": 15, "version": 15, "depend": 15, "naiv": 16, "increas": 16, "work": 16, "per": 16, "thread": 16, "metric": 17, "object": 17, "observ": 18, "powersensorobserv": 18, "nvmlobserv": 18, "execut": 18, "nvml": 
18, "pmtobserv": 18, "basinhop": 19, "bayes_opt": 19, "brute_forc": 19, "diff_evo": 19, "dual_ann": 19, "firefly_algorithm": 19, "genetic_algorithm": 19, "greedy_il": 19, "greedy_ml": 19, "minim": 19, "ml": 19, "ordered_greedy_ml": 19, "pso": 19, "random_sampl": 19, "simulated_ann": 19, "get": 20, "struct": 21, "templat": 22, "select": 22, "api": 23, "vocabulari": 24}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [14, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Simple development setup": [[3, "simple-development-setup"]], "Convolution": [[4, "Convolution"], [11, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Development environment": [[7, "development-environment"]], "Local setup": [[7, "local-setup"]], "Cluster setup": [[7, "cluster-setup"]], "Running tests": [[7, "running-tests"]], "Building documentation": [[7, "building-documentation"]], "Diffusion": [[8, "Diffusion"], [8, "id1"], [9, "Diffusion"], [10, "Diffusion"]], "Python implementation": [[8, "Python-implementation"], [9, "Python-implementation"], [10, "Python-implementation"]], "Computing on the GPU": [[8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"], [10, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[8, 
"Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"], [10, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[8, "Using-Shared-Memory"]], "Tiling GPU Code": [[8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"], [10, "Tiling-GPU-Code"]], "Storing the results": [[8, "Storing-the-results"], [9, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[9, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [10, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[9, "Using-Shared-(local)-Memory"]], "Using shared memory": [[10, "Using-shared-memory"], [16, "Using-shared-memory"]], "Using the best parameters in a production run": [[10, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[10, "Python-run"]], "C run": [[10, "C-run"]], "Kernel Tuner Examples": [[11, "kernel-tuner-examples"]], "Vector Add": [[11, "vector-add"]], "Stencil": [[11, "stencil"]], "Matrix Multiplication": [[11, "matrix-multiplication"]], "convolution.py": [[11, "convolution-py"]], "sepconv.py": [[11, "sepconv-py"]], "convolution_correct.py": [[11, "convolution-correct-py"]], "convolution_streams.py": [[11, "convolution-streams-py"]], "Reduction": [[11, "reduction"]], "Sparse Matrix Vector Multiplication": [[11, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[11, "point-in-polygon"]], "ExpDist": [[11, "expdist"]], "Code Generator": [[11, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[12, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[12, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[12, "Let's-move-to-the-GPU"]], "Tune the kernel": [[12, "Tune-the-kernel"]], "Using the optimized parameters": [[12, "Using-the-optimized-parameters"]], "Tuning Host Code": [[13, "tuning-host-code"]], "Tuning the number of streams": [[13, "tuning-the-number-of-streams"]], "Quick install": [[14, "quick-install"]], "Example usage": [[14, "example-usage"]], "Citation": [[14, "citation"]], "Installation": [[15, "installation"]], "Python": [[15, "python"]], "Installing Python Packages": [[15, "installing-python-packages"]], "CUDA and PyCUDA": [[15, "cuda-and-pycuda"]], "Other CUDA Backends": [[15, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[15, "opencl-and-pyopencl"]], "HIP and PyHIP": [[15, "hip-and-pyhip"]], "Installing the git version": [[15, "installing-the-git-version"]], "Dependencies for the guides": [[15, "dependencies-for-the-guides"]], "Matrix multiplication": [[16, "Matrix-multiplication"]], "Naive CUDA kernel": [[16, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[16, "Tuning-a-naive-kernel"]], "Increase work per thread": [[16, "Increase-work-per-thread"]], "Metrics and Objectives": [[17, "metrics-and-objectives"]], "Metrics": [[17, "metrics"]], "Tuning Objectives": [[17, "tuning-objectives"]], "Observers": [[18, "observers"]], "PowerSensorObserver": [[18, "powersensorobserver"]], "NVMLObserver": [[18, "nvmlobserver"]], "Tuning execution parameters with NVML": [[18, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[18, "pmtobserver"]], "Optimization strategies": [[19, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], 
"kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[20, "getting-started"]], "Using structs": [[21, "using-structs"]], "Templated kernels": [[22, "templated-kernels"]], "Example": [[22, "example"]], "Selecting a backend": [[22, "selecting-a-backend"]], "API Documentation": [[23, "api-documentation"]], "Parameter Vocabulary": [[24, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() 
(kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restriction"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], 
"copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() 
(in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions 
method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [19, "module-kernel_tuner.strategies.basinhopping"], [19, "module-kernel_tuner.strategies.bayes_opt"], [19, "module-kernel_tuner.strategies.brute_force"], [19, "module-kernel_tuner.strategies.diff_evo"], [19, "module-kernel_tuner.strategies.dual_annealing"], [19, "module-kernel_tuner.strategies.firefly_algorithm"], [19, "module-kernel_tuner.strategies.genetic_algorithm"], [19, "module-kernel_tuner.strategies.greedy_ils"], [19, "module-kernel_tuner.strategies.greedy_mls"], [19, "module-kernel_tuner.strategies.minimize"], [19, "module-kernel_tuner.strategies.mls"], [19, "module-kernel_tuner.strategies.ordered_greedy_mls"], [19, "module-kernel_tuner.strategies.pso"], [19, "module-kernel_tuner.strategies.random_sample"], [19, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, 
"kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() 
(kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[18, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[18, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[18, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[18, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[19, 
"kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[19, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[19, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[19, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in 
module kernel_tuner.strategies.dual_annealing)": [[19, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[19, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[19, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[19, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[19, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[19, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[19, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[19, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[23, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[23, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.tune_kernel"]]}})
                    \ No newline at end of file
                    +Search.setIndex({"alltitles": {"2D Convolution example": [[4, "2D-Convolution-example"]], "3D Grid on GPU with Kernel Tuner": [[12, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "API Documentation": [[23, "api-documentation"]], "Auto-Tuning with the Kernel Tuner": [[8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"], [10, "Auto-Tuning-with-the-Kernel-Tuner"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Backends": [[0, "backends"]], "Building documentation": [[7, "building-documentation"]], "C run": [[10, "C-run"]], "CUDA Backends": [[0, "cuda-backends"]], "CUDA and PyCUDA": [[15, "cuda-and-pycuda"]], "Cache files": [[1, "cache-files"]], "Citation": [[14, "citation"]], "Cluster setup": [[7, "cluster-setup"]], "Code Generator": [[11, "code-generator"]], "Computing on the GPU": [[8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"], [10, "Computing-on-the-GPU"]], "Contributing Code": [[3, "contributing-code"]], "Contribution guide": [[3, "contribution-guide"]], "Convolution": [[4, "Convolution"], [11, "convolution"]], "Correctness Verification": [[5, "correctness-verification"]], "Dependencies for the guides": [[15, "dependencies-for-the-guides"]], "Design documentation": [[6, "design-documentation"]], "Development environment": [[7, "development-environment"]], "Device Interfaces": [[6, "device-interfaces"]], "Diffusion": [[8, "Diffusion"], [8, "id1"], [9, "Diffusion"], [10, "Diffusion"]], "Example": [[22, "example"]], "Example usage": [[14, "example-usage"]], "ExpDist": [[11, "expdist"]], "Features": [[2, null]], "Getting Started": [[20, "getting-started"]], "Guides": [[2, null]], "HIP and PyHIP": [[15, "hip-and-pyhip"]], "Implement a test": [[4, "Implement-a-test"]], "Increase work per thread": [[16, "Increase-work-per-thread"]], "Installation": [[15, "installation"]], "Installing Python Packages": [[15, "installing-python-packages"]], "Installing the git version": [[15, "installing-the-git-version"]], "Kernel Tuner": [[2, null]], "Kernel Tuner Examples": [[11, "kernel-tuner-examples"]], "Let\u2019s move to the GPU": [[12, "Let's-move-to-the-GPU"]], "Let\u2019s start on the CPU": [[12, "Let's-start-on-the-CPU"]], "Local setup": [[7, "local-setup"]], "Matrix Multiplication": [[11, "matrix-multiplication"]], "Matrix multiplication": [[16, "Matrix-multiplication"]], "Metrics": [[17, "metrics"]], "Metrics and Objectives": [[17, "metrics-and-objectives"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "NVMLObserver": [[18, "nvmlobserver"]], "Naive CUDA kernel": [[16, "Naive-CUDA-kernel"]], "Observers": [[18, "observers"]], "OpenCL and PyOpenCL": [[15, "opencl-and-pyopencl"]], "Optimization strategies": [[19, "optimization-strategies"]], "Other CUDA Backends": [[15, "other-cuda-backends"]], "PMTObserver": [[18, "pmtobserver"]], "Parameter Vocabulary": [[24, "parameter-vocabulary"]], "Point-in-Polygon": [[11, "point-in-polygon"]], "PowerSensorObserver": [[18, "powersensorobserver"]], "Python": [[15, "python"]], "Python implementation": [[8, "Python-implementation"], [9, "Python-implementation"], [10, "Python-implementation"]], "Python run": [[10, "Python-run"]], "Quick install": [[14, "quick-install"]], "Reduction": [[11, "reduction"]], "Reference": [[2, null]], "Reporting Issues": [[3, "reporting-issues"]], "Runners": [[6, "runners"]], "Running tests": [[7, "running-tests"]], "Selecting a backend": [[22, "selecting-a-backend"]], "Simple development setup": [[3, "simple-development-setup"]], 
"Sparse Matrix Vector Multiplication": [[11, "sparse-matrix-vector-multiplication"]], "Stencil": [[11, "stencil"]], "Storing the results": [[8, "Storing-the-results"], [9, "Storing-the-results"]], "Strategies": [[6, "strategies"]], "Templated kernels": [[22, "templated-kernels"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [14, "the-kernel-tuner-documentation"]], "Tiling GPU Code": [[8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"], [10, "Tiling-GPU-Code"]], "Tune the kernel": [[12, "Tune-the-kernel"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "Tuning Host Code": [[13, "tuning-host-code"]], "Tuning Objectives": [[17, "tuning-objectives"]], "Tuning a naive kernel": [[16, "Tuning-a-naive-kernel"]], "Tuning execution parameters with NVML": [[18, "tuning-execution-parameters-with-nvml"]], "Tuning the number of streams": [[13, "tuning-the-number-of-streams"]], "Tutorial: From physics to tuned GPU kernels": [[9, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [10, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[9, "Using-Shared-(local)-Memory"]], "Using Shared Memory": [[8, "Using-Shared-Memory"]], "Using shared memory": [[10, "Using-shared-memory"], [16, "Using-shared-memory"]], "Using structs": [[21, "using-structs"]], "Using the best parameters in a production run": [[10, "Using-the-best-parameters-in-a-production-run"]], "Using the optimized parameters": [[12, "Using-the-optimized-parameters"]], "Util Functions": [[6, "util-functions"]], "Vector Add": [[11, "vector-add"]], "convolution.py": [[11, "convolution-py"]], "convolution_correct.py": [[11, "convolution-correct-py"]], "convolution_streams.py": [[11, "convolution-streams-py"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[19, 
"module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "sepconv.py": [[11, "sepconv-py"]]}, "docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "dev-environment", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "envversion": {"nbsphinx": 4, "sphinx": 61, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "dev-environment.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "indexentries": {"__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__", false]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__", false]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__", false]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__", false]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__", false]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__", false]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__", false]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__", false]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__", false]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.acceptance_prob", false]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_finish", false]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.after_start", false]], 
"before_start() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.before_start", false]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark", false]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous", false]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default", false]], "benchmarkobserver (class in kernel_tuner.observers)": [[18, "kernel_tuner.observers.BenchmarkObserver", false]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list", false]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type", false]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output", false]], "check_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restriction", false]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions", false]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion", false]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions", false]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list", false]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib", false]], "compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig", false]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile", false]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile", false]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile", false]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile", false]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile", false]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile", false]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel", false]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions", false]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions", false]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity", false]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid", false]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction", false]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, 
"kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args", false]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args", false]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args", false]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args", false]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache", false]], "create_device_targets() (in module kernel_tuner)": [[23, "kernel_tuner.create_device_targets", false]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance", false]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check", false]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions", false]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions", false]], 
"default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default", false]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file", false]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language", false]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface", false]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover", false]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to", false]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache", false]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.during", false]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig", false]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly", false]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts", false]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config", false]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string", false]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment", false]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions", false]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string", false]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string", false]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options", false]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size", false]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.get_results", false]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args", false]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring", false]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename", false]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions", false]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings", false]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions", false]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig", false]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, 
"kernel_tuner.backends.hip.HipFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished", false]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished", false]], "kernel_tuner.strategies.basinhopping": [[19, "module-kernel_tuner.strategies.basinhopping", false]], "kernel_tuner.strategies.bayes_opt": [[19, "module-kernel_tuner.strategies.bayes_opt", false]], "kernel_tuner.strategies.brute_force": [[19, "module-kernel_tuner.strategies.brute_force", false]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common", false]], "kernel_tuner.strategies.diff_evo": [[19, "module-kernel_tuner.strategies.diff_evo", false]], "kernel_tuner.strategies.dual_annealing": [[19, "module-kernel_tuner.strategies.dual_annealing", false]], "kernel_tuner.strategies.firefly_algorithm": [[19, "module-kernel_tuner.strategies.firefly_algorithm", false]], "kernel_tuner.strategies.genetic_algorithm": [[19, "module-kernel_tuner.strategies.genetic_algorithm", false]], "kernel_tuner.strategies.greedy_ils": [[19, "module-kernel_tuner.strategies.greedy_ils", false]], "kernel_tuner.strategies.greedy_mls": [[19, "module-kernel_tuner.strategies.greedy_mls", false]], "kernel_tuner.strategies.minimize": [[19, "module-kernel_tuner.strategies.minimize", false]], "kernel_tuner.strategies.mls": [[19, "module-kernel_tuner.strategies.mls", false]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, "module-kernel_tuner.strategies.ordered_greedy_mls", false]], "kernel_tuner.strategies.pso": [[19, "module-kernel_tuner.strategies.pso", false]], "kernel_tuner.strategies.random_sample": [[19, "module-kernel_tuner.strategies.random_sample", false]], "kernel_tuner.strategies.simulated_annealing": [[19, "module-kernel_tuner.strategies.simulated_annealing", false]], "kernel_tuner.util": [[6, "module-kernel_tuner.util", false]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename", false]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc", false]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh", false]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh", false]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, 
"kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod", false]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod", false]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset", false]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset", false]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset", false]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset", false]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset", false]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset", false]], "module": [[6, "module-kernel_tuner.strategies.common", false], [6, "module-kernel_tuner.util", false], [19, "module-kernel_tuner.strategies.basinhopping", false], [19, "module-kernel_tuner.strategies.bayes_opt", false], [19, "module-kernel_tuner.strategies.brute_force", false], [19, "module-kernel_tuner.strategies.diff_evo", false], [19, "module-kernel_tuner.strategies.dual_annealing", false], [19, "module-kernel_tuner.strategies.firefly_algorithm", false], [19, "module-kernel_tuner.strategies.genetic_algorithm", false], [19, "module-kernel_tuner.strategies.greedy_ils", false], [19, "module-kernel_tuner.strategies.greedy_mls", false], [19, "module-kernel_tuner.strategies.minimize", false], [19, "module-kernel_tuner.strategies.mls", false], [19, "module-kernel_tuner.strategies.ordered_greedy_mls", false], [19, "module-kernel_tuner.strategies.pso", false], [19, "module-kernel_tuner.strategies.random_sample", false], [19, "module-kernel_tuner.strategies.simulated_annealing", false]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[19, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards", false]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.mutate", false]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.neighbor", false]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space", false]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function", false]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder", false]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[18, "kernel_tuner.observers.nvml.NVMLObserver", false]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, 
"kernel_tuner.backends.opencl.OpenCLFunctions", false]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions", false]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[18, "kernel_tuner.observers.pmt.PMTObserver", false]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[18, "kernel_tuner.observers.powersensor.PowerSensorObserver", false]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string", false]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments", false]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config", false]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output", false]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache", false]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics", false]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.prune_parameter_space", false]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions", false]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache", false]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file", false]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list", false]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list", false]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_configuration", false]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[18, "kernel_tuner.observers.BenchmarkObserver.register_device", false]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences", false]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run", false]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run", false]], "run_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.run_kernel", false]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, 
"kernel_tuner.backends.compiler.CompilerFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel", false]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel", false]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig", false]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params", false]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner", false]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters", false]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid", false]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments", false]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options", false]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner", false]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover", false]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure", false]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config", false]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event", false]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event", false]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event", false]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event", false]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event", false]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event", false]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event", 
false]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event", false]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event", false]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached", false]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache", false]], "store_results() (in module kernel_tuner)": [[23, "kernel_tuner.store_results", false]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize", false]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize", false]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc", false]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[19, "kernel_tuner.strategies.basinhopping.tune", false]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[19, "kernel_tuner.strategies.bayes_opt.tune", false]], "tune() (in module kernel_tuner.strategies.brute_force)": [[19, "kernel_tuner.strategies.brute_force.tune", false]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[19, "kernel_tuner.strategies.diff_evo.tune", false]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[19, "kernel_tuner.strategies.dual_annealing.tune", false]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[19, "kernel_tuner.strategies.firefly_algorithm.tune", false]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.tune", false]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[19, "kernel_tuner.strategies.greedy_ils.tune", false]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[19, "kernel_tuner.strategies.greedy_mls.tune", false]], "tune() (in module kernel_tuner.strategies.minimize)": [[19, "kernel_tuner.strategies.minimize.tune", false]], "tune() (in module kernel_tuner.strategies.mls)": [[19, "kernel_tuner.strategies.mls.tune", false]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[19, "kernel_tuner.strategies.ordered_greedy_mls.tune", false]], "tune() (in module kernel_tuner.strategies.pso)": [[19, "kernel_tuner.strategies.pso.tune", false]], "tune() (in module kernel_tuner.strategies.random_sample)": [[19, "kernel_tuner.strategies.random_sample.tune", false]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[19, "kernel_tuner.strategies.simulated_annealing.tune", false]], "tune_kernel() (in module kernel_tuner)": [[23, "kernel_tuner.tune_kernel", false]], 
"two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover", false]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover", false]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest", false]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[19, "kernel_tuner.strategies.genetic_algorithm.weighted_choice", false]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file", false]]}, "objects": {"kernel_tuner": [[23, 2, 1, "", "create_device_targets"], [23, 2, 1, "", "run_kernel"], [23, 2, 1, "", "store_results"], [23, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", 
"run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner.observers": [[18, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[18, 1, 1, "", "after_finish"], [18, 1, 1, "", "after_start"], [18, 1, 1, "", "before_start"], [18, 1, 1, "", "during"], [18, 1, 1, "", "get_results"], [18, 1, 1, "", "register_configuration"], [18, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[18, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[18, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[18, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[19, 3, 0, "-", "basinhopping"], [19, 3, 0, "-", "bayes_opt"], [19, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [19, 3, 0, "-", "diff_evo"], [19, 3, 0, "-", "dual_annealing"], [19, 3, 0, "-", "firefly_algorithm"], [19, 3, 0, "-", "genetic_algorithm"], [19, 3, 0, "-", "greedy_ils"], [19, 3, 0, "-", "greedy_mls"], [19, 3, 0, "-", "minimize"], [19, 3, 0, "-", "mls"], [19, 3, 0, "-", "ordered_greedy_mls"], [19, 3, 0, "-", "pso"], [19, 3, 0, "-", "random_sample"], [19, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[19, 2, 1, "", "generate_normalized_param_dicts"], [19, 2, 1, "", "normalize_parameter_space"], [19, 2, 1, "", "prune_parameter_space"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[19, 2, 1, "", "tune"]], 
"kernel_tuner.strategies.dual_annealing": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[19, 0, 1, "", "Firefly"], [19, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[19, 1, 1, "", "compute_intensity"], [19, 1, 1, "", "distance_to"], [19, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[19, 2, 1, "", "disruptive_uniform_crossover"], [19, 2, 1, "", "mutate"], [19, 2, 1, "", "single_point_crossover"], [19, 2, 1, "", "tune"], [19, 2, 1, "", "two_point_crossover"], [19, 2, 1, "", "uniform_crossover"], [19, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[19, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[19, 2, 1, "", "acceptance_prob"], [19, 2, 1, "", "neighbor"], [19, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restriction"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "terms": {"": [0, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23], "0": [4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 21, 23], "000001": 23, "001": 19, "004": 14, "0079296": 9, "01": 9, "01125121117": 12, 
"01592960358": 12, "018869400024": 8, "02665598392": 8, "0341696": 9, "03752319813": 8, "0398656": 9, "04807043076": 8, "05": [8, 9], "05262081623": 8, "054880023": 8, "05524477959": 12, "05549435616": 8, "05816960335": 8, "05957758427": 8, "06": 23, "0626496": 9, "0629119873": 8, "06332798004": 8, "06672639847": 8, "06709122658": 8, "06844799519": 8, "0688128": 9, "0692224": 9, "06983039379": 8, "07002239227": 8, "0708288": 9, "07260": 14, "0731967926": 8, "07386879921": 8, "07484800816": 8, "07508480549": 8, "0759360075": 8, "0799423933": 8, "08": 14, "0801344": 9, "0808192": 9, "0809792": 9, "08220798969": 8, "0836928": 9, "08389122486": 8, "0849664": 9, "086019515991": 10, "0873216": 9, "09015038013": 8, "0909248": 9, "0911808": 9, "0927808": 9, "095232": 9, "09730558395": 8, "09794559479": 8, "0f": [8, 9, 10], "0l": 21, "0x2aaab952f240": 8, "0x2aaabbdcb2e8": 8, "0x2aab1c98b3c8": 8, "0x2aab1de088d0": 9, "0x7f8858b873c8": 10, "0x7f8865b51f28": 10, "0x7f887c3d2358": 10, "0x7f888f8cd7b8": 10, "1": [4, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 23], "10": [7, 8, 9, 10, 14, 19], "100": [12, 19, 23], "1000": [8, 9, 10, 12], "10000": 6, "1000000": [20, 22], "10000000": 14, "10033922195": 8, "10066559315": 8, "10125439167": 8, "1016": 14, "1018432": 9, "1024": [4, 8, 9, 10, 20], "1035136": 9, "10455036163": 12, "1065792": 9, "10700161457": 8, "10740480423": 8, "1080": [10, 12], "108896": 9, "1095936": 9, "11": [7, 8, 9, 10], "1107712": 9, "1140736": 9, "1145664": 9, "11514236927": 8, "11877760887": 12, "12": [7, 8, 9, 10], "12000002861": 8, "12033278942": 8, "1219392": 9, "12255358696": 12, "1230336": 9, "12309756279": 12, "12483196259": 12, "1249984": 9, "1264": 9, "128": [4, 8, 9, 10, 14, 20, 22], "128x32": [8, 9, 10], "13": [8, 9, 10], "13023357391": 8, "13276162148": 12, "13297917843": 8, "133200": 12, "134233": 6, "1384064": 9, "14": [8, 9, 10], "140": 9, "140192": 9, "1428928": 9, "14420480728": 8, "14729599953": 8, "1493952": 9, "14991": 14, "15": [8, 9, 10, 22], "15089921951": 8, "154": 9, "1545856": 9, "1547072": 9, "15584001541": 12, "15777277946": 12, "1583104": 9, "15916161537": 8, "16": [4, 5, 8, 9, 10, 12, 13, 16], "160627": 12, "16174077988": 12, "16199679375": 12, "16430072784": 12, "17": [4, 5, 8, 9, 10, 13], "17051515579": 12, "171552": 9, "174624": 9, "1748096": 9, "17600002289": 12, "1790336": 9, "17987194061": 12, "18": [8, 9, 10], "18713598251": 8, "18835201263": 12, "19": [9, 10], "19029750824": 12, "19084160328": 8, "1912576": 9, "19179515839": 12, "1942016": 9, "1e": [5, 23], "1e3": [4, 8, 9, 10, 16, 17], "1e9": [4, 16], "1xn": 16, "2": [4, 5, 7, 8, 9, 10, 11, 12, 13, 16, 18, 19, 23], "20": [10, 19], "2000": [8, 9, 10], "2018": 14, "2019": 14, "2021": 14, "2022": 14, "2048": 4, "21": 10, "2111": 14, "2124736": 9, "213971138": 12, "2194111824": 12, "2211": 14, "22200961113": 10, "22305920124": 8, "2231903076172": 10, "225": [8, 9, 10], "225f": [8, 9, 10], "22673916817": 10, "231072": 9, "23127679825": 12, "234342": 6, "23719043732": 12, "25130877495": 12, "256": [12, 14, 20], "26": [8, 9, 10], "26213116646": 12, "2634239912": 8, "26398081779": 12, "27082881927": 12, "28542718887": 12, "28725762367": 12, "29789438248": 8, "29814395905": 12, "2d": [8, 9, 10, 11], "2u_": [8, 9, 10], "3": [5, 7, 8, 9, 10, 12, 13, 15, 16, 19, 23], "30": 12, "30227203369": 12, "30295038223": 12, "3039616": 9, "31470079422": 12, "31647996902": 12, "31661438942": 8, "32": [4, 6, 8, 9, 10, 12, 14, 16, 20, 23], "322547197342": 10, "324083191156": 10, "3245952": 9, "3269824028": 8, "328998398781": 10, 
"32x2": [8, 9, 10], "33": 10, "339692795277": 10, "341376006603": 10, "342201602459": 10, "342540800571": 10, "34344320297": 12, "343545603752": 10, "343961596489": 10, "34440960288": 10, "34500494003": 12, "346079999208": 10, "346707201004": 10, "347": 14, "350457596779": 10, "351263999939": 10, "351923197508": 10, "352531200647": 10, "356716805696": 10, "357107198238": 10, "358": 14, "359289598465": 10, "359744000435": 10, "361823999882": 10, "362937599421": 10, "364051198959": 10, "36409599781": 10, "364627194405": 10, "365190398693": 10, "366150397062": 10, "36935677528": 12, "369555193186": 10, "371507203579": 10, "37233917713": 12, "37322881222": 12, "3735104084": 12, "37474560738": 12, "37543039322": 12, "3754431963": 12, "375468802452": 10, "375519996881": 10, "37677440643": 12, "37724158764": 12, "3775359869": 12, "3778496027": 12, "380812799931": 10, "3835711956": 12, "384262400866": 10, "385510402918": 10, "38625922203": 12, "38627202511": 12, "386668801308": 10, "38772480488": 12, "387827193737": 10, "38787200451": 12, "38793599606": 12, "38867840767": 12, "38956804276": 12, "389945602417": 10, "391807991266": 10, "39207677841": 12, "393126398325": 10, "39331197739": 12, "39366400242": 12, "394271999598": 10, "394380801916": 10, "39439997673": 12, "394975996017": 10, "39508478642": 12, "39547519684": 12, "39589118958": 12, "396070402861": 10, "39618558884": 12, "4": [4, 8, 9, 10, 12, 16, 18], "40000": 12, "400672000647": 10, "40074241161": 12, "4015104": 9, "40247042179": 12, "40401918888": 12, "404249596596": 10, "404908800125": 10, "407667201757": 10, "408166396618": 10, "4096": [4, 5, 8, 9, 10, 13, 16], "411392003298": 10, "41324160099": 10, "4152": 10, "416320002079": 10, "416352003813": 10, "4164": 8, "418393599987": 10, "42": 4, "423038482666016": 8, "42449922562": 12, "425414395332": 10, "426995199919": 10, "429862397909": 10, "4303104": 9, "431225597858": 10, "437388807535": 10, "438406395912": 10, "44043520093": 10, "440895992517": 10, "441734397411": 10, "443033593893": 10, "443519997597": 10, "443942397833": 10, "444": 9, "447974395752": 10, "449055999517": 10, "44974719882": 10, "44995200038": 10, "451283198595": 10, "455193603039": 10, "455404800177": 10, "45568639636": 10, "455724793673": 10, "456275194883": 10, "457075202465": 10, "457427197695": 10, "457715201378": 10, "457779198885": 10, "458412796259": 10, "459654402733": 10, "460032004118": 10, "460281592607": 10, "460972803831": 10, "461017608643": 10, "46109390258789": 10, "46233600378": 10, "462649595737": 10, "463052803278": 10, "464659202099": 10, "464863997698": 10, "465132802725": 10, "46561280489": 10, "466118401289": 10, "466284793615": 10, "466655993462": 10, "466758400202": 10, "466937589645": 10, "467033600807": 10, "467059195042": 10, "470220798254": 10, "470937597752": 10, "472236800194": 10, "472665601969": 10, "47406719923": 10, "474092799425": 10, "474841600657": 10, "475264000893": 10, "475987195969": 10, "478079998493": 10, "478739196062": 10, "48": [8, 9, 10], "480531209707": 10, "481279999018": 10, "481376004219": 10, "481817597151": 10, "486515200138": 10, "486649608612": 10, "489004802704": 10, "491852802038": 10, "492915201187": 10, "493900799751": 10, "494163197279": 10, "494553589821": 10, "49552000761": 10, "496121603251": 10, "49723520875": 10, "498617601395": 10, "4u_": [8, 9, 10], "5": [8, 9, 10, 12, 19], "50": 19, "500": [8, 9, 10], "500288009644": 10, "500524806976": 10, "502195191383": 10, "50277120471": 10, "506457602978": 10, "50662400723": 12, "507142400742": 10, 
"50787198544": 12, "50912": 9, "5108224": 9, "5116928": 9, "512": [14, 20], "513356792927": 10, "513632011414": 10, "515667212009": 10, "516473591328": 10, "5172352": 9, "5188416": 9, "5230208": 9, "52320": 12, "5235328": 9, "528115200996": 10, "5289728": 9, "53": [8, 9, 10], "53760001659": 12, "538227200508": 8, "5383296": 9, "539891195297": 8, "540352010727": 8, "540383994579": 8, "5412672": 9, "542387211323": 8, "542937588692": 8, "544224": 9, "54432649612": 12, "5445248": 9, "544691193104": 8, "545715200901": 10, "546316814423": 10, "5470016": 9, "5475584": 9, "5476864": 9, "5478592": 9, "549024009705": 10, "5492416": 9, "54944": 9, "550105595589": 8, "550668799877": 10, "55171200037": 10, "55267841816": 12, "554745602608": 8, "5548416": 9, "556576": 9, "557331204414": 10, "5582144": 9, "558771193027": 10, "558815991879": 10, "5591744": 9, "5592064": 9, "560505592823": 8, "5620608": 9, "562521612644": 8, "5631168": 9, "563417613506": 8, "564575994015": 10, "5648128": 9, "564921605587": 10, "565254402161": 8, "56585599184": 8, "56709756851": 12, "567417597771": 8, "567456": 9, "56833920479": 12, "568435204029": 10, "568556785583": 8, "5685632": 9, "569177603722": 10, "569388794899": 8, "57044479847": 10, "5715968": 9, "5717568": 9, "5719744": 9, "573836791515": 8, "574816000462": 10, "5753152": 9, "575859189034": 8, "576044797897": 8, "57678719759": 10, "577215993404": 8, "578681600094": 8, "578745603561": 8, "579007995129": 10, "579411196709": 8, "579827189445": 10, "579904007912": 8, "58026239872": 10, "58035838604": 8, "580928003788": 10, "581280004978": 8, "582751989365": 10, "583270406723": 10, "5863488": 9, "586604809761": 10, "586931192875": 10, "588492810726": 8, "588998401165": 10, "5898432": 9, "59088640213": 8, "5920832": 9, "593977594376": 10, "595276796818": 8, "595462405682": 10, "5968448": 9, "5968704": 9, "597267186642": 8, "597920000553": 10, "598": 9, "598758399487": 10, "6": [5, 8, 9, 10, 12, 13, 23], "600819194317": 10, "601049602032": 10, "601856": 9, "60216319561": 8, "6038336": 9, "60433280468": 10, "6047296": 9, "605760002136": 8, "6076224": 9, "6081536": 9, "608800005913": 10, "60942081213": 8, "611705589294": 10, "613088": 9, "6146176": 9, "615148806572": 8, "615475213528": 10, "61618566513": 12, "61655673981": 12, "616556799412": 10, "618": 10, "618003201485": 8, "618598401546": 8, "618758392334": 10, "620352": 9, "620384001732": 10, "621254396439": 8, "622534394264": 10, "622867202759": 8, "6237696": 9, "624492788315": 8, "6250816": 9, "625260794163": 8, "626163220406": 8, "6263552": 9, "6264192": 9, "626976013184": 8, "627136015892": 8, "629164803028": 10, "6292032": 9, "631142401695": 8, "632006394863": 8, "632607996464": 10, "6343168": 9, "6367488": 9, "637958395481": 8, "638348805904": 8, "6387072": 9, "6389312": 9, "64": [4, 8, 9, 10, 14, 16, 20, 22], "6415552": 9, "643359994888": 8, "64358406067": 12, "643820810318": 8, "6452736": 9, "6458624": 9, "6459328": 9, "646092808247": 8, "6466816": 9, "648620784283": 8, "648928": 9, "6492288": 9, "649779188633": 8, "6498688": 9, "64x4": [8, 9, 10], "650336003304": 8, "6508544": 9, "6524544": 9, "652575993538": 8, "6551744": 9, "655827200413": 10, "657920002937": 8, "6581248": 9, "6601472": 9, "6607744": 9, "6608448": 9, "662041604519": 8, "662566399574": 8, "6633536": 9, "66344319582": 8, "6659904": 9, "666003203392": 8, "6660672": 9, "6664448": 9, "666656005383": 8, "667251205444": 8, "667347204685": 8, "6677696": 9, "6717888": 9, "6719104": 9, "6724736": 9, "673248004913": 8, "67516155243": 12, 
"675232005119": 8, "6754048": 9, "6757568": 9, "675923216343": 8, "676096": 9, "676595199108": 8, "677363204956": 8, "67856": 9, "679372787476": 8, "6796352": 9, "680422389507": 8, "6810944": 9, "6813376": 9, "681350398064": 8, "682188808918": 8, "6842112": 9, "685670387745": 8, "686528": 9, "68781440258": 8, "687955200672": 8, "689356791973": 8, "6895552": 9, "690009605885": 8, "6905984": 9, "69104": 9, "691116797924": 8, "6911872": 9, "691385602951": 8, "6921216": 9, "692665600777": 8, "6929216": 9, "694451200962": 8, "6951168": 9, "6962112": 9, "69627519846": 8, "69648": 9, "697094392776": 8, "6972928": 9, "6975872": 9, "69833599329": 10, "698336": 9, "699366402626": 8, "7": [6, 8, 9, 10, 12, 23], "700883197784": 8, "7008832": 9, "70140799284": 8, "703302407265": 8, "705055999756": 8, "705900788307": 8, "705932807922": 8, "7068608": 9, "7074048": 9, "7075136": 9, "7078976": 9, "7087296": 9, "7091136": 9, "7099712": 9, "710278391838": 8, "7113024": 9, "711615991592": 10, "713843202591": 8, "714169609547": 8, "715257608891": 10, "716115188599": 8, "7168192": 9, "7168192029": 8, "7190464": 9, "719257593155": 10, "72": [8, 9, 10], "72023679018": 10, "720915210247": 10, "721408": 9, "7214464": 9, "721625590324": 10, "721862399578": 8, "722668802738": 8, "722777605057": 10, "722969603539": 10, "723999989033": 8, "724966406822": 10, "725548803806": 8, "726335990429": 8, "7268928": 9, "727967989445": 8, "7284544": 9, "729363203049": 10, "72981758118": 12, "730982398987": 8, "731334400177": 8, "731839990616": 10, "731891202927": 8, "732409596443": 8, "7326656": 9, "733248019218": 8, "734028804302": 10, "735436797142": 8, "73579518795": 10, "736511993408": 10, "74003200531": 12, "740518403053": 8, "741964805126": 8, "7453312": 9, "7461632": 9, "747328": 9, "75041918755": 8, "750636804104": 8, "7520064": 9, "7522432": 9, "7522624": 9, "752479994297": 8, "752838408947": 10, "754201591015": 10, "75422719717": 10, "7561792": 9, "7584896": 9, "759308815": 8, "759430396557": 10, "759679996967": 8, "759910392761": 10, "760915207863": 8, "761139214039": 8, "7611968": 9, "762003195286": 10, "762777590752": 10, "763775992393": 8, "766662418842": 8, "768064010143": 8, "769734406471": 10, "7707904": 9, "771052801609": 10, "771072": 9, "771103990078": 8, "7728576": 9, "7745216": 9, "776639997959": 10, "77759360075": 8, "7786496": 9, "778656005859": 10, "779033613205": 8, "780134403706": 10, "780352008343": 10, "780960011482": 10, "782060790062": 8, "782112002373": 10, "782495999336": 10, "78363519907": 8, "783923208714": 10, "7860736": 9, "7869632": 9, "7881152": 9, "788345599174": 8, "788947200775": 10, "7907264": 9, "791257584095": 8, "792108798027": 8, "792595207691": 8, "7930752": 9, "793516802788": 10, "793542397022": 10, "795135998726": 10, "7952896": 9, "7965888": 9, "7967488": 9, "797900807858": 8, "798195195198": 10, "7982208": 9, "7982848": 9, "799059200287": 8, "7994048": 9, "8": [4, 6, 8, 9, 10, 12, 16, 18], "80": 12, "800019192696": 10, "801119995117": 8, "801798415184": 8, "801996803284": 8, "8028416": 9, "803033602238": 8, "803718411922": 8, "8037312": 9, "804876792431": 10, "804953610897": 8, "805299210548": 8, "8063424": 9, "806828796864": 8, "8068416": 9, "80796158314": 12, "808000004292": 8, "808211183548": 8, "809894394875": 10, "8117248": 9, "818304": 9, "82159358263": 10, "821881604195": 8, "821952": 9, "822137594223": 8, "8228864": 9, "8237248": 9, "824838399887": 8, "8252928": 9, "826361596584": 10, "826515209675": 8, "8302656": 9, "8313792": 9, "832300806046": 8, "832768": 9, 
"833420813084": 8, "8335488": 9, "835481595993": 8, "835494399071": 8, "8365184": 9, "837299215794": 8, "837804794312": 8, "8380288": 9, "838195204735": 8, "8384512": 9, "8384832": 9, "84": 12, "840755212307": 8, "840908801556": 8, "841631996632": 8, "8422016": 9, "843411195278": 8, "843692803383": 8, "844428789616": 8, "8444928": 9, "8449472": 9, "845856": 9, "8473408": 9, "8479232": 9, "848044800758": 8, "84848": 9, "849235200882": 12, "849273598194": 12, "849631989002": 12, "851040017605": 8, "8516672": 9, "852166390419": 8, "852575981617": 8, "8531072": 9, "853574407101": 8, "853708791733": 12, "85437438488": 8, "855359995365": 12, "855628800392": 12, "8573184": 9, "857728": 9, "85886080265": 8, "860332798958": 8, "862348806858": 8, "8626304": 9, "8633344": 9, "8637184": 9, "8663936": 9, "867276787758": 8, "869497597218": 8, "873651194572": 10, "875001597404": 8, "876377594471": 8, "876627194881": 8, "8772672": 9, "8774336": 9, "8844544": 9, "888671982288": 8, "8896384": 9, "890803205967": 8, "8922624": 9, "893279993534": 8, "8byte": 21, "9": [4, 5, 7, 8, 9, 10, 13, 15], "90": 14, "900499212742": 8, "9123328": 9, "91601279974": 10, "916985595226": 10, "922745585442": 8, "932281601429": 10, "933260798454": 10, "93347837925": 8, "940044796467": 12, "9624512": 9, "971545600891": 8, "98": 9, "985": 9, "995": 19, "997139203548": 8, "99912958145": 12, "999763202667": 8, "A": [1, 4, 6, 7, 14, 15, 16, 18, 19, 23], "And": [4, 8, 9, 10, 19, 22, 23], "As": [0, 1, 4, 8, 9, 10, 12, 15, 16, 18], "At": [6, 12, 23], "Be": [7, 8, 9, 10], "But": [4, 8, 9, 10, 12, 20], "By": [6, 13, 16, 19, 23], "For": [3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 18, 20, 21, 23], "If": [0, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 23], "In": [4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 24], "It": [0, 4, 6, 7, 8, 9, 10, 13, 15, 16, 18, 22, 23], "Not": [3, 6], "Of": 16, "On": [7, 8, 9, 10, 23], "One": [6, 8, 9, 10, 18, 21], "Or": [14, 15], "That": [4, 8, 9, 10, 13, 16, 17, 20], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23], "Then": [3, 8, 9, 10, 12, 14, 15, 22], "There": [5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 23, 24], "These": [3, 7, 8, 9, 10, 12, 15, 16, 18, 22, 23], "To": [0, 3, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23], "With": [0, 12, 13], "_": [5, 8, 9, 10], "__device__": [12, 22], "__global__": [4, 8, 10, 12, 14, 16, 20, 22], "__host__": 12, "__init__": 6, "__kernel": 9, "__local": 9, "__shared__": [8, 10, 16], "__syncthread": [8, 9, 10, 16], "_energi": 18, "_flop": 17, "_funcptr": 6, "_power": 18, "a_d": 9, "a_h": 9, "ab": 14, "abil": 1, "abl": [4, 6, 8, 9, 10], "about": [3, 4, 6, 8, 9, 10, 14, 16, 18, 19, 20, 23], "abov": [4, 6, 8, 9, 10, 12, 15, 16, 20, 21], "abruptli": 6, "absolut": [5, 23], "absorpt": 19, "abstract": [6, 18], "acceler": 19, "accept": [5, 6, 19, 23], "acceptance_prob": 19, "access": [4, 7, 8, 9, 10, 12, 18, 21], "accord": [6, 23], "account": [13, 16], "accur": [12, 18], "achiev": [5, 10], "across": [13, 16], "act": 18, "action": 7, "activ": 7, "actual": [3, 4, 5, 6, 8, 9, 10, 12, 16, 22], "ad": [7, 8, 9, 10, 13, 23], "add": [4, 6, 7, 8, 9, 10, 13, 16, 18, 19], "addgrid": 12, "addit": [3, 4, 7, 8, 9, 10, 15, 17, 20], "address_mod": 23, "addtion": [8, 9, 10], "adjac": 19, "adjust": [4, 7], "advanc": [6, 22, 23], "advantag": 18, "advis": 6, "affect": [8, 9, 10, 16], "after": [4, 5, 6, 7, 8, 9, 10, 13, 15, 16, 18, 23], "after_finish": 18, "after_start": 18, "again": [4, 7, 8, 9, 10, 12, 16], "against": [5, 6, 7], 
"aggreg": 18, "algebra": 16, "algorithm": [11, 14, 19, 23], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 23], "allclos": [5, 23], "alloc": [4, 6, 8, 9, 10, 11, 13, 23], "allow": [1, 4, 5, 6, 7, 8, 9, 10, 16, 17, 18, 19, 22, 23], "allow_nan": 6, "almost": [5, 8, 9, 10, 18], "along": [4, 6, 15, 20, 24], "alpha": 19, "alreadi": [4, 6, 7, 8, 9, 10, 15, 16, 23], "alright": 20, "also": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "altern": [15, 23], "although": 5, "alwai": [4, 6, 8, 9, 10], "amd": [15, 18], "among": [8, 9, 10, 14, 19], "amount": [4, 8, 9, 10, 16, 17, 23], "amper": 18, "an": [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "anaconda": 7, "analysi": [8, 9, 14], "analyz": [8, 9, 10], "ani": [1, 3, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 19, 21, 22, 23, 24], "anneal": [19, 23], "anoth": [0, 8, 9, 10, 13, 16, 17, 19, 23], "answer": [4, 5, 6, 8, 9, 10, 11, 23], "anyth": 4, "api": [2, 4, 6], "app": 15, "append": [1, 6, 15, 23], "appl": 15, "appli": [7, 8, 9, 10], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 21, 22, 23], "approach": 18, "appropi": [8, 9, 10], "approx": [8, 9, 10], "approxim": [8, 9, 10], "apt": 7, "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "arbitrari": 6, "arch": 8, "architectur": [6, 18], "arduino": 18, "area": [8, 9, 10, 16], "arg": [5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 21, 22], "argument": [0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23], "arithmet": [8, 9, 10, 23], "around": [4, 11], "arrai": [0, 4, 5, 6, 8, 9, 10, 12, 13, 20, 21, 23], "articl": [14, 20], "arxiv": 14, "assign": 12, "associ": 12, "assum": [4, 6, 8, 9, 10, 16, 23], "assumpt": [8, 9, 10], "astron": 18, "astyp": [4, 5, 8, 9, 10, 12, 13, 14, 16, 20, 22], "asynchron": 6, "atol": [5, 6, 23], "attempt": [6, 22], "attract": 19, "author": 14, "auto": [14, 16, 18, 19, 22, 23, 24], "auto_activate_bas": 7, "autoinit": [10, 12], "autom": 7, "automat": [0, 4, 7, 8, 9, 10, 12, 13, 16, 22, 23], "auxilliari": 23, "avail": [4, 7, 8, 9, 10, 11, 12, 15, 18], "averag": [6, 8, 9, 10, 13, 18], "avoid": [4, 6, 16, 24], "ax1": [8, 9, 10], "ax2": [8, 9, 10], "axesimag": [8, 9, 10], "axi": 23, "b": [12, 14, 16, 19, 20, 22], "b0": 19, "b_d": 9, "back": [13, 23], "backend": [2, 7, 13, 18], "backward": 6, "bandwidth": 16, "barrier": 9, "base": [0, 6, 7, 17, 18, 22, 23], "bash": [7, 15], "bash_profil": 7, "bashrc": 7, "basic": [4, 6, 8, 9, 10, 20], "basin": [19, 23], "basinhop": 23, "batenburg": 14, "bayes_opt": 23, "bayesian": [14, 19, 23], "becaus": [0, 4, 5, 8, 9, 10, 13, 15, 16, 17, 22, 24], "becom": [8, 9, 10, 18, 19], "been": [4, 6, 7, 8, 9, 10, 13, 16, 19], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 19, 23], "before_start": 18, "begin": [4, 8, 9, 10, 12], "behavior": [4, 16, 18, 23], "behaviour": 6, "behind": 13, "beignet": 15, "being": [6, 8, 9, 10, 16, 18, 19, 23], "below": [0, 7, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21], "ben": 14, "benchmark": [0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 24], "benchmark_continu": 6, "benchmark_default": 6, "benchmarkobserv": 18, "benefit": 16, "benvanwerkhoven": 15, "best": [6, 8, 9, 12, 16, 19, 22, 23, 24], "best1bin": 19, "best1exp": 19, "best2bin": 19, "best2exp": 19, "best_tim": [8, 9], "beta": [12, 19], "better": [3, 7, 8, 9, 10], "between": [0, 8, 9, 10, 13, 15, 16, 17, 19, 23], "beyond": [8, 9, 10, 23], "bf": 21, "bfg": 19, "bind": [15, 18], "biologi": [8, 9, 10], "bit": [4, 6, 8, 9, 
10, 12, 13, 16], "block": [0, 4, 6, 8, 9, 10, 11, 12, 15, 16, 17, 20, 23, 24], "block_size_": 24, "block_size_i": [4, 5, 8, 9, 10, 12, 13, 16, 23], "block_size_nam": [4, 6, 23], "block_size_str": [8, 10], "block_size_x": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 20, 22, 23], "block_size_z": [4, 8, 9, 10, 12, 23], "blockdim": [4, 20, 23], "blockidx": [4, 8, 9, 10, 12, 14, 16, 20, 22], "boilerpl": [8, 9, 10], "bool": [6, 21, 23], "boolean": [17, 18, 23], "border": [13, 23], "border_s": 4, "both": [7, 8, 9, 10, 11, 16], "bottom": 6, "bound": [4, 6, 16, 19], "boundari": [8, 9, 10], "bracket": 6, "bram": 14, "branch": [3, 7], "break": [7, 22], "brew": 7, "briefli": 16, "brows": 7, "brute": [17, 19, 20], "brute_forc": [6, 23], "buffer": [6, 9, 21], "build": [3, 6, 8, 9, 10], "built": [7, 18, 19, 21, 23], "bulk": [8, 9, 10], "bx": [8, 9, 10, 12], "bypass": 10, "byte": [6, 21, 23], "bz": 12, "c": [0, 3, 4, 6, 7, 11, 13, 14, 15, 16, 20, 22, 23], "c1": 19, "c2": 19, "c_arg": 6, "cach": [2, 6, 7, 8, 9, 10, 15, 16, 19, 23], "cachefil": [6, 23], "calcul": [6, 12], "call": [1, 4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23], "callabl": [5, 6, 23], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "cannot": [0, 7, 8, 9, 10, 18], "cap": 18, "capabl": [6, 7, 8, 9, 14, 16, 23], "care": [8, 9, 10], "cartesian": [4, 12], "case": [0, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 20, 21, 23], "cat": 7, "caus": [0, 8, 9, 10], "cc": 8, "cd": [3, 7, 15], "ceil": [10, 12], "cell": [4, 8, 9, 10, 12, 16], "center": [11, 12], "central": [8, 9, 10], "certain": [4, 6, 8, 9, 10, 11, 18, 24], "cfunction": 6, "cg": 19, "chanc": [15, 19, 22], "chang": [3, 7, 12, 18, 23], "changelog": 3, "check": [0, 5, 6, 7, 8, 9, 10, 13, 16], "check_argument_list": 6, "check_argument_typ": 6, "check_circular": 6, "check_kernel_output": 6, "check_restrict": 6, "check_stop_criterion": 6, "check_thread_block_dimens": 6, "check_tune_params_list": 6, "chemistri": [8, 9, 10], "children": 19, "choic": [13, 15], "choos": [0, 8, 9, 10, 16, 19, 23], "chosen": 23, "chunk": 13, "circumst": 4, "cite": 14, "cl": 9, "clamp": 23, "clarifi": 13, "class": [6, 18, 19], "clean": [11, 16], "cleaner": [8, 9, 10], "cleanup": 8, "cleanup_lib": 6, "clk_local_mem_f": 9, "clock": [18, 24], "clone": [3, 4, 7, 8, 9, 10, 12, 15, 16], "close": [6, 8, 9, 10], "closer": [8, 9, 10], "closest": 6, "cmem_arg": [5, 6, 23], "cobyla": 19, "code": [0, 2, 4, 6, 7, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "coeffici": 19, "cognit": 19, "collabor": 16, "collect": [4, 6, 8, 9, 10, 12, 16, 18, 21], "color": [8, 9, 10], "column": 16, "com": [0, 3, 6, 7, 14, 15], "combin": [4, 6, 7, 8, 9, 10, 11, 12, 16, 18, 19, 20, 23], "come": [1, 6, 8, 9, 10, 16, 18, 22], "command": [7, 15], "commandqueu": 9, "commit": 7, "common": [18, 22], "commonli": [4, 8, 9, 10, 15, 16], "commun": [8, 9, 10], "compact": 6, "compar": [0, 4, 5, 8, 9, 10, 12, 16, 17, 18], "comparison": [5, 14], "compat": [6, 7, 15], "compil": [3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 22, 23, 24], "compilationfailedconfig": 6, "compile_kernel": 6, "compile_restrict": 6, "compiler_opt": [6, 23], "compiler_opt_": 24, "complain": 6, "complet": [0, 1, 4], "complex": [13, 16], "compos": [4, 6, 16, 17], "comprehens": 15, "comput": [4, 5, 6, 11, 12, 13, 14, 16, 19, 23], "compute_cap": 6, "compute_capability_major": 8, "compute_capability_minor": 8, "compute_grid": 12, "compute_intens": 19, "concentr": [8, 9, 10], "concept": [8, 9, 10], "conda": 7, "condarc": 7, "condens": [8, 9, 10], 
"condit": [8, 9, 10, 16], "config": [6, 7], "config_valid": 6, "configur": [1, 4, 6, 8, 9, 10, 11, 12, 16, 17, 18, 19, 23], "confus": 4, "connect": 18, "consid": [3, 12, 14, 16, 23], "consist": [11, 16, 23], "consol": 23, "constant": [0, 4, 6, 8, 9, 10, 11, 13, 16, 19, 23], "constr": 19, "constraint": 6, "construct": [5, 16], "consult": 21, "consumpt": [14, 16, 18], "contact": 11, "contain": [0, 1, 4, 6, 8, 9, 10, 12, 13, 16, 18, 19, 22, 23], "contant": 16, "content": [4, 6, 23], "context": [6, 8, 10, 12], "continous_dur": 18, "continu": [4, 6, 7, 8, 9, 10, 15, 18, 19, 23], "continuous_dur": 18, "continuum": 15, "contrast": 4, "contribut": [2, 7], "control": [0, 8, 9, 10, 18, 19, 23], "conv_filt": 4, "conveni": [7, 8, 9, 10, 13, 23], "convent": [6, 13, 23], "convert": [6, 8, 9], "convert_constraint_restrict": 6, "convolut": [2, 5, 13, 16], "convolution_correct": 5, "convolution_kernel": [4, 5], "convolution_na": [4, 5], "convolution_stream": 13, "cooler": [8, 9, 10], "coordin": 12, "copi": [6, 8, 9, 10, 13, 20, 23], "copy_constant_memory_arg": 6, "copy_host_ptr": 9, "copy_shared_memory_arg": 6, "copy_texture_memory_arg": 6, "core": [7, 18], "core_freq": [18, 24], "correct": [2, 7, 13, 21, 23], "correct_open_cach": 6, "correctli": [7, 16], "correspond": [4, 7, 8, 9, 10, 12, 18, 19, 20], "correspondingli": 4, "cost": [8, 9, 10, 19], "could": [4, 5, 6, 8, 9, 10, 13, 15, 16, 18, 19, 22, 23], "count": 19, "counter": [7, 18], "coupl": 16, "cours": [4, 8, 9, 10, 15, 16], "cover": [8, 9, 10, 19], "coverag": 7, "cpath": 7, "cpu": [5, 9, 10, 13], "cpu_grid": 12, "cpu_result": 5, "creat": [1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 16, 18, 20, 21, 23], "create_device_target": 23, "create_kernel_inst": 6, "create_receive_spec_struct": 21, "create_some_context": 9, "creation": [4, 14, 19], "criterion": [6, 19], "crossov": 19, "csv": [8, 9, 11], "ctx": 9, "ctype": 6, "cu": [4, 5, 13, 16, 20, 22], "cub": 11, "cuda": [3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 18, 20, 21, 22, 23], "cuda11x": 7, "cuda12x": 7, "cuda_error_check": 6, "cudamemcpytosymbol": 13, "cudastreamwaitev": 13, "cudeviceptr": 6, "cufunct": 6, "cupi": [0, 7, 13, 15, 18, 22, 23], "curl": [7, 15], "current": [4, 5, 6, 7, 8, 9, 10, 15, 16, 18, 19, 23], "current_modul": 18, "current_problem_s": 6, "custom": [5, 11, 17, 18, 21], "d": [8, 9, 10, 12, 19, 20], "d_filter": 5, "dashboard": [1, 14], "data": [4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 20, 21, 23], "datafram": [8, 9], "date": 3, "debug": 6, "decreas": [4, 16], "deep": 4, "def": [5, 6, 8, 9, 10, 12, 18, 21], "default": [0, 4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 19, 22, 23], "defin": [4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 22, 23], "definit": [4, 12, 23], "degrad": [8, 9, 10], "degre": [8, 9, 10], "delet": 6, "delete_temp_fil": 6, "delta": [8, 9, 10], "delv": 12, "demonstr": [5, 10, 11, 16], "demot": 22, "denorm": 19, "denot": [16, 20, 23], "depend": [3, 4, 5, 7, 10, 11, 12, 14, 17, 23], "deriv": [4, 6, 8, 9, 10, 17], "descret": [8, 9, 10], "describ": [3, 4, 6, 7, 13, 18, 21], "design": [2, 3, 8, 9, 10, 18], "desir": 7, "dest": 6, "detail": [0, 6, 15, 23], "detect": [6, 19, 22, 23], "detect_languag": 6, "determin": [4, 8, 9, 10, 12, 18, 19], "dev": [7, 15, 18], "develop": [2, 6, 11, 14, 15], "devic": [4, 5, 7, 8, 9, 10, 11, 13, 18, 22, 23], "device_nam": [6, 23], "device_opt": 6, "devicealloc": 6, "devprop": 8, "df": [8, 9], "dict": [4, 5, 6, 10, 13, 14, 18, 19, 20, 22, 23], "dictionari": [4, 6, 8, 9, 10, 12, 16, 18, 19, 20, 23], "did": [4, 8, 9, 10, 16], "diff_evo": 23, "differ": [0, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, 16, 17, 18, 19, 23], "differenti": [15, 19, 23], "difficult": [8, 9, 10, 21, 22], "difficulti": 14, "diffus": 2, "diffuse_kernel": [8, 9, 10], "dim": 6, "dimens": [4, 6, 8, 9, 10, 11, 12, 13, 16, 17, 19, 20, 23, 24], "dimension": [11, 12, 23], "dir": [7, 15], "direct": [8, 9, 10, 13, 16, 17, 23], "directli": [6, 8, 9, 10, 13, 16, 18, 22, 23], "directori": [4, 7, 8, 9, 10, 12, 15, 16], "disabl": 7, "discontinu": 16, "discuss": [3, 6], "disk": 7, "diskquota": 7, "diskspac": 7, "displai": 4, "disrupt": 19, "disruptive_uniform": 19, "disruptive_uniform_crossov": 19, "distanc": [8, 9, 10, 19], "distance_to": 19, "distant": [8, 9, 10], "distinct": 16, "distribut": [12, 16], "divid": [4, 8, 9, 10, 12, 13, 16, 23], "divison": 23, "divisor": [4, 6, 8, 9, 10, 16, 23], "dna": 19, "dna1": 19, "dna2": 19, "do": [3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 23], "doc": [3, 4, 6, 7, 8, 9, 10, 12, 15, 16], "docstr": [3, 6], "document": [3, 4, 5, 8, 9, 10, 12, 15, 16, 21, 24], "doe": [5, 6, 7, 8, 9, 10, 12, 13, 16, 18, 22, 23], "doi": 14, "domain": [4, 8, 9, 10, 11, 12, 23], "don": [6, 7, 8, 10, 12, 13, 23], "done": [0, 4, 15, 17, 18], "doubl": [8, 9, 10, 21, 22], "doubt": 3, "down": 16, "download": 15, "downsid": 18, "dramat": 16, "drastic": 16, "driver": [6, 7, 8, 10, 12], "drv": 8, "dry": 7, "dt": [8, 9, 10], "dtarget_gpu": 23, "dtype": [6, 21], "dual": [19, 23], "due": [16, 22, 23], "dump": [6, 8, 9], "dump_cach": 6, "durat": [6, 18], "dure": [0, 1, 6, 8, 9, 10, 12, 18, 23], "dx": 12, "dy": 12, "dynam": [0, 6, 23], "dz": 12, "e": [3, 7, 15, 17, 18, 19, 23], "each": [4, 5, 6, 7, 8, 9, 12, 16, 18, 19, 23], "earlier": [6, 8, 9, 10, 12], "easi": [8, 9, 17, 18, 23], "easiest": 14, "easili": [8, 9, 18], "effect": [4, 7, 8, 9, 10, 23], "effici": [14, 16, 18], "either": [6, 12, 19, 22, 23], "element": [5, 8, 9, 10, 16, 17, 20, 21, 23], "ellipsi": 4, "els": 6, "elsewher": 7, "empti": 23, "emtpi": 23, "enabl": [1, 18, 19, 21, 22], "end": [4, 6, 7, 8, 9, 10, 12, 16, 18, 19, 21], "endif": 10, "energi": [7, 14, 18, 19, 24], "enough": [3, 4, 5, 16], "enqueue_copi": 9, "ensur": [3, 5, 8, 9, 10, 13, 15, 18, 21], "ensure_ascii": 6, "enter": [4, 7, 8, 9, 10, 12, 16], "entir": [0, 6, 7, 8, 9, 10, 16, 19, 23], "entri": [3, 6, 8, 9], "env": [4, 6, 19, 20, 23], "envdir": 7, "environ": [1, 2, 3, 4, 6, 15, 19, 23], "envs_dir": 7, "ep": [6, 19], "equal": [8, 9, 10, 16, 23], "equat": [4, 8, 9, 10, 12, 19], "equi": [8, 9, 10], "error": [3, 4, 5, 6, 13, 16, 22], "errorconfig": 6, "especi": 7, "essenti": [4, 7], "estim": [8, 9, 10], "euclidian": 19, "evalu": [8, 9, 10, 16, 19, 23], "even": [1, 7, 8, 9, 10, 13, 16, 19], "event": [6, 8, 13, 18], "everi": [4, 5, 8, 9, 10, 11, 18, 20], "everyth": [4, 6, 8, 9, 10], "everywher": [8, 9, 10], "evolut": [19, 23], "evolutionari": 14, "exactli": [4, 6, 8, 9, 10, 16, 18], "exampl": [2, 3, 5, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 20, 21, 23], "exce": 19, "exceed": 6, "except": [6, 7, 11], "exchang": [8, 9, 10], "execut": [4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 19, 23], "exhaust": 17, "exist": [1, 6, 23], "exit": 7, "exlicitli": 12, "exp": 12, "expand": [4, 14, 16, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 10, 16, 18, 23], "experi": 4, "experiment": 21, "explain": [4, 6, 8, 9, 10, 13, 15, 16, 17, 20, 22, 23], "explor": 12, "export": 7, "expos": 6, "express": [6, 8, 9, 10, 11, 13, 16, 23], "extend": 18, "extens": 6, "extern": [0, 18, 22], "extra": [7, 15, 22], "extract": 12, "f": [4, 5, 12, 13, 21], "f_h": 4, "f_w": 4, "facilit": 18, "fact": [8, 9, 10, 13], "factor": [4, 8, 9, 10, 
11, 12, 16, 24], "fail": [4, 6, 15, 23], "fals": [6, 7, 18, 19, 23], "famili": 14, "familiar": [4, 16], "far": [4, 8, 9, 10, 16, 20], "fast": [5, 8, 9, 10], "faster": [8, 9, 10, 16], "fc": 21, "featur": [1, 4, 5, 11, 15, 17, 18, 20, 22, 23], "feel": [8, 9, 10], "few": [4, 8, 9, 10, 12, 13, 22], "fewer": [4, 8, 9, 10], "fffi": 21, "field": [5, 8, 9, 10], "field_copi": [8, 9], "fifth": 16, "fig": [8, 9, 10], "figur": 16, "file": [0, 2, 4, 6, 7, 8, 9, 11, 13, 16, 19, 20, 22, 23], "filenam": [1, 4, 6, 11, 16, 20, 23], "fill": [6, 16], "filter": [4, 5, 11, 13], "filter_height": 4, "filter_heigth": 4, "filter_mod": 23, "filter_s": 4, "filter_width": 4, "final": [4, 5, 8, 9, 10, 12], "find": [4, 13, 16, 19, 23], "fine": [8, 9, 10], "finish": [4, 6, 9, 12, 13, 18], "firefli": [19, 23], "firefly_algorithm": 23, "first": [3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23], "first_kernel": 5, "fit": [13, 19], "five": [4, 6, 20], "fix": [8, 9, 10, 19, 23], "fixed_param": [10, 12], "flat": 6, "flexibl": [5, 8, 9, 16], "float": [4, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23], "float32": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 20, 22, 23], "flori": 14, "fly": [8, 9, 10], "folder": 7, "follow": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 22, 23], "forbidden": 6, "forc": [17, 19, 20, 22], "foreseen": 6, "forg": 7, "forget": [7, 12], "fork": 3, "form": [6, 16, 18, 19], "format": [6, 8, 9, 21], "formula": [8, 9, 10], "fortran": [6, 11, 22], "fortun": 16, "found": [4, 6, 7, 14, 18, 19], "four": [8, 9, 10], "fourth": 16, "fp": [8, 9], "frac": [8, 9, 10], "fraction": 19, "free": [4, 8, 9, 10, 13, 15, 16], "freeli": 4, "frequenc": 18, "frequent": 16, "friendli": 18, "from": [1, 4, 5, 6, 7, 8, 11, 12, 13, 15, 16, 18, 19, 21, 22, 23], "frombuff": 21, "fsiq": 21, "full": [1, 3, 6, 7, 18, 20], "fulli": [0, 7, 15], "fun": 19, "func": [6, 18, 23], "function": [3, 4, 5, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 21, 22, 23], "further": [0, 8, 9, 10, 15, 16], "futur": [6, 14, 23, 24], "g": [7, 15, 17, 18], "gamma": 19, "gaussian": 12, "gcc": 6, "geforc": [8, 9, 10, 12], "gene": 19, "gener": [0, 4, 6, 7, 8, 9, 10, 14, 16, 18, 19, 21, 23, 24], "generate_normalized_param_dict": 19, "genet": [19, 23], "genetic_algorithm": 23, "get": [2, 4, 6, 8, 9, 10, 12, 15, 16], "get_attribut": 8, "get_best_config": 6, "get_config_str": 6, "get_devic": 8, "get_environ": 6, "get_funct": [8, 10, 12], "get_grid_dimens": 6, "get_group_id": 9, "get_initial_condit": [8, 9, 10], "get_instance_str": 6, "get_kernel_str": [6, 8, 9, 10], "get_local_id": 9, "get_local_s": 23, "get_opt": 6, "get_problem_s": 6, "get_result": 18, "get_smem_arg": 6, "get_strategy_docstr": 6, "get_temp_filenam": 6, "get_thread_block_dimens": 6, "get_total_tim": 6, "gflop": [4, 6, 11, 16, 17], "giga": [4, 16], "gigabyt": 7, "git": [3, 7, 18], "github": [0, 3, 4, 7, 8, 9, 10, 12, 15, 16], "give": [0, 8, 9, 10, 19], "given": [6, 8, 9, 10, 12, 18, 19, 23], "global": [6, 7, 8, 9, 10, 19], "go": [4, 7, 8, 9, 10, 12, 14, 15, 16, 20], "goe": 16, "good": [5, 8, 9, 10, 24], "googl": 3, "got": [8, 9, 10], "gpu": [0, 3, 4, 5, 6, 7, 11, 13, 14, 16, 18, 20, 21, 23, 24], "gpu_arg": 6, "gpu_result": [5, 8, 10], "gpuarrai": [10, 12], "gr_voltag": 18, "gracefulli": 7, "grain": [8, 9, 10], "graphic": [18, 24], "great": [6, 8, 9, 10, 20], "greedi": [19, 23], "greedy": 19, "greedy_il": 23, "greedy_ml": 23, "green": 14, "grep": 7, "grid": [4, 6, 8, 9, 10, 11, 13, 16, 23, 24], "grid_dim": 12, "grid_div": 6, "grid_div_i": [4, 5, 8, 9, 10, 13, 16, 23], "grid_div_x": [4, 5, 
8, 9, 10, 13, 16, 23], "grid_div_z": 23, "grid_gpu": 12, "grid_size_": 24, "grid_size_i": 13, "grid_size_x": 13, "group": [6, 8, 9, 10, 23], "group__opt": 6, "grow": [8, 9, 10], "gt": [8, 9, 10], "gtx": [8, 9, 10, 12], "guarante": 19, "guess": [8, 9, 10], "guid": [4, 8, 16, 17, 20], "h": [4, 12, 23], "ha": [0, 4, 6, 7, 8, 9, 10, 13, 16, 18, 19, 23], "had": [1, 4], "half": [8, 9, 10], "halt": [6, 13], "ham": 19, "hand": [12, 16], "handl": [0, 13, 23], "happen": [0, 1, 3, 4, 16, 20], "hardwar": [3, 7, 8, 9, 10, 12, 18, 19, 20], "have": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 22, 23, 24], "haven": [4, 15], "header": [0, 23], "header_filenam": 23, "heat": [8, 9, 10], "height": 16, "help": [3, 7, 22], "helper": [6, 18], "henc": [12, 21], "here": [0, 4, 11, 12, 13, 15, 16, 18, 23], "high": [6, 8, 9, 10, 14, 16, 18], "highli": [4, 14, 16], "highlight": 11, "hillclimb": 19, "hip": [0, 3, 7, 14, 23], "hiprtc": 0, "hold": [7, 8, 9, 16, 20, 21, 23], "home": 15, "hook": 18, "hop": [19, 23], "host": [2, 6, 7, 9, 10, 11, 18, 21, 22, 23], "hostbuf": 9, "hot": [8, 9, 10], "hotspot": [8, 9, 10], "how": [0, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 20, 21, 22, 23], "howev": [4, 5, 8, 9, 10, 13, 15, 16, 18, 21, 22, 23], "hpc": 1, "html": [3, 6, 7], "http": [0, 6, 7, 14, 15, 18], "hyperparamet": 19, "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "i_like_convolut": 4, "id": [6, 7, 18], "idea": [8, 9, 10, 13, 16, 24], "ident": 0, "identifi": 18, "ieee": 14, "ifndef": 10, "ignor": [4, 6, 8, 9, 10, 23], "iiiiiiiiiiippi": 21, "il": 19, "illeg": 4, "illustr": 11, "imag": [4, 8, 9, 10], "image_height": 4, "image_width": 4, "impact": [8, 9, 10, 13], "implement": [0, 5, 6, 11, 12, 17, 18, 19, 23], "import": [0, 4, 5, 8, 9, 10, 12, 15, 16, 17, 20, 21, 22], "importantli": [8, 9, 10], "impos": 16, "improv": [3, 6, 8, 9, 10, 16, 19, 23], "imshow": [8, 9, 10], "includ": [0, 3, 4, 5, 8, 9, 10, 12, 13, 15, 16, 18, 22, 23], "incorpor": 10, "increas": [4, 8, 9, 10, 18], "indent": 6, "independ": 16, "index": [6, 19], "indic": [4, 19, 24], "individu": [1, 18, 19], "ineffici": 21, "inertia": 19, "influenc": 4, "info": 23, "inform": [4, 6, 7, 8, 9, 10, 14, 18, 19, 20, 23, 24], "init": 8, "initi": [8, 9, 10, 21], "inlin": [8, 9, 10], "inner": 16, "input": [0, 4, 5, 8, 9, 10, 11, 13, 16, 17, 20, 21, 23], "input_imag": 4, "input_s": [4, 5, 13], "input_width": 4, "insert": [4, 5, 6, 10, 12, 13, 16, 20, 22, 23, 24], "insid": [8, 9, 10, 13, 16, 22, 23], "inspect": [6, 7, 18], "instal": [2, 3, 4, 7, 8, 9, 10, 12, 13, 16, 18, 20], "instanc": [5, 6, 8, 9, 10, 13, 18, 23], "instant": [8, 9, 10, 12], "instantan": 18, "instanti": [6, 22], "instead": [4, 6, 11, 16, 23], "instruct": [3, 7, 8, 9, 10, 11, 15, 16], "int": [4, 6, 8, 9, 10, 12, 14, 16, 20, 22, 23], "int32": [6, 14, 20, 22, 23], "integ": [6, 18, 21, 23], "integr": [7, 22], "intel": 15, "intend": 14, "intens": 19, "interact": [6, 18], "intercept": 18, "interest": [4, 11, 21], "interfac": [0, 4, 5, 13, 15, 18, 19, 21, 23], "intermedi": [8, 9, 10], "intern": [6, 14, 19, 22], "interpret": 4, "interv": 12, "intricaci": 18, "introduc": [8, 9, 10, 16, 18], "introduct": 2, "invalidconfig": 6, "invers": 6, "invok": 7, "involv": 7, "io": 15, "isclos": 5, "isiq": 21, "isol": [7, 22], "issu": [0, 7, 21], "item": [6, 8, 9, 10, 12], "iter": [6, 8, 9, 10, 12, 16, 18, 19, 20, 23], "iterfac": 6, "its": [5, 6, 8, 9, 10, 12, 14, 15, 16, 17, 18, 23], "itself": [13, 14, 23], "j": [4, 8, 9, 10, 14, 16], "jan": 14, "jatinx": [0, 15], 
"jetson": 18, "job": 1, "join": [8, 9], "joost": 14, "joul": [18, 24], "journal": 14, "json": [6, 8, 9, 11, 23], "jsonencod": 6, "jupyt": [4, 8, 9, 10, 12, 15, 16], "just": [4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16], "k": [8, 9, 10, 12, 14, 16, 20], "kb": 16, "keep": [8, 9, 10, 16, 21], "kei": [6, 8, 9, 10, 16, 19, 20, 23], "kepler": 23, "kerenel": 10, "kernel": [0, 1, 3, 4, 5, 6, 7, 13, 15, 17, 18, 19, 20, 21, 23, 24], "kernel_argu": 6, "kernel_cod": 12, "kernel_finish": 6, "kernel_inst": 6, "kernel_nam": [4, 6, 13, 21, 22, 23], "kernel_opt": 6, "kernel_sourc": [4, 6, 21, 23], "kernel_src": 9, "kernel_str": [4, 5, 6, 8, 9, 10, 13, 14, 19, 23], "kernel_string_shar": 10, "kernel_string_til": 10, "kernel_tun": [3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 20, 21, 22, 23, 24], "kernelinst": 6, "kernelsourc": 6, "kerneltun": [3, 7, 14], "keyr": 7, "keyword": 6, "khz": 18, "know": [4, 8, 9, 10, 16, 17], "known": [7, 16], "kt": [14, 21], "l": 19, "l1": [8, 9, 10], "l2": [8, 9, 10], "la": 12, "lambda": [4, 6, 8, 9, 16, 17, 23], "lambdatyp": 6, "lang": [0, 6, 11, 13, 22, 23], "languag": [0, 6, 10, 13, 16, 21, 23], "larg": [6, 8, 9, 10, 12, 23], "larger": [3, 8, 9, 10, 13, 19, 22], "last": [6, 7, 21], "later": [8, 9, 10, 12, 23], "latest": [7, 15], "latter": 13, "launch": [6, 8, 9, 10, 13, 18, 23], "launcher": 14, "layer": 18, "ld_libary_path": 7, "lead": 21, "leak": 6, "learn": 4, "least": [6, 7], "leav": 7, "left": [6, 7, 8, 9, 10, 12, 17], "len": 21, "length": 21, "let": [4, 6, 8, 9, 10, 20, 22], "level": [6, 7, 18], "libbz2": 7, "libffi": 7, "libgdbm": 7, "liblzma": 7, "libncurses5": 7, "libncursesw5": 7, "libnss3": 7, "librari": [6, 11, 18, 21], "libreadlin": 7, "libsqlite3": 7, "libssl": 7, "light": 19, "like": [4, 6, 7, 8, 9, 10, 11, 12, 16, 19, 20, 21, 22, 23], "likewis": [8, 9, 10], "limit": [0, 4, 6, 7, 8, 9, 10, 11, 16, 18, 19, 22, 23, 24], "limits_": 4, "linalg": 12, "line": [4, 7, 8, 9, 10], "linear": [4, 16, 23], "linkag": [0, 22], "linspac": 12, "linux": [7, 15], "list": [0, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23], "littl": [4, 8, 9, 10, 16], "ll": [0, 4, 8, 9, 10, 15, 16], "llvm": 7, "load": [6, 7], "local": [3, 19, 23], "locat": [5, 7, 12, 18], "lock": [11, 18], "log": 23, "long": [4, 8, 9, 10, 12, 13, 16, 21], "longer": [4, 6, 7, 17], "look": [3, 4, 6, 8, 9, 10, 12, 15, 16, 22], "looks_like_a_filenam": 6, "lookup": 6, "loop": [8, 9, 10, 11, 16, 24], "loop_unroll_factor_": 24, "loss": [8, 9, 10], "lot": [4, 8, 9, 10, 16, 18, 20, 21, 23], "low": [6, 8, 9, 10, 16], "lower": [12, 18, 19], "lt": [8, 9, 10], "lzma": 7, "m": [7, 8, 9, 10, 12], "mac": 7, "maco": 7, "macro": 6, "made": 6, "mai": [0, 4, 5, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 20, 21, 23], "main": [4, 6, 12, 18, 20], "maintain": 6, "make": [3, 4, 7, 8, 9, 10, 12, 14, 15, 16, 18, 21, 22], "make_context": 8, "make_strategy_options_doc": 6, "mamba": 7, "manag": [8, 9, 10, 16, 18], "mandatori": 18, "mani": [4, 6, 8, 9, 10, 16, 17, 18, 19, 23], "manual": [12, 15], "map": [5, 11, 12], "mark": 6, "master": 7, "match": [4, 5, 6], "math": 12, "matlab": 22, "matmul": 16, "matmul_kernel": 16, "matmul_na": 16, "matmul_shar": 16, "matplotlib": [8, 9, 10, 15], "matric": 16, "matrix": 2, "matter": [8, 9, 10, 13], "max": 6, "max_fev": [6, 19, 23], "max_thread": 6, "maxim": [17, 23], "maximum": [5, 6, 12, 19, 23], "maxit": 19, "md": 3, "mead": 19, "mean": [4, 13, 16, 17, 19, 21, 22, 24], "meant": 4, "measur": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 23, 24], "mechan": 19, "meet": 6, "melt": [8, 9, 10], "mem": 6, 
"mem_alloc": 8, "mem_flag": 9, "mem_freq": [18, 24], "memcpi": [6, 13], "memcpy_dtoh": [6, 8], "memcpy_htod": [6, 8], "memori": [0, 4, 6, 11, 13, 18, 21, 23, 24], "memset": 6, "mention": 12, "merg": [8, 9, 10, 16], "meshgrid": 12, "messi": [8, 9, 10], "metal": [8, 9, 10], "meter": 18, "method": [6, 8, 9, 10, 13, 16, 18, 19], "metric": [2, 4, 6, 11, 16, 23], "mf": 9, "middl": 12, "might": [12, 17], "millisecond": [8, 9, 10], "mimick": 4, "min": [8, 9], "mind": [8, 9, 10], "miniconda": [7, 15], "miniconda3": 15, "minim": [3, 7, 17, 22, 23], "minimum": 12, "mirror": 23, "miss": [6, 7, 23], "ml": 23, "mnrow": 21, "mnrowsiq": 21, "mod": [10, 12], "mode": 18, "model": [8, 9, 10, 14], "modif": [19, 21], "modifi": [10, 18], "modul": [3, 6, 7, 13, 18], "moment": [8, 9, 10, 23], "monitor": 18, "monolith": 6, "more": [0, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20, 21, 23], "mostli": [6, 14, 23], "motion": [8, 9, 10], "move": [4, 6, 8, 13, 16, 19, 23], "move_toward": 19, "much": [4, 8, 9, 10, 12, 18, 22, 23], "multi": [19, 23], "multipl": [0, 2, 6, 7, 13, 18, 22, 23], "multiprocessor": [8, 9, 10], "must": [6, 17, 23], "mutat": 19, "mutation_ch": 19, "my_typ": 22, "n": [5, 7, 8, 9, 10, 12, 13, 14, 16, 19, 20, 22], "nactivechannel": 21, "naiv": [4, 5, 8, 9, 10], "name": [4, 5, 6, 7, 8, 9, 10, 12, 16, 17, 18, 19, 20, 23, 24], "name_of_gpu": 23, "namelijk": 16, "nativ": 15, "navig": 15, "nbuffer": 21, "nbyte": 8, "nchannel": 21, "ndarrai": [6, 12], "ndrang": 6, "nearest": [6, 23], "necessari": [5, 6, 7, 8, 9, 10, 23], "necessarili": [5, 13], "need": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 22, 23], "neighbor": 19, "nelder": 19, "net": 15, "network": 4, "neural": 4, "new": [1, 3, 6, 7, 8, 9, 10, 19, 23], "new_cost": 19, "newer": [15, 18], "newli": 16, "next": [8, 9, 10, 16, 21], "nfasttimesampl": 21, "nice": [8, 9], "nieuwpoort": 14, "nl": 18, "no_improv": 19, "node": [7, 19], "non": [5, 7], "none": [5, 6, 18, 19, 23], "nonumb": [4, 12], "normal": [6, 19, 23], "normalize_dict": 19, "normalize_parameter_spac": 19, "normalize_verify_funct": 6, "normalized_coordin": 23, "notat": 23, "note": [4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 21, 23], "notebook": [4, 8, 9, 10, 12, 15, 16], "notic": [4, 8, 9, 10], "now": [4, 6, 8, 9, 10, 12, 13, 16, 20], "nox": [3, 7], "noxset": 7, "np": [4, 6, 12, 16, 20, 21], "npencod": 6, "npt": 12, "nrepeat": 21, "nsampl": 21, "nsamplesiq": 21, "nslowtimesampl": 21, "ntx": 21, "num_reg": 18, "num_stream": 13, "number": [1, 4, 5, 6, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 24], "numer": [8, 9, 10], "numpi": [0, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 16, 20, 21, 22, 23], "nv": 0, "nvcc": [0, 6], "nvcuda": 0, "nvidia": [0, 6, 7, 15, 16, 18, 22], "nvidia_smi_fallback": 18, "nvml": [6, 24], "nvml_": 24, "nvml_energi": [18, 24], "nvml_gr_clock": [18, 24], "nvml_mem_clock": [18, 24], "nvml_power": [18, 24], "nvml_pwr_limit": [18, 24], "nvmlobserv": 24, "nvrtc": [0, 6, 22], "nx": [8, 9, 10, 12], "ny": [8, 9, 10, 12], "nz": 12, "o": [4, 6], "obj": 6, "object": [2, 4, 5, 6, 8, 9, 10, 19, 23], "objective_higher_is_bett": [6, 17, 23], "observ": [0, 2, 6, 17, 23, 24], "obtain": [4, 8, 9, 10, 12, 18], "occup": 16, "occur": [17, 23], "occurr": 6, "offer": 6, "often": [1, 8, 9, 10, 18], "old": 4, "old_argu": 6, "old_cost": 19, "older": 15, "omit": 6, "omp_get_wtim": 13, "ona": 10, "onc": [5, 6, 8, 9, 10, 12, 18, 23], "one": [0, 3, 4, 6, 7, 8, 9, 10, 12, 15, 16, 18, 19, 23], "ones": [8, 9, 10, 24], "onli": [0, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 19, 21, 23], "onlin": 7, "open": [5, 7, 8, 9, 13, 16], "open_cach": 6, "openacc": 11, "opencl": [0, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 23], "openmp": 13, "openssl": 7, "oper": [4, 8, 9, 10, 12, 13, 16, 17], "opportun": 16, "oppos": 7, "opt": 18, "optim": [1, 2, 4, 5, 6, 8, 9, 10, 13, 14, 16, 17, 18, 23], "option": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 22, 23, 24], "order": [4, 5, 6, 8, 9, 10, 12, 13, 16, 17, 19, 20, 23], "ordered_greedy_ml": 23, "ordereddict": [4, 8, 9, 10, 12, 16, 17], "ordin": 18, "org": [7, 14, 15], "other": [0, 1, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 19, 23, 24], "otherwis": [6, 7, 16, 23], "our": [4, 8, 9, 10, 12, 16, 20, 21], "ourselv": 12, "out": [1, 4, 5, 7, 12, 15, 16], "outer": 16, "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 20, 23, 24], "output_imag": 4, "output_s": 4, "over": [0, 6, 8, 9, 10, 15, 16, 18, 19], "overhead": [8, 9, 10, 16], "overhead_tim": 6, "overlap": [11, 13], "overrid": 19, "overwritten": [18, 23], "own": [4, 10, 13, 15, 17, 18], "p": [4, 6, 16, 17, 21, 23], "pack": 21, "packag": [0, 7], "packstr": 21, "pad": 21, "page": [4, 7, 8, 9, 10, 11, 12, 14, 16, 17], "pair": [8, 9, 10], "panda": [8, 9, 11, 15], "pandoc": 7, "paper": 14, "parallel": [4, 8, 9, 10], "param": [4, 5, 6, 7, 18, 19, 23], "param_spac": 19, "paramet": [2, 5, 6, 8, 9, 11, 13, 16, 17, 19, 20, 21, 22, 23], "parameter_spac": [6, 19], "parametr": 4, "parent": 19, "pars": [6, 8, 9], "parse_restrict": 6, "part": [7, 8, 9, 10, 14, 15, 16, 17, 21, 23], "partial": [5, 8, 9, 10, 11], "particl": [19, 23], "particular": [4, 6, 8, 9, 11, 13, 16, 18, 21], "particularli": [4, 7, 17], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 22, 23], "path": [4, 7, 18], "pattern": 18, "pcie": 18, "per": [3, 4, 6, 8, 9, 10, 12, 17, 18, 23], "percentag": 23, "perform": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23], "persist": 23, "physic": 8, "pick": 16, "pii": 14, "pionter": 6, "pip": [3, 4, 7, 8, 9, 14, 15, 16], "pip_cache_dir": 7, "pipelin": 11, "pixel": 4, "place": [4, 8, 9, 10, 18, 19, 20, 23], "plai": [8, 9, 10], "plain": 13, "platform": [6, 14, 15, 18, 23], "pleas": [1, 3, 4, 7, 11, 14, 15, 18, 20, 21, 23], "plot": [8, 9, 10], "plu": 23, "plugin": 7, "pmb": 14, "pmt": 18, "po": 19, "poetri": [7, 15], "point": [4, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 20, 23], "pointer": 21, "pop": 8, "popsiz": 19, "popul": 19, "popular": 18, "portabl": 21, "posit": [5, 6, 12, 19, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 23], "possibli": [17, 23], "powel": 19, "power": [6, 16, 18, 24], "power_read": [18, 24], "powersensor": [18, 24], "powersensor2": 18, "pragma": [8, 9, 10, 16], "precis": 13, "precomput": 5, "prefer": [4, 6, 7, 8, 10, 18, 23], "prefix": 15, "prepar": [6, 7, 8, 9, 10], "prepare_kernel_str": 6, "prepend": [6, 10], "preprocess_gpu_argu": 6, "preprocessor": [4, 6, 23], "present": [0, 7, 16], "press": [4, 8, 9, 10, 12, 16], "pretend": 6, "pretti": 16, "previou": [1, 7, 8, 9, 10, 19, 23], "previous": [6, 8, 9, 10, 16], "prg": 9, "primit": 21, "print": [3, 4, 6, 8, 9, 10, 12, 16, 23], "print_config": 6, "print_config_output": 6, "privileg": [7, 18], "probabl": [19, 23], "problem": [3, 4, 6, 8, 9, 10, 11, 12, 13, 16, 23], "problem_s": [4, 5, 6, 8, 9, 10, 12, 13, 16, 20, 21, 23, 24], "problemat": 4, "proc": 7, "proce": 16, "process": [4, 6, 7, 8, 9, 10, 16, 17, 18, 19, 22], "process_cach": 6, "process_metr": 6, "prod": [4, 5, 13], "produc": [3, 5], "product": [4, 8, 9, 23], 
"profil": 16, "program": [0, 5, 7, 8, 9, 10, 13, 16, 21, 22], "programm": [16, 18], "programmat": 12, "prohibit": 19, "project": [7, 15], "promis": 4, "properli": 6, "properti": [6, 16, 23], "propos": 3, "provid": [5, 6, 7, 8, 9, 10, 13, 22, 23], "prune": 19, "prune_parameter_spac": 19, "ps_energi": [17, 18, 24], "ps_power": [18, 24], "psedo": 10, "pso": 23, "ptr": 6, "public": [3, 14], "publish": [7, 14], "pull": 3, "purpos": [8, 9, 10, 13, 16, 23, 24], "put": [3, 6, 7, 8, 9, 10], "py": [5, 13, 15], "pybind11": 18, "pycuda": [0, 7, 8, 10, 12, 13, 18, 22], "pyenv": 7, "pyhip": [0, 6], "pyopencl": [6, 7, 9, 18], "pyplot": [8, 9, 10], "pytest": [3, 7], "python": [0, 3, 4, 6, 7, 11, 12, 13, 16, 18, 20, 21, 22, 23], "python3": [7, 15], "pythonpath": 15, "qualiti": 13, "quantiti": [8, 9, 10, 17, 18, 23], "queue": 9, "quick": [3, 8, 9, 10], "quickli": [8, 9, 10], "quiet": [6, 23], "quit": [1, 8, 9, 10, 12, 16, 22], "quota": 7, "r": [3, 5, 13], "race": 16, "radiat": [8, 9, 10], "rais": 6, "ran": 7, "rand": 12, "rand1bin": 19, "rand1exp": 19, "rand2bin": 19, "rand2exp": 19, "randint": [8, 9, 10], "randn": [4, 5, 13, 14, 16, 20, 22], "random": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 22, 23], "random_sampl": [6, 23], "random_walk": 19, "randomli": [12, 19], "randtobest1bin": 19, "randtobest1exp": 19, "rang": [4, 5, 8, 9, 10, 12, 13, 22], "rapl": 18, "rather": [8, 9, 10, 23], "rawkernel": 6, "rd": 18, "re": [4, 7, 8, 9, 10, 12, 16], "reach": 6, "read": [4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 18, 23], "read_cach": 6, "read_fil": 6, "read_writ": 9, "readi": [4, 6, 8, 9, 10, 12, 16], "ready_argument_list": 6, "real": 22, "realiti": 16, "realiz": 16, "realli": [4, 8, 9, 10, 15], "reason": [4, 6, 7, 21, 23], "receive_spec": 21, "recent": [6, 15, 18], "recogn": 20, "recommend": [15, 21], "recon": 21, "record": [4, 6, 8, 18, 23], "redistribut": [8, 9, 10], "reduc": [8, 9, 10, 16], "reduct": [5, 17, 23], "redund": 16, "ref": 23, "refer": [4, 5, 6, 8, 9, 10, 11, 13, 15, 18, 23], "referenc": 14, "reflect": [5, 18], "regard": [3, 6, 19], "regardless": 22, "region": [8, 9, 10], "regist": [4, 8, 9, 10, 16, 18], "register_configur": 18, "register_devic": 18, "registerobserv": 18, "regular": [6, 10, 18], "reject": 19, "relat": [14, 17, 24], "releas": [6, 7], "relev": [6, 14, 18], "rememb": [4, 7, 8, 9, 10, 16], "remov": [7, 19], "repeatedli": 18, "replac": [4, 5, 6, 7, 8, 9, 10, 12, 16, 23], "replace_param_occurr": 6, "repo": 15, "report": [17, 18, 23, 24], "repositori": [3, 4, 7, 8, 9, 10, 12, 14, 15, 16], "repres": [6, 8, 9, 10], "represent": [6, 21], "reproduc": 3, "request": [3, 18, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 22], "requirements_test": 3, "research": 14, "reserv": [1, 9, 24], "resourc": 23, "respect": [16, 18], "respons": 6, "rest": [6, 8, 9, 10], "restart": [1, 7, 8, 9, 10, 19], "restrict": [6, 7, 11, 16, 22, 23], "result": [1, 3, 4, 5, 6, 10, 12, 16, 17, 18, 19, 20, 23, 24], "result_host": 6, "results_filenam": 23, "retri": 15, "retriev": [4, 6, 23], "return": [4, 5, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 23], "reus": [4, 8, 9, 10, 16], "rewrit": 22, "rf": 21, "rfsize": 21, "richard": 14, "right": [4, 8, 9, 10, 12, 15], "risk": 22, "rmprofilingadminonli": 7, "rob": 14, "robust": 6, "rocm": [7, 15, 18], "room": 16, "root": 18, "roughli": [12, 16], "round": [8, 9, 10, 23], "row": 16, "run": [1, 3, 4, 5, 6, 8, 9, 12, 13, 15, 16, 18, 19, 23], "run_gpu": 9, "run_kernel": [4, 5, 6, 11, 23], "runner": 19, "runtim": [4, 6, 8, 9, 10, 14, 15, 18, 22], "runtimefailedconfig": 6, 
"s0167739x18313359": 14, "sa": 16, "safer": 23, "sai": [6, 8, 9, 10, 20, 22], "same": [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 18, 20, 23], "sampl": [6, 19, 23], "satisfi": 23, "save": [7, 8, 9], "save_al": 18, "sb": 16, "sc21": 14, "sc22": 14, "scalar": [4, 8, 9, 10, 12, 23], "scale": 6, "scale_from_param": 6, "schoonhoven": 14, "schoonhoven2022benchmark": 14, "schoonhoven2022go": 14, "scienc": 14, "sciencedirect": 14, "scientif": 21, "scipi": 11, "script": [4, 6, 16, 21, 22], "sdk": 15, "search": [1, 4, 6, 11, 14, 16, 17, 19, 23], "searchspac": 19, "second": [4, 5, 6, 8, 9, 10, 12, 16, 17, 18, 19, 23], "secondli": [4, 16], "section": [6, 8, 9, 10], "see": [0, 1, 3, 4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 20, 22, 23], "seem": [8, 9, 10], "seemingli": 7, "seen": [4, 6, 7, 16], "select": [0, 3, 4, 6, 8, 9, 10, 12, 15, 16, 18, 19, 23], "self": [6, 7, 18, 19], "semant": 5, "send": 12, "sens": 21, "sensibl": 16, "sensor": 18, "separ": [6, 11, 13, 22], "seper": 22, "seri": [6, 12], "serializ": 6, "serv": [8, 9, 10, 17, 19], "session": [1, 6, 7, 19], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 20, 22, 23, 24], "set_nvml_paramet": 6, "set_titl": [8, 9, 10], "setup": [4, 8, 9, 10, 13, 15, 18, 21], "setup_block_and_grid": 6, "setup_method_argu": 6, "setup_method_opt": 6, "sever": [6, 8, 9, 10, 11, 12, 15, 16, 22, 23], "sh": 15, "sh_u": [8, 9, 10], "share": [0, 4, 6, 23], "sheet": [8, 9, 10], "shell": 7, "shift": [4, 8, 9, 10, 12, 16], "shortli": 4, "should": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13, 16, 17, 18, 20, 23], "show": [4, 8, 9, 10, 11, 14, 17, 21], "shown": [4, 6, 18], "shuffl": 11, "signal": [4, 24], "signatur": [0, 4, 6], "signific": 3, "significantli": [14, 16, 18], "silent": 4, "similar": [6, 13, 16, 23], "similarli": 4, "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 21], "simpli": [4, 5, 6, 8, 9, 10, 12, 19, 20, 23], "simplic": [8, 9, 10, 12], "simplifi": [7, 8, 9, 10], "simul": [1, 6, 10, 14, 19, 21, 23], "simulated_ann": 23, "simulation_mod": [6, 23], "sinc": [4, 10, 12, 14, 16, 22], "singl": [4, 5, 6, 8, 9, 10, 13, 16, 18, 22, 23], "single_point": 19, "single_point_crossov": 19, "situat": 21, "size": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 19, 20, 22, 23], "skip": [3, 4, 7, 8, 9, 10, 23], "skip_nvml_set": 6, "skipkei": 6, "skippablefailur": 6, "slight": 21, "slightli": [13, 16, 22], "slow": 19, "slsqp": 19, "sm_": 8, "small": [3, 4, 7, 8, 9, 10, 16], "smem_arg": [6, 23], "smi": 18, "snap": 6, "snap_to_nearest_config": 6, "snippet": 5, "so": [4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 19, 20, 22, 23], "social": 19, "softwar": [4, 8, 9, 10, 14, 15, 18, 19, 20], "solut": [16, 18], "solv": 10, "some": [4, 6, 7, 8, 9, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23], "somehow": [8, 9, 10], "someth": [1, 4, 8, 9, 10, 16], "sometim": [0, 7, 8, 9, 10, 21], "somewher": 4, "soon": 19, "sort": 23, "sort_kei": 6, "sourc": [4, 6, 7, 8, 9, 10, 12, 13, 15, 16, 18, 22, 23], "sourcemodul": [8, 10, 12], "space": [1, 4, 5, 6, 7, 12, 13, 16, 17, 19, 23], "spatial": [8, 9, 10], "special": [4, 8, 9, 10, 18, 20, 24], "specif": [0, 4, 6, 8, 9, 10, 11, 12, 17, 18, 19, 23], "specifi": [4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24], "speed": 6, "spent": [13, 23], "sphinxdoc": 3, "split": 13, "spread": 13, "sqrt": 12, "squar": 16, "src": 6, "ssl": [7, 15], "stabl": 7, "stack": 15, "stai": 4, "stand": 16, "start": [1, 2, 4, 5, 6, 8, 9, 10, 13, 15, 16, 18, 19, 23], "start_ev": 6, "state": [6, 8, 9, 10, 18, 23], "statement": [4, 10, 12, 16, 22], "static": 6, "statu": 6, "stdout": [8, 9], 
"steer": 14, "step": [3, 7, 8, 9, 10, 15, 16, 17, 19, 22], "stick": 21, "still": [1, 3, 5, 16], "stop": [6, 19], "stop_ev": 6, "stopcriterionreach": 6, "store": [1, 4, 6, 7, 10, 16, 18, 20, 23], "store_cach": 6, "store_result": 23, "str": [6, 8, 9, 10, 12], "strategi": [1, 2, 4, 14, 17, 23], "strategy_opt": [6, 19, 23], "stream": [1, 6, 8, 9, 10], "string": [4, 6, 8, 9, 10, 11, 16, 17, 18, 20, 21, 23], "struct": 2, "structur": [4, 6, 8, 9, 16, 20], "studio": 7, "style": 3, "subclass": 6, "submatric": 16, "subplot": [8, 9, 10], "subscrib": 18, "substitut": 12, "sudo": [7, 15], "suffici": [12, 17], "suffix": [6, 23], "suit": [12, 23], "sum": [4, 5, 6, 16], "sum_": 12, "sum_float": 5, "sum_x": 5, "summar": 23, "supercomput": 14, "suppli": [6, 13, 16, 19, 22, 23], "support": [4, 6, 7, 8, 9, 10, 13, 15, 18, 19, 22, 23, 24], "suppos": [8, 9, 10], "sure": [4, 7, 8, 9, 10, 14, 15, 16], "swarm": [19, 23], "sy": 9, "symbol": [6, 23], "sync": [7, 21], "synchron": [6, 8, 10, 12, 16, 17], "system": [7, 14, 15, 18], "t": [4, 6, 7, 8, 9, 10, 12, 13, 15, 19, 22, 23], "t0": [10, 12], "t_min": 19, "tab": 7, "tabl": 0, "take": [4, 6, 7, 8, 9, 10, 12, 16, 18, 19, 20, 22, 23], "target": 23, "task": 6, "techniqu": 16, "tell": [4, 8, 9, 10, 11, 13, 16, 20, 21], "temp_x": 6, "temperatur": [8, 9, 10, 18, 19, 24], "templat": [0, 2, 12], "temporari": 6, "term": 4, "termin": [1, 15], "terminologi": [8, 10], "test": [3, 8, 9, 10, 11, 15, 16, 18, 23], "test_vector_add": 11, "test_vector_add_parameter": 11, "texmem_arg": [6, 23], "text": [8, 10, 16], "textur": [0, 6, 23], "than": [4, 8, 9, 10, 12, 17, 18, 19, 23, 24], "thank": 3, "thei": [1, 6, 7, 8, 9, 10, 11, 16, 17], "them": [4, 7, 10, 12, 13, 16], "themselv": 12, "therefor": [4, 5, 8, 9, 10, 12, 13, 16], "thesi": 19, "thi": [0, 1, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], "thin": 18, "thing": [4, 13, 16], "think": [8, 9, 10], "third": [5, 16], "those": [4, 7, 11, 15, 18], "thousand": [8, 9, 10], "thread": [4, 6, 8, 9, 10, 11, 12, 17, 18, 20, 23, 24], "threadidx": [4, 8, 9, 10, 12, 14, 16, 20, 22], "three": [4, 5, 16], "through": [6, 7, 8, 9, 10, 12, 14, 17, 18, 19, 23], "thrown": 6, "ti": [8, 9, 10, 12], "tight": 7, "tiker": 15, "tile": [4, 11, 16], "tile_size_i": [4, 5, 8, 9, 10, 13, 16, 23], "tile_size_j": 10, "tile_size_x": [4, 5, 8, 9, 10, 13, 16], "time": [4, 6, 8, 9, 10, 12, 13, 16, 17, 18, 19, 22, 23, 24], "time_limit": [19, 23], "time_sinc": 8, "titan": [8, 9, 10], "titl": 14, "tj": [8, 9, 10], "tk": 7, "tnc": 19, "to_csv": [8, 9], "to_gpu": [10, 12], "to_valid_nvrtc_gpu_arch_cc": 6, "togeth": [8, 9, 10, 15, 23], "token": 4, "toler": 5, "toml": 7, "too": [4, 8, 9, 10, 12, 13, 16, 23], "took": [4, 8, 10, 19, 20, 23], "tool": [10, 12, 14], "toolkit": [14, 15], "top": [6, 7, 12, 18, 23], "total": [4, 6, 8, 9, 10, 16, 17, 20], "total_flop": 17, "toward": 19, "track": 18, "tradit": 7, "transact": 14, "transfer": [10, 11, 13], "transmit": 18, "travers": 19, "treat": 23, "tri": [8, 9, 10, 19], "troubl": 15, "true": [1, 4, 5, 6, 8, 9, 10, 13, 16, 18, 19, 23], "trust": [5, 19], "trusti": 4, "try": [4, 6, 8, 9, 10, 15, 16, 19, 23], "try_to_constraint": 6, "ttyacm0": 18, "tunabl": [6, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 22, 23, 24], "tune": [1, 2, 5, 6, 11, 14, 15, 19, 20, 22, 23, 24], "tune_kernel": [1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 19, 20, 21, 22, 23], "tune_param": [4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 19, 20, 21, 22, 23], "tune_params_kei": 6, "tuner": [0, 1, 3, 4, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 
"tuning_opt": [6, 19], "tupl": [6, 10, 12, 19, 23], "turn": 6, "tutori": [4, 8, 12, 14, 15, 16], "two": [4, 6, 8, 9, 10, 11, 16, 17, 19, 23], "two_point": 19, "two_point_crossov": 19, "tx": [8, 9, 10, 16], "txt": 3, "ty": [8, 9, 10, 16], "type": [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23], "typeerror": 6, "typenam": 22, "typic": [6, 15, 16, 23], "typicali": 23, "u": [3, 4, 8, 9, 10], "u_": [8, 9, 10], "u_new": [8, 9, 10], "u_old": [8, 10], "ubuntu": 7, "undefin": [4, 6, 8, 9, 10, 16], "under": [4, 7, 14, 23], "understand": 4, "underutil": [8, 9, 10], "uniform": 19, "uniform_crossov": 19, "uniformli": 19, "uniqu": [19, 23], "unit": [3, 6], "unless": 4, "unload": [6, 7], "unrol": [8, 9, 10, 11, 16, 24], "unscal": 6, "unscale_and_snap_to_nearest": 6, "unsign": [6, 9], "until": [6, 13], "up": [3, 4, 6, 7, 8, 9, 10, 15, 16, 20, 23], "updat": [6, 7], "upgrad": 7, "upload": 13, "url": 14, "us": [0, 1, 2, 4, 5, 6, 7, 11, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24], "usag": [16, 18], "usb": 18, "use_locked_clock": 18, "usecas": 11, "user": [0, 4, 5, 6, 7, 9, 11, 15, 16, 17, 18, 19, 22, 23], "usual": [7, 18], "util": [7, 16], "v": [3, 6, 8, 9, 10, 12], "valid": [6, 11, 16, 23], "valu": [4, 5, 6, 8, 9, 10, 11, 12, 13, 16, 18, 19, 20, 23], "van": 14, "vari": [8, 9, 10, 12, 16, 17], "variabl": [6, 7, 12, 15, 19, 23], "variou": [18, 20], "ve": [4, 8, 9, 10, 15, 16], "vector": [12, 13, 20], "vector_add": [14, 19, 20, 22], "vector_add_kernel": 20, "veenboer": 14, "venv": 7, "venvbackend": 7, "verbos": [4, 5, 6, 8, 9, 10, 13, 23], "veri": [1, 5, 8, 9, 10, 13, 15, 16, 18, 21, 22], "verif": [0, 2, 11, 23], "verifi": [5, 6, 7, 11, 23], "verify_partial_reduc": 5, "version": [3, 4, 7, 16, 18, 23], "via": [7, 19], "virtual": [7, 15], "virtualenv": 7, "visual": [1, 7, 16], "vocabulari": [2, 18, 20], "void": [4, 8, 9, 10, 12, 14, 16, 20, 21, 22], "voltag": 18, "volum": 14, "w": [4, 8, 9, 17, 19], "wa": [4, 6, 8, 9, 10, 18, 23], "wai": [4, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 23], "wall": 9, "want": [0, 5, 10, 12, 13, 15, 16, 18, 20, 23, 24], "warp": 23, "warpsiz": 16, "wast": 19, "watt": [18, 24], "we": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 21, 22], "weight": [4, 19], "weighted_choic": 19, "weird": 7, "well": [0, 8, 9, 10, 12, 16, 18, 23], "went": [8, 9, 10, 12], "were": [4, 8, 9, 10, 12, 16, 23], "werkhoven": 14, "wget": [7, 15], "what": [3, 4, 5, 6, 8, 9, 10, 13, 16, 18, 20, 21, 22, 23, 24], "whatev": [6, 7, 13, 19], "when": [0, 1, 3, 4, 6, 7, 8, 9, 10, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24], "whenev": 5, "where": [3, 4, 5, 6, 7, 8, 9, 10, 16, 17, 18, 21, 22, 23], "whether": [6, 17, 19, 23], "which": [0, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24], "while": [0, 1, 4, 6, 8, 9, 10, 11, 16, 18, 19], "who": 7, "whole": [8, 9, 10, 16, 19], "whose": [5, 23], "why": [8, 9, 13, 17], "wide": [8, 9, 10, 15, 16], "width": 16, "wiki": 15, "willemsen": [14, 19], "willemsen2021bayesian": 14, "wise": 20, "wish": 10, "within": [8, 9, 10, 12, 16, 19, 23], "without": [7, 8, 9, 10, 12, 13, 18, 19], "won": 4, "word": 16, "work": [1, 3, 4, 6, 8, 9, 10, 15, 17, 19, 22, 23], "workshop": 14, "worri": [8, 10], "worst": [8, 9, 10], "would": [4, 7, 8, 9, 10, 22], "wrap": [0, 6, 20, 22, 23], "wrapper": [18, 22], "write": [4, 11, 12, 16, 22, 23], "write_fil": 6, "writefil": [4, 16], "written": [3, 22], "wrote": 4, "www": 14, "x": [4, 5, 6, 8, 9, 10, 12, 14, 16, 20, 22, 23], "x0": 12, "x1": 6, "x2": 6, "x86_64": 15, "x_": [8, 9, 10], "x_i": [8, 9, 10, 12], "xdg_cache_hom": 7, "xgpu": 
12, "xgrid": 12, "xilinx": 18, "xn": 6, "xvect": 12, "xyz": [12, 23], "xz": 7, "y": [4, 6, 7, 8, 9, 10, 12, 13, 16, 23], "y0": 12, "y1": 6, "y2": 6, "y_": [8, 9, 10], "y_i": 12, "year": 14, "yet": [4, 6, 12, 13, 20], "ygpu": 12, "ygrid": 12, "yield": 16, "yn": 6, "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24], "your": [3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 18, 21, 23], "yourself": [13, 23], "yvect": 12, "z": [4, 6, 12, 23], "z0": 12, "z_i": 12, "zero": [4, 5, 12, 13, 16], "zeros_lik": [8, 12, 14, 16, 20, 22], "zgpu": 12, "zgrid": 12, "zip": [10, 12], "zlib1g": 7, "zvect": 12}, "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Development environment", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "titleterms": {"": 12, "2d": 4, "3d": 12, "The": [2, 14], "add": 11, "api": 23, "auto": [8, 9, 10], "backend": [0, 6, 15, 22], "basinhop": 19, "bayes_opt": 19, "best": 10, "brute_forc": 19, "build": 7, "c": 10, "cach": 1, "citat": 14, "cluster": 7, "code": [3, 8, 9, 10, 11, 13], "common": 6, "compil": [0, 6], "compilerfunct": 6, "comput": [8, 9, 10], "contribut": 3, "convolut": [4, 11], "convolution_correct": 11, "convolution_stream": 11, "core": 6, "correct": 5, "cpu": 12, "cuda": [0, 15, 16], "cudafunct": 6, "cupi": 6, "cupyfunct": 6, "depend": 15, "design": 6, "develop": [3, 7], "devic": 6, "deviceinterfac": 6, "diff_evo": 19, "diffus": [8, 9, 10], "document": [2, 6, 7, 14, 23], "dual_ann": 19, "environ": 7, "exampl": [4, 11, 14, 22], "execut": 18, "expdist": 11, "featur": [0, 2], "file": 1, "firefly_algorithm": 19, "from": [9, 10], "function": 6, "gener": 11, "genetic_algorithm": 19, "get": 20, "git": 15, "gpu": [8, 9, 10, 12], "greedy_il": 19, "greedy_ml": 19, "grid": 12, "guid": [2, 3, 15], "hip": [6, 15], "hipfunct": 6, "host": 13, "implement": [4, 8, 9, 10], "increas": 16, "instal": [14, 15], "interfac": 6, "issu": 3, "kernel": [2, 8, 9, 10, 11, 12, 14, 16, 22], "kernel_tun": [6, 19], "let": 12, "local": [7, 9], "matrix": [11, 16], "memori": [8, 9, 10, 16], "metric": 17, "minim": 19, "ml": 19, "more": 4, "move": 12, "multipl": [11, 16], "naiv": 16, "number": 13, "nvcuda": 6, "nvml": 18, "nvmlobserv": 18, "object": 17, "observ": 18, "opencl": [6, 15], "openclfunct": 6, "optim": [12, 19], "ordered_greedy_ml": 19, "other": 15, "packag": 15, "paramet": [4, 10, 12, 18, 24], "per": 16, "physic": [9, 10], "pmtobserv": 18, "point": 11, "polygon": 11, "powersensorobserv": 18, "product": 10, "pso": 19, "py": 11, "pycuda": [6, 15], "pycudafunct": 6, "pyhip": 15, "pyopencl": 15, "python": [8, 9, 10, 15], "quick": 14, "random_sampl": 19, "reduct": 11, "refer": 2, "report": 3, "result": [8, 9], "run": [7, 10], "runner": 6, "select": 22, "sepconv": 11, "sequenti": 6, "sequentialrunn": 6, "setup": [3, 7], "share": [8, 9, 10, 16], "simpl": 3, "simulated_ann": 19, "simulationrunn": 6, "spars": 11, "start": [12, 20], "stencil": 11, "store": [8, 9], "strategi": [6, 19], "stream": 13, "struct": 21, "support": 0, "templat": 22, "test": [4, 7], "thread": 16, "tile": [8, 9, 10], 
"tunabl": 4, "tune": [4, 8, 9, 10, 12, 13, 16, 17, 18], "tuner": [2, 8, 9, 10, 11, 12, 14], "tutori": [9, 10], "us": [8, 9, 10, 12, 16, 21], "usag": [0, 14], "util": 6, "vector": 11, "verif": 5, "version": 15, "vocabulari": 24, "work": 16}})
                    \ No newline at end of file
                    diff --git a/latest/structs.html b/latest/structs.html
                    index b1c6dc75b..69667a9e3 100644
                    --- a/latest/structs.html
                    +++ b/latest/structs.html
@@ -1,19 +1,21 @@
  Using structs — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@
Using structs

One of the issues with calling GPU kernels from Python is the use of custom data types in kernel arguments. Because your GPU code may be used by host programs written in any language, it is recommended, for portability, to keep the interface of your kernels as simple as possible. This means sticking to plain pointers to primitive types such as int, float, and double. For performance reasons, it is also recommended not to use arrays of structs as kernel arguments, as these are very likely to lead to inefficient memory accesses on the GPU.
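To make this recommendation concrete, here is a small hypothetical sketch (the kernel and all names are invented for illustration) that flattens what could have been a struct argument into primitive arguments on the Python side, following Kernel Tuner's convention of numpy arrays for pointers and numpy scalars for values:

import numpy as np

# Instead of passing a custom type such as
#   struct Params { float factor; int n; };
# the individual fields are passed as primitive arguments.
kernel_string = """
__global__ void scale(float *x, float factor, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        x[i] *= factor;
    }
}
"""

x = np.random.randn(10000).astype(np.float32)
factor = np.float32(2.0)   # scalar arguments as numpy scalars
n = np.int32(x.size)

# this list can be passed as the arguments of tune_kernel or run_kernel
args = [x, factor, n]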

diff --git a/latest/templates.html b/latest/templates.html
index c0ae2ddcc..6106c0838 100644
--- a/latest/templates.html
+++ b/latest/templates.html
@@ -1,19 +1,21 @@
  Templated kernels — Kernel Tuner 1.0 documentation
@@ -107,12 +109,12 @@
Templated kernels

                    It is quite common in CUDA programming to write kernels that use C++ templates. This can be very useful when writing code that can work for several types, for example floats and doubles. However, the use of C++ templates makes it slightly more difficult to directly integrate the CUDA kernel into applications that are not written in C++, for example Matlab, Fortran, or Python. And since Kernel Tuner is written in Python, we needed to take a few extra steps to provide support for templated CUDA kernels. Let’s first look at an example of what it’s like to tune a templated kernel with Kernel Tuner.

Example

                    Say we have a templated CUDA kernel in a file called vector_add.cu:

template<typename T>
__global__ void vector_add(T *c, T *a, T *b, int n) {
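A complete version of such a kernel, together with one way it might be tuned, could look roughly like the following sketch. It assumes that the instantiated name (here vector_add<float>) is passed as the kernel name, that lang="cupy" selects a backend that can compile C++ templates, and that block_size_x is used as in the other examples in these docs:

import numpy as np
from kernel_tuner import tune_kernel

# sketch of the full templated kernel; block_size_x is assumed to be
# inserted by Kernel Tuner as a preprocessor definition, as in the
# non-templated vector_add examples
kernel_string = """
template<typename T>
__global__ void vector_add(T *c, T *a, T *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

tune_params = {"block_size_x": [32, 64, 128, 256, 512]}

# assumption: passing the instantiated name and lang="cupy" lets the
# backend compile the template directly
results, env = tune_kernel("vector_add<float>", kernel_string, size,
                           [c, a, b, n], tune_params, lang="cupy")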
                    @@ -153,7 +155,7 @@ 

Selecting a backend

Kernel Tuner supports multiple backends; for CUDA these are based on PyCUDA and CuPy. The following explains how to enable tuning of templated kernels with either backend.

The PyCUDA backend is the default backend in Kernel Tuner and is selected if the user does not supply the ‘lang’ option and CUDA code is detected in the kernel source, or when lang is set to “CUDA” by the user. PyCUDA requires CUDA kernels to have extern "C" linkage, which means that C++ templated kernels are not supported. To support templated kernels despite this limitation, Kernel Tuner attempts to wrap the templated CUDA kernel by inserting a compile-time template instantiation statement and a wrapper kernel that calls the instantiated templated kernel.

diff --git a/latest/user-api.html b/latest/user-api.html
index 528d736bd..f4a287f7c 100644
--- a/latest/user-api.html
+++ b/latest/user-api.html
@@ -1,19 +1,21 @@
  API Documentation — Kernel Tuner 1.0 documentation
@@ -111,11 +113,11 @@

API Documentation

                    This file provides all the details you need about how to call the Kernel Tuner’s functions, including all the optional arguments.

kernel_tuner.tune_kernel(kernel_name, kernel_source, problem_size, arguments, tune_params, grid_div_x=None, grid_div_y=None, grid_div_z=None, restrictions=None, answer=None, atol=1e-06, verify=None, verbose=False, lang=None, device=0, platform=0, smem_args=None, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None, defines=None, log=None, iterations=7, block_size_names=None, quiet=False, strategy=None, strategy_options=None, cache=None, metrics=None, simulation_mode=False, observers=None, objective=None, objective_higher_is_better=None)

                    Tune a CUDA kernel given a set of tunable parameters
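For orientation, a minimal call might look like the following sketch, which tunes the thread block size of a simple vector-add kernel and uses the optional answer and atol arguments from the signature above to verify the output (sizes and values are illustrative):

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

tune_params = {"block_size_x": [32, 64, 128, 256, 512]}

# answer entries correspond to the kernel arguments; None means "do not check"
answer = [a + b, None, None, None]

results, env = tune_kernel("vector_add", kernel_string, size,
                           [c, a, b, n], tune_params,
                           answer=answer, atol=1e-6)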

                    Parameters:
                    @@ -365,7 +367,7 @@
kernel_tuner.run_kernel(kernel_name, kernel_source, problem_size, arguments, params, grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None, device=0, platform=0, smem_args=None, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None, defines=None, block_size_names=None, quiet=False, log=None)

                    Compile and run a single kernel

Compiles and runs a single kernel once, given a specific instance of the kernel's tuning parameters. However, instead of measuring execution time, run_kernel returns the output of the kernel.

@@ -517,7 +519,7 @@
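In practice the difference with tune_kernel is that params maps every tunable parameter to a single value; a sketch, under the assumption that the returned list mirrors the argument list with the post-execution contents:

import numpy as np
from kernel_tuner import run_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 10000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

# a single configuration instead of lists of values
params = {"block_size_x": 256}

output = run_kernel("vector_add", kernel_string, size, [c, a, b, n], params)

# assumption: output mirrors the argument list, so output[0] holds c
assert np.allclose(output[0], a + b)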

kernel_tuner.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective=None, objective_higher_is_better=None)

                    stores tuning results to a JSON file

Stores the best kernel configurations (the top 3% by default) in a JSON file. The results are stored for a specific device (retrieved using env[‘device_name’]).

@@ -550,7 +552,7 @@
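A sketch of how this might be used right after a tuning run, reusing the results and env returned by tune_kernel; the filename is made up, and kernel_string, size, args, and tune_params are assumed to be defined as in the tune_kernel example above:

from kernel_tuner import tune_kernel, store_results

# kernel_string, size, args (= [c, a, b, n]) and tune_params as defined above
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params)

# keep only the best configurations (top 3% by default) for the device
# recorded in env["device_name"]
store_results("vector_add_results.json", "vector_add", kernel_string,
              tune_params, size, results, env, top=3)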

kernel_tuner.create_device_targets(header_filename, results_filename, objective=None, objective_higher_is_better=None)

                    create a header with device targets
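A minimal sketch, assuming a results file produced by store_results as in the example above (both filenames are illustrative); the full description follows below:

from kernel_tuner import create_device_targets

# generate a header with, per device found in the results file, the
# parameter definitions of the best-performing configuration
create_device_targets("vector_add_targets.h", "vector_add_results.json")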

This function generates a header file with device targets for compiling a kernel with different parameters on different devices. The tuning

diff --git a/latest/vocabulary.html b/latest/vocabulary.html
index 985e4fdf1..22083195f 100644
--- a/latest/vocabulary.html
+++ b/latest/vocabulary.html
@@ -1,19 +1,21 @@
  Parameter Vocabulary — Kernel Tuner 1.0 documentation
@@ -103,7 +105,7 @@

Parameter Vocabulary

There are certain tunable parameters that have a special meaning in Kernel Tuner. This document specifies which parameters are special and what their uses are when auto-tuning GPU kernels.

                    In general, it is best to avoid using these parameter names for purposes other than the ones indicated in this document.
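The thread block dimensions are a typical example: a tunable parameter named block_size_x (or one of the alternative names supplied via block_size_names) is used by Kernel Tuner to set the thread block size, in addition to being inserted into the kernel source as a preprocessor definition. A small sketch with arbitrary values:

# block_size_x is one of the special names and controls the thread block size;
# tile_size_x is an ordinary, user-chosen parameter with no special meaning
tune_params = {
    "block_size_x": [32, 64, 128, 256],
    "tile_size_x": [1, 2, 4],
}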