diff --git a/.buildinfo b/.buildinfo index 159e30b..5fcd2fb 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 59e7d19ae29c5a46205834932d5afbf9 +config: 4cb7ea34329eb381d11e782552c32ee5 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_sources/af1/af1.0.ipynb b/_sources/af1/af1.0.ipynb index 967db06..fb99a60 100644 --- a/_sources/af1/af1.0.ipynb +++ b/_sources/af1/af1.0.ipynb @@ -1244,9 +1244,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-binder-5.0.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-binder-5.0.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1258,7 +1258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/af1.1.ipynb b/_sources/af1/af1.1.ipynb index 7139b56..8ec0d3f 100644 --- a/_sources/af1/af1.1.ipynb +++ b/_sources/af1/af1.1.ipynb @@ -1264,9 +1264,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-binder-5.0.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-binder-5.0.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1278,7 +1278,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/af1.2.ipynb b/_sources/af1/af1.2.ipynb index 40986e2..f526f3a 100644 --- a/_sources/af1/af1.2.ipynb +++ b/_sources/af1/af1.2.ipynb @@ -693,9 +693,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-binder-5.0.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-binder-5.0.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -707,7 +707,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/af1.3.ipynb b/_sources/af1/af1.3.ipynb index 31cea97..9e863c9 100644 --- a/_sources/af1/af1.3.ipynb +++ b/_sources/af1/af1.3.ipynb @@ -656,9 +656,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-binder-5.0.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-binder-5.0.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -670,7 +670,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/af1.4.ipynb b/_sources/af1/af1.4.ipynb index 531b091..1f72bc1 100644 --- a/_sources/af1/af1.4.ipynb +++ b/_sources/af1/af1.4.ipynb @@ -693,9 +693,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-5.1.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-5.1.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -707,7 +707,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/cloud.ipynb b/_sources/af1/cloud.ipynb index a945ec0..2ee86d3 100644 --- a/_sources/af1/cloud.ipynb +++ b/_sources/af1/cloud.ipynb @@ -108,7 +108,318 @@ "outputs": [ { "data": { - "application/javascript": "'use strict';\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? */\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded(error = null) {\n const el = document.getElementById(null);\n if (el != null) {\n const html = (() => {\n if (typeof root.Bokeh === \"undefined\") {\n if (error == null) {\n return \"BokehJS is loading ...\";\n } else {\n return \"BokehJS failed to load.\";\n }\n } else {\n const prefix = `BokehJS ${root.Bokeh.version}`;\n if (error == null) {\n return `${prefix} successfully loaded.`;\n } else {\n return `${prefix} encountered errors while loading and may not function as expected.`;\n }\n }\n })();\n el.innerHTML = html;\n\n if (error != null) {\n const wrapper = document.createElement(\"div\");\n wrapper.style.overflow = \"auto\";\n wrapper.style.height = \"5em\";\n wrapper.style.resize = \"vertical\";\n const content = document.createElement(\"div\");\n content.style.fontFamily = \"monospace\";\n content.style.whiteSpace = \"pre-wrap\";\n content.style.backgroundColor = \"rgb(255, 221, 221)\";\n content.textContent = error.stack ?? error.toString();\n wrapper.append(content);\n el.append(wrapper);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(() => display_loaded(error), 100);\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.4.1.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n try {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n\n } catch (error) {throw error;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/javascript": [ + "'use strict';\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " const force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + "const JS_MIME_TYPE = 'application/javascript';\n", + " const HTML_MIME_TYPE = 'text/html';\n", + " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " const CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " const script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " function drop(id) {\n", + " const view = Bokeh.index.get_by_id(id)\n", + " if (view != null) {\n", + " view.model.document.clear()\n", + " Bokeh.index.delete(view)\n", + " }\n", + " }\n", + "\n", + " const cell = handle.cell;\n", + "\n", + " const id = cell.output_area._bokeh_element_id;\n", + " const server_id = cell.output_area._bokeh_server_id;\n", + "\n", + " // Clean up Bokeh references\n", + " if (id != null) {\n", + " drop(id)\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd_clean, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " const id = msg.content.text.trim()\n", + " drop(id)\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd_destroy);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " const output_area = handle.output_area;\n", + " const output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " const bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " const script_attrs = bk_div.children[0].attributes;\n", + " for (let i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " const toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " const events = require('base/js/events');\n", + " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " const NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded(error = null) {\n", + " const el = document.getElementById(null);\n", + " if (el != null) {\n", + " const html = (() => {\n", + " if (typeof root.Bokeh === \"undefined\") {\n", + " if (error == null) {\n", + " return \"BokehJS is loading ...\";\n", + " } else {\n", + " return \"BokehJS failed to load.\";\n", + " }\n", + " } else {\n", + " const prefix = `BokehJS ${root.Bokeh.version}`;\n", + " if (error == null) {\n", + " return `${prefix} successfully loaded.`;\n", + " } else {\n", + " return `${prefix} encountered errors while loading and may not function as expected.`;\n", + " }\n", + " }\n", + " })();\n", + " el.innerHTML = html;\n", + "\n", + " if (error != null) {\n", + " const wrapper = document.createElement(\"div\");\n", + " wrapper.style.overflow = \"auto\";\n", + " wrapper.style.height = \"5em\";\n", + " wrapper.style.resize = \"vertical\";\n", + " const content = document.createElement(\"div\");\n", + " content.style.fontFamily = \"monospace\";\n", + " content.style.whiteSpace = \"pre-wrap\";\n", + " content.style.backgroundColor = \"rgb(255, 221, 221)\";\n", + " content.textContent = error.stack ?? error.toString();\n", + " wrapper.append(content);\n", + " el.append(wrapper);\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(() => display_loaded(error), 100);\n", + " }\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error(url) {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (let i = 0; i < css_urls.length; i++) {\n", + " const url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " for (let i = 0; i < js_urls.length; i++) {\n", + " const url = js_urls[i];\n", + " const element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.async = false;\n", + " element.src = url;\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.4.1.min.js\"];\n", + " const css_urls = [];\n", + "\n", + " const inline_js = [ function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + "function(Bokeh) {\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " if (root.Bokeh !== undefined || force === true) {\n", + " try {\n", + " for (let i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + "\n", + " } catch (error) {throw error;\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], "application/vnd.bokehjs_load.v0+json": "" }, "metadata": {}, @@ -9619,9 +9930,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-binder-5.0.0", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-binder-5.0.0-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -9633,7 +9944,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/af1/download.ipynb b/_sources/af1/download.ipynb index 24134be..fa0d82e 100644 --- a/_sources/af1/download.ipynb +++ b/_sources/af1/download.ipynb @@ -800,9 +800,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -814,7 +814,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.0.ipynb b/_sources/ag3/ag3.0.ipynb index c578aec..5c71390 100644 --- a/_sources/ag3/ag3.0.ipynb +++ b/_sources/ag3/ag3.0.ipynb @@ -1410,9 +1410,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1424,7 +1424,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.1.ipynb b/_sources/ag3/ag3.1.ipynb index b60f85f..5b7a5dd 100644 --- a/_sources/ag3/ag3.1.ipynb +++ b/_sources/ag3/ag3.1.ipynb @@ -662,9 +662,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -676,7 +676,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.10.ipynb b/_sources/ag3/ag3.10.ipynb index f6d3513..b9625aa 100644 --- a/_sources/ag3/ag3.10.ipynb +++ b/_sources/ag3/ag3.10.ipynb @@ -1036,9 +1036,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-6.0.5", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-6.0.5-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1050,7 +1050,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.11.ipynb b/_sources/ag3/ag3.11.ipynb index 3fc4f62..b66430a 100644 --- a/_sources/ag3/ag3.11.ipynb +++ b/_sources/ag3/ag3.11.ipynb @@ -720,9 +720,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-6.0.5", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-6.0.5-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -734,7 +734,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.12.ipynb b/_sources/ag3/ag3.12.ipynb index e808741..d548003 100644 --- a/_sources/ag3/ag3.12.ipynb +++ b/_sources/ag3/ag3.12.ipynb @@ -679,9 +679,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-6.0.5", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-6.0.5-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -693,7 +693,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.13.ipynb b/_sources/ag3/ag3.13.ipynb index 322c180..fc4db2d 100644 --- a/_sources/ag3/ag3.13.ipynb +++ b/_sources/ag3/ag3.13.ipynb @@ -660,9 +660,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-6.0.5", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-6.0.5-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -674,7 +674,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.2.ipynb b/_sources/ag3/ag3.2.ipynb index 45f29e4..6db8963 100644 --- a/_sources/ag3/ag3.2.ipynb +++ b/_sources/ag3/ag3.2.ipynb @@ -750,9 +750,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -764,7 +764,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.4.ipynb b/_sources/ag3/ag3.4.ipynb index b114fc2..09c9692 100644 --- a/_sources/ag3/ag3.4.ipynb +++ b/_sources/ag3/ag3.4.ipynb @@ -726,9 +726,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -740,7 +740,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.5.ipynb b/_sources/ag3/ag3.5.ipynb index f8235a6..7a83a6a 100644 --- a/_sources/ag3/ag3.5.ipynb +++ b/_sources/ag3/ag3.5.ipynb @@ -782,9 +782,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -796,7 +796,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.6.ipynb b/_sources/ag3/ag3.6.ipynb index 7c3c5f1..65ea41b 100644 --- a/_sources/ag3/ag3.6.ipynb +++ b/_sources/ag3/ag3.6.ipynb @@ -778,9 +778,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -792,7 +792,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.7.ipynb b/_sources/ag3/ag3.7.ipynb index deea6e1..a14b099 100644 --- a/_sources/ag3/ag3.7.ipynb +++ b/_sources/ag3/ag3.7.ipynb @@ -949,9 +949,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -963,7 +963,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.8.ipynb b/_sources/ag3/ag3.8.ipynb index 7ae240d..141151b 100644 --- a/_sources/ag3/ag3.8.ipynb +++ b/_sources/ag3/ag3.8.ipynb @@ -965,9 +965,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -979,7 +979,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/ag3.9.ipynb b/_sources/ag3/ag3.9.ipynb index fa19360..9ca5e77 100644 --- a/_sources/ag3/ag3.9.ipynb +++ b/_sources/ag3/ag3.9.ipynb @@ -1507,9 +1507,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "global-global-mgenv-6.0.5", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "conda-env-global-global-mgenv-6.0.5-py" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1521,7 +1521,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/analysis.ipynb b/_sources/ag3/analysis.ipynb index 91bf314..1aecab7 100644 --- a/_sources/ag3/analysis.ipynb +++ b/_sources/ag3/analysis.ipynb @@ -35,9 +35,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -49,7 +49,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/ag3/cloud.ipynb b/_sources/ag3/cloud.ipynb index f2a4d68..a22000a 100644 --- a/_sources/ag3/cloud.ipynb +++ b/_sources/ag3/cloud.ipynb @@ -17442,9 +17442,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -17456,9 +17456,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.10" + "version": "3.12.7" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/_sources/ag3/download.ipynb b/_sources/ag3/download.ipynb index 2bede7b..812f304 100644 --- a/_sources/ag3/download.ipynb +++ b/_sources/ag3/download.ipynb @@ -93,35 +93,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_set\tsample_count\r\n", - "AG1000G-AO\t81\r\n", - "AG1000G-BF-A\t181\r\n", - "AG1000G-BF-B\t102\r\n", - "AG1000G-BF-C\t13\r\n", - "AG1000G-CD\t76\r\n", - "AG1000G-CF\t73\r\n", - "AG1000G-CI\t80\r\n", - "AG1000G-CM-A\t303\r\n", - "AG1000G-CM-B\t97\r\n", - "AG1000G-CM-C\t44\r\n", - "AG1000G-FR\t23\r\n", - "AG1000G-GA-A\t69\r\n", - "AG1000G-GH\t100\r\n", - "AG1000G-GM-A\t74\r\n", - "AG1000G-GM-B\t31\r\n", - "AG1000G-GM-C\t174\r\n", - "AG1000G-GN-A\t45\r\n", - "AG1000G-GN-B\t185\r\n", - "AG1000G-GQ\t10\r\n", - "AG1000G-GW\t101\r\n", - "AG1000G-KE\t86\r\n", - "AG1000G-ML-A\t60\r\n", - "AG1000G-ML-B\t71\r\n", - "AG1000G-MW\t41\r\n", - "AG1000G-MZ\t74\r\n", - "AG1000G-TZ\t300\r\n", - "AG1000G-UG\t290\r\n", - "AG1000G-X\t297\r\n" + "sample_set\tsample_count\n", + "AG1000G-AO\t81\n", + "AG1000G-BF-A\t181\n", + "AG1000G-BF-B\t102\n", + "AG1000G-BF-C\t13\n", + "AG1000G-CD\t76\n", + "AG1000G-CF\t73\n", + "AG1000G-CI\t80\n", + "AG1000G-CM-A\t303\n", + "AG1000G-CM-B\t97\n", + "AG1000G-CM-C\t44\n", + "AG1000G-FR\t23\n", + "AG1000G-GA-A\t69\n", + "AG1000G-GH\t100\n", + "AG1000G-GM-A\t74\n", + "AG1000G-GM-B\t31\n", + "AG1000G-GM-C\t174\n", + "AG1000G-GN-A\t45\n", + "AG1000G-GN-B\t185\n", + "AG1000G-GQ\t10\n", + "AG1000G-GW\t101\n", + "AG1000G-KE\t86\n", + "AG1000G-ML-A\t60\n", + "AG1000G-ML-B\t71\n", + "AG1000G-MW\t41\n", + "AG1000G-MZ\t74\n", + "AG1000G-TZ\t300\n", + "AG1000G-UG\t290\n", + "AG1000G-X\t297\n" ] } ], @@ -209,16 +209,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call\r\n", - "AB0085-Cx,BF2-4,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F\r\n", - "AB0086-Cx,BF2-6,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F\r\n", - "AB0087-C,BF3-3,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0088-C,BF3-5,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0089-Cx,BF3-8,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0090-C,BF3-10,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0091-C,BF3-12,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0092-C,BF3-13,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n", - "AB0094-Cx,BF3-17,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\r\n" + "sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call\n", + "AB0085-Cx,BF2-4,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F\n", + "AB0086-Cx,BF2-6,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F\n", + "AB0087-C,BF3-3,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0088-C,BF3-5,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0089-Cx,BF3-8,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0090-C,BF3-10,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0091-C,BF3-12,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0092-C,BF3-13,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n", + "AB0094-Cx,BF3-17,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F\n" ] } ], @@ -275,16 +275,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,aim_species_fraction_arab,aim_species_fraction_colu,aim_species_fraction_colu_no2L,aim_species_gambcolu_arabiensis,aim_species_gambiae_coluzzii,aim_species\r\n", - "AB0085-Cx,0.0007664303506418854,0.014316392269148175,0.016488046166529265,gambcolu,gambiae,gambiae\r\n", - "AB0086-Cx,0.0024904214559386974,0.023571428571428573,0.02631578947368421,gambcolu,gambiae,gambiae\r\n", - "AB0087-C,0.0017244682889442423,0.9247311827956989,0.9810074318744839,gambcolu,coluzzii,coluzzii\r\n", - "AB0088-C,0.001915341888527102,0.8686288585786073,0.9875930521091811,gambcolu,coluzzii,coluzzii\r\n", - "AB0089-Cx,0.0028735632183908046,0.8567335243553008,0.9735973597359736,gambcolu,coluzzii,coluzzii\r\n", - "AB0090-C,0.0011500862564692352,0.972083035075161,0.9777411376751854,gambcolu,coluzzii,coluzzii\r\n", - "AB0091-C,0.0015322735108216816,0.9157060518731989,0.9750830564784053,gambcolu,coluzzii,coluzzii\r\n", - "AB0092-C,0.0013417672992141077,0.870157819225251,0.9801652892561984,gambcolu,coluzzii,coluzzii\r\n", - "AB0094-Cx,0.002106069308826345,0.8658536585365854,0.984297520661157,gambcolu,coluzzii,coluzzii\r\n" + "sample_id,aim_species_fraction_arab,aim_species_fraction_colu,aim_species_fraction_colu_no2L,aim_species_gambcolu_arabiensis,aim_species_gambiae_coluzzii,aim_species\n", + "AB0085-Cx,0.0007664303506418854,0.014316392269148175,0.016488046166529265,gambcolu,gambiae,gambiae\n", + "AB0086-Cx,0.0024904214559386974,0.023571428571428573,0.02631578947368421,gambcolu,gambiae,gambiae\n", + "AB0087-C,0.0017244682889442423,0.9247311827956989,0.9810074318744839,gambcolu,coluzzii,coluzzii\n", + "AB0088-C,0.001915341888527102,0.8686288585786073,0.9875930521091811,gambcolu,coluzzii,coluzzii\n", + "AB0089-Cx,0.0028735632183908046,0.8567335243553008,0.9735973597359736,gambcolu,coluzzii,coluzzii\n", + "AB0090-C,0.0011500862564692352,0.972083035075161,0.9777411376751854,gambcolu,coluzzii,coluzzii\n", + "AB0091-C,0.0015322735108216816,0.9157060518731989,0.9750830564784053,gambcolu,coluzzii,coluzzii\n", + "AB0092-C,0.0013417672992141077,0.870157819225251,0.9801652892561984,gambcolu,coluzzii,coluzzii\n", + "AB0094-Cx,0.002106069308826345,0.8658536585365854,0.984297520661157,gambcolu,coluzzii,coluzzii\n" ] } ], @@ -335,16 +335,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,ena_run\r\n", - "AR0001-C,ERR347035\r\n", - "AR0001-C,ERR347047\r\n", - "AR0001-C,ERR352136\r\n", - "AR0002-C,ERR328585\r\n", - "AR0002-C,ERR323844\r\n", - "AR0002-C,ERR328597\r\n", - "AR0004-C,ERR343648\r\n", - "AR0004-C,ERR343636\r\n", - "AR0004-C,ERR343468\r\n" + "sample_id,ena_run\n", + "AR0001-C,ERR347035\n", + "AR0001-C,ERR347047\n", + "AR0001-C,ERR352136\n", + "AR0002-C,ERR328585\n", + "AR0002-C,ERR323844\n", + "AR0002-C,ERR328597\n", + "AR0004-C,ERR343648\n", + "AR0004-C,ERR343636\n", + "AR0004-C,ERR343468\n" ] } ], @@ -393,26 +393,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,ena_analysis\r\n", - "\r\n", - "AR0001-C,ERZ1695275\r\n", - "\r\n", - "AR0002-C,ERZ1695276\r\n", - "\r\n", - "AR0004-C,ERZ1695277\r\n", - "\r\n", - "AR0006-C,ERZ1695278\r\n", - "\r\n", - "AR0007-C,ERZ1695279\r\n", - "\r\n", - "AR0008-C,ERZ1695280\r\n", - "\r\n", - "AR0009-C,ERZ1695281\r\n", - "\r\n", - "AR0010-Cx,ERZ1695282\r\n", - "\r\n", - "AR0011-C,ERZ1695283\r\n", - "\r\n" + "sample_id,ena_analysis\n", + "\n", + "AR0001-C,ERZ1695275\n", + "\n", + "AR0002-C,ERZ1695276\n", + "\n", + "AR0004-C,ERZ1695277\n", + "\n", + "AR0006-C,ERZ1695278\n", + "\n", + "AR0007-C,ERZ1695279\n", + "\n", + "AR0008-C,ERZ1695280\n", + "\n", + "AR0009-C,ERZ1695281\n", + "\n", + "AR0010-Cx,ERZ1695282\n", + "\n", + "AR0011-C,ERZ1695283\n", + "\n" ] } ], @@ -453,16 +453,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,snp_genotypes_vcf\r\n", - "AR0047-C,https://vo_agam_output.cog.sanger.ac.uk/AR0047-C.vcf.gz\r\n", - "AR0049-C,https://vo_agam_output.cog.sanger.ac.uk/AR0049-C.vcf.gz\r\n", - "AR0051-C,https://vo_agam_output.cog.sanger.ac.uk/AR0051-C.vcf.gz\r\n", - "AR0061-C,https://vo_agam_output.cog.sanger.ac.uk/AR0061-C.vcf.gz\r\n", - "AR0078-C,https://vo_agam_output.cog.sanger.ac.uk/AR0078-C.vcf.gz\r\n", - "AR0080-C,https://vo_agam_output.cog.sanger.ac.uk/AR0080-C.vcf.gz\r\n", - "AR0084-C,https://vo_agam_output.cog.sanger.ac.uk/AR0084-C.vcf.gz\r\n", - "AR0097-C,https://vo_agam_output.cog.sanger.ac.uk/AR0097-C.vcf.gz\r\n", - "AR0072-C,https://vo_agam_output.cog.sanger.ac.uk/AR0072-C.vcf.gz\r\n" + "sample_id,snp_genotypes_vcf\n", + "AR0047-C,https://vo_agam_output.cog.sanger.ac.uk/AR0047-C.vcf.gz\n", + "AR0049-C,https://vo_agam_output.cog.sanger.ac.uk/AR0049-C.vcf.gz\n", + "AR0051-C,https://vo_agam_output.cog.sanger.ac.uk/AR0051-C.vcf.gz\n", + "AR0061-C,https://vo_agam_output.cog.sanger.ac.uk/AR0061-C.vcf.gz\n", + "AR0078-C,https://vo_agam_output.cog.sanger.ac.uk/AR0078-C.vcf.gz\n", + "AR0080-C,https://vo_agam_output.cog.sanger.ac.uk/AR0080-C.vcf.gz\n", + "AR0084-C,https://vo_agam_output.cog.sanger.ac.uk/AR0084-C.vcf.gz\n", + "AR0097-C,https://vo_agam_output.cog.sanger.ac.uk/AR0097-C.vcf.gz\n", + "AR0072-C,https://vo_agam_output.cog.sanger.ac.uk/AR0072-C.vcf.gz\n" ] } ], @@ -1031,9 +1031,9 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -1045,7 +1045,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.10" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/amin1/cloud.ipynb b/_sources/amin1/cloud.ipynb index 46a74ae..ec5daa3 100644 --- a/_sources/amin1/cloud.ipynb +++ b/_sources/amin1/cloud.ipynb @@ -1,6179 +1,6179 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "DZw8vyUJ0y5k" - }, - "source": [ - "# Amin1.0 cloud data access\n", - "\n", - "This page provides information about how to access data from the [Amin1.0 resource](intro) via Google Cloud. This includes sample metadata and single nucleotide polymorphism (SNP) calls.\n", - "\n", - "This notebook illustrates how to read data directly from the cloud, without having to first download any data locally. This notebook can be run from any computer, but will work best when run from a compute node within Google Cloud, because it will be physically closer to the data and so data transfer is faster. For example, this notebook can be run via [Google Colab](https://colab.research.google.com/) which is free interactive computing service running in the cloud.\n", - "\n", - "To launch this notebook in the cloud and run it for yourself, click the launch icon () at the top of the page and select one of the cloud computing services available.\n", - "\n", - "## Data hosting\n", - "\n", - "All data required for this notebook is hosted on Google Cloud Storage (GCS). Data are hosted in the `vo_amin_release` bucket, which is a multi-region bucket located in the United States. All data hosted in GCS are publicly accessible and do not require any authentication to access. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Zn_-HkLIQH_0" - }, - "source": [ - "## Setup\n", - "\n", - "Running this notebook requires some Python packages to be installed. These packages can be installed via pip or conda. E.g.:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "wqHBq442QH_1", - "tags": [ - "hide-output" - ], - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "480221f7-02f7-4c8c-d3e6-bb9484a04ce6" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[K |████████████████████████████████| 133 kB 6.3 MB/s \n", - "\u001b[K |████████████████████████████████| 5.7 MB 37.8 MB/s \n", - "\u001b[K |████████████████████████████████| 2.3 MB 45.4 MB/s \n", - "\u001b[K |████████████████████████████████| 153 kB 38.8 MB/s \n", - "\u001b[K |████████████████████████████████| 1.1 MB 46.9 MB/s \n", - "\u001b[K |████████████████████████████████| 144 kB 44.7 MB/s \n", - "\u001b[K |████████████████████████████████| 271 kB 54.7 MB/s \n", - "\u001b[K |████████████████████████████████| 94 kB 2.6 MB/s \n", - "\u001b[K |████████████████████████████████| 6.2 MB 41.1 MB/s \n", - "\u001b[?25h Building wheel for asciitree (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "!pip install -q malariagen_data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "osgZ9pB7QH_1" - }, - "source": [ - "To make accessing these data more convenient, we've created the [malariagen_data Python package](https://github.com/malariagen/malariagen-data-python), which is available from PyPI. This is experimental so please let us know if you find any bugs or have any suggestions. \n", - "\n", - "Now import the packages we'll need to use here." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "970klnG1eu8N" - }, - "outputs": [], - "source": [ - "import malariagen_data\n", - "import numpy as np\n", - "import dask.array as da\n", - "from dask.diagnostics.progress import ProgressBar\n", - "import dask\n", - "dask.config.set(**{'array.slicing.split_large_chunks': False})\n", - "import allel" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jPqZ-LFPQH_2" - }, - "source": [ - "Data access from Google Cloud is set up with the following code:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "mIsSaTuOQH_2" - }, - "outputs": [], - "source": [ - "amin1 = malariagen_data.Amin1()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "78L85pli9HdO" - }, - "source": [ - "## Sample metadata\n", - "\n", - "Data about the samples that were sequenced to generate this data resource are available, including the time and place of collection, the gender of the specimen, and our call regarding the species of the specimen. These are organised by sample set.\n", - "\n", - "Load sample metadata into a [pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe):" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 641 - }, - "id": "-V8nLGSaQH_4", - "outputId": "2c1fdbe0-7d63-4536-d345-d465f7f89e8b" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idoriginal_sample_idsanger_sample_idpartner_sample_idcontributorcountrylocationyearmonthlatitudelongitudeseasonPCA_cohortcohortsubsampled_cohort
0VBS09378-4248STDY7308980VBS093784248STDY7308980CB-2-00264Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVNaN
1VBS09382-4248STDY7308981VBS093824248STDY7308981CB-2-00258Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVNaN
2VBS09397-4248STDY7308982VBS093974248STDY7308982CB-2-00384Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVPV
3VBS09460-4248STDY7308986VBS094604248STDY7308986CB-2-02960Brandy St. LaurentCambodiaPreah Kleang2016613.667104.982May-Jul (early wet)APVNaN
4VBS09466-4248STDY7308989VBS094664248STDY7308989CB-2-04070Brandy St. LaurentCambodiaPreah Kleang20161113.667104.982Nov-Jan (early dry)APVNaN
................................................
297VBS16624-4248STDY7918667VBS166244248STDY7918667KV-32-01591Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
298VBS16625-4248STDY7918668VBS166254248STDY7918668KV-32-01499Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
299VBS16626-4248STDY7918669VBS166264248STDY7918669KV-32-01465Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)BRK1RK1
300VBS16628-4248STDY7918670VBS166284248STDY7918670KV-32-01454Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
301VBS16630-4248STDY7918671VBS166304248STDY7918671KV-31-01949Brandy St. LaurentCambodiaChamkar San2014413.595106.995Feb-Apr (late dry)BRK1RK1
\n", - "

302 rows × 15 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " sample_id original_sample_id ... cohort subsampled_cohort\n", - "0 VBS09378-4248STDY7308980 VBS09378 ... PV NaN\n", - "1 VBS09382-4248STDY7308981 VBS09382 ... PV NaN\n", - "2 VBS09397-4248STDY7308982 VBS09397 ... PV PV\n", - "3 VBS09460-4248STDY7308986 VBS09460 ... PV NaN\n", - "4 VBS09466-4248STDY7308989 VBS09466 ... PV NaN\n", - ".. ... ... ... ... ...\n", - "297 VBS16624-4248STDY7918667 VBS16624 ... RK2 RK2\n", - "298 VBS16625-4248STDY7918668 VBS16625 ... RK2 RK2\n", - "299 VBS16626-4248STDY7918669 VBS16626 ... RK1 RK1\n", - "300 VBS16628-4248STDY7918670 VBS16628 ... RK2 RK2\n", - "301 VBS16630-4248STDY7918671 VBS16630 ... RK1 RK1\n", - "\n", - "[302 rows x 15 columns]" - ] - }, - "metadata": {}, - "execution_count": 4 - } - ], - "source": [ - "df_samples = amin1.sample_metadata()\n", - "df_samples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ssCdOykfQH_4" - }, - "source": [ - "The `sample_id` column gives the sample identifier used throughout all analyses.\n", - "\n", - "The `country`, `location`, `latitude` and `longitude` columns give the location where the specimen was collected.\n", - "\n", - "The `year` and `month` columns give the approximate date when the specimen was collected.\n", - "\n", - "The `cohort` column gives an assignment of individual mosquitoes to populations based on location of sampling and genetic population structure. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9APw05D5gAQ9" - }, - "source": [ - "[Pandas](https://pandas.pydata.org/) can be used to explore and query the sample metadata in various ways. E.g., here is a summary of the numbers of samples by species:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - }, - "id": "PpsTgviZQH_4", - "outputId": "197e8763-9b1c-480e-cfc0-9ab0891cde89" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Number of mosquito specimens by collection site and year.
  year20102011201420152016
longitudelatitudelocation     
102.73512.155Thmar Da2615000
104.9213.77Chean Mok006690
104.98213.667Preah Kleang0047936
106.99513.595Chamkar San0040110
107.02513.548Sayas003940
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "source": [ - "df_summary = df_samples.pivot_table(\n", - " index=[\"longitude\", \"latitude\", \"location\"], \n", - " columns=[\"year\"],\n", - " values=\"sample_id\", \n", - " aggfunc=len,\n", - " fill_value=0\n", - ")\n", - "df_summary.style.set_caption(\"Number of mosquito specimens by collection site and year.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hPhD-MauERQ0" - }, - "source": [ - "## Reference genome\n", - "\n", - "Sequence data in this study were aligned to the MINIMUS1 reference genome. This reference genome contains 678 contigs in total, but many contigs are small and not suitable for population genetic analyses. We therefore have included only SNP calls for the 40 largest contigs. The set of contigs used for SNP calling can be accessed as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-bhq4HCFERQ1", - "outputId": "c048cb11-eeae-4bd6-ef12-c8b0f3c67593" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "('KB663610',\n", - " 'KB663611',\n", - " 'KB663622',\n", - " 'KB663633',\n", - " 'KB663644',\n", - " 'KB663655',\n", - " 'KB663666',\n", - " 'KB663677',\n", - " 'KB663688',\n", - " 'KB663699',\n", - " 'KB663710',\n", - " 'KB663721',\n", - " 'KB663722',\n", - " 'KB663733',\n", - " 'KB663744',\n", - " 'KB663755',\n", - " 'KB663766',\n", - " 'KB663777',\n", - " 'KB663788',\n", - " 'KB663799',\n", - " 'KB663810',\n", - " 'KB663821',\n", - " 'KB663832',\n", - " 'KB663833',\n", - " 'KB663844',\n", - " 'KB663855',\n", - " 'KB663866',\n", - " 'KB663877',\n", - " 'KB663888',\n", - " 'KB663899',\n", - " 'KB663910',\n", - " 'KB663921',\n", - " 'KB663932',\n", - " 'KB663943',\n", - " 'KB663955',\n", - " 'KB664054',\n", - " 'KB664165',\n", - " 'KB664255',\n", - " 'KB664266',\n", - " 'KB664277')" - ] - }, - "metadata": {}, - "execution_count": 6 - } - ], - "source": [ - "amin1.contigs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7ajM0DKUERQ1" - }, - "source": [ - "For convenience, the reference genome sequence for any contig can be loaded as a [NumPy array](https://numpy.org/doc/stable/user/absolute_beginners.html), e.g.:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rBTsYhUSERQ1", - "outputId": "10d6549f-fc40-4da2-c98e-d4d9edd15591" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([b'T', b'T', b'C', ..., b'C', b'A', b'C'], dtype='|S1')" - ] - }, - "metadata": {}, - "execution_count": 7 - } - ], - "source": [ - "# load the reference sequence for a single contig as a numpy array\n", - "seq = amin1.genome_sequence(region=\"KB663610\").compute()\n", - "seq" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GByJ16LPERQ1", - "outputId": "e5c6c626-49e0-4115-85d9-4646a782588a" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "31626230" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ], - "source": [ - "# length of contig\n", - "len(seq)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "i-uFxYwaERQ2", - "outputId": "05c5798f-32e3-40d1-acf6-79d90cd97f8b" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "30141520" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ], - "source": [ - "# number of called bases in contig\n", - "np.sum((seq != b'N') & (seq != b'n'))" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "DZw8vyUJ0y5k" + }, + "source": [ + "# Amin1.0 cloud data access\n", + "\n", + "This page provides information about how to access data from the [Amin1.0 resource](intro) via Google Cloud. This includes sample metadata and single nucleotide polymorphism (SNP) calls.\n", + "\n", + "This notebook illustrates how to read data directly from the cloud, without having to first download any data locally. This notebook can be run from any computer, but will work best when run from a compute node within Google Cloud, because it will be physically closer to the data and so data transfer is faster. For example, this notebook can be run via [Google Colab](https://colab.research.google.com/) which is free interactive computing service running in the cloud.\n", + "\n", + "To launch this notebook in the cloud and run it for yourself, click the launch icon () at the top of the page and select one of the cloud computing services available.\n", + "\n", + "## Data hosting\n", + "\n", + "All data required for this notebook is hosted on Google Cloud Storage (GCS). Data are hosted in the `vo_amin_release` bucket, which is a multi-region bucket located in the United States. All data hosted in GCS are publicly accessible and do not require any authentication to access. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zn_-HkLIQH_0" + }, + "source": [ + "## Setup\n", + "\n", + "Running this notebook requires some Python packages to be installed. These packages can be installed via pip or conda. E.g.:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "wqHBq442QH_1", + "outputId": "480221f7-02f7-4c8c-d3e6-bb9484a04ce6", + "tags": [ + "hide-output" + ] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "Lvv-lFHJ-Um2" - }, - "source": [ - "## SNP calls\n", - "\n", - "We have called SNP genotypes in all samples at all positions in the genome where the reference allele is not \"N\". Data on the SNP positions, alleles, site filters and genotype calls for a given contig can be accessed as an [xarray Dataset](http://xarray.pydata.org/en/stable/user-guide/data-structures.html#dataset). E.g., access SNP calls for contig KB663610: " - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K |████████████████████████████████| 133 kB 6.3 MB/s \n", + "\u001b[K |████████████████████████████████| 5.7 MB 37.8 MB/s \n", + "\u001b[K |████████████████████████████████| 2.3 MB 45.4 MB/s \n", + "\u001b[K |████████████████████████████████| 153 kB 38.8 MB/s \n", + "\u001b[K |████████████████████████████████| 1.1 MB 46.9 MB/s \n", + "\u001b[K |████████████████████████████████| 144 kB 44.7 MB/s \n", + "\u001b[K |████████████████████████████████| 271 kB 54.7 MB/s \n", + "\u001b[K |████████████████████████████████| 94 kB 2.6 MB/s \n", + "\u001b[K |████████████████████████████████| 6.2 MB 41.1 MB/s \n", + "\u001b[?25h Building wheel for asciitree (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install -q malariagen_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "osgZ9pB7QH_1" + }, + "source": [ + "To make accessing these data more convenient, we've created the [malariagen_data Python package](https://github.com/malariagen/malariagen-data-python), which is available from PyPI. This is experimental so please let us know if you find any bugs or have any suggestions. \n", + "\n", + "Now import the packages we'll need to use here." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "970klnG1eu8N" + }, + "outputs": [], + "source": [ + "import malariagen_data\n", + "import numpy as np\n", + "import dask.array as da\n", + "from dask.diagnostics.progress import ProgressBar\n", + "import dask\n", + "dask.config.set(**{'array.slicing.split_large_chunks': False})\n", + "import allel" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jPqZ-LFPQH_2" + }, + "source": [ + "Data access from Google Cloud is set up with the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "mIsSaTuOQH_2" + }, + "outputs": [], + "source": [ + "amin1 = malariagen_data.Amin1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "78L85pli9HdO" + }, + "source": [ + "## Sample metadata\n", + "\n", + "Data about the samples that were sequenced to generate this data resource are available, including the time and place of collection, the gender of the specimen, and our call regarding the species of the specimen. These are organised by sample set.\n", + "\n", + "Load sample metadata into a [pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe):" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 641 }, + "id": "-V8nLGSaQH_4", + "outputId": "2c1fdbe0-7d63-4536-d345-d465f7f89e8b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 475 - }, - "id": "24I2b0ZrERQ2", - "outputId": "d88cbe81-b50b-4408-9bdd-1f91d67a2ec2" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset>\n",
-              "Dimensions:              (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n",
-              "Coordinates:\n",
-              "    variant_position     (variants) int32 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
-              "    variant_contig       (variants) uint8 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
-              "    sample_id            (samples) |S24 dask.array<chunksize=(302,), meta=np.ndarray>\n",
-              "Dimensions without coordinates: alleles, ploidy, samples, variants\n",
-              "Data variables:\n",
-              "    variant_allele       (variants, alleles) |S1 dask.array<chunksize=(524288, 1), meta=np.ndarray>\n",
-              "    variant_filter_pass  (variants) bool dask.array<chunksize=(941923,), meta=np.ndarray>\n",
-              "    call_genotype        (variants, samples, ploidy) int8 dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
-              "    call_GQ              (variants, samples) int8 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
-              "    call_MQ              (variants, samples) float32 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
-              "    call_AD              (variants, samples, alleles) int16 dask.array<chunksize=(300000, 50, 4), meta=np.ndarray>\n",
-              "    call_genotype_mask   (variants, samples, ploidy) bool dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
-              "Attributes:\n",
-              "    contigs:  ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K...
" - ], - "text/plain": [ - "\n", - "Dimensions: (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n", - "Coordinates:\n", - " variant_position (variants) int32 dask.array\n", - " variant_contig (variants) uint8 dask.array\n", - " sample_id (samples) |S24 dask.array\n", - "Dimensions without coordinates: alleles, ploidy, samples, variants\n", - "Data variables:\n", - " variant_allele (variants, alleles) |S1 dask.array\n", - " variant_filter_pass (variants) bool dask.array\n", - " call_genotype (variants, samples, ploidy) int8 dask.array\n", - " call_GQ (variants, samples) int8 dask.array\n", - " call_MQ (variants, samples) float32 dask.array\n", - " call_AD (variants, samples, alleles) int16 dask.array\n", - " call_genotype_mask (variants, samples, ploidy) bool dask.array\n", - "Attributes:\n", - " contigs: ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K..." - ] - }, - "metadata": {}, - "execution_count": 10 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idoriginal_sample_idsanger_sample_idpartner_sample_idcontributorcountrylocationyearmonthlatitudelongitudeseasonPCA_cohortcohortsubsampled_cohort
0VBS09378-4248STDY7308980VBS093784248STDY7308980CB-2-00264Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVNaN
1VBS09382-4248STDY7308981VBS093824248STDY7308981CB-2-00258Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVNaN
2VBS09397-4248STDY7308982VBS093974248STDY7308982CB-2-00384Brandy St. LaurentCambodiaPreah Kleang2016313.667104.982Feb-Apr (late dry)APVPV
3VBS09460-4248STDY7308986VBS094604248STDY7308986CB-2-02960Brandy St. LaurentCambodiaPreah Kleang2016613.667104.982May-Jul (early wet)APVNaN
4VBS09466-4248STDY7308989VBS094664248STDY7308989CB-2-04070Brandy St. LaurentCambodiaPreah Kleang20161113.667104.982Nov-Jan (early dry)APVNaN
................................................
297VBS16624-4248STDY7918667VBS166244248STDY7918667KV-32-01591Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
298VBS16625-4248STDY7918668VBS166254248STDY7918668KV-32-01499Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
299VBS16626-4248STDY7918669VBS166264248STDY7918669KV-32-01465Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)BRK1RK1
300VBS16628-4248STDY7918670VBS166284248STDY7918670KV-32-01454Brandy St. LaurentCambodiaSayas2014613.548107.025May-Jul (early wet)CRK2RK2
301VBS16630-4248STDY7918671VBS166304248STDY7918671KV-31-01949Brandy St. LaurentCambodiaChamkar San2014413.595106.995Feb-Apr (late dry)BRK1RK1
\n", + "

302 rows × 15 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "ds_snps = amin1.snp_calls(region=\"KB663610\", site_mask=False)\n", - "ds_snps" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oKkkQty-ERQ2" - }, - "source": [ - "The arrays within this dataset are backed by [Dask arrays](https://docs.dask.org/en/stable/array.html), and can be accessed as shown below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hSqXiY93ERQ3" - }, - "source": [ - "### SNP positions and alleles" + "text/plain": [ + " sample_id original_sample_id ... cohort subsampled_cohort\n", + "0 VBS09378-4248STDY7308980 VBS09378 ... PV NaN\n", + "1 VBS09382-4248STDY7308981 VBS09382 ... PV NaN\n", + "2 VBS09397-4248STDY7308982 VBS09397 ... PV PV\n", + "3 VBS09460-4248STDY7308986 VBS09460 ... PV NaN\n", + "4 VBS09466-4248STDY7308989 VBS09466 ... PV NaN\n", + ".. ... ... ... ... ...\n", + "297 VBS16624-4248STDY7918667 VBS16624 ... RK2 RK2\n", + "298 VBS16625-4248STDY7918668 VBS16625 ... RK2 RK2\n", + "299 VBS16626-4248STDY7918669 VBS16626 ... RK1 RK1\n", + "300 VBS16628-4248STDY7918670 VBS16628 ... RK2 RK2\n", + "301 VBS16630-4248STDY7918671 VBS16630 ... RK1 RK1\n", + "\n", + "[302 rows x 15 columns]" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_samples = amin1.sample_metadata()\n", + "df_samples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ssCdOykfQH_4" + }, + "source": [ + "The `sample_id` column gives the sample identifier used throughout all analyses.\n", + "\n", + "The `country`, `location`, `latitude` and `longitude` columns give the location where the specimen was collected.\n", + "\n", + "The `year` and `month` columns give the approximate date when the specimen was collected.\n", + "\n", + "The `cohort` column gives an assignment of individual mosquitoes to populations based on location of sampling and genetic population structure. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9APw05D5gAQ9" + }, + "source": [ + "[Pandas](https://pandas.pydata.org/) can be used to explore and query the sample metadata in various ways. E.g., here is a summary of the numbers of samples by species:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 }, + "id": "PpsTgviZQH_4", + "outputId": "197e8763-9b1c-480e-cfc0-9ab0891cde89" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 130 - }, - "id": "KE-JjBGFERQ3", - "outputId": "7fba7bd9-21fc-46cf-ffb1-a922693f1d60" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 120.57 MB 2.10 MB
Shape (30141520,) (524288,)
Count 59 Tasks 58 Chunks
Type int32 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 30141520\n", - " 1\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array" - ] - }, - "metadata": {}, - "execution_count": 11 - } + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Number of mosquito specimens by collection site and year.
  year20102011201420152016
longitudelatitudelocation     
102.73512.155Thmar Da2615000
104.9213.77Chean Mok006690
104.98213.667Preah Kleang0047936
106.99513.595Chamkar San0040110
107.02513.548Sayas003940
\n" ], - "source": [ - "# SNP positions (1-based)\n", - "pos = ds_snps['variant_position'].data\n", - "pos" + "text/plain": [ + "" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_summary = df_samples.pivot_table(\n", + " index=[\"longitude\", \"latitude\", \"location\"], \n", + " columns=[\"year\"],\n", + " values=\"sample_id\", \n", + " aggfunc=len,\n", + " fill_value=0\n", + ")\n", + "df_summary.style.set_caption(\"Number of mosquito specimens by collection site and year.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hPhD-MauERQ0" + }, + "source": [ + "## Reference genome\n", + "\n", + "Sequence data in this study were aligned to the MINIMUS1 reference genome. This reference genome contains 678 contigs in total, but many contigs are small and not suitable for population genetic analyses. We therefore have included only SNP calls for the 40 largest contigs. The set of contigs used for SNP calling can be accessed as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "-bhq4HCFERQ1", + "outputId": "c048cb11-eeae-4bd6-ef12-c8b0f3c67593" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "No8xjezlERQ3", - "outputId": "1af65416-4ec1-4329-ce53-1fa82e2f6a4e" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=int32)" - ] - }, - "metadata": {}, - "execution_count": 12 - } - ], - "source": [ - "# read first 10 SNP positions into a numpy array\n", - "p = pos[:10].compute()\n", - "p" + "data": { + "text/plain": [ + "('KB663610',\n", + " 'KB663611',\n", + " 'KB663622',\n", + " 'KB663633',\n", + " 'KB663644',\n", + " 'KB663655',\n", + " 'KB663666',\n", + " 'KB663677',\n", + " 'KB663688',\n", + " 'KB663699',\n", + " 'KB663710',\n", + " 'KB663721',\n", + " 'KB663722',\n", + " 'KB663733',\n", + " 'KB663744',\n", + " 'KB663755',\n", + " 'KB663766',\n", + " 'KB663777',\n", + " 'KB663788',\n", + " 'KB663799',\n", + " 'KB663810',\n", + " 'KB663821',\n", + " 'KB663832',\n", + " 'KB663833',\n", + " 'KB663844',\n", + " 'KB663855',\n", + " 'KB663866',\n", + " 'KB663877',\n", + " 'KB663888',\n", + " 'KB663899',\n", + " 'KB663910',\n", + " 'KB663921',\n", + " 'KB663932',\n", + " 'KB663943',\n", + " 'KB663955',\n", + " 'KB664054',\n", + " 'KB664165',\n", + " 'KB664255',\n", + " 'KB664266',\n", + " 'KB664277')" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "amin1.contigs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ajM0DKUERQ1" + }, + "source": [ + "For convenience, the reference genome sequence for any contig can be loaded as a [NumPy array](https://numpy.org/doc/stable/user/absolute_beginners.html), e.g.:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "rBTsYhUSERQ1", + "outputId": "10d6549f-fc40-4da2-c98e-d4d9edd15591" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 197 - }, - "id": "A3DdmWz3ERQ3", - "outputId": "7115f064-b386-431a-db7f-86761d9655f0" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 120.57 MB 1.57 MB
Shape (30141520, 4) (524288, 3)
Count 292 Tasks 116 Chunks
Type |S1 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 4\n", - " 30141520\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array" - ] - }, - "metadata": {}, - "execution_count": 13 - } - ], - "source": [ - "# SNP alleles (first column is the reference allele)\n", - "alleles = ds_snps['variant_allele'].data\n", - "alleles" + "data": { + "text/plain": [ + "array([b'T', b'T', b'C', ..., b'C', b'A', b'C'], dtype='|S1')" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load the reference sequence for a single contig as a numpy array\n", + "seq = amin1.genome_sequence(region=\"KB663610\").compute()\n", + "seq" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "GByJ16LPERQ1", + "outputId": "e5c6c626-49e0-4115-85d9-4646a782588a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Pcr7eN8FERQ4", - "outputId": "ebd3c6fe-6554-4428-c28d-94953a9570ed" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([[b'T', b'C', b'A', b'G'],\n", - " [b'T', b'C', b'A', b'G'],\n", - " [b'C', b'A', b'G', b'T'],\n", - " [b'G', b'C', b'A', b'T'],\n", - " [b'T', b'C', b'A', b'G'],\n", - " [b'C', b'A', b'G', b'T'],\n", - " [b'A', b'C', b'G', b'T'],\n", - " [b'T', b'C', b'A', b'G'],\n", - " [b'T', b'C', b'A', b'G'],\n", - " [b'G', b'C', b'A', b'T']], dtype='|S1')" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ], - "source": [ - "# read first 10 SNP alleles into a numpy array\n", - "a = alleles[:10, :].compute()\n", - "a" + "data": { + "text/plain": [ + "31626230" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# length of contig\n", + "len(seq)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "i-uFxYwaERQ2", + "outputId": "05c5798f-32e3-40d1-acf6-79d90cd97f8b" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "-xWnFhOXERQ4" - }, - "source": [ - "### Site filters\n", - "\n", - "SNP calling is not always reliable, and we have created site filters to allow excluding low quality SNPs. For each contig, a \"filter_pass\" Boolean mask is available, where `True` indicates that the site passed the filter and is accessible to high quality SNP calling. " + "data": { + "text/plain": [ + "30141520" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of called bases in contig\n", + "np.sum((seq != b'N') & (seq != b'n'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lvv-lFHJ-Um2" + }, + "source": [ + "## SNP calls\n", + "\n", + "We have called SNP genotypes in all samples at all positions in the genome where the reference allele is not \"N\". Data on the SNP positions, alleles, site filters and genotype calls for a given contig can be accessed as an [xarray Dataset](http://xarray.pydata.org/en/stable/user-guide/data-structures.html#dataset). E.g., access SNP calls for contig KB663610: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 475 }, + "id": "24I2b0ZrERQ2", + "outputId": "d88cbe81-b50b-4408-9bdd-1f91d67a2ec2" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 130 - }, - "id": "n97drvKwERQ4", - "outputId": "94b92427-239d-4c33-e44c-f5ce17ab662b" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 30.14 MB 941.92 kB
Shape (30141520,) (941923,)
Count 33 Tasks 32 Chunks
Type bool numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 30141520\n", - " 1\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array" - ] - }, - "metadata": {}, - "execution_count": 15 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:              (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n",
+       "Coordinates:\n",
+       "    variant_position     (variants) int32 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
+       "    variant_contig       (variants) uint8 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
+       "    sample_id            (samples) |S24 dask.array<chunksize=(302,), meta=np.ndarray>\n",
+       "Dimensions without coordinates: alleles, ploidy, samples, variants\n",
+       "Data variables:\n",
+       "    variant_allele       (variants, alleles) |S1 dask.array<chunksize=(524288, 1), meta=np.ndarray>\n",
+       "    variant_filter_pass  (variants) bool dask.array<chunksize=(941923,), meta=np.ndarray>\n",
+       "    call_genotype        (variants, samples, ploidy) int8 dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
+       "    call_GQ              (variants, samples) int8 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
+       "    call_MQ              (variants, samples) float32 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
+       "    call_AD              (variants, samples, alleles) int16 dask.array<chunksize=(300000, 50, 4), meta=np.ndarray>\n",
+       "    call_genotype_mask   (variants, samples, ploidy) bool dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    contigs:  ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K...
" ], - "source": [ - "# site filters\n", - "filter_pass = ds_snps['variant_filter_pass'].data\n", - "filter_pass" + "text/plain": [ + "\n", + "Dimensions: (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n", + "Coordinates:\n", + " variant_position (variants) int32 dask.array\n", + " variant_contig (variants) uint8 dask.array\n", + " sample_id (samples) |S24 dask.array\n", + "Dimensions without coordinates: alleles, ploidy, samples, variants\n", + "Data variables:\n", + " variant_allele (variants, alleles) |S1 dask.array\n", + " variant_filter_pass (variants) bool dask.array\n", + " call_genotype (variants, samples, ploidy) int8 dask.array\n", + " call_GQ (variants, samples) int8 dask.array\n", + " call_MQ (variants, samples) float32 dask.array\n", + " call_AD (variants, samples, alleles) int16 dask.array\n", + " call_genotype_mask (variants, samples, ploidy) bool dask.array\n", + "Attributes:\n", + " contigs: ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K..." ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_snps = amin1.snp_calls(region=\"KB663610\", site_mask=False)\n", + "ds_snps" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oKkkQty-ERQ2" + }, + "source": [ + "The arrays within this dataset are backed by [Dask arrays](https://docs.dask.org/en/stable/array.html), and can be accessed as shown below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hSqXiY93ERQ3" + }, + "source": [ + "### SNP positions and alleles" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 }, + "id": "KE-JjBGFERQ3", + "outputId": "7fba7bd9-21fc-46cf-ffb1-a922693f1d60" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3mARZxOeERQ4", - "outputId": "c5b7d962-7b34-4e43-80f7-aea8cc9d666c" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([False, False, False, False, False, False, False, False, False,\n", - " False])" - ] - }, - "metadata": {}, - "execution_count": 16 - } + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 120.57 MB 2.10 MB
Shape (30141520,) (524288,)
Count 59 Tasks 58 Chunks
Type int32 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 30141520\n", + " 1\n", + "\n", + "
" ], - "source": [ - "# load site filters for first 10 SNPs\n", - "f = filter_pass[:10].compute()\n", - "f" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yVX0PJlGgAQ_" - }, - "source": [ - "Note that we have chosen to genotype all samples at all sites in the genome, assuming all possible SNP alleles. Not all of these alternate alleles will actually have been observed in the samples. To determine which sites and alleles are segregating, an allele count can be performed over the samples you are interested in. See the example below. " + "text/plain": [ + "dask.array" ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# SNP positions (1-based)\n", + "pos = ds_snps['variant_position'].data\n", + "pos" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "No8xjezlERQ3", + "outputId": "1af65416-4ec1-4329-ce53-1fa82e2f6a4e" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "sMnfrmCNBzW8" - }, - "source": [ - "### SNP genotypes\n", - "\n", - "SNP genotypes for individual samples are available. Genotypes are stored as a three-dimensional array, where the first dimension corresponds to genomic positions, the second dimension is samples, and the third dimension is ploidy (2). Values coded as integers, where -1 represents a missing value, 0 represents the reference allele, and 1, 2, and 3 represent alternate alleles.\n", - "\n", - "SNP genotypes can be accessed as dask arrays as shown in the example below." + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=int32)" ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read first 10 SNP positions into a numpy array\n", + "p = pos[:10].compute()\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 197 }, + "id": "A3DdmWz3ERQ3", + "outputId": "7115f064-b386-431a-db7f-86761d9655f0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 172 - }, - "id": "Xk1-s2KDERQ5", - "outputId": "eb6eb58c-2731-493c-b8ea-ac02d123d14d" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 18.21 GB 30.00 MB
Shape (30141520, 302, 2) (300000, 50, 2)
Count 708 Tasks 707 Chunks
Type int8 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 2\n", - " 302\n", - " 30141520\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array" - ] - }, - "metadata": {}, - "execution_count": 17 - } + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 120.57 MB 1.57 MB
Shape (30141520, 4) (524288, 3)
Count 292 Tasks 116 Chunks
Type |S1 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 4\n", + " 30141520\n", + "\n", + "
" ], - "source": [ - "gt = ds_snps['call_genotype'].data\n", - "gt" + "text/plain": [ + "dask.array" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# SNP alleles (first column is the reference allele)\n", + "alleles = ds_snps['variant_allele'].data\n", + "alleles" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "Pcr7eN8FERQ4", + "outputId": "ebd3c6fe-6554-4428-c28d-94953a9570ed" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "FSYizL3gERQ5" - }, - "source": [ - "Note that the columns of this array (second dimension) match the rows in the sample metadata, if the same sample sets were loaded. I.e.:" + "data": { + "text/plain": [ + "array([[b'T', b'C', b'A', b'G'],\n", + " [b'T', b'C', b'A', b'G'],\n", + " [b'C', b'A', b'G', b'T'],\n", + " [b'G', b'C', b'A', b'T'],\n", + " [b'T', b'C', b'A', b'G'],\n", + " [b'C', b'A', b'G', b'T'],\n", + " [b'A', b'C', b'G', b'T'],\n", + " [b'T', b'C', b'A', b'G'],\n", + " [b'T', b'C', b'A', b'G'],\n", + " [b'G', b'C', b'A', b'T']], dtype='|S1')" ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read first 10 SNP alleles into a numpy array\n", + "a = alleles[:10, :].compute()\n", + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-xWnFhOXERQ4" + }, + "source": [ + "### Site filters\n", + "\n", + "SNP calling is not always reliable, and we have created site filters to allow excluding low quality SNPs. For each contig, a \"filter_pass\" Boolean mask is available, where `True` indicates that the site passed the filter and is accessible to high quality SNP calling. " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 }, + "id": "n97drvKwERQ4", + "outputId": "94b92427-239d-4c33-e44c-f5ce17ab662b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bR753FhnERQ5", - "outputId": "0c3de15e-058d-489f-ce35-2e117b6f8b37" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "True" - ] - }, - "metadata": {}, - "execution_count": 18 - } + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 30.14 MB 941.92 kB
Shape (30141520,) (941923,)
Count 33 Tasks 32 Chunks
Type bool numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 30141520\n", + " 1\n", + "\n", + "
" ], - "source": [ - "len(df_samples) == gt.shape[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lp50lmyOERQ5" - }, - "source": [ - "You can use this correspondance to apply further subsetting operations to the genotypes by querying the sample metadata. E.g.:" + "text/plain": [ + "dask.array" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# site filters\n", + "filter_pass = ds_snps['variant_filter_pass'].data\n", + "filter_pass" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "3mARZxOeERQ4", + "outputId": "c5b7d962-7b34-4e43-80f7-aea8cc9d666c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_dHG6HxtERQ6", - "outputId": "4d5d8ddf-75f7-42bf-de2b-35892e24017a" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array(['PV', nan, 'RK2', 'RK1', 'TD'], dtype=object)" - ] - }, - "metadata": {}, - "execution_count": 19 - } - ], - "source": [ - "df_samples.cohort.unique()" + "data": { + "text/plain": [ + "array([False, False, False, False, False, False, False, False, False,\n", + " False])" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load site filters for first 10 SNPs\n", + "f = filter_pass[:10].compute()\n", + "f" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yVX0PJlGgAQ_" + }, + "source": [ + "Note that we have chosen to genotype all samples at all sites in the genome, assuming all possible SNP alleles. Not all of these alternate alleles will actually have been observed in the samples. To determine which sites and alleles are segregating, an allele count can be performed over the samples you are interested in. See the example below. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sMnfrmCNBzW8" + }, + "source": [ + "### SNP genotypes\n", + "\n", + "SNP genotypes for individual samples are available. Genotypes are stored as a three-dimensional array, where the first dimension corresponds to genomic positions, the second dimension is samples, and the third dimension is ploidy (2). Values coded as integers, where -1 represents a missing value, 0 represents the reference allele, and 1, 2, and 3 represent alternate alleles.\n", + "\n", + "SNP genotypes can be accessed as dask arrays as shown in the example below." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 172 }, + "id": "Xk1-s2KDERQ5", + "outputId": "eb6eb58c-2731-493c-b8ea-ac02d123d14d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "Q3kgf_mkERQ6", - "outputId": "c7a241ff-0499-4153-bc72-24925d8548b6" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "found 41 samples\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 2.47 GB 20.40 MB
Shape (30141520, 41, 2) (300000, 34, 2)
Count 910 Tasks 202 Chunks
Type int8 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 2\n", - " 41\n", - " 30141520\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array" - ] - }, - "metadata": {}, - "execution_count": 20 - } + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 18.21 GB 30.00 MB
Shape (30141520, 302, 2) (300000, 50, 2)
Count 708 Tasks 707 Chunks
Type int8 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 2\n", + " 302\n", + " 30141520\n", + "\n", + "
" ], - "source": [ - "# select samples from the Thmar Da cohort\n", - "loc_cohort = df_samples.eval(\"cohort == 'TD'\").values\n", - "print(f\"found {np.count_nonzero(loc_cohort)} samples\")\n", - "gt_cohort = da.compress(loc_cohort, gt, axis=1)\n", - "gt_cohort" + "text/plain": [ + "dask.array" ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gt = ds_snps['call_genotype'].data\n", + "gt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FSYizL3gERQ5" + }, + "source": [ + "Note that the columns of this array (second dimension) match the rows in the sample metadata, if the same sample sets were loaded. I.e.:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "bR753FhnERQ5", + "outputId": "0c3de15e-058d-489f-ce35-2e117b6f8b37" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "Oi-5lGIxERQ6" - }, - "source": [ - "Data can be read into memory as numpy arrays, e.g., read genotypes for the first 5 SNPs and the first 3 samples:" + "data": { + "text/plain": [ + "True" ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_samples) == gt.shape[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lp50lmyOERQ5" + }, + "source": [ + "You can use this correspondance to apply further subsetting operations to the genotypes by querying the sample metadata. E.g.:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "_dHG6HxtERQ6", + "outputId": "4d5d8ddf-75f7-42bf-de2b-35892e24017a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yEPeVHn1ERQ6", - "outputId": "7ac00a45-c876-4f37-fd32-5229deb68f7a" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([[[0, 0],\n", - " [0, 0],\n", - " [0, 0]],\n", - "\n", - " [[0, 0],\n", - " [0, 0],\n", - " [0, 0]],\n", - "\n", - " [[0, 0],\n", - " [0, 0],\n", - " [0, 0]],\n", - "\n", - " [[0, 0],\n", - " [0, 0],\n", - " [0, 0]],\n", - "\n", - " [[0, 0],\n", - " [0, 0],\n", - " [0, 0]]], dtype=int8)" - ] - }, - "metadata": {}, - "execution_count": 21 - } - ], - "source": [ - "g = gt[:5, :3, :].compute()\n", - "g" + "data": { + "text/plain": [ + "array(['PV', nan, 'RK2', 'RK1', 'TD'], dtype=object)" ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_samples.cohort.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 }, + "id": "Q3kgf_mkERQ6", + "outputId": "c7a241ff-0499-4153-bc72-24925d8548b6" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "vKwalCkoERQ6" - }, - "source": [ - "If you want to work with the genotype calls, you may find it convenient to use [scikit-allel](http://scikit-allel.readthedocs.org/).\n", - "E.g., the code below sets up a genotype array." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "found 41 samples\n" + ] }, { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 205 - }, - "id": "INPTILKQERQ6", - "outputId": "4d37d24a-18d7-41d7-a3d9-f9ee1625c0f8" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
<GenotypeDaskArray shape=(30141520, 302, 2) dtype=int8>
01234...297298299300301
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
30141517./../../../../...../../../../../.
30141518./../../../../...../../../../../.
30141519./../../../../...../../../../../.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 22 - } + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 2.47 GB 20.40 MB
Shape (30141520, 41, 2) (300000, 34, 2)
Count 910 Tasks 202 Chunks
Type int8 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 2\n", + " 41\n", + " 30141520\n", + "\n", + "
" ], - "source": [ - "# use the scikit-allel wrapper class for genotype calls\n", - "gtd = allel.GenotypeDaskArray(ds_snps['call_genotype'].data)\n", - "gtd" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "arZZ_OcPoSQV" - }, - "source": [ - "## Example computations\n", - "\n", - "Below are some examples of simple computations that can be run with these data." + "text/plain": [ + "dask.array" ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# select samples from the Thmar Da cohort\n", + "loc_cohort = df_samples.eval(\"cohort == 'TD'\").values\n", + "print(f\"found {np.count_nonzero(loc_cohort)} samples\")\n", + "gt_cohort = da.compress(loc_cohort, gt, axis=1)\n", + "gt_cohort" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Oi-5lGIxERQ6" + }, + "source": [ + "Data can be read into memory as numpy arrays, e.g., read genotypes for the first 5 SNPs and the first 3 samples:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "yEPeVHn1ERQ6", + "outputId": "7ac00a45-c876-4f37-fd32-5229deb68f7a" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "8vSriNMMERQ7" - }, - "source": [ - "### Counting sites passing filters\n", - "\n", - "For each of the contigs for which SNP calling was performed, count the number of sites and the number passing site filters." + "data": { + "text/plain": [ + "array([[[0, 0],\n", + " [0, 0],\n", + " [0, 0]],\n", + "\n", + " [[0, 0],\n", + " [0, 0],\n", + " [0, 0]],\n", + "\n", + " [[0, 0],\n", + " [0, 0],\n", + " [0, 0]],\n", + "\n", + " [[0, 0],\n", + " [0, 0],\n", + " [0, 0]],\n", + "\n", + " [[0, 0],\n", + " [0, 0],\n", + " [0, 0]]], dtype=int8)" ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g = gt[:5, :3, :].compute()\n", + "g" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vKwalCkoERQ6" + }, + "source": [ + "If you want to work with the genotype calls, you may find it convenient to use [scikit-allel](http://scikit-allel.readthedocs.org/).\n", + "E.g., the code below sets up a genotype array." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 205 }, + "id": "INPTILKQERQ6", + "outputId": "4d37d24a-18d7-41d7-a3d9-f9ee1625c0f8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GMNapZKNERQ7", - "outputId": "8ced350d-00c3-4a88-fa59-aded5f80621f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "KB663610: 22,857,027 out of 30,141,520 (76%) sites pass filters\n", - "KB663611: 3,737,822 out of 5,453,172 (69%) sites pass filters\n", - "KB663622: 4,388,762 out of 5,461,660 (80%) sites pass filters\n", - "KB663633: 4,215,569 out of 5,450,627 (77%) sites pass filters\n", - "KB663644: 4,221,032 out of 5,413,357 (78%) sites pass filters\n", - "KB663655: 2,916,639 out of 3,744,733 (78%) sites pass filters\n", - "KB663666: 2,501,655 out of 3,719,057 (67%) sites pass filters\n", - "KB663677: 2,586,150 out of 3,624,725 (71%) sites pass filters\n", - "KB663688: 2,742,023 out of 3,544,647 (77%) sites pass filters\n", - "KB663699: 1,534,228 out of 1,873,197 (82%) sites pass filters\n", - "KB663710: 1,213,646 out of 1,685,057 (72%) sites pass filters\n", - "KB663721: 15,452,041 out of 20,762,843 (74%) sites pass filters\n", - "KB663722: 1,209,792 out of 1,659,122 (73%) sites pass filters\n", - "KB663733: 1,062,674 out of 1,537,061 (69%) sites pass filters\n", - "KB663744: 1,240,722 out of 1,464,294 (85%) sites pass filters\n", - "KB663755: 1,099,411 out of 1,252,480 (88%) sites pass filters\n", - "KB663766: 1,178,051 out of 1,313,725 (90%) sites pass filters\n", - "KB663777: 1,012,298 out of 1,306,825 (77%) sites pass filters\n", - "KB663788: 1,125,292 out of 1,246,996 (90%) sites pass filters\n", - "KB663799: 966,257 out of 1,234,260 (78%) sites pass filters\n", - "KB663810: 800,823 out of 928,103 (86%) sites pass filters\n", - "KB663821: 757,864 out of 875,898 (87%) sites pass filters\n", - "KB663832: 11,641,571 out of 16,000,346 (73%) sites pass filters\n", - "KB663833: 395,479 out of 554,130 (71%) sites pass filters\n", - "KB663844: 482,803 out of 602,452 (80%) sites pass filters\n", - "KB663855: 356,534 out of 428,439 (83%) sites pass filters\n", - "KB663866: 437,007 out of 472,977 (92%) sites pass filters\n", - "KB663877: 172,685 out of 273,855 (63%) sites pass filters\n", - "KB663888: 171,639 out of 221,033 (78%) sites pass filters\n", - "KB663899: 29,212 out of 87,710 (33%) sites pass filters\n", - "KB663910: 105,536 out of 150,741 (70%) sites pass filters\n", - "KB663921: 29,136 out of 87,604 (33%) sites pass filters\n", - "KB663932: 100,838 out of 133,376 (76%) sites pass filters\n", - "KB663943: 10,913,259 out of 14,689,009 (74%) sites pass filters\n", - "KB663955: 24,703 out of 55,494 (45%) sites pass filters\n", - "KB664054: 8,476,484 out of 11,726,578 (72%) sites pass filters\n", - "KB664165: 7,711,730 out of 9,974,421 (77%) sites pass filters\n", - "KB664255: 6,529,746 out of 9,221,028 (71%) sites pass filters\n", - "KB664266: 6,594,776 out of 8,849,743 (75%) sites pass filters\n", - "KB664277: 5,168,159 out of 6,068,897 (85%) sites pass filters\n" - ] - } + "data": { + "text/html": [ + "
<GenotypeDaskArray shape=(30141520, 302, 2) dtype=int8>
01234...297298299300301
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
30141517./../../../../...../../../../../.
30141518./../../../../...../../../../../.
30141519./../../../../...../../../../../.
" ], - "source": [ - "for contig in amin1.contigs:\n", - " ds_snps = amin1.snp_calls(region=contig)\n", - " filter_pass = ds_snps['variant_filter_pass'].data\n", - " n_sites = ds_snps.dims['variants']\n", - " n_pass = filter_pass.sum().compute()\n", - " print(f\"{contig}: {n_pass:,} out of {n_sites:,} ({n_pass/n_sites:.0%}) sites pass filters\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zrAMwuz5ERQ7" - }, - "source": [ - "### Counting segregating sites\n", - "\n", - "Count the number of segregating SNPs that also pass site filters, for a single contig." + "text/plain": [ + "" ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# use the scikit-allel wrapper class for genotype calls\n", + "gtd = allel.GenotypeDaskArray(ds_snps['call_genotype'].data)\n", + "gtd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "arZZ_OcPoSQV" + }, + "source": [ + "## Example computations\n", + "\n", + "Below are some examples of simple computations that can be run with these data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8vSriNMMERQ7" + }, + "source": [ + "### Counting sites passing filters\n", + "\n", + "For each of the contigs for which SNP calling was performed, count the number of sites and the number passing site filters." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "GMNapZKNERQ7", + "outputId": "8ced350d-00c3-4a88-fa59-aded5f80621f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 475 - }, - "id": "-9ykBE7yERQ7", - "outputId": "85edb63e-2ebf-4d41-d41c-9a97eb76747b" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset>\n",
-              "Dimensions:              (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n",
-              "Coordinates:\n",
-              "    variant_position     (variants) int32 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
-              "    variant_contig       (variants) uint8 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
-              "    sample_id            (samples) |S24 dask.array<chunksize=(302,), meta=np.ndarray>\n",
-              "Dimensions without coordinates: alleles, ploidy, samples, variants\n",
-              "Data variables:\n",
-              "    variant_allele       (variants, alleles) |S1 dask.array<chunksize=(524288, 1), meta=np.ndarray>\n",
-              "    variant_filter_pass  (variants) bool dask.array<chunksize=(941923,), meta=np.ndarray>\n",
-              "    call_genotype        (variants, samples, ploidy) int8 dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
-              "    call_GQ              (variants, samples) int8 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
-              "    call_MQ              (variants, samples) float32 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
-              "    call_AD              (variants, samples, alleles) int16 dask.array<chunksize=(300000, 50, 4), meta=np.ndarray>\n",
-              "    call_genotype_mask   (variants, samples, ploidy) bool dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
-              "Attributes:\n",
-              "    contigs:  ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K...
" - ], - "text/plain": [ - "\n", - "Dimensions: (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n", - "Coordinates:\n", - " variant_position (variants) int32 dask.array\n", - " variant_contig (variants) uint8 dask.array\n", - " sample_id (samples) |S24 dask.array\n", - "Dimensions without coordinates: alleles, ploidy, samples, variants\n", - "Data variables:\n", - " variant_allele (variants, alleles) |S1 dask.array\n", - " variant_filter_pass (variants) bool dask.array\n", - " call_genotype (variants, samples, ploidy) int8 dask.array\n", - " call_GQ (variants, samples) int8 dask.array\n", - " call_MQ (variants, samples) float32 dask.array\n", - " call_AD (variants, samples, alleles) int16 dask.array\n", - " call_genotype_mask (variants, samples, ploidy) bool dask.array\n", - "Attributes:\n", - " contigs: ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K..." - ] - }, - "metadata": {}, - "execution_count": 24 - } - ], - "source": [ - "# choose contig\n", - "region = \"KB663610\"\n", - "\n", - "# access SNP data\n", - "ds_snps = amin1.snp_calls(region=region)\n", - "ds_snps" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "KB663610: 22,857,027 out of 30,141,520 (76%) sites pass filters\n", + "KB663611: 3,737,822 out of 5,453,172 (69%) sites pass filters\n", + "KB663622: 4,388,762 out of 5,461,660 (80%) sites pass filters\n", + "KB663633: 4,215,569 out of 5,450,627 (77%) sites pass filters\n", + "KB663644: 4,221,032 out of 5,413,357 (78%) sites pass filters\n", + "KB663655: 2,916,639 out of 3,744,733 (78%) sites pass filters\n", + "KB663666: 2,501,655 out of 3,719,057 (67%) sites pass filters\n", + "KB663677: 2,586,150 out of 3,624,725 (71%) sites pass filters\n", + "KB663688: 2,742,023 out of 3,544,647 (77%) sites pass filters\n", + "KB663699: 1,534,228 out of 1,873,197 (82%) sites pass filters\n", + "KB663710: 1,213,646 out of 1,685,057 (72%) sites pass filters\n", + "KB663721: 15,452,041 out of 20,762,843 (74%) sites pass filters\n", + "KB663722: 1,209,792 out of 1,659,122 (73%) sites pass filters\n", + "KB663733: 1,062,674 out of 1,537,061 (69%) sites pass filters\n", + "KB663744: 1,240,722 out of 1,464,294 (85%) sites pass filters\n", + "KB663755: 1,099,411 out of 1,252,480 (88%) sites pass filters\n", + "KB663766: 1,178,051 out of 1,313,725 (90%) sites pass filters\n", + "KB663777: 1,012,298 out of 1,306,825 (77%) sites pass filters\n", + "KB663788: 1,125,292 out of 1,246,996 (90%) sites pass filters\n", + "KB663799: 966,257 out of 1,234,260 (78%) sites pass filters\n", + "KB663810: 800,823 out of 928,103 (86%) sites pass filters\n", + "KB663821: 757,864 out of 875,898 (87%) sites pass filters\n", + "KB663832: 11,641,571 out of 16,000,346 (73%) sites pass filters\n", + "KB663833: 395,479 out of 554,130 (71%) sites pass filters\n", + "KB663844: 482,803 out of 602,452 (80%) sites pass filters\n", + "KB663855: 356,534 out of 428,439 (83%) sites pass filters\n", + "KB663866: 437,007 out of 472,977 (92%) sites pass filters\n", + "KB663877: 172,685 out of 273,855 (63%) sites pass filters\n", + "KB663888: 171,639 out of 221,033 (78%) sites pass filters\n", + "KB663899: 29,212 out of 87,710 (33%) sites pass filters\n", + "KB663910: 105,536 out of 150,741 (70%) sites pass filters\n", + "KB663921: 29,136 out of 87,604 (33%) sites pass filters\n", + "KB663932: 100,838 out of 133,376 (76%) sites pass filters\n", + "KB663943: 10,913,259 out of 14,689,009 (74%) sites pass filters\n", + "KB663955: 24,703 out of 55,494 (45%) sites pass filters\n", + "KB664054: 8,476,484 out of 11,726,578 (72%) sites pass filters\n", + "KB664165: 7,711,730 out of 9,974,421 (77%) sites pass filters\n", + "KB664255: 6,529,746 out of 9,221,028 (71%) sites pass filters\n", + "KB664266: 6,594,776 out of 8,849,743 (75%) sites pass filters\n", + "KB664277: 5,168,159 out of 6,068,897 (85%) sites pass filters\n" + ] + } + ], + "source": [ + "for contig in amin1.contigs:\n", + " ds_snps = amin1.snp_calls(region=contig)\n", + " filter_pass = ds_snps['variant_filter_pass'].data\n", + " n_sites = ds_snps.dims['variants']\n", + " n_pass = filter_pass.sum().compute()\n", + " print(f\"{contig}: {n_pass:,} out of {n_sites:,} ({n_pass/n_sites:.0%}) sites pass filters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zrAMwuz5ERQ7" + }, + "source": [ + "### Counting segregating sites\n", + "\n", + "Count the number of segregating SNPs that also pass site filters, for a single contig." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 475 }, + "id": "-9ykBE7yERQ7", + "outputId": "85edb63e-2ebf-4d41-d41c-9a97eb76747b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NujlHfFpERQ7", - "outputId": "d03a3d2d-7e8e-4743-8d15-f9c9d9943cf2" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([False, False, False, ..., False, False, False])" - ] - }, - "metadata": {}, - "execution_count": 25 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:              (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n",
+       "Coordinates:\n",
+       "    variant_position     (variants) int32 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
+       "    variant_contig       (variants) uint8 dask.array<chunksize=(524288,), meta=np.ndarray>\n",
+       "    sample_id            (samples) |S24 dask.array<chunksize=(302,), meta=np.ndarray>\n",
+       "Dimensions without coordinates: alleles, ploidy, samples, variants\n",
+       "Data variables:\n",
+       "    variant_allele       (variants, alleles) |S1 dask.array<chunksize=(524288, 1), meta=np.ndarray>\n",
+       "    variant_filter_pass  (variants) bool dask.array<chunksize=(941923,), meta=np.ndarray>\n",
+       "    call_genotype        (variants, samples, ploidy) int8 dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
+       "    call_GQ              (variants, samples) int8 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
+       "    call_MQ              (variants, samples) float32 dask.array<chunksize=(300000, 50), meta=np.ndarray>\n",
+       "    call_AD              (variants, samples, alleles) int16 dask.array<chunksize=(300000, 50, 4), meta=np.ndarray>\n",
+       "    call_genotype_mask   (variants, samples, ploidy) bool dask.array<chunksize=(300000, 50, 2), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    contigs:  ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K...
" ], - "source": [ - "# locate pass sites\n", - "loc_pass = ds_snps['variant_filter_pass'].values\n", - "loc_pass" + "text/plain": [ + "\n", + "Dimensions: (alleles: 4, ploidy: 2, samples: 302, variants: 30141520)\n", + "Coordinates:\n", + " variant_position (variants) int32 dask.array\n", + " variant_contig (variants) uint8 dask.array\n", + " sample_id (samples) |S24 dask.array\n", + "Dimensions without coordinates: alleles, ploidy, samples, variants\n", + "Data variables:\n", + " variant_allele (variants, alleles) |S1 dask.array\n", + " variant_filter_pass (variants) bool dask.array\n", + " call_genotype (variants, samples, ploidy) int8 dask.array\n", + " call_GQ (variants, samples) int8 dask.array\n", + " call_MQ (variants, samples) float32 dask.array\n", + " call_AD (variants, samples, alleles) int16 dask.array\n", + " call_genotype_mask (variants, samples, ploidy) bool dask.array\n", + "Attributes:\n", + " contigs: ('KB663610', 'KB663611', 'KB663622', 'KB663633', 'KB663644', 'K..." ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# choose contig\n", + "region = \"KB663610\"\n", + "\n", + "# access SNP data\n", + "ds_snps = amin1.snp_calls(region=region)\n", + "ds_snps" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "NujlHfFpERQ7", + "outputId": "d03a3d2d-7e8e-4743-8d15-f9c9d9943cf2" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 205 - }, - "id": "Pc2yqXlAERQ7", - "outputId": "b778b7bb-c4dc-4a3f-bcc3-e6defb91640a" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
<GenotypeDaskArray shape=(30141520, 302, 2) dtype=int8>
01234...297298299300301
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
30141517./../../../../...../../../../../.
30141518./../../../../...../../../../../.
30141519./../../../../...../../../../../.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 26 - } - ], - "source": [ - "# access genotypes\n", - "gt = allel.GenotypeDaskArray(ds_snps['call_genotype'].data)\n", - "gt" + "data": { + "text/plain": [ + "array([False, False, False, ..., False, False, False])" ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# locate pass sites\n", + "loc_pass = ds_snps['variant_filter_pass'].values\n", + "loc_pass" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 205 }, + "id": "Pc2yqXlAERQ7", + "outputId": "b778b7bb-c4dc-4a3f-bcc3-e6defb91640a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 222 - }, - "id": "MJbnjx2-ERQ8", - "outputId": "6c905d02-fe62-4556-bb99-18579f2a1fb6" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[########################################] | 100% Completed | 4min 58.5s\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
<AlleleCountsArray shape=(30141520, 4) dtype=int32>
0123
0604 0 0 0
1604 0 0 0
2604 0 0 0
......
301415170008
301415186000
301415194000
" - ], - "text/plain": [ - "\n", - "604 0 0 0\n", - "604 0 0 0\n", - "604 0 0 0\n", - "...\n", - "0 0 0 8\n", - "6 0 0 0\n", - "4 0 0 0" - ] - }, - "metadata": {}, - "execution_count": 27 - } + "data": { + "text/html": [ + "
<GenotypeDaskArray shape=(30141520, 302, 2) dtype=int8>
01234...297298299300301
00/00/00/00/00/0...0/00/00/00/00/0
10/00/00/00/00/0...0/00/00/00/00/0
20/00/00/00/00/0...0/00/00/00/00/0
......
30141517./../../../../...../../../../../.
30141518./../../../../...../../../../../.
30141519./../../../../...../../../../../.
" ], - "source": [ - "# perform an allele count over genotypes\n", - "with ProgressBar():\n", - " ac = gt.count_alleles(max_allele=3).compute()\n", - "ac" + "text/plain": [ + "" ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# access genotypes\n", + "gt = allel.GenotypeDaskArray(ds_snps['call_genotype'].data)\n", + "gt" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 222 }, + "id": "MJbnjx2-ERQ8", + "outputId": "6c905d02-fe62-4556-bb99-18579f2a1fb6" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 205 - }, - "id": "4-NtNBMfERQ8", - "outputId": "d1653793-4dfa-41bb-cb2f-8cba52953f1f" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
<AlleleCountsArray shape=(22857027, 4) dtype=int32>
0123
0604 0 0 0
1604 0 0 0
2604 0 0 0
......
22857024604 0 0 0
22857025604 0 0 0
22857026604 0 0 0
" - ], - "text/plain": [ - "\n", - "604 0 0 0\n", - "604 0 0 0\n", - "604 0 0 0\n", - "...\n", - "604 0 0 0\n", - "604 0 0 0\n", - "604 0 0 0" - ] - }, - "metadata": {}, - "execution_count": 28 - } - ], - "source": [ - "# locate pass sites\n", - "loc_pass = ds_snps['variant_filter_pass'].values\n", - "ac_pass = ac[loc_pass]\n", - "ac_pass" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 4min 58.5s\n" + ] }, { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KeuMkVaeF03E", - "outputId": "6fca88b2-9dcf-4545-c339-2e3504d97633" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "No. segregating sites: 6,918,663\n" - ] - } + "data": { + "text/html": [ + "
<AlleleCountsArray shape=(30141520, 4) dtype=int32>
0123
0604 0 0 0
1604 0 0 0
2604 0 0 0
......
301415170008
301415186000
301415194000
" ], - "source": [ - "# count segregating sites\n", - "n_pass_seg = np.count_nonzero(ac_pass.is_segregating())\n", - "print(f\"No. segregating sites: {n_pass_seg:,}\")" + "text/plain": [ + "\n", + "604 0 0 0\n", + "604 0 0 0\n", + "604 0 0 0\n", + "...\n", + "0 0 0 8\n", + "6 0 0 0\n", + "4 0 0 0" ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# perform an allele count over genotypes\n", + "with ProgressBar():\n", + " ac = gt.count_alleles(max_allele=3).compute()\n", + "ac" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 205 }, + "id": "4-NtNBMfERQ8", + "outputId": "d1653793-4dfa-41bb-cb2f-8cba52953f1f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JeRpWW92ERQ8", - "outputId": "04122202-0c29-4088-ed54-6ec96508b792" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "No. invariant sites: 15,938,364\n", - "No. biallelic SNPs: 5,922,306\n", - "No. triallelic SNPs: 938,414\n", - "No. quadriallelic SNPs: 57,943\n" - ] - } + "data": { + "text/html": [ + "
<AlleleCountsArray shape=(22857027, 4) dtype=int32>
0123
0604 0 0 0
1604 0 0 0
2604 0 0 0
......
22857024604 0 0 0
22857025604 0 0 0
22857026604 0 0 0
" ], - "source": [ - "allelism_count = np.bincount(ac_pass.allelism())\n", - "print(f\"No. invariant sites: {allelism_count[1]:,}\")\n", - "print(f\"No. biallelic SNPs: {allelism_count[2]:,}\")\n", - "print(f\"No. triallelic SNPs: {allelism_count[3]:,}\")\n", - "print(f\"No. quadriallelic SNPs: {allelism_count[4]:,}\")" + "text/plain": [ + "\n", + "604 0 0 0\n", + "604 0 0 0\n", + "604 0 0 0\n", + "...\n", + "604 0 0 0\n", + "604 0 0 0\n", + "604 0 0 0" ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# locate pass sites\n", + "loc_pass = ds_snps['variant_filter_pass'].values\n", + "ac_pass = ac[loc_pass]\n", + "ac_pass" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "KeuMkVaeF03E", + "outputId": "6fca88b2-9dcf-4545-c339-2e3504d97633" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "4n73mSO-heAF" - }, - "source": [ - "## Feedback and suggestions\n", - "\n", - "If there are particular analyses you would like to run, or if you have other suggestions for useful documentation we could add to this site, we would love to know, please get in touch via the [malariagen/vector-data GitHub discussion board](https://github.com/malariagen/vector-data/discussions)." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "No. segregating sites: 6,918,663\n" + ] } - ], - "metadata": { - "celltoolbar": "Tags", + ], + "source": [ + "# count segregating sites\n", + "n_pass_seg = np.count_nonzero(ac_pass.is_segregating())\n", + "print(f\"No. segregating sites: {n_pass_seg:,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { "colab": { - "collapsed_sections": [], - "name": "Amin1.0-cloud-data-access.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" + "id": "JeRpWW92ERQ8", + "outputId": "04122202-0c29-4088-ed54-6ec96508b792" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. invariant sites: 15,938,364\n", + "No. biallelic SNPs: 5,922,306\n", + "No. triallelic SNPs: 938,414\n", + "No. quadriallelic SNPs: 57,943\n" + ] } + ], + "source": [ + "allelism_count = np.bincount(ac_pass.allelism())\n", + "print(f\"No. invariant sites: {allelism_count[1]:,}\")\n", + "print(f\"No. biallelic SNPs: {allelism_count[2]:,}\")\n", + "print(f\"No. triallelic SNPs: {allelism_count[3]:,}\")\n", + "print(f\"No. quadriallelic SNPs: {allelism_count[4]:,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4n73mSO-heAF" + }, + "source": [ + "## Feedback and suggestions\n", + "\n", + "If there are particular analyses you would like to run, or if you have other suggestions for useful documentation we could add to this site, we would love to know, please get in touch via the [malariagen/vector-data GitHub discussion board](https://github.com/malariagen/vector-data/discussions)." + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "colab": { + "collapsed_sections": [], + "name": "Amin1.0-cloud-data-access.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "global-global-mgenv-6.0.6", + "language": "python", + "name": "conda-env-global-global-mgenv-6.0.6-py" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/_sources/amin1/download.ipynb b/_sources/amin1/download.ipynb index 67b0377..834e246 100644 --- a/_sources/amin1/download.ipynb +++ b/_sources/amin1/download.ipynb @@ -84,16 +84,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,original_sample_id,sanger_sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,season,PCA_cohort,cohort,subsampled_cohort\r\n", - "VBS09378-4248STDY7308980,VBS09378,4248STDY7308980,CB-2-00264,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,\r\n", - "VBS09382-4248STDY7308981,VBS09382,4248STDY7308981,CB-2-00258,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,\r\n", - "VBS09397-4248STDY7308982,VBS09397,4248STDY7308982,CB-2-00384,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,PV\r\n", - "VBS09460-4248STDY7308986,VBS09460,4248STDY7308986,CB-2-02960,Brandy St. Laurent,Cambodia,Preah Kleang,2016,6,13.667,104.982,May-Jul (early wet),A,PV,\r\n", - "VBS09466-4248STDY7308989,VBS09466,4248STDY7308989,CB-2-04070,Brandy St. Laurent,Cambodia,Preah Kleang,2016,11,13.667,104.982,Nov-Jan (early dry),A,PV,\r\n", - "VBS09467-4248STDY7308990,VBS09467,4248STDY7308990,CB-2-04121,Brandy St. Laurent,Cambodia,Preah Kleang,2016,11,13.667,104.982,Nov-Jan (early dry),A,PV,\r\n", - "VBS09477-4248STDY7308994,VBS09477,4248STDY7308994,CB-2-05011,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\r\n", - "VBS09482-4248STDY7308996,VBS09482,4248STDY7308996,CB-2-05167,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\r\n", - "VBS09483-4248STDY7308997,VBS09483,4248STDY7308997,CB-2-03873,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\r\n" + "sample_id,original_sample_id,sanger_sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,season,PCA_cohort,cohort,subsampled_cohort\n", + "VBS09378-4248STDY7308980,VBS09378,4248STDY7308980,CB-2-00264,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,\n", + "VBS09382-4248STDY7308981,VBS09382,4248STDY7308981,CB-2-00258,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,\n", + "VBS09397-4248STDY7308982,VBS09397,4248STDY7308982,CB-2-00384,Brandy St. Laurent,Cambodia,Preah Kleang,2016,3,13.667,104.982,Feb-Apr (late dry),A,PV,PV\n", + "VBS09460-4248STDY7308986,VBS09460,4248STDY7308986,CB-2-02960,Brandy St. Laurent,Cambodia,Preah Kleang,2016,6,13.667,104.982,May-Jul (early wet),A,PV,\n", + "VBS09466-4248STDY7308989,VBS09466,4248STDY7308989,CB-2-04070,Brandy St. Laurent,Cambodia,Preah Kleang,2016,11,13.667,104.982,Nov-Jan (early dry),A,PV,\n", + "VBS09467-4248STDY7308990,VBS09467,4248STDY7308990,CB-2-04121,Brandy St. Laurent,Cambodia,Preah Kleang,2016,11,13.667,104.982,Nov-Jan (early dry),A,PV,\n", + "VBS09477-4248STDY7308994,VBS09477,4248STDY7308994,CB-2-05011,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\n", + "VBS09482-4248STDY7308996,VBS09482,4248STDY7308996,CB-2-05167,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\n", + "VBS09483-4248STDY7308997,VBS09483,4248STDY7308997,CB-2-03873,Brandy St. Laurent,Cambodia,Preah Kleang,2016,12,13.667,104.982,Nov-Jan (early dry),A,PV,PV\n" ] } ], @@ -140,16 +140,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,alignments_bam\r\n", - "VBS09378-4248STDY7308980,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09378-4248STDY7308980-2019-03-03.bam\r\n", - "VBS09382-4248STDY7308981,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09382-4248STDY7308981-2019-03-03.bam\r\n", - "VBS09397-4248STDY7308982,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09397-4248STDY7308982-2019-03-04.bam\r\n", - "VBS09460-4248STDY7308986,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09460-4248STDY7308986-2019-03-07.bam\r\n", - "VBS09466-4248STDY7308989,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09466-4248STDY7308989-2019-03-06.bam\r\n", - "VBS09467-4248STDY7308990,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09467-4248STDY7308990-2019-03-06.bam\r\n", - "VBS09477-4248STDY7308994,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09477-4248STDY7308994-2019-03-06.bam\r\n", - "VBS09482-4248STDY7308996,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09482-4248STDY7308996-2019-03-06.bam\r\n", - "VBS09483-4248STDY7308997,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09483-4248STDY7308997-2019-03-06.bam\r\n" + "sample_id,alignments_bam\n", + "VBS09378-4248STDY7308980,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09378-4248STDY7308980-2019-03-03.bam\n", + "VBS09382-4248STDY7308981,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09382-4248STDY7308981-2019-03-03.bam\n", + "VBS09397-4248STDY7308982,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09397-4248STDY7308982-2019-03-04.bam\n", + "VBS09460-4248STDY7308986,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09460-4248STDY7308986-2019-03-07.bam\n", + "VBS09466-4248STDY7308989,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09466-4248STDY7308989-2019-03-06.bam\n", + "VBS09467-4248STDY7308990,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09467-4248STDY7308990-2019-03-06.bam\n", + "VBS09477-4248STDY7308994,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09477-4248STDY7308994-2019-03-06.bam\n", + "VBS09482-4248STDY7308996,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09482-4248STDY7308996-2019-03-06.bam\n", + "VBS09483-4248STDY7308997,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09483-4248STDY7308997-2019-03-06.bam\n" ] } ], @@ -211,16 +211,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "sample_id,snp_genotypes_vcf\r\n", - "VBS09378-4248STDY7308980,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09378-4248STDY7308980-2019-03-04.vcf.gz\r\n", - "VBS09382-4248STDY7308981,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09382-4248STDY7308981-2019-03-04.vcf.gz\r\n", - "VBS09397-4248STDY7308982,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09397-4248STDY7308982-2019-03-04.vcf.gz\r\n", - "VBS09460-4248STDY7308986,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09460-4248STDY7308986-2019-03-07.vcf.gz\r\n", - "VBS09466-4248STDY7308989,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09466-4248STDY7308989-2019-03-07.vcf.gz\r\n", - "VBS09467-4248STDY7308990,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09467-4248STDY7308990-2019-03-07.vcf.gz\r\n", - "VBS09477-4248STDY7308994,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09477-4248STDY7308994-2019-03-07.vcf.gz\r\n", - "VBS09482-4248STDY7308996,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09482-4248STDY7308996-2019-03-07.vcf.gz\r\n", - "VBS09483-4248STDY7308997,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09483-4248STDY7308997-2019-03-07.vcf.gz\r\n" + "sample_id,snp_genotypes_vcf\n", + "VBS09378-4248STDY7308980,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09378-4248STDY7308980-2019-03-04.vcf.gz\n", + "VBS09382-4248STDY7308981,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09382-4248STDY7308981-2019-03-04.vcf.gz\n", + "VBS09397-4248STDY7308982,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09397-4248STDY7308982-2019-03-04.vcf.gz\n", + "VBS09460-4248STDY7308986,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09460-4248STDY7308986-2019-03-07.vcf.gz\n", + "VBS09466-4248STDY7308989,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09466-4248STDY7308989-2019-03-07.vcf.gz\n", + "VBS09467-4248STDY7308990,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09467-4248STDY7308990-2019-03-07.vcf.gz\n", + "VBS09477-4248STDY7308994,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09477-4248STDY7308994-2019-03-07.vcf.gz\n", + "VBS09482-4248STDY7308996,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09482-4248STDY7308996-2019-03-07.vcf.gz\n", + "VBS09483-4248STDY7308997,https://1175-vo-kh-stlaurent-minimus.cog.sanger.ac.uk/VBS09483-4248STDY7308997-2019-03-07.vcf.gz\n" ] } ], @@ -277,9 +277,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "global-global-mgenv-6.0.6", "language": "python", - "name": "python3" + "name": "conda-env-global-global-mgenv-6.0.6-py" }, "language_info": { "codemirror_mode": { @@ -291,7 +291,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.15" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/_sources/amin1/intro.ipynb b/_sources/amin1/intro.ipynb index 058fe17..3fd3af4 100644 --- a/_sources/amin1/intro.ipynb +++ b/_sources/amin1/intro.ipynb @@ -1,248 +1,248 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "LBNBl2exUYWu" - }, - "source": [ - "# Amin1.0\n", - "\n", - "The `Amin1.0` resource comprises data from whole-genome sequencing of *Anopheles minimus* mosquitoes, which are a major vector of malaria in Southeast Asia. The mosquitoes were collected from sites in Cambodia in the context of a study of malaria vector species diversity led by Brandy St. Laurent.\n", - "\n", - "This page provides an introduction to the `Amin1.0` data, which we hope will be a valuable resource for research and surveillance of malaria vectors in Southeast Asia. If you have any questions about this guide or how to use the data, please [start a new discussion](https://github.com/malariagen/vector-public-data/discussions/new) on the malariagen/vector-open-data repo on GitHub. If you find any bugs, please [raise an issue](https://github.com/malariagen/vector-public-data/issues/new/choose)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LFEzNy4X89ZF" - }, - "source": [ - "## Citation and terms of use\n", - "\n", - "Data from `Amin1.0` are released openly and can be downloaded and analysed for any purpose. If you use these data as part of a publication, please cite the following paper:\n", - "\n", - "```{admonition} Citation\n", - "Brandyce St. Laurent et al. (2021) Population genomics reveal distinct and diverging populations of *An. minimus* in Cambodia – a widespread malaria vector in Southeast Asia. bioRxiv. [https://doi.org/10.1101/2021.11.11.468219](https://doi.org/10.1101/2021.11.11.468219)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Aapyacfh89ZG" - }, - "source": [ - "## Partner studies and population sampling\n", - "\n", - "`Amin1.0` includes data from 302 individual mosquitoes. Mosquito specimens sequenced for this data resource came from three separate field studies in Cambodia, led by Brandy St. Laurent, in collaboration with the [National Center for Parasitology, Entomology and Malaria Control (CNM), Cambodia](https://www.cnm.gov.kh/), and the [NIH NIAID Laboratory of Malaria and Vector Research, USA](https://www.niaid.nih.gov/research/lab-malaria-vector-research). \n", - "\n", - "Mosquito collections were carried out in 2010 in Thmar Da; a longitudinal collection over 2014 at two sites in each of Pursat, Preah Vihear, and Ratanakiri provinces; and quarterly collections over 2016 at one site each in Pursat and Preah Vihear province, Cambodia. Multiple *Anopheles* species were collected in each of these studies, including the *An. minimus* s.s. specimens that have been included in this data resource. Field specimens were stored in 1.5 ml tubes with silica gel dessicant. DNA was extracted using either Nextec plates or a CTAB DNA extraction method. GPS coordinates for collections are available in the sample metadata." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IqIG0Ez789ZG" - }, - "source": [ - "```{image} ../images/amin1-map.png\n", - ":alt: Amin1 map of sampling sites\n", - ":class: bg-primary\n", - ":width: 700px\n", - ":align: center\n", - "```" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "LBNBl2exUYWu" + }, + "source": [ + "# Amin1.0\n", + "\n", + "The `Amin1.0` resource comprises data from whole-genome sequencing of *Anopheles minimus* mosquitoes, which are a major vector of malaria in Southeast Asia. The mosquitoes were collected from sites in Cambodia in the context of a study of malaria vector species diversity led by Brandy St. Laurent.\n", + "\n", + "This page provides an introduction to the `Amin1.0` data, which we hope will be a valuable resource for research and surveillance of malaria vectors in Southeast Asia. If you have any questions about this guide or how to use the data, please [start a new discussion](https://github.com/malariagen/vector-public-data/discussions/new) on the malariagen/vector-open-data repo on GitHub. If you find any bugs, please [raise an issue](https://github.com/malariagen/vector-public-data/issues/new/choose)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFEzNy4X89ZF" + }, + "source": [ + "## Citation and terms of use\n", + "\n", + "Data from `Amin1.0` are released openly and can be downloaded and analysed for any purpose. If you use these data as part of a publication, please cite the following paper:\n", + "\n", + "```{admonition} Citation\n", + "Brandyce St. Laurent et al. (2021) Population genomics reveal distinct and diverging populations of *An. minimus* in Cambodia – a widespread malaria vector in Southeast Asia. bioRxiv. [https://doi.org/10.1101/2021.11.11.468219](https://doi.org/10.1101/2021.11.11.468219)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Aapyacfh89ZG" + }, + "source": [ + "## Partner studies and population sampling\n", + "\n", + "`Amin1.0` includes data from 302 individual mosquitoes. Mosquito specimens sequenced for this data resource came from three separate field studies in Cambodia, led by Brandy St. Laurent, in collaboration with the [National Center for Parasitology, Entomology and Malaria Control (CNM), Cambodia](https://www.cnm.gov.kh/), and the [NIH NIAID Laboratory of Malaria and Vector Research, USA](https://www.niaid.nih.gov/research/lab-malaria-vector-research). \n", + "\n", + "Mosquito collections were carried out in 2010 in Thmar Da; a longitudinal collection over 2014 at two sites in each of Pursat, Preah Vihear, and Ratanakiri provinces; and quarterly collections over 2016 at one site each in Pursat and Preah Vihear province, Cambodia. Multiple *Anopheles* species were collected in each of these studies, including the *An. minimus* s.s. specimens that have been included in this data resource. Field specimens were stored in 1.5 ml tubes with silica gel dessicant. DNA was extracted using either Nextec plates or a CTAB DNA extraction method. GPS coordinates for collections are available in the sample metadata." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IqIG0Ez789ZG" + }, + "source": [ + "```{image} ../images/amin1-map.png\n", + ":alt: Amin1 map of sampling sites\n", + ":class: bg-primary\n", + ":width: 700px\n", + ":align: center\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 255 }, + "id": "uREA_Xzz89ZH", + "outputId": "6c00a6c6-f61b-4f67-ffea-008cda4c4096", + "tags": [ + "remove-input" + ] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "remove-input" - ], - "id": "uREA_Xzz89ZH", - "outputId": "6c00a6c6-f61b-4f67-ffea-008cda4c4096", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 255 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Number of mosquito specimens by collection site and year.
  year20102011201420152016
longitudelatitudelocation     
102.73512.155Thmar Da2615000
104.9213.77Chean Mok006690
104.98213.667Preah Kleang0047936
106.99513.595Chamkar San0040110
107.02513.548Sayas003940
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 2 - } + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Number of mosquito specimens by collection site and year.
  year20102011201420152016
longitudelatitudelocation     
102.73512.155Thmar Da2615000
104.9213.77Chean Mok006690
104.98213.667Preah Kleang0047936
106.99513.595Chamkar San0040110
107.02513.548Sayas003940
\n" ], - "source": [ - "!pip install -q malariagen_data\n", - "import malariagen_data\n", - "amin1 = malariagen_data.Amin1()\n", - "df_samples = amin1.sample_metadata()\n", - "df_summary = df_samples.pivot_table(\n", - " index=[\"longitude\", \"latitude\", \"location\"], \n", - " columns=[\"year\"],\n", - " values=\"sample_id\", \n", - " aggfunc=len,\n", - " fill_value=0\n", - ")\n", - "df_summary.style.set_caption(\"Number of mosquito specimens by collection site and year.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8rctK8oh89ZJ" - }, - "source": [ - "## Whole-genome sequencing and variant calling\n", - "\n", - "All samples in `Amin1.0` have been sequenced individually to high coverage using Illumina technology at the Wellcome Sanger Institute. These sequence data have then been analysed to identify genetic variants such as single nucleotide polymorphisms (SNPs). After variant calling, both the samples and the variants have been through a range of quality control analyses, to ensure the data are of high quality. Both the raw sequence data and the curated variant calls are openly available for download and analysis.\n", - "\n", - "For further information about the sequencing and variant calling methods used, please see please see [St. Laurent et al. (2021)](https://doi.org/10.1101/2021.11.11.468219)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VhnLw9TE89ZJ" - }, - "source": [ - "## Data hosting\n", - "\n", - "All data in `Amin1.0` are available from Google Cloud Storage (GCS). \n", - "\n", - "The SNP calls can be analysed directly within the cloud without having to download or copy any data, via free interactive computing services such as [Google Colab](https://colab.research.google.com/). For more information, see the [cloud data access guide](cloud).\n", - "\n", - "Sequence read alignments and SNP calls can also be downloaded from GCS for analysis locally. For more information, see the [data download guide](download)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mFX4GHnQ89ZK" - }, - "source": [ - "## Further reading\n", - "\n", - "If you would like to start working with the `Amin1.0` data, please visit the [cloud data access guide](cloud) or the [data download guide](download) or continue browsing the other documentation on this site.\n", - "\n", - "For further information about the dataset and results of population genetic analyses, please see [St. Laurent et al. (2021)](https://doi.org/10.1101/2021.11.11.468219).\n" + "text/plain": [ + "" ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "colab": { - "name": "Amin1.0-intro.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } + ], + "source": [ + "!pip install -q malariagen_data\n", + "import malariagen_data\n", + "amin1 = malariagen_data.Amin1()\n", + "df_samples = amin1.sample_metadata()\n", + "df_summary = df_samples.pivot_table(\n", + " index=[\"longitude\", \"latitude\", \"location\"], \n", + " columns=[\"year\"],\n", + " values=\"sample_id\", \n", + " aggfunc=len,\n", + " fill_value=0\n", + ")\n", + "df_summary.style.set_caption(\"Number of mosquito specimens by collection site and year.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8rctK8oh89ZJ" + }, + "source": [ + "## Whole-genome sequencing and variant calling\n", + "\n", + "All samples in `Amin1.0` have been sequenced individually to high coverage using Illumina technology at the Wellcome Sanger Institute. These sequence data have then been analysed to identify genetic variants such as single nucleotide polymorphisms (SNPs). After variant calling, both the samples and the variants have been through a range of quality control analyses, to ensure the data are of high quality. Both the raw sequence data and the curated variant calls are openly available for download and analysis.\n", + "\n", + "For further information about the sequencing and variant calling methods used, please see please see [St. Laurent et al. (2021)](https://doi.org/10.1101/2021.11.11.468219)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VhnLw9TE89ZJ" + }, + "source": [ + "## Data hosting\n", + "\n", + "All data in `Amin1.0` are available from Google Cloud Storage (GCS). \n", + "\n", + "The SNP calls can be analysed directly within the cloud without having to download or copy any data, via free interactive computing services such as [Google Colab](https://colab.research.google.com/). For more information, see the [cloud data access guide](cloud).\n", + "\n", + "Sequence read alignments and SNP calls can also be downloaded from GCS for analysis locally. For more information, see the [data download guide](download)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mFX4GHnQ89ZK" + }, + "source": [ + "## Further reading\n", + "\n", + "If you would like to start working with the `Amin1.0` data, please visit the [cloud data access guide](cloud) or the [data download guide](download) or continue browsing the other documentation on this site.\n", + "\n", + "For further information about the dataset and results of population genetic analyses, please see [St. Laurent et al. (2021)](https://doi.org/10.1101/2021.11.11.468219).\n" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Amin1.0-intro.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "global-global-mgenv-6.0.6", + "language": "python", + "name": "conda-env-global-global-mgenv-6.0.6-py" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/af1/af1.0.html b/af1/af1.0.html index 11c1421..4df2675 100644 --- a/af1/af1.0.html +++ b/af1/af1.0.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1595,16 +1559,16 @@

Further reading - + diff --git a/af1/af1.1.html b/af1/af1.1.html index e060a2f..cb7dc6c 100644 --- a/af1/af1.1.html +++ b/af1/af1.1.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1567,16 +1531,16 @@

Further reading - + diff --git a/af1/af1.2.html b/af1/af1.2.html index 61f1710..f1bb892 100644 --- a/af1/af1.2.html +++ b/af1/af1.2.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1172,16 +1136,16 @@

Further reading - + diff --git a/af1/af1.3.html b/af1/af1.3.html index 7a359f7..aad2b98 100644 --- a/af1/af1.3.html +++ b/af1/af1.3.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1142,16 +1106,16 @@

Further reading - + diff --git a/af1/af1.4.html b/af1/af1.4.html index a143944..8db7c3a 100644 --- a/af1/af1.4.html +++ b/af1/af1.4.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1172,16 +1136,16 @@

Further reading - + diff --git a/af1/cloud.html b/af1/cloud.html index fccd602..eb94ef2 100644 --- a/af1/cloud.html +++ b/af1/cloud.html @@ -430,42 +430,6 @@

MalariaGEN vector data user guide

@@ -9344,16 +9308,16 @@

Feedback and suggestions - + diff --git a/af1/download.html b/af1/download.html index 4545fc9..ca05ea8 100644 --- a/af1/download.html +++ b/af1/download.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1289,16 +1253,16 @@

Feedback and suggestions - + diff --git a/ag3/ag3.0.html b/ag3/ag3.0.html index e30c693..ae5e564 100644 --- a/ag3/ag3.0.html +++ b/ag3/ag3.0.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1659,16 +1623,16 @@

Further reading - + diff --git a/ag3/ag3.1.html b/ag3/ag3.1.html index 37b250a..3ca1899 100644 --- a/ag3/ag3.1.html +++ b/ag3/ag3.1.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1134,16 +1098,16 @@

Further reading - + diff --git a/ag3/ag3.10.html b/ag3/ag3.10.html index 177691b..87a5d2b 100644 --- a/ag3/ag3.10.html +++ b/ag3/ag3.10.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1370,16 +1334,16 @@

Further reading - + diff --git a/ag3/ag3.11.html b/ag3/ag3.11.html index a8002d7..270e1e9 100644 --- a/ag3/ag3.11.html +++ b/ag3/ag3.11.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1183,16 +1147,16 @@

Further reading - + diff --git a/ag3/ag3.12.html b/ag3/ag3.12.html index 2285340..84a7e05 100644 --- a/ag3/ag3.12.html +++ b/ag3/ag3.12.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1160,16 +1124,16 @@

Further reading - + diff --git a/ag3/ag3.13.html b/ag3/ag3.13.html index fb258b0..5355c92 100644 --- a/ag3/ag3.13.html +++ b/ag3/ag3.13.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1145,16 +1109,16 @@

Further reading - + diff --git a/ag3/ag3.2.html b/ag3/ag3.2.html index 9a49fd3..2b95517 100644 --- a/ag3/ag3.2.html +++ b/ag3/ag3.2.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1210,16 +1174,16 @@

Further reading - + diff --git a/ag3/ag3.3.html b/ag3/ag3.3.html index e31a7e9..36a21b9 100644 --- a/ag3/ag3.3.html +++ b/ag3/ag3.3.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

diff --git a/ag3/ag3.4.html b/ag3/ag3.4.html index b8f94e2..11c7567 100644 --- a/ag3/ag3.4.html +++ b/ag3/ag3.4.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1180,16 +1144,16 @@

Further reading - + diff --git a/ag3/ag3.5.html b/ag3/ag3.5.html index 37314ee..2e1361a 100644 --- a/ag3/ag3.5.html +++ b/ag3/ag3.5.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1214,16 +1178,16 @@

Further reading - + diff --git a/ag3/ag3.6.html b/ag3/ag3.6.html index d77488b..31fbf9d 100644 --- a/ag3/ag3.6.html +++ b/ag3/ag3.6.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1218,16 +1182,16 @@

Further reading - + diff --git a/ag3/ag3.7.html b/ag3/ag3.7.html index 24e7e9a..eb44305 100644 --- a/ag3/ag3.7.html +++ b/ag3/ag3.7.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1321,16 +1285,16 @@

Further reading - + diff --git a/ag3/ag3.8.html b/ag3/ag3.8.html index 21ece38..7f8692f 100644 --- a/ag3/ag3.8.html +++ b/ag3/ag3.8.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1327,16 +1291,16 @@

Further reading - + diff --git a/ag3/ag3.9.html b/ag3/ag3.9.html index 1ecdeab..a24f2ec 100644 --- a/ag3/ag3.9.html +++ b/ag3/ag3.9.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1644,16 +1608,16 @@

Further reading - + diff --git a/ag3/analysis.html b/ag3/analysis.html index 1e82672..2ce7d76 100644 --- a/ag3/analysis.html +++ b/ag3/analysis.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -617,16 +581,16 @@

Ag3 analysis - + diff --git a/ag3/cloud.html b/ag3/cloud.html index eacdc23..b195277 100644 --- a/ag3/cloud.html +++ b/ag3/cloud.html @@ -430,42 +430,6 @@

MalariaGEN vector data user guide

@@ -16650,16 +16614,16 @@

Feedback and suggestions - + diff --git a/ag3/download.html b/ag3/download.html index 7cef0f3..c1c101f 100644 --- a/ag3/download.html +++ b/ag3/download.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -1468,16 +1432,16 @@

Feedback and suggestions - + diff --git a/amin1/cloud.html b/amin1/cloud.html index a87efce..cf19979 100644 --- a/amin1/cloud.html +++ b/amin1/cloud.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -6182,16 +6146,16 @@

Feedback and suggestions - + diff --git a/amin1/download.html b/amin1/download.html index 0a6e599..d5fa8cf 100644 --- a/amin1/download.html +++ b/amin1/download.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -802,16 +766,16 @@

Feedback and suggestions - + diff --git a/amin1/intro.html b/amin1/intro.html index d9b6a33..fcdf269 100644 --- a/amin1/intro.html +++ b/amin1/intro.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide

@@ -803,16 +767,16 @@

Further reading - + diff --git a/vobs/vobs-data-access.html b/vobs/vobs-data-access.html index b2679c8..95e52e2 100644 --- a/vobs/vobs-data-access.html +++ b/vobs/vobs-data-access.html @@ -428,42 +428,6 @@

MalariaGEN vector data user guide