From 02ba77dbefb8cdbbad9500cab1d7ffc676c51dbd Mon Sep 17 00:00:00 2001 From: 5im1z <5imonedec9@gmail.com> Date: Tue, 9 Jan 2024 15:53:50 +0000 Subject: [PATCH] notebooks --- RNA_CCLE.ipynb | 24 + WGS_CCLE.ipynb | 1168 ++++++++++++++++++++++++------------------------ 2 files changed, 614 insertions(+), 578 deletions(-) diff --git a/RNA_CCLE.ipynb b/RNA_CCLE.ipynb index 8c873f07..7b14bdff 100644 --- a/RNA_CCLE.ipynb +++ b/RNA_CCLE.ipynb @@ -190,6 +190,30 @@ "# Fusion post processing" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "old_fusion_fn = \"\"\n", + "new_fusion_fn = \"\"\n", + "\n", + "new = pd.read_csv(new_fusion_fn, sep='\\t')\n", + "old = pd.read_csv(old_fusion_fn, sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.concat([old, new], ignore_index=True).to_csv(\"fusion_stitched_\" + constants.SAMPLESETNAME + \".csv\", sep='\\t')" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/WGS_CCLE.ipynb b/WGS_CCLE.ipynb index 1c37cdc2..3301e8dc 100644 --- a/WGS_CCLE.ipynb +++ b/WGS_CCLE.ipynb @@ -1,580 +1,592 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Loading the necessary packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import depmapomics.patch_firecloud\n", - "depmapomics.patch_firecloud.install_patches()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import print_function\n", - "\n", - "from depmapomics import constants\n", - "from depmapomics import env_config\n", - "\n", - "from depmapomics import dm_omics\n", - "from depmapomics import mutations as omics_mut\n", - "from depmapomics import copynumbers as omics_cn\n", - "from depmapomics import fingerprinting as fp\n", - "\n", - "from mgenepy import terra\n", - "import dalmatian as dm\n", - "from bokeh.plotting import output_notebook\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "isCCLE = True\n", - "doCleanup = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Loading new data\n", - "\n", - "Currently, sequenced data for DepMap is generated by the Genomics Platform (GP) at the Broad who deposits them into several different Terra workspaces. Therefore, the first step of this pipeline is to look at these workspaces and:\n", - "\n", - " - identify new samples by looking at the bam files and compare them with bams we have already onboarded\n", - " - remove duplicates and ones with broken file paths\n", - " - map files to profiles in Gumbo, if possible\n", - " - onboard new samples and new versions of old cell lines if we find any" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### The following two cells scan the delivery workspaces and add new samples to gumbo. Currently under construction to be regularly run off-cycle" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Currently working on running this step off-cycle\n", - "# if isCCLE:\n", - "# print(\"loading new WGS data\")\n", - "# from depmap_omics_upload import loading\n", - "# wgssamples, unmapped = loading.loadFromMultipleWorkspaces(WGSWORKSPACES, EXTRACT_DEFAULTS[\"sm_id\"], \"SMIDOrdered\", \"wgs\", bamcol=\"cram_path\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Currently working on running this step off-cycle\n", - "# if isCCLE:\n", - "# from depmap_omics_upload import loading\n", - "# # write samples to Sequencing table, copy bam files to internal storage bucket:\n", - "# wgssamples, cmds = loading.addSamplesToGumbo(wgssamples, 'wgs', WGS_GCS_PATH, filetypes=[\"cram\", \"crai\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### All WGS sequencingIDs in gumbo that are not in the WGS terra workspace yet are considered \"new\" for the current release. Here we add them to the terra processing workspace as a sample set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if isCCLE:\n", - " from depmap_omics_upload import loading\n", - " # load new rna samples from gumbo to WGS terra workspace:\n", - " loading.addSamplesToDepMapWorkspace('wgs', env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME, add_to_samplesets=['allcurrent'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run SNP fingerprinting, new (rna + wgs) vs all existing samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", - "rna_wm = dm.WorkspaceManager(env_config.RNAWORKSPACE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wgs_all = wgs_wm.get_sample_sets().loc[constants.SAMPLESETNAME, \"samples\"]\n", - "rna_all = rna_wm.get_sample_sets().loc[constants.SAMPLESETNAME, \"samples\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if isCCLE:\n", - " updated_lod_mat, mismatches, matches = await fp._CCLEFingerPrint(rna_all, wgs_all)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run pipeline on Terra\n", - "\n", - "We are using Dalmatian to send requests to Terra. See [our readme](https://github.com/broadinstitute/depmap_omics/blob/master/documentation/DepMap_processing_pipeline.md) for detailed breakdown of the subtasks in our WGS pipeline." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For non internal users, please make sure that your workspace is correctly setup\n", - "\n", - "To set up your workspace, follow the instructions in the README page." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"running Terra pipeline\")\n", - "refwm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", - "submission_id = refwm.create_submission(\"WGS_pipeline\", constants.SAMPLESETNAME, 'sample_set', expression='this.samples')\n", - "await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "submission_id = refwm.create_submission(\"Aggregate_CN_seg_files\", 'all')\n", - "await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save the workflow configurations used" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "terra.saveWorkspace(env_config.WGSWORKSPACE,'data/'+constants.SAMPLESETNAME+'/WGSconfig/')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Postprocessing on local\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Copy Number" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", - "wgs_samples = wgs_wm.get_samples()\n", - "wgs_purecn = wgs_samples[(~wgs_samples.PureCN_loh.isna()) & (wgs_samples.PureCN_loh != \"NA\")].index.tolist()\n", - "wgs_wm.update_sample_set(sample_set_id=\"PureCN\", sample_ids=wgs_purecn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if isCCLE:\n", - " wespriosegs, wgspriosegs = await dm_omics.cnPostProcessing(samplesetname=constants.SAMPLESETNAME, wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, dryrun=False, useCache=False)\n", - "else:\n", - " segments, genecn, failed, purecn_segments, purecn_genecn, loh_status, feature_table = await omics_cn.postProcess(env_config.WGSWORKSPACE, sampleset=constants.SAMPLESETNAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Somatic Mutations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if isCCLE:\n", - " await dm_omics.mutationPostProcessing(wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, run_guidemat=False, run_sv=True)\n", - "else:\n", - " await omics_mut.postProcess(env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Subset and upload\n", - "\n", - "Based on release dates and embargo status in gumbo, subset and upload datasets for each release audience, and hand off to the portal team." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from depmap_omics_upload import tracker\n", - "from depmap_omics_upload import upload\n", - "from mgenepy.utils import helper as h" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "import datetime\n", - "release_date = datetime.date(2023, 5, 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "virtual = upload.initVirtualDatasets(samplesetname=constants.SAMPLESETNAME)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "upload.checkDataPermission()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "upload.uploadAuxTables(taiga_ids=virtual, today=release_date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "upload.makeModelLvMatrices(virtual_ids=virtual, today=release_date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "upload.makePRLvMatrices(virtual_ids=virtual, files_nummat={}, files_table={}, files_raw={\"mutations-latest-ed72\": {\"somaticMutations_profile_maf\": \"OmicsSomaticMutationsMAFProfile.maf\"}}, today=release_date)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Managing release readmes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! cd ../depmap-release-readmes && git pull --no-commit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!cd ../depmap-release-readmes/ && python3 make_new_release.py $constants.RELEASE && git add . && git commit -m $constants.RELEASE && git push " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### cleaning workspaces" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from depmap_omics_upload.mgenepy import terra as terra_cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if doCleanup:\n", - " print(\"cleaning workspaces\")\n", - " torm = await terra_cleanup.deleteHeavyFiles(env_config.WGSWORKSPACE)\n", - " h.parrun(['gsutil rm '+i for i in torm], cores=8)\n", - " terra_cleanup.removeFromFailedWorkflows(env_config.WGSWORKSPACE, dryrun=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Saving workspace configs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! terra-sync export broad-firecloud-ccle/DepMap_WGS_CN data/$constants.SAMPLESETNAME/WGSconfig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! terra-sync export broad-firecloud-ccle/DepMap_hg38_RNAseq data/$constants.SAMPLESETNAME/RNAconfig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! cd data/$constants.SAMPLESETNAME/WGSconfig && mv */*/* . && rm -r configs/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! cd ../RNAconfig && mv */*/* . && rm -r configs/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "notify_time": "5", - "toc": { - "base_numbering": 1, - "nav_menu": { - "height": "277px", - "width": "375px" - }, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "386.4px", - "left": "762px", - "top": "202px", - "width": "198.8px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "oldHeight": 577.8, - "position": { - "height": "40px", - "left": "1285.4px", - "right": "20px", - "top": "106px", - "width": "254.8px" - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "varInspector_section_display": "none", - "window_display": false - }, - "vscode": { - "interpreter": { - "hash": "1f170dacc8a54decc88aa6dec68bcf2a5d65c3893a5412f67865d27128dae76b" - } - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading the necessary packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import depmapomics.patch_firecloud\n", + "depmapomics.patch_firecloud.install_patches()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "from depmapomics import constants\n", + "from depmapomics import env_config\n", + "\n", + "from depmapomics import dm_omics\n", + "from depmapomics import mutations as omics_mut\n", + "from depmapomics import copynumbers as omics_cn\n", + "from depmapomics import fingerprinting as fp\n", + "\n", + "from mgenepy import terra\n", + "import dalmatian as dm\n", + "from bokeh.plotting import output_notebook\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "output_notebook()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isCCLE = True\n", + "doCleanup = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading new data\n", + "\n", + "Currently, sequenced data for DepMap is generated by the Genomics Platform (GP) at the Broad who deposits them into several different Terra workspaces. Therefore, the first step of this pipeline is to look at these workspaces and:\n", + "\n", + " - identify new samples by looking at the bam files and compare them with bams we have already onboarded\n", + " - remove duplicates and ones with broken file paths\n", + " - map files to profiles in Gumbo, if possible\n", + " - onboard new samples and new versions of old cell lines if we find any" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The following two cells scan the delivery workspaces and add new samples to gumbo. Currently under construction to be regularly run off-cycle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Currently working on running this step off-cycle\n", + "# if isCCLE:\n", + "# print(\"loading new WGS data\")\n", + "# from depmap_omics_upload import loading\n", + "# wgssamples, unmapped = loading.loadFromMultipleWorkspaces(WGSWORKSPACES, EXTRACT_DEFAULTS[\"sm_id\"], \"SMIDOrdered\", \"wgs\", bamcol=\"cram_path\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Currently working on running this step off-cycle\n", + "# if isCCLE:\n", + "# from depmap_omics_upload import loading\n", + "# # write samples to Sequencing table, copy bam files to internal storage bucket:\n", + "# wgssamples, cmds = loading.addSamplesToGumbo(wgssamples, 'wgs', WGS_GCS_PATH, filetypes=[\"cram\", \"crai\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### All WGS sequencingIDs in gumbo that are not in the WGS terra workspace yet are considered \"new\" for the current release. Here we add them to the terra processing workspace as a sample set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isCCLE:\n", + " from depmap_omics_upload import loading\n", + " # load new rna samples from gumbo to WGS terra workspace:\n", + " loading.addSamplesToDepMapWorkspace('wgs', env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME, add_to_samplesets=['allcurrent'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run SNP fingerprinting, new (rna + wgs) vs all existing samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", + "rna_wm = dm.WorkspaceManager(env_config.RNAWORKSPACE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wgs_all = wgs_wm.get_sample_sets().loc[constants.SAMPLESETNAME, \"samples\"]\n", + "rna_all = rna_wm.get_sample_sets().loc[constants.SAMPLESETNAME, \"samples\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isCCLE:\n", + " updated_lod_mat, mismatches, matches = await fp._CCLEFingerPrint(rna_all, wgs_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run pipeline on Terra\n", + "\n", + "We are using Dalmatian to send requests to Terra. See [our readme](https://github.com/broadinstitute/depmap_omics/blob/master/documentation/DepMap_processing_pipeline.md) for detailed breakdown of the subtasks in our WGS pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For non internal users, please make sure that your workspace is correctly setup\n", + "\n", + "To set up your workspace, follow the instructions in the README page." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"running Terra pipeline\")\n", + "refwm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", + "submission_id = refwm.create_submission(\"WGS_pipeline\", constants.SAMPLESETNAME, 'sample_set', expression='this.samples')\n", + "await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "submission_id = refwm.create_submission(\"Aggregate_CN_seg_files\", 'all')\n", + "await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the workflow configurations used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terra.saveWorkspace(env_config.WGSWORKSPACE,'data/'+constants.SAMPLESETNAME+'/WGSconfig/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Postprocessing on local\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Copy Number" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)\n", + "wgs_samples = wgs_wm.get_samples()\n", + "wgs_purecn = wgs_samples[(~wgs_samples.PureCN_loh.isna()) & (wgs_samples.PureCN_loh != \"NA\")].index.tolist()\n", + "wgs_wm.update_sample_set(sample_set_id=\"PureCN\", sample_ids=wgs_purecn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isCCLE:\n", + " wespriosegs, wgspriosegs = await dm_omics.cnPostProcessing(samplesetname=constants.SAMPLESETNAME, wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, dryrun=False, useCache=False)\n", + "else:\n", + " segments, genecn, failed, purecn_segments, purecn_genecn, loh_status, feature_table = await omics_cn.postProcess(env_config.WGSWORKSPACE, sampleset=constants.SAMPLESETNAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Somatic Mutations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if isCCLE:\n", + " await dm_omics.mutationPostProcessing(wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, run_guidemat=False, run_sv=True, mafcol=\"depmap_maf_23q4\")\n", + "else:\n", + " await omics_mut.postProcess(env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Subset and upload\n", + "\n", + "Based on release dates and embargo status in gumbo, subset and upload datasets for each release audience, and hand off to the portal team." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from depmap_omics_upload import tracker\n", + "from depmap_omics_upload import upload\n", + "from mgenepy.utils import helper as h" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "import datetime\n", + "release_date = datetime.date(2023, 11, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "virtual = upload.initVirtualDatasets(samplesetname=constants.SAMPLESETNAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "virtual" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload.checkDataPermission()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload.uploadAuxTables(taiga_ids=virtual, today=release_date)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload.makeModelLvMatrices(virtual_ids=virtual, today=release_date)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload.makePRLvMatrices(virtual_ids=virtual, today=release_date)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "upload.updateEternal(virtual=virtual)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Managing release readmes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd ../depmap-release-readmes && git pull --no-commit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cd ../depmap-release-readmes/ && python3 make_new_release.py $constants.RELEASE && git add . && git commit -m $constants.RELEASE && git push " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### cleaning workspaces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from depmap_omics_upload.mgenepy import terra as terra_cleanup\n", + "from mgenepy.utils import helper as h " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if doCleanup:\n", + " print(\"cleaning workspaces\")\n", + " torm = await terra_cleanup.deleteHeavyFiles(\"broad-firecloud-ccle/DEV_DepMap_WGS_CN\")\n", + " h.parrun(['gsutil rm '+i for i in torm], cores=8)\n", + " terra_cleanup.removeFromFailedWorkflows(\"broad-firecloud-ccle/DEV_DepMap_WGS_CN\", dryrun=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving workspace configs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! terra-sync export broad-firecloud-ccle/DepMap_WGS_CN data/$constants.SAMPLESETNAME/WGSconfig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! terra-sync export broad-firecloud-ccle/DepMap_hg38_RNAseq data/$constants.SAMPLESETNAME/RNAconfig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd data/$constants.SAMPLESETNAME/WGSconfig && mv */*/* . && rm -r configs/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd data/$constants.SAMPLESETNAME/RNAconfig && mv */*/* . && rm -r configs/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "notify_time": "5", + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "277px", + "width": "375px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "386.4px", + "left": "762px", + "top": "202px", + "width": "198.8px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "oldHeight": 577.8, + "position": { + "height": "40px", + "left": "1285.4px", + "right": "20px", + "top": "106px", + "width": "254.8px" + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "varInspector_section_display": "none", + "window_display": false + }, + "vscode": { + "interpreter": { + "hash": "1f170dacc8a54decc88aa6dec68bcf2a5d65c3893a5412f67865d27128dae76b" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}