From fb2867457d24c559eed367d17e6e70399c1c579c Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 00:36:37 -0700 Subject: [PATCH 01/39] Add patch release --- scripts/patch_release/Dockerfile | 5 + scripts/patch_release/patch.py | 358 +++++++++++++++++++++++++++++++ 2 files changed, 363 insertions(+) create mode 100644 scripts/patch_release/Dockerfile create mode 100644 scripts/patch_release/patch.py diff --git a/scripts/patch_release/Dockerfile b/scripts/patch_release/Dockerfile new file mode 100644 index 0000000..1412ce2 --- /dev/null +++ b/scripts/patch_release/Dockerfile @@ -0,0 +1,5 @@ +FROM sagebionetworks/synapsepythonclient:v2.6.0 + +WORKDIR /patch_release + +COPY . . diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py new file mode 100644 index 0000000..05af729 --- /dev/null +++ b/scripts/patch_release/patch.py @@ -0,0 +1,358 @@ +""" +Patch releases occur when samples need to be retracted due to +patients withdrawing consent. + +Patches should always occur on the latest consortium release for +the specific public release. Due to the GENIE retraction policy, +it is best to retract data on the 3rd consortium release of the +subsequent release series. +""" +import argparse +import os +import shutil +import tempfile + +import pandas as pd +import synapseclient + +from genie import process_functions +from genie import create_case_lists +from genie import dashboard_table_updater + + +# Run time functions +def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> None: + """ + Replaces the old version with the new version in the meta file. + + Args: + meta_file_path (str): The path to the meta file. + old_version (str): The old version to be replaced. + new_version (str): The new version to replace the old version. 
+ + Returns: + None + """ + with open(meta_file_path, "r") as meta: + meta_text = meta.read() + with open(meta_file_path, "w") as meta: + meta_text = meta_text.replace(old_version, new_version) + meta.write(meta_text) + + +def store_file( + syn: synapseclient.Synapse, new_path: str, new_release_synid: str, release_name: str +) -> None: + """ + Stores a file into Synapse. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + new_path (str): The path to the file to be stored. + new_release_synid (str): The Synapse ID of the release folder where the file will be stored. + release_name (str): The name of the release. + + Returns: + None + """ + ent_name = os.path.basename(new_path.replace(f"_{release_name}", "")) + new_ent = synapseclient.File(new_path, name=ent_name, parentId=new_release_synid) + syn.store(new_ent) + + +def patch_release_workflow( + release_synid: str, new_release_synid: str, retracted_sample_synid: str +): + """ + These need to be modified per retraction. + The release_synid, new_release_synid, and retracted_sample_synid + variables need to be changed to reflect different Synapse ids per release. + """ + syn = synapseclient.login() + + remove_centers = [] + remove_seqassays = [] + # 11 release series + release_synid = "" # Fill in synapse id here + old_release = syn.get(release_synid).name + # The new release folder MUST be created on synapse first. + new_release_synid = "" # Fill in synapse id here + new_release = syn.get(new_release_synid).name + # samples to retract. 
Example: syn27734047 (12.3 consortium) + # If you are creating the 11.1-public patch, you will be using the + # 12.3-consortium samples to retract file + # Synapse id configurations + retracted_sample_synid = "" + # Data base mapping synid + database_mapping_synid = "syn10967259" + + retracted_samples_ent = syn.get(retracted_sample_synid) + retracted_samplesdf = pd.read_csv(retracted_samples_ent.path) + release_files = syn.getChildren(release_synid) + + # Get file mapping + file_mapping = { + release_file["name"]: release_file["id"] for release_file in release_files + } + # case_list_folder_synid = file_mapping['case_lists'] + case_list_folder_synid = syn.store( + synapseclient.Folder("case_lists", parentId=new_release_synid) + ).id + + sample_synid = file_mapping["data_clinical_sample.txt"] + patient_synid = file_mapping["data_clinical_patient.txt"] + cna_synid = file_mapping["data_CNA.txt"] + fusion_synid = file_mapping["data_fusions.txt"] + gene_synid = file_mapping["data_gene_matrix.txt"] + maf_synid = file_mapping["data_mutations_extended.txt"] + genomic_info_synid = file_mapping.get("genie_combined.bed") + if genomic_info_synid is None: + genomic_info_synid = file_mapping["genomic_information.txt"] + seg_synid = file_mapping.get("genie_public_data_cna_hg19.seg") + if seg_synid is None: + seg_synid = file_mapping["genie_private_data_cna_hg19.seg"] + assay_info_synid = file_mapping["assay_information.txt"] + + # Sample and patient column to cBioPortal mappings + mapping_table = syn.tableQuery("SELECT * FROM syn9621600") + mapping = mapping_table.asDataFrame() + + # Create temporary directory to download files + tempdir_o = tempfile.TemporaryDirectory() + tempdir = tempdir_o.name + # Create clinical file + + # Obtain samples retracted + sample_ent = syn.get(sample_synid, followLink=True) + sampledf = pd.read_csv(sample_ent.path, sep="\t", comment="#") + centers = [patient.split("-")[1] for patient in sampledf.PATIENT_ID] + sampledf["CENTER"] = centers + # 
Retract samples from SEQ_ASSAY_ID, CENTER and retract samples list + to_remove_seqassay_rows = sampledf["SEQ_ASSAY_ID"].isin(remove_seqassays) + sampledf = sampledf[~to_remove_seqassay_rows] + to_remove_center_rows = sampledf["CENTER"].isin(remove_centers) + sampledf = sampledf[~to_remove_center_rows] + to_remove_samples = sampledf["SAMPLE_ID"].isin(retracted_samplesdf.SAMPLE_ID) + final_sampledf = sampledf[~to_remove_samples] + # Check number of seq assay ids is the same after removal of samples + # Must add to removal of seq assay list for gene panel removal + seq_assay_after = final_sampledf["SEQ_ASSAY_ID"].unique() + seq_assay_before = sampledf["SEQ_ASSAY_ID"].unique() + if len(seq_assay_after) != len(seq_assay_before): + remove_seqassays.extend( + seq_assay_before[~seq_assay_before.isin(seq_assay_after)].tolist() + ) + # Check number of centers is the same after removal of samples + # Must add to removal of seq assay list for gene panel removal + center_after = final_sampledf["CENTER"].unique() + center_before = sampledf["CENTER"].unique() + if len(center_after) != len(center_before): + remove_centers.extend(center_before[~center_before.isin(center_after)].tolist()) + + del final_sampledf["CENTER"] + + keep_samples = final_sampledf["SAMPLE_ID"].drop_duplicates() + keep_patients = final_sampledf["PATIENT_ID"].drop_duplicates() + + patient_ent = syn.get(patient_synid, followLink=True) + patientdf = pd.read_csv(patient_ent.path, sep="\t", comment="#") + patientdf = patientdf[patientdf["PATIENT_ID"].isin(keep_patients)] + + clinicaldf = final_sampledf.merge(patientdf, on="PATIENT_ID", how="outer") + + clin_ent = syn.get(file_mapping.get("data_clinical.txt"), followLink=True) + full_clin_df = pd.read_csv(clin_ent.path, sep="\t", comment="#") + clinical_path = os.path.join(tempdir, os.path.basename(clin_ent.path)) + # GEN-646: Make sure to subset the clinical dataframe or else + # There will be issues downstream. 
The dashboard code along with + # public release code rely on the merged clinical file. + full_clin_df = full_clin_df[full_clin_df["SAMPLE_ID"].isin(keep_samples)] + full_clin_df.to_csv(clinical_path, sep="\t", index=False) + store_file(syn, clinical_path, new_release_synid, new_release) + + sample_path = os.path.join( + tempdir, os.path.basename(sample_ent.path).replace(old_release, new_release) + ) + patient_path = os.path.join( + tempdir, os.path.basename(patient_ent.path).replace(old_release, new_release) + ) + + process_functions.addClinicalHeaders( + clinicaldf, + mapping, + patientdf.columns, + sampledf.columns, + sample_path, + patient_path, + ) + store_file(syn, sample_path, new_release_synid, new_release) + store_file(syn, patient_path, new_release_synid, new_release) + # Patch CNA file + cna_ent = syn.get(cna_synid, followLink=True) + cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") + cna_cols = ["Hugo_Symbol"] + cna_cols.extend(keep_samples.tolist()) + cna_cols_idx = cnadf.columns.isin(cna_cols) + if not cna_cols_idx.all(): + cnadf = cnadf[cnadf.columns[cna_cols_idx]] + cnatext = process_functions.removePandasDfFloat(cnadf) + cna_path = os.path.join( + tempdir, os.path.basename(cna_ent.path).replace(old_release, new_release) + ) + with open(cna_path, "w") as cna_file: + cna_file.write(cnatext) + store_file(syn, cna_path, new_release_synid, new_release) + # Patch Fusion file + fusion_ent = syn.get(fusion_synid, followLink=True) + fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") + # if not fusiondf.Tumor_Sample_Barcode.isin(keep_samples).all(): + fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] + fusiontext = process_functions.removePandasDfFloat(fusiondf) + fusion_path = os.path.join( + tempdir, os.path.basename(fusion_ent.path).replace(old_release, new_release) + ) + with open(fusion_path, "w") as fusion_file: + fusion_file.write(fusiontext) + store_file(syn, fusion_path, new_release_synid, new_release) + # 
Patch SEG file + seg_ent = syn.get(seg_synid, followLink=True) + segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#") + # if not segdf.ID.isin(keep_samples).all(): + segdf = segdf[segdf.ID.isin(keep_samples)] + segtext = process_functions.removePandasDfFloat(segdf) + seg_path = os.path.join( + tempdir, os.path.basename(seg_ent.path).replace(old_release, new_release) + ) + with open(seg_path, "w") as seg_file: + seg_file.write(segtext) + store_file(syn, seg_path, new_release_synid, new_release) + + # Patch gene matrix file + gene_ent = syn.get(gene_synid, followLink=True) + genedf = pd.read_csv(gene_ent.path, sep="\t", comment="#") + genedf = genedf[genedf.SAMPLE_ID.isin(keep_samples)] + genedf[genedf.isnull()] = "NA" + gene_path = os.path.join( + tempdir, os.path.basename(gene_ent.path).replace(old_release, new_release) + ) + genedf.to_csv(gene_path, sep="\t", index=False) + store_file(syn, gene_path, new_release_synid, new_release) + # Patch maf file + maf_ent = syn.get(maf_synid, followLink=True) + mafdf = pd.read_csv(maf_ent.path, sep="\t", comment="#") + mafdf = mafdf[mafdf["Tumor_Sample_Barcode"].isin(keep_samples)] + maftext = process_functions.removePandasDfFloat(mafdf) + maf_path = os.path.join( + tempdir, os.path.basename(maf_ent.path).replace(old_release, new_release) + ) + with open(maf_path, "w") as maf_file: + maf_file.write(maftext) + store_file(syn, maf_path, new_release_synid, new_release) + # Patch genomic information file + # clinicalReported column needs to be added + # Patch genomic information file + genome_info_ent = syn.get(genomic_info_synid, followLink=True) + genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#") + keep_rows = [ + seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + for seq in genome_info_df["SEQ_ASSAY_ID"] + ] + genome_info_df = genome_info_df[keep_rows] + + # Write genomic file + genome_info_text = process_functions.removePandasDfFloat(genome_info_df) + genome_info_path = 
os.path.join( + tempdir, + os.path.basename(genome_info_ent.path).replace(old_release, new_release), + ) + + with open(genome_info_path, "w") as bed_file: + bed_file.write(genome_info_text) + store_file(syn, genome_info_path, new_release_synid, new_release) + # Create cBioPortal gene panel and meta files + for name in file_mapping: + if name.startswith("data_gene_panel"): + seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") + if seq_name not in remove_seqassays: + gene_panel_ent = syn.get(file_mapping[name], followLink=True) + new_panel_path = os.path.join( + tempdir, + os.path.basename(gene_panel_ent.path).replace( + old_release, new_release + ), + ) + shutil.copyfile(gene_panel_ent.path, new_panel_path) + store_file(syn, new_panel_path, new_release_synid, new_release) + elif name.startswith("meta") or "_meta_" in name: + meta_ent = syn.get(file_mapping[name], followLink=True) + new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) + shutil.copyfile(meta_ent.path, new_meta_path) + revise_meta_file(new_meta_path, old_release, new_release) + store_file(syn, new_meta_path, new_release_synid, new_release) + # Patch assay information file + assay_ent = syn.get(assay_info_synid, followLink=True) + assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#") + keep_rows = [ + seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + for seq in assaydf["SEQ_ASSAY_ID"] + ] + assaydf = assaydf[keep_rows] + assay_text = process_functions.removePandasDfFloat(assaydf) + assay_path = os.path.join( + tempdir, os.path.basename(assay_ent.path).replace(old_release, new_release) + ) + with open(assay_path, "w") as assay_file: + assay_file.write(assay_text) + store_file(syn, assay_path, new_release_synid, new_release) + # Create cBioPortal case lists + case_list_path = os.path.join(tempdir, "case_lists") + if not os.path.exists(case_list_path): + os.mkdir(case_list_path) + create_case_lists.main(clinical_path, assay_path, 
case_list_path, "genie_private") + + case_list_files = os.listdir(case_list_path) + + for case_filename in case_list_files: + # if case_filename in case_file_synids: + case_path = os.path.join(case_list_path, case_filename) + store_file(syn, case_path, case_list_folder_synid, new_release) + + tempdir_o.cleanup() + # Update dashboard tables + # You may have to execute this twice in case the file view isn't updated + database_mapping = syn.tableQuery(f"select * from {database_mapping_synid} limit 1") + database_mapping = syn.tableQuery(f"select * from {database_mapping_synid}") + database_mappingdf = database_mapping.asDataFrame() + dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release) + + +def main(): + parser = argparse.ArgumentParser(description="Store a file in Synapse.") + + parser.add_argument( + "release_synid", + type=str, + help="The Synapse Id of the consortium release folder", + ) + parser.add_argument( + "new_release_synid", + type=str, + help="The Synapse Id of the new release folder (has to be created)", + ) + parser.add_argument( + "retracted_sample_synid", + type=str, + help="The Synapse Id of the samples_to_retract.csv file generated in the current 3rd consortium release.", + ) + + args = parser.parse_args() + + patch_release_workflow( + release_synid=args.release_synid, + new_release_synid=args.new_release_synid, + retracted_sample_synid=args.retracted_sample_synid, + ) + + +if __name__ == "__main__": + main() From a2c0152774e7536411239eabf8c376a06944a90e Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 00:48:02 -0700 Subject: [PATCH 02/39] Add patch release --- modules/patch_release.nf | 21 +++++++++++++++++++++ nextflow.config | 2 ++ patch_release_main.nf | 14 ++++++++++++++ 3 files changed, 37 insertions(+) create mode 100644 modules/patch_release.nf create mode 100644 patch_release_main.nf diff --git a/modules/patch_release.nf b/modules/patch_release.nf new file mode 100644 index 0000000..913883f --- 
/dev/null +++ b/modules/patch_release.nf @@ -0,0 +1,21 @@ +// Patch release: run patch.py with the three Synapse IDs as positional arguments +process patch_release { + container "$params.patch_release_docker" + secret 'SYNAPSE_AUTH_TOKEN' + + input: + val release_synid + val new_release_synid + val retracted_sample_synid + + output: + stdout + + script: + """ + python3 /patch_release/patch.py \ + $release_synid \ + $new_release_synid \ + $retracted_sample_synid + """ +} diff --git a/nextflow.config b/nextflow.config index 64f7e07..78cde0b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,7 @@ profiles { main_release_utils_docker = "sagebionetworks/main-genie-release-utils" find_maf_artifacts_docker = "sagebionetworks/genie-artifact-finder" create_data_guide_docker = "sagebionetworks/genie-data-guide" + patch_release_docker = "sagebionetworks/genie-patch-main-release" } } aws_prod { @@ -57,6 +58,7 @@ profiles { main_release_utils_docker = "sagebionetworks/main-genie-release-utils" find_maf_artifacts_docker = "sagebionetworks/genie-artifact-finder" create_data_guide_docker = "sagebionetworks/genie-data-guide" + patch_release_docker = "sagebionetworks/genie-patch-main-release" } } } \ No newline at end of file diff --git a/patch_release_main.nf b/patch_release_main.nf new file mode 100644 index 0000000..e8e7146 --- /dev/null +++ b/patch_release_main.nf @@ -0,0 +1,14 @@ +#!/usr/bin/env nextflow +// Ensure DSL2 +nextflow.enable.dsl = 2 + +// IMPORT MODULES +include { patch_release } from './modules/patch_release' + +params.release_synid = null +params.new_release_synid = null +params.retracted_sample_synid = null + +workflow { + patch_release(params.release_synid, params.new_release_synid, params.retracted_sample_synid) +} From c77a853f784c040955cdff7e5816fb4ae639e5bd Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 09:46:07 -0700 Subject: [PATCH 03/39] Use genie container --- scripts/patch_release/Dockerfile | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/scripts/patch_release/Dockerfile b/scripts/patch_release/Dockerfile index 1412ce2..43bdf51 100644 --- a/scripts/patch_release/Dockerfile +++ b/scripts/patch_release/Dockerfile @@ -1,4 +1,4 @@ -FROM sagebionetworks/synapsepythonclient:v2.6.0 +FROM sagebionetworks/genie:version-16.4.0 WORKDIR /patch_release From 3a627c9896dd35980eb2af31533bb406fc5f8f5b Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 21:08:31 -0700 Subject: [PATCH 04/39] Create dashboard html module --- modules/create_dashboard_html.nf | 29 ++++++ patch_release_main.nf | 15 ++- scripts/patch_release/patch.py | 171 +++++++++++++++++-------------- 3 files changed, 135 insertions(+), 80 deletions(-) create mode 100644 modules/create_dashboard_html.nf diff --git a/modules/create_dashboard_html.nf b/modules/create_dashboard_html.nf new file mode 100644 index 0000000..f58abaf --- /dev/null +++ b/modules/create_dashboard_html.nf @@ -0,0 +1,29 @@ +// Create dashboard HTML for a release (adds --staging when not production) +process create_dashboard_html { + debug true + container "$params.main_pipeline_docker" + secret 'SYNAPSE_AUTH_TOKEN' + + input: + val previous + val release + val production + + output: + stdout + // path "data_guide.pdf" + + script: + if (production) { + """ + Rscript ./R/dashboard_markdown_generator.R $release \ + --template_path ./templates/dashboardTemplate.Rmd + """ + } else { + """ + Rscript ./R/dashboard_markdown_generator.R $release \ + --template_path ./templates/dashboardTemplate.Rmd \ + --staging + """ + } +} diff --git a/patch_release_main.nf b/patch_release_main.nf index e8e7146..5ef067d 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -4,11 +4,20 @@ nextflow.enable.dsl = 2 // IMPORT MODULES include { patch_release } from './modules/patch_release' +include { create_dashboard_html } from './modules/create_dashboard_html' - -params.release_synid = null -params.new_release_synid = null -params.retracted_sample_synid = null + + +params.release_synid = 
"syn53170398" // 15.4-consortium +params.new_release_synid = "syn62069187" // 15.6-consortium (in staging) +params.retracted_sample_synid = "syn54082015" // 16.3-consortium samples_to_retract.csv +params.release = "15.6-consortium" +// project_id = "syn7208886" +params.project_id = "syn22033066" // staging project +params.production = false // production is false workflow { patch_release(params.release_synid, params.new_release_synid, params.retracted_sample_synid) + create_dashboard_html(patch_release.out, params.release, params.production) + create_data_guide(patch_release.out, params.release, params.project_id) } diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 05af729..cabe7af 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -10,15 +10,17 @@ import argparse import os import shutil +import subprocess import tempfile import pandas as pd import synapseclient -from genie import process_functions -from genie import create_case_lists -from genie import dashboard_table_updater - +from genie import ( + create_case_lists, + dashboard_table_updater, + process_functions +) # Run time functions def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> None: @@ -60,8 +62,30 @@ def store_file( syn.store(new_ent) +def generate_dashboard_html(genie_version, staging=False): + """Generates dashboard html writeout that gets uploaded to the + release folder + + Args: + genie_version: GENIE release + staging: Use staging files. 
Default is False + + """ + markdown_render_cmd = [ + "Rscript", + "/root/Genie/R/dashboard_markdown_generator.R", + genie_version, + "--template_path", + "/root/Genie/templates/dashboardTemplate.Rmd", + ] + + if staging: + markdown_render_cmd.append("--staging") + subprocess.check_call(markdown_render_cmd) + + def patch_release_workflow( - release_synid: str, new_release_synid: str, retracted_sample_synid: str + release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False ): """ These need to be modified per retraction. @@ -70,21 +94,11 @@ def patch_release_workflow( """ syn = synapseclient.login() - remove_centers = [] - remove_seqassays = [] - # 11 release series - release_synid = "" # Fill in synapse id here + # remove_centers = [] + # remove_seqassays = [] + # release_synid = "" # Fill in synapse id here old_release = syn.get(release_synid).name - # The new release folder MUST be created on synapse first. - new_release_synid = "" # Fill in synapse id here new_release = syn.get(new_release_synid).name - # samples to retract. 
Example: syn27734047 (12.3 consortium) - # If you are creating the 11.1-public patch, you will be using the - # 12.3-consortium samples to retract file - # Synapse id configurations - retracted_sample_synid = "" - # Data base mapping synid - database_mapping_synid = "syn10967259" retracted_samples_ent = syn.get(retracted_sample_synid) retracted_samplesdf = pd.read_csv(retracted_samples_ent.path) @@ -102,15 +116,13 @@ def patch_release_workflow( sample_synid = file_mapping["data_clinical_sample.txt"] patient_synid = file_mapping["data_clinical_patient.txt"] cna_synid = file_mapping["data_CNA.txt"] - fusion_synid = file_mapping["data_fusions.txt"] + fusion_synid = file_mapping["data_sv.txt"] gene_synid = file_mapping["data_gene_matrix.txt"] maf_synid = file_mapping["data_mutations_extended.txt"] genomic_info_synid = file_mapping.get("genie_combined.bed") if genomic_info_synid is None: genomic_info_synid = file_mapping["genomic_information.txt"] - seg_synid = file_mapping.get("genie_public_data_cna_hg19.seg") - if seg_synid is None: - seg_synid = file_mapping["genie_private_data_cna_hg19.seg"] + seg_synid = file_mapping.get("data_cna_hg19.seg") assay_info_synid = file_mapping["assay_information.txt"] # Sample and patient column to cBioPortal mappings @@ -128,26 +140,26 @@ def patch_release_workflow( centers = [patient.split("-")[1] for patient in sampledf.PATIENT_ID] sampledf["CENTER"] = centers # Retract samples from SEQ_ASSAY_ID, CENTER and retract samples list - to_remove_seqassay_rows = sampledf["SEQ_ASSAY_ID"].isin(remove_seqassays) - sampledf = sampledf[~to_remove_seqassay_rows] - to_remove_center_rows = sampledf["CENTER"].isin(remove_centers) - sampledf = sampledf[~to_remove_center_rows] + # to_remove_seqassay_rows = sampledf["SEQ_ASSAY_ID"].isin(remove_seqassays) + # sampledf = sampledf[~to_remove_seqassay_rows] + # to_remove_center_rows = sampledf["CENTER"].isin(remove_centers) + # sampledf = sampledf[~to_remove_center_rows] to_remove_samples = 
sampledf["SAMPLE_ID"].isin(retracted_samplesdf.SAMPLE_ID) final_sampledf = sampledf[~to_remove_samples] # Check number of seq assay ids is the same after removal of samples # Must add to removal of seq assay list for gene panel removal - seq_assay_after = final_sampledf["SEQ_ASSAY_ID"].unique() - seq_assay_before = sampledf["SEQ_ASSAY_ID"].unique() - if len(seq_assay_after) != len(seq_assay_before): - remove_seqassays.extend( - seq_assay_before[~seq_assay_before.isin(seq_assay_after)].tolist() - ) + # seq_assay_after = final_sampledf["SEQ_ASSAY_ID"].unique() + # seq_assay_before = sampledf["SEQ_ASSAY_ID"].unique() + # if len(seq_assay_after) != len(seq_assay_before): + # remove_seqassays.extend( + # seq_assay_before[~seq_assay_before.isin(seq_assay_after)].tolist() + # ) # Check number of centers is the same after removal of samples # Must add to removal of seq assay list for gene panel removal - center_after = final_sampledf["CENTER"].unique() - center_before = sampledf["CENTER"].unique() - if len(center_after) != len(center_before): - remove_centers.extend(center_before[~center_before.isin(center_after)].tolist()) + # center_after = final_sampledf["CENTER"].unique() + # center_before = sampledf["CENTER"].unique() + # if len(center_after) != len(center_before): + # remove_centers.extend(center_before[~center_before.isin(center_after)].tolist()) del final_sampledf["CENTER"] @@ -187,26 +199,27 @@ def patch_release_workflow( ) store_file(syn, sample_path, new_release_synid, new_release) store_file(syn, patient_path, new_release_synid, new_release) - # Patch CNA file - cna_ent = syn.get(cna_synid, followLink=True) - cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") - cna_cols = ["Hugo_Symbol"] - cna_cols.extend(keep_samples.tolist()) - cna_cols_idx = cnadf.columns.isin(cna_cols) - if not cna_cols_idx.all(): - cnadf = cnadf[cnadf.columns[cna_cols_idx]] - cnatext = process_functions.removePandasDfFloat(cnadf) - cna_path = os.path.join( - tempdir, 
os.path.basename(cna_ent.path).replace(old_release, new_release) - ) - with open(cna_path, "w") as cna_file: - cna_file.write(cnatext) - store_file(syn, cna_path, new_release_synid, new_release) + # # Patch CNA file + # cna_ent = syn.get(cna_synid, followLink=True) + # cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") + # cna_cols = ["Hugo_Symbol"] + # cna_cols.extend(keep_samples.tolist()) + # cna_cols_idx = cnadf.columns.isin(cna_cols) + # if not cna_cols_idx.all(): + # cnadf = cnadf[cnadf.columns[cna_cols_idx]] + # cnatext = process_functions.removePandasDfFloat(cnadf) + # cna_path = os.path.join( + # tempdir, os.path.basename(cna_ent.path).replace(old_release, new_release) + # ) + # with open(cna_path, "w") as cna_file: + # cna_file.write(cnatext) + # store_file(syn, cna_path, new_release_synid, new_release) # Patch Fusion file fusion_ent = syn.get(fusion_synid, followLink=True) fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") # if not fusiondf.Tumor_Sample_Barcode.isin(keep_samples).all(): - fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] + # fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] + fusiondf = fusiondf[fusiondf['Sample_Id'].isin(keep_samples)] fusiontext = process_functions.removePandasDfFloat(fusiondf) fusion_path = os.path.join( tempdir, os.path.basename(fusion_ent.path).replace(old_release, new_release) @@ -218,7 +231,7 @@ def patch_release_workflow( seg_ent = syn.get(seg_synid, followLink=True) segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#") # if not segdf.ID.isin(keep_samples).all(): - segdf = segdf[segdf.ID.isin(keep_samples)] + segdf = segdf[segdf['ID'].isin(keep_samples)] segtext = process_functions.removePandasDfFloat(segdf) seg_path = os.path.join( tempdir, os.path.basename(seg_ent.path).replace(old_release, new_release) @@ -230,7 +243,7 @@ def patch_release_workflow( # Patch gene matrix file gene_ent = syn.get(gene_synid, followLink=True) genedf = 
pd.read_csv(gene_ent.path, sep="\t", comment="#") - genedf = genedf[genedf.SAMPLE_ID.isin(keep_samples)] + genedf = genedf[genedf['SAMPLE_ID'].isin(keep_samples)] genedf[genedf.isnull()] = "NA" gene_path = os.path.join( tempdir, os.path.basename(gene_ent.path).replace(old_release, new_release) @@ -253,11 +266,11 @@ def patch_release_workflow( # Patch genomic information file genome_info_ent = syn.get(genomic_info_synid, followLink=True) genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#") - keep_rows = [ - seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - for seq in genome_info_df["SEQ_ASSAY_ID"] - ] - genome_info_df = genome_info_df[keep_rows] + # keep_rows = [ + # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + # for seq in genome_info_df["SEQ_ASSAY_ID"] + # ] + # genome_info_df = genome_info_df[keep_rows] # Write genomic file genome_info_text = process_functions.removePandasDfFloat(genome_info_df) @@ -272,17 +285,17 @@ def patch_release_workflow( # Create cBioPortal gene panel and meta files for name in file_mapping: if name.startswith("data_gene_panel"): - seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") - if seq_name not in remove_seqassays: - gene_panel_ent = syn.get(file_mapping[name], followLink=True) - new_panel_path = os.path.join( - tempdir, - os.path.basename(gene_panel_ent.path).replace( - old_release, new_release - ), - ) - shutil.copyfile(gene_panel_ent.path, new_panel_path) - store_file(syn, new_panel_path, new_release_synid, new_release) + # seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") + # if seq_name not in remove_seqassays: + gene_panel_ent = syn.get(file_mapping[name], followLink=True) + new_panel_path = os.path.join( + tempdir, + os.path.basename(gene_panel_ent.path).replace( + old_release, new_release + ), + ) + shutil.copyfile(gene_panel_ent.path, new_panel_path) + store_file(syn, new_panel_path, new_release_synid, new_release) 
elif name.startswith("meta") or "_meta_" in name: meta_ent = syn.get(file_mapping[name], followLink=True) new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) @@ -292,11 +305,11 @@ def patch_release_workflow( # Patch assay information file assay_ent = syn.get(assay_info_synid, followLink=True) assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#") - keep_rows = [ - seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - for seq in assaydf["SEQ_ASSAY_ID"] - ] - assaydf = assaydf[keep_rows] + # keep_rows = [ + # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + # for seq in assaydf["SEQ_ASSAY_ID"] + # ] + # assaydf = assaydf[keep_rows] assay_text = process_functions.removePandasDfFloat(assaydf) assay_path = os.path.join( tempdir, os.path.basename(assay_ent.path).replace(old_release, new_release) @@ -319,10 +332,14 @@ def patch_release_workflow( tempdir_o.cleanup() # Update dashboard tables - # You may have to execute this twice in case the file view isn't updated - database_mapping = syn.tableQuery(f"select * from {database_mapping_synid} limit 1") + # Data base mapping synid + if production: + database_mapping_synid = "syn10967259" + else: + database_mapping_synid = "syn12094210" database_mapping = syn.tableQuery(f"select * from {database_mapping_synid}") database_mappingdf = database_mapping.asDataFrame() + # You may have to execute this twice in case the file view isn't updated dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release) From b6564dbce87ed195a56a3ccce8e93e9e8158d109 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 21:27:56 -0700 Subject: [PATCH 05/39] Add staging option --- scripts/patch_release/patch.py | 56 +++++++++++----------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index cabe7af..4288ff1 100644 --- a/scripts/patch_release/patch.py +++ 
b/scripts/patch_release/patch.py @@ -62,28 +62,6 @@ def store_file( syn.store(new_ent) -def generate_dashboard_html(genie_version, staging=False): - """Generates dashboard html writeout that gets uploaded to the - release folder - - Args: - genie_version: GENIE release - staging: Use staging files. Default is False - - """ - markdown_render_cmd = [ - "Rscript", - "/root/Genie/R/dashboard_markdown_generator.R", - genie_version, - "--template_path", - "/root/Genie/templates/dashboardTemplate.Rmd", - ] - - if staging: - markdown_render_cmd.append("--staging") - subprocess.check_call(markdown_render_cmd) - - def patch_release_workflow( release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False ): @@ -93,6 +71,8 @@ def patch_release_workflow( variables need to be changed to reflect different Synapse ids per release. """ syn = synapseclient.login() + # Update dashboard tables + # Data base mapping synid # remove_centers = [] # remove_seqassays = [] @@ -199,21 +179,21 @@ def patch_release_workflow( ) store_file(syn, sample_path, new_release_synid, new_release) store_file(syn, patient_path, new_release_synid, new_release) - # # Patch CNA file - # cna_ent = syn.get(cna_synid, followLink=True) - # cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") - # cna_cols = ["Hugo_Symbol"] - # cna_cols.extend(keep_samples.tolist()) - # cna_cols_idx = cnadf.columns.isin(cna_cols) - # if not cna_cols_idx.all(): - # cnadf = cnadf[cnadf.columns[cna_cols_idx]] - # cnatext = process_functions.removePandasDfFloat(cnadf) - # cna_path = os.path.join( - # tempdir, os.path.basename(cna_ent.path).replace(old_release, new_release) - # ) - # with open(cna_path, "w") as cna_file: - # cna_file.write(cnatext) - # store_file(syn, cna_path, new_release_synid, new_release) + # Patch CNA file + cna_ent = syn.get(cna_synid, followLink=True) + cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") + cna_cols = ["Hugo_Symbol"] + 
cna_cols.extend(keep_samples.tolist()) + cna_cols_idx = cnadf.columns.isin(cna_cols) + if not cna_cols_idx.all(): + cnadf = cnadf[cnadf.columns[cna_cols_idx]] + cnatext = process_functions.removePandasDfFloat(cnadf) + cna_path = os.path.join( + tempdir, os.path.basename(cna_ent.path).replace(old_release, new_release) + ) + with open(cna_path, "w") as cna_file: + cna_file.write(cnatext) + store_file(syn, cna_path, new_release_synid, new_release) # Patch Fusion file fusion_ent = syn.get(fusion_synid, followLink=True) fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") @@ -340,7 +320,7 @@ def patch_release_workflow( database_mapping = syn.tableQuery(f"select * from {database_mapping_synid}") database_mappingdf = database_mapping.asDataFrame() # You may have to execute this twice in case the file view isn't updated - dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release) + dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release, staging=not production) def main(): From 0f3ccc966663852a96faae58dad23c5c6e002ace Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 21:28:30 -0700 Subject: [PATCH 06/39] Lint --- patch_release_main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/patch_release_main.nf b/patch_release_main.nf index 5ef067d..9b41bc0 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -6,8 +6,6 @@ nextflow.enable.dsl = 2 include { patch_release } from './modules/patch_release' include { create_dashboard_html } from './modules/create_dashboard_html' - - params.release_synid = "syn53170398" // 15.4-consortium params.new_release_synid = "syn62069187" // 15.6-consortium (in staging) params.retracted_sample_synid = "syn54082015" // 16.3-consortium samples_to_retract.csv From 10d7f05f676423717b25c613897b2d1fb641df4a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 21:37:55 -0700 Subject: [PATCH 07/39] Update process config --- nextflow.config | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/nextflow.config b/nextflow.config index 78cde0b..5a104e9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,6 +43,10 @@ profiles { memory = 32.GB cpus = 4 } + withName: create_dashboard_html { + memory = 32.GB + cpus = 4 + } withName: create_public_release { memory = 16.GB cpus = 4 From 8613e0db2bf42c0cf728b89e12f7a860fddbdbed Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 21:43:08 -0700 Subject: [PATCH 08/39] Add missing module --- patch_release_main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/patch_release_main.nf b/patch_release_main.nf index 9b41bc0..62d8df6 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -4,6 +4,7 @@ nextflow.enable.dsl = 2 // IMPORT MODULES include { patch_release } from './modules/patch_release' +include { create_data_guide } from './modules/create_data_guide' include { create_dashboard_html } from './modules/create_dashboard_html' params.release_synid = "syn53170398" // 15.4-consortium From bcdb828c3c86f4538104ab24884238fcdbf4a433 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 22:13:03 -0700 Subject: [PATCH 09/39] Add channel values --- patch_release_main.nf | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/patch_release_main.nf b/patch_release_main.nf index 62d8df6..8e7c546 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -16,7 +16,12 @@ params.project_id = "syn22033066" // staging project params.production = false // production is false workflow { - patch_release(params.release_synid, params.new_release_synid, params.retracted_sample_synid) - create_dashboard_html(patch_release.out, params.release, params.production) - create_data_guide(patch_release.out, params.release, params.project_id) + ch_release_synid = Channel.value(params.release_synid) + ch_new_release_synid = Channel.value(params.new_release_synid) + ch_retracted_sample_synid = Channel.value(params.retracted_sample_synid) + ch_release = 
Channel.value(params.release) + ch_project_id = Channel.value(params.project_id) + patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid) + create_dashboard_html(patch_release.out, ch_release, params.production) + create_data_guide(patch_release.out, ch_release, ch_project_id) } From e8b08f8c0e16636d461801d931bbceb2e9777e88 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 22:22:04 -0700 Subject: [PATCH 10/39] Patch --- modules/patch_release.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/patch_release.nf b/modules/patch_release.nf index 913883f..668c2c4 100644 --- a/modules/patch_release.nf +++ b/modules/patch_release.nf @@ -14,7 +14,7 @@ process patch_release { script: """ python3 /patch_release/patch.py \ - --release_synid $release_syn_id \ + --release_synid $release_synid \ --new_release_synid $new_release_synid \ --retracted-sample_synid $retracted_sample_synid """ From 20b90cbbed16d794cce6b75ac02ba5d64f532c57 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 22:37:14 -0700 Subject: [PATCH 11/39] Positional arguments --- modules/patch_release.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/patch_release.nf b/modules/patch_release.nf index 668c2c4..8ff76c6 100644 --- a/modules/patch_release.nf +++ b/modules/patch_release.nf @@ -14,8 +14,8 @@ process patch_release { script: """ python3 /patch_release/patch.py \ - --release_synid $release_synid \ - --new_release_synid $new_release_synid \ - --retracted-sample_synid $retracted_sample_synid + $release_synid \ + $new_release_synid \ + $retracted_sample_synid """ } From 946afde2ca76538eb4da5a945923cd1be582c5a3 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Thu, 15 Aug 2024 22:54:23 -0700 Subject: [PATCH 12/39] Bump memory for patch release --- nextflow.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nextflow.config b/nextflow.config index 5a104e9..a32a8ff 100644 --- a/nextflow.config +++ 
b/nextflow.config @@ -39,6 +39,10 @@ profiles { memory = 16.GB cpus = 4 } + withName: patch_release { + memory = 16.GB + cpus = 4 + } withName: create_consortium_release { memory = 32.GB cpus = 4 From 38093029341efc596f971705f24d02daca3b5f67 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 16 Aug 2024 00:46:42 -0700 Subject: [PATCH 13/39] Remove quote --- modules/create_dashboard_html.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/create_dashboard_html.nf b/modules/create_dashboard_html.nf index f58abaf..4707cdb 100644 --- a/modules/create_dashboard_html.nf +++ b/modules/create_dashboard_html.nf @@ -16,12 +16,12 @@ process create_dashboard_html { script: if (production) { """ - Rscript ./R/dashboard_markdown_generator.R" $release \ + Rscript ./R/dashboard_markdown_generator.R $release \ --template_path ./templates/dashboardTemplate.Rmd """ } else { """ - Rscript ./R/dashboard_markdown_generator.R" $release \ + Rscript ./R/dashboard_markdown_generator.R $release \ --template_path ./templates/dashboardTemplate.Rmd \ --staging """ From 34ca5c22d3a9695240c759a21092842478c9cd97 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 16 Aug 2024 07:21:11 -0700 Subject: [PATCH 14/39] cd into /root/Genie --- modules/create_dashboard_html.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/create_dashboard_html.nf b/modules/create_dashboard_html.nf index 4707cdb..dcdf9c0 100644 --- a/modules/create_dashboard_html.nf +++ b/modules/create_dashboard_html.nf @@ -16,11 +16,13 @@ process create_dashboard_html { script: if (production) { """ + cd /root/Genie Rscript ./R/dashboard_markdown_generator.R $release \ --template_path ./templates/dashboardTemplate.Rmd """ } else { """ + cd /root/Genie Rscript ./R/dashboard_markdown_generator.R $release \ --template_path ./templates/dashboardTemplate.Rmd \ --staging From ea22341902cd313a23a724b9ef39f4e0b681dac6 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 16 Aug 2024 07:51:57 -0700 
Subject: [PATCH 15/39] Add readme --- scripts/patch_release/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 scripts/patch_release/README.md diff --git a/scripts/patch_release/README.md b/scripts/patch_release/README.md new file mode 100644 index 0000000..c686326 --- /dev/null +++ b/scripts/patch_release/README.md @@ -0,0 +1,6 @@ +# Public Patch Release + +All sample / patient retractions have to occur within 3 months of the public release. If problems are found with older public releases, we will not patch, but will add information to the release notes. The patch releases are not meant to resolve data issues, but just for removing samples that are retracted consent. + +1. Create another consortium release +1. Generate the data guide, dashboard html, and release notes From 318b50b7672166c517cff98ebf82d24a421e466b Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 12:37:35 -0700 Subject: [PATCH 16/39] Compare two folders that should contain identical data --- scripts/patch_release/compare_patch.py | 72 ++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 scripts/patch_release/compare_patch.py diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py new file mode 100644 index 0000000..ddd45fd --- /dev/null +++ b/scripts/patch_release/compare_patch.py @@ -0,0 +1,72 @@ +""" +The command ran: +python patch.py syn53170398 syn62069187 syn54082015 +In leu of lack of unit or integration tests, the above command replicates the + this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging) + +""" +import synapseclient +import synapseutils as synu + +def _get_file_dict(syn: synapseclient.Synapse, synid: str) -> dict[str, str]: + """ + This function generates a dictionary of files from a Synapse ID. + + Args: + syn (synapseclient.Synapse): A Synapse client object. + synid (str): The Synapse ID of the files to retrieve. 
+ + Returns: + dict[str, str]: A dictionary mapping Synapse IDs to file names. + """ + all_files = synu.walk(syn, synid) + file_list = {} + for _, _, files in all_files: + files = {name: syn.get(synid, downloadFile=False) for name, synid in files} + file_list.update(files) + return file_list + + +def compare_releases(original_synid: str, new_synid: str): + """ + This function compares two folders that should have identifical files + with each file's MD5s + + Args: + original_synid (str): The Synapse ID of the original release. + new_synid (str): The Synapse ID of the new release. + + Returns: + tuple: A tuple containing the original release entity, the new release entity, + and a list of retracted entities. + """ + + # Log in to Synapse + syn = synapseclient.login() + + # Get the entities for the original and new releases + # original_ent = syn.get(original_synid) + # original_files = synu.walk(original_synid) + original_file_list = _get_file_dict(syn, original_synid) + # new_ent = syn.get(new_synid) + # new_files = synu.walk(new_synid) + new_file_list = _get_file_dict(syn, new_synid) + + # Check that the two folders have the same number of files + print(len(original_file_list)) + print(len(new_file_list)) + assert len(original_file_list) == len(new_file_list), "Folders have different number of files" + print("Number of files: ", len(original_file_list)) + for filename in original_file_list.keys(): + if new_file_list.get(filename) is None: + print("File not found in new folder: ", filename) + else: + if original_file_list[filename].md5 != new_file_list[filename].md5: + print("Files are different: ", filename) + + + +if __name__ == "__main__": + original_synid = "syn55146141" + new_synid = "syn62069187" + compare_releases(original_synid, new_synid) From 3971366473428b33a4953a00f4e1cf98d0f52631 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 13:55:04 -0700 Subject: [PATCH 17/39] Add production --- modules/patch_release.nf | 24 ++++++++++++++++++------ 
patch_release_main.nf | 2 +- scripts/patch_release/compare_patch.py | 10 ++++++---- scripts/patch_release/patch.py | 8 +++++++- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/modules/patch_release.nf b/modules/patch_release.nf index 8ff76c6..82333f7 100644 --- a/modules/patch_release.nf +++ b/modules/patch_release.nf @@ -7,15 +7,27 @@ process patch_release { val release_synid val new_release_synid val retracted_sample_synid + val production output: stdout script: - """ - python3 /patch_release/patch.py \ - $release_synid \ - $new_release_synid \ - $retracted_sample_synid - """ + if (production) { + """ + python3 /patch_release/patch.py \ + $release_synid \ + $new_release_synid \ + $retracted_sample_synid \ + --production + """ + } + else { + """ + python3 /patch_release/patch.py \ + $release_synid \ + $new_release_synid \ + $retracted_sample_synid \ + """ + } } diff --git a/patch_release_main.nf b/patch_release_main.nf index 8e7c546..1b9c638 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -21,7 +21,7 @@ workflow { ch_retracted_sample_synid = Channel.value(params.retracted_sample_synid) ch_release = Channel.value(params.release) ch_project_id = Channel.value(params.project_id) - patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid) + patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid, params.production) create_dashboard_html(patch_release.out, ch_release, params.production) create_data_guide(patch_release.out, ch_release, ch_project_id) } diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index ddd45fd..5733e18 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -53,10 +53,12 @@ def compare_releases(original_synid: str, new_synid: str): new_file_list = _get_file_dict(syn, new_synid) # Check that the two folders have the same number of files - print(len(original_file_list)) - 
print(len(new_file_list)) - assert len(original_file_list) == len(new_file_list), "Folders have different number of files" - print("Number of files: ", len(original_file_list)) + print("Number of files in old folder: ", len(original_file_list)) + print("Number of files in new folder: ", len(new_file_list)) + for filename in new_file_list.keys(): + if original_file_list.get(filename) is None: + print("File not found in old folder: ", filename) + for filename in original_file_list.keys(): if new_file_list.get(filename) is None: print("File not found in new folder: ", filename) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 4288ff1..7ab96f1 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -341,13 +341,19 @@ def main(): type=str, help="The Synapse Id of the samples_to_retract.csv file generated in the current 3rd consortium release.", ) - + # this parameter is mainly for the dashboard upload step + parser.add_argument( + "--production", + action="store_true", + help="Run production workload or it will default to the staging workload" + ) args = parser.parse_args() patch_release_workflow( release_synid=args.release_synid, new_release_synid=args.new_release_synid, retracted_sample_synid=args.retracted_sample_synid, + production=arg.production ) From 00f22572916e5f6c56af1d1aa65f9c99001ea75e Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 14:02:24 -0700 Subject: [PATCH 18/39] Since project_id is specified, use project_id to determine if production workload to reduce parameters --- patch_release_main.nf | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/patch_release_main.nf b/patch_release_main.nf index 1b9c638..1c21e6b 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -13,7 +13,13 @@ params.retracted_sample_synid = "syn54082015" // 16.3-consortium samples_to_ret params.release = "15.6-consortium" // project_id = "syn7208886" params.project_id 
= "syn22033066" // staging project -params.production = false // production is false +if (params.project_id == "syn22033066") { + is_production = false +} else if (params.project_id == "syn3380222") { + is_production = true +} else { + exit 1, "project_id must be syn22033066 or syn3380222" +} workflow { ch_release_synid = Channel.value(params.release_synid) @@ -21,7 +27,7 @@ workflow { ch_retracted_sample_synid = Channel.value(params.retracted_sample_synid) ch_release = Channel.value(params.release) ch_project_id = Channel.value(params.project_id) - patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid, params.production) - create_dashboard_html(patch_release.out, ch_release, params.production) + patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid, is_production) + create_dashboard_html(patch_release.out, ch_release, is_production) create_data_guide(patch_release.out, ch_release, ch_project_id) } From 5414262bcf0bbd158d016c6323607124cc6d6f05 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 14:19:38 -0700 Subject: [PATCH 19/39] Fix args --- scripts/patch_release/patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 7ab96f1..e2c8207 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -353,7 +353,7 @@ def main(): release_synid=args.release_synid, new_release_synid=args.new_release_synid, retracted_sample_synid=args.retracted_sample_synid, - production=arg.production + production=args.production ) From b6e191943108c5cf4b1c7a357be87babb1097adc Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 14:20:25 -0700 Subject: [PATCH 20/39] Remove subprocess --- scripts/patch_release/patch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index e2c8207..fead985 100644 --- a/scripts/patch_release/patch.py +++ 
b/scripts/patch_release/patch.py @@ -10,7 +10,6 @@ import argparse import os import shutil -import subprocess import tempfile import pandas as pd From 72da9d492d08866f0e38fd0b35754905a444103b Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 15:00:30 -0700 Subject: [PATCH 21/39] Modularize patch code by splitting off patch_file function --- scripts/patch_release/compare_patch.py | 5 +- scripts/patch_release/patch.py | 231 +++++++++++++++---------- 2 files changed, 141 insertions(+), 95 deletions(-) diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index 5733e18..104a096 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -2,8 +2,11 @@ The command ran: python patch.py syn53170398 syn62069187 syn54082015 In leu of lack of unit or integration tests, the above command replicates the - this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging) +this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging) +python compare_patch.py + +TODO: Add argparse """ import synapseclient import synapseutils as synu diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index fead985..ded8f26 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -40,9 +40,27 @@ def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> meta_text = meta_text.replace(old_version, new_version) meta.write(meta_text) +def _filter_tsv(filepath: str, keep_values: pd.Series, column: str) -> pd.DataFrame: + """ + Patches a tsv in Synapse by filtering out rows based on the provided keep values. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + synid (str): The Synapse ID of the entity to be patched. + keep_values (pd.Series): The values to keep in the dataframe. + column (str): The column name to filter on. + Returns: + pd.DataFrame: The patched dataframe. 
+ """ + df = pd.read_csv(filepath, sep="\t", comment="#") + # if not segdf.ID.isin(keep_samples).all(): + df = df[df[column].isin(keep_values)] + return df + +# TODO remove new_release parameter soon def store_file( - syn: synapseclient.Synapse, new_path: str, new_release_synid: str, release_name: str + syn: synapseclient.Synapse, new_path: str, new_release_synid: str, new_release: str = None ) -> None: """ Stores a file into Synapse. @@ -51,16 +69,45 @@ def store_file( syn (synapseclient.Synapse): The Synapse client object. new_path (str): The path to the file to be stored. new_release_synid (str): The Synapse ID of the release folder where the file will be stored. - release_name (str): The name of the release. Returns: None """ - ent_name = os.path.basename(new_path.replace(f"_{release_name}", "")) - new_ent = synapseclient.File(new_path, name=ent_name, parentId=new_release_synid) + new_ent = synapseclient.File(new_path, parentId=new_release_synid) syn.store(new_ent) +def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release_synid: str, keep_values: pd.Series, column: str) -> str: + """ + Patches a file in Synapse by filtering out rows based on the provided keep values. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + synid (str): The Synapse ID of the entity to be patched. + tempdir (str): The temporary directory to store the patched file. + new_release_synid (str): The Synapse ID of the release folder where the patched file will be stored. + keep_values (pd.Series): The values to keep in the dataframe. + column (str): The column name to filter on. 
+ + Returns: + None + """ + entity = syn.get(synid, followLink=True) + df = _filter_tsv(path=entity.path, keep_values=keep_values, column=column) + # Specific filtering fro the data gene matrix file because the string NA must + # replace the blank values + if entity.name == "data_gene_matrix.txt": + df[df.isnull()] = "NA" + # df = pd.read_csv(entity.path, sep="\t", comment="#") + # df = df[df[column].isin(keep_values)] + dftext = process_functions.removePandasDfFloat(df) + new_path = os.path.join(tempdir, os.path.basename(entity.path)) + with open(new_path, "w") as o_file: + o_file.write(dftext) + store_file(syn, new_path, new_release_synid) + # TODO: return a named tuple so its not just returning the path + return new_path + def patch_release_workflow( release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False ): @@ -144,6 +191,7 @@ def patch_release_workflow( keep_samples = final_sampledf["SAMPLE_ID"].drop_duplicates() keep_patients = final_sampledf["PATIENT_ID"].drop_duplicates() + keep_seq_assay_id = final_sampledf["SEQ_ASSAY_ID"].drop_duplicates() patient_ent = syn.get(patient_synid, followLink=True) patientdf = pd.read_csv(patient_ent.path, sep="\t", comment="#") @@ -161,12 +209,8 @@ def patch_release_workflow( full_clin_df.to_csv(clinical_path, sep="\t", index=False) store_file(syn, clinical_path, new_release_synid, new_release) - sample_path = os.path.join( - tempdir, os.path.basename(sample_ent.path).replace(old_release, new_release) - ) - patient_path = os.path.join( - tempdir, os.path.basename(patient_ent.path).replace(old_release, new_release) - ) + sample_path = os.path.join(tempdir, os.path.basename(sample_ent.path)) + patient_path = os.path.join(tempdir, os.path.basename(patient_ent.path)) process_functions.addClinicalHeaders( clinicaldf, @@ -187,92 +231,90 @@ def patch_release_workflow( if not cna_cols_idx.all(): cnadf = cnadf[cnadf.columns[cna_cols_idx]] cnatext = process_functions.removePandasDfFloat(cnadf) - 
cna_path = os.path.join( - tempdir, os.path.basename(cna_ent.path).replace(old_release, new_release) - ) + cna_path = os.path.join(tempdir, os.path.basename(cna_ent.path)) with open(cna_path, "w") as cna_file: cna_file.write(cnatext) store_file(syn, cna_path, new_release_synid, new_release) # Patch Fusion file - fusion_ent = syn.get(fusion_synid, followLink=True) - fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") - # if not fusiondf.Tumor_Sample_Barcode.isin(keep_samples).all(): - # fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] - fusiondf = fusiondf[fusiondf['Sample_Id'].isin(keep_samples)] - fusiontext = process_functions.removePandasDfFloat(fusiondf) - fusion_path = os.path.join( - tempdir, os.path.basename(fusion_ent.path).replace(old_release, new_release) - ) - with open(fusion_path, "w") as fusion_file: - fusion_file.write(fusiontext) - store_file(syn, fusion_path, new_release_synid, new_release) + patch_file(syn, fusion_synid, tempdir, new_release_synid, keep_samples, "Sample_Id") + # fusion_ent = syn.get(fusion_synid, followLink=True) + # fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") + # # if not fusiondf.Tumor_Sample_Barcode.isin(keep_samples).all(): + # # fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] + # fusiondf = fusiondf[fusiondf['Sample_Id'].isin(keep_samples)] + # fusiontext = process_functions.removePandasDfFloat(fusiondf) + # fusion_path = os.path.join( + # tempdir, os.path.basename(fusion_ent.path).replace(old_release, new_release) + # ) + # with open(fusion_path, "w") as fusion_file: + # fusion_file.write(fusiontext) + # store_file(syn, fusion_path, new_release_synid, new_release) # Patch SEG file - seg_ent = syn.get(seg_synid, followLink=True) - segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#") - # if not segdf.ID.isin(keep_samples).all(): - segdf = segdf[segdf['ID'].isin(keep_samples)] - segtext = process_functions.removePandasDfFloat(segdf) - seg_path = 
os.path.join( - tempdir, os.path.basename(seg_ent.path).replace(old_release, new_release) - ) - with open(seg_path, "w") as seg_file: - seg_file.write(segtext) - store_file(syn, seg_path, new_release_synid, new_release) + patch_file(syn, seg_synid, tempdir, new_release_synid, keep_samples, "ID") + # seg_ent = syn.get(seg_synid, followLink=True) + # segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#") + # # if not segdf.ID.isin(keep_samples).all(): + # segdf = segdf[segdf['ID'].isin(keep_samples)] + # segtext = process_functions.removePandasDfFloat(segdf) + # seg_path = os.path.join( + # tempdir, os.path.basename(seg_ent.path).replace(old_release, new_release) + # ) + # with open(seg_path, "w") as seg_file: + # seg_file.write(segtext) + # store_file(syn, seg_path, new_release_synid, new_release) # Patch gene matrix file - gene_ent = syn.get(gene_synid, followLink=True) - genedf = pd.read_csv(gene_ent.path, sep="\t", comment="#") - genedf = genedf[genedf['SAMPLE_ID'].isin(keep_samples)] - genedf[genedf.isnull()] = "NA" - gene_path = os.path.join( - tempdir, os.path.basename(gene_ent.path).replace(old_release, new_release) - ) - genedf.to_csv(gene_path, sep="\t", index=False) - store_file(syn, gene_path, new_release_synid, new_release) + patch_file(syn, gene_synid, tempdir, new_release_synid, keep_samples, "SAMPLE_ID") + # gene_ent = syn.get(gene_synid, followLink=True) + # genedf = pd.read_csv(gene_ent.path, sep="\t", comment="#") + # genedf = genedf[genedf['SAMPLE_ID'].isin(keep_samples)] + # genedf[genedf.isnull()] = "NA" + # gene_path = os.path.join(tempdir, os.path.basename(gene_ent.path)) + # genedf.to_csv(gene_path, sep="\t", index=False) + # store_file(syn, gene_path, new_release_synid, new_release) # Patch maf file - maf_ent = syn.get(maf_synid, followLink=True) - mafdf = pd.read_csv(maf_ent.path, sep="\t", comment="#") - mafdf = mafdf[mafdf["Tumor_Sample_Barcode"].isin(keep_samples)] - maftext = process_functions.removePandasDfFloat(mafdf) - maf_path = 
os.path.join( - tempdir, os.path.basename(maf_ent.path).replace(old_release, new_release) - ) - with open(maf_path, "w") as maf_file: - maf_file.write(maftext) - store_file(syn, maf_path, new_release_synid, new_release) + patch_file(syn, maf_synid, tempdir, new_release_synid, keep_samples, "Tumor_Sample_Barcode") + # maf_ent = syn.get(maf_synid, followLink=True) + # mafdf = pd.read_csv(maf_ent.path, sep="\t", comment="#") + # mafdf = mafdf[mafdf["Tumor_Sample_Barcode"].isin(keep_samples)] + # maftext = process_functions.removePandasDfFloat(mafdf) + # maf_path = os.path.join( + # tempdir, os.path.basename(maf_ent.path).replace(old_release, new_release) + # ) + # with open(maf_path, "w") as maf_file: + # maf_file.write(maftext) + # store_file(syn, maf_path, new_release_synid, new_release) # Patch genomic information file # clinicalReported column needs to be added # Patch genomic information file - genome_info_ent = syn.get(genomic_info_synid, followLink=True) - genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#") - # keep_rows = [ - # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - # for seq in genome_info_df["SEQ_ASSAY_ID"] - # ] - # genome_info_df = genome_info_df[keep_rows] - - # Write genomic file - genome_info_text = process_functions.removePandasDfFloat(genome_info_df) - genome_info_path = os.path.join( - tempdir, - os.path.basename(genome_info_ent.path).replace(old_release, new_release), - ) - with open(genome_info_path, "w") as bed_file: - bed_file.write(genome_info_text) - store_file(syn, genome_info_path, new_release_synid, new_release) + patch_file(syn, genomic_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") + # genome_info_ent = syn.get(genomic_info_synid, followLink=True) + # genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#") + # # keep_rows = [ + # # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + # # for seq in 
genome_info_df["SEQ_ASSAY_ID"] + # # ] + # # genome_info_df = genome_info_df[keep_rows] + + # # Write genomic file + # genome_info_text = process_functions.removePandasDfFloat(genome_info_df) + # genome_info_path = os.path.join( + # tempdir, + # os.path.basename(genome_info_ent.path).replace(old_release, new_release), + # ) + + # with open(genome_info_path, "w") as bed_file: + # bed_file.write(genome_info_text) + # store_file(syn, genome_info_path, new_release_synid, new_release) # Create cBioPortal gene panel and meta files for name in file_mapping: if name.startswith("data_gene_panel"): - # seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") - # if seq_name not in remove_seqassays: + seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") + if seq_name not in keep_seq_assay_id: + continue gene_panel_ent = syn.get(file_mapping[name], followLink=True) - new_panel_path = os.path.join( - tempdir, - os.path.basename(gene_panel_ent.path).replace( - old_release, new_release - ), - ) + new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) shutil.copyfile(gene_panel_ent.path, new_panel_path) store_file(syn, new_panel_path, new_release_synid, new_release) elif name.startswith("meta") or "_meta_" in name: @@ -282,20 +324,21 @@ def patch_release_workflow( revise_meta_file(new_meta_path, old_release, new_release) store_file(syn, new_meta_path, new_release_synid, new_release) # Patch assay information file - assay_ent = syn.get(assay_info_synid, followLink=True) - assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#") - # keep_rows = [ - # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - # for seq in assaydf["SEQ_ASSAY_ID"] - # ] - # assaydf = assaydf[keep_rows] - assay_text = process_functions.removePandasDfFloat(assaydf) - assay_path = os.path.join( - tempdir, os.path.basename(assay_ent.path).replace(old_release, new_release) - ) - with open(assay_path, "w") as assay_file: - 
assay_file.write(assay_text) - store_file(syn, assay_path, new_release_synid, new_release) + assay_path = patch_file(syn, assay_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") + # assay_ent = syn.get(assay_info_synid, followLink=True) + # assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#") + # # keep_rows = [ + # # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) + # # for seq in assaydf["SEQ_ASSAY_ID"] + # # ] + # # assaydf = assaydf[keep_rows] + # assay_text = process_functions.removePandasDfFloat(assaydf) + # assay_path = os.path.join( + # tempdir, os.path.basename(assay_ent.path).replace(old_release, new_release) + # ) + # with open(assay_path, "w") as assay_file: + # assay_file.write(assay_text) + # store_file(syn, assay_path, new_release_synid, new_release) # Create cBioPortal case lists case_list_path = os.path.join(tempdir, "case_lists") if not os.path.exists(case_list_path): From 89c7be55ae5f5197c3430e4c4f1a4ce1eb131761 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 15:09:40 -0700 Subject: [PATCH 22/39] Comment out code and shuffle things around --- scripts/patch_release/patch.py | 116 +++++++-------------------------- 1 file changed, 24 insertions(+), 92 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index ded8f26..6989fa9 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -220,8 +220,8 @@ def patch_release_workflow( sample_path, patient_path, ) - store_file(syn, sample_path, new_release_synid, new_release) - store_file(syn, patient_path, new_release_synid, new_release) + store_file(syn, sample_path, new_release_synid) + store_file(syn, patient_path, new_release_synid) # Patch CNA file cna_ent = syn.get(cna_synid, followLink=True) cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") @@ -234,79 +234,39 @@ def patch_release_workflow( cna_path = os.path.join(tempdir, os.path.basename(cna_ent.path)) with 
open(cna_path, "w") as cna_file: cna_file.write(cnatext) - store_file(syn, cna_path, new_release_synid, new_release) + store_file(syn, cna_path, new_release_synid) + # Patch Fusion file patch_file(syn, fusion_synid, tempdir, new_release_synid, keep_samples, "Sample_Id") - # fusion_ent = syn.get(fusion_synid, followLink=True) - # fusiondf = pd.read_csv(fusion_ent.path, sep="\t", comment="#") - # # if not fusiondf.Tumor_Sample_Barcode.isin(keep_samples).all(): - # # fusiondf = fusiondf[fusiondf.Tumor_Sample_Barcode.isin(keep_samples)] - # fusiondf = fusiondf[fusiondf['Sample_Id'].isin(keep_samples)] - # fusiontext = process_functions.removePandasDfFloat(fusiondf) - # fusion_path = os.path.join( - # tempdir, os.path.basename(fusion_ent.path).replace(old_release, new_release) - # ) - # with open(fusion_path, "w") as fusion_file: - # fusion_file.write(fusiontext) - # store_file(syn, fusion_path, new_release_synid, new_release) + # Patch SEG file patch_file(syn, seg_synid, tempdir, new_release_synid, keep_samples, "ID") - # seg_ent = syn.get(seg_synid, followLink=True) - # segdf = pd.read_csv(seg_ent.path, sep="\t", comment="#") - # # if not segdf.ID.isin(keep_samples).all(): - # segdf = segdf[segdf['ID'].isin(keep_samples)] - # segtext = process_functions.removePandasDfFloat(segdf) - # seg_path = os.path.join( - # tempdir, os.path.basename(seg_ent.path).replace(old_release, new_release) - # ) - # with open(seg_path, "w") as seg_file: - # seg_file.write(segtext) - # store_file(syn, seg_path, new_release_synid, new_release) # Patch gene matrix file patch_file(syn, gene_synid, tempdir, new_release_synid, keep_samples, "SAMPLE_ID") - # gene_ent = syn.get(gene_synid, followLink=True) - # genedf = pd.read_csv(gene_ent.path, sep="\t", comment="#") - # genedf = genedf[genedf['SAMPLE_ID'].isin(keep_samples)] - # genedf[genedf.isnull()] = "NA" - # gene_path = os.path.join(tempdir, os.path.basename(gene_ent.path)) - # genedf.to_csv(gene_path, sep="\t", index=False) - # 
store_file(syn, gene_path, new_release_synid, new_release) + # Patch maf file patch_file(syn, maf_synid, tempdir, new_release_synid, keep_samples, "Tumor_Sample_Barcode") - # maf_ent = syn.get(maf_synid, followLink=True) - # mafdf = pd.read_csv(maf_ent.path, sep="\t", comment="#") - # mafdf = mafdf[mafdf["Tumor_Sample_Barcode"].isin(keep_samples)] - # maftext = process_functions.removePandasDfFloat(mafdf) - # maf_path = os.path.join( - # tempdir, os.path.basename(maf_ent.path).replace(old_release, new_release) - # ) - # with open(maf_path, "w") as maf_file: - # maf_file.write(maftext) - # store_file(syn, maf_path, new_release_synid, new_release) - # Patch genomic information file - # clinicalReported column needs to be added - # Patch genomic information file + # Patch genomic information file patch_file(syn, genomic_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") - # genome_info_ent = syn.get(genomic_info_synid, followLink=True) - # genome_info_df = pd.read_csv(genome_info_ent.path, sep="\t", comment="#") - # # keep_rows = [ - # # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - # # for seq in genome_info_df["SEQ_ASSAY_ID"] - # # ] - # # genome_info_df = genome_info_df[keep_rows] - - # # Write genomic file - # genome_info_text = process_functions.removePandasDfFloat(genome_info_df) - # genome_info_path = os.path.join( - # tempdir, - # os.path.basename(genome_info_ent.path).replace(old_release, new_release), - # ) - - # with open(genome_info_path, "w") as bed_file: - # bed_file.write(genome_info_text) - # store_file(syn, genome_info_path, new_release_synid, new_release) + + # Patch assay information file + assay_path = patch_file(syn, assay_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") + + # Create cBioPortal case lists + case_list_path = os.path.join(tempdir, "case_lists") + if not os.path.exists(case_list_path): + os.mkdir(case_list_path) + create_case_lists.main(clinical_path, 
assay_path, case_list_path, "genie_private") + + case_list_files = os.listdir(case_list_path) + + for case_filename in case_list_files: + # if case_filename in case_file_synids: + case_path = os.path.join(case_list_path, case_filename) + store_file(syn, case_path, case_list_folder_synid, new_release) + # Create cBioPortal gene panel and meta files for name in file_mapping: if name.startswith("data_gene_panel"): @@ -323,34 +283,6 @@ def patch_release_workflow( shutil.copyfile(meta_ent.path, new_meta_path) revise_meta_file(new_meta_path, old_release, new_release) store_file(syn, new_meta_path, new_release_synid, new_release) - # Patch assay information file - assay_path = patch_file(syn, assay_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") - # assay_ent = syn.get(assay_info_synid, followLink=True) - # assaydf = pd.read_csv(assay_ent.path, sep="\t", comment="#") - # # keep_rows = [ - # # seq not in remove_seqassays and not seq.startswith(tuple(remove_centers)) - # # for seq in assaydf["SEQ_ASSAY_ID"] - # # ] - # # assaydf = assaydf[keep_rows] - # assay_text = process_functions.removePandasDfFloat(assaydf) - # assay_path = os.path.join( - # tempdir, os.path.basename(assay_ent.path).replace(old_release, new_release) - # ) - # with open(assay_path, "w") as assay_file: - # assay_file.write(assay_text) - # store_file(syn, assay_path, new_release_synid, new_release) - # Create cBioPortal case lists - case_list_path = os.path.join(tempdir, "case_lists") - if not os.path.exists(case_list_path): - os.mkdir(case_list_path) - create_case_lists.main(clinical_path, assay_path, case_list_path, "genie_private") - - case_list_files = os.listdir(case_list_path) - - for case_filename in case_list_files: - # if case_filename in case_file_synids: - case_path = os.path.join(case_list_path, case_filename) - store_file(syn, case_path, case_list_folder_synid, new_release) tempdir_o.cleanup() # Update dashboard tables From 15ba5763f42b31c99c8ce518f918453f85f12f49 
Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 15:12:32 -0700 Subject: [PATCH 23/39] the release name is no longer included in the filename --- scripts/patch_release/patch.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 6989fa9..9c70c20 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -117,12 +117,10 @@ def patch_release_workflow( variables need to be changed to reflect different Synapse ids per release. """ syn = synapseclient.login() - # Update dashboard tables - # Data base mapping synid + # TODO: Add ability to provide list of centers / seq assay ids to remove # remove_centers = [] # remove_seqassays = [] - # release_synid = "" # Fill in synapse id here old_release = syn.get(release_synid).name new_release = syn.get(new_release_synid).name @@ -265,7 +263,7 @@ def patch_release_workflow( for case_filename in case_list_files: # if case_filename in case_file_synids: case_path = os.path.join(case_list_path, case_filename) - store_file(syn, case_path, case_list_folder_synid, new_release) + store_file(syn, case_path, case_list_folder_synid) # Create cBioPortal gene panel and meta files for name in file_mapping: @@ -276,13 +274,13 @@ def patch_release_workflow( gene_panel_ent = syn.get(file_mapping[name], followLink=True) new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) shutil.copyfile(gene_panel_ent.path, new_panel_path) - store_file(syn, new_panel_path, new_release_synid, new_release) + store_file(syn, new_panel_path, new_release_synid) elif name.startswith("meta") or "_meta_" in name: meta_ent = syn.get(file_mapping[name], followLink=True) new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) shutil.copyfile(meta_ent.path, new_meta_path) revise_meta_file(new_meta_path, old_release, new_release) - store_file(syn, new_meta_path, new_release_synid, new_release) + 
store_file(syn, new_meta_path, new_release_synid) tempdir_o.cleanup() # Update dashboard tables From 9e0fb00f7d72b95f10f13f581af6e41eeb947600 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 15:41:01 -0700 Subject: [PATCH 24/39] Patch --- scripts/patch_release/patch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 9c70c20..e6f0d65 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -60,7 +60,7 @@ def _filter_tsv(filepath: str, keep_values: pd.Series, column: str) -> pd.DataFr # TODO remove new_release parameter soon def store_file( - syn: synapseclient.Synapse, new_path: str, new_release_synid: str, new_release: str = None + syn: synapseclient.Synapse, new_path: str, new_release_synid: str ) -> None: """ Stores a file into Synapse. @@ -93,7 +93,7 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release None """ entity = syn.get(synid, followLink=True) - df = _filter_tsv(path=entity.path, keep_values=keep_values, column=column) + df = _filter_tsv(filepath=entity.path, keep_values=keep_values, column=column) # Specific filtering fro the data gene matrix file because the string NA must # replace the blank values if entity.name == "data_gene_matrix.txt": @@ -205,7 +205,7 @@ def patch_release_workflow( # public release code rely on the merged clinical file. 
full_clin_df = full_clin_df[full_clin_df["SAMPLE_ID"].isin(keep_samples)] full_clin_df.to_csv(clinical_path, sep="\t", index=False) - store_file(syn, clinical_path, new_release_synid, new_release) + store_file(syn, clinical_path, new_release_synid) sample_path = os.path.join(tempdir, os.path.basename(sample_ent.path)) patient_path = os.path.join(tempdir, os.path.basename(patient_ent.path)) From 312a2f8f06892f73d4271b0be3495aaa5dcdc79c Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 16:01:50 -0700 Subject: [PATCH 25/39] Add patch cna file function --- scripts/patch_release/patch.py | 42 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index e6f0d65..13062f2 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -108,6 +108,35 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release # TODO: return a named tuple so its not just returning the path return new_path + +def patch_cna_file(syn: synapseclient.Synapse, cna_synid: str, tempdir: str, new_release_synid: str, keep_samples: pd.Series) -> None: + """ + Patches the CNA file in Synapse by filtering out columns based on the provided keep samples. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + cna_synid (str): The Synapse ID of the CNA file to be patched. + tempdir (str): The temporary directory to store the patched file. + new_release_synid (str): The Synapse ID of the release folder where the patched file will be stored. + keep_samples (pd.Series): The samples to keep in the CNA file. 
+ + Returns: + None + """ + cna_ent = syn.get(cna_synid, followLink=True) + cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") + cna_cols = ["Hugo_Symbol"] + cna_cols.extend(keep_samples.tolist()) + cna_cols_idx = cnadf.columns.isin(cna_cols) + if not cna_cols_idx.all(): + cnadf = cnadf[cnadf.columns[cna_cols_idx]] + cnatext = process_functions.removePandasDfFloat(cnadf) + cna_path = os.path.join(tempdir, os.path.basename(cna_ent.path)) + with open(cna_path, "w") as cna_file: + cna_file.write(cnatext) + store_file(syn, cna_path, new_release_synid) + + def patch_release_workflow( release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False ): @@ -221,18 +250,7 @@ def patch_release_workflow( store_file(syn, sample_path, new_release_synid) store_file(syn, patient_path, new_release_synid) # Patch CNA file - cna_ent = syn.get(cna_synid, followLink=True) - cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") - cna_cols = ["Hugo_Symbol"] - cna_cols.extend(keep_samples.tolist()) - cna_cols_idx = cnadf.columns.isin(cna_cols) - if not cna_cols_idx.all(): - cnadf = cnadf[cnadf.columns[cna_cols_idx]] - cnatext = process_functions.removePandasDfFloat(cnadf) - cna_path = os.path.join(tempdir, os.path.basename(cna_ent.path)) - with open(cna_path, "w") as cna_file: - cna_file.write(cnatext) - store_file(syn, cna_path, new_release_synid) + patch_cna_file(syn, cna_synid, tempdir, new_release_synid, keep_samples) # Patch Fusion file patch_file(syn, fusion_synid, tempdir, new_release_synid, keep_samples, "Sample_Id") From ef6b2ec9ea649629818bae1ae0f6d627f90ccc0a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 19:02:08 -0700 Subject: [PATCH 26/39] Revoke access for data clinical file, and shuffle code around --- scripts/patch_release/patch.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 13062f2..26e2dc3 100644 --- 
a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -74,7 +74,8 @@ def store_file( None """ new_ent = synapseclient.File(new_path, parentId=new_release_synid) - syn.store(new_ent) + new_ent = syn.store(new_ent) + return new_ent def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release_synid: str, keep_values: pd.Series, column: str) -> str: @@ -161,11 +162,6 @@ def patch_release_workflow( file_mapping = { release_file["name"]: release_file["id"] for release_file in release_files } - # case_list_folder_synid = file_mapping['case_lists'] - case_list_folder_synid = syn.store( - synapseclient.Folder("case_lists", parentId=new_release_synid) - ).id - sample_synid = file_mapping["data_clinical_sample.txt"] patient_synid = file_mapping["data_clinical_patient.txt"] cna_synid = file_mapping["data_CNA.txt"] @@ -234,7 +230,10 @@ def patch_release_workflow( # public release code rely on the merged clinical file. full_clin_df = full_clin_df[full_clin_df["SAMPLE_ID"].isin(keep_samples)] full_clin_df.to_csv(clinical_path, sep="\t", index=False) - store_file(syn, clinical_path, new_release_synid) + full_clinical_entity = store_file(syn, clinical_path, new_release_synid) + # Revoke access to general GENIE consortium on data_clinical.txt file + # Because it has more data than the consortium should see. 
+ syn.setPermissions(full_clinical_entity, principalId=3326313, accessType=[]) sample_path = os.path.join(tempdir, os.path.basename(sample_ent.path)) patient_path = os.path.join(tempdir, os.path.basename(patient_ent.path)) @@ -277,7 +276,9 @@ def patch_release_workflow( create_case_lists.main(clinical_path, assay_path, case_list_path, "genie_private") case_list_files = os.listdir(case_list_path) - + case_list_folder_synid = syn.store( + synapseclient.Folder("case_lists", parentId=new_release_synid) + ).id for case_filename in case_list_files: # if case_filename in case_file_synids: case_path = os.path.join(case_list_path, case_filename) From 27de200b3bc0b8d029d9c8732180275f9f7d3c0a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 19:31:17 -0700 Subject: [PATCH 27/39] Patch releases --- scripts/patch_release/patch.py | 144 ++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 49 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 26e2dc3..7b8be5e 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -89,9 +89,6 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release new_release_synid (str): The Synapse ID of the release folder where the patched file will be stored. keep_values (pd.Series): The values to keep in the dataframe. column (str): The column name to filter on. - - Returns: - None """ entity = syn.get(synid, followLink=True) df = _filter_tsv(filepath=entity.path, keep_values=keep_values, column=column) @@ -120,9 +117,6 @@ def patch_cna_file(syn: synapseclient.Synapse, cna_synid: str, tempdir: str, new tempdir (str): The temporary directory to store the patched file. new_release_synid (str): The Synapse ID of the release folder where the patched file will be stored. keep_samples (pd.Series): The samples to keep in the CNA file. 
- - Returns: - None """ cna_ent = syn.get(cna_synid, followLink=True) cnadf = pd.read_csv(cna_ent.path, sep="\t", comment="#") @@ -138,16 +132,81 @@ def patch_cna_file(syn: synapseclient.Synapse, cna_synid: str, tempdir: str, new store_file(syn, cna_path, new_release_synid) +def patch_case_list_files(syn: synapseclient.Synapse, new_release_synid: str, tempdir: str, clinical_path: str, assay_path: str) -> None: + """ + Creates a folder for case lists in Synapse and populates it with case list files. + The reason why case list files cannot be copied because samples and patients are retracted + so `create_case_lists.main` must be called to regenerated case lists from the new + sample list. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + new_release_synid (str): The Synapse ID of the release folder where the case lists will be stored. + tempdir (str): The temporary directory to store the case list files. + clinical_path (str): The path to the clinical data. + assay_path (str): The path to the assay data. + """ + case_list_path = os.path.join(tempdir, "case_lists") + if not os.path.exists(case_list_path): + os.mkdir(case_list_path) + create_case_lists.main(clinical_path, assay_path, case_list_path, "genie_private") + + case_list_files = os.listdir(case_list_path) + case_list_folder_synid = syn.store( + synapseclient.Folder("case_lists", parentId=new_release_synid) + ).id + for case_filename in case_list_files: + case_path = os.path.join(case_list_path, case_filename) + store_file(syn, case_path, case_list_folder_synid) + + +def patch_gene_panel_and_meta_files(syn: synapseclient.Synapse, file_mapping: dict, tempdir: str, new_release_synid: str, keep_seq_assay_id: pd.Series, old_release: str, new_release: str) -> None: + """ + Creates cBioPortal gene panel and meta files. + + Args: + syn (synapseclient.Synapse): The Synapse client object. + file_mapping (dict): A dictionary mapping file names to their Synapse IDs. 
+ tempdir (str): The temporary directory to store the files. + new_release_synid (str): The Synapse ID of the new release folder. + keep_seq_assay_id (pd.Series): The series of SEQ_ASSAY_IDs to keep. + old_release (str): The version name of the orignal consortium release linking to the public release. + new_release (str): The version name of the new consortium release linking to the patch release. + """ + for name in file_mapping: + if name.startswith("data_gene_panel"): + seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") + if seq_name not in keep_seq_assay_id: + continue + gene_panel_ent = syn.get(file_mapping[name], followLink=True) + new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) + shutil.copyfile(gene_panel_ent.path, new_panel_path) + store_file(syn, new_panel_path, new_release_synid) + elif name.startswith("meta") or "_meta_" in name: + meta_ent = syn.get(file_mapping[name], followLink=True) + new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) + shutil.copyfile(meta_ent.path, new_meta_path) + revise_meta_file(new_meta_path, old_release, new_release) + store_file(syn, new_meta_path, new_release_synid) + + def patch_release_workflow( release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False ): """ - These need to be modified per retraction. - The release_synid, new_release_synid, and retracted_sample_synid - variables need to be changed to reflect different Synapse ids per release. + Patches a release by removing retracted samples from the clinical, sample, and patient files. + Also patches CNA, fusion, SEG, gene matrix, MAF, genomic information, and assay information files. + Creates cBioPortal case lists and gene panel and meta files. + Updates the dashboard tables. + + Args: + release_synid (str): The Synapse ID of the release to be patched. + new_release_synid (str): The Synapse ID of the new release. 
+ retracted_sample_synid (str): The Synapse ID of the file containing the retracted samples. + production (bool, optional): Whether the patch release is for production. Defaults to False. """ - syn = synapseclient.login() + syn = synapseclient.login() # TODO: Add ability to provide list of centers / seq assay ids to remove # remove_centers = [] # remove_seqassays = [] @@ -246,61 +305,48 @@ def patch_release_workflow( sample_path, patient_path, ) - store_file(syn, sample_path, new_release_synid) - store_file(syn, patient_path, new_release_synid) + store_file(syn=syn, new_path=sample_path, new_release_synid=new_release_synid) + store_file(syn=syn, new_path=patient_path, new_release_synid=new_release_synid) # Patch CNA file - patch_cna_file(syn, cna_synid, tempdir, new_release_synid, keep_samples) + patch_cna_file(syn=syn, cna_synid=cna_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_samples=keep_samples) # Patch Fusion file - patch_file(syn, fusion_synid, tempdir, new_release_synid, keep_samples, "Sample_Id") + patch_file(syn=syn, synid=fusion_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="Sample_Id") # Patch SEG file - patch_file(syn, seg_synid, tempdir, new_release_synid, keep_samples, "ID") + patch_file(syn=syn, synid=seg_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="ID") # Patch gene matrix file - patch_file(syn, gene_synid, tempdir, new_release_synid, keep_samples, "SAMPLE_ID") + patch_file(syn=syn, synid=gene_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="SAMPLE_ID") # Patch maf file - patch_file(syn, maf_synid, tempdir, new_release_synid, keep_samples, "Tumor_Sample_Barcode") + patch_file(syn=syn, synid=maf_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="Tumor_Sample_Barcode") # Patch genomic information file - patch_file(syn, genomic_info_synid, 
tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") + patch_file(syn=syn, synid=genomic_info_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_seq_assay_id, column="SEQ_ASSAY_ID") # Patch assay information file - assay_path = patch_file(syn, assay_info_synid, tempdir, new_release_synid, keep_seq_assay_id, "SEQ_ASSAY_ID") + assay_path = patch_file(syn=syn, synid=assay_info_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_seq_assay_id, column="SEQ_ASSAY_ID") # Create cBioPortal case lists - case_list_path = os.path.join(tempdir, "case_lists") - if not os.path.exists(case_list_path): - os.mkdir(case_list_path) - create_case_lists.main(clinical_path, assay_path, case_list_path, "genie_private") - - case_list_files = os.listdir(case_list_path) - case_list_folder_synid = syn.store( - synapseclient.Folder("case_lists", parentId=new_release_synid) - ).id - for case_filename in case_list_files: - # if case_filename in case_file_synids: - case_path = os.path.join(case_list_path, case_filename) - store_file(syn, case_path, case_list_folder_synid) - + patch_case_list_files(syn=syn, new_release_synid=new_release_synid, tempdir=tempdir, clinical_path=clinical_path, assay_path=assay_path) # Create cBioPortal gene panel and meta files - for name in file_mapping: - if name.startswith("data_gene_panel"): - seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") - if seq_name not in keep_seq_assay_id: - continue - gene_panel_ent = syn.get(file_mapping[name], followLink=True) - new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) - shutil.copyfile(gene_panel_ent.path, new_panel_path) - store_file(syn, new_panel_path, new_release_synid) - elif name.startswith("meta") or "_meta_" in name: - meta_ent = syn.get(file_mapping[name], followLink=True) - new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) - shutil.copyfile(meta_ent.path, new_meta_path) - 
revise_meta_file(new_meta_path, old_release, new_release) - store_file(syn, new_meta_path, new_release_synid) - + # for name in file_mapping: + # if name.startswith("data_gene_panel"): + # seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") + # if seq_name not in keep_seq_assay_id: + # continue + # gene_panel_ent = syn.get(file_mapping[name], followLink=True) + # new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) + # shutil.copyfile(gene_panel_ent.path, new_panel_path) + # store_file(syn, new_panel_path, new_release_synid) + # elif name.startswith("meta") or "_meta_" in name: + # meta_ent = syn.get(file_mapping[name], followLink=True) + # new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) + # shutil.copyfile(meta_ent.path, new_meta_path) + # revise_meta_file(new_meta_path, old_release, new_release) + # store_file(syn, new_meta_path, new_release_synid) + patch_gene_panel_and_meta_files(syn=syn, file_mapping=file_mapping, tempdir=tempdir, new_release_synid=new_release_synid, keep_seq_assay_id=keep_seq_assay_id, old_release=old_release, new_release=new_release) tempdir_o.cleanup() # Update dashboard tables # Data base mapping synid From 4bc8e2ec318d9d9a2b9d293da0d9a764018a6e2c Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 18 Aug 2024 21:09:47 -0700 Subject: [PATCH 28/39] Remove dead code and add TODOs --- scripts/patch_release/patch.py | 39 +++++----------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 7b8be5e..bae96db 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -247,27 +247,10 @@ def patch_release_workflow( sampledf = pd.read_csv(sample_ent.path, sep="\t", comment="#") centers = [patient.split("-")[1] for patient in sampledf.PATIENT_ID] sampledf["CENTER"] = centers - # Retract samples from SEQ_ASSAY_ID, CENTER and retract samples list - # 
to_remove_seqassay_rows = sampledf["SEQ_ASSAY_ID"].isin(remove_seqassays) - # sampledf = sampledf[~to_remove_seqassay_rows] - # to_remove_center_rows = sampledf["CENTER"].isin(remove_centers) - # sampledf = sampledf[~to_remove_center_rows] + # Retract samples from retract samples list + # TODO: Add code here to support redaction for entire center or seq assays to_remove_samples = sampledf["SAMPLE_ID"].isin(retracted_samplesdf.SAMPLE_ID) final_sampledf = sampledf[~to_remove_samples] - # Check number of seq assay ids is the same after removal of samples - # Must add to removal of seq assay list for gene panel removal - # seq_assay_after = final_sampledf["SEQ_ASSAY_ID"].unique() - # seq_assay_before = sampledf["SEQ_ASSAY_ID"].unique() - # if len(seq_assay_after) != len(seq_assay_before): - # remove_seqassays.extend( - # seq_assay_before[~seq_assay_before.isin(seq_assay_after)].tolist() - # ) - # Check number of centers is the same after removal of samples - # Must add to removal of seq assay list for gene panel removal - # center_after = final_sampledf["CENTER"].unique() - # center_before = sampledf["CENTER"].unique() - # if len(center_after) != len(center_before): - # remove_centers.extend(center_before[~center_before.isin(center_after)].tolist()) del final_sampledf["CENTER"] @@ -307,6 +290,7 @@ def patch_release_workflow( ) store_file(syn=syn, new_path=sample_path, new_release_synid=new_release_synid) store_file(syn=syn, new_path=patient_path, new_release_synid=new_release_synid) + # Patch CNA file patch_cna_file(syn=syn, cna_synid=cna_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_samples=keep_samples) @@ -330,23 +314,10 @@ def patch_release_workflow( # Create cBioPortal case lists patch_case_list_files(syn=syn, new_release_synid=new_release_synid, tempdir=tempdir, clinical_path=clinical_path, assay_path=assay_path) + # Create cBioPortal gene panel and meta files - # for name in file_mapping: - # if name.startswith("data_gene_panel"): - # 
seq_name = name.replace("data_gene_panel_", "").replace(".txt", "") - # if seq_name not in keep_seq_assay_id: - # continue - # gene_panel_ent = syn.get(file_mapping[name], followLink=True) - # new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) - # shutil.copyfile(gene_panel_ent.path, new_panel_path) - # store_file(syn, new_panel_path, new_release_synid) - # elif name.startswith("meta") or "_meta_" in name: - # meta_ent = syn.get(file_mapping[name], followLink=True) - # new_meta_path = os.path.join(tempdir, os.path.basename(meta_ent.path)) - # shutil.copyfile(meta_ent.path, new_meta_path) - # revise_meta_file(new_meta_path, old_release, new_release) - # store_file(syn, new_meta_path, new_release_synid) patch_gene_panel_and_meta_files(syn=syn, file_mapping=file_mapping, tempdir=tempdir, new_release_synid=new_release_synid, keep_seq_assay_id=keep_seq_assay_id, old_release=old_release, new_release=new_release) + tempdir_o.cleanup() # Update dashboard tables # Data base mapping synid From 965e609dcf466eee961357431938438fcc226299 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 19 Aug 2024 22:33:54 -0700 Subject: [PATCH 29/39] Update scripts/patch_release/patch.py Co-authored-by: BryanFauble <17128019+BryanFauble@users.noreply.github.com> --- scripts/patch_release/patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index bae96db..1b1d49c 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -92,7 +92,7 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release """ entity = syn.get(synid, followLink=True) df = _filter_tsv(filepath=entity.path, keep_values=keep_values, column=column) - # Specific filtering fro the data gene matrix file because the string NA must + # Specific filtering for the data gene matrix file because the string NA must # replace the blank values if entity.name == 
"data_gene_matrix.txt": df[df.isnull()] = "NA" From 532269a4cc42bfa2cb1aa894300fbaae15d662b4 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 19 Aug 2024 22:34:42 -0700 Subject: [PATCH 30/39] Add returns --- scripts/patch_release/patch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index bae96db..b35037d 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -89,6 +89,9 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release new_release_synid (str): The Synapse ID of the release folder where the patched file will be stored. keep_values (pd.Series): The values to keep in the dataframe. column (str): The column name to filter on. + + Returns: + str: The file path to the patched file """ entity = syn.get(synid, followLink=True) df = _filter_tsv(filepath=entity.path, keep_values=keep_values, column=column) @@ -103,7 +106,7 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release with open(new_path, "w") as o_file: o_file.write(dftext) store_file(syn, new_path, new_release_synid) - # TODO: return a named tuple so its not just returning the path + # TODO: return a named tuple if needed (YAGNI for now) return new_path From ff8b4bd778b1ff0649765d6eec79fcaffbb356de Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 19 Aug 2024 22:36:11 -0700 Subject: [PATCH 31/39] lint --- scripts/patch_release/compare_patch.py | 4 +- scripts/patch_release/patch.py | 143 ++++++++++++++++++++----- 2 files changed, 121 insertions(+), 26 deletions(-) diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index 104a096..18a7076 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -11,6 +11,7 @@ import synapseclient import synapseutils as synu + def _get_file_dict(syn: synapseclient.Synapse, synid: str) -> dict[str, str]: """ This 
function generates a dictionary of files from a Synapse ID. @@ -54,7 +55,7 @@ def compare_releases(original_synid: str, new_synid: str): # new_ent = syn.get(new_synid) # new_files = synu.walk(new_synid) new_file_list = _get_file_dict(syn, new_synid) - + # Check that the two folders have the same number of files print("Number of files in old folder: ", len(original_file_list)) print("Number of files in new folder: ", len(new_file_list)) @@ -68,7 +69,6 @@ def compare_releases(original_synid: str, new_synid: str): else: if original_file_list[filename].md5 != new_file_list[filename].md5: print("Files are different: ", filename) - if __name__ == "__main__": diff --git a/scripts/patch_release/patch.py b/scripts/patch_release/patch.py index 6cc5fdb..bef9b71 100644 --- a/scripts/patch_release/patch.py +++ b/scripts/patch_release/patch.py @@ -15,11 +15,8 @@ import pandas as pd import synapseclient -from genie import ( - create_case_lists, - dashboard_table_updater, - process_functions -) +from genie import create_case_lists, dashboard_table_updater, process_functions + # Run time functions def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> None: @@ -40,7 +37,8 @@ def revise_meta_file(meta_file_path: str, old_version: str, new_version: str) -> meta_text = meta_text.replace(old_version, new_version) meta.write(meta_text) -def _filter_tsv(filepath: str, keep_values: pd.Series, column: str) -> pd.DataFrame: + +def _filter_tsv(filepath: str, keep_values: pd.Series, column: str) -> pd.DataFrame: """ Patches a tsv in Synapse by filtering out rows based on the provided keep values. 
@@ -58,6 +56,7 @@ def _filter_tsv(filepath: str, keep_values: pd.Series, column: str) -> pd.DataFr df = df[df[column].isin(keep_values)] return df + # TODO remove new_release parameter soon def store_file( syn: synapseclient.Synapse, new_path: str, new_release_synid: str @@ -78,7 +77,14 @@ def store_file( return new_ent -def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release_synid: str, keep_values: pd.Series, column: str) -> str: +def patch_file( + syn: synapseclient.Synapse, + synid: str, + tempdir: str, + new_release_synid: str, + keep_values: pd.Series, + column: str, +) -> str: """ Patches a file in Synapse by filtering out rows based on the provided keep values. @@ -110,7 +116,13 @@ def patch_file(syn: synapseclient.Synapse, synid: str, tempdir: str, new_release return new_path -def patch_cna_file(syn: synapseclient.Synapse, cna_synid: str, tempdir: str, new_release_synid: str, keep_samples: pd.Series) -> None: +def patch_cna_file( + syn: synapseclient.Synapse, + cna_synid: str, + tempdir: str, + new_release_synid: str, + keep_samples: pd.Series, +) -> None: """ Patches the CNA file in Synapse by filtering out columns based on the provided keep samples. @@ -135,7 +147,13 @@ def patch_cna_file(syn: synapseclient.Synapse, cna_synid: str, tempdir: str, new store_file(syn, cna_path, new_release_synid) -def patch_case_list_files(syn: synapseclient.Synapse, new_release_synid: str, tempdir: str, clinical_path: str, assay_path: str) -> None: +def patch_case_list_files( + syn: synapseclient.Synapse, + new_release_synid: str, + tempdir: str, + clinical_path: str, + assay_path: str, +) -> None: """ Creates a folder for case lists in Synapse and populates it with case list files. 
The reason why case list files cannot be copied because samples and patients are retracted @@ -163,7 +181,15 @@ def patch_case_list_files(syn: synapseclient.Synapse, new_release_synid: str, te store_file(syn, case_path, case_list_folder_synid) -def patch_gene_panel_and_meta_files(syn: synapseclient.Synapse, file_mapping: dict, tempdir: str, new_release_synid: str, keep_seq_assay_id: pd.Series, old_release: str, new_release: str) -> None: +def patch_gene_panel_and_meta_files( + syn: synapseclient.Synapse, + file_mapping: dict, + tempdir: str, + new_release_synid: str, + keep_seq_assay_id: pd.Series, + old_release: str, + new_release: str, +) -> None: """ Creates cBioPortal gene panel and meta files. @@ -182,7 +208,9 @@ def patch_gene_panel_and_meta_files(syn: synapseclient.Synapse, file_mapping: di if seq_name not in keep_seq_assay_id: continue gene_panel_ent = syn.get(file_mapping[name], followLink=True) - new_panel_path = os.path.join(tempdir, os.path.basename(gene_panel_ent.path)) + new_panel_path = os.path.join( + tempdir, os.path.basename(gene_panel_ent.path) + ) shutil.copyfile(gene_panel_ent.path, new_panel_path) store_file(syn, new_panel_path, new_release_synid) elif name.startswith("meta") or "_meta_" in name: @@ -194,7 +222,10 @@ def patch_gene_panel_and_meta_files(syn: synapseclient.Synapse, file_mapping: di def patch_release_workflow( - release_synid: str, new_release_synid: str, retracted_sample_synid: str, production: bool = False + release_synid: str, + new_release_synid: str, + retracted_sample_synid: str, + production: bool = False, ): """ Patches a release by removing retracted samples from the clinical, sample, and patient files. 
@@ -295,31 +326,93 @@ def patch_release_workflow( store_file(syn=syn, new_path=patient_path, new_release_synid=new_release_synid) # Patch CNA file - patch_cna_file(syn=syn, cna_synid=cna_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_samples=keep_samples) + patch_cna_file( + syn=syn, + cna_synid=cna_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_samples=keep_samples, + ) # Patch Fusion file - patch_file(syn=syn, synid=fusion_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="Sample_Id") + patch_file( + syn=syn, + synid=fusion_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_values=keep_samples, + column="Sample_Id", + ) # Patch SEG file - patch_file(syn=syn, synid=seg_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="ID") + patch_file( + syn=syn, + synid=seg_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_values=keep_samples, + column="ID", + ) # Patch gene matrix file - patch_file(syn=syn, synid=gene_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="SAMPLE_ID") + patch_file( + syn=syn, + synid=gene_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_values=keep_samples, + column="SAMPLE_ID", + ) # Patch maf file - patch_file(syn=syn, synid=maf_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_samples, column="Tumor_Sample_Barcode") + patch_file( + syn=syn, + synid=maf_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_values=keep_samples, + column="Tumor_Sample_Barcode", + ) # Patch genomic information file - patch_file(syn=syn, synid=genomic_info_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_seq_assay_id, column="SEQ_ASSAY_ID") + patch_file( + syn=syn, + synid=genomic_info_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + 
keep_values=keep_seq_assay_id, + column="SEQ_ASSAY_ID", + ) # Patch assay information file - assay_path = patch_file(syn=syn, synid=assay_info_synid, tempdir=tempdir, new_release_synid=new_release_synid, keep_values=keep_seq_assay_id, column="SEQ_ASSAY_ID") + assay_path = patch_file( + syn=syn, + synid=assay_info_synid, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_values=keep_seq_assay_id, + column="SEQ_ASSAY_ID", + ) # Create cBioPortal case lists - patch_case_list_files(syn=syn, new_release_synid=new_release_synid, tempdir=tempdir, clinical_path=clinical_path, assay_path=assay_path) + patch_case_list_files( + syn=syn, + new_release_synid=new_release_synid, + tempdir=tempdir, + clinical_path=clinical_path, + assay_path=assay_path, + ) # Create cBioPortal gene panel and meta files - patch_gene_panel_and_meta_files(syn=syn, file_mapping=file_mapping, tempdir=tempdir, new_release_synid=new_release_synid, keep_seq_assay_id=keep_seq_assay_id, old_release=old_release, new_release=new_release) + patch_gene_panel_and_meta_files( + syn=syn, + file_mapping=file_mapping, + tempdir=tempdir, + new_release_synid=new_release_synid, + keep_seq_assay_id=keep_seq_assay_id, + old_release=old_release, + new_release=new_release, + ) tempdir_o.cleanup() # Update dashboard tables @@ -331,7 +424,9 @@ def patch_release_workflow( database_mapping = syn.tableQuery(f"select * from {database_mapping_synid}") database_mappingdf = database_mapping.asDataFrame() # You may have to execute this twice in case the file view isn't updated - dashboard_table_updater.run_dashboard(syn, database_mappingdf, new_release, staging=not production) + dashboard_table_updater.run_dashboard( + syn, database_mappingdf, new_release, staging=not production + ) def main(): @@ -356,7 +451,7 @@ def main(): parser.add_argument( "--production", action="store_true", - help="Run production workload or it will default to the staging workload" + help="Run production workload or it will default to the 
staging workload", ) args = parser.parse_args() @@ -364,7 +459,7 @@ def main(): release_synid=args.release_synid, new_release_synid=args.new_release_synid, retracted_sample_synid=args.retracted_sample_synid, - production=args.production + production=args.production, ) From 4915f3847dab936b2447153c2bddb6e520386911 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 26 Aug 2024 23:38:00 -0700 Subject: [PATCH 32/39] Add schema --- patch_release_nf_schema.json | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 patch_release_nf_schema.json diff --git a/patch_release_nf_schema.json b/patch_release_nf_schema.json new file mode 100644 index 0000000..78216c1 --- /dev/null +++ b/patch_release_nf_schema.json @@ -0,0 +1,34 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/Sage-Bionetworks-Workflows/nf-genie/master/nextflow_schema.json", + "title": "Sage-Bionetworks-Workflows/nf-genie pipeline parameters", + "description": "Nextflow pipeline for main GENIE processing", + "type": "object", + "definitions": { + "patch_release": { + "title": "patch_release", + "type": "object", + "description": "Patch release configurations", + "default": "", + "properties": { + "release_synid": { + "type": "string", + "description": "Existing consortium release synapse folder id" + }, + "new_release_synid": { + "type": "string", + "description": "New consortium release synapse folder id" + }, + "retracted_sample_synid": { + "type": "string", + "description": "samples_to_retract.csv of 3rd consortium release" + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/patch_release" + } + ] +} From b7b3f4e9652890328194027b0c1e63ad3f3729b0 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 26 Aug 2024 23:40:15 -0700 Subject: [PATCH 33/39] Add release --- patch_release_nf_schema.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/patch_release_nf_schema.json b/patch_release_nf_schema.json index 
78216c1..9a7eb49 100644 --- a/patch_release_nf_schema.json +++ b/patch_release_nf_schema.json @@ -22,6 +22,12 @@ "retracted_sample_synid": { "type": "string", "description": "samples_to_retract.csv of 3rd consortium release" + }, + "release": { + "type": "string", + "description": "Release name. E.g: 13.1-consortium", + "default": "TEST.consortium", + "pattern": "\\d+[.]\\d+-(consortium)$" } } } From d9cffb1b1df57f763fdce83416bce43224f350a9 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 26 Aug 2024 23:41:12 -0700 Subject: [PATCH 34/39] Rename --- ...h_release_nf_schema.json => nextflow_schema_patch_release.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename patch_release_nf_schema.json => nextflow_schema_patch_release.json (100%) diff --git a/patch_release_nf_schema.json b/nextflow_schema_patch_release.json similarity index 100% rename from patch_release_nf_schema.json rename to nextflow_schema_patch_release.json From 4e385d905547aa8bcf792320b804fe079a0f34dd Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 26 Aug 2024 23:47:26 -0700 Subject: [PATCH 35/39] Add argparse --- scripts/patch_release/compare_patch.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index 18a7076..a8efde6 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -2,12 +2,12 @@ The command ran: python patch.py syn53170398 syn62069187 syn54082015 In leu of lack of unit or integration tests, the above command replicates the -this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging) +this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging syn62069187) -python compare_patch.py - -TODO: Add argparse +python compare_patch.py --original_synid syn55146141 --new_synid syn62069187 """ +import argparse + import synapseclient import synapseutils as synu @@ -70,8 +70,14 @@ def 
compare_releases(original_synid: str, new_synid: str): if original_file_list[filename].md5 != new_file_list[filename].md5: print("Files are different: ", filename) +def main(): + parser = argparse.ArgumentParser(description='Compare two Synapse releases.') + parser.add_argument('--original_synid', type=str, help='The Synapse ID of the original release') + parser.add_argument('--new_synid', type=str, help='The Synapse ID of the new release') + + args = parser.parse_args() + + compare_releases(args.original_synid, args.new_synid) if __name__ == "__main__": - original_synid = "syn55146141" - new_synid = "syn62069187" - compare_releases(original_synid, new_synid) + main() From efe387ba1c51b7a441a90fedc9e8092849c39d3a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 26 Aug 2024 23:50:54 -0700 Subject: [PATCH 36/39] Add compare release module --- modules/compare_releases.nf | 20 ++++++++++++++++++++ patch_release_main.nf | 2 ++ 2 files changed, 22 insertions(+) create mode 100644 modules/compare_releases.nf diff --git a/modules/compare_releases.nf b/modules/compare_releases.nf new file mode 100644 index 0000000..2f46429 --- /dev/null +++ b/modules/compare_releases.nf @@ -0,0 +1,20 @@ +// Compares two GENIE releases given two synapse ids +process compare_releases { + container "$params.patch_release_docker" + secret 'SYNAPSE_AUTH_TOKEN' + + input: + val previous + val release_synid + val new_release_synid + + output: + stdout + + script: + """ + python3 /patch_release/compare_patch.py \ + --original_synid $release_synid \ + --new_synid $new_release_synid + """ +} diff --git a/patch_release_main.nf b/patch_release_main.nf index 1c21e6b..4cc9cfc 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -6,6 +6,7 @@ nextflow.enable.dsl = 2 include { patch_release } from './modules/patch_release' include { create_data_guide } from './modules/create_data_guide' include { create_dashboard_html } from './modules/create_dashboard_html' +include { compare_releases } 
from './modules/compare_releases' params.release_synid = "syn53170398" // 15.4-consortium params.new_release_synid = "syn62069187" // 15.6-consortium (in staging) @@ -30,4 +31,5 @@ workflow { patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid, is_production) create_dashboard_html(patch_release.out, ch_release, is_production) create_data_guide(patch_release.out, ch_release, ch_project_id) + compare_releases(create_data_guide.out, ch_release_synid, ch_new_release_synid) } From d86aa42d667574436ef2f2edaa60a8d3b97b297d Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 27 Aug 2024 01:09:26 -0700 Subject: [PATCH 37/39] Update schema and workflow --- nextflow_schema_patch_release.json | 4 ++++ patch_release_main.nf | 8 +++++++- scripts/patch_release/compare_patch.py | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nextflow_schema_patch_release.json b/nextflow_schema_patch_release.json index 9a7eb49..e1696de 100644 --- a/nextflow_schema_patch_release.json +++ b/nextflow_schema_patch_release.json @@ -28,6 +28,10 @@ "description": "Release name. E.g: 13.1-consortium", "default": "TEST.consortium", "pattern": "\\d+[.]\\d+-(consortium)$" + }, + "project_id": { + "type": "string", + "description": "Synapse GENIE internal projects." } } } diff --git a/patch_release_main.nf b/patch_release_main.nf index 4cc9cfc..10b3fc1 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -31,5 +31,11 @@ workflow { patch_release(ch_release_synid, ch_new_release_synid, ch_retracted_sample_synid, is_production) create_dashboard_html(patch_release.out, ch_release, is_production) create_data_guide(patch_release.out, ch_release, ch_project_id) - compare_releases(create_data_guide.out, ch_release_synid, ch_new_release_synid) + // This syn55146141 is hard coded because the ch_release used will certainly + // definitely be different from ch_new_release_synid because that is the patch. 
+ // TODO: we will want to implement a different comparison report to look at diffs + // This current comparison looks at similarities and is good for staging pipeline. + if (not is_production) { + compare_releases(create_data_guide.out, "syn55146141", ch_new_release_synid) + } } diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index a8efde6..ddcc554 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -3,6 +3,7 @@ python patch.py syn53170398 syn62069187 syn54082015 In leu of lack of unit or integration tests, the above command replicates the this is to test 15.5-consortium (syn55146141) and 15.6-consortium (Staging syn62069187) +that they are the same. python compare_patch.py --original_synid syn55146141 --new_synid syn62069187 """ From 5ee64be8341595ed431c0822b12d44c97f08d56c Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 27 Aug 2024 01:17:21 -0700 Subject: [PATCH 38/39] Use --- patch_release_main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patch_release_main.nf b/patch_release_main.nf index 10b3fc1..b4c0874 100644 --- a/patch_release_main.nf +++ b/patch_release_main.nf @@ -35,7 +35,7 @@ workflow { // definitely be different from ch_new_release_synid because that is the patch. // TODO: we will want to implement a different comparison report to look at diffs // This current comparison looks at similarities and is good for staging pipeline. 
- if (not is_production) { + if (!is_production) { compare_releases(create_data_guide.out, "syn55146141", ch_new_release_synid) } } From 264bf137178a0f517eae5b4184c31400217f5413 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Mon, 9 Sep 2024 23:19:12 -0700 Subject: [PATCH 39/39] Update scripts/patch_release/compare_patch.py --- scripts/patch_release/compare_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/patch_release/compare_patch.py b/scripts/patch_release/compare_patch.py index ddcc554..3a9cf28 100644 --- a/scripts/patch_release/compare_patch.py +++ b/scripts/patch_release/compare_patch.py @@ -13,7 +13,7 @@ import synapseutils as synu -def _get_file_dict(syn: synapseclient.Synapse, synid: str) -> dict[str, str]: +def _get_file_dict(syn: synapseclient.Synapse, synid: str): """ This function generates a dictionary of files from a Synapse ID.