From 43a93f1e936dd82ec9ee81195e465cf1429ec92f Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 6 Jan 2023 15:58:47 -0500 Subject: [PATCH 01/11] add toggle for unstable af filter during label step in batch effect --- wdl/Module07XfBatchEffect.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index b2148955e..48de966aa 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -24,6 +24,7 @@ workflow XfBatchEffect { Int? onevsall_cutoff=2 String prefix File af_pcrmins_premingq + Boolean? enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_merge_labeled_vcfs @@ -176,6 +177,7 @@ workflow XfBatchEffect { contig=contig[0], reclassification_table=MakeReclassificationTable.reclassification_table, mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, + enable_unstable_af_filter = select_first([enable_unstable_af_filter, true]), prefix="~{prefix}.~{contig[0]}", sv_pipeline_docker=sv_pipeline_docker } @@ -647,6 +649,7 @@ task ApplyBatchEffectLabels { File reclassification_table File mingq_prePost_pcrminus_fails String prefix + Boolean enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -663,7 +666,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + ~{if (enable_unstable_af_filter) then "--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails else ""} \ stdin \ ~{reclassification_table} \ stdout \ From 38e11d61a5bf83a1a1e928abe5dad4d406884f42 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 23 Jan 2023 11:53:17 -0500 Subject: [PATCH 02/11] skip AF comparison and filtering if no input provided --- wdl/Module07XfBatchEffect.wdl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 48de966aa..0f816d387 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -24,7 +24,6 @@ workflow XfBatchEffect { Int? onevsall_cutoff=2 String prefix File af_pcrmins_premingq - Boolean? enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? 
runtime_attr_merge_labeled_vcfs @@ -87,12 +86,14 @@ workflow XfBatchEffect { # Compare frequencies before and after minGQ, and generate list of variants # that are significantly different between the steps - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=af_pcrmins_premingq, - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker + if (defined(af_pcrmins_premingq)) { + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=select_first([af_pcrmins_premingq]), + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } } # Generate matrix of correlation coefficients for all batches, by population & SVTYPE @@ -177,7 +178,6 @@ workflow XfBatchEffect { contig=contig[0], reclassification_table=MakeReclassificationTable.reclassification_table, mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, - enable_unstable_af_filter = select_first([enable_unstable_af_filter, true]), prefix="~{prefix}.~{contig[0]}", sv_pipeline_docker=sv_pipeline_docker } @@ -647,9 +647,8 @@ task ApplyBatchEffectLabels { File vcf_idx String contig File reclassification_table - File mingq_prePost_pcrminus_fails + File? mingq_prePost_pcrminus_fails String prefix - Boolean enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -666,7 +665,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - ~{if (enable_unstable_af_filter) then "--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails else ""} \ + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ stdin \ ~{reclassification_table} \ stdout \ From 682ca8f041264f91e201ad61cfb100c654287e79 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 23 Jan 2023 11:57:26 -0500 Subject: [PATCH 03/11] make af input actually optional --- wdl/Module07XfBatchEffect.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 0f816d387..5a248c63e 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -23,7 +23,7 @@ workflow XfBatchEffect { Int? pairwise_cutoff=2 Int? onevsall_cutoff=2 String prefix - File af_pcrmins_premingq + File? af_pcrmins_premingq String sv_pipeline_docker RuntimeAttr? 
runtime_attr_merge_labeled_vcfs From 37662c8b7c46b9800b52bc4284e0d098ead5719f Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Thu, 9 Nov 2023 15:04:43 -0500 Subject: [PATCH 04/11] Removed pairwise batch effect checks from WDL workflow --- wdl/Module07XfBatchEffect_no_pairwise.wdl | 612 ++++++++++++++++++++++ 1 file changed, 612 insertions(+) create mode 100644 wdl/Module07XfBatchEffect_no_pairwise.wdl diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl new file mode 100644 index 000000000..f7958ba92 --- /dev/null +++ b/wdl/Module07XfBatchEffect_no_pairwise.wdl @@ -0,0 +1,612 @@ +########################## +## EXPERIMENTAL WORKFLOW +########################## + +version 1.0 + +import "prune_add_af.wdl" as calcAF +import "batch_effect_helper.wdl" as helper +import "TasksMakeCohortVcf.wdl" as MiniTasks + +workflow XfBatchEffect { + input{ + File vcf + File vcf_idx + File sample_batch_assignments + File batches_list + File sample_pop_assignments + File excludesamples_list #empty file if need be + File famfile + File contiglist + File? par_bed + Int variants_per_shard + Int? onevsall_cutoff=2 + String prefix + File af_pcrmins_premingq + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_merge_labeled_vcfs + } + Array[String] batches = read_lines(batches_list) + Array[Array[String]] contigs = read_tsv(contiglist) + + # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats + scatter ( batch in batches ) { + # Get list of samples to include & exclude per batch + call GetBatchSamplesList { + input: + vcf=vcf, + vcf_idx=vcf_idx, + batch=batch, + sample_batch_assignments=sample_batch_assignments, + probands_list=excludesamples_list, + sv_pipeline_docker=sv_pipeline_docker + } + # Prune VCF to samples + call calcAF.prune_and_add_vafs as getAFs { + input: + vcf=vcf, + vcf_idx=vcf_idx, + prefix=batch, + sample_pop_assignments=sample_pop_assignments, + prune_list=GetBatchSamplesList.exclude_samples_list, + famfile=famfile, + sv_per_shard=25000, + contiglist=contiglist, + drop_empty_records="FALSE", + par_bed=par_bed, + sv_pipeline_docker=sv_pipeline_docker + } + # Get minimal table of AF data per batch, split by ancestry + call GetFreqTable { + input: + vcf=getAFs.output_vcf, + sample_pop_assignments=sample_pop_assignments, + prefix=batch, + sv_pipeline_docker=sv_pipeline_docker + } + } + + # Merge frequency results per batch into a single table of all variants with AF data across batches + call MergeFreqTables { + input: + tables=GetFreqTable.freq_data, + batches_list=batches_list, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + call MergeFreqTables as MergeFreqTables_allPops { + input: + tables=GetFreqTable.freq_data_allPops, + batches_list=batches_list, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + + # Compare frequencies before and after minGQ, and generate list of variants + # that are significantly different between the steps + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=af_pcrmins_premingq, + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + + # Generate matrix of correlation coefficients for all batches, by population & SVTYPE + #scatter ( pop in populations ) { + # call MakeCorrelationMatrices { + # input: + # freq_table=MergeFreqTables.merged_table, + # pop=pop, + # batches_list=batches_list, + # prefix=prefix, + # sv_pipeline_docker=sv_pipeline_docker + # } + #} + + # Perform 
one-vs-all comparison of AFs per batch to find batch-specific sites + scatter ( batch in batches ) { + call helper.check_batch_effects as one_vs_all_comparison { + input: + freq_table=MergeFreqTables.merged_table, + batch1=batch, + batch2="ALL_OTHERS", + prefix=prefix, + variants_per_shard=variants_per_shard, + sv_pipeline_docker=sv_pipeline_docker + } + } + # Collect results from pairwise batch effect detection + call MergeVariantFailureLists as merge_one_vs_all_checks { + input: + fail_variant_lists=one_vs_all_comparison.batch_effect_variants, + prefix="~{prefix}.one_vs_all_comparisons", + sv_pipeline_docker=sv_pipeline_docker + } + + # Distill final table of variants to be reclassified + call MakeReclassificationTable { + input: + freq_table=MergeFreqTables.merged_table, + onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, + prefix=prefix, + onevsall_cutoff = onevsall_cutoff, + sv_pipeline_docker=sv_pipeline_docker + } + + # Apply batch effect labels + scatter ( contig in contigs ) { + call ApplyBatchEffectLabels as apply_labels_perContig { + input: + vcf=vcf, + vcf_idx=vcf_idx, + contig=contig[0], + reclassification_table=MakeReclassificationTable.reclassification_table, + mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, + prefix="~{prefix}.~{contig[0]}", + sv_pipeline_docker=sv_pipeline_docker + } + } + call MiniTasks.ConcatVcfs as merge_labeled_vcfs { + input: + vcfs=apply_labels_perContig.labeled_vcf, + naive=true, + outfile_prefix="~{prefix}.batch_effects_labeled_merged", + sv_base_mini_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_merge_labeled_vcfs + } + + output { + File labeled_vcf = merge_labeled_vcfs.concat_vcf + File labeled_vcf_idx = merge_labeled_vcfs.concat_vcf_idx + } +} + + +# Get list of samples to include & exclude per batch +# Always exclude probands from all batches +task GetBatchSamplesList { + input{ + File vcf + File vcf_idx + String batch + File sample_batch_assignments + File probands_list + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + # Get list of all samples present in VCF header + tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' | sort -Vk1,1 \ + > all_samples.list + # Get list of samples in batch + fgrep -w ~{batch} ~{sample_batch_assignments} | cut -f1 \ + | fgrep -wf - all_samples.list \ + | fgrep -wvf ~{probands_list} \ + > "~{batch}.samples.list" || true + # Get list of samples not in batch + fgrep -wv ~{batch} ~{sample_batch_assignments} | cut -f1 \ + | cat - ~{probands_list} | sort -Vk1,1 | uniq \ + | fgrep -wf - all_samples.list \ + > "~{batch}.exclude_samples.list" || true + >>> + + output { + File include_samples_list = "~{batch}.samples.list" + File exclude_samples_list = "~{batch}.exclude_samples.list" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Run vcf2bed and subset to just include VID, SVTYPE, SVLEN, _AC, and _AN +task GetFreqTable { + input{ + File vcf + File sample_pop_assignments + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 6, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + #Run vcf2bed + svtk vcf2bed \ + --info ALL \ + --no-samples \ + ~{vcf} "~{prefix}.vcf2bed.bed" + ### Create table of freqs by ancestry + #Cut to necessary columns + idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ print $1, NR }' \ + | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ + | fgrep -v "OTH" \ + | cut -f2 \ + | paste -s -d\, || true ) + cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.preclean.txt.gz" + #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ + "~{prefix}.frequencies.preclean.txt.gz" \ + "~{sample_pop_assignments}" \ + "~{prefix}.frequencies.txt" + ### Create table of freqs, irrespective of ancestry + #Cut to necessary columns + idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ + | paste -s -d\, || true ) + cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed + svtype_idx=$( sed -n '1p' minfreq.subset.bed \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) + awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, 
$5 }' minfreq.subset.bed \ + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.allPops.txt.gz" + >>> + + output { + File freq_data = "~{prefix}.frequencies.txt.gz" + File freq_data_allPops = "~{prefix}.frequencies.allPops.txt.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Combine frequency data across batches +task MergeFreqTables { + input{ + Array[File] tables + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: 100, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + + #Get list of batch IDs and batch table paths + while read batch; do + echo "$batch" + find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' + done < ~{batches_list} | paste - - \ + > input.list + + #Make sure all input files have the same number of lines + while read batch file; do + zcat "$file" | wc -l + done < input.list > nlines.list + nlines=$( sort nlines.list | uniq | wc -l ) + if [ "$nlines" -gt 1 ]; then + echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" + exit 0 + fi + + #Prep files for paste joining + echo "PREPPING FILES FOR MERGING" + while read batch file; do + #Header + zcat "$file" | sed -n '1p' | cut -f1-3 + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f1-3 + done < <( sed -n '1p' input.list ) \ + > header.txt + while read batch file; do + for wrapper in 1; do + #Header + zcat "$file" | sed -n '1p' \ + | cut -f4- | sed 's/\t/\n/g' \ + | awk -v batch="$batch" '{ print $1"."batch }' \ + | paste -s + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f4- + done > "$batch.prepped.txt" + done < input.list + + #Join files with simple paste + paste \ + header.txt \ + $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ + | gzip -c \ + > "~{prefix}.merged_AF_table.txt.gz" + >>> + + output { + File merged_table = "~{prefix}.merged_AF_table.txt.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Compare +task CompareFreqsPrePostMinGQPcrminus { + input{ + File af_pcrmins_premingq + File AF_postMinGQ_table + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 30, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \ + ~{af_pcrmins_premingq} \ + ~{AF_postMinGQ_table} \ + ./ \ + "~{prefix}." + >>> + + output { + File pcrminus_fails = "~{prefix}.PCRMINUS_minGQ_AF_prePost_fails.VIDs.list" + File minGQ_prePost_comparison_data = "~{prefix}.minGQ_AF_prePost_comparison.data.txt.gz" + File minGQ_prePost_comparison_plot = "~{prefix}.minGQ_AF_prePost_comparison.plot.png" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Calculate & plot cross-batch correlation coefficient matrixes +task MakeCorrelationMatrices { + input{ + File freq_table + String pop + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \ + ~{batches_list} \ + ~{freq_table} \ + "~{pop}" \ + "~{prefix}.~{pop}" + >>> + output { + Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt") + Array[File] heat_maps = glob("~{prefix}.~{pop}.*heatmap*.pdf") + Array[File] dot_plots = glob("~{prefix}.~{pop}.*perBatch_R2_sina_plot.pdf") + } + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +# Merge lists of batch effect checks and count total number of times each variant failed +task MergeVariantFailureLists { + input{ + Array[File] fail_variant_lists + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + #Write list of paths to all batch effect variant lists + #Get master list of PCR+ to PCR+ failures #removed from the PCR- only projects + #Get master list of PCR- to PCR- failures #removed from the PCR- only projects + #Get master list of PCR+ to PCR- failures #removed from the PCR- only projects + #Get master list of all possible failures + cat ~{write_lines(fail_variant_lists)} \ + | xargs -I {} cat {} \ + | sort -Vk1,1 | uniq -c \ + | awk -v OFS="\t" '{ print $2, $1 }' \ + > "~{prefix}.all.failures.txt" || true + >>> + + output { + File fails_per_variant_all = "~{prefix}.all.failures.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Consolidate all batch effect check results into a single table with reclassification per variant +task MakeReclassificationTable { + input{ + File freq_table + File onevsall_fails + String prefix + Int? onevsall_cutoff + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \ + ~{freq_table} \ + ~{onevsall_fails} \ + "~{prefix}.batch_effect_reclassification_table.txt" \ + ~{onevsall_cutoff} + >>> + + output { + File reclassification_table = "~{prefix}.batch_effect_reclassification_table.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Apply batch effect labels to VCF +task ApplyBatchEffectLabels { + input{ + File vcf + File vcf_idx + String contig + File reclassification_table + File mingq_prePost_pcrminus_fails + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + tabix -h ~{vcf} ~{contig} \ + | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ + --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + stdin \ + ~{reclassification_table} \ + stdout \ + | bgzip -c \ + > "~{prefix}.batch_effects_labeled.vcf.gz" + tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz" + >>> + + output { + File labeled_vcf = "~{prefix}.batch_effects_labeled.vcf.gz" + File labeled_vcf_idx = "~{prefix}.batch_effects_labeled.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} From b691bd63fa99f33bd3e96835987ae5c2b6619046 Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Fri, 10 Nov 2023 10:57:46 -0500 Subject: [PATCH 05/11] no_pairwise_no_unstable_af_filter --- wdl/Module07XfBatchEffect_no_pairwise.wdl | 63 +++++++++++++++++++---- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl index f7958ba92..14ac8cceb 100644 --- a/wdl/Module07XfBatchEffect_no_pairwise.wdl +++ b/wdl/Module07XfBatchEffect_no_pairwise.wdl @@ -22,7 +22,7 @@ workflow XfBatchEffect { Int variants_per_shard Int? onevsall_cutoff=2 String prefix - File af_pcrmins_premingq + File? af_pcrmins_premingq String sv_pipeline_docker RuntimeAttr? runtime_attr_merge_labeled_vcfs @@ -85,12 +85,14 @@ workflow XfBatchEffect { # Compare frequencies before and after minGQ, and generate list of variants # that are significantly different between the steps - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=af_pcrmins_premingq, - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker + if (defined(af_pcrmins_premingq)) { + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=select_first([af_pcrmins_premingq]), + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } } # Generate matrix of correlation coefficients for all batches, by population & SVTYPE @@ -470,6 +472,47 @@ task MakeCorrelationMatrices { } } + +# Generate list of all pairs of batches to be compared +task MakeBatchPairsList { + input{ + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ + ~{batches_list} \ + "~{prefix}.nonredundant_batch_pairs.txt" + >>> + + output { + File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + # Merge lists of batch effect checks and count total number of times each variant failed task MergeVariantFailureLists { input{ @@ -568,7 +611,7 @@ task ApplyBatchEffectLabels { File vcf_idx String contig File reclassification_table - File mingq_prePost_pcrminus_fails + File? mingq_prePost_pcrminus_fails String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -586,7 +629,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ stdin \ ~{reclassification_table} \ stdout \ @@ -609,4 +652,4 @@ task ApplyBatchEffectLabels { preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} +} \ No newline at end of file From 592d93aac3cef3e013de9fb3937c6b179c3f2d4b Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Mon, 20 Nov 2023 10:08:53 -0500 Subject: [PATCH 06/11] updated the related wdl and R script --- .../.Rhistory | 0 ...ect_reclassification_table.PCRMinus_only.R | 50 +-- wdl/Module07XfBatchEffect.wdl | 308 ++++++++---------- 3 files changed, 146 insertions(+), 212 deletions(-) create mode 100644 src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory new file mode 100644 index 000000000..e69de29bb diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R index a409405bf..1f233d25d 100755 --- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R +++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R @@ -54,13 +54,10 @@ import.fails <- function(minus.in,prefix){ } #Categorize failing sites -categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){ - dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_pairwise, - dat$frac_plus_fails_onevsall), +analyze.failures <- 
function(dat,onevsall.cutoff){
+  dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_onevsall),
                              1,max,na.rm=T)
-  pairwise.fail.idx <- which(dat$fails_pairwise>=pairwise.cutoff & dat$fails_onevsall<onevsall.cutoff)
-  onevsall.fail.idx <- which(dat$fails_pairwise<pairwise.cutoff & dat$fails_onevsall>=onevsall.cutoff)
-  both.fail.idx <- which(dat$fails_pairwise>=pairwise.cutoff & dat$fails_onevsall>=onevsall.cutoff)
+  onevsall.fail.idx <- which(dat$fails_onevsall>=onevsall.cutoff)
   plus.gt.minus <- which(dat$PCRPLUS_AF>=dat$PCRMINUS_AF)
   minus.gt.plus <- which(dat$PCRPLUS_AF<dat$PCRMINUS_AF)
   plus.enriched <- c()
   plus.depleted <- c()
   batch.variable <- c()
-  for(i in pairwise.fail.idx){
-    row <- dat[i,]
-    if(as.numeric(row$max_plus_frac)>=0.11){
-      if(as.numeric(row$PCRPLUS_AF)>=as.numeric(row$PCRMINUS_AF)){
-        plus.enriched <- c(plus.enriched,i)
-      }else{
-        plus.depleted <- c(plus.depleted,i)
-      }
-    }else{
-      batch.variable <- c(batch.variable,i)
-    }
-  }
   for(i in onevsall.fail.idx){
     row <- dat[i,]
     if(as.numeric(row$frac_plus_fails_onevsall)>=0.11){
@@ -92,18 +77,6 @@ categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){
       batch.variable <- c(batch.variable,i)
     }
   }
-  for(i in both.fail.idx){
-    row <- dat[i,]
-    if(as.numeric(row$max_plus_frac)>=0.11){
-      if(as.numeric(row$PCRPLUS_AF)>=as.numeric(row$PCRMINUS_AF)){
-        plus.enriched <- c(plus.enriched,i)
-      }else{
-        plus.depleted <- c(plus.depleted,i)
-      }
-    }else{
-      batch.variable <- c(batch.variable,i)
-    }
-  }
 
   #Build classification table
   out.table <- data.frame("VID"=dat$VID[c(plus.enriched,plus.depleted,batch.variable)],
@@ -118,13 +91,13 @@ categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){
 
 ###Read command-line arguments
 args <- commandArgs(trailingOnly=T)
 freq.table.in <- as.character(args[1])
-pairwise.in <- as.character(args[2])
 #pairwise.minus.in <- as.character(args[3])
-onevsall.in <- as.character(args[3])
+onevsall.in <- as.character(args[2])
 #onevsall.minus.in <- as.character(args[5])
-OUTFILE <- as.character(args[4])
-pairwise.cutoff <- as.integer(args[5])
-onevsall.cutoff <- as.integer(args[6])
+OUTFILE <- as.character(args[3])
+#pairwise.cutoff <- as.integer(args[5])
+onevsall.cutoff <- as.integer(args[4])
 
 # #Dev parameters:
 # freq.table.in <- "~/scratch/gnomAD_v2_SV_MASTER.merged_AF_table.txt.gz"
@@ -139,14 +112,12 @@ onevsall.cutoff <- as.integer(args[6])
 
 freq.dat <- import.freqs(freq.table.in)
 
-pairwise.fails <- import.fails(pairwise.in, prefix="pairwise")
-pairwise.fails <- pairwise.fails[which(pairwise.fails$fails_pairwise>=pairwise.cutoff), ]
 onevsall.fails <- import.fails(onevsall.in, prefix="onevsall")
 onevsall.fails <- onevsall.fails[which(onevsall.fails$fails_onevsall>=onevsall.cutoff), ]
 
 ###Combine data
-merged <- merge(pairwise.fails,onevsall.fails,all=T,sort=F,by="VID")
+merged <- onevsall.fails
 if(nrow(merged) > 0){
   merged[,-1] <- apply(merged[,-1],2,function(vals){
     vals[which(is.na(vals))] <- 0
@@ -157,8 +128,7 @@
 merged <- merge(merged,freq.dat,by="VID",sort=F,all=F)
 
 ##Categorize batch effect failure sites
-out.table <- categorize.failures(dat=merged,
-                                 pairwise.cutoff=pairwise.cutoff,
+out.table <- analyze.failures(dat=merged,
                               onevsall.cutoff=onevsall.cutoff)
 write.table(out.table,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F)
diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index 5a248c63e..14ac8cceb 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -20,7 +20,6 @@ workflow XfBatchEffect {
     File contiglist
     File? par_bed
     Int variants_per_shard
-    Int? pairwise_cutoff=2
    Int? onevsall_cutoff=2
    String prefix
    File? af_pcrmins_premingq
    String sv_pipeline_docker
 
    RuntimeAttr?
runtime_attr_merge_labeled_vcfs } - Array[String] batches = read_lines(batches_list) - Array[Array[String]] contigs = read_tsv(contiglist) - + Array[String] batches = read_lines(batches_list) + Array[Array[String]] contigs = read_tsv(contiglist) + # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats scatter ( batch in batches ) { # Get list of samples to include & exclude per batch @@ -43,7 +42,7 @@ workflow XfBatchEffect { probands_list=excludesamples_list, sv_pipeline_docker=sv_pipeline_docker } - # Prune VCF to samples + # Prune VCF to samples call calcAF.prune_and_add_vafs as getAFs { input: vcf=vcf, @@ -108,35 +107,6 @@ workflow XfBatchEffect { # } #} - # Make list of nonredundant pairs of batches to be evaluated - call MakeBatchPairsList { - input: - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - Array[Array[String]] batch_pairs = read_tsv(MakeBatchPairsList.batch_pairs_list) - - # Compute AF stats per pair of batches & determine variants with batch effects - scatter ( pair in batch_pairs ) { - call helper.check_batch_effects as check_batch_effects { - input: - freq_table=MergeFreqTables.merged_table, - batch1=pair[0], - batch2=pair[1], - prefix=prefix, - variants_per_shard=variants_per_shard, - sv_pipeline_docker=sv_pipeline_docker - } - } - # Collect results from pairwise batch effect detection - call MergeVariantFailureLists as merge_pairwise_checks { - input: - fail_variant_lists=check_batch_effects.batch_effect_variants, - prefix="~{prefix}.pairwise_comparisons", - sv_pipeline_docker=sv_pipeline_docker - } - # Perform one-vs-all comparison of AFs per batch to find batch-specific sites scatter ( batch in batches ) { call helper.check_batch_effects as one_vs_all_comparison { @@ -161,10 +131,8 @@ workflow XfBatchEffect { call MakeReclassificationTable { input: freq_table=MergeFreqTables.merged_table, - pairwise_fails=merge_pairwise_checks.fails_per_variant_all, onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, prefix=prefix, - pairwise_cutoff = pairwise_cutoff, onevsall_cutoff = onevsall_cutoff, sv_pipeline_docker=sv_pipeline_docker } @@ -211,13 +179,13 @@ task GetBatchSamplesList { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail @@ -263,49 +231,49 @@ task GetFreqTable { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 6, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 6, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail #Run vcf2bed svtk vcf2bed \ - --info ALL \ - --no-samples \ - ~{vcf} "~{prefix}.vcf2bed.bed" + --info ALL \ + --no-samples \ + ~{vcf} "~{prefix}.vcf2bed.bed" ### Create table of freqs by ancestry #Cut to necessary columns idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ print $1, NR }' \ - | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ - | fgrep -v "OTH" \ - | cut -f2 \ - | paste -s -d\, || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ print $1, NR }' \ + | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ + | fgrep -v "OTH" \ + | cut -f2 \ + | paste -s -d\, || true ) cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ | sed 's/^name/\#VID/g' \ | gzip -c \ > "~{prefix}.frequencies.preclean.txt.gz" #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ - "~{prefix}.frequencies.preclean.txt.gz" \ - "~{sample_pop_assignments}" \ - "~{prefix}.frequencies.txt" + "~{prefix}.frequencies.preclean.txt.gz" \ + "~{sample_pop_assignments}" \ + "~{prefix}.frequencies.txt" ### Create table of freqs, irrespective of ancestry #Cut to necessary columns idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ - | paste -s -d\, || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ + | paste -s -d\, || true ) cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed svtype_idx=$( sed -n '1p' minfreq.subset.bed \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, $5 }' minfreq.subset.bed \ | sed 's/^name/\#VID/g' \ | gzip -c \ @@ -339,63 +307,63 @@ task MergeFreqTables { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 16, - disk_gb: 100, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 16, + disk_gb: 100, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail #Get list of batch IDs and batch table paths while read batch; do - echo "$batch" - find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' + echo "$batch" + find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' done < ~{batches_list} | paste - - \ > input.list #Make sure all input files have the same number of lines while read batch file; do - zcat "$file" | wc -l + zcat "$file" | wc -l done < input.list > nlines.list nlines=$( sort nlines.list | uniq | wc -l ) if [ "$nlines" -gt 1 ]; then - echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" - exit 0 + echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" + exit 0 fi #Prep files for paste joining echo "PREPPING FILES FOR MERGING" while read batch file; do - #Header - zcat "$file" | sed -n '1p' | cut -f1-3 - #Body - zcat "$file" | sed '1d' \ - | sort -Vk1,1 \ - | cut -f1-3 + #Header + zcat "$file" | sed -n '1p' | cut -f1-3 + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f1-3 done < <( sed -n '1p' input.list ) \ > header.txt while read batch file; do - for wrapper in 1; do - #Header - zcat "$file" | sed -n '1p' \ - | cut -f4- | sed 's/\t/\n/g' \ - | awk -v batch="$batch" '{ print $1"."batch }' \ - | paste -s - #Body - zcat "$file" | sed '1d' \ - | sort -Vk1,1 \ - | cut -f4- - done > "$batch.prepped.txt" + for wrapper in 1; do + #Header + zcat "$file" | sed -n '1p' \ + | cut -f4- | sed 's/\t/\n/g' \ + | awk -v batch="$batch" '{ print $1"."batch }' \ + | paste -s + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f4- + done > "$batch.prepped.txt" done < input.list #Join files with simple paste paste \ - header.txt \ - $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ + header.txt \ + $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ | gzip -c \ > "~{prefix}.merged_AF_table.txt.gz" >>> @@ -416,31 +384,31 @@ task MergeFreqTables { } -# Compare +# Compare task CompareFreqsPrePostMinGQPcrminus { input{ - File af_pcrmins_premingq - File AF_postMinGQ_table - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override + File af_pcrmins_premingq + File AF_postMinGQ_table + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 30, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 30, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \ - ~{af_pcrmins_premingq} \ - ~{AF_postMinGQ_table} \ - ./ \ - "~{prefix}." + ~{af_pcrmins_premingq} \ + ~{AF_postMinGQ_table} \ + ./ \ + "~{prefix}." >>> output { @@ -469,24 +437,24 @@ task MakeCorrelationMatrices { File batches_list String prefix String sv_pipeline_docker - RuntimeAttr? runtime_attr_override + RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \ - ~{batches_list} \ - ~{freq_table} \ - "~{pop}" \ - "~{prefix}.~{pop}" + ~{batches_list} \ + ~{freq_table} \ + "~{pop}" \ + "~{prefix}.~{pop}" >>> output { Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt") @@ -514,19 +482,19 @@ task MakeBatchPairsList { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ - ~{batches_list} \ - "~{prefix}.nonredundant_batch_pairs.txt" + ~{batches_list} \ + "~{prefix}.nonredundant_batch_pairs.txt" >>> output { @@ -554,13 +522,13 @@ task MergeVariantFailureLists { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail @@ -596,32 +564,28 @@ task MergeVariantFailureLists { task MakeReclassificationTable { input{ File freq_table - File pairwise_fails File onevsall_fails String prefix - Int? pairwise_cutoff Int? onevsall_cutoff String sv_pipeline_docker RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \ - ~{freq_table} \ - ~{pairwise_fails} \ - ~{onevsall_fails} \ - "~{prefix}.batch_effect_reclassification_table.txt" \ - ~{pairwise_cutoff} \ - ~{onevsall_cutoff} + ~{freq_table} \ + ~{onevsall_fails} \ + "~{prefix}.batch_effect_reclassification_table.txt" \ + ~{onevsall_cutoff} >>> output { @@ -653,24 +617,24 @@ task ApplyBatchEffectLabels { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ - stdin \ - ~{reclassification_table} \ - stdout \ - | bgzip -c \ - > "~{prefix}.batch_effects_labeled.vcf.gz" + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ + stdin \ + ~{reclassification_table} \ + stdout \ + | bgzip -c \ + > "~{prefix}.batch_effects_labeled.vcf.gz" tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz" >>> @@ -688,4 +652,4 @@ task ApplyBatchEffectLabels { preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} +} \ No newline at end of file From 35331814f49a292d9b2d2adf38b0857b36f86785 Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Mon, 20 Nov 2023 10:30:52 -0500 Subject: [PATCH 07/11] updated the related wdl and deleted the copy of the wdl --- wdl/Module07XfBatchEffect.wdl | 41 -- wdl/Module07XfBatchEffect_no_pairwise.wdl | 655 ---------------------- 2 files changed, 696 deletions(-) delete mode 100644 wdl/Module07XfBatchEffect_no_pairwise.wdl diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 14ac8cceb..a0f1ffd66 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -472,47 +472,6 @@ task MakeCorrelationMatrices { } } - -# Generate list of all pairs of batches to be compared -task MakeBatchPairsList { - input{ - File batches_list - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ - ~{batches_list} \ - "~{prefix}.nonredundant_batch_pairs.txt" - >>> - - output { - File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Merge lists of batch effect checks and count total number of times each variant failed task MergeVariantFailureLists { input{ diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl deleted file mode 100644 index 14ac8cceb..000000000 --- a/wdl/Module07XfBatchEffect_no_pairwise.wdl +++ /dev/null @@ -1,655 +0,0 @@ -########################## -## EXPERIMENTAL WORKFLOW -########################## - -version 1.0 - -import "prune_add_af.wdl" as calcAF -import "batch_effect_helper.wdl" as helper -import "TasksMakeCohortVcf.wdl" as MiniTasks - -workflow XfBatchEffect { - input{ - File vcf - File vcf_idx - File sample_batch_assignments - File batches_list - File sample_pop_assignments - File excludesamples_list #empty file if need be - File famfile - File contiglist - File? par_bed - Int variants_per_shard - Int? onevsall_cutoff=2 - String prefix - File? af_pcrmins_premingq - String sv_pipeline_docker - - RuntimeAttr? 
runtime_attr_merge_labeled_vcfs - } - Array[String] batches = read_lines(batches_list) - Array[Array[String]] contigs = read_tsv(contiglist) - - # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats - scatter ( batch in batches ) { - # Get list of samples to include & exclude per batch - call GetBatchSamplesList { - input: - vcf=vcf, - vcf_idx=vcf_idx, - batch=batch, - sample_batch_assignments=sample_batch_assignments, - probands_list=excludesamples_list, - sv_pipeline_docker=sv_pipeline_docker - } - # Prune VCF to samples - call calcAF.prune_and_add_vafs as getAFs { - input: - vcf=vcf, - vcf_idx=vcf_idx, - prefix=batch, - sample_pop_assignments=sample_pop_assignments, - prune_list=GetBatchSamplesList.exclude_samples_list, - famfile=famfile, - sv_per_shard=25000, - contiglist=contiglist, - drop_empty_records="FALSE", - par_bed=par_bed, - sv_pipeline_docker=sv_pipeline_docker - } - # Get minimal table of AF data per batch, split by ancestry - call GetFreqTable { - input: - vcf=getAFs.output_vcf, - sample_pop_assignments=sample_pop_assignments, - prefix=batch, - sv_pipeline_docker=sv_pipeline_docker - } - } - - # Merge frequency results per batch into a single table of all variants with AF data across batches - call MergeFreqTables { - input: - tables=GetFreqTable.freq_data, - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - call MergeFreqTables as MergeFreqTables_allPops { - input: - tables=GetFreqTable.freq_data_allPops, - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - - # Compare frequencies before and after minGQ, and generate list of variants - # that are significantly different between the steps - if (defined(af_pcrmins_premingq)) { - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=select_first([af_pcrmins_premingq]), - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - } - - # Generate matrix of correlation coefficients for all batches, by population & SVTYPE - #scatter ( pop in populations ) { - # call MakeCorrelationMatrices { - # input: - # freq_table=MergeFreqTables.merged_table, - # pop=pop, - # batches_list=batches_list, - # prefix=prefix, - # sv_pipeline_docker=sv_pipeline_docker - # } - #} - - # Perform one-vs-all comparison of AFs per batch to find batch-specific sites - scatter ( batch in batches ) { - call helper.check_batch_effects as one_vs_all_comparison { - input: - freq_table=MergeFreqTables.merged_table, - batch1=batch, - batch2="ALL_OTHERS", - prefix=prefix, - variants_per_shard=variants_per_shard, - sv_pipeline_docker=sv_pipeline_docker - } - } - # Collect results from pairwise batch effect detection - call MergeVariantFailureLists as merge_one_vs_all_checks { - input: - fail_variant_lists=one_vs_all_comparison.batch_effect_variants, - prefix="~{prefix}.one_vs_all_comparisons", - sv_pipeline_docker=sv_pipeline_docker - } - - # Distill final table of variants to be reclassified - call MakeReclassificationTable { - input: - freq_table=MergeFreqTables.merged_table, - onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, - prefix=prefix, - onevsall_cutoff = onevsall_cutoff, - sv_pipeline_docker=sv_pipeline_docker - } - - # Apply batch effect labels - scatter ( contig in contigs ) { - call ApplyBatchEffectLabels as apply_labels_perContig { - input: - vcf=vcf, - vcf_idx=vcf_idx, - contig=contig[0], - 
reclassification_table=MakeReclassificationTable.reclassification_table, - mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, - prefix="~{prefix}.~{contig[0]}", - sv_pipeline_docker=sv_pipeline_docker - } - } - call MiniTasks.ConcatVcfs as merge_labeled_vcfs { - input: - vcfs=apply_labels_perContig.labeled_vcf, - naive=true, - outfile_prefix="~{prefix}.batch_effects_labeled_merged", - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_merge_labeled_vcfs - } - - output { - File labeled_vcf = merge_labeled_vcfs.concat_vcf - File labeled_vcf_idx = merge_labeled_vcfs.concat_vcf_idx - } -} - - -# Get list of samples to include & exclude per batch -# Always exclude probands from all batches -task GetBatchSamplesList { - input{ - File vcf - File vcf_idx - String batch - File sample_batch_assignments - File probands_list - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - # Get list of all samples present in VCF header - tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' | sort -Vk1,1 \ - > all_samples.list - # Get list of samples in batch - fgrep -w ~{batch} ~{sample_batch_assignments} | cut -f1 \ - | fgrep -wf - all_samples.list \ - | fgrep -wvf ~{probands_list} \ - > "~{batch}.samples.list" || true - # Get list of samples not in batch - fgrep -wv ~{batch} ~{sample_batch_assignments} | cut -f1 \ - | cat - ~{probands_list} | sort -Vk1,1 | uniq \ - | fgrep -wf - all_samples.list \ - > "~{batch}.exclude_samples.list" || true - >>> - - output { - File include_samples_list = "~{batch}.samples.list" - File exclude_samples_list = "~{batch}.exclude_samples.list" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Run vcf2bed and subset to just include VID, SVTYPE, SVLEN, _AC, and _AN -task GetFreqTable { - input{ - File vcf - File sample_pop_assignments - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 6, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - #Run vcf2bed - svtk vcf2bed \ - --info ALL \ - --no-samples \ - ~{vcf} "~{prefix}.vcf2bed.bed" - ### Create table of freqs by ancestry - #Cut to necessary columns - idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ print $1, NR }' \ - | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ - | fgrep -v "OTH" \ - | cut -f2 \ - | paste -s -d\, || true ) - cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ - | sed 's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.preclean.txt.gz" - #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) - /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ - "~{prefix}.frequencies.preclean.txt.gz" \ - "~{sample_pop_assignments}" \ - "~{prefix}.frequencies.txt" - ### Create table of freqs, irrespective of ancestry - #Cut to necessary columns - idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ - | paste -s -d\, || true ) - cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed - svtype_idx=$( sed -n '1p' minfreq.subset.bed \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) - awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, $5 }' minfreq.subset.bed \ - | sed 's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.allPops.txt.gz" - >>> - - output { - File freq_data = "~{prefix}.frequencies.txt.gz" - File freq_data_allPops = "~{prefix}.frequencies.allPops.txt.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Combine frequency data across batches -task MergeFreqTables { - input{ - Array[File] tables - File batches_list - String prefix - String sv_pipeline_docker - RuntimeAttr? 
-
-
-# Combine frequency data across batches
-task MergeFreqTables {
-  input{
-    Array[File] tables
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 16,
-    disk_gb: 100,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-
-    #Get list of batch IDs and batch table paths
-    while read batch; do
-      echo "$batch"
-      find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p'
-    done < ~{batches_list} | paste - - \
-      > input.list
-
-    #Make sure all input files have the same number of lines
-    while read batch file; do
-      zcat "$file" | wc -l
-    done < input.list > nlines.list
-    nlines=$( sort nlines.list | uniq | wc -l )
-    if [ "$nlines" -gt 1 ]; then
-      echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES"
-      exit 0
-    fi
-
-    #Prep files for paste joining
-    echo "PREPPING FILES FOR MERGING"
-    while read batch file; do
-      #Header
-      zcat "$file" | sed -n '1p' | cut -f1-3
-      #Body
-      zcat "$file" | sed '1d' \
-        | sort -Vk1,1 \
-        | cut -f1-3
-    done < <( sed -n '1p' input.list ) \
-      > header.txt
-    while read batch file; do
-      for wrapper in 1; do
-        #Header
-        zcat "$file" | sed -n '1p' \
-          | cut -f4- | sed 's/\t/\n/g' \
-          | awk -v batch="$batch" '{ print $1"."batch }' \
-          | paste -s
-        #Body
-        zcat "$file" | sed '1d' \
-          | sort -Vk1,1 \
-          | cut -f4-
-      done > "$batch.prepped.txt"
-    done < input.list
-
-    #Join files with simple paste
-    paste \
-      header.txt \
-      $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \
-      | gzip -c \
-      > "~{prefix}.merged_AF_table.txt.gz"
-  >>>
-
-  output {
-    File merged_table = "~{prefix}.merged_AF_table.txt.gz"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
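
One quirk worth flagging in MergeFreqTables as quoted above: the line-count consistency check echoes a warning but then calls `exit 0`, so a mismatch is still reported as success and the paste-join proceeds on misaligned tables. If fail-fast behavior is the intent, the guard would presumably exit nonzero. A hedged sketch of that stricter variant, as a hypothetical standalone task rather than pipeline code:

version 1.0

task CheckTablesSameLength {
  input {
    Array[File] tables   # hypothetical: gzipped per-batch AF tables
  }
  command <<<
    set -euo pipefail
    # write_lines() materializes the array as a file of localized paths.
    while read -r file; do
      zcat "$file" | wc -l
    done < ~{write_lines(tables)} > nlines.list
    # Exit 1 (not 0) so the workflow engine marks the task as failed.
    if [ "$( sort -u nlines.list | wc -l )" -gt 1 ]; then
      echo "ERROR: input tables differ in line count" >&2
      exit 1
    fi
  >>>
  output {
    Boolean ok = true
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
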
-
-
-# Compare
-task CompareFreqsPrePostMinGQPcrminus {
-  input{
-    File af_pcrmins_premingq
-    File AF_postMinGQ_table
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 30,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \
-      ~{af_pcrmins_premingq} \
-      ~{AF_postMinGQ_table} \
-      ./ \
-      "~{prefix}."
-  >>>
-
-  output {
-    File pcrminus_fails = "~{prefix}.PCRMINUS_minGQ_AF_prePost_fails.VIDs.list"
-    File minGQ_prePost_comparison_data = "~{prefix}.minGQ_AF_prePost_comparison.data.txt.gz"
-    File minGQ_prePost_comparison_plot = "~{prefix}.minGQ_AF_prePost_comparison.plot.png"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
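
The workflow feeds CompareFreqsPrePostMinGQPcrminus.pcrminus_fails into ApplyBatchEffectLabels, whose matching input is declared `File? mingq_prePost_pcrminus_fails`: in WDL, a call placed inside an `if (defined(...))` block runs only when the guarded input is supplied, and its outputs are seen downstream as optional. A minimal sketch of that gating pattern, with hypothetical names:

version 1.0

workflow GateOnOptionalInput {
  input {
    File? maybe_table   # hypothetical optional input
  }
  if (defined(maybe_table)) {
    call Consume {
      input:
        table = select_first([maybe_table])  # coerce File? -> File inside the guard
    }
  }
  output {
    File? result = Consume.out  # File? because the call may not have run
  }
}

task Consume {
  input {
    File table
  }
  command <<<
    wc -l ~{table} > out.txt
  >>>
  output {
    File out = "out.txt"
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
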
-
-
-# Calculate & plot cross-batch correlation coefficient matrixes
-task MakeCorrelationMatrices {
-  input{
-    File freq_table
-    String pop
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \
-      ~{batches_list} \
-      ~{freq_table} \
-      "~{pop}" \
-      "~{prefix}.~{pop}"
-  >>>
-  output {
-    Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt")
-    Array[File] heat_maps = glob("~{prefix}.~{pop}.*heatmap*.pdf")
-    Array[File] dot_plots = glob("~{prefix}.~{pop}.*perBatch_R2_sina_plot.pdf")
-  }
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Generate list of all pairs of batches to be compared
-task MakeBatchPairsList {
-  input{
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \
-      ~{batches_list} \
-      "~{prefix}.nonredundant_batch_pairs.txt"
-  >>>
-
-  output {
-    File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Merge lists of batch effect checks and count total number of times each variant failed
-task MergeVariantFailureLists {
-  input{
-    Array[File] fail_variant_lists
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    #Write list of paths to all batch effect variant lists
-    #Get master list of PCR+ to PCR+ failures #removed from the PCR- only projects
-    #Get master list of PCR- to PCR- failures #removed from the PCR- only projects
-    #Get master list of PCR+ to PCR- failures #removed from the PCR- only projects
-    #Get master list of all possible failures
-    cat ~{write_lines(fail_variant_lists)} \
-      | xargs -I {} cat {} \
-      | sort -Vk1,1 | uniq -c \
-      | awk -v OFS="\t" '{ print $2, $1 }' \
-      > "~{prefix}.all.failures.txt" || true
-  >>>
-
-  output {
-    File fails_per_variant_all = "~{prefix}.all.failures.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
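
The counting idiom in MergeVariantFailureLists — concatenate every per-check VID list, then `sort | uniq -c | awk` — yields one row per variant with the number of checks it failed. A toy, self-contained illustration of what that pipeline produces (demo data only, not pipeline code):

version 1.0

task CountFailuresDemo {
  command <<<
    set -euo pipefail
    # Two hypothetical per-check failure lists; SV_2 fails both checks.
    printf 'SV_1\nSV_2\n' > checkA.list
    printf 'SV_2\nSV_3\n' > checkB.list
    # uniq -c prints "<count> <VID>"; awk swaps to "<VID><TAB><count>".
    cat checkA.list checkB.list \
      | sort -Vk1,1 | uniq -c \
      | awk -v OFS="\t" '{ print $2, $1 }' \
      > counts.txt
    # counts.txt now reads:
    #   SV_1  1
    #   SV_2  2
    #   SV_3  1
  >>>
  output {
    File counts = "counts.txt"
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
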
-
-
-# Consolidate all batch effect check results into a single table with reclassification per variant
-task MakeReclassificationTable {
-  input{
-    File freq_table
-    File onevsall_fails
-    String prefix
-    Int? onevsall_cutoff
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \
-      ~{freq_table} \
-      ~{onevsall_fails} \
-      "~{prefix}.batch_effect_reclassification_table.txt" \
-      ~{onevsall_cutoff}
-  >>>
-
-  output {
-    File reclassification_table = "~{prefix}.batch_effect_reclassification_table.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Apply batch effect labels to VCF
-task ApplyBatchEffectLabels {
-  input{
-    File vcf
-    File vcf_idx
-    String contig
-    File reclassification_table
-    File? mingq_prePost_pcrminus_fails
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    tabix -h ~{vcf} ~{contig} \
-      | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \
-        ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
-        stdin \
-        ~{reclassification_table} \
-        stdout \
-      | bgzip -c \
-      > "~{prefix}.batch_effects_labeled.vcf.gz"
-    tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz"
-  >>>
-
-  output {
-    File labeled_vcf = "~{prefix}.batch_effects_labeled.vcf.gz"
-    File labeled_vcf_idx = "~{prefix}.batch_effects_labeled.vcf.gz.tbi"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
\ No newline at end of file
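
In ApplyBatchEffectLabels, the placeholder `~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails}` is the standard WDL idiom for an optional flag: when the optional file is defined, the expression renders the flag followed by the localized path; when it is undefined, the whole placeholder evaluates to the empty string and the flag drops out of the command line. A hypothetical standalone demonstration:

version 1.0

task OptionalFlagDemo {
  input {
    File? maybe_list   # hypothetical optional input
  }
  command <<<
    # Prints "my_script --unstable-af-pcrminus /path/to/list" when maybe_list
    # is supplied, and just "my_script" when it is omitted.
    echo my_script ~{"--unstable-af-pcrminus " + maybe_list}
  >>>
  output {
    String rendered = read_string(stdout())
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
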
From baba8aaccbc512a8e164de40ca9de190a3ca8c87 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 11:57:02 -0500
Subject: [PATCH 08/11] Update make_batch_effect_reclassification_table.PCRMinus_only.R

# Just use onevsall.fails directly.
merged <- onevsall.fails
---
 ...ect_reclassification_table.PCRMinus_only.R | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
index 1f233d25d..c9687d62d 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
@@ -117,19 +117,23 @@ onevsall.fails <- onevsall.fails[which(onevsall.fails$fails_onevsall>=onevsall.c
 
 ###Combine data
-merged <- merge(onevsall.fails,all=T,sort=F,by="VID")
-if(nrow(merged) > 0){
-  merged[,-1] <- apply(merged[,-1],2,function(vals){
+#merged <- merge(onevsall.fails,all=T,sort=F,by="VID")
+merged <- onevsall.fails
+# If merged data frame is not empty, replace NA values with 0
+if (nrow(merged) > 0) {
+  merged[,-1] <- apply(merged[,-1], 2, function(vals) {
     vals[which(is.na(vals))] <- 0
     return(vals)
   })
 }
-merged <- merge(merged,freq.dat,by="VID",sort=F,all=F)
+# Merge with frequency data
+merged <- merge(merged, freq.dat, by = "VID", sort = F, all = F)
 
-##Categorize batch effect failure sites
-out.table <- analyze.failures(dat=merged,
-                              onevsall.cutoff=onevsall.cutoff)
-write.table(out.table,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F)
+## Categorize batch effect failure sites
+# Assuming the categorize.failures function only requires onevsall.cutoff
+# and is compatible with the new merged data structure
+out.table <- categorize.failures(dat = merged, onevsall.cutoff = onevsall.cutoff)
-
\ No newline at end of file
+# Write the output table
+write.table(out.table, OUTFILE, col.names = F, row.names = F, sep = "\t", quote = F)

From b253052c97b698bd3caba889c24fce2c7bff8abe Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 12:15:32 -0500
Subject: [PATCH 09/11] Update Module07XfBatchEffect.wdl

adjust tab spaces
---
 wdl/Module07XfBatchEffect.wdl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index a0f1ffd66..161f01a51 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -588,12 +588,12 @@ task ApplyBatchEffectLabels {
     set -euo pipefail
     tabix -h ~{vcf} ~{contig} \
     | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \
-    ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
-    stdin \
-    ~{reclassification_table} \
-    stdout \
-    | bgzip -c \
-    > "~{prefix}.batch_effects_labeled.vcf.gz"
+      ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
+      stdin \
+      ~{reclassification_table} \
+      stdout \
+      | bgzip -c \
+      > "~{prefix}.batch_effects_labeled.vcf.gz"
     tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz"
   >>>
@@ -611,4 +611,4 @@
     preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
     maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
   }
-}
\ No newline at end of file
+}

From 54c421e4f48dcf0db088a11fb5ab364d47f4763c Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 12:19:36 -0500
Subject: [PATCH 10/11] Update Module07XfBatchEffect.wdl
---
 wdl/Module07XfBatchEffect.wdl | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index 161f01a51..d2b737a5e 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -576,13 +576,14 @@ task ApplyBatchEffectLabels {
     RuntimeAttr? runtime_attr_override
   }
   RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
+  cpu_cores: 1,
+  mem_gb: 4,
+  disk_gb: 50,
+  boot_disk_gb: 10,
+  preemptible_tries: 3,
+  max_retries: 1
+  }
+
 RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
 command <<<
 set -euo pipefail

From 30647d36527a20bd4cbb5a133537c6354c8f2036 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:52:22 -0500
Subject: [PATCH 11/11] Update make_batch_effect_reclassification_table.PCRMinus_only.R

analyze.failures ----> categorize.failures
---
 .../make_batch_effect_reclassification_table.PCRMinus_only.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
index c9687d62d..450c6c0a4 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
@@ -54,7 +54,7 @@ import.fails <- function(minus.in,prefix){
 }
 
 #Categorize failing sites
-analyze.failures <- function(dat,onevsall.cutoff){
+categorize.failures <- function(dat,onevsall.cutoff){
   dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_onevsall),
                              1,max,na.rm=T)
   onevsall.fail.idx <- which(dat$fails_onevsall>=onevsall.cutoff)