From 43a93f1e936dd82ec9ee81195e465cf1429ec92f Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 6 Jan 2023 15:58:47 -0500 Subject: [PATCH 01/11] add toggle for unstable af filter during label step in batch effect --- wdl/Module07XfBatchEffect.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index b2148955e..48de966aa 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -24,6 +24,7 @@ workflow XfBatchEffect { Int? onevsall_cutoff=2 String prefix File af_pcrmins_premingq + Boolean? enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_merge_labeled_vcfs @@ -176,6 +177,7 @@ workflow XfBatchEffect { contig=contig[0], reclassification_table=MakeReclassificationTable.reclassification_table, mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, + enable_unstable_af_filter = select_first([enable_unstable_af_filter, true]), prefix="~{prefix}.~{contig[0]}", sv_pipeline_docker=sv_pipeline_docker } @@ -647,6 +649,7 @@ task ApplyBatchEffectLabels { File reclassification_table File mingq_prePost_pcrminus_fails String prefix + Boolean enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -663,7 +666,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + ~{if (enable_unstable_af_filter) then "--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails else ""} \ stdin \ ~{reclassification_table} \ stdout \ From 38e11d61a5bf83a1a1e928abe5dad4d406884f42 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 23 Jan 2023 11:53:17 -0500 Subject: [PATCH 02/11] skip AF comparison and filtering if no input provided --- wdl/Module07XfBatchEffect.wdl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 48de966aa..0f816d387 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -24,7 +24,6 @@ workflow XfBatchEffect { Int? onevsall_cutoff=2 String prefix File af_pcrmins_premingq - Boolean? enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? 
runtime_attr_merge_labeled_vcfs @@ -87,12 +86,14 @@ workflow XfBatchEffect { # Compare frequencies before and after minGQ, and generate list of variants # that are significantly different between the steps - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=af_pcrmins_premingq, - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker + if (defined(af_pcrmins_premingq)) { + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=select_first([af_pcrmins_premingq]), + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } } # Generate matrix of correlation coefficients for all batches, by population & SVTYPE @@ -177,7 +178,6 @@ workflow XfBatchEffect { contig=contig[0], reclassification_table=MakeReclassificationTable.reclassification_table, mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, - enable_unstable_af_filter = select_first([enable_unstable_af_filter, true]), prefix="~{prefix}.~{contig[0]}", sv_pipeline_docker=sv_pipeline_docker } @@ -647,9 +647,8 @@ task ApplyBatchEffectLabels { File vcf_idx String contig File reclassification_table - File mingq_prePost_pcrminus_fails + File? mingq_prePost_pcrminus_fails String prefix - Boolean enable_unstable_af_filter String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -666,7 +665,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - ~{if (enable_unstable_af_filter) then "--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails else ""} \ + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ stdin \ ~{reclassification_table} \ stdout \ From 682ca8f041264f91e201ad61cfb100c654287e79 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 23 Jan 2023 11:57:26 -0500 Subject: [PATCH 03/11] make af input actually optional --- wdl/Module07XfBatchEffect.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 0f816d387..5a248c63e 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -23,7 +23,7 @@ workflow XfBatchEffect { Int? pairwise_cutoff=2 Int? onevsall_cutoff=2 String prefix - File af_pcrmins_premingq + File? af_pcrmins_premingq String sv_pipeline_docker RuntimeAttr? 
runtime_attr_merge_labeled_vcfs From 37662c8b7c46b9800b52bc4284e0d098ead5719f Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Thu, 9 Nov 2023 15:04:43 -0500 Subject: [PATCH 04/11] Removed pairwise batch effect checks from WDL workflow --- wdl/Module07XfBatchEffect_no_pairwise.wdl | 612 ++++++++++++++++++++++ 1 file changed, 612 insertions(+) create mode 100644 wdl/Module07XfBatchEffect_no_pairwise.wdl diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl new file mode 100644 index 000000000..f7958ba92 --- /dev/null +++ b/wdl/Module07XfBatchEffect_no_pairwise.wdl @@ -0,0 +1,612 @@ +########################## +## EXPERIMENTAL WORKFLOW +########################## + +version 1.0 + +import "prune_add_af.wdl" as calcAF +import "batch_effect_helper.wdl" as helper +import "TasksMakeCohortVcf.wdl" as MiniTasks + +workflow XfBatchEffect { + input{ + File vcf + File vcf_idx + File sample_batch_assignments + File batches_list + File sample_pop_assignments + File excludesamples_list #empty file if need be + File famfile + File contiglist + File? par_bed + Int variants_per_shard + Int? onevsall_cutoff=2 + String prefix + File af_pcrmins_premingq + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_merge_labeled_vcfs + } + Array[String] batches = read_lines(batches_list) + Array[Array[String]] contigs = read_tsv(contiglist) + + # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats + scatter ( batch in batches ) { + # Get list of samples to include & exclude per batch + call GetBatchSamplesList { + input: + vcf=vcf, + vcf_idx=vcf_idx, + batch=batch, + sample_batch_assignments=sample_batch_assignments, + probands_list=excludesamples_list, + sv_pipeline_docker=sv_pipeline_docker + } + # Prune VCF to samples + call calcAF.prune_and_add_vafs as getAFs { + input: + vcf=vcf, + vcf_idx=vcf_idx, + prefix=batch, + sample_pop_assignments=sample_pop_assignments, + prune_list=GetBatchSamplesList.exclude_samples_list, + famfile=famfile, + sv_per_shard=25000, + contiglist=contiglist, + drop_empty_records="FALSE", + par_bed=par_bed, + sv_pipeline_docker=sv_pipeline_docker + } + # Get minimal table of AF data per batch, split by ancestry + call GetFreqTable { + input: + vcf=getAFs.output_vcf, + sample_pop_assignments=sample_pop_assignments, + prefix=batch, + sv_pipeline_docker=sv_pipeline_docker + } + } + + # Merge frequency results per batch into a single table of all variants with AF data across batches + call MergeFreqTables { + input: + tables=GetFreqTable.freq_data, + batches_list=batches_list, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + call MergeFreqTables as MergeFreqTables_allPops { + input: + tables=GetFreqTable.freq_data_allPops, + batches_list=batches_list, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + + # Compare frequencies before and after minGQ, and generate list of variants + # that are significantly different between the steps + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=af_pcrmins_premingq, + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } + + # Generate matrix of correlation coefficients for all batches, by population & SVTYPE + #scatter ( pop in populations ) { + # call MakeCorrelationMatrices { + # input: + # freq_table=MergeFreqTables.merged_table, + # pop=pop, + # batches_list=batches_list, + # prefix=prefix, + # sv_pipeline_docker=sv_pipeline_docker + # } + #} + + # Perform 
one-vs-all comparison of AFs per batch to find batch-specific sites + scatter ( batch in batches ) { + call helper.check_batch_effects as one_vs_all_comparison { + input: + freq_table=MergeFreqTables.merged_table, + batch1=batch, + batch2="ALL_OTHERS", + prefix=prefix, + variants_per_shard=variants_per_shard, + sv_pipeline_docker=sv_pipeline_docker + } + } + # Collect results from pairwise batch effect detection + call MergeVariantFailureLists as merge_one_vs_all_checks { + input: + fail_variant_lists=one_vs_all_comparison.batch_effect_variants, + prefix="~{prefix}.one_vs_all_comparisons", + sv_pipeline_docker=sv_pipeline_docker + } + + # Distill final table of variants to be reclassified + call MakeReclassificationTable { + input: + freq_table=MergeFreqTables.merged_table, + onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, + prefix=prefix, + onevsall_cutoff = onevsall_cutoff, + sv_pipeline_docker=sv_pipeline_docker + } + + # Apply batch effect labels + scatter ( contig in contigs ) { + call ApplyBatchEffectLabels as apply_labels_perContig { + input: + vcf=vcf, + vcf_idx=vcf_idx, + contig=contig[0], + reclassification_table=MakeReclassificationTable.reclassification_table, + mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, + prefix="~{prefix}.~{contig[0]}", + sv_pipeline_docker=sv_pipeline_docker + } + } + call MiniTasks.ConcatVcfs as merge_labeled_vcfs { + input: + vcfs=apply_labels_perContig.labeled_vcf, + naive=true, + outfile_prefix="~{prefix}.batch_effects_labeled_merged", + sv_base_mini_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_merge_labeled_vcfs + } + + output { + File labeled_vcf = merge_labeled_vcfs.concat_vcf + File labeled_vcf_idx = merge_labeled_vcfs.concat_vcf_idx + } +} + + +# Get list of samples to include & exclude per batch +# Always exclude probands from all batches +task GetBatchSamplesList { + input{ + File vcf + File vcf_idx + String batch + File sample_batch_assignments + File probands_list + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + # Get list of all samples present in VCF header + tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' | sort -Vk1,1 \ + > all_samples.list + # Get list of samples in batch + fgrep -w ~{batch} ~{sample_batch_assignments} | cut -f1 \ + | fgrep -wf - all_samples.list \ + | fgrep -wvf ~{probands_list} \ + > "~{batch}.samples.list" || true + # Get list of samples not in batch + fgrep -wv ~{batch} ~{sample_batch_assignments} | cut -f1 \ + | cat - ~{probands_list} | sort -Vk1,1 | uniq \ + | fgrep -wf - all_samples.list \ + > "~{batch}.exclude_samples.list" || true + >>> + + output { + File include_samples_list = "~{batch}.samples.list" + File exclude_samples_list = "~{batch}.exclude_samples.list" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Run vcf2bed and subset to just include VID, SVTYPE, SVLEN, _AC, and _AN +task GetFreqTable { + input{ + File vcf + File sample_pop_assignments + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 6, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + #Run vcf2bed + svtk vcf2bed \ + --info ALL \ + --no-samples \ + ~{vcf} "~{prefix}.vcf2bed.bed" + ### Create table of freqs by ancestry + #Cut to necessary columns + idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ print $1, NR }' \ + | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ + | fgrep -v "OTH" \ + | cut -f2 \ + | paste -s -d\, || true ) + cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.preclean.txt.gz" + #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ + "~{prefix}.frequencies.preclean.txt.gz" \ + "~{sample_pop_assignments}" \ + "~{prefix}.frequencies.txt" + ### Create table of freqs, irrespective of ancestry + #Cut to necessary columns + idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ + | paste -s -d\, || true ) + cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed + svtype_idx=$( sed -n '1p' minfreq.subset.bed \ + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) + awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, 
$5 }' minfreq.subset.bed \ + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.allPops.txt.gz" + >>> + + output { + File freq_data = "~{prefix}.frequencies.txt.gz" + File freq_data_allPops = "~{prefix}.frequencies.allPops.txt.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Combine frequency data across batches +task MergeFreqTables { + input{ + Array[File] tables + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: 100, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + + #Get list of batch IDs and batch table paths + while read batch; do + echo "$batch" + find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' + done < ~{batches_list} | paste - - \ + > input.list + + #Make sure all input files have the same number of lines + while read batch file; do + zcat "$file" | wc -l + done < input.list > nlines.list + nlines=$( sort nlines.list | uniq | wc -l ) + if [ "$nlines" -gt 1 ]; then + echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" + exit 0 + fi + + #Prep files for paste joining + echo "PREPPING FILES FOR MERGING" + while read batch file; do + #Header + zcat "$file" | sed -n '1p' | cut -f1-3 + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f1-3 + done < <( sed -n '1p' input.list ) \ + > header.txt + while read batch file; do + for wrapper in 1; do + #Header + zcat "$file" | sed -n '1p' \ + | cut -f4- | sed 's/\t/\n/g' \ + | awk -v batch="$batch" '{ print $1"."batch }' \ + | paste -s + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f4- + done > "$batch.prepped.txt" + done < input.list + + #Join files with simple paste + paste \ + header.txt \ + $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ + | gzip -c \ + > "~{prefix}.merged_AF_table.txt.gz" + >>> + + output { + File merged_table = "~{prefix}.merged_AF_table.txt.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Compare +task CompareFreqsPrePostMinGQPcrminus { + input{ + File af_pcrmins_premingq + File AF_postMinGQ_table + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 30, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \ + ~{af_pcrmins_premingq} \ + ~{AF_postMinGQ_table} \ + ./ \ + "~{prefix}." + >>> + + output { + File pcrminus_fails = "~{prefix}.PCRMINUS_minGQ_AF_prePost_fails.VIDs.list" + File minGQ_prePost_comparison_data = "~{prefix}.minGQ_AF_prePost_comparison.data.txt.gz" + File minGQ_prePost_comparison_plot = "~{prefix}.minGQ_AF_prePost_comparison.plot.png" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Calculate & plot cross-batch correlation coefficient matrixes +task MakeCorrelationMatrices { + input{ + File freq_table + String pop + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \ + ~{batches_list} \ + ~{freq_table} \ + "~{pop}" \ + "~{prefix}.~{pop}" + >>> + output { + Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt") + Array[File] heat_maps = glob("~{prefix}.~{pop}.*heatmap*.pdf") + Array[File] dot_plots = glob("~{prefix}.~{pop}.*perBatch_R2_sina_plot.pdf") + } + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +# Merge lists of batch effect checks and count total number of times each variant failed +task MergeVariantFailureLists { + input{ + Array[File] fail_variant_lists + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + #Write list of paths to all batch effect variant lists + #Get master list of PCR+ to PCR+ failures #removed from the PCR- only projects + #Get master list of PCR- to PCR- failures #removed from the PCR- only projects + #Get master list of PCR+ to PCR- failures #removed from the PCR- only projects + #Get master list of all possible failures + cat ~{write_lines(fail_variant_lists)} \ + | xargs -I {} cat {} \ + | sort -Vk1,1 | uniq -c \ + | awk -v OFS="\t" '{ print $2, $1 }' \ + > "~{prefix}.all.failures.txt" || true + >>> + + output { + File fails_per_variant_all = "~{prefix}.all.failures.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Consolidate all batch effect check results into a single table with reclassification per variant +task MakeReclassificationTable { + input{ + File freq_table + File onevsall_fails + String prefix + Int? onevsall_cutoff + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 8, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \ + ~{freq_table} \ + ~{onevsall_fails} \ + "~{prefix}.batch_effect_reclassification_table.txt" \ + ~{onevsall_cutoff} + >>> + + output { + File reclassification_table = "~{prefix}.batch_effect_reclassification_table.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Apply batch effect labels to VCF +task ApplyBatchEffectLabels { + input{ + File vcf + File vcf_idx + String contig + File reclassification_table + File mingq_prePost_pcrminus_fails + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + tabix -h ~{vcf} ~{contig} \ + | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ + --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + stdin \ + ~{reclassification_table} \ + stdout \ + | bgzip -c \ + > "~{prefix}.batch_effects_labeled.vcf.gz" + tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz" + >>> + + output { + File labeled_vcf = "~{prefix}.batch_effects_labeled.vcf.gz" + File labeled_vcf_idx = "~{prefix}.batch_effects_labeled.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} From b691bd63fa99f33bd3e96835987ae5c2b6619046 Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Fri, 10 Nov 2023 10:57:46 -0500 Subject: [PATCH 05/11] no_pairwise_no_unstable_af_filter --- wdl/Module07XfBatchEffect_no_pairwise.wdl | 63 +++++++++++++++++++---- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl index f7958ba92..14ac8cceb 100644 --- a/wdl/Module07XfBatchEffect_no_pairwise.wdl +++ b/wdl/Module07XfBatchEffect_no_pairwise.wdl @@ -22,7 +22,7 @@ workflow XfBatchEffect { Int variants_per_shard Int? onevsall_cutoff=2 String prefix - File af_pcrmins_premingq + File? af_pcrmins_premingq String sv_pipeline_docker RuntimeAttr? runtime_attr_merge_labeled_vcfs @@ -85,12 +85,14 @@ workflow XfBatchEffect { # Compare frequencies before and after minGQ, and generate list of variants # that are significantly different between the steps - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=af_pcrmins_premingq, - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker + if (defined(af_pcrmins_premingq)) { + call CompareFreqsPrePostMinGQPcrminus { + input: + af_pcrmins_premingq=select_first([af_pcrmins_premingq]), + AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker + } } # Generate matrix of correlation coefficients for all batches, by population & SVTYPE @@ -470,6 +472,47 @@ task MakeCorrelationMatrices { } } + +# Generate list of all pairs of batches to be compared +task MakeBatchPairsList { + input{ + File batches_list + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + command <<< + set -euo pipefail + /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ + ~{batches_list} \ + "~{prefix}.nonredundant_batch_pairs.txt" + >>> + + output { + File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + # Merge lists of batch effect checks and count total number of times each variant failed task MergeVariantFailureLists { input{ @@ -568,7 +611,7 @@ task ApplyBatchEffectLabels { File vcf_idx String contig File reclassification_table - File mingq_prePost_pcrminus_fails + File? mingq_prePost_pcrminus_fails String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -586,7 +629,7 @@ task ApplyBatchEffectLabels { set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - --unstable-af-pcrminus ~{mingq_prePost_pcrminus_fails} \ + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ stdin \ ~{reclassification_table} \ stdout \ @@ -609,4 +652,4 @@ task ApplyBatchEffectLabels { preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} +} \ No newline at end of file From 592d93aac3cef3e013de9fb3937c6b179c3f2d4b Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Mon, 20 Nov 2023 10:08:53 -0500 Subject: [PATCH 06/11] updated the related wdl and R script --- .../.Rhistory | 0 ...ect_reclassification_table.PCRMinus_only.R | 50 +-- wdl/Module07XfBatchEffect.wdl | 308 ++++++++---------- 3 files changed, 146 insertions(+), 212 deletions(-) create mode 100644 src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/.Rhistory new file mode 100644 index 000000000..e69de29bb diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R index a409405bf..1f233d25d 100755 --- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R +++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R @@ -54,13 +54,10 @@ import.fails <- function(minus.in,prefix){ } #Categorize failing sites -categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){ - dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_pairwise, - dat$frac_plus_fails_onevsall), +analyze.failures <- 
function(dat,onevsall.cutoff){
+  dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_onevsall),
                              1,max,na.rm=T)
-  pairwise.fail.idx <- which(dat$fails_pairwise>=pairwise.cutoff & dat$fails_onevsall<onevsall.cutoff)
-  onevsall.fail.idx <- which(dat$fails_pairwise<pairwise.cutoff & dat$fails_onevsall>=onevsall.cutoff)
-  both.fail.idx <- which(dat$fails_pairwise>=pairwise.cutoff & dat$fails_onevsall>=onevsall.cutoff)
+  onevsall.fail.idx <- which(dat$fails_onevsall>=onevsall.cutoff)
   plus.gt.minus <- which(dat$PCRPLUS_AF>=dat$PCRMINUS_AF)
   minus.gt.plus <- which(dat$PCRPLUS_AF<dat$PCRMINUS_AF)
   plus.enriched <- c()
   plus.depleted <- c()
   batch.variable <- c()
-  for(i in pairwise.fail.idx){
-    row <- dat[i,]
-    if(as.numeric(row$max_plus_frac)>=0.11){
-      if(as.numeric(row$PCRPLUS_AF)>=as.numeric(row$PCRMINUS_AF)){
-        plus.enriched <- c(plus.enriched,i)
-      }else{
-        plus.depleted <- c(plus.depleted,i)
-      }
-    }else{
-      batch.variable <- c(batch.variable,i)
-    }
-  }
   for(i in onevsall.fail.idx){
     row <- dat[i,]
     if(as.numeric(row$frac_plus_fails_onevsall)>=0.11){
@@ -92,18 +77,6 @@ categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){
       batch.variable <- c(batch.variable,i)
     }
   }
-  for(i in both.fail.idx){
-    row <- dat[i,]
-    if(as.numeric(row$max_plus_frac)>=0.11){
-      if(as.numeric(row$PCRPLUS_AF)>=as.numeric(row$PCRMINUS_AF)){
-        plus.enriched <- c(plus.enriched,i)
-      }else{
-        plus.depleted <- c(plus.depleted,i)
-      }
-    }else{
-      batch.variable <- c(batch.variable,i)
-    }
-  }
 
   #Build classification table
   out.table <- data.frame("VID"=dat$VID[c(plus.enriched,plus.depleted,batch.variable)],
@@ -118,13 +91,13 @@ categorize.failures <- function(dat,pairwise.cutoff,onevsall.cutoff){
 
 ###Read command-line arguments
 args <- commandArgs(trailingOnly=T)
 freq.table.in <- as.character(args[1])
-pairwise.in <- as.character(args[2])
 #pairwise.minus.in <- as.character(args[3])
-onevsall.in <- as.character(args[3])
+onevsall.in <- as.character(args[2])
 #onevsall.minus.in <- as.character(args[5])
-OUTFILE <- as.character(args[4])
-pairwise.cutoff <- as.integer(args[5])
-onevsall.cutoff <- as.integer(args[6])
+OUTFILE <- as.character(args[3])
+#pairwise.cutoff <- as.integer(args[5])
+onevsall.cutoff <- as.integer(args[4])
 
 # #Dev parameters:
 # freq.table.in <- "~/scratch/gnomAD_v2_SV_MASTER.merged_AF_table.txt.gz"
@@ -139,14 +112,12 @@ onevsall.cutoff <- as.integer(args[6])
 
 freq.dat <- import.freqs(freq.table.in)
 
-pairwise.fails <- import.fails(pairwise.in, prefix="pairwise")
-pairwise.fails <- pairwise.fails[which(pairwise.fails$fails_pairwise>=pairwise.cutoff), ]
 onevsall.fails <- import.fails(onevsall.in, prefix="onevsall")
 onevsall.fails <- onevsall.fails[which(onevsall.fails$fails_onevsall>=onevsall.cutoff), ]
 
 ###Combine data
-merged <- merge(pairwise.fails,onevsall.fails,all=T,sort=F,by="VID")
+merged <- onevsall.fails
 if(nrow(merged) > 0){
   merged[,-1] <- apply(merged[,-1],2,function(vals){
     vals[which(is.na(vals))] <- 0
@@ -157,8 +128,7 @@
 merged <- merge(merged,freq.dat,by="VID",sort=F,all=F)
 
 ##Categorize batch effect failure sites
-out.table <- categorize.failures(dat=merged,
-                                 pairwise.cutoff=pairwise.cutoff,
+out.table <- analyze.failures(dat=merged,
                               onevsall.cutoff=onevsall.cutoff)
 write.table(out.table,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F)
diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index 5a248c63e..14ac8cceb 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -20,7 +20,6 @@ workflow XfBatchEffect {
     File contiglist
     File? par_bed
     Int variants_per_shard
-    Int? pairwise_cutoff=2
    Int? onevsall_cutoff=2
    String prefix
    File? af_pcrmins_premingq
    String sv_pipeline_docker
 
    RuntimeAttr?
runtime_attr_merge_labeled_vcfs } - Array[String] batches = read_lines(batches_list) - Array[Array[String]] contigs = read_tsv(contiglist) - + Array[String] batches = read_lines(batches_list) + Array[Array[String]] contigs = read_tsv(contiglist) + # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats scatter ( batch in batches ) { # Get list of samples to include & exclude per batch @@ -43,7 +42,7 @@ workflow XfBatchEffect { probands_list=excludesamples_list, sv_pipeline_docker=sv_pipeline_docker } - # Prune VCF to samples + # Prune VCF to samples call calcAF.prune_and_add_vafs as getAFs { input: vcf=vcf, @@ -108,35 +107,6 @@ workflow XfBatchEffect { # } #} - # Make list of nonredundant pairs of batches to be evaluated - call MakeBatchPairsList { - input: - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - Array[Array[String]] batch_pairs = read_tsv(MakeBatchPairsList.batch_pairs_list) - - # Compute AF stats per pair of batches & determine variants with batch effects - scatter ( pair in batch_pairs ) { - call helper.check_batch_effects as check_batch_effects { - input: - freq_table=MergeFreqTables.merged_table, - batch1=pair[0], - batch2=pair[1], - prefix=prefix, - variants_per_shard=variants_per_shard, - sv_pipeline_docker=sv_pipeline_docker - } - } - # Collect results from pairwise batch effect detection - call MergeVariantFailureLists as merge_pairwise_checks { - input: - fail_variant_lists=check_batch_effects.batch_effect_variants, - prefix="~{prefix}.pairwise_comparisons", - sv_pipeline_docker=sv_pipeline_docker - } - # Perform one-vs-all comparison of AFs per batch to find batch-specific sites scatter ( batch in batches ) { call helper.check_batch_effects as one_vs_all_comparison { @@ -161,10 +131,8 @@ workflow XfBatchEffect { call MakeReclassificationTable { input: freq_table=MergeFreqTables.merged_table, - pairwise_fails=merge_pairwise_checks.fails_per_variant_all, onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, prefix=prefix, - pairwise_cutoff = pairwise_cutoff, onevsall_cutoff = onevsall_cutoff, sv_pipeline_docker=sv_pipeline_docker } @@ -211,13 +179,13 @@ task GetBatchSamplesList { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail @@ -263,49 +231,49 @@ task GetFreqTable { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 6, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 6, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail #Run vcf2bed svtk vcf2bed \ - --info ALL \ - --no-samples \ - ~{vcf} "~{prefix}.vcf2bed.bed" + --info ALL \ + --no-samples \ + ~{vcf} "~{prefix}.vcf2bed.bed" ### Create table of freqs by ancestry #Cut to necessary columns idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ print $1, NR }' \ - | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ - | fgrep -v "OTH" \ - | cut -f2 \ - | paste -s -d\, || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ print $1, NR }' \ + | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ + | fgrep -v "OTH" \ + | cut -f2 \ + | paste -s -d\, || true ) cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ | sed 's/^name/\#VID/g' \ | gzip -c \ > "~{prefix}.frequencies.preclean.txt.gz" #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ - "~{prefix}.frequencies.preclean.txt.gz" \ - "~{sample_pop_assignments}" \ - "~{prefix}.frequencies.txt" + "~{prefix}.frequencies.preclean.txt.gz" \ + "~{sample_pop_assignments}" \ + "~{prefix}.frequencies.txt" ### Create table of freqs, irrespective of ancestry #Cut to necessary columns idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ - | paste -s -d\, || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ + | paste -s -d\, || true ) cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed svtype_idx=$( sed -n '1p' minfreq.subset.bed \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) + | sed 's/\t/\n/g' \ + | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, $5 }' minfreq.subset.bed \ | sed 's/^name/\#VID/g' \ | gzip -c \ @@ -339,63 +307,63 @@ task MergeFreqTables { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 16, - disk_gb: 100, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 16, + disk_gb: 100, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail #Get list of batch IDs and batch table paths while read batch; do - echo "$batch" - find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' + echo "$batch" + find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p' done < ~{batches_list} | paste - - \ > input.list #Make sure all input files have the same number of lines while read batch file; do - zcat "$file" | wc -l + zcat "$file" | wc -l done < input.list > nlines.list nlines=$( sort nlines.list | uniq | wc -l ) if [ "$nlines" -gt 1 ]; then - echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" - exit 0 + echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES" + exit 0 fi #Prep files for paste joining echo "PREPPING FILES FOR MERGING" while read batch file; do - #Header - zcat "$file" | sed -n '1p' | cut -f1-3 - #Body - zcat "$file" | sed '1d' \ - | sort -Vk1,1 \ - | cut -f1-3 + #Header + zcat "$file" | sed -n '1p' | cut -f1-3 + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f1-3 done < <( sed -n '1p' input.list ) \ > header.txt while read batch file; do - for wrapper in 1; do - #Header - zcat "$file" | sed -n '1p' \ - | cut -f4- | sed 's/\t/\n/g' \ - | awk -v batch="$batch" '{ print $1"."batch }' \ - | paste -s - #Body - zcat "$file" | sed '1d' \ - | sort -Vk1,1 \ - | cut -f4- - done > "$batch.prepped.txt" + for wrapper in 1; do + #Header + zcat "$file" | sed -n '1p' \ + | cut -f4- | sed 's/\t/\n/g' \ + | awk -v batch="$batch" '{ print $1"."batch }' \ + | paste -s + #Body + zcat "$file" | sed '1d' \ + | sort -Vk1,1 \ + | cut -f4- + done > "$batch.prepped.txt" done < input.list #Join files with simple paste paste \ - header.txt \ - $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ + header.txt \ + $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \ | gzip -c \ > "~{prefix}.merged_AF_table.txt.gz" >>> @@ -416,31 +384,31 @@ task MergeFreqTables { } -# Compare +# Compare task CompareFreqsPrePostMinGQPcrminus { input{ - File af_pcrmins_premingq - File AF_postMinGQ_table - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override + File af_pcrmins_premingq + File AF_postMinGQ_table + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 30, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 30, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \ - ~{af_pcrmins_premingq} \ - ~{AF_postMinGQ_table} \ - ./ \ - "~{prefix}." + ~{af_pcrmins_premingq} \ + ~{AF_postMinGQ_table} \ + ./ \ + "~{prefix}." >>> output { @@ -469,24 +437,24 @@ task MakeCorrelationMatrices { File batches_list String prefix String sv_pipeline_docker - RuntimeAttr? runtime_attr_override + RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \ - ~{batches_list} \ - ~{freq_table} \ - "~{pop}" \ - "~{prefix}.~{pop}" + ~{batches_list} \ + ~{freq_table} \ + "~{pop}" \ + "~{prefix}.~{pop}" >>> output { Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt") @@ -514,19 +482,19 @@ task MakeBatchPairsList { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ - ~{batches_list} \ - "~{prefix}.nonredundant_batch_pairs.txt" + ~{batches_list} \ + "~{prefix}.nonredundant_batch_pairs.txt" >>> output { @@ -554,13 +522,13 @@ task MergeVariantFailureLists { RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail @@ -596,32 +564,28 @@ task MergeVariantFailureLists { task MakeReclassificationTable { input{ File freq_table - File pairwise_fails File onevsall_fails String prefix - Int? pairwise_cutoff Int? onevsall_cutoff String sv_pipeline_docker RuntimeAttr? runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 8, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 8, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \ - ~{freq_table} \ - ~{pairwise_fails} \ - ~{onevsall_fails} \ - "~{prefix}.batch_effect_reclassification_table.txt" \ - ~{pairwise_cutoff} \ - ~{onevsall_cutoff} + ~{freq_table} \ + ~{onevsall_fails} \ + "~{prefix}.batch_effect_reclassification_table.txt" \ + ~{onevsall_cutoff} >>> output { @@ -653,24 +617,24 @@ task ApplyBatchEffectLabels { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail tabix -h ~{vcf} ~{contig} \ | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \ - ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ - stdin \ - ~{reclassification_table} \ - stdout \ - | bgzip -c \ - > "~{prefix}.batch_effects_labeled.vcf.gz" + ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \ + stdin \ + ~{reclassification_table} \ + stdout \ + | bgzip -c \ + > "~{prefix}.batch_effects_labeled.vcf.gz" tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz" >>> @@ -688,4 +652,4 @@ task ApplyBatchEffectLabels { preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} +} \ No newline at end of file From 35331814f49a292d9b2d2adf38b0857b36f86785 Mon Sep 17 00:00:00 2001 From: Shadi Zaheri Date: Mon, 20 Nov 2023 10:30:52 -0500 Subject: [PATCH 07/11] updated the related wdl and deleted the copy of the wdl --- wdl/Module07XfBatchEffect.wdl | 41 -- wdl/Module07XfBatchEffect_no_pairwise.wdl | 655 ---------------------- 2 files changed, 696 deletions(-) delete mode 100644 wdl/Module07XfBatchEffect_no_pairwise.wdl diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl index 14ac8cceb..a0f1ffd66 100644 --- a/wdl/Module07XfBatchEffect.wdl +++ b/wdl/Module07XfBatchEffect.wdl @@ -472,47 +472,6 @@ task MakeCorrelationMatrices { } } - -# Generate list of all pairs of batches to be compared -task MakeBatchPairsList { - input{ - File batches_list - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 10, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \ - ~{batches_list} \ - "~{prefix}.nonredundant_batch_pairs.txt" - >>> - - output { - File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Merge lists of batch effect checks and count total number of times each variant failed task MergeVariantFailureLists { input{ diff --git a/wdl/Module07XfBatchEffect_no_pairwise.wdl b/wdl/Module07XfBatchEffect_no_pairwise.wdl deleted file mode 100644 index 14ac8cceb..000000000 --- a/wdl/Module07XfBatchEffect_no_pairwise.wdl +++ /dev/null @@ -1,655 +0,0 @@ -########################## -## EXPERIMENTAL WORKFLOW -########################## - -version 1.0 - -import "prune_add_af.wdl" as calcAF -import "batch_effect_helper.wdl" as helper -import "TasksMakeCohortVcf.wdl" as MiniTasks - -workflow XfBatchEffect { - input{ - File vcf - File vcf_idx - File sample_batch_assignments - File batches_list - File sample_pop_assignments - File excludesamples_list #empty file if need be - File famfile - File contiglist - File? par_bed - Int variants_per_shard - Int? onevsall_cutoff=2 - String prefix - File? af_pcrmins_premingq - String sv_pipeline_docker - - RuntimeAttr? 
runtime_attr_merge_labeled_vcfs - } - Array[String] batches = read_lines(batches_list) - Array[Array[String]] contigs = read_tsv(contiglist) - - # Shard VCF per batch, compute pops-specific AFs, and convert to table of VID & AF stats - scatter ( batch in batches ) { - # Get list of samples to include & exclude per batch - call GetBatchSamplesList { - input: - vcf=vcf, - vcf_idx=vcf_idx, - batch=batch, - sample_batch_assignments=sample_batch_assignments, - probands_list=excludesamples_list, - sv_pipeline_docker=sv_pipeline_docker - } - # Prune VCF to samples - call calcAF.prune_and_add_vafs as getAFs { - input: - vcf=vcf, - vcf_idx=vcf_idx, - prefix=batch, - sample_pop_assignments=sample_pop_assignments, - prune_list=GetBatchSamplesList.exclude_samples_list, - famfile=famfile, - sv_per_shard=25000, - contiglist=contiglist, - drop_empty_records="FALSE", - par_bed=par_bed, - sv_pipeline_docker=sv_pipeline_docker - } - # Get minimal table of AF data per batch, split by ancestry - call GetFreqTable { - input: - vcf=getAFs.output_vcf, - sample_pop_assignments=sample_pop_assignments, - prefix=batch, - sv_pipeline_docker=sv_pipeline_docker - } - } - - # Merge frequency results per batch into a single table of all variants with AF data across batches - call MergeFreqTables { - input: - tables=GetFreqTable.freq_data, - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - call MergeFreqTables as MergeFreqTables_allPops { - input: - tables=GetFreqTable.freq_data_allPops, - batches_list=batches_list, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - - # Compare frequencies before and after minGQ, and generate list of variants - # that are significantly different between the steps - if (defined(af_pcrmins_premingq)) { - call CompareFreqsPrePostMinGQPcrminus { - input: - af_pcrmins_premingq=select_first([af_pcrmins_premingq]), - AF_postMinGQ_table=MergeFreqTables_allPops.merged_table, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - } - - # Generate matrix of correlation coefficients for all batches, by population & SVTYPE - #scatter ( pop in populations ) { - # call MakeCorrelationMatrices { - # input: - # freq_table=MergeFreqTables.merged_table, - # pop=pop, - # batches_list=batches_list, - # prefix=prefix, - # sv_pipeline_docker=sv_pipeline_docker - # } - #} - - # Perform one-vs-all comparison of AFs per batch to find batch-specific sites - scatter ( batch in batches ) { - call helper.check_batch_effects as one_vs_all_comparison { - input: - freq_table=MergeFreqTables.merged_table, - batch1=batch, - batch2="ALL_OTHERS", - prefix=prefix, - variants_per_shard=variants_per_shard, - sv_pipeline_docker=sv_pipeline_docker - } - } - # Collect results from pairwise batch effect detection - call MergeVariantFailureLists as merge_one_vs_all_checks { - input: - fail_variant_lists=one_vs_all_comparison.batch_effect_variants, - prefix="~{prefix}.one_vs_all_comparisons", - sv_pipeline_docker=sv_pipeline_docker - } - - # Distill final table of variants to be reclassified - call MakeReclassificationTable { - input: - freq_table=MergeFreqTables.merged_table, - onevsall_fails=merge_one_vs_all_checks.fails_per_variant_all, - prefix=prefix, - onevsall_cutoff = onevsall_cutoff, - sv_pipeline_docker=sv_pipeline_docker - } - - # Apply batch effect labels - scatter ( contig in contigs ) { - call ApplyBatchEffectLabels as apply_labels_perContig { - input: - vcf=vcf, - vcf_idx=vcf_idx, - contig=contig[0], - 
reclassification_table=MakeReclassificationTable.reclassification_table, - mingq_prePost_pcrminus_fails=CompareFreqsPrePostMinGQPcrminus.pcrminus_fails, - prefix="~{prefix}.~{contig[0]}", - sv_pipeline_docker=sv_pipeline_docker - } - } - call MiniTasks.ConcatVcfs as merge_labeled_vcfs { - input: - vcfs=apply_labels_perContig.labeled_vcf, - naive=true, - outfile_prefix="~{prefix}.batch_effects_labeled_merged", - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_merge_labeled_vcfs - } - - output { - File labeled_vcf = merge_labeled_vcfs.concat_vcf - File labeled_vcf_idx = merge_labeled_vcfs.concat_vcf_idx - } -} - - -# Get list of samples to include & exclude per batch -# Always exclude probands from all batches -task GetBatchSamplesList { - input{ - File vcf - File vcf_idx - String batch - File sample_batch_assignments - File probands_list - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - # Get list of all samples present in VCF header - tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' | sort -Vk1,1 \ - > all_samples.list - # Get list of samples in batch - fgrep -w ~{batch} ~{sample_batch_assignments} | cut -f1 \ - | fgrep -wf - all_samples.list \ - | fgrep -wvf ~{probands_list} \ - > "~{batch}.samples.list" || true - # Get list of samples not in batch - fgrep -wv ~{batch} ~{sample_batch_assignments} | cut -f1 \ - | cat - ~{probands_list} | sort -Vk1,1 | uniq \ - | fgrep -wf - all_samples.list \ - > "~{batch}.exclude_samples.list" || true - >>> - - output { - File include_samples_list = "~{batch}.samples.list" - File exclude_samples_list = "~{batch}.exclude_samples.list" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Run vcf2bed and subset to just include VID, SVTYPE, SVLEN, _AC, and _AN -task GetFreqTable { - input{ - File vcf - File sample_pop_assignments - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 6, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - command <<< - set -euo pipefail - #Run vcf2bed - svtk vcf2bed \ - --info ALL \ - --no-samples \ - ~{vcf} "~{prefix}.vcf2bed.bed" - ### Create table of freqs by ancestry - #Cut to necessary columns - idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ print $1, NR }' \ - | grep -e 'name\|SVLEN\|SVTYPE\|_AC\|_AN\|_CN_NONREF_COUNT\|_CN_NUMBER' \ - | fgrep -v "OTH" \ - | cut -f2 \ - | paste -s -d\, || true ) - cut -f"$idxs" "~{prefix}.vcf2bed.bed" \ - | sed 's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.preclean.txt.gz" - #Clean frequencies (note that this script automatically gzips the output file taken as the last argument) - /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/clean_frequencies_table.R \ - "~{prefix}.frequencies.preclean.txt.gz" \ - "~{sample_pop_assignments}" \ - "~{prefix}.frequencies.txt" - ### Create table of freqs, irrespective of ancestry - #Cut to necessary columns - idxs=$( sed -n '1p' "~{prefix}.vcf2bed.bed" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="name" || $1=="SVLEN" || $1=="SVTYPE" || $1=="AC" || $1=="AN" || $1=="CN_NUMBER" || $1=="CN_NONREF_COUNT") print NR }' \ - | paste -s -d\, || true ) - cut -f"$idxs" "~{prefix}.vcf2bed.bed" > minfreq.subset.bed - svtype_idx=$( sed -n '1p' minfreq.subset.bed \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ if ($1=="SVTYPE") print NR }' || true ) - awk -v OFS="\t" -v sidx="$svtype_idx" '{ if ($sidx=="CNV" || $sidx=="MCNV") print $1, $2, $3, $6, $7; else print $1, $2, $3, $4, $5 }' minfreq.subset.bed \ - | sed 's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.allPops.txt.gz" - >>> - - output { - File freq_data = "~{prefix}.frequencies.txt.gz" - File freq_data_allPops = "~{prefix}.frequencies.allPops.txt.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Combine frequency data across batches -task MergeFreqTables { - input{ - Array[File] tables - File batches_list - String prefix - String sv_pipeline_docker - RuntimeAttr? 
-
-
-# Combine frequency data across batches
-task MergeFreqTables {
-  input{
-    Array[File] tables
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 16,
-    disk_gb: 100,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-
-    #Get list of batch IDs and batch table paths
-    while read batch; do
-      echo "$batch"
-      find ./ -name "$batch.frequencies*txt.gz" | sed -n '1p'
-    done < ~{batches_list} | paste - - \
-      > input.list
-
-    #Make sure all input files have the same number of lines
-    while read batch file; do
-      zcat "$file" | wc -l
-    done < input.list > nlines.list
-    nlines=$( sort nlines.list | uniq | wc -l )
-    if [ "$nlines" -gt 1 ]; then
-      echo "AT LEAST ONE INPUT FILE HAS A DIFFERENT NUMBER OF LINES"
-      exit 0
-    fi
-
-    #Prep files for paste joining
-    echo "PREPPING FILES FOR MERGING"
-    while read batch file; do
-      #Header
-      zcat "$file" | sed -n '1p' | cut -f1-3
-      #Body
-      zcat "$file" | sed '1d' \
-        | sort -Vk1,1 \
-        | cut -f1-3
-    done < <( sed -n '1p' input.list ) \
-      > header.txt
-    while read batch file; do
-      for wrapper in 1; do
-        #Header
-        zcat "$file" | sed -n '1p' \
-          | cut -f4- | sed 's/\t/\n/g' \
-          | awk -v batch="$batch" '{ print $1"."batch }' \
-          | paste -s
-        #Body
-        zcat "$file" | sed '1d' \
-          | sort -Vk1,1 \
-          | cut -f4-
-      done > "$batch.prepped.txt"
-    done < input.list
-
-    #Join files with simple paste
-    paste \
-      header.txt \
-      $( awk -v ORS=" " '{ print $1".prepped.txt" }' input.list ) \
-      | gzip -c \
-      > "~{prefix}.merged_AF_table.txt.gz"
-  >>>
-
-  output {
-    File merged_table = "~{prefix}.merged_AF_table.txt.gz"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
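
One quirk worth flagging in MergeFreqTables as quoted above: the line-count consistency check echoes a warning but then calls `exit 0`, so a mismatch is still reported as success and the paste-join proceeds on misaligned tables. If fail-fast behavior is the intent, the guard would presumably exit nonzero. A hedged sketch of that stricter variant, as a hypothetical standalone task rather than pipeline code:

version 1.0

task CheckTablesSameLength {
  input {
    Array[File] tables   # hypothetical: gzipped per-batch AF tables
  }
  command <<<
    set -euo pipefail
    # write_lines() materializes the array as a file of localized paths.
    while read -r file; do
      zcat "$file" | wc -l
    done < ~{write_lines(tables)} > nlines.list
    # Exit 1 (not 0) so the workflow engine marks the task as failed.
    if [ "$( sort -u nlines.list | wc -l )" -gt 1 ]; then
      echo "ERROR: input tables differ in line count" >&2
      exit 1
    fi
  >>>
  output {
    Boolean ok = true
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
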
-
-
-# Compare
-task CompareFreqsPrePostMinGQPcrminus {
-  input{
-    File af_pcrmins_premingq
-    File AF_postMinGQ_table
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 30,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/compare_freqs_pre_post_minGQ.PCRMinus_only.R \
-      ~{af_pcrmins_premingq} \
-      ~{AF_postMinGQ_table} \
-      ./ \
-      "~{prefix}."
-  >>>
-
-  output {
-    File pcrminus_fails = "~{prefix}.PCRMINUS_minGQ_AF_prePost_fails.VIDs.list"
-    File minGQ_prePost_comparison_data = "~{prefix}.minGQ_AF_prePost_comparison.data.txt.gz"
-    File minGQ_prePost_comparison_plot = "~{prefix}.minGQ_AF_prePost_comparison.plot.png"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
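
The workflow feeds CompareFreqsPrePostMinGQPcrminus.pcrminus_fails into ApplyBatchEffectLabels, whose matching input is declared `File? mingq_prePost_pcrminus_fails`: in WDL, a call placed inside an `if (defined(...))` block runs only when the guarded input is supplied, and its outputs are seen downstream as optional. A minimal sketch of that gating pattern, with hypothetical names:

version 1.0

workflow GateOnOptionalInput {
  input {
    File? maybe_table   # hypothetical optional input
  }
  if (defined(maybe_table)) {
    call Consume {
      input:
        table = select_first([maybe_table])  # coerce File? -> File inside the guard
    }
  }
  output {
    File? result = Consume.out  # File? because the call may not have run
  }
}

task Consume {
  input {
    File table
  }
  command <<<
    wc -l ~{table} > out.txt
  >>>
  output {
    File out = "out.txt"
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
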
-
-
-# Calculate & plot cross-batch correlation coefficient matrixes
-task MakeCorrelationMatrices {
-  input{
-    File freq_table
-    String pop
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/correlate_batches_singlePop.R \
-      ~{batches_list} \
-      ~{freq_table} \
-      "~{pop}" \
-      "~{prefix}.~{pop}"
-  >>>
-  output {
-    Array[File] corr_matrixes = glob("~{prefix}.~{pop}.*.R2_matrix.txt")
-    Array[File] heat_maps = glob("~{prefix}.~{pop}.*heatmap*.pdf")
-    Array[File] dot_plots = glob("~{prefix}.~{pop}.*perBatch_R2_sina_plot.pdf")
-  }
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Generate list of all pairs of batches to be compared
-task MakeBatchPairsList {
-  input{
-    File batches_list
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_pairs_list.R \
-      ~{batches_list} \
-      "~{prefix}.nonredundant_batch_pairs.txt"
-  >>>
-
-  output {
-    File batch_pairs_list = "~{prefix}.nonredundant_batch_pairs.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Merge lists of batch effect checks and count total number of times each variant failed
-task MergeVariantFailureLists {
-  input{
-    Array[File] fail_variant_lists
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    #Write list of paths to all batch effect variant lists
-    #Get master list of PCR+ to PCR+ failures #removed from the PCR- only projects
-    #Get master list of PCR- to PCR- failures #removed from the PCR- only projects
-    #Get master list of PCR+ to PCR- failures #removed from the PCR- only projects
-    #Get master list of all possible failures
-    cat ~{write_lines(fail_variant_lists)} \
-      | xargs -I {} cat {} \
-      | sort -Vk1,1 | uniq -c \
-      | awk -v OFS="\t" '{ print $2, $1 }' \
-      > "~{prefix}.all.failures.txt" || true
-  >>>
-
-  output {
-    File fails_per_variant_all = "~{prefix}.all.failures.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
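
The counting idiom in MergeVariantFailureLists — concatenate every per-check VID list, then `sort | uniq -c | awk` — yields one row per variant with the number of checks it failed. A toy, self-contained illustration of what that pipeline produces (demo data only, not pipeline code):

version 1.0

task CountFailuresDemo {
  command <<<
    set -euo pipefail
    # Two hypothetical per-check failure lists; SV_2 fails both checks.
    printf 'SV_1\nSV_2\n' > checkA.list
    printf 'SV_2\nSV_3\n' > checkB.list
    # uniq -c prints "<count> <VID>"; awk swaps to "<VID><TAB><count>".
    cat checkA.list checkB.list \
      | sort -Vk1,1 | uniq -c \
      | awk -v OFS="\t" '{ print $2, $1 }' \
      > counts.txt
    # counts.txt now reads:
    #   SV_1  1
    #   SV_2  2
    #   SV_3  1
  >>>
  output {
    File counts = "counts.txt"
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
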
-
-
-# Consolidate all batch effect check results into a single table with reclassification per variant
-task MakeReclassificationTable {
-  input{
-    File freq_table
-    File onevsall_fails
-    String prefix
-    Int? onevsall_cutoff
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 8,
-    disk_gb: 10,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R \
-      ~{freq_table} \
-      ~{onevsall_fails} \
-      "~{prefix}.batch_effect_reclassification_table.txt" \
-      ~{onevsall_cutoff}
-  >>>
-
-  output {
-    File reclassification_table = "~{prefix}.batch_effect_reclassification_table.txt"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-
-
-# Apply batch effect labels to VCF
-task ApplyBatchEffectLabels {
-  input{
-    File vcf
-    File vcf_idx
-    String contig
-    File reclassification_table
-    File? mingq_prePost_pcrminus_fails
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  command <<<
-    set -euo pipefail
-    tabix -h ~{vcf} ~{contig} \
-      | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \
-        ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
-        stdin \
-        ~{reclassification_table} \
-        stdout \
-      | bgzip -c \
-      > "~{prefix}.batch_effects_labeled.vcf.gz"
-    tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz"
-  >>>
-
-  output {
-    File labeled_vcf = "~{prefix}.batch_effects_labeled.vcf.gz"
-    File labeled_vcf_idx = "~{prefix}.batch_effects_labeled.vcf.gz.tbi"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
\ No newline at end of file
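
In ApplyBatchEffectLabels, the placeholder `~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails}` is the standard WDL idiom for an optional flag: when the optional file is defined, the expression renders the flag followed by the localized path; when it is undefined, the whole placeholder evaluates to the empty string and the flag drops out of the command line. A hypothetical standalone demonstration:

version 1.0

task OptionalFlagDemo {
  input {
    File? maybe_list   # hypothetical optional input
  }
  command <<<
    # Prints "my_script --unstable-af-pcrminus /path/to/list" when maybe_list
    # is supplied, and just "my_script" when it is omitted.
    echo my_script ~{"--unstable-af-pcrminus " + maybe_list}
  >>>
  output {
    String rendered = read_string(stdout())
  }
  runtime {
    docker: "ubuntu:20.04"  # illustrative image only
  }
}
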
From baba8aaccbc512a8e164de40ca9de190a3ca8c87 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 11:57:02 -0500
Subject: [PATCH 08/11] Update make_batch_effect_reclassification_table.PCRMinus_only.R

# Just use onevsall.fails directly.
merged <- onevsall.fails
---
 ...ect_reclassification_table.PCRMinus_only.R | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
index 1f233d25d..c9687d62d 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
@@ -117,19 +117,23 @@ onevsall.fails <- onevsall.fails[which(onevsall.fails$fails_onevsall>=onevsall.c
 
 ###Combine data
-merged <- merge(onevsall.fails,all=T,sort=F,by="VID")
-if(nrow(merged) > 0){
-  merged[,-1] <- apply(merged[,-1],2,function(vals){
+#merged <- merge(onevsall.fails,all=T,sort=F,by="VID")
+merged <- onevsall.fails
+# If merged data frame is not empty, replace NA values with 0
+if (nrow(merged) > 0) {
+  merged[,-1] <- apply(merged[,-1], 2, function(vals) {
     vals[which(is.na(vals))] <- 0
     return(vals)
   })
 }
-merged <- merge(merged,freq.dat,by="VID",sort=F,all=F)
+# Merge with frequency data
+merged <- merge(merged, freq.dat, by = "VID", sort = F, all = F)
 
-##Categorize batch effect failure sites
-out.table <- analyze.failures(dat=merged,
-                              onevsall.cutoff=onevsall.cutoff)
-write.table(out.table,OUTFILE,col.names=F,row.names=F,sep="\t",quote=F)
+## Categorize batch effect failure sites
+# Assuming the categorize.failures function only requires onevsall.cutoff
+# and is compatible with the new merged data structure
+out.table <- categorize.failures(dat = merged, onevsall.cutoff = onevsall.cutoff)
-
\ No newline at end of file
+# Write the output table
+write.table(out.table, OUTFILE, col.names = F, row.names = F, sep = "\t", quote = F)

From b253052c97b698bd3caba889c24fce2c7bff8abe Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 12:15:32 -0500
Subject: [PATCH 09/11] Update Module07XfBatchEffect.wdl

adjust tab spaces
---
 wdl/Module07XfBatchEffect.wdl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index a0f1ffd66..161f01a51 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -588,12 +588,12 @@ task ApplyBatchEffectLabels {
     set -euo pipefail
     tabix -h ~{vcf} ~{contig} \
     | /opt/sv-pipeline/scripts/downstream_analysis_and_filtering/label_batch_effects.PCRMinus_only.py \
-    ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
-    stdin \
-    ~{reclassification_table} \
-    stdout \
-    | bgzip -c \
-    > "~{prefix}.batch_effects_labeled.vcf.gz"
+      ~{"--unstable-af-pcrminus " + mingq_prePost_pcrminus_fails} \
+      stdin \
+      ~{reclassification_table} \
+      stdout \
+      | bgzip -c \
+      > "~{prefix}.batch_effects_labeled.vcf.gz"
     tabix -p vcf -f "~{prefix}.batch_effects_labeled.vcf.gz"
   >>>
@@ -611,4 +611,4 @@
     preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
     maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
   }
-}
\ No newline at end of file
+}

From 54c421e4f48dcf0db088a11fb5ab364d47f4763c Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Wed, 29 Nov 2023 12:19:36 -0500
Subject: [PATCH 10/11] Update Module07XfBatchEffect.wdl
---
 wdl/Module07XfBatchEffect.wdl | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/wdl/Module07XfBatchEffect.wdl b/wdl/Module07XfBatchEffect.wdl
index 161f01a51..d2b737a5e 100644
--- a/wdl/Module07XfBatchEffect.wdl
+++ b/wdl/Module07XfBatchEffect.wdl
@@ -576,13 +576,14 @@ task ApplyBatchEffectLabels {
     RuntimeAttr? runtime_attr_override
   }
   RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 4,
-    disk_gb: 50,
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
+  cpu_cores: 1,
+  mem_gb: 4,
+  disk_gb: 50,
+  boot_disk_gb: 10,
+  preemptible_tries: 3,
+  max_retries: 1
+  }
+
 RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
 command <<<
 set -euo pipefail

From 30647d36527a20bd4cbb5a133537c6354c8f2036 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri <74751641+shadizaheri@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:52:22 -0500
Subject: [PATCH 11/11] Update make_batch_effect_reclassification_table.PCRMinus_only.R

analyze.failures ----> categorize.failures
---
 .../make_batch_effect_reclassification_table.PCRMinus_only.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
index c9687d62d..450c6c0a4 100755
--- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
+++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/make_batch_effect_reclassification_table.PCRMinus_only.R
@@ -54,7 +54,7 @@ import.fails <- function(minus.in,prefix){
 }
 
 #Categorize failing sites
-analyze.failures <- function(dat,onevsall.cutoff){
+categorize.failures <- function(dat,onevsall.cutoff){
   dat$max_plus_frac <- apply(data.frame(dat$frac_plus_fails_onevsall),
                              1,max,na.rm=T)
   onevsall.fail.idx <- which(dat$fails_onevsall>=onevsall.cutoff)