Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend functionality for outlier sample exclusion workflow #496

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d",
"gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f",
"str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,32 @@
# in final_outlier_sample_filter.wdl


###Set master parameters & read arguments
options(stringsAsFactors=F,scipen=1000)
### Set parameters & read arguments
options(stringsAsFactors=F, scipen=1000)
args <- commandArgs(trailingOnly=TRUE)
INFILE <- args[1]
OUTFILE <- args[2]

###Read input data & reformat
dat <- read.table(INFILE,header=F)
colnames(dat) <- c("sample","svtype","count","chrom")
### Read input data & reformat
dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="")
drop.rows <- which(dat[, 1] %in% c("sample", "#sample"))
if(length(drop.rows) > 0){
dat <- dat[-drop.rows, ]
}
colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)]
dat$count <- as.numeric(dat$count)
samples <- as.character(unique(dat$sample))
svtypes <- as.character(unique(dat$svtype))

###Get sum of counts per sample per svtype
summed.res <- do.call("rbind", lapply(samples,function(sample){
return(do.call("rbind", lapply(svtypes,function(svtype){
return(data.frame("sample"=sample,
### Get sum of counts per sample per svtype
summed.res <- do.call("rbind", lapply(samples, function(sample){
do.call("rbind", lapply(svtypes, function(svtype){
data.frame("sample"=sample,
"svtype"=svtype,
"count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype),]$count,na.rm=T)))
})))
"count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,
na.rm=T))
}))
}))

###Write summed results to outfile
write.table(summed.res,OUTFILE,col.names=T,row.names=F,sep="\t",quote=F)
### Write summed results to outfile
write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F)
56 changes: 42 additions & 14 deletions wdl/FilterOutlierSamples.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,81 @@ import "Structs.wdl"
import "Utils.wdl" as util
import "IdentifyOutlierSamples.wdl" as identify_outliers

# Filter outlier samples by IQR or cutoff table for a single VCF. Recommended to run PlotSVCountsPerSample first to choose cutoff
# Filter outlier samples by IQR or cutoff table for one or more VCFs
# Recommended to run PlotSVCountsPerSample first to choose cutoff
workflow FilterOutlierSamples {
input {
String name # batch or cohort
File vcf
Array[File] vcfs
File? sv_counts # SV counts file from PlotSVCountsPerSample - if not provided, will create
Int N_IQR_cutoff
File? outlier_cutoff_table
String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes
String? bcftools_preprocessing_options
Boolean plot_counts = false
Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset
Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset
Int samples_per_shard = 5000
String sv_pipeline_docker
String sv_base_mini_docker
String linux_docker
RuntimeAttr? runtime_override_preprocess_vcf
RuntimeAttr? runtime_attr_identify_outliers
RuntimeAttr? runtime_attr_subset_vcf
RuntimeAttr? runtime_attr_cat_outliers
RuntimeAttr? runtime_attr_subset_counts
RuntimeAttr? runtime_attr_filter_samples
RuntimeAttr? runtime_attr_ids_from_vcf
RuntimeAttr? runtime_attr_count_svs
RuntimeAttr? runtime_attr_combine_counts
RuntimeAttr? runtime_attr_plot_svcounts
}

if (defined(sample_subset_prefixes) && defined(sample_subset_lists)) {
Array[Pair[String, File]]? sample_subsets = zip(select_first([sample_subset_prefixes]),
select_first([sample_subset_lists]))
}

call identify_outliers.IdentifyOutlierSamples {
input:
vcf = vcf,
vcfs = vcfs,
name = name,
sv_counts = sv_counts,
N_IQR_cutoff = N_IQR_cutoff,
outlier_cutoff_table = outlier_cutoff_table,
vcf_identifier = vcf_identifier,
bcftools_preprocessing_options = bcftools_preprocessing_options,
plot_counts = plot_counts,
sample_subsets = sample_subsets,
samples_per_shard = samples_per_shard,
sv_pipeline_docker = sv_pipeline_docker,
sv_base_mini_docker = sv_base_mini_docker,
linux_docker = linux_docker,
runtime_attr_ids_from_vcf = runtime_attr_ids_from_vcf,
runtime_override_preprocess_vcf = runtime_override_preprocess_vcf,
runtime_attr_identify_outliers = runtime_attr_identify_outliers,
runtime_attr_cat_outliers = runtime_attr_cat_outliers,
runtime_attr_count_svs = runtime_attr_count_svs
runtime_attr_subset_counts = runtime_attr_subset_counts,
runtime_attr_count_svs = runtime_attr_count_svs,
runtime_attr_combine_counts = runtime_attr_combine_counts,
runtime_attr_plot_svcounts = runtime_attr_plot_svcounts
}

call util.SubsetVcfBySamplesList {
input:
vcf = vcf,
list_of_samples = IdentifyOutlierSamples.outlier_samples_file,
outfile_name = "${name}.outliers_removed.vcf.gz",
remove_samples = true,
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_subset_vcf
scatter ( vcf in vcfs ) {
call util.SubsetVcfBySamplesList {
input:
vcf = vcf,
list_of_samples = IdentifyOutlierSamples.outlier_samples_file,
outfile_name = basename(vcf, ".vcf.gz") + ".${name}.outliers_removed.vcf.gz",
remove_samples = true,
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_subset_vcf
}
}

call util.GetSampleIdsFromVcf {
input:
vcf = vcf,
vcf = vcfs[0],
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_ids_from_vcf
}
Expand All @@ -67,7 +94,8 @@ workflow FilterOutlierSamples {
}

output {
File outlier_filtered_vcf = SubsetVcfBySamplesList.vcf_subset
Array[File] outlier_filtered_vcfs = SubsetVcfBySamplesList.vcf_subset
Array[File] outlier_filtered_vcf_idxs = SubsetVcfBySamplesList.vcf_subset_index
Array[String] filtered_samples_list = FilterSampleList.filtered_samples_list
File filtered_samples_file = FilterSampleList.filtered_samples_file
Array[String] outlier_samples_excluded = IdentifyOutlierSamples.outlier_samples_list
Expand Down
Loading