Skip to content

Commit

Permalink
finish pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
r1cheu committed Mar 22, 2024
1 parent 24ae24b commit b5f2b5f
Show file tree
Hide file tree
Showing 11 changed files with 178 additions and 40 deletions.
6 changes: 3 additions & 3 deletions cluster/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ local-cores: 1
default-resources:
slurm_partition: "normal"
slurm_account: "rlchen"
mem_mb: 8000
nodes: 1
ntasks: 1
mem_mb: 2000
nodes: 1
cpus_per_task: 1
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ samples: config/samples.tsv
units: config/units.tsv

sif_path: /public/home/rlchen/ProJect/snp_gatk/snpcalling.sif

2 changes: 1 addition & 1 deletion config/samples.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sample
R-246
test
2 changes: 1 addition & 1 deletion config/units.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sample platform fq1 fq2
R-246 ILLUMINA data/reads/R-246.1.fastq.gz data/reads/R-246.2.fastq.gz
test ILLUMINA data/reads/test.1.fastq.gz data/reads/test.2.fastq.gz
29 changes: 25 additions & 4 deletions snp_call.def
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ chmod a+x ${GATK_HOME}/gatk

###############################################
#HTSLIB 1.9
HTSLIB_VERSION=1.9
HTSLIB_VERSION=1.19.1
HTSLIB_HOME=${APPS_ROOT}/htslib/${HTSLIB_VERSION}

MANPATH=$MANPATH:${HTSLIB_HOME}/share/man
Expand All @@ -106,10 +106,27 @@ wget https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/htsl
&& make \
&& make install

###############################################
#bcftools 1.9
BCFTOOLS_VERSION=1.19
BCFTOOLS_HOME=${APPS_ROOT}/bcftools/${BCFTOOLS_VERSION}


wget https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 \
&& tar xjf bcftools-${HTSLIB_VERSION}.tar.bz2 \
&& rm bcftools-${HTSLIB_VERSION}.tar.bz2 \
&& cd bcftools-${HTSLIB_VERSION} \
&& autoheader \
&& autoconf \
&& ./configure --prefix=${BCFTOOLS_HOME} --with-htslib=${HTSLIB_HOME} \
&& make \
&& make install


###############################################
#SAMTOOLS = 'samtools/intel/1.9'

SAMTOOLS_VERSION=1.9
SAMTOOLS_VERSION=1.19.2
SAMTOOLS_HOME=${APPS_ROOT}/samtools/${SAMTOOLS_VERSION}

MANPATH=${SAMTOOLS_HOME}/share/man
Expand Down Expand Up @@ -150,7 +167,7 @@ export GATK_LOCAL_JAR=${GATK_HOME}/gatk-package-${GATK_VERSION}-local.jar
export GATK_SPARK_JAR=${GATK_HOME}/gatk-package-${GATK_VERSION}-spark.jar
export GATK_JAR=${GATK_HOME}/gatk-package-${GATK_VERSION}-local.jar
export PATH=${GATK_HOME}:${PATH}
export HTSLIB_VERSION=1.9
export HTSLIB_VERSION=1.19.1
export HTSLIB_HOME=${APPS_ROOT}/htslib/${HTSLIB_VERSION}
export MANPATH=$MANPATH:${HTSLIB_HOME}/share/man
export PATH=${PATH}:${HTSLIB_HOME}/bin
Expand All @@ -159,14 +176,18 @@ export PKG_CONFIG_PATH=${HTSLIB_HOME}/lib/pkgconfig
export HTSLIB_HOME=${HTSLIB_HOME}
export HTSLIB_INC=${HTSLIB_HOME}/include
export HTSLIB_LIB=${HTSLIB_HOME}/lib
export SAMTOOLS_VERSION=1.9
export SAMTOOLS_VERSION=1.19.2
export SAMTOOLS_HOME=${APPS_ROOT}/samtools/${SAMTOOLS_VERSION}
export MANPATH=${SAMTOOLS_HOME}/share/man
export PATH=${SAMTOOLS_HOME}/bin:${PATH}
export LD_LIBRARY_PATH=${SAMTOOLS_HOME}/lib:${LD_LIBRARY_PATH}
export SAMTOOLS_HOME=${SAMTOOLS_HOME}
export SAMTOOLS_INC=${SAMTOOLS_HOME}/include
export SAMTOOLS_LIB=${SAMTOOLS_HOME}/lib
export BCFTOOLS_VERSION=1.19
export BCFTOOLS_HOME=${APPS_ROOT}/bcftools/${BCFTOOLS_VERSION}
export PATH=${BCFTOOLS_HOME}/bin:${PATH}

%runscript
exec /bin/bash "$@"
%startscript
Expand Down
7 changes: 5 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ include: "rules/common.smk"

singularity: config["sif_path"]

CHROMOSOMES = [f"chr{i:02d}" for i in range(1, 13)]

rule all:
input:
"results/called/joint.vcf.gz",

expand("results/vcf/all.{chrom}.vcf.gz", chrom=CHROMOSOMES),
expand("results/all.snp.final.vcf.gz")
##### Modules #####

include: "rules/ref.smk"
include: "rules/mapping.smk"
include: "rules/calling.smk"
include: "rules/filter.smk"
44 changes: 29 additions & 15 deletions workflow/rules/calling.smk
Original file line number Diff line number Diff line change
@@ -1,39 +1,53 @@
rule call_variants:
input:
bam=get_sample_bams,
bam_idx=get_sample_bams_idx,
ref="resources/IRGSP-1.0_genome.fasta",
fai="resources/IRGSP-1.0_genome.fasta.fai",
idx="resources/IRGSP-1.0_genome.dict",
output:
gvcf=protected("results/called/{sample}.g.vcf.gz"),
gvcf=protected("results/called/gvcfs/{sample}.g.vcf.gz"),
log:
"logs/gatk/haplotypecaller/{sample}.log",
run:
benchmark:
"logs/gatk/haplotypecaller/{sample}.benchmark",
shell:
"gatk HaplotypeCaller -R {input.ref} -I {input.bam} -O {output.gvcf} "
"-ERC GVCF"
"-ERC GVCF > {log} 2>&1"


rule genomics_db_import:
input:
gvcfs=expand("results/called/{sample}.g.vcf.gz", sample=samples.index),
gvcfs=expand("results/called/gvcfs/{sample}.g.vcf.gz", sample=samples.index),
output:
db="results/called/genomics_db",
db=directory("results/called/genome_db/{chrom}_db"),
log:
"logs/gatk/genomics_db_import.log",
threads: 5,
run:
"gatk GenomicsDBImport --genomicsdb-workspace-path {output.db} "
"--batch-size 100 --reader-threads {threads} {input.gvcfs}"
"logs/gatk/genomics_db/genomics_db_import_{chrom}.log",
benchmark:
"logs/gatk/genomics_db/genomics_db_import_{chrom}.benchmark",
resources:
cpus_per_task=4,
shell:
"""
gatk GenomicsDBImport --genomicsdb-workspace-path {output.db} \
--batch-size 100 --reader-threads {resources.cpus_per_task} \
-V {input.gvcfs} \
-L {wildcards.chrom} > {log} 2>&1
"""


rule joint_calling:
input:
db="results/called/genomics_db",
db="results/called/genome_db/{chrom}_db",
ref="resources/IRGSP-1.0_genome.fasta",
idx="resources/IRGSP-1.0_genome.dict",
output:
vcf="results/called/joint.vcf.gz",
vcf="results/vcf/all.{chrom}.vcf.gz",
log:
"logs/gatk/joint_calling.log",
run:
"logs/gatk/joint_calling/{chrom}_joint_calling.log",
benchmark:
"logs/gatk/joint_calling/{chrom}_joint_calling.benchmark",
shell:
"gatk GenotypeGVCFs -R {input.ref} -V gendb://{input.db} "
"-O {output.vcf} 2> {log}"
"-O {output.vcf} 2> {log} 2>&1"

13 changes: 13 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,16 @@ def get_sample_bams(wildcards):
"results/dedup/{sample}.sorted.bam",
sample=wildcards.sample,
)

def get_sample_bams_idx(wildcards):
"""Get all aligned reads of given sample."""
return expand(
"results/dedup/{sample}.sorted.bam.bai",
sample=wildcards.sample,
)
def get_vcfs(wildcards):
"""Get all VCF files of given sample."""
return expand(
"results/vcf/{sample}.vcf.gz",
sample=wildcards.sample,
)
63 changes: 63 additions & 0 deletions workflow/rules/filter.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
rule get_merge_list:
input:
expand("results/vcf/all.{chrom}.vcf.gz", chrom=CHROMOSOMES),
output:
"results/vcf/merge_list.txt",
run:
with open(output[0], "w") as f:
for inp in input:
f.write(inp + '\n')


rule merge_vcfs:
input:
"results/vcf/merge_list.txt"
output:
"results/all.vcf.gz"
log:
"logs/gatk/merge_vcfs.log"
benchmark:
"logs/gatk/merge_vcfs.benchmark"
shell:
"gatk MergeVcfs -I {input} -O {output} > {log} 2>&1"

rule select_snp:
input:
"results/all.vcf.gz"
output:
"results/all.snp.vcf.gz"
log:
"logs/gatk/select_snp.log"
benchmark:
"logs/gatk/select_snp.benchmark"
shell:
"gatk SelectVariants -V {input} -select-type SNP -O {output} > {log} 2>&1"


rule filter_snp:
input:
vcf="results/all.snp.vcf.gz",
ref="resources/IRGSP-1.0_genome.fasta"
output:
"results/all.snp.filter.vcf.gz"
log:
"logs/gatk/filter_snp.log"
benchmark:
"logs/gatk/filter_snp.benchmark"
shell:
"gatk VariantFiltration -R {input.ref} --variant {input.vcf} "
"--cluster-size 3 --cluster-window-size 10 --filter-expression "
"'QD<10.00' --filter-name lowQD --filter-expression 'FS>15.000' "
"--filter-name highFS --genotype-filter-expression 'DP>200||DP<5' "
"--genotype-filter-name InvalidDP --output {output} > {log} 2>&1"


rule final_vcf:
input:
"results/all.snp.filter.vcf.gz"
output:
"results/all.snp.final.vcf.gz"
log:
"logs/gatk/final_vcf.log"
shell:
"bcftools view -f PASS {input} -Oz -o {output}"
45 changes: 34 additions & 11 deletions workflow/rules/mapping.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os.path as osp


rule fastp_trim_reads_pe:
input:
unpack(get_fastq)
Expand All @@ -8,33 +9,46 @@ rule fastp_trim_reads_pe:
r2=temp("results/trimmed/{sample}.2.fastq.gz")
log:
html="logs/fastp/{sample}.html", json="logs/fastp/{sample}.json"
benchmark:
"logs/fastp/{sample}.bench"
resources:
ntasks=4
mem_mb=
cpus_per_task=4
shell:
"fastp -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} "
"-h {log.html} -j {log.json} -w {resources.ntasks}"
"-h {log.html} -j {log.json} -w {resources.cpus_per_task}"


rule map_reads:
input:
reads=get_trimmed_reads,
idx=rules.bwa_index.output
output:
temp("results/mapped/{sample}.sorted.bam"),
temp("results/mapped/{sample}.bam"),
log:
"logs/bwa/{sample}.log"
benchmark:
"logs/bwa/{sample}.bench"
params:
index=lambda w, input: os.path.splitext(input.idx[0])[0],
extra=get_read_group,
sort_order="coordinate",
resources:
ntasks=4
mem_mb=
cpus_per_task=4
shell:
"bwa mem -t {resources.cpus_per_task} -M {params.extra} {params.index} {input.reads} | "
"samtools view -F 0x100 -Sb -o {output} - > {log} 2>&1"


rule sort_bam:
input:
"results/mapped/{sample}.bam"
output:
temp("results/mapped/{sample}.sorted.bam")
benchmark:
"logs/picard/sort/{sample}.bench"
shell:
"bwa mem -t {resources.ntasks} {params.extra} {params.index} {input.reads} | "
"samtools sort -T {output}.tmp -o {output} - > {log} 2>&1"

"gatk SortSam -I {input} -O {output} --SORT_ORDER coordinate"


rule mark_duplicates:
input:
Expand All @@ -44,8 +58,17 @@ rule mark_duplicates:
metrics=temp("results/qc/dedup/{sample}.metrics")
log:
"logs/picard/dedup/{sample}.log"
benchmark:
"logs/picard/dedup/{sample}.bench"
shell:
"gatk MarkDuplicates -I {input} -O {output.bam} -M {output.metrics} "
"--VALIDATION_STRINGENCY SILENT --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 "
"--ASSUME_SORT_ORDER 'coordinate' --CREATE_INDEX true > {log} 2>&1"

"--ASSUME_SORT_ORDER 'coordinate' > {log} 2>&1"

rule index_bam:
input:
"results/dedup/{sample}.sorted.bam"
output:
temp("results/dedup/{sample}.sorted.bam.bai")
shell:
"samtools index {input} {output}"
6 changes: 3 additions & 3 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ rule genome_faidx:
"resources/IRGSP-1.0_genome.fasta.fai"
cache: True
shell:
"samtools faidx {input}"
"samtools faidx {input} > {output} {log}"

rule genome_dict:
input:
Expand All @@ -16,7 +16,7 @@ rule genome_dict:
"logs/samtools/create_dict.log"
cache: True
shell:
"samtools dict {input} > {output} > {log} 2>&1"
"samtools dict {input} -o {output} > {log} 2>&1"

rule bwa_index:
input:
Expand All @@ -26,7 +26,7 @@ rule bwa_index:
log:
"logs/bwa_index.log"
resources:
mem_mb=3000
mem_mb=4000
cache: True
shell:
"bwa index {input} > {log} 2>&1"

0 comments on commit b5f2b5f

Please sign in to comment.