diff --git a/CHANGELOG b/CHANGELOG index 1e2bd65..963ee55 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,21 @@ ##### ##### ################################################################################ +0.5.15: * Report adjusted +0.5.14: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations +0.5.13: * Statistics on final mock reference added +0.5.12: * Coverage for finalMock alignments are now also calculated +0.5.11: * Input rule all cleaned up a bit to account for temporary output +0.5.10: * Mapping stats for final mock vs reference genome added +0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist +0.5.8 : * Final mock reference was not declared in output +0.5.7 : * Dependencies between the rules improved +0.5.6 : * Bugfix - wrong path for reference based vcf fixed +0.5.5 : * Bugfix - wrong reference vcf address in module 6 +0.5.4 : * Variant calling for final mock added +0.5.3 : * Rule all rule simplified +0.5.2 : * Script to refine the mock reference added +0.5.1 : * samtools index set to csi instead of bai 0.4.1 : * Bugfix - Missing outputfile script added 0.4 : Release version 0.3.16: * Several other small path typos diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index e3c36f2..d1154d6 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -8,12 +8,13 @@ import os ##### GBS-snakemake pipeline ##### ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) -##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.4.1 -version = "0.4.1" +##### This pipeline is build upon the the GBS-SNP-CROP pipeline: +##### https://github.com/halelab/GBS-SNP-CROP +##### Version: 0.5.15 +version = "0.5.15" ##### set minimum snakemake version ##### -min_version("5.24") +min_version("6.0") ##### Sample sheets ##### @@ -30,6 +31,7 @@ wildcard_constraints: config["genome-bwa-index"] = config["genome"]+".bwt" config["genome-star-index"] = config["project-folder"]+"/references/STAR2.7.3a" config["report-script"] = config["pipeline-folder"]+"/scripts/workflow-report.Rmd" +config["refinement-script"] = config["pipeline-folder"]+"/scripts/refineMockReference.R" config["adapter"]=config["pipeline-folder"]+"/adapter.fa" ##### Singularity container ##### @@ -79,18 +81,10 @@ print("######################################################################### rule all: input: - # OUTPUT: PREPARATION MODULE - expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), -# # QC OF RAW AND CONCATENATED FILES + # QC OF RAW AND CONCATENATED FILES "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), - # OUTPUT STEP 2 - expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), - # OUTPUT STEP 2b - expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 4 "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), @@ -119,9 +113,13 @@ rule all: # OUTPUT STEP 9 "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check + expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + "%s/MockReference/MockReference.fa" % (config["project-folder"]), + "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), "%s/finalReport.html" % (config["project-folder"]) + ### setup report ##### report: "report/workflow.rst" diff --git a/GBS-pipeline_config.yaml b/GBS-pipeline_config.yaml index a251cb7..6fc01ff 100644 --- a/GBS-pipeline_config.yaml +++ b/GBS-pipeline_config.yaml @@ -6,15 +6,11 @@ project-folder: "/scratch/project_2001746/Example" # Absolute path to the project root folder pipeline-folder: "/scratch/project_2001746/Pipeline-GBS" # Absolute path to the pipeline folder pipeline-config: "/scratch/project_2001746/Pipeline-GBS/GBS-pipeline_config-example.yaml" # Absolute path to the pipeline configuration file -#report-script: "/scratch/project_2001746/Pipeline-GBS/scripts/workflow-report.Rmd" # Absolute path to the report scripts (THIS OPTION SHOULD BE OBSOLETE) barcodes: "/scratch/project_2001746/Example/barcodesID.txt" # Absolute path to the barcodes file (MAKE THIS OPTIONAL) rawsamples: "/scratch/project_2001746/Example/rawsamples" # Absolute path to the raw samples file (MAKE THIS OPTIONAL) genome: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa" # Absolute path to an existing reference genome, leave empty "" if not available -#genome-bwa-index: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa.bwt" # Absolute path to the bwa index (THIS OPTION SHOULD BE OBSOLETE) -#genome-star-index: "/scratch/project_2001746/Example/references/STAR2.7.3a" # Absolute path to the star index (THIS OPTION SHOULD BE OBSOLETE) refsample: "" # Id of ref sample used to build mock reference (NOT SURE, IF USED, CHECK THIS) local-scratch: $LOCAL_SCRATCH # Path for fast local storage for tmp files (Here, a system variable is used. Check with your server configs) -#adapter: "/scratch/project_2001746/Pipeline-GBS/adapter.fa" # Path for the adapter sequences, used by Trimmomatic (MAKE THIS OPTIONAL) tmpdir: "/scratch/project_2001746/tmp" # Parameters for the GBS-SNP-CROP tools @@ -22,7 +18,13 @@ tmpdir: "/scratch/project_2001746/tmp" enz1: "AATTC" # First restriction enzyme sequence enz2: "GCATG" # Second restriction enzyme sequence libname: "AllReads" # -libtype: "PE" # LIbrary type, paired-end (PE) or single-end (SE) +libtype: "PE" # Library type, paired-end (PE) or single-end (SE) + +# Filter parameters for the mock reference refinement +################################################################################ +mockref: + TotalReadCoverage: 25 # How many overall reads need to be on a cluster to remain in mock reference? + minSampleCoverage: 3 # How many different samples need to have reads on a cluster to remain in mock reference? # Tools specific settings ################################################################################ diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 6156a07..aaffa1c 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -46,7 +46,17 @@ IndexClustersSamtools: time: 02:00:00 job-name: IndexMockRefClusters_Samtools mem-per-cpu: 25000 - + +IndexFinalMockBWA: + time: 02:00:00 + job-name: IndexFinalMockBWA + mem-per-cpu: 25000 + +IndexFinalMockSamtools: + time: 02:00:00 + job-name: IndexFinalMockSamtools + mem-per-cpu: 25000 + # Module 1 - QC: ################################################################################ fastqc_quality_control_raw_data: @@ -130,10 +140,26 @@ MockVsRef_AlignmentStats: time: 05:00:00 job-name: MockVsRef_AlignmentStats +FinalMockVsRef_alignment: + time: 05:00:00 + cpus-per-task: 20 + job-name: FinalMockVsRef_alignment + +FinalMockVsRef_samtools_SamToSortedBam: + time: 05:00:00 + job-name: FinalMockVsRef_samtools_SamToSortedBam + +FinalMockVsRef_AlignmentStats: + time: 05:00:00 + job-name: FinalMockVsRef_AlignmentStats + MockVsRef_SortedBamToMpileup: time: 05:00:00 job-name: MockVsRef_SortedBamToMpileup +refine_mock_reference: + job-name: refine_mock_reference + # Module 4 - Read alignment ################################################################################ RefGenome_AlignReads: @@ -200,6 +226,30 @@ MockRefClusters_AlignmentStats: cpus-per-task: 20 mem-per-cpu: 5000 +FinalMockRef_AlignReads: + time: 01:00:00 + job-name: FinalMockRef_AlignReads + cpus-per-task: 20 + mem-per-cpu: 5000 + +FinalMockRef_SamToSortedBam: + time: 01:00:00 + job-name: FinalMockRef_SamToSortedBam + cpus-per-task: 20 + mem-per-cpu: 5000 + +FinalMockRef_SortedBamToMpileup: + time: 01:00:00 + job-name: FinalMockRef_SortedBamToMpileup + cpus-per-task: 20 + mem-per-cpu: 5000 + +finalMockRef_AlignmentStats: + time: 01:00:00 + job-name: FinalMockRef_AlignmentStats + cpus-per-task: 20 + mem-per-cpu: 5000 + # Module 5 - Call Variants ################################################################################ @@ -231,6 +281,9 @@ cut_verticalRef: cut_verticalRef_reference: job-name: cut_verticalRef_reference +cut_verticalRef_finalMock: + job-name: cut_verticalRef_finalMock + create_MasterMatrix_parallel: time: 0-12:00:00 job-name: create_MasterMatrix_parallel @@ -243,6 +296,13 @@ create_MasterMatrix_parallel_reference: mem-per-cpu: 64000 nvme: 20 +create_MasterMatrix_parallel_finalMock: + time: 0-12:00:00 + job-name: create_MasterMatrix_parallel_finalMock + mem-per-cpu: 64000 + nvme: 20 + + aggregate_MasterMatrix: job-name: aggregate_MasterMatrix time: 0-12:00:00 @@ -254,6 +314,12 @@ aggregate_MasterMatrix_reference: time: 0-12:00:00 mem-per-cpu: 48000 nvme: 20 + +aggregate_MasterMatrix_finalMock: + job-name: aggregate_MasterMatrix_finalMock + time: 0-12:00:00 + mem-per-cpu: 48000 + nvme: 20 FilterVariants: time: 1-12:00:00 @@ -265,6 +331,11 @@ FilterVariants_reference: job-name: filterVariants_reference mem-per-cpu: 30000 +FilterVariants_finalMock: + time: 1-12:00:00 + job-name: filterVariants_finalMock + mem-per-cpu: 30000 + CreateVCF: time: 01:00:00 job-name: CreateVCF @@ -275,6 +346,22 @@ CreateVCF_reference: job-name: CreateVCF_reference mem-per-cpu: 30000 +CreateVCF_finalMock: + time: 01:00:00 + job-name: CreateVCF_finalMock + mem-per-cpu: 30000 + +ParseMpileup_createCountFiles_finalMock: + time: 05:00:00 + job-name: createCountFiles_finalmock + mem-per-cpu: 16000 + +create_verticalRef_finalMock: + time: 01:00:00 + job-name: create_verticalRef_finalMock + mem-per-cpu: 16000 + nvme: 20 + # Module 6 - Postprocessing ################################################################################ Ref_getVariantFlanking: diff --git a/rules/Module0-PreparationsAndIndexing b/rules/Module0-PreparationsAndIndexing index bd35309..a98c567 100644 --- a/rules/Module0-PreparationsAndIndexing +++ b/rules/Module0-PreparationsAndIndexing @@ -142,3 +142,39 @@ rule IndexClustersSamtools: shell:""" samtools faidx {input} """ + +rule IndexFinalMockBWA: + """ + Index the final Mock Reference Genome (BWA). + """ + input: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]) + log: + "%s/logs/BWA/IndexFinalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BWA/IndexFinalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + bwa index -a bwtsw {input} + """ + +rule IndexFinalMockSamtools: + """ + Index the final Mock Reference Genome Clusters (samtools). + """ + input: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/MockReference/MockReference.fai" % (config["project-folder"]) + log: + "%s/logs/Samtools/IndexFinalMockSam.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/IndexFinalMockSam.benchmark.tsv" % (config["project-folder"]) + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + samtools faidx {input} + """ \ No newline at end of file diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index b1a8e30..b4a130e 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -176,3 +176,142 @@ else: df -h &> {log} samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.refgenome} {input.bam} > {output} """ + +########################################################## +## +## Mock Reference refinement +## +########################################################## + +rule refine_mock_reference: + """ + Refine the mock reference (R). + """ + input: + clusters="%s/FASTQ/TRIMMED/GSC.MR.Clusters.fa" % (config["project-folder"]), + script=config["refinement-script"], + c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples) + output: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + log: + "%s/logs/R/refine_mock_reference.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/R/refine_mock_reference.benchmark.tsv" % (config["project-folder"]) + singularity: + "docker://fischuu/r-gbs:3.6.3-0.2" + params: + projFolder=config["project-folder"], + pipeFolder=config["pipeline-folder"], + minTotalReadCoverage=config["mockref"]["TotalReadCoverage"], + minSampleCoverage=config["mockref"]["minSampleCoverage"] + shell:""" + R -e "projFolder <- '{params.projFolder}'; \ + mockClusters.file <- '{input.clusters}'; \ + minTotalReadCoverage <- '{params.minTotalReadCoverage}'; \ + minSampleCoverage <- '{params.minSampleCoverage}'; \ + mockClusters.refined.file <- '{output}'; \ + snakemake <- TRUE;\ + source('{input.script}')" &> {log} + """ + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef_alignment: + """ + Map the mock reference clusters to the genome (minimap2). + """ + input: + refgenome=config["genome"], + mockgenome="%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/SAM/finalMockToRef.sam" % (config["project-folder"]) + log: + "%s/logs/MINIMAP2/mm2_map_finalMockVSref.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/MINIMAP2/mm2_finalMockVSref.benchmark.tsv" % (config["project-folder"]) + threads: 20 + singularity: config["singularity"]["minimap2"] + shell:""" + minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log} + """ + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef_samtools_SamToSortedBam: + """ + Index the Mock Reference Genome. + """ + input: + ref=config["genome"], + sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"]) + output: + bam="%s/BAM/FinalMockref/mockToRef.bam" % (config["project-folder"]), + sorted="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"]) + log: + "%s/logs/Samtools/FinalMockVsRef_samtools_SamToSortedBam.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockVsRef_samtools_SamToSortedBam.benchmark.tsv" % (config["project-folder"]) + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools view -b {input.sam} > {output.bam} + samtools sort {output.bam} -o {output.sorted} + samtools index {output.sorted} + """ + + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef_AlignmentStats: + """ + Get the mapping stats for the mock vs. reference. + """ + input: + ref=config["genome"], + sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"]), + sbam="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"]) + output: + fs="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]), + samflags="%s/BAM/FinalMockref/mockToRef.sam.samflags" % (config["project-folder"]), + stats="%s/BAM/FinalMockref/mockToRef.sam.stats" % (config["project-folder"]), + c="%s/BAM/FinalMockref/mockToRef.coverage" % (config["project-folder"]), + bed="%s/BAM/FinalMockref/mockToRef.bed" % (config["project-folder"]), + bedMerged="%s/BAM/FinalMockref/mockToRef.merged.bed" % (config["project-folder"]), + leftBed="%s/BAM/FinalMockref/mockToRef.merged.left.bed" % (config["project-folder"]), + rightBed="%s/BAM/FinalMockref/mockToRef.merged.right.bed" % (config["project-folder"]), + leftFACounts="%s/SAM/finalMockToRef.left.fa.counts" % (config["project-folder"]), + rightFACounts="%s/SAM/finalMockToRef.right.fa.counts" % (config["project-folder"]), + leftFA="%s/BAM/FinalMockref/mockToRef.merged.left.fa" % (config["project-folder"]), + rightFA="%s/BAM/FinalMockref/mockToRef.merged.right.fa" % (config["project-folder"]), + FA="%s/BAM/FinalMockref/mockToRef.merged.combined.fa" % (config["project-folder"]), + FACounts="%s/BAM/FinalMockref/mockToRef.merged.combined.fa.counts" % (config["project-folder"]) + log: + "%s/logs/FinalMockVsRef_AlignmentStats.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/MockVsRef_AlignmentStats.benchmark.tsv" % (config["project-folder"]) + threads: lambda cores: cpu_count() + conda:"envs/step3.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + echo "Number of threads used:" {threads} + sed '/^@/d' {input.sam} | cut -f2 | sort | uniq -c > {output.samflags} + samtools flagstat {input.sam} > {output.fs} + samtools stats {input.sam} > {output.stats} + samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} + bedtools genomecov -ibam {input.sbam} -bg > {output.bed} + bedtools merge -i {output.bed} > {output.bedMerged} + awk '{{print $1 "\\t" (($2 - 6)<0?0:($2 - 6)) "\\t" $2}}' {output.bedMerged} > {output.leftBed} + awk '{{print $1 "\\t" $3 "\\t" $3 + 6}}' {output.bedMerged} > {output.rightBed} + bedtools getfasta -fi {input.ref} -bed {output.leftBed} > {output.leftFA} + bedtools getfasta -fi {input.ref} -bed {output.rightBed} > {output.rightFA} + + sed '/^>/d' {output.leftFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.leftFACounts} + sed '/^>/d' {output.rightFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.rightFACounts} + + paste -d '+' {output.leftFA} {output.rightFA} > {output.FA} + sed '/^>/d' {output.FA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.FACounts} + """ diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index 75a8e95..1649c3b 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -52,7 +52,7 @@ else: echo "Number of threads used:" {threads} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ if config["genome"] == "": @@ -182,7 +182,7 @@ rule MockRef_SamToSortedBam: df -h &> {log} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ rule MockRefClusters_SamToSortedBam: @@ -209,7 +209,7 @@ rule MockRefClusters_SamToSortedBam: df -h &> {log} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ rule MockRef_SortedBamToMpileup: @@ -278,3 +278,103 @@ rule MockRefClusters_AlignmentStats: samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} """ +################################################################################ +## +## Process the final Mock + +rule FinalMockRef_AlignReads: + """ + Align reads to the final mock reference Genome. + """ + input: + reference="%s/MockReference/MockReference.fa" % (config["project-folder"]), + refFiles="%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]), + R1="%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), + R2="%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), + output: + temp("%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"])) + log: + "%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BWA/FinalMockRef_AlignReads.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5b"]["threads"], + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + bwa mem -t {params.threads} -M {input.reference} {input.R1} {input.R2} > {output} 2> {log} + """ + +rule FinalMockRef_SamToSortedBam: + """ + Sam to sorted bam for final mock reference (SAMTOOLS) + """ + input: + "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]) + output: + bam=temp("%s/BAM/alignments_finalMock/{samples}.bam" % (config["project-folder"])), + sorted="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]) + log: + "%s/logs/Samtools/FinalMockRef_SamToSortedBam.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockRef_SamToSortedBam.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5d"]["threads"], + q=config["params"]["step5d"]["q"], + f=config["params"]["step5d"]["f"], + F=config["params"]["step5d"]["F"] + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} + samtools sort {output.bam} -o {output.sorted} + samtools index -c {output.sorted} + """ + +rule FinalMockRef_SortedBamToMpileup: + """ + Get Mpileup for final Mock Reference Genome. + """ + input: + bam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), + reference="%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + bam="%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]) + log: + "%s/logs/Samtoools/FinalMockRef_SortedBamToMpileup.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockRef_SortedBamToMpileup.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5d"]["threads"], + Q=config["params"]["step5d"]["Q"], + q=config["params"]["step5d"]["q"] + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.reference} {input.bam} > {output} + """ + + +rule finalMockRef_AlignmentStats: + """ + Alignment stats for reads mapped to final mock ref + """ + input: + sam="%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]), + sbam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]) + output: + fs="%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), + c="%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"]) + log: + "%s/logs/Samtools/FinalMockRef_AlignmentStats.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"]) + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + samtools flagstat {input.sam} > {output.fs} + samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} + """ diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index c9b96dd..37015ac 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -76,7 +76,7 @@ rule create_verticalRef: threads: 1 shell:""" sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp; - sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref}; + sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref}; #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} ls {params.wd}/*count.txt > {output.co} """ @@ -394,7 +394,7 @@ else: filtered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), unfiltered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) output: - "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]) + "%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"]) params: barcodes=config["barcodes"], genoMatrix=config["params"]["step8"]["genoMatrix"], @@ -402,6 +402,8 @@ else: unfilteredOut=config["params"]["step8"]["unfiltered"], format=config["params"]["step8"]["format"], wd="%s/MPILEUP/mpileup_reference" % config["project-folder"], + outdir="%s/VCF" % config["project-folder"], + pipeout="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), pipefolder=config["pipeline-folder"] log: "%s/logs/Perl/CreateVCF_reference.log" % (config["project-folder"]) @@ -419,5 +421,206 @@ else: cd {params.wd} perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log} perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log} + cp {params.pipeout} {output} cd ../.. """ + +################################################################################ +## +## Process the final mock + +rule ParseMpileup_createCountFiles_finalMock: + """ + Parse mpileup outputs and create count/ref files for final mock + """ + input: + "%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]) + output: + co="%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), + ref="%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]) + params: + p=config["params"]["step6"]["p"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.benchmark.tsv" % (config["project-folder"]) + threads: 1 + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_1.pl -b {input} -p {params.p} &> {log} + """ + +rule create_verticalRef_finalMock: + """ + Merge the separate ref files for final mock + """ + input: + co=expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples), + ref=expand("%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]), samples=samples) + output: + vref="%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"]), + co="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) + log: + "%s/logs/BASH/create_verticalRef_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BASH/create_verticalRef_finalMock.benchmark.tsv" % (config["project-folder"]) + params: + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + threads: 1 + shell:""" + sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp; + sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref}; + #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} + ls {params.wd}/*count.txt > {output.co} + """ + +checkpoint cut_verticalRef_finalMock: + """ + Divide the input verticalRef-file for parallel processing in finalMock + """ + input: + "%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"]) + output: + directory("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/" % (config["project-folder"])) + params: + out="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos." % (config["project-folder"]), + split=1000000 + shell:""" + mkdir -p {output} + split -l {params.split} --numeric-suffixes {input} {params.out} + """ + +rule create_MasterMatrix_parallel_finalMock: + """ + Process verticalRef parallel to create MasterMatrix for final mock + """ + input: + verRef="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos.{i}" % (config["project-folder"]), + counts="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) + output: + "%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"]) + log: + "%s/logs/BASH/create_MasterMatrix_parallel_finalMock_{i}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BASH/create_MasterMatrix_parallel_finalMock.benchmark_{i}.tsv" % (config["project-folder"]) + params: + p=config["params"]["step6"]["p"], + wd="%s/MPILEUP/mpileup_finalMock/VerticalRefPos" % config["project-folder"], + pipefolder=config["pipeline-folder"] + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_2.pl -in {input.verRef} -count {input.counts} -out {output} -p {params.p} &> {log} + """ + +def aggregate_inputMasterMatrix_finalMock(wildcards): + """ + Aggregate the input object for the final MasterMatrix for finalMock + """ + checkpoint_outputMM = checkpoints.cut_verticalRef_finalMock.get(**wildcards).output[0] + return expand("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"]), + i=glob_wildcards(os.path.join(checkpoint_outputMM, "VerticalRefPos.{i}")).i) + +rule aggregate_MasterMatrix_finalMock: + input: + aggregate_inputMasterMatrix_finalMock + output: + "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"]) + shell:""" + cat {input} > {output} + """ + +rule FilterVariants_finalMock: + """ + Filter variants and call genotypes using the final mock. + """ + input: + "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"]) + output: + filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) + params: + input=config["params"]["step7"]["input"], + out=config["params"]["step7"]["out"], + unfiltered=config["params"]["step7"]["unfiltered"], + p=config["params"]["step7"]["p"], + mnHoDepth0=config["params"]["step7"]["mnHoDepth0"], + mnHoDepth1=config["params"]["step7"]["mnHoDepth1"], + mnHetDepth=config["params"]["step7"]["mnHetDepth"], + altStrength=config["params"]["step7"]["altStrength"], + mnAlleleRatio=config["params"]["step7"]["mnAlleleRatio"], + mnCall=config["params"]["step7"]["mnCall"], + mnAvgDepth=config["params"]["step7"]["mnAvgDepth"], + mxAvgDepth=config["params"]["step7"]["mxAvgDepth"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/FilterVariants_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/FilterVariants_finalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + #******PARAMETERS***** + # -in Discovery master matrix input file. The output from last step File GSC.MasterMatrix.txt + # -out Genotyping matrix for the population Output file GSC.GenoMatrix.txt + # -p Identify SNPs only (snp) or SNPs + indels (indel) String -- + # -mnHoDepth0 Minimum depth required for calling a homozygote when the alternative allele depth = 0 Numeric 5 + # -mnHoDepth1 Minimum depth required for calling a homozygote when the alternative allele depth = 1 Numeric 20 + # -mnHetDepth Minimum depth required for each allele when calling a heterozygote Numeric 3 + # -altStrength Across the population for a given putative bi-allelic variant, this alternate allele strength parameter is the minimum proportion of non-primary allele reads that are the secondary allele Numeric 0.8 + # -mnAlleleRatio Minimum required ratio of less frequent allele depth to more frequent allele depth Numeric 0.25 + # -mnCall Minimum acceptable proportion of genotyped individuals to retain a variant Numeric 0.75 + # -mnAvgDepth Minimum average depth of an acceptable variant Numeric 3 + # -mxAvgDepth Maximum average depth of an acceptable variant Numeric 200 + + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.out} -p {params.p} -mnHoDepth0 {params.mnHoDepth0} -mnHoDepth1 {params.mnHoDepth1} -mnHetDepth {params.mnHetDepth} -altStrength {params.altStrength} -mnAlleleRatio {params.mnAlleleRatio} -mnCall {params.mnCall} -mnAvgDepth {params.mnAvgDepth} -mxAvgDepth {params.mxAvgDepth} &> {log} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.unfiltered} -p {params.p} -mnHoDepth0 0 -mnHoDepth1 0 -mnHetDepth 0 -altStrength 0 -mnAlleleRatio 0 -mnCall 0 -mnAvgDepth 0 -mxAvgDepth 1000000 &> {log} + """ + +rule CreateVCF_finalMock: + """ + Create the VCF output of the pipeline for the final mock. + """ + input: + filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) + output: + "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), + params: + barcodes=config["barcodes"], + genoMatrix=config["params"]["step8"]["genoMatrix"], + out=config["params"]["step8"]["out"], + unfilteredOut=config["params"]["step8"]["unfiltered"], + format=config["params"]["step8"]["format"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + outdir="%s/VCF" % config["project-folder"], + pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]), + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/CreateVCF_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/CreateVCF_finalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + #******PARAMETERS***** + # -in Final genotyping matrix input file File GSC.GenoMatrix.txt + # -out Output label name without extension String GSC + # -b BarcodeID file name See Appendix A File -- + # -formats The name(s) (R, Tassel, Plink, vcf, H) for which the GBS-SNP-CROP final genotyping matrix should be converted. If more than one format is desired, the names should be separated by commas without any space, as in the examples shown below String R,T,P,V,H + + mkdir -p {params.outdir} + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log} + + cp {params.pipeout} {output} + cd ../.. + """ \ No newline at end of file diff --git a/rules/Module6-PostProcessing b/rules/Module6-PostProcessing index 5a4c6c9..285e32c 100644 --- a/rules/Module6-PostProcessing +++ b/rules/Module6-PostProcessing @@ -33,7 +33,7 @@ else: Get the flanking sequences from identified variants for reference genome. """ input: - vcf="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), + vcf="%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"]), genome=config["genome"] output: fl="%s/MPILEUP/mpileup_reference/GSC.vcf.flanking" % (config["project-folder"]), diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index 666f836..63f36a3 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -8,13 +8,22 @@ rule R_finalReport: script=config["report-script"], fa="%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), mockToRef="%s/BAM/Mockref/mockToRef.sorted.bam" % (config["project-folder"]), + mockToRefSamflags="%s/BAM/Mockref/mockToRef.sam.samflags" % (config["project-folder"]), + mockFlagstats=expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), mqcraw="%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), mqcconcatenated="%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), mqctrimmed="%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), faRef="%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), qdRaw=expand("%s/QC/RAW/{rawsamples}_R1_qualdist.txt" % (config["project-folder"]), rawsamples=rawsamples), qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), - qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples) + qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), + fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + fsFinalVsRef="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]), + finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), + fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples), + cFinalMock=expand("%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"]), samples=samples) output: "%s/finalReport.html" % (config["project-folder"]) log: diff --git a/runGBSPipeline-example.sh b/runGBSPipeline-example.sh index 2a2e3d0..4fe7304 100644 --- a/runGBSPipeline-example.sh +++ b/runGBSPipeline-example.sh @@ -51,4 +51,5 @@ snakemake -s /scratch/project_2001746/Pipeline-GBS/GBS-pipeline.smk \ --latency-wait 60 \ --cluster-config /scratch/project_2001746/Pipeline-GBS/GBS-pipeline_server-config.yaml \ --cluster "sbatch -t {cluster.time} --account={cluster.account} --gres=nvme:{cluster.nvme} --job-name={cluster.job-name} --tasks-per-node={cluster.ntasks} --cpus-per-task={cluster.cpus-per-task} --mem-per-cpu={cluster.mem-per-cpu} -p {cluster.partition} -D {cluster.working-directory}" \ + --scheduler greedy \ $@ diff --git a/scripts/refineMockReference.R b/scripts/refineMockReference.R new file mode 100644 index 0000000..4e97ff2 --- /dev/null +++ b/scripts/refineMockReference.R @@ -0,0 +1,39 @@ +# Load required libraries + library("GenomicTools") + +# This script imports the mock reference coverage values, analyses it and then created a final, refined mock reference file + if(!is.element("snakemake",ls())){ + mockClusters.file <- "GSC.MR.Clusters.fa" + projFolder <- "/scratch/project_2001746/BSF" + minTotalReadCoverage <- 25 + minSampleCoverage <- 3 + mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa" + } + +# Import the coverage + clusterFiles <- list.files(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters"), pattern="*.coverage") + clusterCoverage <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[1])) + names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) + + for(i in 2:length(clusterFiles)){ + tmp <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[i])) + names(tmp)[1:2] <- c("cluster", clusterFiles[i]) + clusterCoverage <- merge(clusterCoverage, tmp, by="cluster") + } + +# Get the total coverage + totalCoverage <- apply(clusterCoverage[,-1],1,sum) + +# Filter the coverage + clusterCoverage.refined <- clusterCoverage[totalCoverage>minTotalReadCoverage,] + clusterCoverage.refined <- clusterCoverage.refined[apply(clusterCoverage.refined[,-1]>0,1,sum)>=minSampleCoverage,] + +# Import the initial mock clusters + mockClusters <- importFA(file.path(mockClusters.file)) + +# Refine the mock clusters + mockClusters.refined <- mockClusters[is.element(gsub(">","",names(mockClusters)), clusterCoverage.refined[,1])] + +# Export the refined mock reference + exportFA(mockClusters.refined, file=mockClusters.refined.file) + \ No newline at end of file diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index adb2055..4200790 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -43,13 +43,13 @@ knitr::opts_chunk$set(echo = FALSE, fig.align = 'center', fig.height = 5, fig.width = 8.5) mockClusters.file <- "GSC.MR.Clusters.fa" mockReference.file <- "GSC.MR.Genome.fa" -refGenome.file <- basename(refGenome.file) if(!is.element("snakemake",ls())){ - projFolder <- "/scratch/project_2001746/BSF" + projFolder <- "/scratch/project_2001746/Whitefish_benchMS" pipelineFolder <- "/scratch/project_2001746/Pipeline-GBS/" pipelineConfig <- "/scratch/project_2001746/BSF/Pipeline-GBS/GBS-pipeline_config-BSF.yaml" - refGenome.file <- "hermetiaRef_112020.fasta" + refGenome.file <- "GCA_902175075.1_AWG_v1_genomic.fna" } +refGenome.file <- basename(refGenome.file) ifelse(refGenome.file == "", refAvail <- FALSE, refAvail <- TRUE) ``` @@ -163,7 +163,7 @@ getFastQCJSON <- function(path){ The DAG of the used pipeline with rule dependencies. ```{r import workflow, echo=FALSE, fig.cap="Overview of the applied workflow", out.width = '100%'} -if(file.exists(file.path(pipelineFolder,"workflow.png"))) knitr::include_graphics(file.path(pipelineFolder,"workflow.png")) +if(file.exists(file.path(projFolder,"workflow.png"))) knitr::include_graphics(file.path(projFolder,"workflow.png")) ``` @@ -199,7 +199,7 @@ kable_styling(out_html, "striped", position = "left") ``` ## Concatenating stats -As the concatenating is a crucial step in this pipeline, it is important to make this right. As a double check that things went right, we check how +As the concatenating is a crucial step in this pipeline, it is important to make this correct. As a double check that things went right, we check how many raw samples were concatenated per sample. We do this visually. Basically, the height of the bars should correspond to your number of used lanes, normally 1,2 or 4. Sometimes, you might want to concatenate also more files together (e.g. technical replicates), in this case the bar can be also higher. @@ -337,8 +337,9 @@ mqcStats.raw <- read.table(file.path(projFolder,"QC","RAW","multiqc_R1", "multiq tmp <- mqcStats.raw[,-c(1:4,7,11:21)] totalRawSequences <- sum(mqcStats.raw$Total.Sequences) -out <- c(totalRawSequences,apply(tmp,2,mean)) +out <- c(totalRawSequences,sd(tmp[,1]),apply(tmp,2,mean)) names(out) <- c("Tot. number sequences", + "SD total sequences", "Avg. total sequences", "Avg. poor quality", "Avg. GC Percent", @@ -679,12 +680,16 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"), y=file.path(projFolder,"QC","TRIMMED","multiqc_R1"), feature="fastqc_per_base_sequence_quality_plot") ``` -# Mock Reference + +# Preliminary Mock Reference + +## To Add + * Get statistics on multi-mapping of the reads. ## Basic stats ```{r import required files, warning=FALSE, } -mockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file)) -mockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file)) +prelimMockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file)) +prelimMockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file)) vsearchIN <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchIN.fa")) #vsearchOUT <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchOUT.fa")) ``` @@ -700,7 +705,7 @@ Basic information on the sequences used to build the mock reference: Basic information on the identified clusters, used as mock reference ```{r summarise mock reference clusters} - out <- summary(mockClusters) + out <- summary(prelimMockClusters) out[1,1] <- c("No. of Clusters") out[,2] <- formatC(out[,2], format="d", big.mark=",") out_html <- knitr::kable(out, col.names = NULL, "html") @@ -709,18 +714,18 @@ Basic information on the identified clusters, used as mock reference Total length of the input fasta to build reference: `r format(sum(nchar(vsearchIN)), big.mark=",")` bp. -Total length of the Mock Reference Clusters: `r format(sum(nchar(mockClusters)), big.mark=",")` bp. +Total length of the Mock Reference Clusters: `r format(sum(nchar(prelimMockClusters)), big.mark=",")` bp. -Total length of the Mock Reference (Used as reference): `r format(sum(nchar(mockReference)), big.mark=",")` bp. +Total length of the Mock Reference (Used as reference): `r format(sum(nchar(prelimMockReference)), big.mark=",")` bp. ## Theoretical coverage of Mock per sample Given the length of the mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the mock for each sample -Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs. Adjust this later 'to reality'... +Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs. ```{r mock coverage} par(mar=c(10,5,1,1)) -mockLength <- sum(nchar(mockReference)) +mockLength <- sum(nchar(prelimMockReference)) mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/mockLength names(mockCov) <- mqcStats.trim[,1] barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2) @@ -778,7 +783,7 @@ barplot(t(atDist), main="A,C,T,G distribution", col=viridis(20)[8]) ``` ### Assembling statistics -Statistics on how many reads have been assembled (=had a minimum overlap of 10 bases between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap. +Statistics on how many reads have been assembled (=had a minimum overlap of x-bases (see table above) between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap. ```{r assembling stat} tmp <- pearIn[c(startPos+23, startPos+24, startPos+25)] @@ -829,7 +834,7 @@ out_html <- knitr::kable(out, col.names = NULL, "html") kable_styling(out_html, "striped", position = "left") ``` -Quality values for the first 100.000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality. +Quality values for the first 100,000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality. ```{r quality measues} outQual <- table(unlist(strsplit(discardedReads$qual[1:100000],""))) out_html <- knitr::kable(formatC(outQual, format="d", big.mark=","), "html") @@ -843,6 +848,34 @@ barplot(outQual, ylab="Frequency", main="Quality-score distribution") ## Ratio samles used to length Mock reference How much were the reads merged down? Length of Mock reference vs the length of all used reads together. +# Mock reference + +## Basic stats +```{r import required files mock, warning=FALSE, } +finalMockClusters <- importFA(file.path(projFolder,"MockReference","MockReference.fa")) +``` + +Basic information for the final mock reference: +```{r final mock build stats} + out <- summary(finalMockClusters) + out[,2] <- formatC(out[,2], format="d", big.mark=",") + out_html <- knitr::kable(out, col.names = NULL, "html") + kable_styling(out_html, "striped", position = "left") +``` +## Theoretical coverage of Mock per sample +Given the length of the final mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the final mock for each sample + +```{r final mock coverage} +par(mar=c(10,5,1,1)) +finalMockLength <- sum(nchar(finalMockClusters)) +mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/finalMockLength +names(mockCov) <- mqcStats.trim[,1] +barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2) +lines(c(-100,10000),c(1,1), lty="dotted") +lines(c(-100,10000),c(10,10), lty="dotted") +``` + + # Existing Reference In case a reference genome was provided, here are a few statistics regarding it @@ -869,7 +902,7 @@ if(refAvail){ ## Mock vs reference These are the flagstats from mapping the mock clusters against an existing reference genome. Typical flag stats are: -* 0: Read aligned aginst forward strand +* 0: Read aligned against forward strand * 16: Read aligned against reverse strand * 2048: Supplementary alignment forward * 2064: Supplementary alignment reverse @@ -902,7 +935,7 @@ totalBases <- sum(abs(mergedLoci$V3 - mergedLoci$V2)) ``` However, in total we have `r nrow(unmergedLoci)` mapping loci that can be merged down (by intersection of the bed) to `r nrow(mergedLoci)` if their chromosomal location is intersected. -Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be stricted merged. +Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be merged more strict. In total `r formatC(totalBases, big.mark=",")` bases are covered in the reference genome from mock cluster sequences. @@ -996,10 +1029,10 @@ datatable(tmp) WE COULD COUNT HERE STILL HOW MANY SEQUENCES MATCH THE GIVEN ONE WITH ONE SUBSTITUTION ALLOWED. - # Alignment -## Mock Reference +## Preliminary Mock Reference +### Mapping stats Import the mapping statistics to the Mock reference @@ -1145,8 +1178,156 @@ for(i in 3:ncol(clusterCoverage)){ ### Cluster coverage Mock samples vs others ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES -## Reference genome +## Mock Reference +### Mapping stats + +Import the mapping statistics to the Mock reference + +```{r import final flagstats} +flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat") +flagstats <- list() +for(i in 1:length(flagstatFiles)){ + flagstats[[i]] <- readLines(file.path(projFolder,"BAM", "alignments_finalMock",flagstatFiles[i])) +} +``` + +Visualization of the alignments, red stars indicate the mock reference samples + +```{r vis mapping final stats} +par(oma=c(6,3,0,0)) +mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2) + +sampleNames <- gsub(".sam.flagstat", "", flagstatFiles) + +colnames(mapStats) <- sampleNames +tmp <- as.numeric(sapply(strsplit(sapply(flagstats,"[",1), " +"),"[",1)) +mapStats[1,] <- as.numeric(sapply(strsplit(sapply(flagstats,"[",5), " +"),"[",1)) +mapStats[2,] <- tmp - mapStats[1,] + +p <- barplot(mapStats, col=c(viridis(20)[8], viridis(20)[16]), las=2) + +legend("topleft", pch=c(20,20), col=c(viridis(20)[16], viridis(20)[8]), legend=c("Unmapped", "Mapped"), fill="white") + +# highlight the mock reference samples +mockFiles <- paste(mockSamples, ".sam.flagstat", sep="") +mockPos <- which(is.element(flagstatFiles, mockFiles)) + +points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) +``` + +```{r mapping final percentage} +barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) +``` + + +### Coverage + +Data mapped against the clusters and then the reads per cluster visualized +```{r importFinalClusterCoverage, warning=FALSE} +clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage") +clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1])) +names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) + +for(i in 2:length(clusterFiles)){ + tmp <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[i])) + names(tmp)[1:2] <- c("cluster", clusterFiles[i]) + clusterCoverage <- merge(clusterCoverage, tmp, by="cluster") +} +names(clusterCoverage)[2:(length(clusterFiles)+1)] <- clusterFiles +clusterCoverage[,1] <- as.numeric(gsub("Cluster", "", clusterCoverage[,1])) +clusterCoverage <- clusterCoverage[order(clusterCoverage[,1]),] +clusterCoverage <- clusterCoverage[is.na(clusterCoverage[,1])==FALSE,] + +clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000 +``` + +### Lorenz curve +Concentration measure of reads against clusters + +```{r final lorenz} +plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration") +for(i in 3:ncol(clusterCoverage)){ + lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i])))) +} +``` + + +### Stats on coverage + +These are the amounts of clusters with different samples. + +```{r final cluster coverage stats} +coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum) +barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8]) +``` + +Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group. + +```{r final reads per coverage group} +readsPerCoverageGroup <- c() + +for(i in 1:max(coveredClusters)){ + readsPerCoverageGroup[i] <- sum(clusterCoverage[coveredClusters==i,-1]) +} + +names(readsPerCoverageGroup) <- 1:max(coveredClusters) + +barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster group", col=viridis(20)[8]) +``` + + +```{r final reads per coverage group in percent} +barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8]) +``` + +Then still the number of clusters without coverage per sample + +```{r final clusters without coverage} +nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum) +names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters)) +barplot(nonHittedClusters, col=viridis(20)[8], las=2) +lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted") +``` + +And then this still as percentage + +```{r final clusterhits percentage} +barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8]) +lines(c(0,10000),c(0.2,0.2), lty="dotted") +lines(c(0,10000),c(0.4,0.4), lty="dotted") +lines(c(0,10000),c(0.5,0.5), lty="dashed") +lines(c(0,10000),c(0.6,0.6), lty="dotted") +lines(c(0,10000),c(0.8,0.8), lty="dotted") +``` + +### Smoothed log-coverage per cluster + +```{r final vizCusterCoverage} +plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2)) + +for(i in 3:ncol(clusterCoverage)){ + lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,i]+1), all.knots=FALSE)) +} +``` + +### Smoothed std-log-coverage per cluster + +Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5. + +```{r final vizStdCusterCoverage} +# I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot. +plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4)) + +for(i in 3:ncol(clusterCoverage)){ + lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,i]+1), all.knots=FALSE)) +} +``` + + + +## Reference genome +### Basic stats Import the mapping statistics to the reference genome ```{r import ref flagstats} @@ -1187,6 +1368,7 @@ points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) ``` + # Variants Import the Result vcf @@ -1436,68 +1618,71 @@ accepted mistakes). ```{r} -if(ncol(sampleInfo)>1){ - mmCase <- 2 - mmControl <- 5 - vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",] - vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",] - - vcf.cases.table <- apply(vcf.cases,2,table) - vcf.control.table <- apply(vcf.control,2,table) - - nCases <- nrow(vcf.cases) - nControl <- nrow(vcf.control) - - candidateLociRef <- c() - candidateLociAlt <- c() - for(i in 1:length(vcf.cases.table)){ - tmp <- vcf.cases.table[[i]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i]) - if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i]) - } - - causativeRef <- c() - causativeAlt <- c() - for(i in 1:length(candidateLociRef)){ - tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - # if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) - if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) - } - - for(i in 1:length(candidateLociAlt)){ - tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) - # if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) +if(is.element(colnames(sampleInfo), "casecontrol")){ + if(ncol(sampleInfo)>1){ + mmCase <- 2 + mmControl <- 5 + vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",] + vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",] + + vcf.cases.table <- apply(vcf.cases,2,table) + vcf.control.table <- apply(vcf.control,2,table) + + nCases <- nrow(vcf.cases) + nControl <- nrow(vcf.control) + + candidateLociRef <- c() + candidateLociAlt <- c() + for(i in 1:length(vcf.cases.table)){ + tmp <- vcf.cases.table[[i]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i]) + if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i]) + } + + causativeRef <- c() + causativeAlt <- c() + for(i in 1:length(candidateLociRef)){ + tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + # if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) + if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) + } + + for(i in 1:length(candidateLociAlt)){ + tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) + # if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) + } } } - ``` Then we identify the following potentially causative mutations: ```{r} -if(ncol(sampleInfo)>1){ - if(length(causativeRef)>0){ - datatable(as.data.frame(causativeRef)) - } else { - cat("No potentially mutations found for homozygeous reference!\n") - } - if(length(causativeAlt)>0){ - datatable(as.data.frame(causativeAlt)) - } else { - cat("No potentially mutations found for homozygeous alternative!\n") +if(is.element(colnames(sampleInfo), "casecontrol")){ + if(ncol(sampleInfo)>1){ + if(length(causativeRef)>0){ + datatable(as.data.frame(causativeRef)) + } else { + cat("No potentially mutations found for homozygeous reference!\n") + } + if(length(causativeAlt)>0){ + datatable(as.data.frame(causativeAlt)) + } else { + cat("No potentially mutations found for homozygeous alternative!\n") + } } } ``` diff --git a/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png b/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png new file mode 100644 index 0000000..e69de29