From 890fdad704ef6963b6091df7c6746a2bdac03490 Mon Sep 17 00:00:00 2001 From: fischuu Date: Mon, 22 Feb 2021 17:40:29 +0200 Subject: [PATCH 01/34] samtools index set to csi instead of bai --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- rules/Module4-ReadAlignment | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2b09ad3..6a29e4e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.1 : * samtools index set to csi instead of bai 0.4.1 : * Bugfix - Missing outputfile script added 0.4 : Release version 0.3.16: * Several other small path typos diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index e3c36f2..701e40e 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.4.1 -version = "0.4.1" +##### Version: 0.5.1 +version = "0.5.1" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index 75a8e95..5d93159 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -52,7 +52,7 @@ else: echo "Number of threads used:" {threads} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ if config["genome"] == "": @@ -182,7 +182,7 @@ rule MockRef_SamToSortedBam: df -h &> {log} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ rule MockRefClusters_SamToSortedBam: @@ -209,7 +209,7 @@ rule MockRefClusters_SamToSortedBam: df -h &> {log} samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} samtools sort {output.bam} -o {output.sorted} - samtools index {output.sorted} + samtools index -c {output.sorted} """ rule MockRef_SortedBamToMpileup: From 73d0942b815c7b496aeab94064e9ba8f5e5f2395 Mon Sep 17 00:00:00 2001 From: Daniel Fischer Date: Mon, 22 Feb 2021 20:57:48 +0200 Subject: [PATCH 02/34] typo fixed --- scripts/workflow-report.Rmd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 7686280..3aea73d 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -43,13 +43,13 @@ knitr::opts_chunk$set(echo = FALSE, fig.align = 'center', fig.height = 5, fig.width = 8.5) mockClusters.file <- "GSC.MR.Clusters.fa" mockReference.file <- "GSC.MR.Genome.fa" -refGenome.file <- basename(refGenome.file) if(!is.element("snakemake",ls())){ projFolder <- "/scratch/project_2001746/BSF" pipelineFolder <- "/scratch/project_2001746/Pipeline-GBS/" pipelineConfig <- "/scratch/project_2001746/BSF/Pipeline-GBS/GBS-pipeline_config-BSF.yaml" refGenome.file <- "hermetiaRef_112020.fasta" } +refGenome.file <- basename(refGenome.file) ifelse(refGenome.file == "", refAvail <- FALSE, refAvail <- TRUE) ``` @@ -163,7 +163,7 @@ getFastQCJSON <- function(path){ The DAG of the used pipeline with rule dependencies. ```{r import workflow, echo=FALSE, fig.cap="Overview of the applied workflow", out.width = '100%'} -if(file.exists(file.path(pipelineFolder,"workflow.png"))) knitr::include_graphics(file.path(pipelineFolder,"workflow.png")) +if(file.exists(file.path(projFolder,"workflow.png"))) knitr::include_graphics(file.path(projFolder,"workflow.png")) ``` @@ -199,7 +199,7 @@ kable_styling(out_html, "striped", position = "left") ``` ## Concatenating stats -As the concatenating is a crucial step in this pipeline, it is important to make this right. As a double check that things went right, we check how +As the concatenating is a crucial step in this pipeline, it is important to make this correct. As a double check that things went right, we check how many raw samples were concatenated per sample. We do this visually. Basically, the height of the bars should correspond to your number of used lanes, normally 1,2 or 4. Sometimes, you might want to concatenate also more files together (e.g. technical replicates), in this case the bar can be also higher. @@ -707,7 +707,7 @@ Total length of the Mock Reference (Used as reference): `r format(sum(nchar(mock ## Theoretical coverage of Mock per sample Given the length of the mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the mock for each sample -Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs. Adjust this later 'to reality'... +Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs. ```{r mock coverage} par(mar=c(10,5,1,1)) @@ -769,7 +769,7 @@ barplot(t(atDist), main="A,C,T,G distribution", col=viridis(20)[8]) ``` ### Assembling statistics -Statistics on how many reads have been assembled (=had a minimum overlap of 10 bases between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap. +Statistics on how many reads have been assembled (=had a minimum overlap of x-bases (see table above) between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap. ```{r assembling stat} tmp <- pearIn[c(startPos+23, startPos+24, startPos+25)] @@ -868,7 +868,7 @@ These are the flagstats from mapping the mock clusters against an existing refer ```{r sam.flagstats} if(refAvail){ -samFlagstats <- read.table(file.path(projFolder,"SAM","mockToRef.samflags")) +samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref","mockToRef.sam.samflags")) colnames(samFlagstats) <- c("Freq.", "Flag") totalLoci <- samFlagstats$V1[samFlagstats$V2==0] + samFlagstats$V1[samFlagstats$V2==16] out_html <- knitr::kable(samFlagstats, "html") From 1eac011c5df0c17ff29f8a91bf87b743a24d905e Mon Sep 17 00:00:00 2001 From: Daniel Fischer Date: Mon, 22 Feb 2021 22:04:35 +0200 Subject: [PATCH 03/34] Script to refine the mock reference added --- CHANGELOG | 1 + GBS-pipeline.smk | 5 ++- GBS-pipeline_config.yaml | 8 +++- rules/Module3-MockReference | 37 +++++++++++++++++++ scripts/refineMockReference.R | 36 ++++++++++++++++++ scripts/workflow-report.Rmd | 30 +++++++++++---- .../visualize concatenation reports-1.png | 0 7 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 scripts/refineMockReference.R create mode 100644 scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png diff --git a/CHANGELOG b/CHANGELOG index 6a29e4e..7289930 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.2 : * Script to refine the mock reference added 0.5.1 : * samtools index set to csi instead of bai 0.4.1 : * Bugfix - Missing outputfile script added 0.4 : Release version diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 701e40e..a4b5a46 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.1 -version = "0.5.1" +##### Version: 0.5.2 +version = "0.5.2" ##### set minimum snakemake version ##### min_version("5.24") @@ -30,6 +30,7 @@ wildcard_constraints: config["genome-bwa-index"] = config["genome"]+".bwt" config["genome-star-index"] = config["project-folder"]+"/references/STAR2.7.3a" config["report-script"] = config["pipeline-folder"]+"/scripts/workflow-report.Rmd" +config["refinement-script"] = config["pipeline-folder"]+"/scripts/refineMockReference.R" config["adapter"]=config["pipeline-folder"]+"/adapter.fa" ##### Singularity container ##### diff --git a/GBS-pipeline_config.yaml b/GBS-pipeline_config.yaml index a251cb7..f971037 100644 --- a/GBS-pipeline_config.yaml +++ b/GBS-pipeline_config.yaml @@ -22,7 +22,13 @@ tmpdir: "/scratch/project_2001746/tmp" enz1: "AATTC" # First restriction enzyme sequence enz2: "GCATG" # Second restriction enzyme sequence libname: "AllReads" # -libtype: "PE" # LIbrary type, paired-end (PE) or single-end (SE) +libtype: "PE" # Library type, paired-end (PE) or single-end (SE) + +# Filter parameters for the mock reference refinement +################################################################################ +mockref: + TotalReadCoverage: 25 # How many overall reads need to be on a cluster to remain in mock reference? + minSampleCoverage: 3 # How many different samples need to have reads on a cluster to remain in mock reference? # Tools specific settings ################################################################################ diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index b1a8e30..754701d 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -176,3 +176,40 @@ else: df -h &> {log} samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.refgenome} {input.bam} > {output} """ + +########################################################## +## +## Mock Reference refinement +## +########################################################## + +rule refine_mock_reference: + """ + Refine the mock reference (R). + """ + input: + clusters="%s/FASTQ/TRIMMED/GSC.MR.Clusters.fa" % (config["project-folder"]), + script=config["refinement-script"], + c="%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]) + output: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + log: + "%s/logs/R/refine_mock_reference.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/R/refine_mock_reference.benchmark.tsv" % (config["project-folder"]) + singularity: + "docker://fischuu/r-gbs:3.6.3-0.2" + params: + projFolder=config["project-folder"], + pipeFolder=config["pipeline-folder"], + minTotalReadCoverage=config["mockref"]["TotalReadCoverage"], + minSampleCoverage=config["mockref"]["minSampleCoverage"] + shell:""" + R -e "projFolder <- '{params.projFolder}'; \ + mockClusters.file <- '{input.clusters}'; \ + minTotalReadCoverage <- '{params.minTotalReadCoverage}'; \ + minSampleCoverage <- '{params.minSampleCoverage}'; \ + mockClusters.refined.file <- '{output}'; \ + snakemake <- TRUE;\ + source('{input.script}')" &> {log} + """ diff --git a/scripts/refineMockReference.R b/scripts/refineMockReference.R new file mode 100644 index 0000000..217a2cf --- /dev/null +++ b/scripts/refineMockReference.R @@ -0,0 +1,36 @@ +# This script imports the mock reference coverage values, analyses it and then created a final, refined mock reference file +if(!is.element("snakemake",ls())){ + mockClusters.file <- "GSC.MR.Clusters.fa" + projFolder <- "/scratch/project_2001746/BSF" + minTotalReadCoverage <- 25 + minSampleCoverage <- 3 + mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa" +} + +# Import the coverage + clusterFiles <- list.files(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters"), pattern="*.coverage") + clusterCoverage <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[1])) + names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) + + for(i in 2:length(clusterFiles)){ + tmp <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[i])) + names(tmp)[1:2] <- c("cluster", clusterFiles[i]) + clusterCoverage <- merge(clusterCoverage, tmp, by="cluster") + } + +# Get the total coverage + totalCoverage <- apply(clusterCoverage[,-1],1,sum) + +# Filter the coverage + clusterCoverage.refined <- clusterCoverage[totalCoverage>minTotalReadCoverage,] + clusterCoverage.refined <- clusterCoverage.refined[apply(clusterCoverage.refined[,-1]>0,1,sum)>=minSampleCoverage,] + +# Import the initial mock clusters + mockClusters <- importFA(file.path(mockClusters.file)) + +# Refine the mock clusters + mockClusters.refined <- mockClusters[is.element(gsub(">","",names(mockClusters)), clusterCoverage.refined[,1])] + +# Export the refined mock reference + exportFA(mockClusters.refined, file=mockClusters.refined.file) + \ No newline at end of file diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 6e7f85c..c04af5e 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -679,7 +679,7 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"), y=file.path(projFolder,"QC","TRIMMED","multiqc_R1"), feature="fastqc_per_base_sequence_quality_plot") ``` -# Mock Reference +# Preliminary Mock Reference ## Basic stats ```{r import required files, warning=FALSE, } @@ -843,6 +843,22 @@ barplot(outQual, ylab="Frequency", main="Quality-score distribution") ## Ratio samles used to length Mock reference How much were the reads merged down? Length of Mock reference vs the length of all used reads together. +# Mock reference + +## Basic stats +```{r import required files, warning=FALSE, } +finalMockClusters <- importFA(file.path(projFolder,"MockReference","MockReference.fa")) +``` + +Basic information for the final mock reference: +```{r mock build stats} + out <- summary(finalMockClusters) + out[,2] <- formatC(out[,2], format="d", big.mark=",") + out_html <- knitr::kable(out, col.names = NULL, "html") + kable_styling(out_html, "striped", position = "left") +``` + + # Existing Reference In case a reference genome was provided, here are a few statistics regarding it @@ -869,7 +885,7 @@ if(refAvail){ ## Mock vs reference These are the flagstats from mapping the mock clusters against an existing reference genome. Typical flag stats are: -* 0: Read aligned aginst forward strand +* 0: Read aligned against forward strand * 16: Read aligned against reverse strand * 2048: Supplementary alignment forward * 2064: Supplementary alignment reverse @@ -877,11 +893,7 @@ These are the flagstats from mapping the mock clusters against an existing refer ```{r samflags} if(refAvail){ -<<<<<<< HEAD -samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref","mockToRef.sam.samflags")) -======= samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref", "mockToRef.sam.samflags")) ->>>>>>> 890fdad704ef6963b6091df7c6746a2bdac03490 colnames(samFlagstats) <- c("Freq.", "Flag") totalLoci <- samFlagstats$V1[samFlagstats$V2==0] + samFlagstats$V1[samFlagstats$V2==16] out_html <- knitr::kable(samFlagstats, "html") @@ -906,7 +918,7 @@ totalBases <- sum(abs(mergedLoci$V3 - mergedLoci$V2)) ``` However, in total we have `r nrow(unmergedLoci)` mapping loci that can be merged down (by intersection of the bed) to `r nrow(mergedLoci)` if their chromosomal location is intersected. -Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be stricted merged. +Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be merged more strict. In total `r formatC(totalBases, big.mark=",")` bases are covered in the reference genome from mock cluster sequences. @@ -1000,7 +1012,6 @@ datatable(tmp) WE COULD COUNT HERE STILL HOW MANY SEQUENCES MATCH THE GIVEN ONE WITH ONE SUBSTITUTION ALLOWED. - # Alignment ## Mock Reference @@ -1149,6 +1160,9 @@ for(i in 3:ncol(clusterCoverage)){ ### Cluster coverage Mock samples vs others ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES +## Refind Mock reference +ADD THIS STILL TO THE REPORT!!! + ## Reference genome Import the mapping statistics to the reference genome diff --git a/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png b/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png new file mode 100644 index 0000000..e69de29 From 74f85ae1911c960d0b13abcf7cf69e56c3c18322 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:29:31 +0200 Subject: [PATCH 04/34] Rule all rule simplified --- CHANGELOG | 1 + GBS-pipeline.smk | 56 ++++++++++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 7289930..bf2f888 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.3 : * Rule all rule simplified 0.5.2 : * Script to refine the mock reference added 0.5.1 : * samtools index set to csi instead of bai 0.4.1 : * Bugfix - Missing outputfile script added diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index a4b5a46..51cbd57 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.2 -version = "0.5.2" +##### Version: 0.5.3 +version = "0.5.3" ##### set minimum snakemake version ##### min_version("5.24") @@ -81,44 +81,44 @@ print("######################################################################### rule all: input: # OUTPUT: PREPARATION MODULE - expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), # # QC OF RAW AND CONCATENATED FILES "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), # OUTPUT STEP 2 - expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 2b - expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 4 - "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), - "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), - "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]), +# "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), +# "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), +# "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]), # OUTPUT STEP 5 - expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), - expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), +# expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), +# expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples), # OUTPUT STEP 6 - "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]), - "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]), - expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples), +# "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]), +# "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]), +# expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples), # OUTPUT STEP 7 - "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), - "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), +# "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), +# "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), # OUTPUT STEP 8 - "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]), - "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), - "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), - "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), +# "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]), +# "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), +# "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), +# "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), # OUTPUT STEP 9 - "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), +# "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check "%s/finalReport.html" % (config["project-folder"]) From 4e93d63f941e5d5cb9a2bb652ed6fc8e136b08cb Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:40:30 +0200 Subject: [PATCH 05/34] Index final mock reference added --- CHANGELOG | 1 + GBS-pipeline.smk | 4 +++- GBS-pipeline_server-config.yaml | 9 +++++++++ rules/Module0-PreparationsAndIndexing | 18 ++++++++++++++++++ rules/Module3-MockReference | 2 +- 5 files changed, 32 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index bf2f888..2c4c5ec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.4 : * Index final mock reference added 0.5.3 : * Rule all rule simplified 0.5.2 : * Script to refine the mock reference added 0.5.1 : * samtools index set to csi instead of bai diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 51cbd57..0ff6ced 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -120,7 +120,9 @@ rule all: # OUTPUT STEP 9 # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check - "%s/finalReport.html" % (config["project-folder"]) + #"%s/finalReport.html" % (config["project-folder"]), + "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]) + ### setup report ##### diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 6156a07..0ecaa48 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -46,6 +46,12 @@ IndexClustersSamtools: time: 02:00:00 job-name: IndexMockRefClusters_Samtools mem-per-cpu: 25000 + +IndexFinalMockBWA: + time: 02:00:00 + job-name: IndexFinalMockBWA + mem-per-cpu: 25000 + # Module 1 - QC: ################################################################################ @@ -134,6 +140,9 @@ MockVsRef_SortedBamToMpileup: time: 05:00:00 job-name: MockVsRef_SortedBamToMpileup +refine_mock_reference: + job-name: refine_mock_reference + # Module 4 - Read alignment ################################################################################ RefGenome_AlignReads: diff --git a/rules/Module0-PreparationsAndIndexing b/rules/Module0-PreparationsAndIndexing index bd35309..8b86958 100644 --- a/rules/Module0-PreparationsAndIndexing +++ b/rules/Module0-PreparationsAndIndexing @@ -142,3 +142,21 @@ rule IndexClustersSamtools: shell:""" samtools faidx {input} """ + +rule IndexFinalMockBWA: + """ + Index the final Mock Reference Genome (BWA). + """ + input: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]) + log: + "%s/logs/BWA/IndexFinalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BWA/IndexFinalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + bwa index -a bwtsw {input} + """ diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index 754701d..544a92f 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -190,7 +190,7 @@ rule refine_mock_reference: input: clusters="%s/FASTQ/TRIMMED/GSC.MR.Clusters.fa" % (config["project-folder"]), script=config["refinement-script"], - c="%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]) + c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples) output: "%s/MockReference/MockReference.fa" % (config["project-folder"]) log: From 16fddbb4def509b72eacf5bdece72f2b84345288 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:43:14 +0200 Subject: [PATCH 06/34] Align reads to final mock reference added --- CHANGELOG | 1 - GBS-pipeline.smk | 2 +- rules/Module4-ReadAlignment | 27 +++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2c4c5ec..bf2f888 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,7 +4,6 @@ ##### ##### ################################################################################ -0.5.4 : * Index final mock reference added 0.5.3 : * Rule all rule simplified 0.5.2 : * Script to refine the mock reference added 0.5.1 : * samtools index set to csi instead of bai diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 0ff6ced..d05b8ee 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -121,7 +121,7 @@ rule all: # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check #"%s/finalReport.html" % (config["project-folder"]), - "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]) + expand("%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]), samples=samples) diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index 5d93159..a2f1063 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -278,3 +278,30 @@ rule MockRefClusters_AlignmentStats: samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} """ +################################################################################ +## +## Process the final Mock + +rule FinalMockRef_AlignReads: + """ + Align reads to the final mock reference Genome. + """ + input: + reference="%s/MockReference/MockReference.fa" % (config["project-folder"]), + refFiles="%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]), + R1="%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), + R2="%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), + output: + temp("%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"])) + log: + "%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BWA/FinalMockRef_AlignReads.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5b"]["threads"], + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + bwa mem -t {params.threads} -M {input.reference} {input.R1} {input.R2} > {output} 2> {log} + """ From 53753e08865338244c0d20c7b9df48c0e66f8b62 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:48:39 +0200 Subject: [PATCH 07/34] sam to sorted bam for final mock alignments added --- GBS-pipeline.smk | 2 +- GBS-pipeline_server-config.yaml | 14 ++++++++++++++ rules/Module4-ReadAlignment | 28 ++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index d05b8ee..dcd18ea 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -121,7 +121,7 @@ rule all: # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check #"%s/finalReport.html" % (config["project-folder"]), - expand("%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]), samples=samples) + expand("%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), samples=samples) diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 0ecaa48..571b19c 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -209,6 +209,20 @@ MockRefClusters_AlignmentStats: cpus-per-task: 20 mem-per-cpu: 5000 +FinalMockRef_AlignReads: + time: 01:00:00 + job-name: FinalMockRef_AlignReads + cpus-per-task: 20 + mem-per-cpu: 5000 + +FinalMockRef_SamToSortedBam: + time: 01:00:00 + job-name: FinalMockRef_SamToSortedBam + cpus-per-task: 20 + mem-per-cpu: 5000 + + + # Module 5 - Call Variants ################################################################################ diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index a2f1063..148c6c1 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -305,3 +305,31 @@ rule FinalMockRef_AlignReads: df -h &> {log} bwa mem -t {params.threads} -M {input.reference} {input.R1} {input.R2} > {output} 2> {log} """ + +rule FinalMockRef_SamToSortedBam: + """ + Sam to sorted bam for final mock reference (SAMTOOLS) + """ + input: + "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]) + output: + bam=temp("%s/BAM/alignments_finalMock/{samples}.bam" % (config["project-folder"])), + sorted="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]) + log: + "%s/logs/Samtools/FinalMockRef_SamToSortedBam.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockRef_SamToSortedBam.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5d"]["threads"], + q=config["params"]["step5d"]["q"], + f=config["params"]["step5d"]["f"], + F=config["params"]["step5d"]["F"] + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam} + samtools sort {output.bam} -o {output.sorted} + samtools index -c {output.sorted} + """ + From 745fb7f281d54d85461403b93af873ecefb34064 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:52:57 +0200 Subject: [PATCH 08/34] final mock mpileup added --- GBS-pipeline.smk | 2 +- rules/Module4-ReadAlignment | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index dcd18ea..d09d355 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -121,7 +121,7 @@ rule all: # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check #"%s/finalReport.html" % (config["project-folder"]), - expand("%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), samples=samples) + expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples) diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index 148c6c1..cd0669c 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -333,3 +333,45 @@ rule FinalMockRef_SamToSortedBam: samtools index -c {output.sorted} """ +rule FinalMockRef_SortedBamToMpileup: + """ + Get Mpileup for final Mock Reference Genome. + """ + input: + bam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), + reference="%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + bam="%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]) + log: + "%s/logs/Samtoools/FinalMockRef_SortedBamToMpileup.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockRef_SortedBamToMpileup.{samples}.benchmark.tsv" % (config["project-folder"]) + params: + threads=config["params"]["step5d"]["threads"], + Q=config["params"]["step5d"]["Q"], + q=config["params"]["step5d"]["q"] + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.reference} {input.bam} > {output} + """ + + +rule finalMockRef_AlignmentStats: + """ + Alignment stats for reads mapped to mock ref + """ + input: + "%s/FASTQ/TRIMMED/alignments/{samples}.sam" % (config["project-folder"]) + output: + "%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]) + log: + "%s/logs/Samtools/MockRef_AlignmentStats.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/MockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"]) + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + samtools flagstat {input} > {output} + """ From ecccca781a220a29dd26621ff0fbe2b29bf9aeac Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:55:33 +0200 Subject: [PATCH 09/34] final mock alignment stats added --- GBS-pipeline.smk | 2 +- rules/Module4-ReadAlignment | 10 +++++----- rules/Module7-Reporting | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index d09d355..5680983 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -120,7 +120,7 @@ rule all: # OUTPUT STEP 9 # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check - #"%s/finalReport.html" % (config["project-folder"]), + "%s/finalReport.html" % (config["project-folder"]), expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples) diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index cd0669c..68c52a9 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -360,16 +360,16 @@ rule FinalMockRef_SortedBamToMpileup: rule finalMockRef_AlignmentStats: """ - Alignment stats for reads mapped to mock ref + Alignment stats for reads mapped to final mock ref """ input: - "%s/FASTQ/TRIMMED/alignments/{samples}.sam" % (config["project-folder"]) + "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]) output: - "%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]) + "%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]) log: - "%s/logs/Samtools/MockRef_AlignmentStats.{samples}.log" % (config["project-folder"]) + "%s/logs/Samtools/FinalMockRef_AlignmentStats.{samples}.log" % (config["project-folder"]) benchmark: - "%s/benchmark/Samtools/MockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"]) + "%s/benchmark/Samtools/FinalMockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"]) conda:"envs/samtools.yaml" singularity: config["singularity"]["gbs"] shell:""" diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index 666f836..414469a 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -14,7 +14,8 @@ rule R_finalReport: faRef="%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), qdRaw=expand("%s/QC/RAW/{rawsamples}_R1_qualdist.txt" % (config["project-folder"]), rawsamples=rawsamples), qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), - qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples) + qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), + fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples) output: "%s/finalReport.html" % (config["project-folder"]) log: From 7738ff6fd75facf59d4446b4d407cf06ecad94f1 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 06:57:41 +0200 Subject: [PATCH 10/34] final mock sam index added --- rules/Module0-PreparationsAndIndexing | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/rules/Module0-PreparationsAndIndexing b/rules/Module0-PreparationsAndIndexing index 8b86958..a98c567 100644 --- a/rules/Module0-PreparationsAndIndexing +++ b/rules/Module0-PreparationsAndIndexing @@ -160,3 +160,21 @@ rule IndexFinalMockBWA: shell:""" bwa index -a bwtsw {input} """ + +rule IndexFinalMockSamtools: + """ + Index the final Mock Reference Genome Clusters (samtools). + """ + input: + "%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/MockReference/MockReference.fai" % (config["project-folder"]) + log: + "%s/logs/Samtools/IndexFinalMockSam.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/IndexFinalMockSam.benchmark.tsv" % (config["project-folder"]) + conda:"envs/samtools.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + samtools faidx {input} + """ \ No newline at end of file From fd438cc603651b0ceb0bde240f597fab156acb8f Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:03:14 +0200 Subject: [PATCH 11/34] Create count files for final mock --- GBS-pipeline.smk | 2 +- GBS-pipeline_server-config.yaml | 21 ++++++++++++++++++++- rules/Module5-CallVariants | 29 +++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 5680983..238cebc 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -121,7 +121,7 @@ rule all: # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check "%s/finalReport.html" % (config["project-folder"]), - expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples) + expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples) diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 571b19c..86a6982 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -52,7 +52,11 @@ IndexFinalMockBWA: job-name: IndexFinalMockBWA mem-per-cpu: 25000 - +IndexFinalMockSamtools: + time: 02:00:00 + job-name: IndexFinalMockSamtools + mem-per-cpu: 25000 + # Module 1 - QC: ################################################################################ fastqc_quality_control_raw_data: @@ -221,7 +225,17 @@ FinalMockRef_SamToSortedBam: cpus-per-task: 20 mem-per-cpu: 5000 +FinalMockRef_SortedBamToMpileup: + time: 01:00:00 + job-name: FinalMockRef_SortedBamToMpileup + cpus-per-task: 20 + mem-per-cpu: 5000 +finalMockRef_AlignmentStats: + time: 01:00:00 + job-name: FinalMockRef_AlignmentStats + cpus-per-task: 20 + mem-per-cpu: 5000 # Module 5 - Call Variants ################################################################################ @@ -298,6 +312,11 @@ CreateVCF_reference: job-name: CreateVCF_reference mem-per-cpu: 30000 +ParseMpileup_createCountFiles_finalMock: + time: 05:00:00 + job-name: createCountFiles_finalmock + mem-per-cpu: 16000 + # Module 6 - Postprocessing ################################################################################ Ref_getVariantFlanking: diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index c9b96dd..de08791 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -421,3 +421,32 @@ else: perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log} cd ../.. """ + +################################################################################ +## +## Process the final mock + +rule ParseMpileup_createCountFiles_finalMock: + """ + Parse mpileup outputs and create count/ref files for final mock + """ + input: + "%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]) + output: + co="%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), + ref="%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]) + params: + p=config["params"]["step6"]["p"], + wd="%s/FASTQ/TRIMMED" % config["project-folder"], + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.benchmark.tsv" % (config["project-folder"]) + threads: 1 + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_1.pl -b {input} -p {params.p} &> {log} + """ \ No newline at end of file From a4d75f1446f94517fd7bfefc82103d325040e186 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:11:15 +0200 Subject: [PATCH 12/34] merge counts aded --- GBS-pipeline.smk | 3 ++- GBS-pipeline_server-config.yaml | 6 ++++++ rules/Module5-CallVariants | 26 +++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 238cebc..46458a1 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -121,7 +121,8 @@ rule all: # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check "%s/finalReport.html" % (config["project-folder"]), - expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples) + # expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples) + "%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 86a6982..c927baa 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -316,6 +316,12 @@ ParseMpileup_createCountFiles_finalMock: time: 05:00:00 job-name: createCountFiles_finalmock mem-per-cpu: 16000 + +create_verticalRef_finalMock: + time: 01:00:00 + job-name: create_verticalRef_finalMock + mem-per-cpu: 16000 + nvme: 20 # Module 6 - Postprocessing ################################################################################ diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index de08791..6e0c617 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -449,4 +449,28 @@ rule ParseMpileup_createCountFiles_finalMock: shell:""" cd {params.wd} perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_1.pl -b {input} -p {params.p} &> {log} - """ \ No newline at end of file + """ + +rule create_verticalRef_finalMock: + """ + Merge the separate ref files for final mock + """ + input: + co=expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples), + ref=expand("%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]), samples=samples) + output: + vref="%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"]), + co="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) + log: + "%s/logs/BASH/create_verticalRef_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BASH/create_verticalRef_finalMock.benchmark.tsv" % (config["project-folder"]) + params: + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + threads: 1 + shell:""" + sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp; + sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref}; + #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} + ls {params.wd}/*count.txt > {output.co} + """ \ No newline at end of file From 2060af19a8c4e67baa19a26e8f44994cb7de2dc8 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:35:28 +0200 Subject: [PATCH 13/34] Variant calling added --- GBS-pipeline.smk | 2 +- GBS-pipeline_server-config.yaml | 26 ++++++ rules/Module5-CallVariants | 154 +++++++++++++++++++++++++++++++- 3 files changed, 179 insertions(+), 3 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 46458a1..528192c 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -122,7 +122,7 @@ rule all: # Quality check "%s/finalReport.html" % (config["project-folder"]), # expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples) - "%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) + "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index c927baa..4701d55 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -268,6 +268,9 @@ cut_verticalRef: cut_verticalRef_reference: job-name: cut_verticalRef_reference +cut_verticalRef_finalMock: + job-name: cut_verticalRef_finalMock + create_MasterMatrix_parallel: time: 0-12:00:00 job-name: create_MasterMatrix_parallel @@ -280,6 +283,13 @@ create_MasterMatrix_parallel_reference: mem-per-cpu: 64000 nvme: 20 +create_MasterMatrix_parallel_finalMock: + time: 0-12:00:00 + job-name: create_MasterMatrix_parallel_finalMock + mem-per-cpu: 64000 + nvme: 20 + + aggregate_MasterMatrix: job-name: aggregate_MasterMatrix time: 0-12:00:00 @@ -291,6 +301,12 @@ aggregate_MasterMatrix_reference: time: 0-12:00:00 mem-per-cpu: 48000 nvme: 20 + +aggregate_MasterMatrix_finalMock: + job-name: aggregate_MasterMatrix_finalMock + time: 0-12:00:00 + mem-per-cpu: 48000 + nvme: 20 FilterVariants: time: 1-12:00:00 @@ -302,6 +318,11 @@ FilterVariants_reference: job-name: filterVariants_reference mem-per-cpu: 30000 +FilterVariants_finalMock: + time: 1-12:00:00 + job-name: filterVariants_finalMock + mem-per-cpu: 30000 + CreateVCF: time: 01:00:00 job-name: CreateVCF @@ -312,6 +333,11 @@ CreateVCF_reference: job-name: CreateVCF_reference mem-per-cpu: 30000 +CreateVCF_finalMock: + time: 01:00:00 + job-name: CreateVCF_finalMock + mem-per-cpu: 30000 + ParseMpileup_createCountFiles_finalMock: time: 05:00:00 job-name: createCountFiles_finalmock diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index 6e0c617..208adcf 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -394,7 +394,7 @@ else: filtered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), unfiltered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) output: - "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]) + "%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"]) params: barcodes=config["barcodes"], genoMatrix=config["params"]["step8"]["genoMatrix"], @@ -402,6 +402,8 @@ else: unfilteredOut=config["params"]["step8"]["unfiltered"], format=config["params"]["step8"]["format"], wd="%s/MPILEUP/mpileup_reference" % config["project-folder"], + outdir="%s/VCF" % config["project-folder"], + pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]), pipefolder=config["pipeline-folder"] log: "%s/logs/Perl/CreateVCF_reference.log" % (config["project-folder"]) @@ -419,6 +421,7 @@ else: cd {params.wd} perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log} perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log} + cp {params.pipeout} {output} cd ../.. """ @@ -473,4 +476,151 @@ rule create_verticalRef_finalMock: sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref}; #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} ls {params.wd}/*count.txt > {output.co} - """ \ No newline at end of file + """ + +checkpoint cut_verticalRef_finalMock: + """ + Divide the input verticalRef-file for parallel processing in finalMock + """ + input: + "%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"]) + output: + directory("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/" % (config["project-folder"])) + params: + out="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos." % (config["project-folder"]), + split=1000000 + shell:""" + mkdir -p {output} + split -l {params.split} --numeric-suffixes {input} {params.out} + """ + +rule create_MasterMatrix_parallel_finalMock: + """ + Process verticalRef parallel to create MasterMatrix for final mock + """ + input: + verRef="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos.{i}" % (config["project-folder"]), + counts="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"]) + output: + "%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"]) + log: + "%s/logs/BASH/create_MasterMatrix_parallel_finalMock_{i}.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/BASH/create_MasterMatrix_parallel_finalMock.benchmark_{i}.tsv" % (config["project-folder"]) + params: + p=config["params"]["step6"]["p"], + wd="%s/MPILEUP/mpileup_finalMock/VerticalRefPos" % config["project-folder"], + pipefolder=config["pipeline-folder"] + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_2.pl -in {input.verRef} -count {input.counts} -out {output} -p {params.p} &> {log} + """ + +def aggregate_inputMasterMatrix_finalMock(wildcards): + """ + Aggregate the input object for the final MasterMatrix for finalMock + """ + checkpoint_outputMM = checkpoints.cut_verticalRef_finalMock.get(**wildcards).output[0] + return expand("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"]), + i=glob_wildcards(os.path.join(checkpoint_outputMM, "VerticalRefPos.{i}")).i) + +rule aggregate_MasterMatrix_finalMock: + input: + aggregate_inputMasterMatrix_finalMock + output: + "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"]) + shell:""" + cat {input} > {output} + """ + +rule FilterVariants_finalMock: + """ + Filter variants and call genotypes using the final mock. + """ + input: + "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"]) + output: + filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) + params: + input=config["params"]["step7"]["input"], + out=config["params"]["step7"]["out"], + unfiltered=config["params"]["step7"]["unfiltered"], + p=config["params"]["step7"]["p"], + mnHoDepth0=config["params"]["step7"]["mnHoDepth0"], + mnHoDepth1=config["params"]["step7"]["mnHoDepth1"], + mnHetDepth=config["params"]["step7"]["mnHetDepth"], + altStrength=config["params"]["step7"]["altStrength"], + mnAlleleRatio=config["params"]["step7"]["mnAlleleRatio"], + mnCall=config["params"]["step7"]["mnCall"], + mnAvgDepth=config["params"]["step7"]["mnAvgDepth"], + mxAvgDepth=config["params"]["step7"]["mxAvgDepth"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/FilterVariants_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/FilterVariants_finalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + #******PARAMETERS***** + # -in Discovery master matrix input file. The output from last step File GSC.MasterMatrix.txt + # -out Genotyping matrix for the population Output file GSC.GenoMatrix.txt + # -p Identify SNPs only (snp) or SNPs + indels (indel) String -- + # -mnHoDepth0 Minimum depth required for calling a homozygote when the alternative allele depth = 0 Numeric 5 + # -mnHoDepth1 Minimum depth required for calling a homozygote when the alternative allele depth = 1 Numeric 20 + # -mnHetDepth Minimum depth required for each allele when calling a heterozygote Numeric 3 + # -altStrength Across the population for a given putative bi-allelic variant, this alternate allele strength parameter is the minimum proportion of non-primary allele reads that are the secondary allele Numeric 0.8 + # -mnAlleleRatio Minimum required ratio of less frequent allele depth to more frequent allele depth Numeric 0.25 + # -mnCall Minimum acceptable proportion of genotyped individuals to retain a variant Numeric 0.75 + # -mnAvgDepth Minimum average depth of an acceptable variant Numeric 3 + # -mxAvgDepth Maximum average depth of an acceptable variant Numeric 200 + + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.out} -p {params.p} -mnHoDepth0 {params.mnHoDepth0} -mnHoDepth1 {params.mnHoDepth1} -mnHetDepth {params.mnHetDepth} -altStrength {params.altStrength} -mnAlleleRatio {params.mnAlleleRatio} -mnCall {params.mnCall} -mnAvgDepth {params.mnAvgDepth} -mxAvgDepth {params.mxAvgDepth} &> {log} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.unfiltered} -p {params.p} -mnHoDepth0 0 -mnHoDepth1 0 -mnHetDepth 0 -altStrength 0 -mnAlleleRatio 0 -mnCall 0 -mnAvgDepth 0 -mxAvgDepth 1000000 &> {log} + """ + +rule CreateVCF_finalMock: + """ + Create the VCF output of the pipeline for the final mock. + """ + input: + filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"]) + output: + "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), + params: + barcodes=config["barcodes"], + genoMatrix=config["params"]["step8"]["genoMatrix"], + out=config["params"]["step8"]["out"], + unfilteredOut=config["params"]["step8"]["unfiltered"], + format=config["params"]["step8"]["format"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], + outdir="%s/VCF" % config["project-folder"], + pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]), + pipefolder=config["pipeline-folder"] + log: + "%s/logs/Perl/CreateVCF_finalMock.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Perl/CreateVCF_finalMock.benchmark.tsv" % (config["project-folder"]) + conda:"envs/gbs.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + #******PARAMETERS***** + # -in Final genotyping matrix input file File GSC.GenoMatrix.txt + # -out Output label name without extension String GSC + # -b BarcodeID file name See Appendix A File -- + # -formats The name(s) (R, Tassel, Plink, vcf, H) for which the GBS-SNP-CROP final genotyping matrix should be converted. If more than one format is desired, the names should be separated by commas without any space, as in the examples shown below String R,T,P,V,H + + mkdir -p {params.outdir} + cd {params.wd} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log} + perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log} + + cp {params.pipeout} {output} + cd ../.. + """ \ No newline at end of file From 10fdf79d8680adc1d68cc0facce3773fae7616bc Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:37:57 +0200 Subject: [PATCH 14/34] main rule sompified again --- GBS-pipeline.smk | 4 +--- rules/Module7-Reporting | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 528192c..65147fc 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -120,9 +120,7 @@ rule all: # OUTPUT STEP 9 # "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check - "%s/finalReport.html" % (config["project-folder"]), - # expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples) - "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), + "%s/finalReport.html" % (config["project-folder"]) diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index 414469a..10d915b 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -15,7 +15,8 @@ rule R_finalReport: qdRaw=expand("%s/QC/RAW/{rawsamples}_R1_qualdist.txt" % (config["project-folder"]), rawsamples=rawsamples), qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), - fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples) + fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]) output: "%s/finalReport.html" % (config["project-folder"]) log: From 85f44992412719ab7c2fe57f63abd76ba0579850 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:38:48 +0200 Subject: [PATCH 15/34] Variant calling for final mock added --- CHANGELOG | 1 + GBS-pipeline.smk | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index bf2f888..b34b498 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.4 : * Variant calling for final mock added 0.5.3 : * Rule all rule simplified 0.5.2 : * Script to refine the mock reference added 0.5.1 : * samtools index set to csi instead of bai diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 65147fc..e7eae19 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.3 -version = "0.5.3" +##### Version: 0.5.4 +version = "0.5.4" ##### set minimum snakemake version ##### min_version("5.24") @@ -68,7 +68,7 @@ print("##### barcodes-file : "+ config["barcodes"]) print("##### rawsample file : "+ config["rawsamples"]) print("#####") print("##### Derived runtime parameters") -print("##### --------------------------------") +print("##### --------------------------------")Variant calling for final mock added print("##### BWA-Genome index : "+config["genome-bwa-index"]) print("##### STAR-Genome index : "+config["genome-star-index"]) print("##### Adapter file : "+ config["adapter"]) From 8a56a38ae797d2403c4708ba34410020a9994567 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:41:58 +0200 Subject: [PATCH 16/34] Missing library dependency added --- GBS-pipeline.smk | 2 +- scripts/refineMockReference.R | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index e7eae19..7261930 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -68,7 +68,7 @@ print("##### barcodes-file : "+ config["barcodes"]) print("##### rawsample file : "+ config["rawsamples"]) print("#####") print("##### Derived runtime parameters") -print("##### --------------------------------")Variant calling for final mock added +print("##### --------------------------------") print("##### BWA-Genome index : "+config["genome-bwa-index"]) print("##### STAR-Genome index : "+config["genome-star-index"]) print("##### Adapter file : "+ config["adapter"]) diff --git a/scripts/refineMockReference.R b/scripts/refineMockReference.R index 217a2cf..4e97ff2 100644 --- a/scripts/refineMockReference.R +++ b/scripts/refineMockReference.R @@ -1,11 +1,14 @@ +# Load required libraries + library("GenomicTools") + # This script imports the mock reference coverage values, analyses it and then created a final, refined mock reference file -if(!is.element("snakemake",ls())){ - mockClusters.file <- "GSC.MR.Clusters.fa" - projFolder <- "/scratch/project_2001746/BSF" - minTotalReadCoverage <- 25 - minSampleCoverage <- 3 - mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa" -} + if(!is.element("snakemake",ls())){ + mockClusters.file <- "GSC.MR.Clusters.fa" + projFolder <- "/scratch/project_2001746/BSF" + minTotalReadCoverage <- 25 + minSampleCoverage <- 3 + mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa" + } # Import the coverage clusterFiles <- list.files(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters"), pattern="*.coverage") From 350c1bb2ba951dbef04f7c39864f025bb2638e0d Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 23 Feb 2021 07:59:58 +0200 Subject: [PATCH 17/34] typo fixed --- rules/Module5-CallVariants | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index 208adcf..abfec49 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -440,7 +440,7 @@ rule ParseMpileup_createCountFiles_finalMock: ref="%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]) params: p=config["params"]["step6"]["p"], - wd="%s/FASTQ/TRIMMED" % config["project-folder"], + wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"], pipefolder=config["pipeline-folder"] log: "%s/logs/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.log" % (config["project-folder"]) From 442d5535c85e31edc9d6a86d9fc3b47d5576f329 Mon Sep 17 00:00:00 2001 From: fischuu Date: Thu, 25 Feb 2021 20:53:18 +0200 Subject: [PATCH 18/34] asd --- scripts/workflow-report.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index c04af5e..23970a9 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -846,12 +846,12 @@ How much were the reads merged down? Length of Mock reference vs the length of a # Mock reference ## Basic stats -```{r import required files, warning=FALSE, } +```{r import required files mock, warning=FALSE, } finalMockClusters <- importFA(file.path(projFolder,"MockReference","MockReference.fa")) ``` Basic information for the final mock reference: -```{r mock build stats} +```{r final mock build stats} out <- summary(finalMockClusters) out[,2] <- formatC(out[,2], format="d", big.mark=",") out_html <- knitr::kable(out, col.names = NULL, "html") From 4fbe5aeb5637db6002748b37914f4a8f32fe49d4 Mon Sep 17 00:00:00 2001 From: fischuu Date: Mon, 1 Mar 2021 15:40:21 +0200 Subject: [PATCH 19/34] Bugfix - wrong reference vcf address in module 6 --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- rules/Module6-PostProcessing | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b34b498..3e0766d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.5 : * Bugfix - wrong reference vcf address in module 6 0.5.4 : * Variant calling for final mock added 0.5.3 : * Rule all rule simplified 0.5.2 : * Script to refine the mock reference added diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 7261930..c515f5f 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.4 -version = "0.5.4" +##### Version: 0.5.5 +version = "0.5.5" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/rules/Module6-PostProcessing b/rules/Module6-PostProcessing index 5a4c6c9..285e32c 100644 --- a/rules/Module6-PostProcessing +++ b/rules/Module6-PostProcessing @@ -33,7 +33,7 @@ else: Get the flanking sequences from identified variants for reference genome. """ input: - vcf="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), + vcf="%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"]), genome=config["genome"] output: fl="%s/MPILEUP/mpileup_reference/GSC.vcf.flanking" % (config["project-folder"]), From fb09f2ffae6c4d06af4baaf65ac39c3fc80db9bb Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 2 Mar 2021 09:51:36 +0200 Subject: [PATCH 20/34] Bugfix - wrong path for reference based vcf fixed --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- rules/Module5-CallVariants | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 3e0766d..6aa052c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.6 : * Bugfix - wrong path for reference based vcf fixed 0.5.5 : * Bugfix - wrong reference vcf address in module 6 0.5.4 : * Variant calling for final mock added 0.5.3 : * Rule all rule simplified diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index c515f5f..65fedb0 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.5 -version = "0.5.5" +##### Version: 0.5.6 +version = "0.5.6" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index abfec49..e01a63b 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -403,7 +403,7 @@ else: format=config["params"]["step8"]["format"], wd="%s/MPILEUP/mpileup_reference" % config["project-folder"], outdir="%s/VCF" % config["project-folder"], - pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]), + pipeout="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), pipefolder=config["pipeline-folder"] log: "%s/logs/Perl/CreateVCF_reference.log" % (config["project-folder"]) From 6d04c708aff2fe360628315688bbf748f555de14 Mon Sep 17 00:00:00 2001 From: fischuu Date: Wed, 3 Mar 2021 09:08:31 +0200 Subject: [PATCH 21/34] Dependencies between the rules improved --- CHANGELOG | 1 + GBS-pipeline.smk | 60 +++++++++++++++++++++-------------------- rules/Module7-Reporting | 7 ++++- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 6aa052c..aa31f0e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.7 : * Dependencies between the rules improved 0.5.6 : * Bugfix - wrong path for reference based vcf fixed 0.5.5 : * Bugfix - wrong reference vcf address in module 6 0.5.4 : * Variant calling for final mock added diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 65fedb0..cee143e 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.6 -version = "0.5.6" +##### Version: 0.5.7 +version = "0.5.7" ##### set minimum snakemake version ##### min_version("5.24") @@ -81,45 +81,47 @@ print("######################################################################### rule all: input: # OUTPUT: PREPARATION MODULE -# expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), -# # QC OF RAW AND CONCATENATED FILES + expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), + # QC OF RAW AND CONCATENATED FILES "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), # OUTPUT STEP 2 -# expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 2b -# expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 4 -# "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), -# "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), -# "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]), + "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), + "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), + "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]), # OUTPUT STEP 5 -# expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), -# expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), -# expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples), + expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples), # OUTPUT STEP 6 -# "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]), -# "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]), -# expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples), + "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]), + "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]), + expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples), # OUTPUT STEP 7 -# "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), -# "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), + "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]), # OUTPUT STEP 8 -# "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]), -# "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), -# "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), -# "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), + "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]), + "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), + "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]), + "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]), # OUTPUT STEP 9 -# "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), + "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check + expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), "%s/finalReport.html" % (config["project-folder"]) diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index 10d915b..7c665de 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -8,6 +8,9 @@ rule R_finalReport: script=config["report-script"], fa="%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]), mockToRef="%s/BAM/Mockref/mockToRef.sorted.bam" % (config["project-folder"]), + mockToRefSamflags="%s/BAM/Mockref/mockToRef.sam.samflags" % (config["project-folder"]), + mockFlagstats=expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples), mqcraw="%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), mqcconcatenated="%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), mqctrimmed="%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), @@ -16,7 +19,9 @@ rule R_finalReport: qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), - finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]) + finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), + fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples) output: "%s/finalReport.html" % (config["project-folder"]) log: From 12e944b772842d84e75cec1324125dc3dbdb0a4c Mon Sep 17 00:00:00 2001 From: fischuu Date: Wed, 3 Mar 2021 18:24:25 +0200 Subject: [PATCH 22/34] Final mock reference ws not declared in output --- CHANGELOG | 1 + GBS-pipeline.smk | 5 +- scripts/workflow-report.Rmd | 91 +++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 47 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index aa31f0e..74c8571 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.8 : * Final mock reference was not declared in output 0.5.7 : * Dependencies between the rules improved 0.5.6 : * Bugfix - wrong path for reference based vcf fixed 0.5.5 : * Bugfix - wrong reference vcf address in module 6 diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index cee143e..cc6c6e5 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.7 -version = "0.5.7" +##### Version: 0.5.8 +version = "0.5.8" ##### set minimum snakemake version ##### min_version("5.24") @@ -121,6 +121,7 @@ rule all: "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]), # Quality check expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + "%s/MockReference/MockReference.fa" % (config["project-folder"]), "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), "%s/finalReport.html" % (config["project-folder"]) diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 23970a9..92d7391 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -1454,53 +1454,54 @@ accepted mistakes). ```{r} -if(ncol(sampleInfo)>1){ - mmCase <- 2 - mmControl <- 5 - vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",] - vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",] - - vcf.cases.table <- apply(vcf.cases,2,table) - vcf.control.table <- apply(vcf.control,2,table) - - nCases <- nrow(vcf.cases) - nControl <- nrow(vcf.control) - - candidateLociRef <- c() - candidateLociAlt <- c() - for(i in 1:length(vcf.cases.table)){ - tmp <- vcf.cases.table[[i]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i]) - if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i]) - } - - causativeRef <- c() - causativeAlt <- c() - for(i in 1:length(candidateLociRef)){ - tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - # if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) - if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) - } - - for(i in 1:length(candidateLociAlt)){ - tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]] - homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] - homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] - if(length(homRef)==0) homRef <- 0 - if(length(homAlt)==0) homAlt <- 0 - if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) - # if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) +if(is.element(colnames(sampleInfo), "casecontrol")){ + if(ncol(sampleInfo)>1){ + mmCase <- 2 + mmControl <- 5 + vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",] + vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",] + + vcf.cases.table <- apply(vcf.cases,2,table) + vcf.control.table <- apply(vcf.control,2,table) + + nCases <- nrow(vcf.cases) + nControl <- nrow(vcf.control) + + candidateLociRef <- c() + candidateLociAlt <- c() + for(i in 1:length(vcf.cases.table)){ + tmp <- vcf.cases.table[[i]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i]) + if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i]) + } + + causativeRef <- c() + causativeAlt <- c() + for(i in 1:length(candidateLociRef)){ + tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + # if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) + if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i]) + } + + for(i in 1:length(candidateLociAlt)){ + tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]] + homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"] + homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"] + if(length(homRef)==0) homRef <- 0 + if(length(homAlt)==0) homAlt <- 0 + if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) + # if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i]) + } } } - ``` Then we identify the following potentially causative mutations: From 37be033b40746ff03c4522b00448e800275208b6 Mon Sep 17 00:00:00 2001 From: fischuu Date: Thu, 4 Mar 2021 08:30:03 +0200 Subject: [PATCH 23/34] Bugfix - Final report crashed, in case that no casecontrol is given in samplelist --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- scripts/workflow-report.Rmd | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 74c8571..9e9330f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist 0.5.8 : * Final mock reference was not declared in output 0.5.7 : * Dependencies between the rules improved 0.5.6 : * Bugfix - wrong path for reference based vcf fixed diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index cc6c6e5..eb5595c 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -9,8 +9,8 @@ import os ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline -##### Version: 0.5.8 -version = "0.5.8" +##### Version: 0.5.9 +version = "0.5.9" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 92d7391..5dddc5d 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -1507,16 +1507,18 @@ if(is.element(colnames(sampleInfo), "casecontrol")){ Then we identify the following potentially causative mutations: ```{r} -if(ncol(sampleInfo)>1){ - if(length(causativeRef)>0){ - datatable(as.data.frame(causativeRef)) - } else { - cat("No potentially mutations found for homozygeous reference!\n") - } - if(length(causativeAlt)>0){ - datatable(as.data.frame(causativeAlt)) - } else { - cat("No potentially mutations found for homozygeous alternative!\n") +if(is.element(colnames(sampleInfo), "casecontrol")){ + if(ncol(sampleInfo)>1){ + if(length(causativeRef)>0){ + datatable(as.data.frame(causativeRef)) + } else { + cat("No potentially mutations found for homozygeous reference!\n") + } + if(length(causativeAlt)>0){ + datatable(as.data.frame(causativeAlt)) + } else { + cat("No potentially mutations found for homozygeous alternative!\n") + } } } ``` From d61a768dcc19b49eaf485ede604f6dc03d441a69 Mon Sep 17 00:00:00 2001 From: fischuu Date: Sun, 7 Mar 2021 11:14:40 +0200 Subject: [PATCH 24/34] OUtput html formatting fixed --- scripts/workflow-report.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 5dddc5d..73c84fe 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -679,6 +679,7 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"), y=file.path(projFolder,"QC","TRIMMED","multiqc_R1"), feature="fastqc_per_base_sequence_quality_plot") ``` + # Preliminary Mock Reference ## Basic stats From 300e72f3a958377d93c869980bd59e2b79121117 Mon Sep 17 00:00:00 2001 From: Daniel Fischer Date: Sun, 7 Mar 2021 11:30:16 +0200 Subject: [PATCH 25/34] Figure added --- scripts/workflow-report.Rmd | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index c04af5e..8a16078 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -857,6 +857,18 @@ Basic information for the final mock reference: out_html <- knitr::kable(out, col.names = NULL, "html") kable_styling(out_html, "striped", position = "left") ``` +## Theoretical coverage of Mock per sample +Given the length of the final mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the final mock for each sample + +```{r final mock coverage} +par(mar=c(10,5,1,1)) +finalMockLength <- sum(nchar(finalMockClusters)) +mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/finalMockLength +names(mockCov) <- mqcStats.trim[,1] +barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2) +lines(c(-100,10000),c(1,1), lty="dotted") +lines(c(-100,10000),c(10,10), lty="dotted") +``` # Existing Reference @@ -1160,7 +1172,7 @@ for(i in 3:ncol(clusterCoverage)){ ### Cluster coverage Mock samples vs others ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES -## Refind Mock reference +## Refined Mock reference ADD THIS STILL TO THE REPORT!!! ## Reference genome From 08620a3f115b4d169cedf0c08d3a05250cce89d9 Mon Sep 17 00:00:00 2001 From: fischuu Date: Sun, 7 Mar 2021 21:49:51 +0200 Subject: [PATCH 26/34] ref added --- GBS-pipeline.smk | 3 ++- GBS-pipeline_config.yaml | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index eb5595c..9c0d2f0 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -8,7 +8,8 @@ import os ##### GBS-snakemake pipeline ##### ##### Daniel Fischer (daniel.fischer@luke.fi) ##### Natural Resources Institute Finland (Luke) -##### This pipeline is build upon the the GBS-SNP-CROP pipeline +##### This pipeline is build upon the the GBS-SNP-CROP pipeline: +##### https://github.com/halelab/GBS-SNP-CROP ##### Version: 0.5.9 version = "0.5.9" diff --git a/GBS-pipeline_config.yaml b/GBS-pipeline_config.yaml index f971037..6fc01ff 100644 --- a/GBS-pipeline_config.yaml +++ b/GBS-pipeline_config.yaml @@ -6,15 +6,11 @@ project-folder: "/scratch/project_2001746/Example" # Absolute path to the project root folder pipeline-folder: "/scratch/project_2001746/Pipeline-GBS" # Absolute path to the pipeline folder pipeline-config: "/scratch/project_2001746/Pipeline-GBS/GBS-pipeline_config-example.yaml" # Absolute path to the pipeline configuration file -#report-script: "/scratch/project_2001746/Pipeline-GBS/scripts/workflow-report.Rmd" # Absolute path to the report scripts (THIS OPTION SHOULD BE OBSOLETE) barcodes: "/scratch/project_2001746/Example/barcodesID.txt" # Absolute path to the barcodes file (MAKE THIS OPTIONAL) rawsamples: "/scratch/project_2001746/Example/rawsamples" # Absolute path to the raw samples file (MAKE THIS OPTIONAL) genome: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa" # Absolute path to an existing reference genome, leave empty "" if not available -#genome-bwa-index: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa.bwt" # Absolute path to the bwa index (THIS OPTION SHOULD BE OBSOLETE) -#genome-star-index: "/scratch/project_2001746/Example/references/STAR2.7.3a" # Absolute path to the star index (THIS OPTION SHOULD BE OBSOLETE) refsample: "" # Id of ref sample used to build mock reference (NOT SURE, IF USED, CHECK THIS) local-scratch: $LOCAL_SCRATCH # Path for fast local storage for tmp files (Here, a system variable is used. Check with your server configs) -#adapter: "/scratch/project_2001746/Pipeline-GBS/adapter.fa" # Path for the adapter sequences, used by Trimmomatic (MAKE THIS OPTIONAL) tmpdir: "/scratch/project_2001746/tmp" # Parameters for the GBS-SNP-CROP tools From 4dcb7000064388b9ae936f25125de8567f711d54 Mon Sep 17 00:00:00 2001 From: fischuu Date: Mon, 8 Mar 2021 14:59:24 +0200 Subject: [PATCH 27/34] Mapping stats for final mock vs reference genome added --- CHANGELOG | 1 + GBS-pipeline.smk | 4 +- rules/Module3-MockReference | 98 +++++++++++++++++++++++++++++++++++++ rules/Module7-Reporting | 1 + 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9e9330f..777f942 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.10: * Mapping stats for final mock vs reference genome added 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist 0.5.8 : * Final mock reference was not declared in output 0.5.7 : * Dependencies between the rules improved diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 9c0d2f0..b662b7c 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.9 -version = "0.5.9" +##### Version: 0.5.10 +version = "0.5.10" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index 544a92f..134c892 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -213,3 +213,101 @@ rule refine_mock_reference: snakemake <- TRUE;\ source('{input.script}')" &> {log} """ + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef: + """ + Map the mock reference clusters to the genome (minimap2). + """ + input: + refgenome=config["genome"], + mockgenome="%s/MockReference/MockReference.fa" % (config["project-folder"]) + output: + "%s/SAM/finalMockToRef.sam" % (config["project-folder"]) + singularity: config["singularity"]["minimap2"] + shell:""" + minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log} + + """ + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef_samtools_SamToSortedBam: + """ + Index the Mock Reference Genome. + """ + input: + ref=config["genome"], + sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"]) + output: + bam="%s/BAM/FinalMockref/mockToRef.bam" % (config["project-folder"]), + sorted="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"]) + log: + "%s/logs/Samtools/FinalMockVsRef_samtools_SamToSortedBam.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/Samtools/FinalMockVsRef_samtools_SamToSortedBam.benchmark.tsv" % (config["project-folder"]) + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + samtools view -b {input.sam} > {output.bam} + samtools sort {output.bam} -o {output.sorted} + samtools index {output.sorted} + """ + + +if config["genome"] == "": + print(f"No reference genome provided, skipping the star indexing step for reference genome") +else: + rule FinalMockVsRef_AlignmentStats: + """ + Get the mapping stats for the mock vs. reference. + """ + input: + ref=config["genome"], + sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"]), + sbam="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"]) + output: + fs="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]), + samflags="%s/BAM/FinalMockref/mockToRef.sam.samflags" % (config["project-folder"]), + stats="%s/BAM/FinalMockref/mockToRef.sam.stats" % (config["project-folder"]), + c="%s/BAM/FinalMockref/mockToRef.coverage" % (config["project-folder"]), + bed="%s/BAM/FinalMockref/mockToRef.bed" % (config["project-folder"]), + bedMerged="%s/BAM/FinalMockref/mockToRef.merged.bed" % (config["project-folder"]), + leftBed="%s/BAM/FinalMockref/mockToRef.merged.left.bed" % (config["project-folder"]), + rightBed="%s/BAM/FinalMockref/mockToRef.merged.right.bed" % (config["project-folder"]), + leftFACounts="%s/SAM/finalMockToRef.left.fa.counts" % (config["project-folder"]), + rightFACounts="%s/SAM/finalMockToRef.right.fa.counts" % (config["project-folder"]), + leftFA="%s/BAM/FinalMockref/mockToRef.merged.left.fa" % (config["project-folder"]), + rightFA="%s/BAM/FinalMockref/mockToRef.merged.right.fa" % (config["project-folder"]), + FA="%s/BAM/FinalMockref/mockToRef.merged.combined.fa" % (config["project-folder"]), + FACounts="%s/BAM/FinalMockref/mockToRef.merged.combined.fa.counts" % (config["project-folder"]) + log: + "%s/logs/FinalMockVsRef_AlignmentStats.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/MockVsRef_AlignmentStats.benchmark.tsv" % (config["project-folder"]) + threads: lambda cores: cpu_count() + conda:"envs/step3.yaml" + singularity: config["singularity"]["gbs"] + shell:""" + df -h &> {log} + echo "Number of threads used:" {threads} + sed '/^@/d' {input.sam} | cut -f2 | sort | uniq -c > {output.samflags} + samtools flagstat {input.sam} > {output.fs} + samtools stats {input.sam} > {output.stats} + samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} + bedtools genomecov -ibam {input.sbam} -bg > {output.bed} + bedtools merge -i {output.bed} > {output.bedMerged} + awk '{{print $1 "\\t" (($2 - 6)<0?0:($2 - 6)) "\\t" $2}}' {output.bedMerged} > {output.leftBed} + awk '{{print $1 "\\t" $3 "\\t" $3 + 6}}' {output.bedMerged} > {output.rightBed} + bedtools getfasta -fi {input.ref} -bed {output.leftBed} > {output.leftFA} + bedtools getfasta -fi {input.ref} -bed {output.rightBed} > {output.rightFA} + + sed '/^>/d' {output.leftFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.leftFACounts} + sed '/^>/d' {output.rightFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.rightFACounts} + + paste -d '+' {output.leftFA} {output.rightFA} > {output.FA} + sed '/^>/d' {output.FA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.FACounts} + """ diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index 7c665de..c0a2fdb 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -19,6 +19,7 @@ rule R_finalReport: qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples), fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), + fsFinalVsRef="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]), finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples) From f9dea0d7b6f6547fc51b40b6d6b1df093d28fe63 Mon Sep 17 00:00:00 2001 From: fischuu Date: Mon, 8 Mar 2021 15:08:44 +0200 Subject: [PATCH 28/34] Inpur rule all cleaned up a bit to account for temporary output --- CHANGELOG | 1 + GBS-pipeline.smk | 12 ++---------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 777f942..f7a6f3d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.11: * Inpur rule all cleaned up a bit to account for temporary output 0.5.10: * Mapping stats for final mock vs reference genome added 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist 0.5.8 : * Final mock reference was not declared in output diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index b662b7c..848d822 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.10 -version = "0.5.10" +##### Version: 0.5.11 +version = "0.5.11" ##### set minimum snakemake version ##### min_version("5.24") @@ -81,18 +81,10 @@ print("######################################################################### rule all: input: - # OUTPUT: PREPARATION MODULE - expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples), # QC OF RAW AND CONCATENATED FILES "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]), "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]), "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]), - # OUTPUT STEP 2 - expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), - expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples), - # OUTPUT STEP 2b - expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples), # OUTPUT STEP 4 "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]), "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]), From 959e2263172306c60d727af1f209a93f36a50802 Mon Sep 17 00:00:00 2001 From: fischuu Date: Mon, 8 Mar 2021 16:17:37 +0200 Subject: [PATCH 29/34] Coverage for finalMock alignments are now also calculated --- CHANGELOG | 3 ++- GBS-pipeline.smk | 4 ++-- GBS-pipeline_server-config.yaml | 13 +++++++++++++ rules/Module3-MockReference | 5 +++++ rules/Module4-ReadAlignment | 9 ++++++--- rules/Module7-Reporting | 3 ++- 6 files changed, 30 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f7a6f3d..cf8224c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,7 +4,8 @@ ##### ##### ################################################################################ -0.5.11: * Inpur rule all cleaned up a bit to account for temporary output +0.5.12: * Coverage for finalMock alignments are now also calculated +0.5.11: * Input rule all cleaned up a bit to account for temporary output 0.5.10: * Mapping stats for final mock vs reference genome added 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist 0.5.8 : * Final mock reference was not declared in output diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 848d822..3d23f2a 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.11 -version = "0.5.11" +##### Version: 0.5.12 +version = "0.5.12" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml index 4701d55..aaffa1c 100644 --- a/GBS-pipeline_server-config.yaml +++ b/GBS-pipeline_server-config.yaml @@ -140,6 +140,19 @@ MockVsRef_AlignmentStats: time: 05:00:00 job-name: MockVsRef_AlignmentStats +FinalMockVsRef_alignment: + time: 05:00:00 + cpus-per-task: 20 + job-name: FinalMockVsRef_alignment + +FinalMockVsRef_samtools_SamToSortedBam: + time: 05:00:00 + job-name: FinalMockVsRef_samtools_SamToSortedBam + +FinalMockVsRef_AlignmentStats: + time: 05:00:00 + job-name: FinalMockVsRef_AlignmentStats + MockVsRef_SortedBamToMpileup: time: 05:00:00 job-name: MockVsRef_SortedBamToMpileup diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index 134c892..f5c25a4 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -226,6 +226,11 @@ else: mockgenome="%s/MockReference/MockReference.fa" % (config["project-folder"]) output: "%s/SAM/finalMockToRef.sam" % (config["project-folder"]) + log: + "%s/logs/MINIMAP2/mm2_map_finalMockVSref.log" % (config["project-folder"]) + benchmark: + "%s/benchmark/MINIMAP2/mm2_finalMockVSref.benchmark.tsv" % (config["project-folder"]) + threads: 20 singularity: config["singularity"]["minimap2"] shell:""" minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log} diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment index 68c52a9..1649c3b 100644 --- a/rules/Module4-ReadAlignment +++ b/rules/Module4-ReadAlignment @@ -363,9 +363,11 @@ rule finalMockRef_AlignmentStats: Alignment stats for reads mapped to final mock ref """ input: - "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]) + sam="%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]), + sbam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]) output: - "%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]) + fs="%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), + c="%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"]) log: "%s/logs/Samtools/FinalMockRef_AlignmentStats.{samples}.log" % (config["project-folder"]) benchmark: @@ -373,5 +375,6 @@ rule finalMockRef_AlignmentStats: conda:"envs/samtools.yaml" singularity: config["singularity"]["gbs"] shell:""" - samtools flagstat {input} > {output} + samtools flagstat {input.sam} > {output.fs} + samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c} """ diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting index c0a2fdb..63f36a3 100644 --- a/rules/Module7-Reporting +++ b/rules/Module7-Reporting @@ -22,7 +22,8 @@ rule R_finalReport: fsFinalVsRef="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]), finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]), fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples), - cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples) + cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples), + cFinalMock=expand("%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"]), samples=samples) output: "%s/finalReport.html" % (config["project-folder"]) log: From 81ca94985441a1a5637adaf41ba798cdeaadcd97 Mon Sep 17 00:00:00 2001 From: Daniel Fischer Date: Mon, 8 Mar 2021 16:27:59 +0200 Subject: [PATCH 30/34] Statistics on final mock reference added --- CHANGELOG | 1 + GBS-pipeline.smk | 4 +- scripts/workflow-report.Rmd | 179 +++++++++++++++++++++++++++++++++--- 3 files changed, 168 insertions(+), 16 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index cf8224c..f831c20 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.13: * Statistics on final mock reference added 0.5.12: * Coverage for finalMock alignments are now also calculated 0.5.11: * Input rule all cleaned up a bit to account for temporary output 0.5.10: * Mapping stats for final mock vs reference genome added diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 3d23f2a..764c511 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.12 -version = "0.5.12" +##### Version: 0.5.13 +version = "0.5.13" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index b58b4f4..16f9ee0 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -44,10 +44,10 @@ knitr::opts_chunk$set(echo = FALSE, mockClusters.file <- "GSC.MR.Clusters.fa" mockReference.file <- "GSC.MR.Genome.fa" if(!is.element("snakemake",ls())){ - projFolder <- "/scratch/project_2001746/BSF" + projFolder <- "/scratch/project_2001746/Whitefish_benchMS" pipelineFolder <- "/scratch/project_2001746/Pipeline-GBS/" pipelineConfig <- "/scratch/project_2001746/BSF/Pipeline-GBS/GBS-pipeline_config-BSF.yaml" - refGenome.file <- "hermetiaRef_112020.fasta" + refGenome.file <- "GCA_902175075.1_AWG_v1_genomic.fna" } refGenome.file <- basename(refGenome.file) ifelse(refGenome.file == "", refAvail <- FALSE, refAvail <- TRUE) @@ -337,8 +337,9 @@ mqcStats.raw <- read.table(file.path(projFolder,"QC","RAW","multiqc_R1", "multiq tmp <- mqcStats.raw[,-c(1:4,7,11:21)] totalRawSequences <- sum(mqcStats.raw$Total.Sequences) -out <- c(totalRawSequences,apply(tmp,2,mean)) +out <- c(totalRawSequences,sd(tmp[,1]),apply(tmp,2,mean)) names(out) <- c("Tot. number sequences", + "SD total sequences", "Avg. total sequences", "Avg. poor quality", "Avg. GC Percent", @@ -682,10 +683,13 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"), # Preliminary Mock Reference +## To Add + * Get statistics on multi-mapping of the reads. + ## Basic stats ```{r import required files, warning=FALSE, } -mockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file)) -mockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file)) +prelimMockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file)) +prelimMockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file)) vsearchIN <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchIN.fa")) #vsearchOUT <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchOUT.fa")) ``` @@ -701,7 +705,7 @@ Basic information on the sequences used to build the mock reference: Basic information on the identified clusters, used as mock reference ```{r summarise mock reference clusters} - out <- summary(mockClusters) + out <- summary(prelimMockClusters) out[1,1] <- c("No. of Clusters") out[,2] <- formatC(out[,2], format="d", big.mark=",") out_html <- knitr::kable(out, col.names = NULL, "html") @@ -710,9 +714,9 @@ Basic information on the identified clusters, used as mock reference Total length of the input fasta to build reference: `r format(sum(nchar(vsearchIN)), big.mark=",")` bp. -Total length of the Mock Reference Clusters: `r format(sum(nchar(mockClusters)), big.mark=",")` bp. +Total length of the Mock Reference Clusters: `r format(sum(nchar(prelimMockClusters)), big.mark=",")` bp. -Total length of the Mock Reference (Used as reference): `r format(sum(nchar(mockReference)), big.mark=",")` bp. +Total length of the Mock Reference (Used as reference): `r format(sum(nchar(prelimMockReference)), big.mark=",")` bp. ## Theoretical coverage of Mock per sample Given the length of the mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the mock for each sample @@ -721,7 +725,7 @@ Currently I hard-code here the paired-end reads and assume equal lengths and rea ```{r mock coverage} par(mar=c(10,5,1,1)) -mockLength <- sum(nchar(mockReference)) +mockLength <- sum(nchar(prelimMockReference)) mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/mockLength names(mockCov) <- mqcStats.trim[,1] barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2) @@ -830,7 +834,7 @@ out_html <- knitr::kable(out, col.names = NULL, "html") kable_styling(out_html, "striped", position = "left") ``` -Quality values for the first 100.000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality. +Quality values for the first 100,000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality. ```{r quality measues} outQual <- table(unlist(strsplit(discardedReads$qual[1:100000],""))) out_html <- knitr::kable(formatC(outQual, format="d", big.mark=","), "html") @@ -1027,7 +1031,8 @@ WE COULD COUNT HERE STILL HOW MANY SEQUENCES MATCH THE GIVEN ONE WITH ONE SUBSTI # Alignment -## Mock Reference +## Preliminary Mock Reference +### Mapping stats Import the mapping statistics to the Mock reference @@ -1173,11 +1178,156 @@ for(i in 3:ncol(clusterCoverage)){ ### Cluster coverage Mock samples vs others ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES -## Refined Mock reference -ADD THIS STILL TO THE REPORT!!! +## Mock Reference +### Mapping stats -## Reference genome +Import the mapping statistics to the Mock reference + +```{r import flagstats} +flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat") +flagstats <- list() +for(i in 1:length(flagstatFiles)){ + flagstats[[i]] <- readLines(file.path(projFolder,"BAM", "alignments_finalMock",flagstatFiles[i])) +} +``` + +Visualization of the alignments, red stars indicate the mock reference samples + +```{r vis mapping stats} +par(oma=c(6,3,0,0)) +mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2) + +sampleNames <- gsub(".sam.flagstat", "", flagstatFiles) +colnames(mapStats) <- sampleNames + +tmp <- as.numeric(sapply(strsplit(sapply(flagstats,"[",1), " +"),"[",1)) +mapStats[1,] <- as.numeric(sapply(strsplit(sapply(flagstats,"[",5), " +"),"[",1)) +mapStats[2,] <- tmp - mapStats[1,] + +p <- barplot(mapStats, col=c(viridis(20)[8], viridis(20)[16]), las=2) + +legend("topleft", pch=c(20,20), col=c(viridis(20)[16], viridis(20)[8]), legend=c("Unmapped", "Mapped"), fill="white") + +# highlight the mock reference samples +mockFiles <- paste(mockSamples, ".sam.flagstat", sep="") +mockPos <- which(is.element(flagstatFiles, mockFiles)) + +points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) +``` + +```{r mapping percentage} +barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) +``` + + +### Coverage + +Data mapped against the clusters and then the reads per cluster visualized +```{r importClusterCoverage, warning=FALSE} +clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage") +clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1])) +names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) + +for(i in 2:length(clusterFiles)){ + tmp <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[i])) + names(tmp)[1:2] <- c("cluster", clusterFiles[i]) + clusterCoverage <- merge(clusterCoverage, tmp, by="cluster") +} +names(clusterCoverage)[2:(length(clusterFiles)+1)] <- clusterFiles +clusterCoverage[,1] <- as.numeric(gsub("Cluster", "", clusterCoverage[,1])) +clusterCoverage <- clusterCoverage[order(clusterCoverage[,1]),] +clusterCoverage <- clusterCoverage[is.na(clusterCoverage[,1])==FALSE,] + +clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000 +``` + +### Lorenz curve +Concentration measure of reads against clusters + +```{r lorenz} +plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration") +for(i in 3:ncol(clusterCoverage)){ + lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i])))) +} +``` + + +### Stats on coverage + +These are the amounts of clusters with different samples. + +```{r cluster coverage stats} +coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum) +barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8]) +``` + +Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group. + +```{r reads per coverage group} +readsPerCoverageGroup <- c() + +for(i in 1:max(coveredClusters)){ + readsPerCoverageGroup[i] <- sum(clusterCoverage[coveredClusters==i,-1]) +} + +names(readsPerCoverageGroup) <- 1:max(coveredClusters) + +barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster group", col=viridis(20)[8]) +``` + + +```{r reads per coverage group in percent} +barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8]) +``` + +Then still the number of clusters without coverage per sample + +```{r clusters without coverage} +nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum) +names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters)) +barplot(nonHittedClusters, col=viridis(20)[8], las=2) +lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted") +``` + +And then this still as percentage + +```{r clusterhits percentage} +barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8]) +lines(c(0,10000),c(0.2,0.2), lty="dotted") +lines(c(0,10000),c(0.4,0.4), lty="dotted") +lines(c(0,10000),c(0.5,0.5), lty="dashed") +lines(c(0,10000),c(0.6,0.6), lty="dotted") +lines(c(0,10000),c(0.8,0.8), lty="dotted") +``` + +### Smoothed log-coverage per cluster + +```{r vizCusterCoverage} +plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2)) + +for(i in 3:ncol(clusterCoverage)){ + lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,i]+1), all.knots=FALSE)) +} +``` + +### Smoothed std-log-coverage per cluster + +Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5. + +```{r vizStdCusterCoverage} +# I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot. +plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4)) + +for(i in 3:ncol(clusterCoverage)){ + lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,i]+1), all.knots=FALSE)) +} +``` + + + +## Reference genome +### Basic stats Import the mapping statistics to the reference genome ```{r import ref flagstats} @@ -1218,6 +1368,7 @@ points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) ``` + # Variants Import the Result vcf From d57a112daef60c19a3776359b5204675b05083d3 Mon Sep 17 00:00:00 2001 From: fischuu Date: Tue, 9 Mar 2021 15:36:41 +0200 Subject: [PATCH 31/34] Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- rules/Module3-MockReference | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index cf8224c..c49a41d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.13: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations 0.5.12: * Coverage for finalMock alignments are now also calculated 0.5.11: * Input rule all cleaned up a bit to account for temporary output 0.5.10: * Mapping stats for final mock vs reference genome added diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 3d23f2a..764c511 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.12 -version = "0.5.12" +##### Version: 0.5.13 +version = "0.5.13" ##### set minimum snakemake version ##### min_version("5.24") diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference index f5c25a4..b4a130e 100644 --- a/rules/Module3-MockReference +++ b/rules/Module3-MockReference @@ -217,7 +217,7 @@ rule refine_mock_reference: if config["genome"] == "": print(f"No reference genome provided, skipping the star indexing step for reference genome") else: - rule FinalMockVsRef: + rule FinalMockVsRef_alignment: """ Map the mock reference clusters to the genome (minimap2). """ @@ -234,7 +234,6 @@ else: singularity: config["singularity"]["minimap2"] shell:""" minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log} - """ if config["genome"] == "": From 4f97226f70fb5273829d051173fce842be7572c5 Mon Sep 17 00:00:00 2001 From: fischuu Date: Wed, 10 Mar 2021 18:17:21 +0200 Subject: [PATCH 32/34] snakemake version updated and quick fix for freezing optimiser added --- GBS-pipeline.smk | 2 +- runGBSPipeline-example.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index 200e775..e17d0a1 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -14,7 +14,7 @@ import os version = "0.5.14" ##### set minimum snakemake version ##### -min_version("5.24") +min_version("6.0") ##### Sample sheets ##### diff --git a/runGBSPipeline-example.sh b/runGBSPipeline-example.sh index 2a2e3d0..4fe7304 100644 --- a/runGBSPipeline-example.sh +++ b/runGBSPipeline-example.sh @@ -51,4 +51,5 @@ snakemake -s /scratch/project_2001746/Pipeline-GBS/GBS-pipeline.smk \ --latency-wait 60 \ --cluster-config /scratch/project_2001746/Pipeline-GBS/GBS-pipeline_server-config.yaml \ --cluster "sbatch -t {cluster.time} --account={cluster.account} --gres=nvme:{cluster.nvme} --job-name={cluster.job-name} --tasks-per-node={cluster.ntasks} --cpus-per-task={cluster.cpus-per-task} --mem-per-cpu={cluster.mem-per-cpu} -p {cluster.partition} -D {cluster.working-directory}" \ + --scheduler greedy \ $@ From cc214929a80b57413d4787052f20f52f2b3ba366 Mon Sep 17 00:00:00 2001 From: fischuu Date: Thu, 11 Mar 2021 12:05:31 +0200 Subject: [PATCH 33/34] small typos fixed --- rules/Module5-CallVariants | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants index e01a63b..37015ac 100644 --- a/rules/Module5-CallVariants +++ b/rules/Module5-CallVariants @@ -76,7 +76,7 @@ rule create_verticalRef: threads: 1 shell:""" sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp; - sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref}; + sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref}; #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} ls {params.wd}/*count.txt > {output.co} """ @@ -473,7 +473,7 @@ rule create_verticalRef_finalMock: threads: 1 shell:""" sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp; - sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref}; + sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref}; #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co} ls {params.wd}/*count.txt > {output.co} """ From 02257d0087c2983ab3581203acdbc2d65917a44f Mon Sep 17 00:00:00 2001 From: fischuu Date: Thu, 11 Mar 2021 15:14:35 +0200 Subject: [PATCH 34/34] Report adjusted --- CHANGELOG | 1 + GBS-pipeline.smk | 4 ++-- scripts/workflow-report.Rmd | 24 ++++++++++++------------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index bbb7f8f..84e2686 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.15: * Report adjusted 0.5.14: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations 0.5.13: * Statistics on final mock reference added 0.5.12: * Coverage for finalMock alignments are now also calculated diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index e17d0a1..d1154d6 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.14 -version = "0.5.14" +##### Version: 0.5.15 +version = "0.5.15" ##### set minimum snakemake version ##### min_version("6.0") diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 16f9ee0..4200790 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -1183,7 +1183,7 @@ ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE R Import the mapping statistics to the Mock reference -```{r import flagstats} +```{r import final flagstats} flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat") flagstats <- list() for(i in 1:length(flagstatFiles)){ @@ -1193,7 +1193,7 @@ for(i in 1:length(flagstatFiles)){ Visualization of the alignments, red stars indicate the mock reference samples -```{r vis mapping stats} +```{r vis mapping final stats} par(oma=c(6,3,0,0)) mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2) @@ -1216,7 +1216,7 @@ mockPos <- which(is.element(flagstatFiles, mockFiles)) points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) ``` -```{r mapping percentage} +```{r mapping final percentage} barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) ``` @@ -1224,7 +1224,7 @@ barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Pe ### Coverage Data mapped against the clusters and then the reads per cluster visualized -```{r importClusterCoverage, warning=FALSE} +```{r importFinalClusterCoverage, warning=FALSE} clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage") clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1])) names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) @@ -1245,7 +1245,7 @@ clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000 ### Lorenz curve Concentration measure of reads against clusters -```{r lorenz} +```{r final lorenz} plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration") for(i in 3:ncol(clusterCoverage)){ lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i])))) @@ -1257,14 +1257,14 @@ for(i in 3:ncol(clusterCoverage)){ These are the amounts of clusters with different samples. -```{r cluster coverage stats} +```{r final cluster coverage stats} coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum) barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8]) ``` Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group. -```{r reads per coverage group} +```{r final reads per coverage group} readsPerCoverageGroup <- c() for(i in 1:max(coveredClusters)){ @@ -1277,13 +1277,13 @@ barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster gro ``` -```{r reads per coverage group in percent} +```{r final reads per coverage group in percent} barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8]) ``` Then still the number of clusters without coverage per sample -```{r clusters without coverage} +```{r final clusters without coverage} nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum) names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters)) barplot(nonHittedClusters, col=viridis(20)[8], las=2) @@ -1292,7 +1292,7 @@ lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted" And then this still as percentage -```{r clusterhits percentage} +```{r final clusterhits percentage} barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8]) lines(c(0,10000),c(0.2,0.2), lty="dotted") lines(c(0,10000),c(0.4,0.4), lty="dotted") @@ -1303,7 +1303,7 @@ lines(c(0,10000),c(0.8,0.8), lty="dotted") ### Smoothed log-coverage per cluster -```{r vizCusterCoverage} +```{r final vizCusterCoverage} plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2)) for(i in 3:ncol(clusterCoverage)){ @@ -1315,7 +1315,7 @@ for(i in 3:ncol(clusterCoverage)){ Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5. -```{r vizStdCusterCoverage} +```{r final vizStdCusterCoverage} # I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot. plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4))