diff --git a/CHANGELOG b/CHANGELOG index bbb7f8f..84e2686 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ ##### ##### ################################################################################ +0.5.15: * Report adjusted 0.5.14: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations 0.5.13: * Statistics on final mock reference added 0.5.12: * Coverage for finalMock alignments are now also calculated diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk index e17d0a1..d1154d6 100644 --- a/GBS-pipeline.smk +++ b/GBS-pipeline.smk @@ -10,8 +10,8 @@ import os ##### Natural Resources Institute Finland (Luke) ##### This pipeline is build upon the the GBS-SNP-CROP pipeline: ##### https://github.com/halelab/GBS-SNP-CROP -##### Version: 0.5.14 -version = "0.5.14" +##### Version: 0.5.15 +version = "0.5.15" ##### set minimum snakemake version ##### min_version("6.0") diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd index 16f9ee0..4200790 100644 --- a/scripts/workflow-report.Rmd +++ b/scripts/workflow-report.Rmd @@ -1183,7 +1183,7 @@ ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE R Import the mapping statistics to the Mock reference -```{r import flagstats} +```{r import final flagstats} flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat") flagstats <- list() for(i in 1:length(flagstatFiles)){ @@ -1193,7 +1193,7 @@ for(i in 1:length(flagstatFiles)){ Visualization of the alignments, red stars indicate the mock reference samples -```{r vis mapping stats} +```{r vis mapping final stats} par(oma=c(6,3,0,0)) mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2) @@ -1216,7 +1216,7 @@ mockPos <- which(is.element(flagstatFiles, mockFiles)) points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4) ``` -```{r mapping percentage} +```{r mapping final percentage} barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2) ``` @@ -1224,7 +1224,7 @@ barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Pe ### Coverage Data mapped against the clusters and then the reads per cluster visualized -```{r importClusterCoverage, warning=FALSE} +```{r importFinalClusterCoverage, warning=FALSE} clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage") clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1])) names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1]) @@ -1245,7 +1245,7 @@ clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000 ### Lorenz curve Concentration measure of reads against clusters -```{r lorenz} +```{r final lorenz} plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration") for(i in 3:ncol(clusterCoverage)){ lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i])))) @@ -1257,14 +1257,14 @@ for(i in 3:ncol(clusterCoverage)){ These are the amounts of clusters with different samples. -```{r cluster coverage stats} +```{r final cluster coverage stats} coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum) barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8]) ``` Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group. -```{r reads per coverage group} +```{r final reads per coverage group} readsPerCoverageGroup <- c() for(i in 1:max(coveredClusters)){ @@ -1277,13 +1277,13 @@ barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster gro ``` -```{r reads per coverage group in percent} +```{r final reads per coverage group in percent} barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8]) ``` Then still the number of clusters without coverage per sample -```{r clusters without coverage} +```{r final clusters without coverage} nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum) names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters)) barplot(nonHittedClusters, col=viridis(20)[8], las=2) @@ -1292,7 +1292,7 @@ lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted" And then this still as percentage -```{r clusterhits percentage} +```{r final clusterhits percentage} barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8]) lines(c(0,10000),c(0.2,0.2), lty="dotted") lines(c(0,10000),c(0.4,0.4), lty="dotted") @@ -1303,7 +1303,7 @@ lines(c(0,10000),c(0.8,0.8), lty="dotted") ### Smoothed log-coverage per cluster -```{r vizCusterCoverage} +```{r final vizCusterCoverage} plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2)) for(i in 3:ncol(clusterCoverage)){ @@ -1315,7 +1315,7 @@ for(i in 3:ncol(clusterCoverage)){ Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5. -```{r vizStdCusterCoverage} +```{r final vizStdCusterCoverage} # I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot. plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4))