From 890fdad704ef6963b6091df7c6746a2bdac03490 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Mon, 22 Feb 2021 17:40:29 +0200
Subject: [PATCH 01/34] samtools index set to csi instead of bai

---
 CHANGELOG                   | 1 +
 GBS-pipeline.smk            | 4 ++--
 rules/Module4-ReadAlignment | 6 +++---
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 2b09ad3..6a29e4e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.1 : * samtools index set to csi instead of bai
 0.4.1 : * Bugfix - Missing outputfile script added
 0.4   : Release version
 0.3.16: * Several other small path typos
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index e3c36f2..701e40e 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.4.1
-version = "0.4.1"
+##### Version: 0.5.1
+version = "0.5.1"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index 75a8e95..5d93159 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -52,7 +52,7 @@ else:
             echo "Number of threads used:" {threads}
             samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam}
             samtools sort {output.bam} -o {output.sorted}
-            samtools index {output.sorted}
+            samtools index -c {output.sorted}
       	"""
 
 if config["genome"] == "":
@@ -182,7 +182,7 @@ rule MockRef_SamToSortedBam:
         df -h &> {log}
         samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam}
         samtools sort {output.bam} -o {output.sorted}
-        samtools index {output.sorted}
+        samtools index -c {output.sorted}
   	"""
 
 rule MockRefClusters_SamToSortedBam:
@@ -209,7 +209,7 @@ rule MockRefClusters_SamToSortedBam:
         df -h &> {log}
         samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam}
         samtools sort {output.bam} -o {output.sorted}
-        samtools index {output.sorted}
+        samtools index -c {output.sorted}
   	"""
 
 rule MockRef_SortedBamToMpileup:

From 73d0942b815c7b496aeab94064e9ba8f5e5f2395 Mon Sep 17 00:00:00 2001
From: Daniel Fischer <fischerd@r07c52.bullx>
Date: Mon, 22 Feb 2021 20:57:48 +0200
Subject: [PATCH 02/34] typo fixed

---
 scripts/workflow-report.Rmd | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 7686280..3aea73d 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -43,13 +43,13 @@ knitr::opts_chunk$set(echo = FALSE,
                       fig.align = 'center', fig.height = 5, fig.width = 8.5)
 mockClusters.file <- "GSC.MR.Clusters.fa"
 mockReference.file <- "GSC.MR.Genome.fa"
-refGenome.file <- basename(refGenome.file)
 if(!is.element("snakemake",ls())){
   projFolder <- "/scratch/project_2001746/BSF"
   pipelineFolder <- "/scratch/project_2001746/Pipeline-GBS/"
   pipelineConfig <- "/scratch/project_2001746/BSF/Pipeline-GBS/GBS-pipeline_config-BSF.yaml"
   refGenome.file <- "hermetiaRef_112020.fasta"
 }
+refGenome.file <- basename(refGenome.file)
 ifelse(refGenome.file == "", refAvail <- FALSE, refAvail <- TRUE)
 ```
 
@@ -163,7 +163,7 @@ getFastQCJSON <- function(path){
 The DAG of the used pipeline with rule dependencies.
 
 ```{r import workflow, echo=FALSE, fig.cap="Overview of the applied workflow", out.width = '100%'}
-if(file.exists(file.path(pipelineFolder,"workflow.png"))) knitr::include_graphics(file.path(pipelineFolder,"workflow.png"))
+if(file.exists(file.path(projFolder,"workflow.png"))) knitr::include_graphics(file.path(projFolder,"workflow.png"))
 ```
 
 
@@ -199,7 +199,7 @@ kable_styling(out_html, "striped", position = "left")
 ```
 
 ## Concatenating stats
-As the concatenating is a crucial step in this pipeline, it is important to make this right. As a double check that things went right, we check how
+As the concatenating is a crucial step in this pipeline, it is important to make this correct. As a double check that things went right, we check how
 many raw samples were concatenated per sample. We do this visually. Basically, the height of the bars should correspond to your number of used lanes,
 normally 1,2 or 4. Sometimes, you might want to concatenate also more files together (e.g. technical replicates), in this case the bar can be also higher.
 
@@ -707,7 +707,7 @@ Total length of the Mock Reference (Used as reference): `r format(sum(nchar(mock
 ## Theoretical coverage of Mock per sample
 Given the length of the mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the mock for each sample
 
-Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs. Adjust this later 'to reality'...
+Currently I hard-code here the paired-end reads and assume equal lengths and reads in both pairs.
 
 ```{r mock coverage}
 par(mar=c(10,5,1,1))
@@ -769,7 +769,7 @@ barplot(t(atDist), main="A,C,T,G distribution", col=viridis(20)[8])
 ```
 
 ### Assembling statistics
-Statistics on how many reads have been assembled (=had a minimum overlap of 10 bases between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap.
+Statistics on how many reads have been assembled (=had a minimum overlap of x-bases (see table above) between R1 and R2), have been discarded ( due to length and/or quality) or were not possible to overlap.
 
 ```{r assembling stat}
 tmp <- pearIn[c(startPos+23, startPos+24, startPos+25)]
@@ -868,7 +868,7 @@ These are the flagstats from mapping the mock clusters against an existing refer
 
 ```{r sam.flagstats}
 if(refAvail){
-samFlagstats <- read.table(file.path(projFolder,"SAM","mockToRef.samflags"))
+samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref","mockToRef.sam.samflags"))
 colnames(samFlagstats) <- c("Freq.", "Flag")
 totalLoci <- samFlagstats$V1[samFlagstats$V2==0] + samFlagstats$V1[samFlagstats$V2==16]
 out_html <- knitr::kable(samFlagstats, "html")

From 1eac011c5df0c17ff29f8a91bf87b743a24d905e Mon Sep 17 00:00:00 2001
From: Daniel Fischer <fischerd@r07c52.bullx>
Date: Mon, 22 Feb 2021 22:04:35 +0200
Subject: [PATCH 03/34] Script to refine the mock reference added

---
 CHANGELOG                                     |  1 +
 GBS-pipeline.smk                              |  5 ++-
 GBS-pipeline_config.yaml                      |  8 +++-
 rules/Module3-MockReference                   | 37 +++++++++++++++++++
 scripts/refineMockReference.R                 | 36 ++++++++++++++++++
 scripts/workflow-report.Rmd                   | 30 +++++++++++----
 .../visualize concatenation reports-1.png     |  0
 7 files changed, 106 insertions(+), 11 deletions(-)
 create mode 100644 scripts/refineMockReference.R
 create mode 100644 scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png

diff --git a/CHANGELOG b/CHANGELOG
index 6a29e4e..7289930 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.2 : * Script to refine the mock reference added
 0.5.1 : * samtools index set to csi instead of bai
 0.4.1 : * Bugfix - Missing outputfile script added
 0.4   : Release version
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 701e40e..a4b5a46 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.1
-version = "0.5.1"
+##### Version: 0.5.2
+version = "0.5.2"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -30,6 +30,7 @@ wildcard_constraints:
 config["genome-bwa-index"] = config["genome"]+".bwt"
 config["genome-star-index"] = config["project-folder"]+"/references/STAR2.7.3a"
 config["report-script"] = config["pipeline-folder"]+"/scripts/workflow-report.Rmd"
+config["refinement-script"] = config["pipeline-folder"]+"/scripts/refineMockReference.R"
 config["adapter"]=config["pipeline-folder"]+"/adapter.fa"
 
 ##### Singularity container #####
diff --git a/GBS-pipeline_config.yaml b/GBS-pipeline_config.yaml
index a251cb7..f971037 100644
--- a/GBS-pipeline_config.yaml
+++ b/GBS-pipeline_config.yaml
@@ -22,7 +22,13 @@ tmpdir: "/scratch/project_2001746/tmp"
 enz1: "AATTC"                     # First restriction enzyme sequence
 enz2: "GCATG"                     # Second restriction enzyme sequence
 libname: "AllReads"               # 
-libtype: "PE"                     # LIbrary type, paired-end (PE) or single-end (SE)
+libtype: "PE"                     # Library type, paired-end (PE) or single-end (SE)
+
+# Filter parameters for the mock reference refinement
+################################################################################
+mockref:
+    TotalReadCoverage: 25         # How many overall reads need to be on a cluster to remain in mock reference?
+    minSampleCoverage: 3          # How many different samples need to have reads on a cluster to remain in mock reference?
 
 # Tools specific settings
 ################################################################################
diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference
index b1a8e30..754701d 100644
--- a/rules/Module3-MockReference
+++ b/rules/Module3-MockReference
@@ -176,3 +176,40 @@ else:
             df -h &> {log}
             samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.refgenome} {input.bam} > {output}
       	"""
+
+##########################################################
+##
+## Mock Reference refinement
+##
+##########################################################
+
+rule refine_mock_reference:
+    """
+    Refine the mock reference (R).
+    """
+    input:
+        clusters="%s/FASTQ/TRIMMED/GSC.MR.Clusters.fa" % (config["project-folder"]),
+        script=config["refinement-script"],
+        c="%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"])
+    output:
+        "%s/MockReference/MockReference.fa" % (config["project-folder"])
+    log:
+        "%s/logs/R/refine_mock_reference.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/R/refine_mock_reference.benchmark.tsv" % (config["project-folder"])
+    singularity:
+        "docker://fischuu/r-gbs:3.6.3-0.2"
+    params:
+       projFolder=config["project-folder"],
+       pipeFolder=config["pipeline-folder"],
+       minTotalReadCoverage=config["mockref"]["TotalReadCoverage"],
+       minSampleCoverage=config["mockref"]["minSampleCoverage"]
+    shell:"""
+       R -e "projFolder <- '{params.projFolder}'; \
+             mockClusters.file <- '{input.clusters}'; \
+             minTotalReadCoverage <- '{params.minTotalReadCoverage}'; \
+             minSampleCoverage <- '{params.minSampleCoverage}'; \
+             mockClusters.refined.file <- '{output}'; \
+             snakemake <- TRUE;\
+             source('{input.script}')" &> {log}
+    """
diff --git a/scripts/refineMockReference.R b/scripts/refineMockReference.R
new file mode 100644
index 0000000..217a2cf
--- /dev/null
+++ b/scripts/refineMockReference.R
@@ -0,0 +1,36 @@
+# This script imports the mock reference coverage values, analyses it and then created a final, refined mock reference file
+if(!is.element("snakemake",ls())){
+  mockClusters.file <- "GSC.MR.Clusters.fa"
+  projFolder <- "/scratch/project_2001746/BSF"
+  minTotalReadCoverage <- 25
+  minSampleCoverage <- 3
+  mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa"
+}
+
+# Import the coverage
+  clusterFiles <- list.files(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters"), pattern="*.coverage")
+  clusterCoverage <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[1]))
+  names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1])
+  
+  for(i in 2:length(clusterFiles)){
+    tmp <- read.table(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters", clusterFiles[i]))
+    names(tmp)[1:2] <- c("cluster", clusterFiles[i])
+    clusterCoverage <- merge(clusterCoverage, tmp, by="cluster")
+  }
+
+# Get the total coverage
+  totalCoverage <- apply(clusterCoverage[,-1],1,sum)
+
+# Filter the coverage
+  clusterCoverage.refined <- clusterCoverage[totalCoverage>minTotalReadCoverage,]
+  clusterCoverage.refined <- clusterCoverage.refined[apply(clusterCoverage.refined[,-1]>0,1,sum)>=minSampleCoverage,]
+
+# Import the initial mock clusters
+  mockClusters <- importFA(file.path(mockClusters.file))
+  
+# Refine the mock clusters
+  mockClusters.refined <- mockClusters[is.element(gsub(">","",names(mockClusters)), clusterCoverage.refined[,1])]
+  
+# Export the refined mock reference
+  exportFA(mockClusters.refined, file=mockClusters.refined.file)
+  
\ No newline at end of file
diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 6e7f85c..c04af5e 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -679,7 +679,7 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"),
                y=file.path(projFolder,"QC","TRIMMED","multiqc_R1"),
                feature="fastqc_per_base_sequence_quality_plot")
 ```
-# Mock Reference
+# Preliminary Mock Reference
 
 ## Basic stats
 ```{r import required files, warning=FALSE, }
@@ -843,6 +843,22 @@ barplot(outQual, ylab="Frequency", main="Quality-score distribution")
 ## Ratio samles used to length Mock reference
 How much were the reads merged down? Length of Mock reference vs the length of all used reads together.
 
+# Mock reference
+
+## Basic stats
+```{r import required files, warning=FALSE, }
+finalMockClusters <- importFA(file.path(projFolder,"MockReference","MockReference.fa"))
+```
+
+Basic information for the final mock reference:
+```{r mock build stats}
+  out <- summary(finalMockClusters)
+  out[,2] <- formatC(out[,2],  format="d", big.mark=",")
+  out_html <- knitr::kable(out, col.names = NULL, "html")
+  kable_styling(out_html, "striped", position = "left")
+```
+
+
 # Existing Reference
 In case a reference genome was provided, here are a few statistics regarding it
 
@@ -869,7 +885,7 @@ if(refAvail){
 ## Mock vs reference
 
 These are the flagstats from mapping the mock clusters against an existing reference genome. Typical flag stats are:
-* 0: Read aligned aginst forward strand
+* 0: Read aligned against forward strand
 * 16: Read aligned against reverse strand
 * 2048: Supplementary alignment forward
 * 2064: Supplementary alignment reverse
@@ -877,11 +893,7 @@ These are the flagstats from mapping the mock clusters against an existing refer
 
 ```{r samflags}
 if(refAvail){
-<<<<<<< HEAD
-samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref","mockToRef.sam.samflags"))
-=======
 samFlagstats <- read.table(file.path(projFolder,"BAM", "Mockref", "mockToRef.sam.samflags"))
->>>>>>> 890fdad704ef6963b6091df7c6746a2bdac03490
 colnames(samFlagstats) <- c("Freq.", "Flag")
 totalLoci <- samFlagstats$V1[samFlagstats$V2==0] + samFlagstats$V1[samFlagstats$V2==16]
 out_html <- knitr::kable(samFlagstats, "html")
@@ -906,7 +918,7 @@ totalBases <- sum(abs(mergedLoci$V3 - mergedLoci$V2))
 ```
 
 However, in total we have `r nrow(unmergedLoci)` mapping loci that can be merged down (by intersection of the bed) to `r nrow(mergedLoci)` if their chromosomal location is intersected.
-Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be stricted merged.
+Please note, if the difference between these two numbers is too large, it indicates that the mock reference needs to be merged more strict.
 
 In total `r formatC(totalBases, big.mark=",")` bases are covered in the reference genome from mock cluster sequences.
 
@@ -1000,7 +1012,6 @@ datatable(tmp)
 
 WE COULD COUNT HERE STILL HOW MANY SEQUENCES MATCH THE GIVEN ONE WITH ONE SUBSTITUTION ALLOWED.
 
-
 # Alignment
 
 ## Mock Reference
@@ -1149,6 +1160,9 @@ for(i in 3:ncol(clusterCoverage)){
 ### Cluster coverage Mock samples vs others
 ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES
 
+## Refind Mock reference
+ADD THIS STILL TO THE REPORT!!!
+
 ## Reference genome
 
 Import the mapping statistics to the reference genome
diff --git a/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png b/scripts/workflow-report_files/figure-html/visualize concatenation reports-1.png
new file mode 100644
index 0000000..e69de29

From 74f85ae1911c960d0b13abcf7cf69e56c3c18322 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:29:31 +0200
Subject: [PATCH 04/34] Rule all rule simplified

---
 CHANGELOG        |  1 +
 GBS-pipeline.smk | 56 ++++++++++++++++++++++++------------------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 7289930..bf2f888 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.3 : * Rule all rule simplified
 0.5.2 : * Script to refine the mock reference added
 0.5.1 : * samtools index set to csi instead of bai
 0.4.1 : * Bugfix - Missing outputfile script added
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index a4b5a46..51cbd57 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.2
-version = "0.5.2"
+##### Version: 0.5.3
+version = "0.5.3"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -81,44 +81,44 @@ print("#########################################################################
 rule all:
     input:
       # OUTPUT: PREPARATION MODULE
-        expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
 #      # QC OF RAW AND CONCATENATED FILES
         "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]),
       # OUTPUT STEP 2
-        expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 2b
-        expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 4
-        "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]),
-        "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]),
-        "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]),
+#        "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]),
+#        "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]),
+#        "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]),
       # OUTPUT STEP 5
-        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-        expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+#        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+#        expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 6
-        "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]),
-        "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]),
-        expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples),
+#        "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]),
+#        "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]),
+#        expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 7
-        "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
-        "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+#        "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+#        "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
       # OUTPUT STEP 8  
-        "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]),
-        "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]),
-        "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
-        "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]),
+#        "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]),
+#        "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]),
+#        "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
+#        "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]),
       # OUTPUT STEP 9
-        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
+#        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         "%s/finalReport.html" % (config["project-folder"])
 

From 4e93d63f941e5d5cb9a2bb652ed6fc8e136b08cb Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:40:30 +0200
Subject: [PATCH 05/34] Index final mock reference added

---
 CHANGELOG                             |  1 +
 GBS-pipeline.smk                      |  4 +++-
 GBS-pipeline_server-config.yaml       |  9 +++++++++
 rules/Module0-PreparationsAndIndexing | 18 ++++++++++++++++++
 rules/Module3-MockReference           |  2 +-
 5 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index bf2f888..2c4c5ec 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.4 : * Index final mock reference added
 0.5.3 : * Rule all rule simplified
 0.5.2 : * Script to refine the mock reference added
 0.5.1 : * samtools index set to csi instead of bai
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 51cbd57..0ff6ced 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -120,7 +120,9 @@ rule all:
       # OUTPUT STEP 9
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
-        "%s/finalReport.html" % (config["project-folder"])
+        #"%s/finalReport.html" % (config["project-folder"]),
+        "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"])
+
 
 
 ### setup report #####
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index 6156a07..0ecaa48 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -46,6 +46,12 @@ IndexClustersSamtools:
     time: 02:00:00
     job-name:  IndexMockRefClusters_Samtools
     mem-per-cpu: 25000
+
+IndexFinalMockBWA:
+    time: 02:00:00
+    job-name:  IndexFinalMockBWA
+    mem-per-cpu: 25000
+
     
 # Module 1 - QC:
 ################################################################################
@@ -134,6 +140,9 @@ MockVsRef_SortedBamToMpileup:
     time: 05:00:00
     job-name: MockVsRef_SortedBamToMpileup
 
+refine_mock_reference:
+    job-name: refine_mock_reference
+
 # Module 4 - Read alignment
 ################################################################################   
 RefGenome_AlignReads:
diff --git a/rules/Module0-PreparationsAndIndexing b/rules/Module0-PreparationsAndIndexing
index bd35309..8b86958 100644
--- a/rules/Module0-PreparationsAndIndexing
+++ b/rules/Module0-PreparationsAndIndexing
@@ -142,3 +142,21 @@ rule IndexClustersSamtools:
     shell:"""
         samtools faidx {input}
   	"""
+  	
+rule IndexFinalMockBWA:
+    """
+    Index the final Mock Reference Genome (BWA).
+    """
+    input:
+        "%s/MockReference/MockReference.fa" % (config["project-folder"])
+    output:
+        "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"])
+    log:
+        "%s/logs/BWA/IndexFinalMock.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/BWA/IndexFinalMock.benchmark.tsv" % (config["project-folder"])
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        bwa index -a bwtsw {input}
+  	"""
diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference
index 754701d..544a92f 100644
--- a/rules/Module3-MockReference
+++ b/rules/Module3-MockReference
@@ -190,7 +190,7 @@ rule refine_mock_reference:
     input:
         clusters="%s/FASTQ/TRIMMED/GSC.MR.Clusters.fa" % (config["project-folder"]),
         script=config["refinement-script"],
-        c="%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"])
+        c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples)
     output:
         "%s/MockReference/MockReference.fa" % (config["project-folder"])
     log:

From 16fddbb4def509b72eacf5bdece72f2b84345288 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:43:14 +0200
Subject: [PATCH 06/34] Align reads to final mock reference added

---
 CHANGELOG                   |  1 -
 GBS-pipeline.smk            |  2 +-
 rules/Module4-ReadAlignment | 27 +++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 2c4c5ec..bf2f888 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,7 +4,6 @@
 #####                                                                      #####
 ################################################################################
 
-0.5.4 : * Index final mock reference added
 0.5.3 : * Rule all rule simplified
 0.5.2 : * Script to refine the mock reference added
 0.5.1 : * samtools index set to csi instead of bai
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 0ff6ced..d05b8ee 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -121,7 +121,7 @@ rule all:
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         #"%s/finalReport.html" % (config["project-folder"]),
-        "%s/MockReference/MockReference.fa.bwt" % (config["project-folder"])
+        expand("%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]), samples=samples)
 
 
 
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index 5d93159..a2f1063 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -278,3 +278,30 @@ rule MockRefClusters_AlignmentStats:
         samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c}
   	"""
 
+################################################################################
+##
+## Process the final Mock
+
+rule FinalMockRef_AlignReads:
+    """
+    Align reads to the final mock reference Genome.
+    """
+    input:
+        reference="%s/MockReference/MockReference.fa" % (config["project-folder"]),
+        refFiles="%s/MockReference/MockReference.fa.bwt" % (config["project-folder"]),
+        R1="%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]),
+        R2="%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]),
+    output:
+        temp("%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]))
+    log:
+        "%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/BWA/FinalMockRef_AlignReads.{samples}.benchmark.tsv" % (config["project-folder"])
+    params:
+        threads=config["params"]["step5b"]["threads"],
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        df -h &> {log}
+        bwa mem -t {params.threads} -M {input.reference} {input.R1} {input.R2} > {output} 2> {log}
+  	"""

From 53753e08865338244c0d20c7b9df48c0e66f8b62 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:48:39 +0200
Subject: [PATCH 07/34] sam to sorted bam for final mock alignments added

---
 GBS-pipeline.smk                |  2 +-
 GBS-pipeline_server-config.yaml | 14 ++++++++++++++
 rules/Module4-ReadAlignment     | 28 ++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index d05b8ee..dcd18ea 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -121,7 +121,7 @@ rule all:
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         #"%s/finalReport.html" % (config["project-folder"]),
-        expand("%s/logs/BWA/FinalMockRef_AlignReads.{samples}.log" % (config["project-folder"]), samples=samples)
+        expand("%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), samples=samples)
 
 
 
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index 0ecaa48..571b19c 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -209,6 +209,20 @@ MockRefClusters_AlignmentStats:
     cpus-per-task: 20
     mem-per-cpu: 5000
 
+FinalMockRef_AlignReads:
+    time: 01:00:00
+    job-name:  FinalMockRef_AlignReads
+    cpus-per-task: 20
+    mem-per-cpu: 5000
+  
+FinalMockRef_SamToSortedBam:
+    time: 01:00:00
+    job-name: FinalMockRef_SamToSortedBam
+    cpus-per-task: 20
+    mem-per-cpu: 5000
+
+
+
 # Module 5 - Call Variants
 ################################################################################   
 
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index a2f1063..148c6c1 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -305,3 +305,31 @@ rule FinalMockRef_AlignReads:
         df -h &> {log}
         bwa mem -t {params.threads} -M {input.reference} {input.R1} {input.R2} > {output} 2> {log}
   	"""
+  	
+rule FinalMockRef_SamToSortedBam:
+    """
+    Sam to sorted bam for final mock reference (SAMTOOLS)
+    """
+    input:
+        "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"])
+    output:
+        bam=temp("%s/BAM/alignments_finalMock/{samples}.bam" % (config["project-folder"])),
+        sorted="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"])
+    log:
+        "%s/logs/Samtools/FinalMockRef_SamToSortedBam.{samples}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Samtools/FinalMockRef_SamToSortedBam.{samples}.benchmark.tsv" % (config["project-folder"])
+    params:
+        threads=config["params"]["step5d"]["threads"],
+        q=config["params"]["step5d"]["q"],
+        f=config["params"]["step5d"]["f"],
+        F=config["params"]["step5d"]["F"]
+    conda:"envs/samtools.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        df -h &> {log}
+        samtools view -b -q {params.q} -f {params.f} -F {params.F} {input} > {output.bam}
+        samtools sort {output.bam} -o {output.sorted}
+        samtools index -c {output.sorted}
+  	"""
+

From 745fb7f281d54d85461403b93af873ecefb34064 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:52:57 +0200
Subject: [PATCH 08/34] final mock mpileup added

---
 GBS-pipeline.smk            |  2 +-
 rules/Module4-ReadAlignment | 42 +++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index dcd18ea..d09d355 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -121,7 +121,7 @@ rule all:
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         #"%s/finalReport.html" % (config["project-folder"]),
-        expand("%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]), samples=samples)
+        expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples)
 
 
 
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index 148c6c1..cd0669c 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -333,3 +333,45 @@ rule FinalMockRef_SamToSortedBam:
         samtools index -c {output.sorted}
   	"""
 
+rule FinalMockRef_SortedBamToMpileup:
+    """
+    Get Mpileup for final Mock Reference Genome.
+    """
+    input:
+        bam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"]),
+        reference="%s/MockReference/MockReference.fa" % (config["project-folder"])
+    output:
+        bam="%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"])
+    log:
+        "%s/logs/Samtoools/FinalMockRef_SortedBamToMpileup.{samples}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Samtools/FinalMockRef_SortedBamToMpileup.{samples}.benchmark.tsv" % (config["project-folder"])
+    params:
+        threads=config["params"]["step5d"]["threads"],
+        Q=config["params"]["step5d"]["Q"],
+        q=config["params"]["step5d"]["q"]
+    conda:"envs/samtools.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        df -h &> {log}
+        samtools mpileup -Q {params.Q} -q {params.q} -B -C 50 -f {input.reference} {input.bam} > {output}
+  	"""
+
+
+rule finalMockRef_AlignmentStats:
+    """
+    Alignment stats for reads mapped to mock ref
+    """
+    input:
+        "%s/FASTQ/TRIMMED/alignments/{samples}.sam" % (config["project-folder"])
+    output:
+        "%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"])
+    log:
+        "%s/logs/Samtools/MockRef_AlignmentStats.{samples}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Samtools/MockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"])
+    conda:"envs/samtools.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        samtools flagstat {input} > {output}
+  	"""

From ecccca781a220a29dd26621ff0fbe2b29bf9aeac Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:55:33 +0200
Subject: [PATCH 09/34] final mock alignment stats added

---
 GBS-pipeline.smk            |  2 +-
 rules/Module4-ReadAlignment | 10 +++++-----
 rules/Module7-Reporting     |  3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index d09d355..5680983 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -120,7 +120,7 @@ rule all:
       # OUTPUT STEP 9
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
-        #"%s/finalReport.html" % (config["project-folder"]),
+        "%s/finalReport.html" % (config["project-folder"]),
         expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples)
 
 
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index cd0669c..68c52a9 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -360,16 +360,16 @@ rule FinalMockRef_SortedBamToMpileup:
 
 rule finalMockRef_AlignmentStats:
     """
-    Alignment stats for reads mapped to mock ref
+    Alignment stats for reads mapped to final mock ref
     """
     input:
-        "%s/FASTQ/TRIMMED/alignments/{samples}.sam" % (config["project-folder"])
+        "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"])
     output:
-        "%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"])
+        "%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"])
     log:
-        "%s/logs/Samtools/MockRef_AlignmentStats.{samples}.log" % (config["project-folder"])
+        "%s/logs/Samtools/FinalMockRef_AlignmentStats.{samples}.log" % (config["project-folder"])
     benchmark:
-        "%s/benchmark/Samtools/MockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"])
+        "%s/benchmark/Samtools/FinalMockRef_AlignmentStats.{samples}.benchmark.tsv" % (config["project-folder"])
     conda:"envs/samtools.yaml"
     singularity: config["singularity"]["gbs"]
     shell:"""
diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting
index 666f836..414469a 100644
--- a/rules/Module7-Reporting
+++ b/rules/Module7-Reporting
@@ -14,7 +14,8 @@ rule R_finalReport:
         faRef="%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]),
         qdRaw=expand("%s/QC/RAW/{rawsamples}_R1_qualdist.txt" % (config["project-folder"]), rawsamples=rawsamples),
         qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
-        qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples)
+        qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
+        fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples)
     output:
         "%s/finalReport.html" % (config["project-folder"])
     log:

From 7738ff6fd75facf59d4446b4d407cf06ecad94f1 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 06:57:41 +0200
Subject: [PATCH 10/34] final mock sam index added

---
 rules/Module0-PreparationsAndIndexing | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/rules/Module0-PreparationsAndIndexing b/rules/Module0-PreparationsAndIndexing
index 8b86958..a98c567 100644
--- a/rules/Module0-PreparationsAndIndexing
+++ b/rules/Module0-PreparationsAndIndexing
@@ -160,3 +160,21 @@ rule IndexFinalMockBWA:
     shell:"""
         bwa index -a bwtsw {input}
   	"""
+
+rule IndexFinalMockSamtools:
+    """
+    Index the final Mock Reference Genome Clusters (samtools).
+    """
+    input:
+        "%s/MockReference/MockReference.fa" % (config["project-folder"])
+    output:
+        "%s/MockReference/MockReference.fai" % (config["project-folder"])
+    log:
+        "%s/logs/Samtools/IndexFinalMockSam.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Samtools/IndexFinalMockSam.benchmark.tsv" % (config["project-folder"])
+    conda:"envs/samtools.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        samtools faidx {input}
+  	"""
\ No newline at end of file

From fd438cc603651b0ceb0bde240f597fab156acb8f Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:03:14 +0200
Subject: [PATCH 11/34] Create count files for final mock

---
 GBS-pipeline.smk                |  2 +-
 GBS-pipeline_server-config.yaml | 21 ++++++++++++++++++++-
 rules/Module5-CallVariants      | 29 +++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 5680983..238cebc 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -121,7 +121,7 @@ rule all:
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         "%s/finalReport.html" % (config["project-folder"]),
-        expand("%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"]), samples=samples)
+        expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples)
 
 
 
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index 571b19c..86a6982 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -52,7 +52,11 @@ IndexFinalMockBWA:
     job-name:  IndexFinalMockBWA
     mem-per-cpu: 25000
 
-    
+IndexFinalMockSamtools:
+    time: 02:00:00
+    job-name:  IndexFinalMockSamtools
+    mem-per-cpu: 25000
+
 # Module 1 - QC:
 ################################################################################
 fastqc_quality_control_raw_data:
@@ -221,7 +225,17 @@ FinalMockRef_SamToSortedBam:
     cpus-per-task: 20
     mem-per-cpu: 5000
 
+FinalMockRef_SortedBamToMpileup:
+    time: 01:00:00
+    job-name:  FinalMockRef_SortedBamToMpileup
+    cpus-per-task: 20
+    mem-per-cpu: 5000
 
+finalMockRef_AlignmentStats:
+    time: 01:00:00
+    job-name:  FinalMockRef_AlignmentStats
+    cpus-per-task: 20
+    mem-per-cpu: 5000
 
 # Module 5 - Call Variants
 ################################################################################   
@@ -298,6 +312,11 @@ CreateVCF_reference:
     job-name:  CreateVCF_reference
     mem-per-cpu: 30000
 
+ParseMpileup_createCountFiles_finalMock:
+    time: 05:00:00
+    job-name:  createCountFiles_finalmock
+    mem-per-cpu: 16000
+
 # Module 6 - Postprocessing
 ################################################################################
 Ref_getVariantFlanking:
diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index c9b96dd..de08791 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -421,3 +421,32 @@ else:
             perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log}
             cd ../..
       	"""
+
+################################################################################
+##
+## Process the final mock
+
+rule ParseMpileup_createCountFiles_finalMock:
+    """
+    Parse mpileup outputs and create count/ref files for final mock
+    """
+    input:
+        "%s/MPILEUP/mpileup_finalMock/{samples}.mpileup" % (config["project-folder"])
+    output:
+        co="%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]),
+        ref="%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"])
+    params:
+        p=config["params"]["step6"]["p"],
+        wd="%s/FASTQ/TRIMMED" % config["project-folder"],
+        pipefolder=config["pipeline-folder"]
+    log:
+        "%s/logs/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.benchmark.tsv" % (config["project-folder"])
+    threads: 1
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        cd {params.wd}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_1.pl -b {input} -p {params.p} &> {log}
+  	"""
\ No newline at end of file

From a4d75f1446f94517fd7bfefc82103d325040e186 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:11:15 +0200
Subject: [PATCH 12/34] merge counts aded

---
 GBS-pipeline.smk                |  3 ++-
 GBS-pipeline_server-config.yaml |  6 ++++++
 rules/Module5-CallVariants      | 26 +++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 238cebc..46458a1 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -121,7 +121,8 @@ rule all:
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         "%s/finalReport.html" % (config["project-folder"]),
-        expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples)
+      #  expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples)
+        "%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"])
 
 
 
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index 86a6982..c927baa 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -316,6 +316,12 @@ ParseMpileup_createCountFiles_finalMock:
     time: 05:00:00
     job-name:  createCountFiles_finalmock
     mem-per-cpu: 16000
+    
+create_verticalRef_finalMock:
+    time: 01:00:00
+    job-name:  create_verticalRef_finalMock
+    mem-per-cpu: 16000
+    nvme: 20
 
 # Module 6 - Postprocessing
 ################################################################################
diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index de08791..6e0c617 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -449,4 +449,28 @@ rule ParseMpileup_createCountFiles_finalMock:
     shell:"""
         cd {params.wd}
         perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_1.pl -b {input} -p {params.p} &> {log}
-  	"""
\ No newline at end of file
+  	"""
+  	
+rule create_verticalRef_finalMock:
+    """
+    Merge the separate ref files for final mock
+    """
+    input:
+        co=expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples),
+        ref=expand("%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"]), samples=samples)
+    output:
+       vref="%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"]),
+       co="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"])
+    log:
+       "%s/logs/BASH/create_verticalRef_finalMock.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/BASH/create_verticalRef_finalMock.benchmark.tsv" % (config["project-folder"])
+    params:
+        wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"],
+    threads: 1
+    shell:"""
+        sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp;
+        sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref};
+        #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co}
+        ls {params.wd}/*count.txt > {output.co}
+    """
\ No newline at end of file

From 2060af19a8c4e67baa19a26e8f44994cb7de2dc8 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:35:28 +0200
Subject: [PATCH 13/34] Variant calling added

---
 GBS-pipeline.smk                |   2 +-
 GBS-pipeline_server-config.yaml |  26 ++++++
 rules/Module5-CallVariants      | 154 +++++++++++++++++++++++++++++++-
 3 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 46458a1..528192c 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -122,7 +122,7 @@ rule all:
       # Quality check
         "%s/finalReport.html" % (config["project-folder"]),
       #  expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples)
-        "%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"])
+        "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
 
 
 
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index c927baa..4701d55 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -268,6 +268,9 @@ cut_verticalRef:
 cut_verticalRef_reference:
     job-name: cut_verticalRef_reference
 
+cut_verticalRef_finalMock:
+    job-name: cut_verticalRef_finalMock
+
 create_MasterMatrix_parallel:
     time: 0-12:00:00
     job-name:  create_MasterMatrix_parallel
@@ -280,6 +283,13 @@ create_MasterMatrix_parallel_reference:
     mem-per-cpu: 64000
     nvme: 20
 
+create_MasterMatrix_parallel_finalMock:
+    time: 0-12:00:00
+    job-name:  create_MasterMatrix_parallel_finalMock
+    mem-per-cpu: 64000
+    nvme: 20
+
+
 aggregate_MasterMatrix:
     job-name: aggregate_MasterMatrix
     time: 0-12:00:00
@@ -291,6 +301,12 @@ aggregate_MasterMatrix_reference:
     time: 0-12:00:00
     mem-per-cpu: 48000
     nvme: 20
+    
+aggregate_MasterMatrix_finalMock:
+    job-name: aggregate_MasterMatrix_finalMock
+    time: 0-12:00:00
+    mem-per-cpu: 48000
+    nvme: 20
 
 FilterVariants:
     time: 1-12:00:00
@@ -302,6 +318,11 @@ FilterVariants_reference:
     job-name:  filterVariants_reference
     mem-per-cpu: 30000
     
+FilterVariants_finalMock:
+    time: 1-12:00:00
+    job-name:  filterVariants_finalMock
+    mem-per-cpu: 30000
+ 
 CreateVCF:
     time: 01:00:00
     job-name: CreateVCF
@@ -312,6 +333,11 @@ CreateVCF_reference:
     job-name:  CreateVCF_reference
     mem-per-cpu: 30000
 
+CreateVCF_finalMock:
+    time: 01:00:00
+    job-name:  CreateVCF_finalMock
+    mem-per-cpu: 30000
+
 ParseMpileup_createCountFiles_finalMock:
     time: 05:00:00
     job-name:  createCountFiles_finalmock
diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index 6e0c617..208adcf 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -394,7 +394,7 @@ else:
             filtered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
             unfiltered="%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"])
         output:
-            "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"])
+            "%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"])
         params:
             barcodes=config["barcodes"],
             genoMatrix=config["params"]["step8"]["genoMatrix"],
@@ -402,6 +402,8 @@ else:
             unfilteredOut=config["params"]["step8"]["unfiltered"],
             format=config["params"]["step8"]["format"],
             wd="%s/MPILEUP/mpileup_reference" % config["project-folder"],
+            outdir="%s/VCF" % config["project-folder"],
+            pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]),
             pipefolder=config["pipeline-folder"]
         log:
             "%s/logs/Perl/CreateVCF_reference.log" % (config["project-folder"])
@@ -419,6 +421,7 @@ else:
             cd {params.wd}
             perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log}
             perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log}
+            cp {params.pipeout} {output}
             cd ../..
       	"""
 
@@ -473,4 +476,151 @@ rule create_verticalRef_finalMock:
         sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref};
         #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co}
         ls {params.wd}/*count.txt > {output.co}
-    """
\ No newline at end of file
+    """
+
+checkpoint cut_verticalRef_finalMock:
+    """
+    Divide the input verticalRef-file for parallel processing in finalMock
+    """
+    input:
+       "%s/MPILEUP/mpileup_finalMock/VerticalRefPos.txt" % (config["project-folder"])
+    output:
+        directory("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/" % (config["project-folder"]))
+    params:
+        out="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos." % (config["project-folder"]),
+        split=1000000
+    shell:"""
+        mkdir -p {output}
+        split -l {params.split} --numeric-suffixes {input} {params.out}
+    """
+    
+rule create_MasterMatrix_parallel_finalMock:
+    """
+    Process verticalRef parallel to create MasterMatrix for final mock
+    """
+    input:
+        verRef="%s/MPILEUP/mpileup_finalMock/VerticalRefPos/VerticalRefPos.{i}" % (config["project-folder"]),
+        counts="%s/MPILEUP/mpileup_finalMock/CountFileList.txt" % (config["project-folder"])
+    output:
+        "%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"])
+    log:
+        "%s/logs/BASH/create_MasterMatrix_parallel_finalMock_{i}.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/BASH/create_MasterMatrix_parallel_finalMock.benchmark_{i}.tsv" % (config["project-folder"])
+    params:
+        p=config["params"]["step6"]["p"],
+        wd="%s/MPILEUP/mpileup_finalMock/VerticalRefPos" % config["project-folder"],
+        pipefolder=config["pipeline-folder"]
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        cd {params.wd}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-6_2.pl -in {input.verRef} -count {input.counts} -out {output} -p {params.p} &> {log}
+     """
+     
+def aggregate_inputMasterMatrix_finalMock(wildcards):
+    """
+    Aggregate the input object for the final MasterMatrix for finalMock
+    """
+    checkpoint_outputMM = checkpoints.cut_verticalRef_finalMock.get(**wildcards).output[0]
+    return expand("%s/MPILEUP/mpileup_finalMock/VerticalRefPos/GSC.MasterMatrix_{i}.tsv" % (config["project-folder"]),
+                  i=glob_wildcards(os.path.join(checkpoint_outputMM, "VerticalRefPos.{i}")).i)   
+                  
+rule aggregate_MasterMatrix_finalMock:
+    input:
+        aggregate_inputMasterMatrix_finalMock
+    output:
+        "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"])
+    shell:"""
+        cat {input} > {output}
+    """    
+    
+rule FilterVariants_finalMock:
+    """
+    Filter variants and call genotypes using the final mock.
+    """
+    input:
+        "%s/MPILEUP/mpileup_finalMock/GSC.MasterMatrix.txt" % (config["project-folder"])
+    output:
+        filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+        unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"])
+    params:
+        input=config["params"]["step7"]["input"],
+        out=config["params"]["step7"]["out"],
+        unfiltered=config["params"]["step7"]["unfiltered"],
+        p=config["params"]["step7"]["p"],
+        mnHoDepth0=config["params"]["step7"]["mnHoDepth0"],
+        mnHoDepth1=config["params"]["step7"]["mnHoDepth1"],
+        mnHetDepth=config["params"]["step7"]["mnHetDepth"],
+        altStrength=config["params"]["step7"]["altStrength"],
+        mnAlleleRatio=config["params"]["step7"]["mnAlleleRatio"],
+        mnCall=config["params"]["step7"]["mnCall"],
+        mnAvgDepth=config["params"]["step7"]["mnAvgDepth"],
+        mxAvgDepth=config["params"]["step7"]["mxAvgDepth"],
+        wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"],
+        pipefolder=config["pipeline-folder"]
+    log:
+        "%s/logs/Perl/FilterVariants_finalMock.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Perl/FilterVariants_finalMock.benchmark.tsv" % (config["project-folder"])
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        #******PARAMETERS*****
+        # -in 	Discovery master matrix input file. The output from last step 	File 	GSC.MasterMatrix.txt
+        # -out 	Genotyping matrix for the population 	Output file 	GSC.GenoMatrix.txt
+        # -p 	Identify SNPs only (snp) or SNPs + indels (indel) 	String 	--
+        # -mnHoDepth0 	Minimum depth required for calling a homozygote when the alternative allele depth = 0 	Numeric 	5
+        # -mnHoDepth1 	Minimum depth required for calling a homozygote when the alternative allele depth = 1 	Numeric 	20
+        # -mnHetDepth 	Minimum depth required for each allele when calling a heterozygote 	Numeric 	3
+        # -altStrength 	Across the population for a given putative bi-allelic variant, this alternate allele strength parameter is the minimum proportion of non-primary allele reads that are the secondary allele 	Numeric 	0.8
+        # -mnAlleleRatio 	Minimum required ratio of less frequent allele depth to more frequent allele depth 	Numeric 	0.25
+        # -mnCall 	Minimum acceptable proportion of genotyped individuals to retain a variant 	Numeric 	0.75
+        # -mnAvgDepth 	Minimum average depth of an acceptable variant 	Numeric 	3
+        # -mxAvgDepth 	Maximum average depth of an acceptable variant 	Numeric 	200
+    
+        cd {params.wd}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.out} -p {params.p} -mnHoDepth0 {params.mnHoDepth0} -mnHoDepth1 {params.mnHoDepth1} -mnHetDepth {params.mnHetDepth} -altStrength {params.altStrength} -mnAlleleRatio {params.mnAlleleRatio} -mnCall {params.mnCall} -mnAvgDepth {params.mnAvgDepth} -mxAvgDepth {params.mxAvgDepth} &> {log}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-7.pl -in {params.input} -out {params.unfiltered} -p {params.p} -mnHoDepth0 0 -mnHoDepth1 0 -mnHetDepth 0 -altStrength 0 -mnAlleleRatio 0 -mnCall 0 -mnAvgDepth 0 -mxAvgDepth 1000000 &> {log}
+    """
+    
+rule CreateVCF_finalMock:
+    """
+    Create the VCF output of the pipeline for the final mock.
+    """
+    input:
+        filtered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+        unfiltered="%s/MPILEUP/mpileup_finalMock/variants/GSC.GenoMatrix.unfiltered.txt" % (config["project-folder"])
+    output:
+        "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
+    params:
+        barcodes=config["barcodes"],
+        genoMatrix=config["params"]["step8"]["genoMatrix"],
+        out=config["params"]["step8"]["out"],
+        unfilteredOut=config["params"]["step8"]["unfiltered"],
+        format=config["params"]["step8"]["format"],
+        wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"],
+        outdir="%s/VCF" % config["project-folder"],
+        pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]),
+        pipefolder=config["pipeline-folder"]
+    log:
+        "%s/logs/Perl/CreateVCF_finalMock.log" % (config["project-folder"])
+    benchmark:
+        "%s/benchmark/Perl/CreateVCF_finalMock.benchmark.tsv" % (config["project-folder"])
+    conda:"envs/gbs.yaml"
+    singularity: config["singularity"]["gbs"]
+    shell:"""
+        #******PARAMETERS*****
+        # -in 	Final genotyping matrix input file 	File 	GSC.GenoMatrix.txt
+        # -out 	Output label name without extension 	String 	GSC
+        # -b 	BarcodeID file name See Appendix A 	File 	--
+        # -formats 	The name(s) (R, Tassel, Plink, vcf, H) for which the GBS-SNP-CROP final genotyping matrix should be converted. If more than one format is desired, the names should be separated by commas without any space, as in the examples shown below 	String 	R,T,P,V,H
+     
+        mkdir -p {params.outdir}
+        cd {params.wd}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.filtered} -out {params.out} -b {params.barcodes} -formats {params.format} &> {log}
+        perl {params.pipefolder}/scripts/GBS-SNP-CROP-8.pl -in {input.unfiltered} -out {params.unfilteredOut} -b {params.barcodes} -formats {params.format} &> {log}
+        
+        cp {params.pipeout} {output}
+        cd ../..
+  	"""
\ No newline at end of file

From 10fdf79d8680adc1d68cc0facce3773fae7616bc Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:37:57 +0200
Subject: [PATCH 14/34] main rule sompified again

---
 GBS-pipeline.smk        | 4 +---
 rules/Module7-Reporting | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 528192c..65147fc 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -120,9 +120,7 @@ rule all:
       # OUTPUT STEP 9
 #        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
-        "%s/finalReport.html" % (config["project-folder"]),
-      #  expand("%s/MPILEUP/mpileup_finalMock/{samples}.count.txt" % (config["project-folder"]), samples=samples)
-        "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
+        "%s/finalReport.html" % (config["project-folder"])
 
 
 
diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting
index 414469a..10d915b 100644
--- a/rules/Module7-Reporting
+++ b/rules/Module7-Reporting
@@ -15,7 +15,8 @@ rule R_finalReport:
         qdRaw=expand("%s/QC/RAW/{rawsamples}_R1_qualdist.txt" % (config["project-folder"]), rawsamples=rawsamples),
         qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
         qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
-        fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples)
+        fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"])
     output:
         "%s/finalReport.html" % (config["project-folder"])
     log:

From 85f44992412719ab7c2fe57f63abd76ba0579850 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:38:48 +0200
Subject: [PATCH 15/34] Variant calling for final mock added

---
 CHANGELOG        | 1 +
 GBS-pipeline.smk | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index bf2f888..b34b498 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.4 : * Variant calling for final mock added
 0.5.3 : * Rule all rule simplified
 0.5.2 : * Script to refine the mock reference added
 0.5.1 : * samtools index set to csi instead of bai
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 65147fc..e7eae19 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.3
-version = "0.5.3"
+##### Version: 0.5.4
+version = "0.5.4"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -68,7 +68,7 @@ print("##### barcodes-file  : "+ config["barcodes"])
 print("##### rawsample file : "+ config["rawsamples"])
 print("#####")
 print("##### Derived runtime parameters")
-print("##### --------------------------------")
+print("##### --------------------------------")Variant calling for final mock added
 print("##### BWA-Genome index  : "+config["genome-bwa-index"])
 print("##### STAR-Genome index : "+config["genome-star-index"])
 print("##### Adapter file      : "+ config["adapter"])

From 8a56a38ae797d2403c4708ba34410020a9994567 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:41:58 +0200
Subject: [PATCH 16/34] Missing library dependency added

---
 GBS-pipeline.smk              |  2 +-
 scripts/refineMockReference.R | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index e7eae19..7261930 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -68,7 +68,7 @@ print("##### barcodes-file  : "+ config["barcodes"])
 print("##### rawsample file : "+ config["rawsamples"])
 print("#####")
 print("##### Derived runtime parameters")
-print("##### --------------------------------")Variant calling for final mock added
+print("##### --------------------------------")
 print("##### BWA-Genome index  : "+config["genome-bwa-index"])
 print("##### STAR-Genome index : "+config["genome-star-index"])
 print("##### Adapter file      : "+ config["adapter"])
diff --git a/scripts/refineMockReference.R b/scripts/refineMockReference.R
index 217a2cf..4e97ff2 100644
--- a/scripts/refineMockReference.R
+++ b/scripts/refineMockReference.R
@@ -1,11 +1,14 @@
+# Load required libraries
+  library("GenomicTools")
+
 # This script imports the mock reference coverage values, analyses it and then created a final, refined mock reference file
-if(!is.element("snakemake",ls())){
-  mockClusters.file <- "GSC.MR.Clusters.fa"
-  projFolder <- "/scratch/project_2001746/BSF"
-  minTotalReadCoverage <- 25
-  minSampleCoverage <- 3
-  mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa"
-}
+  if(!is.element("snakemake",ls())){
+    mockClusters.file <- "GSC.MR.Clusters.fa"
+    projFolder <- "/scratch/project_2001746/BSF"
+    minTotalReadCoverage <- 25
+    minSampleCoverage <- 3
+    mockClusters.refined.file <- "/scratch/project_2001746/BSF/MockReference/MockReference.fa"
+  }
 
 # Import the coverage
   clusterFiles <- list.files(file.path(projFolder, "FASTQ", "TRIMMED", "alignments_clusters"), pattern="*.coverage")

From 350c1bb2ba951dbef04f7c39864f025bb2638e0d Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 23 Feb 2021 07:59:58 +0200
Subject: [PATCH 17/34] typo fixed

---
 rules/Module5-CallVariants | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index 208adcf..abfec49 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -440,7 +440,7 @@ rule ParseMpileup_createCountFiles_finalMock:
         ref="%s/MPILEUP/mpileup_finalMock/{samples}.ref.txt" % (config["project-folder"])
     params:
         p=config["params"]["step6"]["p"],
-        wd="%s/FASTQ/TRIMMED" % config["project-folder"],
+        wd="%s/MPILEUP/mpileup_finalMock" % config["project-folder"],
         pipefolder=config["pipeline-folder"]
     log:
         "%s/logs/Perl/ParseMpileup_createCountFiles_finalMock.{samples}.log" % (config["project-folder"])

From 442d5535c85e31edc9d6a86d9fc3b47d5576f329 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Thu, 25 Feb 2021 20:53:18 +0200
Subject: [PATCH 18/34] asd

---
 scripts/workflow-report.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index c04af5e..23970a9 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -846,12 +846,12 @@ How much were the reads merged down? Length of Mock reference vs the length of a
 # Mock reference
 
 ## Basic stats
-```{r import required files, warning=FALSE, }
+```{r import required files mock, warning=FALSE, }
 finalMockClusters <- importFA(file.path(projFolder,"MockReference","MockReference.fa"))
 ```
 
 Basic information for the final mock reference:
-```{r mock build stats}
+```{r final mock build stats}
   out <- summary(finalMockClusters)
   out[,2] <- formatC(out[,2],  format="d", big.mark=",")
   out_html <- knitr::kable(out, col.names = NULL, "html")

From 4fbe5aeb5637db6002748b37914f4a8f32fe49d4 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Mon, 1 Mar 2021 15:40:21 +0200
Subject: [PATCH 19/34] Bugfix - wrong reference vcf address in module 6

---
 CHANGELOG                    | 1 +
 GBS-pipeline.smk             | 4 ++--
 rules/Module6-PostProcessing | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index b34b498..3e0766d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.5 : * Bugfix - wrong reference vcf address in module 6
 0.5.4 : * Variant calling for final mock added
 0.5.3 : * Rule all rule simplified
 0.5.2 : * Script to refine the mock reference added
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 7261930..c515f5f 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.4
-version = "0.5.4"
+##### Version: 0.5.5
+version = "0.5.5"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/rules/Module6-PostProcessing b/rules/Module6-PostProcessing
index 5a4c6c9..285e32c 100644
--- a/rules/Module6-PostProcessing
+++ b/rules/Module6-PostProcessing
@@ -33,7 +33,7 @@ else:
         Get the flanking sequences from identified variants for reference genome.
         """
         input:
-            vcf="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
+            vcf="%s/VCF/FinalSetVariants_referenceGenome.vcf" % (config["project-folder"]),
             genome=config["genome"]
         output:
             fl="%s/MPILEUP/mpileup_reference/GSC.vcf.flanking" % (config["project-folder"]),

From fb09f2ffae6c4d06af4baaf65ac39c3fc80db9bb Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 2 Mar 2021 09:51:36 +0200
Subject: [PATCH 20/34] Bugfix - wrong path for reference based vcf fixed

---
 CHANGELOG                  | 1 +
 GBS-pipeline.smk           | 4 ++--
 rules/Module5-CallVariants | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 3e0766d..6aa052c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.6 : * Bugfix - wrong path for reference based vcf fixed
 0.5.5 : * Bugfix - wrong reference vcf address in module 6
 0.5.4 : * Variant calling for final mock added
 0.5.3 : * Rule all rule simplified
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index c515f5f..65fedb0 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.5
-version = "0.5.5"
+##### Version: 0.5.6
+version = "0.5.6"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index abfec49..e01a63b 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -403,7 +403,7 @@ else:
             format=config["params"]["step8"]["format"],
             wd="%s/MPILEUP/mpileup_reference" % config["project-folder"],
             outdir="%s/VCF" % config["project-folder"],
-            pipeout="%s/MPILEUP/mpileup_finalMock/GSC.vcf" % (config["project-folder"]),
+            pipeout="%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
             pipefolder=config["pipeline-folder"]
         log:
             "%s/logs/Perl/CreateVCF_reference.log" % (config["project-folder"])

From 6d04c708aff2fe360628315688bbf748f555de14 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Wed, 3 Mar 2021 09:08:31 +0200
Subject: [PATCH 21/34] Dependencies between the rules improved

---
 CHANGELOG               |  1 +
 GBS-pipeline.smk        | 60 +++++++++++++++++++++--------------------
 rules/Module7-Reporting |  7 ++++-
 3 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 6aa052c..aa31f0e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.7 : * Dependencies between the rules improved
 0.5.6 : * Bugfix - wrong path for reference based vcf fixed
 0.5.5 : * Bugfix - wrong reference vcf address in module 6
 0.5.4 : * Variant calling for final mock added
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 65fedb0..cee143e 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.6
-version = "0.5.6"
+##### Version: 0.5.7
+version = "0.5.7"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -81,45 +81,47 @@ print("#########################################################################
 rule all:
     input:
       # OUTPUT: PREPARATION MODULE
-#        expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
-#      # QC OF RAW AND CONCATENATED FILES
+        expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
+      # QC OF RAW AND CONCATENATED FILES
         "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]),
       # OUTPUT STEP 2
-#        expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 2b
-#        expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 4
-#        "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]),
-#        "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]),
-#        "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]),
+        "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]),
+        "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]),
+        "%s/MPILEUP/mpileup_mockToRef/mockToRef.mpileup" % (config["project-folder"]),
       # OUTPUT STEP 5
-#        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
-#        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-#        expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sorted.bam" % (config["project-folder"]), samples=samples),
+        expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        expand("%s/MPILEUP/mpileup_reference/{samples}.mpileup" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 6
-#        "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]),
-#        "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]),
-#        expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples),
+        "%s/FASTQ/TRIMMED/GSC.MasterMatrix.txt" % (config["project-folder"]),
+        "%s/MPILEUP/mpileup_reference/GSC.MasterMatrix.txt" % (config["project-folder"]),
+        expand("%s/FASTQ/TRIMMED/{samples}.count.txt" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 7
-#        "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
-#        "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+        "%s/FASTQ/TRIMMED/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
+        "%s/MPILEUP/mpileup_reference/variants/GSC.GenoMatrix.txt" % (config["project-folder"]),
       # OUTPUT STEP 8  
-#        "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]),
-#        "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]),
-#        "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
-#        "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]),
+        "%s/FASTQ/TRIMMED/GSC.vcf" % (config["project-folder"]),
+        "%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]),
+        "%s/MPILEUP/mpileup_reference/GSC.vcf" % (config["project-folder"]),
+        "%s/MPILEUP/mpileup_reference/GSC.vcf.fa" % (config["project-folder"]),
       # OUTPUT STEP 9
-#        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
+        "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
+        expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
         "%s/finalReport.html" % (config["project-folder"])
 
 
diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting
index 10d915b..7c665de 100644
--- a/rules/Module7-Reporting
+++ b/rules/Module7-Reporting
@@ -8,6 +8,9 @@ rule R_finalReport:
         script=config["report-script"],
         fa="%s/FASTQ/TRIMMED/GSC.vcf.fa" % (config["project-folder"]),
         mockToRef="%s/BAM/Mockref/mockToRef.sorted.bam" % (config["project-folder"]),
+        mockToRefSamflags="%s/BAM/Mockref/mockToRef.sam.samflags" % (config["project-folder"]), 
+        mockFlagstats=expand("%s/FASTQ/TRIMMED/alignments/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        c=expand("%s/FASTQ/TRIMMED/alignments_clusters/{samples}.coverage" % (config["project-folder"]), samples=samples),
         mqcraw="%s/QC/RAW/multiqc_R1/" % (config["project-folder"]),
         mqcconcatenated="%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]),
         mqctrimmed="%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]),
@@ -16,7 +19,9 @@ rule R_finalReport:
         qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
         qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
         fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-        finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"])
+        finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
+        fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples)
     output:
         "%s/finalReport.html" % (config["project-folder"])
     log:

From 12e944b772842d84e75cec1324125dc3dbdb0a4c Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Wed, 3 Mar 2021 18:24:25 +0200
Subject: [PATCH 22/34] Final mock reference ws not declared in output

---
 CHANGELOG                   |  1 +
 GBS-pipeline.smk            |  5 +-
 scripts/workflow-report.Rmd | 91 +++++++++++++++++++------------------
 3 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index aa31f0e..74c8571 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.8 : * Final mock reference was not declared in output
 0.5.7 : * Dependencies between the rules improved
 0.5.6 : * Bugfix - wrong path for reference based vcf fixed
 0.5.5 : * Bugfix - wrong reference vcf address in module 6
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index cee143e..cc6c6e5 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.7
-version = "0.5.7"
+##### Version: 0.5.8
+version = "0.5.8"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -121,6 +121,7 @@ rule all:
         "%s/BAM/mockVariantsToReference/mockVariantsToReference.bam" % (config["project-folder"]),
       # Quality check
         expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        "%s/MockReference/MockReference.fa" % (config["project-folder"]),
         "%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
         "%s/finalReport.html" % (config["project-folder"])
 
diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 23970a9..92d7391 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -1454,53 +1454,54 @@ accepted mistakes).
 
 ```{r}
 
-if(ncol(sampleInfo)>1){
-  mmCase <- 2
-  mmControl <- 5
-  vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",]
-  vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",]
-  
-  vcf.cases.table <- apply(vcf.cases,2,table)
-  vcf.control.table <- apply(vcf.control,2,table)
-  
-  nCases <- nrow(vcf.cases)
-  nControl <- nrow(vcf.control)
-  
-  candidateLociRef <- c()
-  candidateLociAlt <- c()
-  for(i in 1:length(vcf.cases.table)){
-    tmp <- vcf.cases.table[[i]]
-    homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
-    homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
-    if(length(homRef)==0) homRef <- 0
-    if(length(homAlt)==0) homAlt <- 0
-    if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i])
-    if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i])
-  }
-  
-  causativeRef <- c()
-  causativeAlt <- c()
-  for(i in 1:length(candidateLociRef)){
-    tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]]
-    homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
-    homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
-    if(length(homRef)==0) homRef <- 0
-    if(length(homAlt)==0) homAlt <- 0
-  #  if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i])
-    if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i])
-  }
-  
-  for(i in 1:length(candidateLociAlt)){
-    tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]]
-    homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
-    homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
-    if(length(homRef)==0) homRef <- 0
-    if(length(homAlt)==0) homAlt <- 0
-    if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i])
-  #  if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i])
+if(is.element(colnames(sampleInfo), "casecontrol")){
+  if(ncol(sampleInfo)>1){
+    mmCase <- 2
+    mmControl <- 5
+    vcf.cases <- vcf$genotypes[sampleInfo$casecontrol=="case",]
+    vcf.control <- vcf$genotypes[sampleInfo$casecontrol=="control",]
+    
+    vcf.cases.table <- apply(vcf.cases,2,table)
+    vcf.control.table <- apply(vcf.control,2,table)
+    
+    nCases <- nrow(vcf.cases)
+    nControl <- nrow(vcf.control)
+    
+    candidateLociRef <- c()
+    candidateLociAlt <- c()
+    for(i in 1:length(vcf.cases.table)){
+      tmp <- vcf.cases.table[[i]]
+      homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
+      homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
+      if(length(homRef)==0) homRef <- 0
+      if(length(homAlt)==0) homAlt <- 0
+      if(homRef>=(nCases - mmCase)) candidateLociRef <- c(candidateLociRef, names(vcf.cases.table)[i])
+      if(homAlt>=(nCases - mmCase)) candidateLociAlt <- c(candidateLociAlt, names(vcf.cases.table)[i])
+    }
+    
+    causativeRef <- c()
+    causativeAlt <- c()
+    for(i in 1:length(candidateLociRef)){
+      tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociRef[i])]]
+      homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
+      homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
+      if(length(homRef)==0) homRef <- 0
+      if(length(homAlt)==0) homAlt <- 0
+    #  if(homRef>=(nControl - 1)) causativeRef <- c(causativeRef, names(vcf.control.table)[i])
+      if(homAlt>=(nControl - mmControl)) causativeRef <- c(causativeRef, names(vcf.control.table)[i])
+    }
+    
+    for(i in 1:length(candidateLociAlt)){
+      tmp <- vcf.control.table[[which(names(vcf.control.table)==candidateLociAlt[i])]]
+      homRef <- tmp[names(tmp)=="00"] + tmp[names(tmp)=="03"]
+      homAlt <- tmp[names(tmp)=="02"] + tmp[names(tmp)=="03"]
+      if(length(homRef)==0) homRef <- 0
+      if(length(homAlt)==0) homAlt <- 0
+      if(homRef>=(nControl - mmControl)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i])
+    #  if(homAlt>=(nControl - 1)) causativeAlt <- c(causativeAlt, names(vcf.control.table)[i])
+    }
   }
 }
-
 ```
 
 Then we identify the following potentially causative mutations:

From 37be033b40746ff03c4522b00448e800275208b6 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Thu, 4 Mar 2021 08:30:03 +0200
Subject: [PATCH 23/34] Bugfix - Final report crashed, in case that no
 casecontrol is given in samplelist

---
 CHANGELOG                   |  1 +
 GBS-pipeline.smk            |  4 ++--
 scripts/workflow-report.Rmd | 22 ++++++++++++----------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 74c8571..9e9330f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist
 0.5.8 : * Final mock reference was not declared in output
 0.5.7 : * Dependencies between the rules improved
 0.5.6 : * Bugfix - wrong path for reference based vcf fixed
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index cc6c6e5..eb5595c 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -9,8 +9,8 @@ import os
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline
-##### Version: 0.5.8
-version = "0.5.8"
+##### Version: 0.5.9
+version = "0.5.9"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 92d7391..5dddc5d 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -1507,16 +1507,18 @@ if(is.element(colnames(sampleInfo), "casecontrol")){
 Then we identify the following potentially causative mutations:
 
 ```{r}
-if(ncol(sampleInfo)>1){
-  if(length(causativeRef)>0){
-    datatable(as.data.frame(causativeRef))
-  } else {
-    cat("No potentially mutations found for homozygeous reference!\n")
-  }
-  if(length(causativeAlt)>0){
-    datatable(as.data.frame(causativeAlt))
-  } else {
-    cat("No potentially mutations found for homozygeous alternative!\n")
+if(is.element(colnames(sampleInfo), "casecontrol")){
+  if(ncol(sampleInfo)>1){
+    if(length(causativeRef)>0){
+      datatable(as.data.frame(causativeRef))
+    } else {
+      cat("No potentially mutations found for homozygeous reference!\n")
+    }
+    if(length(causativeAlt)>0){
+      datatable(as.data.frame(causativeAlt))
+    } else {
+      cat("No potentially mutations found for homozygeous alternative!\n")
+    }
   }
 }
 ```

From d61a768dcc19b49eaf485ede604f6dc03d441a69 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Sun, 7 Mar 2021 11:14:40 +0200
Subject: [PATCH 24/34] OUtput html formatting fixed

---
 scripts/workflow-report.Rmd | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 5dddc5d..73c84fe 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -679,6 +679,7 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"),
                y=file.path(projFolder,"QC","TRIMMED","multiqc_R1"),
                feature="fastqc_per_base_sequence_quality_plot")
 ```
+
 # Preliminary Mock Reference
 
 ## Basic stats

From 300e72f3a958377d93c869980bd59e2b79121117 Mon Sep 17 00:00:00 2001
From: Daniel Fischer <fischerd@r07c50.bullx>
Date: Sun, 7 Mar 2021 11:30:16 +0200
Subject: [PATCH 25/34] Figure added

---
 scripts/workflow-report.Rmd | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index c04af5e..8a16078 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -857,6 +857,18 @@ Basic information for the final mock reference:
   out_html <- knitr::kable(out, col.names = NULL, "html")
   kable_styling(out_html, "striped", position = "left")
 ```
+## Theoretical coverage of Mock per sample
+Given the length of the final mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the final mock for each sample
+
+```{r final mock coverage}
+par(mar=c(10,5,1,1))
+finalMockLength <- sum(nchar(finalMockClusters))
+mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/finalMockLength
+names(mockCov) <- mqcStats.trim[,1]
+barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2)
+lines(c(-100,10000),c(1,1), lty="dotted")
+lines(c(-100,10000),c(10,10), lty="dotted")
+```
 
 
 # Existing Reference
@@ -1160,7 +1172,7 @@ for(i in 3:ncol(clusterCoverage)){
 ### Cluster coverage Mock samples vs others
 ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES
 
-## Refind Mock reference
+## Refined Mock reference
 ADD THIS STILL TO THE REPORT!!!
 
 ## Reference genome

From 08620a3f115b4d169cedf0c08d3a05250cce89d9 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Sun, 7 Mar 2021 21:49:51 +0200
Subject: [PATCH 26/34] ref added

---
 GBS-pipeline.smk         | 3 ++-
 GBS-pipeline_config.yaml | 4 ----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index eb5595c..9c0d2f0 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -8,7 +8,8 @@ import os
 ##### GBS-snakemake pipeline #####
 ##### Daniel Fischer (daniel.fischer@luke.fi)
 ##### Natural Resources Institute Finland (Luke)
-##### This pipeline is build upon the the GBS-SNP-CROP pipeline
+##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
+##### https://github.com/halelab/GBS-SNP-CROP
 ##### Version: 0.5.9
 version = "0.5.9"
 
diff --git a/GBS-pipeline_config.yaml b/GBS-pipeline_config.yaml
index f971037..6fc01ff 100644
--- a/GBS-pipeline_config.yaml
+++ b/GBS-pipeline_config.yaml
@@ -6,15 +6,11 @@
 project-folder: "/scratch/project_2001746/Example"                                                                  # Absolute path to the project root folder
 pipeline-folder: "/scratch/project_2001746/Pipeline-GBS"                                                            # Absolute path to the pipeline folder
 pipeline-config: "/scratch/project_2001746/Pipeline-GBS/GBS-pipeline_config-example.yaml"                           # Absolute path to the pipeline configuration file
-#report-script: "/scratch/project_2001746/Pipeline-GBS/scripts/workflow-report.Rmd"                                  # Absolute path to the report scripts (THIS OPTION SHOULD BE OBSOLETE)
 barcodes: "/scratch/project_2001746/Example/barcodesID.txt"                                                         # Absolute path to the barcodes file (MAKE THIS OPTIONAL)
 rawsamples: "/scratch/project_2001746/Example/rawsamples"                                                           # Absolute path to the raw samples file (MAKE THIS OPTIONAL)
 genome: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa"                       # Absolute path to an existing reference genome, leave empty "" if not available
-#genome-bwa-index: "/scratch/project_2001746/Example/references/Vulpes_vulpes.VulVul2.2.dna.toplevel.fa.bwt"         # Absolute path to the bwa index (THIS OPTION SHOULD BE OBSOLETE)
-#genome-star-index:  "/scratch/project_2001746/Example/references/STAR2.7.3a"                                        # Absolute path to the star index (THIS OPTION SHOULD BE OBSOLETE)
 refsample: ""                                                                                                       # Id of ref sample used to build mock reference (NOT SURE, IF USED, CHECK THIS)
 local-scratch: $LOCAL_SCRATCH                                                                                       # Path for fast local storage for tmp files (Here, a system variable is used. Check with your server configs)
-#adapter: "/scratch/project_2001746/Pipeline-GBS/adapter.fa"                                                         # Path for the adapter sequences, used by Trimmomatic (MAKE THIS OPTIONAL)
 tmpdir: "/scratch/project_2001746/tmp"
 
 # Parameters for the GBS-SNP-CROP tools

From 4dcb7000064388b9ae936f25125de8567f711d54 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Mon, 8 Mar 2021 14:59:24 +0200
Subject: [PATCH 27/34] Mapping stats for final mock vs reference genome added

---
 CHANGELOG                   |  1 +
 GBS-pipeline.smk            |  4 +-
 rules/Module3-MockReference | 98 +++++++++++++++++++++++++++++++++++++
 rules/Module7-Reporting     |  1 +
 4 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 9e9330f..777f942 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.10: * Mapping stats for final mock vs reference genome added
 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist
 0.5.8 : * Final mock reference was not declared in output
 0.5.7 : * Dependencies between the rules improved
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 9c0d2f0..b662b7c 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.9
-version = "0.5.9"
+##### Version: 0.5.10
+version = "0.5.10"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference
index 544a92f..134c892 100644
--- a/rules/Module3-MockReference
+++ b/rules/Module3-MockReference
@@ -213,3 +213,101 @@ rule refine_mock_reference:
              snakemake <- TRUE;\
              source('{input.script}')" &> {log}
     """
+
+if config["genome"] == "":
+    print(f"No reference genome provided, skipping the star indexing step for reference genome")
+else:
+    rule FinalMockVsRef:
+        """
+        Map the mock reference clusters to the genome (minimap2).
+        """
+        input:
+            refgenome=config["genome"],
+            mockgenome="%s/MockReference/MockReference.fa" % (config["project-folder"])
+        output:
+            "%s/SAM/finalMockToRef.sam" % (config["project-folder"])
+        singularity: config["singularity"]["minimap2"]
+        shell:"""
+           minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log}
+
+        """
+
+if config["genome"] == "":
+    print(f"No reference genome provided, skipping the star indexing step for reference genome")
+else:
+    rule FinalMockVsRef_samtools_SamToSortedBam:
+        """
+        Index the Mock Reference Genome.
+        """
+        input:
+            ref=config["genome"],
+            sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"])
+        output:
+            bam="%s/BAM/FinalMockref/mockToRef.bam" % (config["project-folder"]),
+            sorted="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"])
+        log:
+            "%s/logs/Samtools/FinalMockVsRef_samtools_SamToSortedBam.log" % (config["project-folder"])
+        benchmark:
+            "%s/benchmark/Samtools/FinalMockVsRef_samtools_SamToSortedBam.benchmark.tsv" % (config["project-folder"])
+        singularity: config["singularity"]["gbs"]
+        shell:"""
+            df -h &> {log}
+            samtools view -b {input.sam} > {output.bam}
+            samtools sort {output.bam} -o {output.sorted}
+            samtools index {output.sorted}
+      	"""
+
+
+if config["genome"] == "":
+    print(f"No reference genome provided, skipping the star indexing step for reference genome")
+else:  	
+    rule FinalMockVsRef_AlignmentStats:
+        """
+        Get the mapping stats for the mock vs. reference.
+        """
+        input:
+            ref=config["genome"],
+            sam="%s/SAM/finalMockToRef.sam" % (config["project-folder"]),
+            sbam="%s/BAM/FinalMockref/mockToRef.sorted.bam" % (config["project-folder"])
+        output:
+            fs="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]),
+            samflags="%s/BAM/FinalMockref/mockToRef.sam.samflags" % (config["project-folder"]), 
+            stats="%s/BAM/FinalMockref/mockToRef.sam.stats" % (config["project-folder"]),
+            c="%s/BAM/FinalMockref/mockToRef.coverage" % (config["project-folder"]),
+            bed="%s/BAM/FinalMockref/mockToRef.bed" % (config["project-folder"]),
+            bedMerged="%s/BAM/FinalMockref/mockToRef.merged.bed" % (config["project-folder"]),
+            leftBed="%s/BAM/FinalMockref/mockToRef.merged.left.bed" % (config["project-folder"]),
+            rightBed="%s/BAM/FinalMockref/mockToRef.merged.right.bed" % (config["project-folder"]),
+            leftFACounts="%s/SAM/finalMockToRef.left.fa.counts" % (config["project-folder"]),
+            rightFACounts="%s/SAM/finalMockToRef.right.fa.counts" % (config["project-folder"]),
+            leftFA="%s/BAM/FinalMockref/mockToRef.merged.left.fa" % (config["project-folder"]),
+            rightFA="%s/BAM/FinalMockref/mockToRef.merged.right.fa" % (config["project-folder"]),
+            FA="%s/BAM/FinalMockref/mockToRef.merged.combined.fa" % (config["project-folder"]),
+            FACounts="%s/BAM/FinalMockref/mockToRef.merged.combined.fa.counts" % (config["project-folder"])
+        log:
+            "%s/logs/FinalMockVsRef_AlignmentStats.log" % (config["project-folder"])
+        benchmark:
+            "%s/benchmark/MockVsRef_AlignmentStats.benchmark.tsv" % (config["project-folder"])
+        threads: lambda cores: cpu_count()
+        conda:"envs/step3.yaml"
+        singularity: config["singularity"]["gbs"]
+        shell:"""
+            df -h &> {log}
+            echo "Number of threads used:" {threads}
+            sed '/^@/d' {input.sam} | cut -f2 | sort | uniq -c > {output.samflags}
+            samtools flagstat {input.sam} > {output.fs}
+            samtools stats {input.sam} > {output.stats}
+            samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c}
+            bedtools genomecov -ibam {input.sbam} -bg > {output.bed}
+            bedtools merge -i {output.bed} > {output.bedMerged}
+            awk '{{print $1 "\\t" (($2 - 6)<0?0:($2 - 6)) "\\t" $2}}' {output.bedMerged} > {output.leftBed}
+            awk '{{print $1 "\\t" $3 "\\t" $3 + 6}}' {output.bedMerged} > {output.rightBed}
+            bedtools getfasta -fi {input.ref} -bed {output.leftBed} > {output.leftFA}
+            bedtools getfasta -fi {input.ref} -bed {output.rightBed} > {output.rightFA}
+            
+            sed '/^>/d' {output.leftFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.leftFACounts}
+            sed '/^>/d' {output.rightFA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.rightFACounts}
+
+            paste -d '+' {output.leftFA} {output.rightFA} > {output.FA}
+            sed '/^>/d' {output.FA} | sort | awk '{{ print toupper($0) }}' | uniq -c > {output.FACounts}
+      	"""
diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting
index 7c665de..c0a2fdb 100644
--- a/rules/Module7-Reporting
+++ b/rules/Module7-Reporting
@@ -19,6 +19,7 @@ rule R_finalReport:
         qdConc=expand("%s/QC/CONCATENATED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
         qdTrimmed=expand("%s/QC/TRIMMED/{samples}_R1_qualdist.txt" % (config["project-folder"]), samples=samples),
         fsFinal=expand("%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
+        fsFinalVsRef="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]),
         finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
         fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
         cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples)

From f9dea0d7b6f6547fc51b40b6d6b1df093d28fe63 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Mon, 8 Mar 2021 15:08:44 +0200
Subject: [PATCH 28/34] Inpur rule all cleaned up a bit to account for
 temporary output

---
 CHANGELOG        |  1 +
 GBS-pipeline.smk | 12 ++----------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 777f942..f7a6f3d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.11: * Inpur rule all cleaned up a bit to account for temporary output
 0.5.10: * Mapping stats for final mock vs reference genome added
 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist
 0.5.8 : * Final mock reference was not declared in output
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index b662b7c..848d822 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.10
-version = "0.5.10"
+##### Version: 0.5.11
+version = "0.5.11"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
@@ -81,18 +81,10 @@ print("#########################################################################
 
 rule all:
     input:
-      # OUTPUT: PREPARATION MODULE
-        expand("%s/FASTQ/CONCATENATED/{samples}_R1_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/CONCATENATED/{samples}_R2_001.merged.fastq.gz" % (config["project-folder"]), samples=samples),
       # QC OF RAW AND CONCATENATED FILES
         "%s/QC/RAW/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/CONCATENATED/multiqc_R1/" % (config["project-folder"]),
         "%s/QC/TRIMMED/multiqc_R1/" % (config["project-folder"]),
-      # OUTPUT STEP 2
-        expand("%s/FASTQ/TRIMMED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
-        expand("%s/FASTQ/TRIMMED/{samples}.R2.fq.gz" % (config["project-folder"]), samples=samples),
-      # OUTPUT STEP 2b
-        expand("%s/FASTQ/SUBSTITUTED/{samples}.R1.fq.gz" % (config["project-folder"]), samples=samples),
       # OUTPUT STEP 4
         "%s/FASTQ/TRIMMED/GSC.MR.Genome.fa" % (config["project-folder"]),
         "%s/BAM/Mockref/mockToRef.sam.flagstat" % (config["project-folder"]),

From 959e2263172306c60d727af1f209a93f36a50802 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Mon, 8 Mar 2021 16:17:37 +0200
Subject: [PATCH 29/34] Coverage for finalMock alignments are now also
 calculated

---
 CHANGELOG                       |  3 ++-
 GBS-pipeline.smk                |  4 ++--
 GBS-pipeline_server-config.yaml | 13 +++++++++++++
 rules/Module3-MockReference     |  5 +++++
 rules/Module4-ReadAlignment     |  9 ++++++---
 rules/Module7-Reporting         |  3 ++-
 6 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index f7a6f3d..cf8224c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,7 +4,8 @@
 #####                                                                      #####
 ################################################################################
 
-0.5.11: * Inpur rule all cleaned up a bit to account for temporary output
+0.5.12: * Coverage for finalMock alignments are now also calculated
+0.5.11: * Input rule all cleaned up a bit to account for temporary output
 0.5.10: * Mapping stats for final mock vs reference genome added
 0.5.9 : * Bugfix - Final report crashed, in case that no casecontrol is given in samplelist
 0.5.8 : * Final mock reference was not declared in output
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 848d822..3d23f2a 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.11
-version = "0.5.11"
+##### Version: 0.5.12
+version = "0.5.12"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/GBS-pipeline_server-config.yaml b/GBS-pipeline_server-config.yaml
index 4701d55..aaffa1c 100644
--- a/GBS-pipeline_server-config.yaml
+++ b/GBS-pipeline_server-config.yaml
@@ -140,6 +140,19 @@ MockVsRef_AlignmentStats:
     time: 05:00:00
     job-name: MockVsRef_AlignmentStats
 
+FinalMockVsRef_alignment:
+    time: 05:00:00
+    cpus-per-task: 20
+    job-name:  FinalMockVsRef_alignment
+
+FinalMockVsRef_samtools_SamToSortedBam:
+    time: 05:00:00
+    job-name:  FinalMockVsRef_samtools_SamToSortedBam
+
+FinalMockVsRef_AlignmentStats:
+    time: 05:00:00
+    job-name: FinalMockVsRef_AlignmentStats
+
 MockVsRef_SortedBamToMpileup:
     time: 05:00:00
     job-name: MockVsRef_SortedBamToMpileup
diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference
index 134c892..f5c25a4 100644
--- a/rules/Module3-MockReference
+++ b/rules/Module3-MockReference
@@ -226,6 +226,11 @@ else:
             mockgenome="%s/MockReference/MockReference.fa" % (config["project-folder"])
         output:
             "%s/SAM/finalMockToRef.sam" % (config["project-folder"])
+        log:
+            "%s/logs/MINIMAP2/mm2_map_finalMockVSref.log" % (config["project-folder"])
+        benchmark:
+            "%s/benchmark/MINIMAP2/mm2_finalMockVSref.benchmark.tsv" % (config["project-folder"])
+        threads: 20
         singularity: config["singularity"]["minimap2"]
         shell:"""
            minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log}
diff --git a/rules/Module4-ReadAlignment b/rules/Module4-ReadAlignment
index 68c52a9..1649c3b 100644
--- a/rules/Module4-ReadAlignment
+++ b/rules/Module4-ReadAlignment
@@ -363,9 +363,11 @@ rule finalMockRef_AlignmentStats:
     Alignment stats for reads mapped to final mock ref
     """
     input:
-        "%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"])
+        sam="%s/SAM/alignments_finalMock/{samples}.sam" % (config["project-folder"]),
+        sbam="%s/BAM/alignments_finalMock/{samples}.sorted.bam" % (config["project-folder"])
     output:
-        "%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"])
+        fs="%s/BAM/alignments_finalMock/{samples}.sam.flagstat" % (config["project-folder"]),
+        c="%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"])
     log:
         "%s/logs/Samtools/FinalMockRef_AlignmentStats.{samples}.log" % (config["project-folder"])
     benchmark:
@@ -373,5 +375,6 @@ rule finalMockRef_AlignmentStats:
     conda:"envs/samtools.yaml"
     singularity: config["singularity"]["gbs"]
     shell:"""
-        samtools flagstat {input} > {output}
+        samtools flagstat {input.sam} > {output.fs}
+        samtools idxstats {input.sbam} | awk '{{print $1\" \"$3}}' > {output.c}
   	"""
diff --git a/rules/Module7-Reporting b/rules/Module7-Reporting
index c0a2fdb..63f36a3 100644
--- a/rules/Module7-Reporting
+++ b/rules/Module7-Reporting
@@ -22,7 +22,8 @@ rule R_finalReport:
         fsFinalVsRef="%s/BAM/FinalMockref/mockToRef.sam.flagstat" % (config["project-folder"]),
         finalSetVariants="%s/VCF/FinalSetVariants_finalMock.vcf" % (config["project-folder"]),
         fsRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.sam.flagstat" % (config["project-folder"]), samples=samples),
-        cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples)
+        cRef=expand("%s/FASTQ/TRIMMED/alignments_reference/{samples}.coverage" % (config["project-folder"]), samples=samples),
+        cFinalMock=expand("%s/BAM/alignments_finalMock/{samples}.coverage" % (config["project-folder"]), samples=samples)
     output:
         "%s/finalReport.html" % (config["project-folder"])
     log:

From 81ca94985441a1a5637adaf41ba798cdeaadcd97 Mon Sep 17 00:00:00 2001
From: Daniel Fischer <fischerd@r07c51.bullx>
Date: Mon, 8 Mar 2021 16:27:59 +0200
Subject: [PATCH 30/34] Statistics on final mock reference added

---
 CHANGELOG                   |   1 +
 GBS-pipeline.smk            |   4 +-
 scripts/workflow-report.Rmd | 179 +++++++++++++++++++++++++++++++++---
 3 files changed, 168 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index cf8224c..f831c20 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.13: * Statistics on final mock reference added
 0.5.12: * Coverage for finalMock alignments are now also calculated
 0.5.11: * Input rule all cleaned up a bit to account for temporary output
 0.5.10: * Mapping stats for final mock vs reference genome added
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 3d23f2a..764c511 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.12
-version = "0.5.12"
+##### Version: 0.5.13
+version = "0.5.13"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index b58b4f4..16f9ee0 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -44,10 +44,10 @@ knitr::opts_chunk$set(echo = FALSE,
 mockClusters.file <- "GSC.MR.Clusters.fa"
 mockReference.file <- "GSC.MR.Genome.fa"
 if(!is.element("snakemake",ls())){
-  projFolder <- "/scratch/project_2001746/BSF"
+  projFolder <- "/scratch/project_2001746/Whitefish_benchMS"
   pipelineFolder <- "/scratch/project_2001746/Pipeline-GBS/"
   pipelineConfig <- "/scratch/project_2001746/BSF/Pipeline-GBS/GBS-pipeline_config-BSF.yaml"
-  refGenome.file <- "hermetiaRef_112020.fasta"
+  refGenome.file <- "GCA_902175075.1_AWG_v1_genomic.fna"
 }
 refGenome.file <- basename(refGenome.file)
 ifelse(refGenome.file == "", refAvail <- FALSE, refAvail <- TRUE)
@@ -337,8 +337,9 @@ mqcStats.raw <- read.table(file.path(projFolder,"QC","RAW","multiqc_R1", "multiq
 tmp <- mqcStats.raw[,-c(1:4,7,11:21)]
 
 totalRawSequences <- sum(mqcStats.raw$Total.Sequences)
-out <- c(totalRawSequences,apply(tmp,2,mean))
+out <- c(totalRawSequences,sd(tmp[,1]),apply(tmp,2,mean))
 names(out) <- c("Tot. number sequences",
+                "SD total sequences",
                 "Avg. total sequences",
                 "Avg. poor quality",
                 "Avg. GC Percent",
@@ -682,10 +683,13 @@ plotMQCFeature(x=file.path(projFolder,"QC","CONCATENATED","multiqc_R1"),
 
 # Preliminary Mock Reference
 
+## To Add
+ * Get statistics on multi-mapping of the reads.
+
 ## Basic stats
 ```{r import required files, warning=FALSE, }
-mockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file))
-mockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file))
+prelimMockReference <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockReference.file))
+prelimMockClusters <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", mockClusters.file))
 vsearchIN <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchIN.fa"))
 #vsearchOUT <- importFA(file.path(projFolder,"FASTQ", "TRIMMED", "VsearchOUT.fa"))
 ```
@@ -701,7 +705,7 @@ Basic information on the sequences used to build the mock reference:
 Basic information on the identified clusters, used as mock reference
 
 ```{r summarise mock reference clusters}
-  out <- summary(mockClusters)
+  out <- summary(prelimMockClusters)
   out[1,1] <- c("No. of Clusters")
   out[,2] <- formatC(out[,2],  format="d", big.mark=",")
   out_html <- knitr::kable(out, col.names = NULL, "html")
@@ -710,9 +714,9 @@ Basic information on the identified clusters, used as mock reference
 
 Total length of the input fasta to build reference: `r format(sum(nchar(vsearchIN)), big.mark=",")` bp.
 
-Total length of the Mock Reference Clusters: `r format(sum(nchar(mockClusters)), big.mark=",")` bp.
+Total length of the Mock Reference Clusters: `r format(sum(nchar(prelimMockClusters)), big.mark=",")` bp.
 
-Total length of the Mock Reference (Used as reference): `r format(sum(nchar(mockReference)), big.mark=",")` bp.
+Total length of the Mock Reference (Used as reference): `r format(sum(nchar(prelimMockReference)), big.mark=",")` bp.
 
 ## Theoretical coverage of Mock per sample
 Given the length of the mock reference assembly and the total number of sequenced bases per sample, we can estimate the theoretical coverage of the mock for each sample
@@ -721,7 +725,7 @@ Currently I hard-code here the paired-end reads and assume equal lengths and rea
 
 ```{r mock coverage}
 par(mar=c(10,5,1,1))
-mockLength <- sum(nchar(mockReference))
+mockLength <- sum(nchar(prelimMockReference))
 mockCov <- 2*mqcStats.trim[,10] * mqcStats.trim[,5]/mockLength
 names(mockCov) <- mqcStats.trim[,1]
 barplot(mockCov, ylab="'x'-fold Coverage", col=viridis(20)[8], las=2)
@@ -830,7 +834,7 @@ out_html <- knitr::kable(out, col.names = NULL, "html")
 kable_styling(out_html, "striped", position = "left")
 ```
 
-Quality values for the first 100.000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality.
+Quality values for the first 100,000 sequences. As a rule of thumb, capital letters are good, special characters are bad quality.
 ```{r quality measues}
 outQual <- table(unlist(strsplit(discardedReads$qual[1:100000],"")))
 out_html <- knitr::kable(formatC(outQual,  format="d", big.mark=","), "html")
@@ -1027,7 +1031,8 @@ WE COULD COUNT HERE STILL HOW MANY SEQUENCES MATCH THE GIVEN ONE WITH ONE SUBSTI
 
 # Alignment
 
-## Mock Reference
+## Preliminary Mock Reference
+### Mapping stats
  
 Import the mapping statistics to the Mock reference
 
@@ -1173,11 +1178,156 @@ for(i in 3:ncol(clusterCoverage)){
 ### Cluster coverage Mock samples vs others
 ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE REFERENCE VS ALL OTHER SAMPLES
 
-## Refined Mock reference
-ADD THIS STILL TO THE REPORT!!!
+## Mock Reference
+### Mapping stats
 
-## Reference genome
+Import the mapping statistics to the Mock reference
+
+```{r import flagstats}
+flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat")
+flagstats <- list()
+for(i in 1:length(flagstatFiles)){
+  flagstats[[i]] <- readLines(file.path(projFolder,"BAM", "alignments_finalMock",flagstatFiles[i]))
+}
+```
+
+Visualization of the alignments, red stars indicate the mock reference samples
+
+```{r vis mapping stats}
+par(oma=c(6,3,0,0))
+mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2)
+
+sampleNames <- gsub(".sam.flagstat", "", flagstatFiles)
 
+colnames(mapStats) <- sampleNames
+
+tmp <- as.numeric(sapply(strsplit(sapply(flagstats,"[",1), " +"),"[",1))
+mapStats[1,] <- as.numeric(sapply(strsplit(sapply(flagstats,"[",5), " +"),"[",1))
+mapStats[2,] <- tmp - mapStats[1,]
+
+p <- barplot(mapStats, col=c(viridis(20)[8], viridis(20)[16]), las=2)
+
+legend("topleft", pch=c(20,20), col=c(viridis(20)[16], viridis(20)[8]), legend=c("Unmapped", "Mapped"), fill="white")
+
+# highlight the mock reference samples
+mockFiles <- paste(mockSamples, ".sam.flagstat", sep="")
+mockPos <- which(is.element(flagstatFiles, mockFiles))
+
+points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4)
+```
+
+```{r mapping percentage}
+barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2)
+```
+
+
+### Coverage
+
+Data mapped against the clusters and then the reads per cluster visualized
+```{r importClusterCoverage, warning=FALSE}
+clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage")
+clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1]))
+names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1])
+
+for(i in 2:length(clusterFiles)){
+  tmp <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[i]))
+  names(tmp)[1:2] <- c("cluster", clusterFiles[i])
+  clusterCoverage <- merge(clusterCoverage, tmp, by="cluster")
+}
+names(clusterCoverage)[2:(length(clusterFiles)+1)] <- clusterFiles
+clusterCoverage[,1] <- as.numeric(gsub("Cluster", "", clusterCoverage[,1]))
+clusterCoverage <- clusterCoverage[order(clusterCoverage[,1]),]
+clusterCoverage <- clusterCoverage[is.na(clusterCoverage[,1])==FALSE,]
+
+clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000
+```
+
+### Lorenz curve
+Concentration measure of reads against clusters
+
+```{r lorenz}
+plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration")
+for(i in 3:ncol(clusterCoverage)){
+  lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i]))))
+}
+```
+
+
+### Stats on coverage
+
+These are the amounts of clusters with different samples. 
+
+```{r cluster coverage stats}
+coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum)
+barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8])
+```
+
+Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group.
+
+```{r reads per coverage group}
+readsPerCoverageGroup <- c()
+
+for(i in 1:max(coveredClusters)){
+ readsPerCoverageGroup[i] <-  sum(clusterCoverage[coveredClusters==i,-1])
+}
+
+names(readsPerCoverageGroup) <- 1:max(coveredClusters)
+
+barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster group", col=viridis(20)[8])
+```
+
+
+```{r reads per coverage group in percent}
+barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8])
+```
+
+Then still the number of clusters without coverage per sample
+
+```{r clusters without coverage}
+nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum)
+names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters))
+barplot(nonHittedClusters, col=viridis(20)[8], las=2)
+lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted")
+```
+
+And then this still as percentage
+
+```{r clusterhits percentage}
+barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8])
+lines(c(0,10000),c(0.2,0.2), lty="dotted")
+lines(c(0,10000),c(0.4,0.4), lty="dotted")
+lines(c(0,10000),c(0.5,0.5), lty="dashed")
+lines(c(0,10000),c(0.6,0.6), lty="dotted")
+lines(c(0,10000),c(0.8,0.8), lty="dotted")
+```
+
+### Smoothed log-coverage per cluster
+
+```{r vizCusterCoverage}
+plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2))
+
+for(i in 3:ncol(clusterCoverage)){
+  lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,i]+1), all.knots=FALSE))  
+}
+```
+
+### Smoothed std-log-coverage per cluster
+
+Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5.
+
+```{r vizStdCusterCoverage}
+# I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot.
+plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4))
+
+for(i in 3:ncol(clusterCoverage)){
+  lines(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,i]+1), all.knots=FALSE))  
+}
+```
+
+
+
+## Reference genome
+### Basic stats
 Import the mapping statistics to the reference genome
 
 ```{r import ref flagstats}
@@ -1218,6 +1368,7 @@ points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4)
 barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2)
 ```
 
+
 # Variants
 Import the Result vcf
 

From d57a112daef60c19a3776359b5204675b05083d3 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Tue, 9 Mar 2021 15:36:41 +0200
Subject: [PATCH 31/34] Bugfix: Rule FinalMockVsRef_alignment didn't have
 resource allocations

---
 CHANGELOG                   | 1 +
 GBS-pipeline.smk            | 4 ++--
 rules/Module3-MockReference | 3 +--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index cf8224c..c49a41d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.13: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations
 0.5.12: * Coverage for finalMock alignments are now also calculated
 0.5.11: * Input rule all cleaned up a bit to account for temporary output
 0.5.10: * Mapping stats for final mock vs reference genome added
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 3d23f2a..764c511 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.12
-version = "0.5.12"
+##### Version: 0.5.13
+version = "0.5.13"
 
 ##### set minimum snakemake version #####
 min_version("5.24")
diff --git a/rules/Module3-MockReference b/rules/Module3-MockReference
index f5c25a4..b4a130e 100644
--- a/rules/Module3-MockReference
+++ b/rules/Module3-MockReference
@@ -217,7 +217,7 @@ rule refine_mock_reference:
 if config["genome"] == "":
     print(f"No reference genome provided, skipping the star indexing step for reference genome")
 else:
-    rule FinalMockVsRef:
+    rule FinalMockVsRef_alignment:
         """
         Map the mock reference clusters to the genome (minimap2).
         """
@@ -234,7 +234,6 @@ else:
         singularity: config["singularity"]["minimap2"]
         shell:"""
            minimap2 -ax sr {input.refgenome} {input.mockgenome} > {output} 2> {log}
-
         """
 
 if config["genome"] == "":

From 4f97226f70fb5273829d051173fce842be7572c5 Mon Sep 17 00:00:00 2001
From: fischuu <daniel.fischer@luke.fi>
Date: Wed, 10 Mar 2021 18:17:21 +0200
Subject: [PATCH 32/34] snakemake version updated and quick fix for freezing
 optimiser added

---
 GBS-pipeline.smk          | 2 +-
 runGBSPipeline-example.sh | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index 200e775..e17d0a1 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -14,7 +14,7 @@ import os
 version = "0.5.14"
 
 ##### set minimum snakemake version #####
-min_version("5.24")
+min_version("6.0")
 
 ##### Sample sheets #####
 
diff --git a/runGBSPipeline-example.sh b/runGBSPipeline-example.sh
index 2a2e3d0..4fe7304 100644
--- a/runGBSPipeline-example.sh
+++ b/runGBSPipeline-example.sh
@@ -51,4 +51,5 @@ snakemake -s /scratch/project_2001746/Pipeline-GBS/GBS-pipeline.smk \
           --latency-wait 60 \
           --cluster-config /scratch/project_2001746/Pipeline-GBS/GBS-pipeline_server-config.yaml \
           --cluster "sbatch -t {cluster.time} --account={cluster.account} --gres=nvme:{cluster.nvme} --job-name={cluster.job-name} --tasks-per-node={cluster.ntasks} --cpus-per-task={cluster.cpus-per-task} --mem-per-cpu={cluster.mem-per-cpu} -p {cluster.partition} -D {cluster.working-directory}" \
+          --scheduler greedy \
           $@

From cc214929a80b57413d4787052f20f52f2b3ba366 Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Thu, 11 Mar 2021 12:05:31 +0200
Subject: [PATCH 33/34] small typos fixed

---
 rules/Module5-CallVariants | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rules/Module5-CallVariants b/rules/Module5-CallVariants
index e01a63b..37015ac 100644
--- a/rules/Module5-CallVariants
+++ b/rules/Module5-CallVariants
@@ -76,7 +76,7 @@ rule create_verticalRef:
     threads: 1
     shell:"""
         sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp;
-        sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref};
+        sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref};
         #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co}
         ls {params.wd}/*count.txt > {output.co}
     """
@@ -473,7 +473,7 @@ rule create_verticalRef_finalMock:
     threads: 1
     shell:"""
         sort -u -m -k2n {input.ref} -o {params.wd}/VerticalRefPos.tmp;
-        sort -k2n {params.wd}/VerticalRefPos.tmp | uniq >{output.vref};
+        sort -k2n {params.wd}/VerticalRefPos.tmp | uniq > {output.vref};
         #ls {params.wd}/*count.txt | xargs -n1 basename > {output.co}
         ls {params.wd}/*count.txt > {output.co}
     """

From 02257d0087c2983ab3581203acdbc2d65917a44f Mon Sep 17 00:00:00 2001
From: fischuu <github@danielfischer.name>
Date: Thu, 11 Mar 2021 15:14:35 +0200
Subject: [PATCH 34/34] Report adjusted

---
 CHANGELOG                   |  1 +
 GBS-pipeline.smk            |  4 ++--
 scripts/workflow-report.Rmd | 24 ++++++++++++------------
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index bbb7f8f..84e2686 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@
 #####                                                                      #####
 ################################################################################
 
+0.5.15: * Report adjusted
 0.5.14: * Bugfix: Rule FinalMockVsRef_alignment didn't have resource allocations
 0.5.13: * Statistics on final mock reference added
 0.5.12: * Coverage for finalMock alignments are now also calculated
diff --git a/GBS-pipeline.smk b/GBS-pipeline.smk
index e17d0a1..d1154d6 100644
--- a/GBS-pipeline.smk
+++ b/GBS-pipeline.smk
@@ -10,8 +10,8 @@ import os
 ##### Natural Resources Institute Finland (Luke)
 ##### This pipeline is build upon the the GBS-SNP-CROP pipeline:
 ##### https://github.com/halelab/GBS-SNP-CROP
-##### Version: 0.5.14
-version = "0.5.14"
+##### Version: 0.5.15
+version = "0.5.15"
 
 ##### set minimum snakemake version #####
 min_version("6.0")
diff --git a/scripts/workflow-report.Rmd b/scripts/workflow-report.Rmd
index 16f9ee0..4200790 100644
--- a/scripts/workflow-report.Rmd
+++ b/scripts/workflow-report.Rmd
@@ -1183,7 +1183,7 @@ ADD HERE STILL A COMPARISON, HOW THE READS FROM THE MOCK DISTRIBUTE ACROSS THE R
 
 Import the mapping statistics to the Mock reference
 
-```{r import flagstats}
+```{r import final flagstats}
 flagstatFiles <- list.files(file.path(projFolder,"BAM", "alignments_finalMock"), pattern="*.flagstat")
 flagstats <- list()
 for(i in 1:length(flagstatFiles)){
@@ -1193,7 +1193,7 @@ for(i in 1:length(flagstatFiles)){
 
 Visualization of the alignments, red stars indicate the mock reference samples
 
-```{r vis mapping stats}
+```{r vis mapping final stats}
 par(oma=c(6,3,0,0))
 mapStats <- matrix(0,ncol=length(flagstatFiles), nrow=2)
 
@@ -1216,7 +1216,7 @@ mockPos <- which(is.element(flagstatFiles, mockFiles))
 points(p[mockPos], rep(200000, length(mockPos)), pch="*", col="red", cex=4)
 ```
 
-```{r mapping percentage}
+```{r mapping final percentage}
 barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Percent", col=viridis(20)[8], las=2)
 ```
 
@@ -1224,7 +1224,7 @@ barplot(mapStats[1,] / (apply(mapStats,2,sum)), ylim=c(0,1), ylab="Mapping in Pe
 ### Coverage
 
 Data mapped against the clusters and then the reads per cluster visualized
-```{r importClusterCoverage, warning=FALSE}
+```{r importFinalClusterCoverage, warning=FALSE}
 clusterFiles <- list.files(file.path(projFolder, "BAM", "alignments_finalMock"), pattern="*.coverage")
 clusterCoverage <- read.table(file.path(projFolder, "BAM", "alignments_finalMock", clusterFiles[1]))
 names(clusterCoverage)[1:2] <- c("cluster", clusterFiles[1])
@@ -1245,7 +1245,7 @@ clusterCoverage.std <- t(t(clusterCoverage)/apply(clusterCoverage,2,sum))*100000
 ### Lorenz curve
 Concentration measure of reads against clusters
 
-```{r lorenz}
+```{r final lorenz}
 plot(cumsum(sort(clusterCoverage[,2] / sum(clusterCoverage[,2]))), type="l", xlab="Cluster", ylab="Concentration")
 for(i in 3:ncol(clusterCoverage)){
   lines(cumsum(sort(clusterCoverage[,i] / sum(clusterCoverage[,i]))))
@@ -1257,14 +1257,14 @@ for(i in 3:ncol(clusterCoverage)){
 
 These are the amounts of clusters with different samples. 
 
-```{r cluster coverage stats}
+```{r final  cluster coverage stats}
 coveredClusters <- apply(clusterCoverage[,-1]>0,1,sum)
 barplot(table(coveredClusters), xlab="No. of different samples aligned to cluster", col=viridis(20)[8])
 ```
 
 Now we have here the number of reads per coverage class. That means, instead of having it binary as in the previous plot, we now count all the reads per coverage group.
 
-```{r reads per coverage group}
+```{r final reads per coverage group}
 readsPerCoverageGroup <- c()
 
 for(i in 1:max(coveredClusters)){
@@ -1277,13 +1277,13 @@ barplot(readsPerCoverageGroup, xlab="Coverage group", ylab="Reads on cluster gro
 ```
 
 
-```{r reads per coverage group in percent}
+```{r final reads per coverage group in percent}
 barplot(readsPerCoverageGroup/sum(readsPerCoverageGroup)*100, xlab="Coverage group", ylab="Reads on cluster group (in %)", ylim=c(0,100), col=viridis(20)[8])
 ```
 
 Then still the number of clusters without coverage per sample
 
-```{r clusters without coverage}
+```{r final clusters without coverage}
 nonHittedClusters <- apply(clusterCoverage[,-1]==0,2,sum)
 names(nonHittedClusters) <- gsub(".coverage", "", names(nonHittedClusters))
 barplot(nonHittedClusters, col=viridis(20)[8], las=2)
@@ -1292,7 +1292,7 @@ lines(c(0,100000), c(nrow(clusterCoverage), nrow(clusterCoverage)), lty="dotted"
 
 And then this still as percentage
 
-```{r clusterhits percentage}
+```{r final clusterhits percentage}
 barplot(nonHittedClusters/nrow(clusterCoverage), ylim=c(0,1), las=2, col=viridis(20)[8])
 lines(c(0,10000),c(0.2,0.2), lty="dotted")
 lines(c(0,10000),c(0.4,0.4), lty="dotted")
@@ -1303,7 +1303,7 @@ lines(c(0,10000),c(0.8,0.8), lty="dotted")
 
 ### Smoothed log-coverage per cluster
 
-```{r vizCusterCoverage}
+```{r final vizCusterCoverage}
 plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="Log-coverage", ylim=c(0, max(log(clusterCoverage))/2))
 
 for(i in 3:ncol(clusterCoverage)){
@@ -1315,7 +1315,7 @@ for(i in 3:ncol(clusterCoverage)){
 
 Now the coverages are divided by the total amount of reads per sample and then multiplied by 10^5.
 
-```{r vizStdCusterCoverage}
+```{r final vizStdCusterCoverage}
 # I use here the first column of the other matrix to keep the same coordinates. It does not affect the plot.
 plot(smooth.spline(clusterCoverage[,1], log(clusterCoverage.std[,2]+1), all.knots=FALSE), type="l", xlab="Cluster", ylab="", ylim=c(0, max(log(clusterCoverage.std))/4))