From 455445b88f90d298fe34b3201a9182f6ad93d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 07:56:08 -0800
Subject: [PATCH 01/11] new improved master script

---
 README.md                        |  22 ++
 R_scripts/combining_umerged.R    |  61 +++++
 bash_scripts/master_samsa2.slurm | 431 +++++++++++++++++++++++++++++++
 3 files changed, 514 insertions(+)
 create mode 100755 R_scripts/combining_umerged.R
 create mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/README.md b/README.md
index 2ae3d5f..189ac85 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,27 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
+### modification by sebastien.renaut@gmail.com
+
+### New in my version. 
+There is a new master script (master_samsa2.slurm) with several modifications:
+
+* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
+* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint.
+* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement to be printed if the master script dies.
+* Trimommatic removes adapter contamination according to a specific file
+* All options are to be specified in the first section of the script
+* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
+
+
+#### To do:
+* reverse the order of the merging and trimming step.
+* simplify some syntax / make sure all paths are relative...
+
+
+
+
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
new file mode 100755
index 0000000..c04f28c
--- /dev/null
+++ b/R_scripts/combining_umerged.R
@@ -0,0 +1,61 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+args = commandArgs(TRUE)
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
+#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+
+setwd(start)
+
+###files to work with 
+system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
+system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
+system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
+
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
+files_a = read.table("assembled.files",stringsAsFactors = F) 
+
+
+for(i in 1:nrow(files_f))      
+  {
+#grep 1st and 2rd lines 
+  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
+  	system(unassembled.fastq)
+
+#grep sequences forward...
+  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
+  	system(unassembled.forward.fastq)
+
+#grep sequences reverse...
+  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
+  	system(unassembled.reverse.fastq)
+
+#N and E quality file
+ 	system("wc -l unassembled.seq.reverse.fastq >wc")
+ 	wc = read.table("wc")
+  
+ 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
+  
+#paste the unassembled sequences into a single file
+	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq
+	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
+	system(back)
+
+#add the merged sequences
+	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
+	system(all)
+
+#remove the clutter (file specific)
+	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
+	system(remove)
+}
+
+#remove the clutter (listing of all files)
+system("rm assembled.files umerged.forward.files umerged.reverse.files")
+
+
+
+
+
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
new file mode 100755
index 0000000..cc60bb3
--- /dev/null
+++ b/bash_scripts/master_samsa2.slurm
@@ -0,0 +1,431 @@
+#!/bin/bash
+
+#SBATCH --time=0-23:59
+#SBATCH --mem=100000
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=48
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	1.   Merging with PEAR, if applicable
+#	2.   Read cleaning with Trimmomatic
+#	3.   rRNA removal with SortMeRNA
+#	4.   Annotation using DIAMOND (by default against the RefSeq database)
+#	5.   Aggregation using analysis_counter.py
+#   4.1  Annotation using DIAMOND against the Subsystems database
+#   5.1  Aggregation using Subsystems-specific analysis counter.py
+#	6.	 Running R scripts to get DESeq statistical analysis.
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
+
+#	0.5 Sequences location:
+sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/bin
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2/python_scripts
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2/R_scripts
+
+#	threads
+threads="48"
+
+##################
+#Step0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.1: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/hiseq_raw/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/hiseq_unzip/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+			#FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED		
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		 #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between.
+	
+		combining_umerged.R $starting_location
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/hiseq_unzip/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+

From 5d90e0e4392d8210f95b828fa9c30121b9afb3cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 08:48:36 -0800
Subject: [PATCH 02/11] improved syntax

---
 bash_scripts/master_samsa2.slurm | 44 +++++++++++++++++---------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index cc60bb3..36b8297 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -25,14 +25,16 @@ module load r/3.5.0
 # step is set up below.
 #
 # The steps are:
-#	1.   Merging with PEAR, if applicable
-#	2.   Read cleaning with Trimmomatic
-#	3.   rRNA removal with SortMeRNA
-#	4.   Annotation using DIAMOND (by default against the RefSeq database)
-#	5.   Aggregation using analysis_counter.py
-#   4.1  Annotation using DIAMOND against the Subsystems database
-#   5.1  Aggregation using Subsystems-specific analysis counter.py
-#	6.	 Running R scripts to get DESeq statistical analysis.
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
 #
 # NOTE: BEFORE running this script, please run package_installation.bash
 # and full_database_download.bash located at:
@@ -48,14 +50,14 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba
 # 	0. Set starting location:
 starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
 
-#	0.5 Sequences location:
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw
 sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
 
 #	1. PEAR
 pear_location=/home/renaut/bin
 
 # 	2. Trimmomatic
-trimmomatic_location=/home/renaut/bin
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
 
 #	3. SortMeRNA
 sortmerna_location=/home/renaut/bin
@@ -66,19 +68,19 @@ diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
 diamond_location=/home/renaut/bin
 
 #	5. Aggregation
-python_programs=/home/renaut/samsa2/python_scripts
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
 RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
 Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
 
 #	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2/R_scripts
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
 
 #	threads
 threads="48"
 
 ##################
-#Step0.1: create/read checkpoint
+#STEP 0.1: create/read checkpoint
 cd $starting_location
 
 printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
@@ -92,7 +94,7 @@ fi
 
 ####################################################################
 #
-#STEP 0.1: GUNZIP PROCESS. 
+#STEP 0.2: GUNZIP PROCESS. 
 #
 #
 Step=$(grep "GUNZIP" pipeline/checkpoints)
@@ -109,6 +111,8 @@ if [ "${Step}" != "GUNZIP" ]
                                 	exit 1
                 		fi
 			done
+		
+		mkdir $sequence_location/hiseq_unzip
 		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
 		printf "GUNZIP\n" >>pipeline/checkpoints
                 echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
@@ -142,7 +146,6 @@ if [ "${Step}" != "MERGING" ]
 			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
 			out_name=`echo ${out_path##*/}`
 			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-			#FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED		
 		
 			if [ $? -ne 0 ]
                         	then
@@ -150,9 +153,10 @@ if [ "${Step}" != "MERGING" ]
                                 	exit 1
                 	fi
 		done
-		 #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between.
-	
-		combining_umerged.R $starting_location
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
 		printf "MERGING\n" >>pipeline/checkpoints
 		mkdir $starting_location/step_1_merging/
 		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
@@ -181,7 +185,7 @@ if [ "${Step}" != "TRIMMING" ]
 		do
 		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
 
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
 		if [ $? -ne 0 ]
 			then 
 				printf "\t!!! There is a problem in the trimmmomatic step"

From 33adea99d76ff7a05c8a8f1b3e244475e9f94df1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar5.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 12:16:07 -0800
Subject: [PATCH 03/11] sortmerna options

---
 README.md                        | 4 ++--
 bash_scripts/master_samsa2.slurm | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 189ac85..59417b5 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,10 @@ There is a new master script (master_samsa2.slurm) with several modifications:
 * Trimommatic removes adapter contamination according to a specific file
 * All options are to be specified in the first section of the script
 * The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
-
+* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways...
 
 #### To do:
-* reverse the order of the merging and trimming step.
+* reverse the order of the merging and trimming step (but again, maybe not...)
 * simplify some syntax / make sure all paths are relative...
 
 
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index 36b8297..6fa3c24 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -254,7 +254,7 @@ if [ "${Step}" != "RIBODEPLETION" ]
 		for file in $starting_location/step_2_trimming/*cleaned.fastq
 		do
 			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
 		if [ $? -ne 0 ]
 			then 
 				printf "\t!!! There is a problem in the sortmerna step"

From 58513fc8e1403670d54762ea086322a84ee6eccd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 3 Dec 2018 10:01:23 -0800
Subject: [PATCH 04/11] improve flow

---
 README.md | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 59417b5..72c0ce0 100644
--- a/README.md
+++ b/README.md
@@ -3,23 +3,21 @@
 ### modification by sebastien.renaut@gmail.com
 
 ### New in my version. 
-There is a new master script (master_samsa2.slurm) with several modifications:
+This is a new master script (master_samsa2.slurm) with several modifications:
 
 * It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
-* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint.
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
 * In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
 * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
-* Each step contains an exit statement to be printed if the master script dies.
-* Trimommatic removes adapter contamination according to a specific file
-* All options are to be specified in the first section of the script
-* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
-* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways...
+* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
+* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
 
 #### To do:
 * reverse the order of the merging and trimming step (but again, maybe not...)
-* simplify some syntax / make sure all paths are relative...
-
-
+* simplify some syntax / make sure all paths are relative (mostly done)
 
 
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  

From 79e66eede3f488cbc6272c34e10d7fa199e46553 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:01:57 -0800
Subject: [PATCH 05/11] minor

---
 .gitignore                       |  5 +++++
 bash_scripts/master_samsa2.slurm | 18 +++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9e8b5a7..c11349b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+# my files
+bash_scripts/master_samsa2.slurm
+bash_scripts/cyanotoxin_master_samsa2.slurm
+R_scripts/combining_umerged.R
+
 # Swap files
 *.swp
 *.swo
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index 6fa3c24..a16d11b 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #SBATCH --time=0-23:59
-#SBATCH --mem=100000
+#SBATCH --mem=100G
 #SBATCH --account=def-bruneaua
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=48
@@ -48,10 +48,10 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba
 # VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
 #
 # 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
+starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
 
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw
-sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
 
 #	1. PEAR
 pear_location=/home/renaut/bin
@@ -100,7 +100,7 @@ fi
 Step=$(grep "GUNZIP" pipeline/checkpoints)
 if [ "${Step}" != "GUNZIP" ]
 	then
-		for file in $sequence_location/hiseq_raw/*gz
+		for file in $sequence_location/zipped/*gz
 			do
 				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
         			gunzip $file -cd >$file_out
@@ -112,8 +112,8 @@ if [ "${Step}" != "GUNZIP" ]
                 		fi
 			done
 		
-		mkdir $sequence_location/hiseq_unzip
-		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
 		printf "GUNZIP\n" >>pipeline/checkpoints
                 echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
                 date
@@ -139,7 +139,7 @@ if [ "${Step}" != "MERGING" ]
 	then
 		mkdir $starting_location/step_1_merging
 		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/hiseq_unzip/*_R1.fastq
+		for file in $sequence_location/unzipped/*_R1.fastq
 		do
 			file1=$file
 			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
@@ -162,7 +162,7 @@ if [ "${Step}" != "MERGING" ]
 		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
 		
 		rm $starting_location/*fastq
-		rm $sequence_location/hiseq_unzip/*fastq
+		rm $sequence_location/unzipped/*fastq
 
 		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
 		date        

From fb2fc99f0e4f369fa783f476b2322f5caad6422b Mon Sep 17 00:00:00 2001
From: Sebastien Renaut <sebastien.renaut@gmail.com>
Date: Mon, 18 Feb 2019 14:03:14 -0500
Subject: [PATCH 06/11] remove from main

---
 bash_scripts/master_samsa2.slurm | 435 -------------------------------
 1 file changed, 435 deletions(-)
 delete mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
deleted file mode 100755
index a16d11b..0000000
--- a/bash_scripts/master_samsa2.slurm
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/bin/bash
-
-#SBATCH --time=0-23:59
-#SBATCH --mem=100G
-#SBATCH --account=def-bruneaua
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=48
-
-module load r/3.5.0
-
-
-# Lines starting with #SBATCH are for SLURM job management systems
-# and may be removed if user is not submitting jobs to SLURM
-####################################################################
-#set -x #echo
-#
-# master_script_for_sample_files.bash
-# Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified by Michelle Treiber on August 9, 2017
-#
-####################################################################
-#
-# This script sets up and runs through ALL steps in the SAMSA pipeline
-# before the analysis (which is done in R, likely in RStudio).  Each
-# step is set up below.
-#
-# The steps are:
-#	0.1	Create the checkpoint
-#	0.2	Gunzip the raw sequences
-#	1.	Merging with PEAR, if applicable
-#	2.	Read cleaning with Trimmomatic
-#	3.	rRNA removal with SortMeRNA
-#	4.	Annotation using DIAMOND against Subsys database
-#	5.	Aggregation using analysis_counter.py (Subsys)
-#   	4.1  	Annotation using DIAMOND against Refseq database
-#   	5.1	Aggregation using analysis_counter.py (Refseq)
-#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
-#
-# NOTE: BEFORE running this script, please run package_installation.bash
-# and full_database_download.bash located at:
-# https://github.com/transcript/samsa2/tree/master/setup in order to set 
-# up SAMSA2 dependencies and download full databases.
-#
-####################################################################
-#
-echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
-#
-# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
-#
-# 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
-
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
-sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
-
-#	1. PEAR
-pear_location=/home/renaut/bin
-
-# 	2. Trimmomatic
-trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
-
-#	3. SortMeRNA
-sortmerna_location=/home/renaut/bin
-
-#	4. DIAMOND
-diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
-diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
-diamond_location=/home/renaut/bin
-
-#	5. Aggregation
-python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
-RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
-Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
-
-#	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
-
-#	threads
-threads="48"
-
-##################
-#STEP 0.1: create/read checkpoint
-cd $starting_location
-
-printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
-if [ ! -f "pipeline/checkpoints" ]
-        then
-                printf "\tThe file checkpoints does not exist, we will create it.\n"
-        touch "pipeline/checkpoints"
-else
-        printf "\tThe file checkpoints already exist\n"
-fi
-
-####################################################################
-#
-#STEP 0.2: GUNZIP PROCESS. 
-#
-#
-Step=$(grep "GUNZIP" pipeline/checkpoints)
-if [ "${Step}" != "GUNZIP" ]
-	then
-		for file in $sequence_location/zipped/*gz
-			do
-				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
-        			gunzip $file -cd >$file_out
-
-				if [ $? -ne 0 ]
-                        		then
-                                	printf "\t!!! There is a problem in the gunzip step"
-                                	exit 1
-                		fi
-			done
-		
-		mkdir $sequence_location/unzipped
-		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
-		printf "GUNZIP\n" >>pipeline/checkpoints
-                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
-                date
-else
-        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
-# Note: paired-end files are usually named using R1 and R2 in the name.
-#		Example: control_1.R1.fastq
-#				 control_1.R2.fastq
-#
-# Note: if using single-end sequencing, skip this step (comment out).
-# Note: if performing R analysis (step 6), be sure to name files with 
-# 	the appropriate prefix ("control_$file" and "experimental_$file")!
-
-
-
-Step=$(grep "MERGING" pipeline/checkpoints)
-if [ "${Step}" != "MERGING" ]
-	then
-		mkdir $starting_location/step_1_merging
-		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/unzipped/*_R1.fastq
-		do
-			file1=$file
-			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
-			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
-			out_name=`echo ${out_path##*/}`
-			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-		
-			if [ $? -ne 0 ]
-                        	then
-                                	printf "\t!!! There is a problem in the merging step"
-                                	exit 1
-                	fi
-		done
-		
-		###I will concatenate forward and reverse complement with 20Ns in the middle.
-		$R_programs/combining_umerged.R $starting_location
-		
-		printf "MERGING\n" >>pipeline/checkpoints
-		mkdir $starting_location/step_1_merging/
-		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
-		
-		rm $starting_location/*fastq
-		rm $sequence_location/unzipped/*fastq
-
-		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
-		date        
-else
-        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2: CLEANING FILES WITH TRIMMOMATIC
-# Note: if skipping PEAR, make sure that all starting files are in the
-# $starting_location/step_1_output/ folder!
-Step=$(grep "TRIMMING" pipeline/checkpoints)
-if [ "${Step}" != "TRIMMING" ]
-        then
-		mkdir $starting_location/step_2_trimming
-		touch $starting_location/step_2_trimming/trimmomatic_log
-
-	for file in $starting_location/step_1_merging/*assembled2.fastq
-		do
-		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
-
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the trimmmomatic step"
-				exit 1
-		fi
-		
-		done
-
-	mkdir $starting_location/step_2_trimming/
-	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
-	
-	rm step_1_merging/*assembled2.fastq
-	
-	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
-	date
-	
-	printf "TRIMMING\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2.5: GETTING RAW SEQUENCES COUNTS
-# Note: These are used later for statistical analysis.
-Step=$(grep "RAW" pipeline/checkpoints)
-if [ "${Step}" != "RAW" ]
-        then
-		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
-			then
-				rm $starting_location/step_2_trimming/raw_counts.txt
-				touch $starting_location/step_2_trimming/raw_counts.txt
-		else
-			touch $starting_location/step_2_trimming/raw_counts.txt
-		fi	
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the raw sequences step"
-				exit 1
-		fi
-		done
-		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
-		date
-
-        printf "RAW\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
-# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
-# do that first (see the SortMeRNA user manual for details).
-
-Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
-if [ "${Step}" != "RIBODEPLETION" ]
-        then
-        mkdir $starting_location/step_3_ribodepletion
-		touch $starting_location/step_3_ribodepletion/sortmerna_log
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the sortmerna step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_3_ribodepletion/
-		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
-		
-		rm step_2_trimming/*cleaned.fastq
-		rm step_2_trimming/*ribosomes.fastq
-		
-		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
-		date
-	
-        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
-else
-        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
-date
-Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_4_annotation
-		touch $starting_location/step_4_annotation/diamond_log
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-		do
-			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
-			#removed the diamond sensitive option
-			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond subsys step"
-				exit 1
-		fi
-		done
-		
-		mkdir $starting_location/step_4_annotation/daa_binary_files/
-	
-		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
-        	date
-
-		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
-else
-       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
-fi
-##################################################################
-#
-# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
-Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_5_aggregation/
-		for file in $starting_location/step_4_annotation/*subsys_annotated
-		do
-			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
-	
-			# This quick program reduces down identical hierarchy annotations
-			python $python_programs/subsys_reducer.py -I $file.hierarchy
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the aggregation subsys step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
-		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		rm $starting_location/step_4_annotation/*.hierarchy
-
-		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
-		date
-
-        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
-# Note: this step assumes that the DIAMOND database is already built.  If not,
-# do that first before running this step.
-Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_REFSEQ" ]
-        then
-		touch $starting_location/step_4_annotation/diamond_log
-
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-			do
-				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
-				#removed the diamond sensitive option
-				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-			if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond refseq step"
-				exit 1
-			fi
-			done
-
-		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
-		date
-
-        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
-Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_REFSEQ" ]
-	then
-		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
-		do
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
-			if [ $? -ne 0 ]
-				then 
-					printf "\t!!! There is a problem in the aggregation refseq step"
-					exit 1
-			fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
-		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
-
-		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
-		date 
-
-        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
-fi
-
-
-
-##################################################################
-#
-# At this point, all the results files are ready for analysis using R.
-# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
-# and Subsystems annotations.
-#
-# More complex R analyses may be performed using specific .sh analysis scripts.
-#
-# STEP 6: R ANALYSIS
-# Note: For R to properly identify files to compare/contrast, they must include 
-# the appropriate prefix (either "control_$file" or experimental_$file")!
-
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
-
-echo "STEP 6 DONE: R ANALYSIS"
-echo "Master bash script finished running at: "; date
-exit
-####################################################################
-

From f68ba12b7bd1a88d9830c21e1c012cc2b6eec3a1 Mon Sep 17 00:00:00 2001
From: Sebastien Renaut <sebastien.renaut@gmail.com>
Date: Mon, 18 Feb 2019 14:07:22 -0500
Subject: [PATCH 07/11] delete from main branch

---
 R_scripts/combining_umerged.R | 61 -----------------------------------
 1 file changed, 61 deletions(-)
 delete mode 100755 R_scripts/combining_umerged.R

diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
deleted file mode 100755
index c04f28c..0000000
--- a/R_scripts/combining_umerged.R
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
-
-args = commandArgs(TRUE)
-start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
-#start="/Users/jerry/Documents/CSBQ/shapiro/results"
-
-setwd(start)
-
-###files to work with 
-system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
-system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
-system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
-
-files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
-files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
-files_a = read.table("assembled.files",stringsAsFactors = F) 
-
-
-for(i in 1:nrow(files_f))      
-  {
-#grep 1st and 2rd lines 
-  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
-  	system(unassembled.fastq)
-
-#grep sequences forward...
-  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
-  	system(unassembled.forward.fastq)
-
-#grep sequences reverse...
-  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
-  	system(unassembled.reverse.fastq)
-
-#N and E quality file
- 	system("wc -l unassembled.seq.reverse.fastq >wc")
- 	wc = read.table("wc")
-  
- 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
-  
-#paste the unassembled sequences into a single file
-	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
-
-#put everything back into a single fastq
-	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
-	system(back)
-
-#add the merged sequences
-	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
-	system(all)
-
-#remove the clutter (file specific)
-	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
-	system(remove)
-}
-
-#remove the clutter (listing of all files)
-system("rm assembled.files umerged.forward.files umerged.reverse.files")
-
-
-
-
-

From ceba37eea48f45f4bcc7171066d2c43af7aa2a5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:13:35 -0800
Subject: [PATCH 08/11] add a checkpoint file

---
 README.md                     | 20 --------------------
 bash_scripts/master_script.sh | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 72c0ce0..2ae3d5f 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,5 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
-### modification by sebastien.renaut@gmail.com
-
-### New in my version. 
-This is a new master script (master_samsa2.slurm) with several modifications:
-
-* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
-* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
-* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
-* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
-* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
-* Trimmomatic removes adapter contamination according to a specific fasta file.
-* All options, read & program location are to be specified in the first section of the script.
-* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
-* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
-
-#### To do:
-* reverse the order of the merging and trimming step (but again, maybe not...)
-* simplify some syntax / make sure all paths are relative (mostly done)
-
-
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
index 5f8ff5e..cd6ac6c 100644
--- a/bash_scripts/master_script.sh
+++ b/bash_scripts/master_script.sh
@@ -72,8 +72,26 @@ else
 fi
 
 ####################################################################
+#STEP 0.1: create/read checkpoint
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "$INPUT_DIR/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "$INPUT_DIR/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+
+
 #
 # STEP 1: CLEANING FILES WITH TRIMMOMATIC
+Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "TRIMMO" ]
+        then
+
 
 if ls $INPUT_DIR/*.gz &>/dev/null; then
   for file in $INPUT_DIR/*.gz
@@ -105,6 +123,11 @@ else
   mv $INPUT_DIR/*".cleaned" $STEP_1
 fi
 
+else
+        printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+
 ####################################################################
 #
 # STEP 2: MERGING OF PAIRED-END FILES USING PEAR

From bf29e7888a2b08872837c5cbb50e923abd89d711 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:26:35 -0800
Subject: [PATCH 09/11] checkpoint for all steps

---
 bash_scripts/master_script.sh | 74 +++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
index cd6ac6c..f6ef74e 100644
--- a/bash_scripts/master_script.sh
+++ b/bash_scripts/master_script.sh
@@ -134,6 +134,10 @@ fi
 # Note: paired-end files are usually named using R1 and R2 in the name.
 #       Example: control_1.R1.fastq
 #                control_1.R2.fastq
+Step=$(grep "MERGING" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "MERGING" ]
+        then
+
 
 $MKDIR $STEP_2
 if $paired; then
@@ -151,10 +155,19 @@ else
   done
 fi
 
+printf "MERGING\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 2.9: GETTING RAW SEQUENCES COUNTS
 # Note: These are used later for statistical analysis.
+Step=$(grep "RAW" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
 
 if [[ -f $STEP_2/raw_counts.txt ]]; then
     rm $STEP_2/raw_counts.txt
@@ -173,11 +186,20 @@ else
   done
 fi
 
+printf "RAW\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
 # Note: this step assumes that the SortMeRNA databases are indexed.  If not,
 # do that first (see the SortMeRNA user manual for details).
+Step=$(grep "RIBO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RIBO" ]
+        then
 
 for file in $STEP_2/*.assembled.fastq
 do
@@ -191,11 +213,20 @@ done
 $MKDIR $STEP_3
 mv $STEP_2/*ribodepleted* $STEP_3
 
+printf "RIBO\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ
 # Note: this step assumes that the DIAMOND database is already built.  If not,
 # do that first before running this step.
+Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_ANNOT" ]
+        then
 
 echo "Now starting on DIAMOND org annotations at: "; date
 
@@ -215,9 +246,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "RefSeq DIAMOND annotations completed at: "; date
 
+printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_AGGREG" ]
+        then
 
 for file in $STEP_4/*RefSeq_annotated
 do
@@ -230,9 +270,18 @@ $MKDIR $STEP_5/RefSeq_results/func_results
 mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results
 mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results
 
+printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_ANNOT" ]
+        then
 
 echo "Now starting on DIAMOND Subsystems annotations at: "; date
 
@@ -249,9 +298,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "DIAMOND Subsystems annotations completed at: "; date
 
+printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
 ##################################################################
 #
 # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_AGGREG" ]
+        then
 
 for file in $STEP_4/*subsys_annotated
 do
@@ -267,6 +325,12 @@ mv $STEP_4/*.reduced $STEP_5/Subsystems_results
 mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts
 rm $STEP_4/*.hierarchy
 
+printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
 ##################################################################
 #
 # At this point, all the results files are ready for analysis using R.
@@ -278,6 +342,9 @@ rm $STEP_4/*.hierarchy
 # STEP 6: R ANALYSIS
 # Note: For R to properly identify files to compare/contrast, they must include
 # the appropriate prefix (either "control_$file" or experimental_$file")!
+Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "R_ANALYSIS" ]
+        then
 
 checked Rscript $R_DIR/run_DESeq_stats.R \
   -I $STEP_5/RefSeq_results/org_results \
@@ -292,6 +359,13 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \
   -O Subsystems_level-1_DESeq_results.tab -L 1 \
   -R $STEP_2/raw_counts.txt
 
+printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n"
+fi
+
 echo "Master bash script finished running at: "; date
 exit
 ####################################################################
+

From c7474db08cc89320ae587fc0c7def606508f2063 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 4 Mar 2019 12:07:28 -0800
Subject: [PATCH 10/11] adding my own scripts to the forked samsa2 pipeline

---
 .gitignore                                  |   6 +-
 README.md                                   |  20 +
 R_scripts/combining_umerged.R               |  61 +++
 bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ++++++++++++++++++++
 bash_scripts/master_samsa2.slurm            | 435 +++++++++++++++++
 5 files changed, 1024 insertions(+), 3 deletions(-)
 create mode 100755 R_scripts/combining_umerged.R
 create mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm
 create mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/.gitignore b/.gitignore
index c11349b..30b79f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # my files
-bash_scripts/master_samsa2.slurm
-bash_scripts/cyanotoxin_master_samsa2.slurm
-R_scripts/combining_umerged.R
+#bash_scripts/master_samsa2.slurm
+#bash_scripts/cyanotoxin_master_samsa2.slurm
+#R_scripts/combining_umerged.R
 
 # Swap files
 *.swp
diff --git a/README.md b/README.md
index 2ae3d5f..72c0ce0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,25 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
+### modification by sebastien.renaut@gmail.com
+
+### New in my version. 
+This is a new master script (master_samsa2.slurm) with several modifications:
+
+* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
+* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
+* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
+
+#### To do:
+* reverse the order of the merging and trimming step (but again, maybe not...)
+* simplify some syntax / make sure all paths are relative (mostly done)
+
+
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
new file mode 100755
index 0000000..c04f28c
--- /dev/null
+++ b/R_scripts/combining_umerged.R
@@ -0,0 +1,61 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+args = commandArgs(TRUE)
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
+#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+
+setwd(start)
+
+###files to work with 
+system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
+system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
+system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
+
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
+files_a = read.table("assembled.files",stringsAsFactors = F) 
+
+
+for(i in 1:nrow(files_f))      
+  {
+#grep 1st and 2rd lines 
+  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
+  	system(unassembled.fastq)
+
+#grep sequences forward...
+  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
+  	system(unassembled.forward.fastq)
+
+#grep sequences reverse...
+  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
+  	system(unassembled.reverse.fastq)
+
+#N and E quality file
+ 	system("wc -l unassembled.seq.reverse.fastq >wc")
+ 	wc = read.table("wc")
+  
+ 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
+  
+#paste the unassembled sequences into a single file
+	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq
+	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
+	system(back)
+
+#add the merged sequences
+	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
+	system(all)
+
+#remove the clutter (file specific)
+	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
+	system(remove)
+}
+
+#remove the clutter (listing of all files)
+system("rm assembled.files umerged.forward.files umerged.reverse.files")
+
+
+
+
+
diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm
new file mode 100755
index 0000000..2e278da
--- /dev/null
+++ b/bash_scripts/cyanotoxin_master_samsa2.slurm
@@ -0,0 +1,505 @@
+#!/bin/bash
+
+#SBATCH --time=0-07:59
+#SBATCH --mem=40G
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=12
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1
+
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
+cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2"
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
+
+#	threads
+threads="12"
+
+##################
+#STEP 0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.2: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/zipped/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/unzipped/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/unzipped/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_CYANO" ]
+        then
+                touch $starting_location/step_4_annotation/diamond_cyano_log
+
+                for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+                        do
+                                shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'`
+                                #removed the diamond sensitive option
+                                $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log
+                                $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log
+                        if [ $? -ne 0 ]
+                        then
+                                printf "\t!!! There is a problem in the diamond cyano step"
+                                exit 1
+                        fi
+                        done
+
+                mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+                mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+                echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO"
+                date
+
+                printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints
+else
+                printf  "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_CYANO" ]
+        then
+                for file in $starting_location/step_4_annotation/*cyano_annotated*
+                do
+                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O
+                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F
+                        if [ $? -ne 0 ]
+                                then
+                                        printf "\t!!! There is a problem in the aggregation CYANO step"
+                                        exit 1
+                        fi
+                done
+
+                mkdir $starting_location/step_5_aggregation/cyano_results/
+                mkdir $starting_location/step_5_aggregation/cyano_results/org_results/
+                mkdir $starting_location/step_5_aggregation/cyano_results/func_results/
+                mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/
+                mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/
+
+                echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+                date
+
+                printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints
+else
+                printf  "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n"
+fi
+
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
new file mode 100755
index 0000000..a16d11b
--- /dev/null
+++ b/bash_scripts/master_samsa2.slurm
@@ -0,0 +1,435 @@
+#!/bin/bash
+
+#SBATCH --time=0-23:59
+#SBATCH --mem=100G
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=48
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
+
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
+
+#	threads
+threads="48"
+
+##################
+#STEP 0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.2: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/zipped/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/unzipped/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/unzipped/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+

From 0efac2564fe50ecce42ceca3abd14215b92935f4 Mon Sep 17 00:00:00 2001
From: Sam Westreich <swestreich@dnanexus.com>
Date: Tue, 12 Mar 2019 12:49:30 -0700
Subject: [PATCH 11/11] Incorporating and streamlining checkpoint update

---
 README.md                                     |  22 +-
 R_scripts/combining_umerged.R                 |  24 +-
 bash_scripts/cyanotoxin_master_samsa2.slurm   | 505 ------------------
 bash_scripts/master_samsa2.slurm              | 435 ---------------
 bash_scripts/master_script.sh                 |  98 ++--
 .../master_script_preserving_unmerged.sh      | 374 +++++++++++++
 6 files changed, 441 insertions(+), 1017 deletions(-)
 delete mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm
 delete mode 100755 bash_scripts/master_samsa2.slurm
 create mode 100644 bash_scripts/master_script_preserving_unmerged.sh

diff --git a/README.md b/README.md
index 00978e8..9110279 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,20 @@
-# SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
+# SAMSA2 - A complete metatranscriptome analysis pipeline
 
-### modification by sebastien.renaut@gmail.com
+*****
 
-### New in my version. 
-This is a new master script (master_samsa2.slurm) with several modifications:
+**Version 2.2.0 - Yesod - Modifications added by sebastien.renaut@gmail.com:**
 
-* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
-* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
-* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Multithreading added for PEAR, Trimmomatic, SortMeRNA
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint` and that step is skipped on a rerun of the master_script. This is done to avoid re-running CPU-intensive steps if unnecessary.
+* A new version of the master script now exists, called "master\_script\_preserving\_unmerged.sh".  In this script, in the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
 * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
 * Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
 * Trimmomatic removes adapter contamination according to a specific fasta file.
 * All options, read & program location are to be specified in the first section of the script.
 * The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
-* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
-
-#### To do:
-* reverse the order of the merging and trimming step (but again, maybe not...)
-* simplify some syntax / make sure all paths are relative (mostly done)
+* The flag --num_alignments 0 in the ribosomal `sortmrna` step has been removed. This caused problems and slowed things down a lot. Plus, we don't care about the rRNA alignments - whether a sequence aligns to 1 or 1,000 rRNA, it's out anyways...
 
+*****
 
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
@@ -32,7 +28,7 @@ Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting
 
 ### Dependencies
 
-SAMSA2 requires Python2 for aggregation scripts.  Currently, this pipeline does not work with Python3, although an update is planned.
+SAMSA2 requires Python2 for aggregation scripts.  Currently, this pipeline works mostly with Python3, although there may be some errors not yet caught.
 
 The following programs can be downloaded OR can be installed from the binaries provided in the programs/ folder.
 
diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
index c04f28c..e4aad70 100755
--- a/R_scripts/combining_umerged.R
+++ b/R_scripts/combining_umerged.R
@@ -1,24 +1,23 @@
 #!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
 
 args = commandArgs(TRUE)
-start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
-#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directly from the shell. Alternatively, you can do this from the "alignments" function itself.
 
 setwd(start)
 
-###files to work with 
+###files to work with
 system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
 system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
 system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
 
-files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
-files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
-files_a = read.table("assembled.files",stringsAsFactors = F) 
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F)
+files_a = read.table("assembled.files",stringsAsFactors = F)
 
 
-for(i in 1:nrow(files_f))      
+for(i in 1:nrow(files_f))
   {
-#grep 1st and 2rd lines 
+#grep 1st and 2rd lines
   	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
   	system(unassembled.fastq)
 
@@ -33,9 +32,9 @@ for(i in 1:nrow(files_f))
 #N and E quality file
  	system("wc -l unassembled.seq.reverse.fastq >wc")
  	wc = read.table("wc")
-  
+
  	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
-  
+
 #paste the unassembled sequences into a single file
 	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
 
@@ -54,8 +53,3 @@ for(i in 1:nrow(files_f))
 
 #remove the clutter (listing of all files)
 system("rm assembled.files umerged.forward.files umerged.reverse.files")
-
-
-
-
-
diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm
deleted file mode 100755
index 2e278da..0000000
--- a/bash_scripts/cyanotoxin_master_samsa2.slurm
+++ /dev/null
@@ -1,505 +0,0 @@
-#!/bin/bash
-
-#SBATCH --time=0-07:59
-#SBATCH --mem=40G
-#SBATCH --account=def-bruneaua
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=12
-
-module load r/3.5.0
-
-
-# Lines starting with #SBATCH are for SLURM job management systems
-# and may be removed if user is not submitting jobs to SLURM
-####################################################################
-#set -x #echo
-#
-# master_script_for_sample_files.bash
-# Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified by Michelle Treiber on August 9, 2017
-#
-####################################################################
-#
-# This script sets up and runs through ALL steps in the SAMSA pipeline
-# before the analysis (which is done in R, likely in RStudio).  Each
-# step is set up below.
-#
-# The steps are:
-#	0.1	Create the checkpoint
-#	0.2	Gunzip the raw sequences
-#	1.	Merging with PEAR, if applicable
-#	2.	Read cleaning with Trimmomatic
-#	3.	rRNA removal with SortMeRNA
-#	4.	Annotation using DIAMOND against Subsys database
-#	5.	Aggregation using analysis_counter.py (Subsys)
-#   	4.1  	Annotation using DIAMOND against Refseq database
-#   	5.1	Aggregation using analysis_counter.py (Refseq)
-#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
-#
-# NOTE: BEFORE running this script, please run package_installation.bash
-# and full_database_download.bash located at:
-# https://github.com/transcript/samsa2/tree/master/setup in order to set 
-# up SAMSA2 dependencies and download full databases.
-#
-####################################################################
-#
-echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
-#
-# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
-#
-# 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1
-
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped
-sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1
-
-#	1. PEAR
-pear_location=/home/renaut/bin
-
-# 	2. Trimmomatic
-trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
-
-#	3. SortMeRNA
-sortmerna_location=/home/renaut/bin
-
-#	4. DIAMOND
-diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
-diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1"
-diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
-diamond_location=/home/renaut/bin
-
-#	5. Aggregation
-python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
-cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2"
-RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
-Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
-
-#	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
-
-#	threads
-threads="12"
-
-##################
-#STEP 0.1: create/read checkpoint
-cd $starting_location
-
-printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
-if [ ! -f "pipeline/checkpoints" ]
-        then
-                printf "\tThe file checkpoints does not exist, we will create it.\n"
-        touch "pipeline/checkpoints"
-else
-        printf "\tThe file checkpoints already exist\n"
-fi
-
-####################################################################
-#
-#STEP 0.2: GUNZIP PROCESS. 
-#
-#
-Step=$(grep "GUNZIP" pipeline/checkpoints)
-if [ "${Step}" != "GUNZIP" ]
-	then
-		for file in $sequence_location/zipped/*gz
-			do
-				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
-        			gunzip $file -cd >$file_out
-
-				if [ $? -ne 0 ]
-                        		then
-                                	printf "\t!!! There is a problem in the gunzip step"
-                                	exit 1
-                		fi
-			done
-		
-		mkdir $sequence_location/unzipped
-		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
-		printf "GUNZIP\n" >>pipeline/checkpoints
-                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
-                date
-else
-        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
-# Note: paired-end files are usually named using R1 and R2 in the name.
-#		Example: control_1.R1.fastq
-#				 control_1.R2.fastq
-#
-# Note: if using single-end sequencing, skip this step (comment out).
-# Note: if performing R analysis (step 6), be sure to name files with 
-# 	the appropriate prefix ("control_$file" and "experimental_$file")!
-
-
-
-Step=$(grep "MERGING" pipeline/checkpoints)
-if [ "${Step}" != "MERGING" ]
-	then
-		mkdir $starting_location/step_1_merging
-		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/unzipped/*_R1.fastq
-		do
-			file1=$file
-			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
-			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
-			out_name=`echo ${out_path##*/}`
-			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-		
-			if [ $? -ne 0 ]
-                        	then
-                                	printf "\t!!! There is a problem in the merging step"
-                                	exit 1
-                	fi
-		done
-		
-		###I will concatenate forward and reverse complement with 20Ns in the middle.
-		$R_programs/combining_umerged.R $starting_location
-		
-		printf "MERGING\n" >>pipeline/checkpoints
-		mkdir $starting_location/step_1_merging/
-		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
-		
-		rm $starting_location/*fastq
-		rm $sequence_location/unzipped/*fastq
-
-		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
-		date        
-else
-        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2: CLEANING FILES WITH TRIMMOMATIC
-# Note: if skipping PEAR, make sure that all starting files are in the
-# $starting_location/step_1_output/ folder!
-Step=$(grep "TRIMMING" pipeline/checkpoints)
-if [ "${Step}" != "TRIMMING" ]
-        then
-		mkdir $starting_location/step_2_trimming
-		touch $starting_location/step_2_trimming/trimmomatic_log
-
-	for file in $starting_location/step_1_merging/*assembled2.fastq
-		do
-		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
-
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the trimmmomatic step"
-				exit 1
-		fi
-		
-		done
-
-	mkdir $starting_location/step_2_trimming/
-	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
-	
-	rm step_1_merging/*assembled2.fastq
-	
-	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
-	date
-	
-	printf "TRIMMING\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2.5: GETTING RAW SEQUENCES COUNTS
-# Note: These are used later for statistical analysis.
-Step=$(grep "RAW" pipeline/checkpoints)
-if [ "${Step}" != "RAW" ]
-        then
-		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
-			then
-				rm $starting_location/step_2_trimming/raw_counts.txt
-				touch $starting_location/step_2_trimming/raw_counts.txt
-		else
-			touch $starting_location/step_2_trimming/raw_counts.txt
-		fi	
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the raw sequences step"
-				exit 1
-		fi
-		done
-		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
-		date
-
-        printf "RAW\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
-# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
-# do that first (see the SortMeRNA user manual for details).
-
-Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
-if [ "${Step}" != "RIBODEPLETION" ]
-        then
-        mkdir $starting_location/step_3_ribodepletion
-		touch $starting_location/step_3_ribodepletion/sortmerna_log
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the sortmerna step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_3_ribodepletion/
-		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
-		
-		rm step_2_trimming/*cleaned.fastq
-		rm step_2_trimming/*ribosomes.fastq
-		
-		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
-		date
-	
-        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
-else
-        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
-date
-Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_4_annotation
-		touch $starting_location/step_4_annotation/diamond_log
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-		do
-			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
-			#removed the diamond sensitive option
-			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond subsys step"
-				exit 1
-		fi
-		done
-		
-		mkdir $starting_location/step_4_annotation/daa_binary_files/
-	
-		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
-        	date
-
-		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
-else
-       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
-fi
-##################################################################
-#
-# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
-Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_5_aggregation/
-		for file in $starting_location/step_4_annotation/*subsys_annotated
-		do
-			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
-	
-			# This quick program reduces down identical hierarchy annotations
-			python $python_programs/subsys_reducer.py -I $file.hierarchy
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the aggregation subsys step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
-		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		rm $starting_location/step_4_annotation/*.hierarchy
-
-		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
-		date
-
-        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
-# Note: this step assumes that the DIAMOND database is already built.  If not,
-# do that first before running this step.
-Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_REFSEQ" ]
-        then
-		touch $starting_location/step_4_annotation/diamond_log
-
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-			do
-				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
-				#removed the diamond sensitive option
-				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-			if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond refseq step"
-				exit 1
-			fi
-			done
-
-		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
-		date
-
-        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO
-# Note: this step assumes that the DIAMOND database is already built.  If not,
-# do that first before running this step.
-Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_CYANO" ]
-        then
-                touch $starting_location/step_4_annotation/diamond_cyano_log
-
-                for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-                        do
-                                shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'`
-                                #removed the diamond sensitive option
-                                $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log
-                                $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log
-                        if [ $? -ne 0 ]
-                        then
-                                printf "\t!!! There is a problem in the diamond cyano step"
-                                exit 1
-                        fi
-                        done
-
-                mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
-                mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-                echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO"
-                date
-
-                printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints
-else
-                printf  "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
-Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_REFSEQ" ]
-	then
-		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
-		do
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
-			if [ $? -ne 0 ]
-				then 
-					printf "\t!!! There is a problem in the aggregation refseq step"
-					exit 1
-			fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
-		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
-
-		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
-		date 
-
-        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER
-Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_CYANO" ]
-        then
-                for file in $starting_location/step_4_annotation/*cyano_annotated*
-                do
-                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O
-                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F
-                        if [ $? -ne 0 ]
-                                then
-                                        printf "\t!!! There is a problem in the aggregation CYANO step"
-                                        exit 1
-                        fi
-                done
-
-                mkdir $starting_location/step_5_aggregation/cyano_results/
-                mkdir $starting_location/step_5_aggregation/cyano_results/org_results/
-                mkdir $starting_location/step_5_aggregation/cyano_results/func_results/
-                mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/
-                mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/
-
-                echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
-                date
-
-                printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints
-else
-                printf  "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n"
-fi
-
-
-
-
-##################################################################
-#
-# At this point, all the results files are ready for analysis using R.
-# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
-# and Subsystems annotations.
-#
-# More complex R analyses may be performed using specific .sh analysis scripts.
-#
-# STEP 6: R ANALYSIS
-# Note: For R to properly identify files to compare/contrast, they must include 
-# the appropriate prefix (either "control_$file" or experimental_$file")!
-
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
-
-echo "STEP 6 DONE: R ANALYSIS"
-echo "Master bash script finished running at: "; date
-exit
-####################################################################
-
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
deleted file mode 100755
index a16d11b..0000000
--- a/bash_scripts/master_samsa2.slurm
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/bin/bash
-
-#SBATCH --time=0-23:59
-#SBATCH --mem=100G
-#SBATCH --account=def-bruneaua
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=48
-
-module load r/3.5.0
-
-
-# Lines starting with #SBATCH are for SLURM job management systems
-# and may be removed if user is not submitting jobs to SLURM
-####################################################################
-#set -x #echo
-#
-# master_script_for_sample_files.bash
-# Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified by Michelle Treiber on August 9, 2017
-#
-####################################################################
-#
-# This script sets up and runs through ALL steps in the SAMSA pipeline
-# before the analysis (which is done in R, likely in RStudio).  Each
-# step is set up below.
-#
-# The steps are:
-#	0.1	Create the checkpoint
-#	0.2	Gunzip the raw sequences
-#	1.	Merging with PEAR, if applicable
-#	2.	Read cleaning with Trimmomatic
-#	3.	rRNA removal with SortMeRNA
-#	4.	Annotation using DIAMOND against Subsys database
-#	5.	Aggregation using analysis_counter.py (Subsys)
-#   	4.1  	Annotation using DIAMOND against Refseq database
-#   	5.1	Aggregation using analysis_counter.py (Refseq)
-#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
-#
-# NOTE: BEFORE running this script, please run package_installation.bash
-# and full_database_download.bash located at:
-# https://github.com/transcript/samsa2/tree/master/setup in order to set 
-# up SAMSA2 dependencies and download full databases.
-#
-####################################################################
-#
-echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
-#
-# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
-#
-# 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
-
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
-sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
-
-#	1. PEAR
-pear_location=/home/renaut/bin
-
-# 	2. Trimmomatic
-trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
-
-#	3. SortMeRNA
-sortmerna_location=/home/renaut/bin
-
-#	4. DIAMOND
-diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
-diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
-diamond_location=/home/renaut/bin
-
-#	5. Aggregation
-python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
-RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
-Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
-
-#	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
-
-#	threads
-threads="48"
-
-##################
-#STEP 0.1: create/read checkpoint
-cd $starting_location
-
-printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
-if [ ! -f "pipeline/checkpoints" ]
-        then
-                printf "\tThe file checkpoints does not exist, we will create it.\n"
-        touch "pipeline/checkpoints"
-else
-        printf "\tThe file checkpoints already exist\n"
-fi
-
-####################################################################
-#
-#STEP 0.2: GUNZIP PROCESS. 
-#
-#
-Step=$(grep "GUNZIP" pipeline/checkpoints)
-if [ "${Step}" != "GUNZIP" ]
-	then
-		for file in $sequence_location/zipped/*gz
-			do
-				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
-        			gunzip $file -cd >$file_out
-
-				if [ $? -ne 0 ]
-                        		then
-                                	printf "\t!!! There is a problem in the gunzip step"
-                                	exit 1
-                		fi
-			done
-		
-		mkdir $sequence_location/unzipped
-		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
-		printf "GUNZIP\n" >>pipeline/checkpoints
-                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
-                date
-else
-        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
-# Note: paired-end files are usually named using R1 and R2 in the name.
-#		Example: control_1.R1.fastq
-#				 control_1.R2.fastq
-#
-# Note: if using single-end sequencing, skip this step (comment out).
-# Note: if performing R analysis (step 6), be sure to name files with 
-# 	the appropriate prefix ("control_$file" and "experimental_$file")!
-
-
-
-Step=$(grep "MERGING" pipeline/checkpoints)
-if [ "${Step}" != "MERGING" ]
-	then
-		mkdir $starting_location/step_1_merging
-		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/unzipped/*_R1.fastq
-		do
-			file1=$file
-			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
-			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
-			out_name=`echo ${out_path##*/}`
-			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-		
-			if [ $? -ne 0 ]
-                        	then
-                                	printf "\t!!! There is a problem in the merging step"
-                                	exit 1
-                	fi
-		done
-		
-		###I will concatenate forward and reverse complement with 20Ns in the middle.
-		$R_programs/combining_umerged.R $starting_location
-		
-		printf "MERGING\n" >>pipeline/checkpoints
-		mkdir $starting_location/step_1_merging/
-		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
-		
-		rm $starting_location/*fastq
-		rm $sequence_location/unzipped/*fastq
-
-		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
-		date        
-else
-        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2: CLEANING FILES WITH TRIMMOMATIC
-# Note: if skipping PEAR, make sure that all starting files are in the
-# $starting_location/step_1_output/ folder!
-Step=$(grep "TRIMMING" pipeline/checkpoints)
-if [ "${Step}" != "TRIMMING" ]
-        then
-		mkdir $starting_location/step_2_trimming
-		touch $starting_location/step_2_trimming/trimmomatic_log
-
-	for file in $starting_location/step_1_merging/*assembled2.fastq
-		do
-		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
-
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the trimmmomatic step"
-				exit 1
-		fi
-		
-		done
-
-	mkdir $starting_location/step_2_trimming/
-	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
-	
-	rm step_1_merging/*assembled2.fastq
-	
-	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
-	date
-	
-	printf "TRIMMING\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2.5: GETTING RAW SEQUENCES COUNTS
-# Note: These are used later for statistical analysis.
-Step=$(grep "RAW" pipeline/checkpoints)
-if [ "${Step}" != "RAW" ]
-        then
-		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
-			then
-				rm $starting_location/step_2_trimming/raw_counts.txt
-				touch $starting_location/step_2_trimming/raw_counts.txt
-		else
-			touch $starting_location/step_2_trimming/raw_counts.txt
-		fi	
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the raw sequences step"
-				exit 1
-		fi
-		done
-		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
-		date
-
-        printf "RAW\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
-# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
-# do that first (see the SortMeRNA user manual for details).
-
-Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
-if [ "${Step}" != "RIBODEPLETION" ]
-        then
-        mkdir $starting_location/step_3_ribodepletion
-		touch $starting_location/step_3_ribodepletion/sortmerna_log
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the sortmerna step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_3_ribodepletion/
-		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
-		
-		rm step_2_trimming/*cleaned.fastq
-		rm step_2_trimming/*ribosomes.fastq
-		
-		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
-		date
-	
-        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
-else
-        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
-date
-Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_4_annotation
-		touch $starting_location/step_4_annotation/diamond_log
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-		do
-			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
-			#removed the diamond sensitive option
-			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond subsys step"
-				exit 1
-		fi
-		done
-		
-		mkdir $starting_location/step_4_annotation/daa_binary_files/
-	
-		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
-        	date
-
-		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
-else
-       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
-fi
-##################################################################
-#
-# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
-Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_5_aggregation/
-		for file in $starting_location/step_4_annotation/*subsys_annotated
-		do
-			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
-	
-			# This quick program reduces down identical hierarchy annotations
-			python $python_programs/subsys_reducer.py -I $file.hierarchy
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the aggregation subsys step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
-		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		rm $starting_location/step_4_annotation/*.hierarchy
-
-		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
-		date
-
-        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
-# Note: this step assumes that the DIAMOND database is already built.  If not,
-# do that first before running this step.
-Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_REFSEQ" ]
-        then
-		touch $starting_location/step_4_annotation/diamond_log
-
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-			do
-				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
-				#removed the diamond sensitive option
-				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-			if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond refseq step"
-				exit 1
-			fi
-			done
-
-		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
-		date
-
-        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
-Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_REFSEQ" ]
-	then
-		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
-		do
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
-			if [ $? -ne 0 ]
-				then 
-					printf "\t!!! There is a problem in the aggregation refseq step"
-					exit 1
-			fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
-		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
-
-		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
-		date 
-
-        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
-fi
-
-
-
-##################################################################
-#
-# At this point, all the results files are ready for analysis using R.
-# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
-# and Subsystems annotations.
-#
-# More complex R analyses may be performed using specific .sh analysis scripts.
-#
-# STEP 6: R ANALYSIS
-# Note: For R to properly identify files to compare/contrast, they must include 
-# the appropriate prefix (either "control_$file" or experimental_$file")!
-
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
-
-echo "STEP 6 DONE: R ANALYSIS"
-echo "Master bash script finished running at: "; date
-exit
-####################################################################
-
diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
index f6ef74e..25092f1 100644
--- a/bash_scripts/master_script.sh
+++ b/bash_scripts/master_script.sh
@@ -9,7 +9,7 @@
 #
 # master_script.sh
 # Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified February 21, 2018
+# This version modified March 12, 2019
 #
 ####################################################################
 #
@@ -43,6 +43,9 @@ source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh"
 INPUT_DIR=$SAMSA/input_files
 OUT_DIR=$SAMSA
 
+# number of threads
+threads=`getconf _NPROCESSORS_ONLN`
+
 STEP_1="$OUT_DIR/step_1_output"
 STEP_2="$OUT_DIR/step_2_output"
 STEP_3="$OUT_DIR/step_3_output"
@@ -74,30 +77,27 @@ fi
 ####################################################################
 #STEP 0.1: create/read checkpoint
 
-printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n"
 if [ ! -f "$INPUT_DIR/checkpoints" ]
-        then
-                printf "\tThe file checkpoints does not exist, we will create it.\n"
-        touch "$INPUT_DIR/checkpoints"
+  then
+    printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n"
+    touch "$INPUT_DIR/checkpoints"
 else
-        printf "\tThe file checkpoints already exist\n"
+    printf "\tThe file 'checkpoints' already exists in the input directory.\n"
 fi
 
 ####################################################################
-
-
 #
 # STEP 1: CLEANING FILES WITH TRIMMOMATIC
 Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "TRIMMO" ]
-        then
-
+  then
 
 if ls $INPUT_DIR/*.gz &>/dev/null; then
   for file in $INPUT_DIR/*.gz
   do
-      gunzip $file
-    done
+    gunzip $file
+  done
 fi
 
 $MKDIR $STEP_1
@@ -108,11 +108,11 @@ do
     out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'`
     if [ -f $f2 ]; then
       paired=true
-      checked java -jar $TRIMMOMATIC PE -phred33 $f $f2 \
+      checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \
         $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \
         SLIDINGWINDOW:4:15 MINLEN:70
     else
-      checked java -jar $TRIMMOMATIC SE -phred33 $f $out_path SLIDINGWINDOW:4:15 MINLEN:70
+      checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70
     fi
 done
 
@@ -122,9 +122,10 @@ if $paired; then
 else
   mv $INPUT_DIR/*".cleaned" $STEP_1
 fi
+printf "TRIMMO\n" >>pipeline/checkpoints
 
 else
-        printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n"
+  printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n"
 fi
 
 
@@ -134,18 +135,18 @@ fi
 # Note: paired-end files are usually named using R1 and R2 in the name.
 #       Example: control_1.R1.fastq
 #                control_1.R2.fastq
+
 Step=$(grep "MERGING" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "MERGING" ]
-        then
-
+  then
 
 $MKDIR $STEP_2
 if $paired; then
   for file in $STEP_1/*.cleaned.forward
   do
-      f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'`
-      shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'`
-      checked $PEAR -f $file -r $f2 -o $STEP_2/${shortname##*/}
+    f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'`
+    shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'`
+    checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/}
   done
 else
   for file in $STEP_1/*.cleaned
@@ -158,7 +159,7 @@ fi
 printf "MERGING\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n"
+  printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n"
 fi
 
 ####################################################################
@@ -167,7 +168,7 @@ fi
 # Note: These are used later for statistical analysis.
 Step=$(grep "RAW" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "RAW" ]
-        then
+  then
 
 if [[ -f $STEP_2/raw_counts.txt ]]; then
     rm $STEP_2/raw_counts.txt
@@ -177,19 +178,19 @@ touch $STEP_2/raw_counts.txt
 if $paired; then
   for file in $STEP_1/*cleaned.forward
   do
-      checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
   done
 else
   for file in $STEP_1/*cleaned
   do
-      checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
   done
 fi
 
 printf "RAW\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+  printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n"
 fi
 
 ####################################################################
@@ -199,15 +200,15 @@ fi
 # do that first (see the SortMeRNA user manual for details).
 Step=$(grep "RIBO" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "RIBO" ]
-        then
+  then
 
 for file in $STEP_2/*.assembled.fastq
 do
-    shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'`
-    checked $SORTMERNA \
-      --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \
-      --reads $file --aligned $file.ribosomes --other $shortname --fastx \
-      --num_alignments 0 --log -v
+  shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'`
+  checked $SORTMERNA -a $threads \
+    --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \
+    --reads $file --aligned $file.ribosomes --other $shortname --fastx \
+    --log -v
 done
 
 $MKDIR $STEP_3
@@ -216,7 +217,7 @@ mv $STEP_2/*ribodepleted* $STEP_3
 printf "RIBO\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n"
+  printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n"
 fi
 
 ####################################################################
@@ -226,7 +227,7 @@ fi
 # do that first before running this step.
 Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "REFSEQ_ANNOT" ]
-        then
+  then
 
 echo "Now starting on DIAMOND org annotations at: "; date
 
@@ -249,7 +250,7 @@ echo "RefSeq DIAMOND annotations completed at: "; date
 printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n"
+  printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n"
 fi
 
 ####################################################################
@@ -257,12 +258,12 @@ fi
 # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER
 Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "REFSEQ_AGGREG" ]
-        then
+  then
 
 for file in $STEP_4/*RefSeq_annotated
 do
-    checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-    checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
 done
 
 $MKDIR $STEP_5/RefSeq_results/org_results
@@ -273,7 +274,7 @@ mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results
 printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n"
+  printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n"
 fi
 
 ####################################################################
@@ -281,7 +282,7 @@ fi
 # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
 Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "SUBSYS_ANNOT" ]
-        then
+  then
 
 echo "Now starting on DIAMOND Subsystems annotations at: "; date
 
@@ -301,7 +302,7 @@ echo "DIAMOND Subsystems annotations completed at: "; date
 printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n"
+  printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n"
 fi
 
 ##################################################################
@@ -309,15 +310,15 @@ fi
 # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER
 Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "SUBSYS_AGGREG" ]
-        then
+  then
 
 for file in $STEP_4/*subsys_annotated
 do
-    checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \
-      -D $Subsys_db -O $file.hierarchy -P $file.receipt
+  checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \
+    -D $Subsys_db -O $file.hierarchy -P $file.receipt
 
-    # This quick program reduces down identical hierarchy annotations
-    checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy
+  # This quick program reduces down identical hierarchy annotations
+  checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy
 done
 
 $MKDIR $STEP_5/Subsystems_results/receipts
@@ -328,7 +329,7 @@ rm $STEP_4/*.hierarchy
 printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n"
+  printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n"
 fi
 
 ##################################################################
@@ -344,7 +345,7 @@ fi
 # the appropriate prefix (either "control_$file" or experimental_$file")!
 Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints)
 if [ "${Step}" != "R_ANALYSIS" ]
-        then
+  then
 
 checked Rscript $R_DIR/run_DESeq_stats.R \
   -I $STEP_5/RefSeq_results/org_results \
@@ -362,10 +363,9 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \
 printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints
 
 else
-        printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n"
+  printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n"
 fi
 
 echo "Master bash script finished running at: "; date
 exit
 ####################################################################
-
diff --git a/bash_scripts/master_script_preserving_unmerged.sh b/bash_scripts/master_script_preserving_unmerged.sh
new file mode 100644
index 0000000..be495b6
--- /dev/null
+++ b/bash_scripts/master_script_preserving_unmerged.sh
@@ -0,0 +1,374 @@
+#!/bin/bash
+#SBATCH --mem=100000
+#SBATCH --time=7-0:0:0
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+
+####################################################################
+#
+# master_script.sh
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified March 12, 2019
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#   1.   Read cleaning with Trimmomatic
+#   2.   Merging with PEAR, if applicable
+#   3.   rRNA removal with SortMeRNA
+#   4.   Annotation using DIAMOND (by default against the RefSeq database)
+#   5.   Aggregation using analysis_counter.py
+#   4.1  Annotation using DIAMOND against the Subsystems database
+#   5.1  Aggregation using Subsystems-specific analysis counter.py
+#   6.   Running R scripts to get DESeq statistical analysis.
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set
+# up SAMSA2 dependencies and download full databases.
+#
+#######################################################################
+#
+echo -e "NOTE: Before running this script, please run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - set starting location and starting files location pathways
+#
+source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh"
+
+INPUT_DIR=$SAMSA/input_files
+OUT_DIR=$SAMSA
+
+# number of threads
+threads=`getconf _NPROCESSORS_ONLN`
+
+STEP_1="$OUT_DIR/step_1_output"
+STEP_2="$OUT_DIR/step_2_output"
+STEP_3="$OUT_DIR/step_3_output"
+STEP_4="$OUT_DIR/step_4_output"
+STEP_5="$OUT_DIR/step_5_output"
+
+if [[ -n "$USE_TINY" ]]; then
+  # Diamond databases
+  diamond_database="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB"
+  diamond_subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB"
+  # Aggregation databases
+  RefSeq_db="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB.fa"
+  Subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB.fa"
+  # Use test output directories
+  STEP_1="${STEP_1}_test"
+  STEP_2="${STEP_2}_test"
+  STEP_3="${STEP_3}_test"
+  STEP_4="${STEP_4}_test"
+  STEP_5="${STEP_5}_test"
+else
+  # Diamond databases
+  diamond_database="$SAMSA/full_databases/RefSeq_bac"
+  diamond_subsys_db="$SAMSA/full_databases/subsys_db"
+  # Aggregation databases
+  RefSeq_db="$SAMSA/full_databases/RefSeq_bac.fa"
+  Subsys_db="$SAMSA/full_databases/subsys_db.fa"
+fi
+
+####################################################################
+#STEP 0.1: create/read checkpoint
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n"
+if [ ! -f "$INPUT_DIR/checkpoints" ]
+  then
+    printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n"
+    touch "$INPUT_DIR/checkpoints"
+else
+    printf "\tThe file 'checkpoints' already exists in the input directory.\n"
+fi
+
+####################################################################
+#
+# STEP 1: CLEANING FILES WITH TRIMMOMATIC
+Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "TRIMMO" ]
+  then
+
+if ls $INPUT_DIR/*.gz &>/dev/null; then
+  for file in $INPUT_DIR/*.gz
+  do
+    gunzip $file
+  done
+fi
+
+$MKDIR $STEP_1
+paired=false
+for f in $INPUT_DIR/*R1*q
+do
+    f2=`echo $f | awk -F "R1" '{print $1 "R2" $2}'`
+    out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'`
+    if [ -f $f2 ]; then
+      paired=true
+      checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \
+        $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \
+        SLIDINGWINDOW:4:15 MINLEN:70
+    else
+      checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70
+    fi
+done
+
+if $paired; then
+  mv $INPUT_DIR/*".cleaned.forward"* $STEP_1
+  mv $INPUT_DIR/*".cleaned.reverse"* $STEP_1
+else
+  mv $INPUT_DIR/*".cleaned" $STEP_1
+fi
+printf "TRIMMO\n" >>pipeline/checkpoints
+
+else
+  printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 2: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#       Example: control_1.R1.fastq
+#                control_1.R2.fastq
+
+Step=$(grep "MERGING" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "MERGING" ]
+  then
+
+$MKDIR $STEP_2
+if $paired; then
+  for file in $STEP_1/*.cleaned.forward
+  do
+    f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'`
+    shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'`
+    checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/}
+  done
+else
+  for file in $STEP_1/*.cleaned
+  do
+    new_name=`echo $file | awk -F "cleaned" '{print $1 "merged.assembled.fastq"}'`
+    cp $file $STEP_2/${new_name##*/}
+  done
+fi
+
+###I will concatenate forward and reverse complement with 20Ns in the middle.
+$R_programs/combining_umerged.R $STEP_1
+
+printf "MERGING\n" >>pipeline/checkpoints
+mv $STEP_1/*assembled2.fastq $STEP_2/
+
+else
+  printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 2.9: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RAW" ]
+  then
+
+if [[ -f $STEP_2/raw_counts.txt ]]; then
+    rm $STEP_2/raw_counts.txt
+fi
+touch $STEP_2/raw_counts.txt
+
+if $paired; then
+  for file in $STEP_1/*cleaned.forward
+  do
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+  done
+else
+  for file in $STEP_1/*cleaned
+  do
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+  done
+fi
+
+printf "RAW\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+Step=$(grep "RIBO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RIBO" ]
+  then
+
+for file in $STEP_2/*.assembled2.fastq
+do
+  shortname=`echo $file | awk -F "assembled2" '{print $1 "ribodepleted"}'`
+  checked $SORTMERNA -a $threads \
+    --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \
+    --reads $file --aligned $file.ribosomes --other $shortname --fastx \
+    --log -v
+done
+
+$MKDIR $STEP_3
+mv $STEP_2/*ribodepleted* $STEP_3
+
+printf "RIBO\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_ANNOT" ]
+  then
+
+echo "Now starting on DIAMOND org annotations at: "; date
+
+for file in $STEP_3/*ribodepleted.fastq
+do
+    shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+    echo "Now starting on " $file
+    echo "Converting to " $shortname
+    checked $DIAMOND blastx --db $diamond_database -q $file -a $file.RefSeq -t ./ -k 1
+    checked $DIAMOND view --daa $file.RefSeq.daa -o $shortname -f tab
+done
+
+$MKDIR $STEP_4/daa_binary_files
+
+mv $STEP_3/*annotated* $STEP_4
+mv $STEP_3/*.daa $STEP_4/daa_binary_files
+
+echo "RefSeq DIAMOND annotations completed at: "; date
+
+printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 5: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_AGGREG" ]
+  then
+
+for file in $STEP_4/*RefSeq_annotated
+do
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+done
+
+$MKDIR $STEP_5/RefSeq_results/org_results
+$MKDIR $STEP_5/RefSeq_results/func_results
+mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results
+mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results
+
+printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n"
+fi
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_ANNOT" ]
+  then
+
+echo "Now starting on DIAMOND Subsystems annotations at: "; date
+
+for file in $STEP_3/*ribodepleted.fastq
+do
+    shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+    echo "Now starting on Subsystems annotations for " $file
+    checked $DIAMOND blastx --db $diamond_subsys_db -q $file -a $file.Subsys -t ./ -k 1
+    checked $DIAMOND view --daa $file.Subsys.daa -o $shortname -f tab
+done
+
+mv $STEP_3/*subsys_annotated* $STEP_4
+mv $STEP_3/*.daa $STEP_4/daa_binary_files
+
+echo "DIAMOND Subsystems annotations completed at: "; date
+
+printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n"
+fi
+
+##################################################################
+#
+# STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_AGGREG" ]
+  then
+
+for file in $STEP_4/*subsys_annotated
+do
+  checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \
+    -D $Subsys_db -O $file.hierarchy -P $file.receipt
+
+  # This quick program reduces down identical hierarchy annotations
+  checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy
+done
+
+$MKDIR $STEP_5/Subsystems_results/receipts
+mv $STEP_4/*.reduced $STEP_5/Subsystems_results
+mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts
+rm $STEP_4/*.hierarchy
+
+printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n"
+fi
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "R_ANALYSIS" ]
+  then
+
+checked Rscript $R_DIR/run_DESeq_stats.R \
+  -I $STEP_5/RefSeq_results/org_results \
+  -O RefSeq_org_DESeq_results.tab \
+  -R $STEP_2/raw_counts.txt
+checked Rscript $R_DIR/run_DESeq_stats.R \
+  -I $STEP_5/RefSeq_results/func_results \
+  -O RefSeq_func_DESeq_results.tab \
+  -R $STEP_2/raw_counts.txt
+checked Rscript $R_DIR/Subsystems_DESeq_stats.R \
+  -I $STEP_5/Subsystems_results \
+  -O Subsystems_level-1_DESeq_results.tab -L 1 \
+  -R $STEP_2/raw_counts.txt
+
+printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n"
+fi
+
+echo "Master bash script finished running at: "; date
+exit
+####################################################################