From 455445b88f90d298fe34b3201a9182f6ad93d700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 07:56:08 -0800 Subject: [PATCH 01/11] new improved master script --- README.md | 22 ++ R_scripts/combining_umerged.R | 61 +++++ bash_scripts/master_samsa2.slurm | 431 +++++++++++++++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100755 R_scripts/combining_umerged.R create mode 100755 bash_scripts/master_samsa2.slurm diff --git a/README.md b/README.md index 2ae3d5f..189ac85 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,27 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +### modification by sebastien.renaut@gmail.com + +### New in my version. +There is a new master script (master_samsa2.slurm) with several modifications: + +* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) +* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint. +* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. +* Each step contains an exit statement to be printed if the master script dies. +* Trimommatic removes adapter contamination according to a specific file +* All options are to be specified in the first section of the script +* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. + + +#### To do: +* reverse the order of the merging and trimming step. +* simplify some syntax / make sure all paths are relative... + + + + Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R new file mode 100755 index 0000000..c04f28c --- /dev/null +++ b/R_scripts/combining_umerged.R @@ -0,0 +1,61 @@ +#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript + +args = commandArgs(TRUE) +start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. +#start="/Users/jerry/Documents/CSBQ/shapiro/results" + +setwd(start) + +###files to work with +system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) +system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) +system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) + +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) + + +for(i in 1:nrow(files_f)) + { +#grep 1st and 2rd lines + unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") + system(unassembled.fastq) + +#grep sequences forward... + unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") + system(unassembled.forward.fastq) + +#grep sequences reverse... + unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") + system(unassembled.reverse.fastq) + +#N and E quality file + system("wc -l unassembled.seq.reverse.fastq >wc") + wc = read.table("wc") + + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) + +#paste the unassembled sequences into a single file + system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") + +#put everything back into a single fastq + back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") + system(back) + +#add the merged sequences + all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") + system(all) + +#remove the clutter (file specific) + remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") + system(remove) +} + +#remove the clutter (listing of all files) +system("rm assembled.files umerged.forward.files umerged.reverse.files") + + + + + diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm new file mode 100755 index 0000000..cc60bb3 --- /dev/null +++ b/bash_scripts/master_samsa2.slurm @@ -0,0 +1,431 @@ +#!/bin/bash + +#SBATCH --time=0-23:59 +#SBATCH --mem=100000 +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=48 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND (by default against the RefSeq database) +# 5. Aggregation using analysis_counter.py +# 4.1 Annotation using DIAMOND against the Subsystems database +# 5.1 Aggregation using Subsystems-specific analysis counter.py +# 6. Running R scripts to get DESeq statistical analysis. +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 + +# 0.5 Sequences location: +sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/bin + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2/python_scripts +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2/R_scripts + +# threads +threads="48" + +################## +#Step0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.1: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/hiseq_raw/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/hiseq_unzip/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + #FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between. + + combining_umerged.R $starting_location + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/hiseq_unzip/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### + From 5d90e0e4392d8210f95b828fa9c30121b9afb3cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 08:48:36 -0800 Subject: [PATCH 02/11] improved syntax --- bash_scripts/master_samsa2.slurm | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index cc60bb3..36b8297 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -25,14 +25,16 @@ module load r/3.5.0 # step is set up below. # # The steps are: -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND (by default against the RefSeq database) -# 5. Aggregation using analysis_counter.py -# 4.1 Annotation using DIAMOND against the Subsystems database -# 5.1 Aggregation using Subsystems-specific analysis counter.py -# 6. Running R scripts to get DESeq statistical analysis. +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) # # NOTE: BEFORE running this script, please run package_installation.bash # and full_database_download.bash located at: @@ -48,14 +50,14 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba # 0. Set starting location: starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 -# 0.5 Sequences location: +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 # 1. PEAR pear_location=/home/renaut/bin # 2. Trimmomatic -trimmomatic_location=/home/renaut/bin +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 # 3. SortMeRNA sortmerna_location=/home/renaut/bin @@ -66,19 +68,19 @@ diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" diamond_location=/home/renaut/bin # 5. Aggregation -python_programs=/home/renaut/samsa2/python_scripts +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" # 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2/R_scripts +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts # threads threads="48" ################## -#Step0.1: create/read checkpoint +#STEP 0.1: create/read checkpoint cd $starting_location printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" @@ -92,7 +94,7 @@ fi #################################################################### # -#STEP 0.1: GUNZIP PROCESS. +#STEP 0.2: GUNZIP PROCESS. # # Step=$(grep "GUNZIP" pipeline/checkpoints) @@ -109,6 +111,8 @@ if [ "${Step}" != "GUNZIP" ] exit 1 fi done + + mkdir $sequence_location/hiseq_unzip mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. printf "GUNZIP\n" >>pipeline/checkpoints echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" @@ -142,7 +146,6 @@ if [ "${Step}" != "MERGING" ] out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` out_name=`echo ${out_path##*/}` $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - #FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED if [ $? -ne 0 ] then @@ -150,9 +153,10 @@ if [ "${Step}" != "MERGING" ] exit 1 fi done - #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between. - - combining_umerged.R $starting_location + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + printf "MERGING\n" >>pipeline/checkpoints mkdir $starting_location/step_1_merging/ mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ @@ -181,7 +185,7 @@ if [ "${Step}" != "TRIMMING" ] do shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log if [ $? -ne 0 ] then printf "\t!!! There is a problem in the trimmmomatic step" From 33adea99d76ff7a05c8a8f1b3e244475e9f94df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 12:16:07 -0800 Subject: [PATCH 03/11] sortmerna options --- README.md | 4 ++-- bash_scripts/master_samsa2.slurm | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 189ac85..59417b5 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,10 @@ There is a new master script (master_samsa2.slurm) with several modifications: * Trimommatic removes adapter contamination according to a specific file * All options are to be specified in the first section of the script * The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. - +* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways... #### To do: -* reverse the order of the merging and trimming step. +* reverse the order of the merging and trimming step (but again, maybe not...) * simplify some syntax / make sure all paths are relative... diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index 36b8297..6fa3c24 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -254,7 +254,7 @@ if [ "${Step}" != "RIBODEPLETION" ] for file in $starting_location/step_2_trimming/*cleaned.fastq do shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log if [ $? -ne 0 ] then printf "\t!!! There is a problem in the sortmerna step" From 58513fc8e1403670d54762ea086322a84ee6eccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 3 Dec 2018 10:01:23 -0800 Subject: [PATCH 04/11] improve flow --- README.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 59417b5..72c0ce0 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,21 @@ ### modification by sebastien.renaut@gmail.com ### New in my version. -There is a new master script (master_samsa2.slurm) with several modifications: +This is a new master script (master_samsa2.slurm) with several modifications: * It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint. +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. * In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. -* Each step contains an exit statement to be printed if the master script dies. -* Trimommatic removes adapter contamination according to a specific file -* All options are to be specified in the first section of the script -* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways... +* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. +* Trimmomatic removes adapter contamination according to a specific fasta file. +* All options, read & program location are to be specified in the first section of the script. +* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. +* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... #### To do: * reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative... - - +* simplify some syntax / make sure all paths are relative (mostly done) Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! From 79e66eede3f488cbc6272c34e10d7fa199e46553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:01:57 -0800 Subject: [PATCH 05/11] minor --- .gitignore | 5 +++++ bash_scripts/master_samsa2.slurm | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 9e8b5a7..c11349b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# my files +bash_scripts/master_samsa2.slurm +bash_scripts/cyanotoxin_master_samsa2.slurm +R_scripts/combining_umerged.R + # Swap files *.swp *.swo diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index 6fa3c24..a16d11b 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --time=0-23:59 -#SBATCH --mem=100000 +#SBATCH --mem=100G #SBATCH --account=def-bruneaua #SBATCH --nodes=1 #SBATCH --ntasks-per-node=48 @@ -48,10 +48,10 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba # VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download # # 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 +starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw -sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 # 1. PEAR pear_location=/home/renaut/bin @@ -100,7 +100,7 @@ fi Step=$(grep "GUNZIP" pipeline/checkpoints) if [ "${Step}" != "GUNZIP" ] then - for file in $sequence_location/hiseq_raw/*gz + for file in $sequence_location/zipped/*gz do file_out=`echo $file | awk -F ".gz" '{print $1}'` gunzip $file -cd >$file_out @@ -112,8 +112,8 @@ if [ "${Step}" != "GUNZIP" ] fi done - mkdir $sequence_location/hiseq_unzip - mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. printf "GUNZIP\n" >>pipeline/checkpoints echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" date @@ -139,7 +139,7 @@ if [ "${Step}" != "MERGING" ] then mkdir $starting_location/step_1_merging touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/hiseq_unzip/*_R1.fastq + for file in $sequence_location/unzipped/*_R1.fastq do file1=$file file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` @@ -162,7 +162,7 @@ if [ "${Step}" != "MERGING" ] mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ rm $starting_location/*fastq - rm $sequence_location/hiseq_unzip/*fastq + rm $sequence_location/unzipped/*fastq echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" date From fb2fc99f0e4f369fa783f476b2322f5caad6422b Mon Sep 17 00:00:00 2001 From: Sebastien Renaut Date: Mon, 18 Feb 2019 14:03:14 -0500 Subject: [PATCH 06/11] remove from main --- bash_scripts/master_samsa2.slurm | 435 ------------------------------- 1 file changed, 435 deletions(-) delete mode 100755 bash_scripts/master_samsa2.slurm diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm deleted file mode 100755 index a16d11b..0000000 --- a/bash_scripts/master_samsa2.slurm +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-23:59 -#SBATCH --mem=100G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=48 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="48" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - From f68ba12b7bd1a88d9830c21e1c012cc2b6eec3a1 Mon Sep 17 00:00:00 2001 From: Sebastien Renaut Date: Mon, 18 Feb 2019 14:07:22 -0500 Subject: [PATCH 07/11] delete from main branch --- R_scripts/combining_umerged.R | 61 ----------------------------------- 1 file changed, 61 deletions(-) delete mode 100755 R_scripts/combining_umerged.R diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R deleted file mode 100755 index c04f28c..0000000 --- a/R_scripts/combining_umerged.R +++ /dev/null @@ -1,61 +0,0 @@ -#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript - -args = commandArgs(TRUE) -start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. -#start="/Users/jerry/Documents/CSBQ/shapiro/results" - -setwd(start) - -###files to work with -system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) -system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) -system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) - -files_f = read.table("umerged.forward.files",stringsAsFactors = F) -files_r = read.table("umerged.reverse.files",stringsAsFactors = F) -files_a = read.table("assembled.files",stringsAsFactors = F) - - -for(i in 1:nrow(files_f)) - { -#grep 1st and 2rd lines - unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") - system(unassembled.fastq) - -#grep sequences forward... - unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") - system(unassembled.forward.fastq) - -#grep sequences reverse... - unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") - system(unassembled.reverse.fastq) - -#N and E quality file - system("wc -l unassembled.seq.reverse.fastq >wc") - wc = read.table("wc") - - write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) - -#paste the unassembled sequences into a single file - system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") - -#put everything back into a single fastq - back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") - system(back) - -#add the merged sequences - all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") - system(all) - -#remove the clutter (file specific) - remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") - system(remove) -} - -#remove the clutter (listing of all files) -system("rm assembled.files umerged.forward.files umerged.reverse.files") - - - - - From ceba37eea48f45f4bcc7171066d2c43af7aa2a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:13:35 -0800 Subject: [PATCH 08/11] add a checkpoint file --- README.md | 20 -------------------- bash_scripts/master_script.sh | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 72c0ce0..2ae3d5f 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,5 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline -### modification by sebastien.renaut@gmail.com - -### New in my version. -This is a new master script (master_samsa2.slurm) with several modifications: - -* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. -* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` -* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. -* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. -* Trimmomatic removes adapter contamination according to a specific fasta file. -* All options, read & program location are to be specified in the first section of the script. -* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... - -#### To do: -* reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative (mostly done) - - Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index 5f8ff5e..cd6ac6c 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -72,8 +72,26 @@ else fi #################################################################### +#STEP 0.1: create/read checkpoint + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "$INPUT_DIR/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "$INPUT_DIR/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### + + # # STEP 1: CLEANING FILES WITH TRIMMOMATIC +Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "TRIMMO" ] + then + if ls $INPUT_DIR/*.gz &>/dev/null; then for file in $INPUT_DIR/*.gz @@ -105,6 +123,11 @@ else mv $INPUT_DIR/*".cleaned" $STEP_1 fi +else + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n" +fi + + #################################################################### # # STEP 2: MERGING OF PAIRED-END FILES USING PEAR From bf29e7888a2b08872837c5cbb50e923abd89d711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:26:35 -0800 Subject: [PATCH 09/11] checkpoint for all steps --- bash_scripts/master_script.sh | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index cd6ac6c..f6ef74e 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -134,6 +134,10 @@ fi # Note: paired-end files are usually named using R1 and R2 in the name. # Example: control_1.R1.fastq # control_1.R2.fastq +Step=$(grep "MERGING" $INPUT_DIR/checkpoints) +if [ "${Step}" != "MERGING" ] + then + $MKDIR $STEP_2 if $paired; then @@ -151,10 +155,19 @@ else done fi +printf "MERGING\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n" +fi + #################################################################### # # STEP 2.9: GETTING RAW SEQUENCES COUNTS # Note: These are used later for statistical analysis. +Step=$(grep "RAW" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RAW" ] + then if [[ -f $STEP_2/raw_counts.txt ]]; then rm $STEP_2/raw_counts.txt @@ -173,11 +186,20 @@ else done fi +printf "RAW\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + #################################################################### # # STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA # Note: this step assumes that the SortMeRNA databases are indexed. If not, # do that first (see the SortMeRNA user manual for details). +Step=$(grep "RIBO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RIBO" ] + then for file in $STEP_2/*.assembled.fastq do @@ -191,11 +213,20 @@ done $MKDIR $STEP_3 mv $STEP_2/*ribodepleted* $STEP_3 +printf "RIBO\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n" +fi + #################################################################### # # STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ # Note: this step assumes that the DIAMOND database is already built. If not, # do that first before running this step. +Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_ANNOT" ] + then echo "Now starting on DIAMOND org annotations at: "; date @@ -215,9 +246,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files echo "RefSeq DIAMOND annotations completed at: "; date +printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n" +fi + #################################################################### # # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_AGGREG" ] + then for file in $STEP_4/*RefSeq_annotated do @@ -230,9 +270,18 @@ $MKDIR $STEP_5/RefSeq_results/func_results mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results +printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n" +fi + #################################################################### # # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_ANNOT" ] + then echo "Now starting on DIAMOND Subsystems annotations at: "; date @@ -249,9 +298,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files echo "DIAMOND Subsystems annotations completed at: "; date +printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + ################################################################## # # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_AGGREG" ] + then for file in $STEP_4/*subsys_annotated do @@ -267,6 +325,12 @@ mv $STEP_4/*.reduced $STEP_5/Subsystems_results mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts rm $STEP_4/*.hierarchy +printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + ################################################################## # # At this point, all the results files are ready for analysis using R. @@ -278,6 +342,9 @@ rm $STEP_4/*.hierarchy # STEP 6: R ANALYSIS # Note: For R to properly identify files to compare/contrast, they must include # the appropriate prefix (either "control_$file" or experimental_$file")! +Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) +if [ "${Step}" != "R_ANALYSIS" ] + then checked Rscript $R_DIR/run_DESeq_stats.R \ -I $STEP_5/RefSeq_results/org_results \ @@ -292,6 +359,13 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ -O Subsystems_level-1_DESeq_results.tab -L 1 \ -R $STEP_2/raw_counts.txt +printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n" +fi + echo "Master bash script finished running at: "; date exit #################################################################### + From c7474db08cc89320ae587fc0c7def606508f2063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 4 Mar 2019 12:07:28 -0800 Subject: [PATCH 10/11] adding my own scripts to the forked samsa2 pipeline --- .gitignore | 6 +- README.md | 20 + R_scripts/combining_umerged.R | 61 +++ bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ++++++++++++++++++++ bash_scripts/master_samsa2.slurm | 435 +++++++++++++++++ 5 files changed, 1024 insertions(+), 3 deletions(-) create mode 100755 R_scripts/combining_umerged.R create mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm create mode 100755 bash_scripts/master_samsa2.slurm diff --git a/.gitignore b/.gitignore index c11349b..30b79f6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # my files -bash_scripts/master_samsa2.slurm -bash_scripts/cyanotoxin_master_samsa2.slurm -R_scripts/combining_umerged.R +#bash_scripts/master_samsa2.slurm +#bash_scripts/cyanotoxin_master_samsa2.slurm +#R_scripts/combining_umerged.R # Swap files *.swp diff --git a/README.md b/README.md index 2ae3d5f..72c0ce0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,25 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +### modification by sebastien.renaut@gmail.com + +### New in my version. +This is a new master script (master_samsa2.slurm) with several modifications: + +* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. +* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. +* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. +* Trimmomatic removes adapter contamination according to a specific fasta file. +* All options, read & program location are to be specified in the first section of the script. +* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. +* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... + +#### To do: +* reverse the order of the merging and trimming step (but again, maybe not...) +* simplify some syntax / make sure all paths are relative (mostly done) + + Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R new file mode 100755 index 0000000..c04f28c --- /dev/null +++ b/R_scripts/combining_umerged.R @@ -0,0 +1,61 @@ +#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript + +args = commandArgs(TRUE) +start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. +#start="/Users/jerry/Documents/CSBQ/shapiro/results" + +setwd(start) + +###files to work with +system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) +system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) +system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) + +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) + + +for(i in 1:nrow(files_f)) + { +#grep 1st and 2rd lines + unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") + system(unassembled.fastq) + +#grep sequences forward... + unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") + system(unassembled.forward.fastq) + +#grep sequences reverse... + unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") + system(unassembled.reverse.fastq) + +#N and E quality file + system("wc -l unassembled.seq.reverse.fastq >wc") + wc = read.table("wc") + + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) + +#paste the unassembled sequences into a single file + system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") + +#put everything back into a single fastq + back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") + system(back) + +#add the merged sequences + all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") + system(all) + +#remove the clutter (file specific) + remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") + system(remove) +} + +#remove the clutter (listing of all files) +system("rm assembled.files umerged.forward.files umerged.reverse.files") + + + + + diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm new file mode 100755 index 0000000..2e278da --- /dev/null +++ b/bash_scripts/cyanotoxin_master_samsa2.slurm @@ -0,0 +1,505 @@ +#!/bin/bash + +#SBATCH --time=0-07:59 +#SBATCH --mem=40G +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=12 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1 + +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts +cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2" +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts + +# threads +threads="12" + +################## +#STEP 0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.2: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/zipped/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/unzipped/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/unzipped/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_CYANO" ] + then + touch $starting_location/step_4_annotation/diamond_cyano_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log + $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond cyano step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO" + date + + printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + +#################################################################### +# +# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_CYANO" ] + then + for file in $starting_location/step_4_annotation/*cyano_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation CYANO step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/cyano_results/ + mkdir $starting_location/step_5_aggregation/cyano_results/org_results/ + mkdir $starting_location/step_5_aggregation/cyano_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/ + + echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n" +fi + + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### + diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm new file mode 100755 index 0000000..a16d11b --- /dev/null +++ b/bash_scripts/master_samsa2.slurm @@ -0,0 +1,435 @@ +#!/bin/bash + +#SBATCH --time=0-23:59 +#SBATCH --mem=100G +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=48 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 + +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts + +# threads +threads="48" + +################## +#STEP 0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.2: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/zipped/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/unzipped/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/unzipped/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### + From 0efac2564fe50ecce42ceca3abd14215b92935f4 Mon Sep 17 00:00:00 2001 From: Sam Westreich Date: Tue, 12 Mar 2019 12:49:30 -0700 Subject: [PATCH 11/11] Incorporating and streamlining checkpoint update --- README.md | 22 +- R_scripts/combining_umerged.R | 24 +- bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ------------------ bash_scripts/master_samsa2.slurm | 435 --------------- bash_scripts/master_script.sh | 98 ++-- .../master_script_preserving_unmerged.sh | 374 +++++++++++++ 6 files changed, 441 insertions(+), 1017 deletions(-) delete mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm delete mode 100755 bash_scripts/master_samsa2.slurm create mode 100644 bash_scripts/master_script_preserving_unmerged.sh diff --git a/README.md b/README.md index 00978e8..9110279 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,20 @@ -# SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +# SAMSA2 - A complete metatranscriptome analysis pipeline -### modification by sebastien.renaut@gmail.com +***** -### New in my version. -This is a new master script (master_samsa2.slurm) with several modifications: +**Version 2.2.0 - Yesod - Modifications added by sebastien.renaut@gmail.com:** -* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. -* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Multithreading added for PEAR, Trimmomatic, SortMeRNA +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint` and that step is skipped on a rerun of the master_script. This is done to avoid re-running CPU-intensive steps if unnecessary. +* A new version of the master script now exists, called "master\_script\_preserving\_unmerged.sh". In this script, in the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. * Each step contains an exit statement to be printed if the master script dies due to an unforseen error. * Trimmomatic removes adapter contamination according to a specific fasta file. * All options, read & program location are to be specified in the first section of the script. * The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... - -#### To do: -* reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative (mostly done) +* The flag --num_alignments 0 in the ribosomal `sortmrna` step has been removed. This caused problems and slowed things down a lot. Plus, we don't care about the rRNA alignments - whether a sequence aligns to 1 or 1,000 rRNA, it's out anyways... +***** Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! @@ -32,7 +28,7 @@ Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting ### Dependencies -SAMSA2 requires Python2 for aggregation scripts. Currently, this pipeline does not work with Python3, although an update is planned. +SAMSA2 requires Python2 for aggregation scripts. Currently, this pipeline works mostly with Python3, although there may be some errors not yet caught. The following programs can be downloaded OR can be installed from the binaries provided in the programs/ folder. diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R index c04f28c..e4aad70 100755 --- a/R_scripts/combining_umerged.R +++ b/R_scripts/combining_umerged.R @@ -1,24 +1,23 @@ #!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript args = commandArgs(TRUE) -start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. -#start="/Users/jerry/Documents/CSBQ/shapiro/results" +start = args[1] # Specify which sequences in "list_ind" file you want to align, directly from the shell. Alternatively, you can do this from the "alignments" function itself. setwd(start) -###files to work with +###files to work with system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) -files_f = read.table("umerged.forward.files",stringsAsFactors = F) -files_r = read.table("umerged.reverse.files",stringsAsFactors = F) -files_a = read.table("assembled.files",stringsAsFactors = F) +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) -for(i in 1:nrow(files_f)) +for(i in 1:nrow(files_f)) { -#grep 1st and 2rd lines +#grep 1st and 2rd lines unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") system(unassembled.fastq) @@ -33,9 +32,9 @@ for(i in 1:nrow(files_f)) #N and E quality file system("wc -l unassembled.seq.reverse.fastq >wc") wc = read.table("wc") - + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) - + #paste the unassembled sequences into a single file system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") @@ -54,8 +53,3 @@ for(i in 1:nrow(files_f)) #remove the clutter (listing of all files) system("rm assembled.files umerged.forward.files umerged.reverse.files") - - - - - diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm deleted file mode 100755 index 2e278da..0000000 --- a/bash_scripts/cyanotoxin_master_samsa2.slurm +++ /dev/null @@ -1,505 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-07:59 -#SBATCH --mem=40G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=12 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2" -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="12" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_CYANO" ] - then - touch $starting_location/step_4_annotation/diamond_cyano_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log - $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond cyano step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO" - date - - printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - -#################################################################### -# -# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_CYANO" ] - then - for file in $starting_location/step_4_annotation/*cyano_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation CYANO step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/cyano_results/ - mkdir $starting_location/step_5_aggregation/cyano_results/org_results/ - mkdir $starting_location/step_5_aggregation/cyano_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/ - - echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n" -fi - - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm deleted file mode 100755 index a16d11b..0000000 --- a/bash_scripts/master_samsa2.slurm +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-23:59 -#SBATCH --mem=100G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=48 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="48" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index f6ef74e..25092f1 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -9,7 +9,7 @@ # # master_script.sh # Created April 2017 by Sam Westreich, github.com/transcript -# This version modified February 21, 2018 +# This version modified March 12, 2019 # #################################################################### # @@ -43,6 +43,9 @@ source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh" INPUT_DIR=$SAMSA/input_files OUT_DIR=$SAMSA +# number of threads +threads=`getconf _NPROCESSORS_ONLN` + STEP_1="$OUT_DIR/step_1_output" STEP_2="$OUT_DIR/step_2_output" STEP_3="$OUT_DIR/step_3_output" @@ -74,30 +77,27 @@ fi #################################################################### #STEP 0.1: create/read checkpoint -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n" if [ ! -f "$INPUT_DIR/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "$INPUT_DIR/checkpoints" + then + printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n" + touch "$INPUT_DIR/checkpoints" else - printf "\tThe file checkpoints already exist\n" + printf "\tThe file 'checkpoints' already exists in the input directory.\n" fi #################################################################### - - # # STEP 1: CLEANING FILES WITH TRIMMOMATIC Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) if [ "${Step}" != "TRIMMO" ] - then - + then if ls $INPUT_DIR/*.gz &>/dev/null; then for file in $INPUT_DIR/*.gz do - gunzip $file - done + gunzip $file + done fi $MKDIR $STEP_1 @@ -108,11 +108,11 @@ do out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'` if [ -f $f2 ]; then paired=true - checked java -jar $TRIMMOMATIC PE -phred33 $f $f2 \ + checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \ $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \ SLIDINGWINDOW:4:15 MINLEN:70 else - checked java -jar $TRIMMOMATIC SE -phred33 $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 + checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 fi done @@ -122,9 +122,10 @@ if $paired; then else mv $INPUT_DIR/*".cleaned" $STEP_1 fi +printf "TRIMMO\n" >>pipeline/checkpoints else - printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n" + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n" fi @@ -134,18 +135,18 @@ fi # Note: paired-end files are usually named using R1 and R2 in the name. # Example: control_1.R1.fastq # control_1.R2.fastq + Step=$(grep "MERGING" $INPUT_DIR/checkpoints) if [ "${Step}" != "MERGING" ] - then - + then $MKDIR $STEP_2 if $paired; then for file in $STEP_1/*.cleaned.forward do - f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` - shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` - checked $PEAR -f $file -r $f2 -o $STEP_2/${shortname##*/} + f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` + shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` + checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/} done else for file in $STEP_1/*.cleaned @@ -158,7 +159,7 @@ fi printf "MERGING\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n" + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n" fi #################################################################### @@ -167,7 +168,7 @@ fi # Note: These are used later for statistical analysis. Step=$(grep "RAW" $INPUT_DIR/checkpoints) if [ "${Step}" != "RAW" ] - then + then if [[ -f $STEP_2/raw_counts.txt ]]; then rm $STEP_2/raw_counts.txt @@ -177,19 +178,19 @@ touch $STEP_2/raw_counts.txt if $paired; then for file in $STEP_1/*cleaned.forward do - checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt done else for file in $STEP_1/*cleaned do - checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt done fi printf "RAW\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n" fi #################################################################### @@ -199,15 +200,15 @@ fi # do that first (see the SortMeRNA user manual for details). Step=$(grep "RIBO" $INPUT_DIR/checkpoints) if [ "${Step}" != "RIBO" ] - then + then for file in $STEP_2/*.assembled.fastq do - shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'` - checked $SORTMERNA \ - --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ - --reads $file --aligned $file.ribosomes --other $shortname --fastx \ - --num_alignments 0 --log -v + shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'` + checked $SORTMERNA -a $threads \ + --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ + --reads $file --aligned $file.ribosomes --other $shortname --fastx \ + --log -v done $MKDIR $STEP_3 @@ -216,7 +217,7 @@ mv $STEP_2/*ribodepleted* $STEP_3 printf "RIBO\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n" + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n" fi #################################################################### @@ -226,7 +227,7 @@ fi # do that first before running this step. Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) if [ "${Step}" != "REFSEQ_ANNOT" ] - then + then echo "Now starting on DIAMOND org annotations at: "; date @@ -249,7 +250,7 @@ echo "RefSeq DIAMOND annotations completed at: "; date printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n" + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n" fi #################################################################### @@ -257,12 +258,12 @@ fi # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) if [ "${Step}" != "REFSEQ_AGGREG" ] - then + then for file in $STEP_4/*RefSeq_annotated do - checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F done $MKDIR $STEP_5/RefSeq_results/org_results @@ -273,7 +274,7 @@ mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n" + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n" fi #################################################################### @@ -281,7 +282,7 @@ fi # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) if [ "${Step}" != "SUBSYS_ANNOT" ] - then + then echo "Now starting on DIAMOND Subsystems annotations at: "; date @@ -301,7 +302,7 @@ echo "DIAMOND Subsystems annotations completed at: "; date printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n" + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n" fi ################################################################## @@ -309,15 +310,15 @@ fi # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) if [ "${Step}" != "SUBSYS_AGGREG" ] - then + then for file in $STEP_4/*subsys_annotated do - checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ - -D $Subsys_db -O $file.hierarchy -P $file.receipt + checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ + -D $Subsys_db -O $file.hierarchy -P $file.receipt - # This quick program reduces down identical hierarchy annotations - checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy + # This quick program reduces down identical hierarchy annotations + checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy done $MKDIR $STEP_5/Subsystems_results/receipts @@ -328,7 +329,7 @@ rm $STEP_4/*.hierarchy printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n" + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n" fi ################################################################## @@ -344,7 +345,7 @@ fi # the appropriate prefix (either "control_$file" or experimental_$file")! Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) if [ "${Step}" != "R_ANALYSIS" ] - then + then checked Rscript $R_DIR/run_DESeq_stats.R \ -I $STEP_5/RefSeq_results/org_results \ @@ -362,10 +363,9 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n" + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n" fi echo "Master bash script finished running at: "; date exit #################################################################### - diff --git a/bash_scripts/master_script_preserving_unmerged.sh b/bash_scripts/master_script_preserving_unmerged.sh new file mode 100644 index 0000000..be495b6 --- /dev/null +++ b/bash_scripts/master_script_preserving_unmerged.sh @@ -0,0 +1,374 @@ +#!/bin/bash +#SBATCH --mem=100000 +#SBATCH --time=7-0:0:0 + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM + +#################################################################### +# +# master_script.sh +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified March 12, 2019 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 1. Read cleaning with Trimmomatic +# 2. Merging with PEAR, if applicable +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND (by default against the RefSeq database) +# 5. Aggregation using analysis_counter.py +# 4.1 Annotation using DIAMOND against the Subsystems database +# 5.1 Aggregation using Subsystems-specific analysis counter.py +# 6. Running R scripts to get DESeq statistical analysis. +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +####################################################################### +# +echo -e "NOTE: Before running this script, please run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - set starting location and starting files location pathways +# +source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh" + +INPUT_DIR=$SAMSA/input_files +OUT_DIR=$SAMSA + +# number of threads +threads=`getconf _NPROCESSORS_ONLN` + +STEP_1="$OUT_DIR/step_1_output" +STEP_2="$OUT_DIR/step_2_output" +STEP_3="$OUT_DIR/step_3_output" +STEP_4="$OUT_DIR/step_4_output" +STEP_5="$OUT_DIR/step_5_output" + +if [[ -n "$USE_TINY" ]]; then + # Diamond databases + diamond_database="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB" + diamond_subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB" + # Aggregation databases + RefSeq_db="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB.fa" + Subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB.fa" + # Use test output directories + STEP_1="${STEP_1}_test" + STEP_2="${STEP_2}_test" + STEP_3="${STEP_3}_test" + STEP_4="${STEP_4}_test" + STEP_5="${STEP_5}_test" +else + # Diamond databases + diamond_database="$SAMSA/full_databases/RefSeq_bac" + diamond_subsys_db="$SAMSA/full_databases/subsys_db" + # Aggregation databases + RefSeq_db="$SAMSA/full_databases/RefSeq_bac.fa" + Subsys_db="$SAMSA/full_databases/subsys_db.fa" +fi + +#################################################################### +#STEP 0.1: create/read checkpoint + +printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n" +if [ ! -f "$INPUT_DIR/checkpoints" ] + then + printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n" + touch "$INPUT_DIR/checkpoints" +else + printf "\tThe file 'checkpoints' already exists in the input directory.\n" +fi + +#################################################################### +# +# STEP 1: CLEANING FILES WITH TRIMMOMATIC +Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "TRIMMO" ] + then + +if ls $INPUT_DIR/*.gz &>/dev/null; then + for file in $INPUT_DIR/*.gz + do + gunzip $file + done +fi + +$MKDIR $STEP_1 +paired=false +for f in $INPUT_DIR/*R1*q +do + f2=`echo $f | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'` + if [ -f $f2 ]; then + paired=true + checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \ + $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \ + SLIDINGWINDOW:4:15 MINLEN:70 + else + checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 + fi +done + +if $paired; then + mv $INPUT_DIR/*".cleaned.forward"* $STEP_1 + mv $INPUT_DIR/*".cleaned.reverse"* $STEP_1 +else + mv $INPUT_DIR/*".cleaned" $STEP_1 +fi +printf "TRIMMO\n" >>pipeline/checkpoints + +else + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n" +fi + +#################################################################### +# +# STEP 2: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq + +Step=$(grep "MERGING" $INPUT_DIR/checkpoints) +if [ "${Step}" != "MERGING" ] + then + +$MKDIR $STEP_2 +if $paired; then + for file in $STEP_1/*.cleaned.forward + do + f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` + shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` + checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/} + done +else + for file in $STEP_1/*.cleaned + do + new_name=`echo $file | awk -F "cleaned" '{print $1 "merged.assembled.fastq"}'` + cp $file $STEP_2/${new_name##*/} + done +fi + +###I will concatenate forward and reverse complement with 20Ns in the middle. +$R_programs/combining_umerged.R $STEP_1 + +printf "MERGING\n" >>pipeline/checkpoints +mv $STEP_1/*assembled2.fastq $STEP_2/ + +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n" +fi + +#################################################################### +# +# STEP 2.9: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RAW" ] + then + +if [[ -f $STEP_2/raw_counts.txt ]]; then + rm $STEP_2/raw_counts.txt +fi +touch $STEP_2/raw_counts.txt + +if $paired; then + for file in $STEP_1/*cleaned.forward + do + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + done +else + for file in $STEP_1/*cleaned + do + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + done +fi + +printf "RAW\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). +Step=$(grep "RIBO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RIBO" ] + then + +for file in $STEP_2/*.assembled2.fastq +do + shortname=`echo $file | awk -F "assembled2" '{print $1 "ribodepleted"}'` + checked $SORTMERNA -a $threads \ + --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ + --reads $file --aligned $file.ribosomes --other $shortname --fastx \ + --log -v +done + +$MKDIR $STEP_3 +mv $STEP_2/*ribodepleted* $STEP_3 + +printf "RIBO\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_ANNOT" ] + then + +echo "Now starting on DIAMOND org annotations at: "; date + +for file in $STEP_3/*ribodepleted.fastq +do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + echo "Now starting on " $file + echo "Converting to " $shortname + checked $DIAMOND blastx --db $diamond_database -q $file -a $file.RefSeq -t ./ -k 1 + checked $DIAMOND view --daa $file.RefSeq.daa -o $shortname -f tab +done + +$MKDIR $STEP_4/daa_binary_files + +mv $STEP_3/*annotated* $STEP_4 +mv $STEP_3/*.daa $STEP_4/daa_binary_files + +echo "RefSeq DIAMOND annotations completed at: "; date + +printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n" +fi + +#################################################################### +# +# STEP 5: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_AGGREG" ] + then + +for file in $STEP_4/*RefSeq_annotated +do + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F +done + +$MKDIR $STEP_5/RefSeq_results/org_results +$MKDIR $STEP_5/RefSeq_results/func_results +mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results +mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results + +printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n" +fi + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_ANNOT" ] + then + +echo "Now starting on DIAMOND Subsystems annotations at: "; date + +for file in $STEP_3/*ribodepleted.fastq +do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + echo "Now starting on Subsystems annotations for " $file + checked $DIAMOND blastx --db $diamond_subsys_db -q $file -a $file.Subsys -t ./ -k 1 + checked $DIAMOND view --daa $file.Subsys.daa -o $shortname -f tab +done + +mv $STEP_3/*subsys_annotated* $STEP_4 +mv $STEP_3/*.daa $STEP_4/daa_binary_files + +echo "DIAMOND Subsystems annotations completed at: "; date + +printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n" +fi + +################################################################## +# +# STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_AGGREG" ] + then + +for file in $STEP_4/*subsys_annotated +do + checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ + -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy +done + +$MKDIR $STEP_5/Subsystems_results/receipts +mv $STEP_4/*.reduced $STEP_5/Subsystems_results +mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts +rm $STEP_4/*.hierarchy + +printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n" +fi + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! +Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) +if [ "${Step}" != "R_ANALYSIS" ] + then + +checked Rscript $R_DIR/run_DESeq_stats.R \ + -I $STEP_5/RefSeq_results/org_results \ + -O RefSeq_org_DESeq_results.tab \ + -R $STEP_2/raw_counts.txt +checked Rscript $R_DIR/run_DESeq_stats.R \ + -I $STEP_5/RefSeq_results/func_results \ + -O RefSeq_func_DESeq_results.tab \ + -R $STEP_2/raw_counts.txt +checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ + -I $STEP_5/Subsystems_results \ + -O Subsystems_level-1_DESeq_results.tab -L 1 \ + -R $STEP_2/raw_counts.txt + +printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n" +fi + +echo "Master bash script finished running at: "; date +exit +####################################################################