From 0efac2564fe50ecce42ceca3abd14215b92935f4 Mon Sep 17 00:00:00 2001 From: Sam Westreich Date: Tue, 12 Mar 2019 12:49:30 -0700 Subject: [PATCH] Incorporating and streamlining checkpoint update --- README.md | 22 +- R_scripts/combining_umerged.R | 24 +- bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ------------------ bash_scripts/master_samsa2.slurm | 435 --------------- bash_scripts/master_script.sh | 98 ++-- .../master_script_preserving_unmerged.sh | 374 +++++++++++++ 6 files changed, 441 insertions(+), 1017 deletions(-) delete mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm delete mode 100755 bash_scripts/master_samsa2.slurm create mode 100644 bash_scripts/master_script_preserving_unmerged.sh diff --git a/README.md b/README.md index 00978e8..9110279 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,20 @@ -# SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +# SAMSA2 - A complete metatranscriptome analysis pipeline -### modification by sebastien.renaut@gmail.com +***** -### New in my version. -This is a new master script (master_samsa2.slurm) with several modifications: +**Version 2.2.0 - Yesod - Modifications added by sebastien.renaut@gmail.com:** -* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. -* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Multithreading added for PEAR, Trimmomatic, SortMeRNA +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint` and that step is skipped on a rerun of the master_script. This is done to avoid re-running CPU-intensive steps if unnecessary. +* A new version of the master script now exists, called "master\_script\_preserving\_unmerged.sh". In this script, in the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. * Each step contains an exit statement to be printed if the master script dies due to an unforseen error. * Trimmomatic removes adapter contamination according to a specific fasta file. * All options, read & program location are to be specified in the first section of the script. * The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... - -#### To do: -* reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative (mostly done) +* The flag --num_alignments 0 in the ribosomal `sortmrna` step has been removed. This caused problems and slowed things down a lot. Plus, we don't care about the rRNA alignments - whether a sequence aligns to 1 or 1,000 rRNA, it's out anyways... +***** Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! @@ -32,7 +28,7 @@ Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting ### Dependencies -SAMSA2 requires Python2 for aggregation scripts. Currently, this pipeline does not work with Python3, although an update is planned. +SAMSA2 requires Python2 for aggregation scripts. Currently, this pipeline works mostly with Python3, although there may be some errors not yet caught. The following programs can be downloaded OR can be installed from the binaries provided in the programs/ folder. diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R index c04f28c..e4aad70 100755 --- a/R_scripts/combining_umerged.R +++ b/R_scripts/combining_umerged.R @@ -1,24 +1,23 @@ #!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript args = commandArgs(TRUE) -start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. -#start="/Users/jerry/Documents/CSBQ/shapiro/results" +start = args[1] # Specify which sequences in "list_ind" file you want to align, directly from the shell. Alternatively, you can do this from the "alignments" function itself. setwd(start) -###files to work with +###files to work with system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) -files_f = read.table("umerged.forward.files",stringsAsFactors = F) -files_r = read.table("umerged.reverse.files",stringsAsFactors = F) -files_a = read.table("assembled.files",stringsAsFactors = F) +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) -for(i in 1:nrow(files_f)) +for(i in 1:nrow(files_f)) { -#grep 1st and 2rd lines +#grep 1st and 2rd lines unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") system(unassembled.fastq) @@ -33,9 +32,9 @@ for(i in 1:nrow(files_f)) #N and E quality file system("wc -l unassembled.seq.reverse.fastq >wc") wc = read.table("wc") - + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) - + #paste the unassembled sequences into a single file system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") @@ -54,8 +53,3 @@ for(i in 1:nrow(files_f)) #remove the clutter (listing of all files) system("rm assembled.files umerged.forward.files umerged.reverse.files") - - - - - diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm deleted file mode 100755 index 2e278da..0000000 --- a/bash_scripts/cyanotoxin_master_samsa2.slurm +++ /dev/null @@ -1,505 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-07:59 -#SBATCH --mem=40G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=12 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2" -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="12" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_CYANO" ] - then - touch $starting_location/step_4_annotation/diamond_cyano_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log - $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond cyano step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO" - date - - printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - -#################################################################### -# -# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_CYANO" ] - then - for file in $starting_location/step_4_annotation/*cyano_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation CYANO step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/cyano_results/ - mkdir $starting_location/step_5_aggregation/cyano_results/org_results/ - mkdir $starting_location/step_5_aggregation/cyano_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/ - - echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n" -fi - - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm deleted file mode 100755 index a16d11b..0000000 --- a/bash_scripts/master_samsa2.slurm +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-23:59 -#SBATCH --mem=100G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=48 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="48" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index f6ef74e..25092f1 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -9,7 +9,7 @@ # # master_script.sh # Created April 2017 by Sam Westreich, github.com/transcript -# This version modified February 21, 2018 +# This version modified March 12, 2019 # #################################################################### # @@ -43,6 +43,9 @@ source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh" INPUT_DIR=$SAMSA/input_files OUT_DIR=$SAMSA +# number of threads +threads=`getconf _NPROCESSORS_ONLN` + STEP_1="$OUT_DIR/step_1_output" STEP_2="$OUT_DIR/step_2_output" STEP_3="$OUT_DIR/step_3_output" @@ -74,30 +77,27 @@ fi #################################################################### #STEP 0.1: create/read checkpoint -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n" if [ ! -f "$INPUT_DIR/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "$INPUT_DIR/checkpoints" + then + printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n" + touch "$INPUT_DIR/checkpoints" else - printf "\tThe file checkpoints already exist\n" + printf "\tThe file 'checkpoints' already exists in the input directory.\n" fi #################################################################### - - # # STEP 1: CLEANING FILES WITH TRIMMOMATIC Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) if [ "${Step}" != "TRIMMO" ] - then - + then if ls $INPUT_DIR/*.gz &>/dev/null; then for file in $INPUT_DIR/*.gz do - gunzip $file - done + gunzip $file + done fi $MKDIR $STEP_1 @@ -108,11 +108,11 @@ do out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'` if [ -f $f2 ]; then paired=true - checked java -jar $TRIMMOMATIC PE -phred33 $f $f2 \ + checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \ $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \ SLIDINGWINDOW:4:15 MINLEN:70 else - checked java -jar $TRIMMOMATIC SE -phred33 $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 + checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 fi done @@ -122,9 +122,10 @@ if $paired; then else mv $INPUT_DIR/*".cleaned" $STEP_1 fi +printf "TRIMMO\n" >>pipeline/checkpoints else - printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n" + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n" fi @@ -134,18 +135,18 @@ fi # Note: paired-end files are usually named using R1 and R2 in the name. # Example: control_1.R1.fastq # control_1.R2.fastq + Step=$(grep "MERGING" $INPUT_DIR/checkpoints) if [ "${Step}" != "MERGING" ] - then - + then $MKDIR $STEP_2 if $paired; then for file in $STEP_1/*.cleaned.forward do - f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` - shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` - checked $PEAR -f $file -r $f2 -o $STEP_2/${shortname##*/} + f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` + shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` + checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/} done else for file in $STEP_1/*.cleaned @@ -158,7 +159,7 @@ fi printf "MERGING\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n" + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n" fi #################################################################### @@ -167,7 +168,7 @@ fi # Note: These are used later for statistical analysis. Step=$(grep "RAW" $INPUT_DIR/checkpoints) if [ "${Step}" != "RAW" ] - then + then if [[ -f $STEP_2/raw_counts.txt ]]; then rm $STEP_2/raw_counts.txt @@ -177,19 +178,19 @@ touch $STEP_2/raw_counts.txt if $paired; then for file in $STEP_1/*cleaned.forward do - checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt done else for file in $STEP_1/*cleaned do - checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt done fi printf "RAW\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n" fi #################################################################### @@ -199,15 +200,15 @@ fi # do that first (see the SortMeRNA user manual for details). Step=$(grep "RIBO" $INPUT_DIR/checkpoints) if [ "${Step}" != "RIBO" ] - then + then for file in $STEP_2/*.assembled.fastq do - shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'` - checked $SORTMERNA \ - --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ - --reads $file --aligned $file.ribosomes --other $shortname --fastx \ - --num_alignments 0 --log -v + shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'` + checked $SORTMERNA -a $threads \ + --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ + --reads $file --aligned $file.ribosomes --other $shortname --fastx \ + --log -v done $MKDIR $STEP_3 @@ -216,7 +217,7 @@ mv $STEP_2/*ribodepleted* $STEP_3 printf "RIBO\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n" + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n" fi #################################################################### @@ -226,7 +227,7 @@ fi # do that first before running this step. Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) if [ "${Step}" != "REFSEQ_ANNOT" ] - then + then echo "Now starting on DIAMOND org annotations at: "; date @@ -249,7 +250,7 @@ echo "RefSeq DIAMOND annotations completed at: "; date printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n" + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n" fi #################################################################### @@ -257,12 +258,12 @@ fi # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) if [ "${Step}" != "REFSEQ_AGGREG" ] - then + then for file in $STEP_4/*RefSeq_annotated do - checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F done $MKDIR $STEP_5/RefSeq_results/org_results @@ -273,7 +274,7 @@ mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n" + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n" fi #################################################################### @@ -281,7 +282,7 @@ fi # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) if [ "${Step}" != "SUBSYS_ANNOT" ] - then + then echo "Now starting on DIAMOND Subsystems annotations at: "; date @@ -301,7 +302,7 @@ echo "DIAMOND Subsystems annotations completed at: "; date printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n" + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n" fi ################################################################## @@ -309,15 +310,15 @@ fi # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) if [ "${Step}" != "SUBSYS_AGGREG" ] - then + then for file in $STEP_4/*subsys_annotated do - checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ - -D $Subsys_db -O $file.hierarchy -P $file.receipt + checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ + -D $Subsys_db -O $file.hierarchy -P $file.receipt - # This quick program reduces down identical hierarchy annotations - checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy + # This quick program reduces down identical hierarchy annotations + checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy done $MKDIR $STEP_5/Subsystems_results/receipts @@ -328,7 +329,7 @@ rm $STEP_4/*.hierarchy printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n" + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n" fi ################################################################## @@ -344,7 +345,7 @@ fi # the appropriate prefix (either "control_$file" or experimental_$file")! Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) if [ "${Step}" != "R_ANALYSIS" ] - then + then checked Rscript $R_DIR/run_DESeq_stats.R \ -I $STEP_5/RefSeq_results/org_results \ @@ -362,10 +363,9 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints else - printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n" + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n" fi echo "Master bash script finished running at: "; date exit #################################################################### - diff --git a/bash_scripts/master_script_preserving_unmerged.sh b/bash_scripts/master_script_preserving_unmerged.sh new file mode 100644 index 0000000..be495b6 --- /dev/null +++ b/bash_scripts/master_script_preserving_unmerged.sh @@ -0,0 +1,374 @@ +#!/bin/bash +#SBATCH --mem=100000 +#SBATCH --time=7-0:0:0 + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM + +#################################################################### +# +# master_script.sh +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified March 12, 2019 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 1. Read cleaning with Trimmomatic +# 2. Merging with PEAR, if applicable +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND (by default against the RefSeq database) +# 5. Aggregation using analysis_counter.py +# 4.1 Annotation using DIAMOND against the Subsystems database +# 5.1 Aggregation using Subsystems-specific analysis counter.py +# 6. Running R scripts to get DESeq statistical analysis. +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +####################################################################### +# +echo -e "NOTE: Before running this script, please run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - set starting location and starting files location pathways +# +source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh" + +INPUT_DIR=$SAMSA/input_files +OUT_DIR=$SAMSA + +# number of threads +threads=`getconf _NPROCESSORS_ONLN` + +STEP_1="$OUT_DIR/step_1_output" +STEP_2="$OUT_DIR/step_2_output" +STEP_3="$OUT_DIR/step_3_output" +STEP_4="$OUT_DIR/step_4_output" +STEP_5="$OUT_DIR/step_5_output" + +if [[ -n "$USE_TINY" ]]; then + # Diamond databases + diamond_database="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB" + diamond_subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB" + # Aggregation databases + RefSeq_db="$SAMSA/setup_and_test/tiny_databases/RefSeq_bac_TINY_24MB.fa" + Subsys_db="$SAMSA/setup_and_test/tiny_databases/subsys_db_TINY_24MB.fa" + # Use test output directories + STEP_1="${STEP_1}_test" + STEP_2="${STEP_2}_test" + STEP_3="${STEP_3}_test" + STEP_4="${STEP_4}_test" + STEP_5="${STEP_5}_test" +else + # Diamond databases + diamond_database="$SAMSA/full_databases/RefSeq_bac" + diamond_subsys_db="$SAMSA/full_databases/subsys_db" + # Aggregation databases + RefSeq_db="$SAMSA/full_databases/RefSeq_bac.fa" + Subsys_db="$SAMSA/full_databases/subsys_db.fa" +fi + +#################################################################### +#STEP 0.1: create/read checkpoint + +printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n" +if [ ! -f "$INPUT_DIR/checkpoints" ] + then + printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n" + touch "$INPUT_DIR/checkpoints" +else + printf "\tThe file 'checkpoints' already exists in the input directory.\n" +fi + +#################################################################### +# +# STEP 1: CLEANING FILES WITH TRIMMOMATIC +Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "TRIMMO" ] + then + +if ls $INPUT_DIR/*.gz &>/dev/null; then + for file in $INPUT_DIR/*.gz + do + gunzip $file + done +fi + +$MKDIR $STEP_1 +paired=false +for f in $INPUT_DIR/*R1*q +do + f2=`echo $f | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'` + if [ -f $f2 ]; then + paired=true + checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \ + $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \ + SLIDINGWINDOW:4:15 MINLEN:70 + else + checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70 + fi +done + +if $paired; then + mv $INPUT_DIR/*".cleaned.forward"* $STEP_1 + mv $INPUT_DIR/*".cleaned.reverse"* $STEP_1 +else + mv $INPUT_DIR/*".cleaned" $STEP_1 +fi +printf "TRIMMO\n" >>pipeline/checkpoints + +else + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n" +fi + +#################################################################### +# +# STEP 2: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq + +Step=$(grep "MERGING" $INPUT_DIR/checkpoints) +if [ "${Step}" != "MERGING" ] + then + +$MKDIR $STEP_2 +if $paired; then + for file in $STEP_1/*.cleaned.forward + do + f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'` + shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'` + checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/} + done +else + for file in $STEP_1/*.cleaned + do + new_name=`echo $file | awk -F "cleaned" '{print $1 "merged.assembled.fastq"}'` + cp $file $STEP_2/${new_name##*/} + done +fi + +###I will concatenate forward and reverse complement with 20Ns in the middle. +$R_programs/combining_umerged.R $STEP_1 + +printf "MERGING\n" >>pipeline/checkpoints +mv $STEP_1/*assembled2.fastq $STEP_2/ + +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n" +fi + +#################################################################### +# +# STEP 2.9: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RAW" ] + then + +if [[ -f $STEP_2/raw_counts.txt ]]; then + rm $STEP_2/raw_counts.txt +fi +touch $STEP_2/raw_counts.txt + +if $paired; then + for file in $STEP_1/*cleaned.forward + do + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + done +else + for file in $STEP_1/*cleaned + do + checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt + done +fi + +printf "RAW\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). +Step=$(grep "RIBO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RIBO" ] + then + +for file in $STEP_2/*.assembled2.fastq +do + shortname=`echo $file | awk -F "assembled2" '{print $1 "ribodepleted"}'` + checked $SORTMERNA -a $threads \ + --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \ + --reads $file --aligned $file.ribosomes --other $shortname --fastx \ + --log -v +done + +$MKDIR $STEP_3 +mv $STEP_2/*ribodepleted* $STEP_3 + +printf "RIBO\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_ANNOT" ] + then + +echo "Now starting on DIAMOND org annotations at: "; date + +for file in $STEP_3/*ribodepleted.fastq +do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + echo "Now starting on " $file + echo "Converting to " $shortname + checked $DIAMOND blastx --db $diamond_database -q $file -a $file.RefSeq -t ./ -k 1 + checked $DIAMOND view --daa $file.RefSeq.daa -o $shortname -f tab +done + +$MKDIR $STEP_4/daa_binary_files + +mv $STEP_3/*annotated* $STEP_4 +mv $STEP_3/*.daa $STEP_4/daa_binary_files + +echo "RefSeq DIAMOND annotations completed at: "; date + +printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n" +fi + +#################################################################### +# +# STEP 5: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_AGGREG" ] + then + +for file in $STEP_4/*RefSeq_annotated +do + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F +done + +$MKDIR $STEP_5/RefSeq_results/org_results +$MKDIR $STEP_5/RefSeq_results/func_results +mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results +mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results + +printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n" +fi + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_ANNOT" ] + then + +echo "Now starting on DIAMOND Subsystems annotations at: "; date + +for file in $STEP_3/*ribodepleted.fastq +do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + echo "Now starting on Subsystems annotations for " $file + checked $DIAMOND blastx --db $diamond_subsys_db -q $file -a $file.Subsys -t ./ -k 1 + checked $DIAMOND view --daa $file.Subsys.daa -o $shortname -f tab +done + +mv $STEP_3/*subsys_annotated* $STEP_4 +mv $STEP_3/*.daa $STEP_4/daa_binary_files + +echo "DIAMOND Subsystems annotations completed at: "; date + +printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n" +fi + +################################################################## +# +# STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_AGGREG" ] + then + +for file in $STEP_4/*subsys_annotated +do + checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \ + -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy +done + +$MKDIR $STEP_5/Subsystems_results/receipts +mv $STEP_4/*.reduced $STEP_5/Subsystems_results +mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts +rm $STEP_4/*.hierarchy + +printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n" +fi + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! +Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) +if [ "${Step}" != "R_ANALYSIS" ] + then + +checked Rscript $R_DIR/run_DESeq_stats.R \ + -I $STEP_5/RefSeq_results/org_results \ + -O RefSeq_org_DESeq_results.tab \ + -R $STEP_2/raw_counts.txt +checked Rscript $R_DIR/run_DESeq_stats.R \ + -I $STEP_5/RefSeq_results/func_results \ + -O RefSeq_func_DESeq_results.tab \ + -R $STEP_2/raw_counts.txt +checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ + -I $STEP_5/Subsystems_results \ + -O Subsystems_level-1_DESeq_results.tab -L 1 \ + -R $STEP_2/raw_counts.txt + +printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n" +fi + +echo "Master bash script finished running at: "; date +exit +####################################################################