env_samples_qPCR_primer_analysis.sh

#!/bin/bash

# Script: env_samples_qPCR_primer_analysis.sh
# This file is a shell script that evaluates whether the sequences derived from samples will yield positives using the Spackman et al 2003 M-segment qPCR primers
# NOTE: This file depends on minion_demultiplexing_flu_assignment.sh. Run that script first!

#This script is part of the following manuscript:
#Analysis of MinION sequence data from water and sediment samples for avian influenza detection in California wetlands
#This github repository includes code (and links to data) from the manuscript:
#"Linking remote sensing for targeted surveillance of Avian Influenza virus via tangential flow ultra-filtration and whole segment amplification in California wetlands"
#Madeline M. McCuen | Maurice E. Pitesky | Ana Paula da Silva | Rodrigo A. Gallardo | Jeff J. Buler | Sarai Acosta | Alexander Wilcox | Ronald F. Bond | Samuel L. Díaz-Muñoz

################ M-segment qPCR Evaluation on California Wetland Samples ################
#### 1. Script Description, Data Sources, and Preparing Data
#### 2. Method A: Exact sequence match analysis using GREP
#### 3. Method B: Biochemical simulation of qPCR using primers against sequence database using ThermonucleotideBLAST 
#### 4. Conclusions 

#### 1. Script Description, Data Sources, and Preparing Data ####

# This shell script evaluates whether the sequences derived from our samples will yield positives using the Spackman et al 2003 (see below) primers
# In particular, we only obtained a positive using the Spackman et al. 2003 protocol for sample B1 SW Unfltrd
#Spackman, Erica, et al. "Development of a real-time reverse transcriptase PCR assay for type A influenza virus and the avian H5 and H7 hemagglutinin subtypes." Journal of clinical microbiology 40.9 (2002): 3256-3260.
# The M primers are (from Table 1) :
# Influenza A virus	
#  M + 25	      AGA TGA GTC TTC TAA CCG AGG TCG
#	 M − 124	    TGC AAA AAC ATC TTC AAG TCT CTG
#	 M + 64	  FAM-TCA GGC CCC CTC AAA GCC GA-TAMRA 

# Without spaces: 
#   M+25:   AGATGAGTCTTCTAACCGAGGTCG REVCOMPL> CGACCTCGGTTAGAAGACTCATCT
#   M−124:	TGCAAAAACATCTTCAAGTCTCTG REVCOMPL> CAGAGACTTGAAGATGTTTTTGCA
#   M+64:	  FAM-TCAGGCCCCCTCAAAGCCGA-TAMRA REVCOMPL> TCGGCTTTGAGGGGGCCTGA

#The primary data are the demultiplexed FASTA files sequenced from water and soil samples in CA Wetlands 

#FASTAS's were generated by the shell script minion_demultiplexing_flu_assignment.sh
#Files will be in the same directory as the script that generated them

#Number of sequences in each FASTA file
wc -l reverse_*.fasta | awk '{print $1/2 " "$2}'
#9631 total

wc -l forward_*.fasta | awk '{print $1/2 " "$2}'
#38331 total

#### 2. Method A: Exact sequence match analysis using GREP ####
#First we'll check for exact matches to the Spackman primers. 

## 2a. First with forward reads

#Forward Primer Exact Matches
grep "AGATGAGTCTTCTAACCGAGGTCG" -c forward_*.fasta
#forward_B1_SW_Unfltrd.fasta:95
#All others 0

#Reverse Primer Exact Matches
grep "CAGAGACTTGAAGATGTTTTTGCA" -c forward_*.fasta
#No matches

#Probe Primer Exact Matches
grep "TCAGGCCCCCTCAAAGCCGA" -c forward_*.fasta
#forward_A1_Fltrd_RXD.fasta:9
#forward_A4_Fltrd_RXD.fasta:1
#forward_B1_SW_Unfltrd.fasta:88
#forward_B3_SW_Unfltrd.fasta:3
#forward_B4_SW_Unfltrd.fasta:1
#forward_C3_SW_Unfltrd.fasta:1
#forward_C5_SW_Unfltrd.fasta:1
#forward_PR8_RNA_B.fasta:620

#All other samples negatives

#Obviously we need these primers/probe to be on the same molecule, so adjusting the grep search
grep AGATGAGTCTTCTAACCGAGGTCG.*TCAGGCCCCCTCAAAGCCGA.*CAGAGACTTGAAGATGTTTTTGCA -c forward_*.fasta
#No matches

#Check if these are the expected size amplicon size of ~100bp
grep AGATGAGTCTTCTAACCGAGGTCG.*TCAGGCCCCCTCAAAGCCGA.*CAGAGACTTGAAGATGTTTTTGCA -o forward_*.fasta
#Nothing above, so nothing expected here

#This analysis would suggest no amplification in any of the samples, based on exact primer matches  

#We might want to make the search more flexible by excluding the 5' ends of the primers that are less important for annealing 

#Forward Primer Only first 16nt from 3'
grep "AGATGAGTCTTCTAAC" -c forward_*.fasta
#forward_A5_SW_Unfltrd.fasta:1
#forward_B1_SW_Unfltrd.fasta:168
#All others negative

#Reverse Primer Only first 16nt from 3'
grep "CAGAGACTTGAAGATG" -c forward_*.fasta
#forward_A1_Fltrd_RXD.fasta:5
#forward_A5_SW_Unfltrd.fasta:1
#forward_B1_SW_Unfltrd.fasta:160
#forward_B2_SW_Unfltrd.fasta:1
#forward_B3_SW_Unfltrd.fasta:2
#forward_B4_SW_Unfltrd.fasta:1
#forward_B5_SW_Unfltrd.fasta:1
#forward_C5_SW_Unfltrd.fasta:1
#forward_PR8_RNA_B.fasta:770
#All others 0

#Probe Only first 16nt from 3'
grep "TCAGGCCCCCTCAAAG" -c forward_*.fasta
#forward_A1_Fltrd_RXD.fasta:10
#forward_A4_Fltrd_RXD.fasta:1
#forward_B1_SW_Unfltrd.fasta:104
#forward_B3_SW_Unfltrd.fasta:3
#forward_B4_SW_Unfltrd.fasta:1
#forward_C3_SW_Unfltrd.fasta:1
#forward_C5_SW_Unfltrd.fasta:1
#forward_PR8_RNA_B.fasta:676
#All others 0

#Now all together, adjusting the grep search for primers/probe to be on the same molecule, so 
grep AGATGAGTCTTCTAAC.*TCAGGCCCCCTCAAAG.*CAGAGACTTGAAGATG -c forward_*.fasta
#forward_B1_SW_Unfltrd.fasta:53
#All others 0

#Quick check if we're getting expected amplicon sizes
grep AGATGAGTCTTCTAAC.*TCAGGCCCCCTCAAAG.*CAGAGACTTGAAGATG -o forward_*.fasta | awk '{ print length }'
#116 - 124, so as expected for Spackman primers

## 2b. Now do with reverse reads:

#Now all together, adjusting the grep search for primers/probe to be on the same molecule, NOTE REVERSE COMPLEMENT 
grep CATCTTCAAGTCTCTG.*CTTTGAGGGGGCCTGA.*GTTAGAAGACTCATCT -c reverse*.fasta
#reverse_B1_SW_Unfltrd.fasta:8
#reverse_PR8_RNA_B.fasta:1
#All others 0

#Quick check if we're getting expected amplicon sizes
grep CATCTTCAAGTCTCTG.*CTTTGAGGGGGCCTGA.*GTTAGAAGACTCATCT -o reverse*.fasta | awk '{ print length }'
#113 - 122, so as expected for Spackman primers

#Taking forward and reverse reads together 61 cDNA molecules should amplify from B1_SW_Unfltrd and 1 from PR8 postive control
#This analysis would suggest only sample B1_SW_Unfltrd would amplify out of 36 samples (excluding PR8 postive control and negative controls)

#However, sucess rate could be higher given that PCR is able to tolerate mismatches. So examining a second method:  

#### 3. Method B: Biochemical simulation of qPCR using primers against sequence database using ThermonucleotideBLAST ####
#This method uses ThermonucleotideBLAST (https://public.lanl.gov/jgans/tntblast/tntblast_doc.html) 
#Citation for ThermonucleotideBLAST: J. D. Gans and M. Wolinsky "Improved assay-dependent searching of nucleic acid sequence databases" Nucleic Acid Res. 2008 Jul;36(12):e74
#ThermonucleotideBLAST is a software program for searching a target database of nucleic acid sequences using an assay specific query. ThermonucleotideBLAST queries are based on biochemical assays (i.e. a pair of oligonucleotide sequences representing PCR primers or Padlock probes, a triplet of oligos representing PCR primers and a TaqManTM probe or a single oligo representing a hybridization probe). Unlike existing programs (e.g. BLAST) which use heuristic measures of sequence similarity for identifying matches between a query and target sequence, ThermonucleotideBLAST uses physically relevant measures of sequence similarity -- free energy and melting temperature. For example, given a pair of PCR primers, a database of DNA targets and an annealing temperature, ThermonucleotideBLAST will return a list of predicted amplicons that will (ideally) match experimental PCR results.

#Code below assumes you have ThermonucleotideBLAST installed as a module. 
module load tntblast
#Module tntblast/2.04 loaded

#First step is to create files with the primer sequences and probes
echo "TaqmanqPCR-msegment    AGATGAGTCTTCTAACCGAGGTCG    TGCAAAAACATCTTCAAGTCTCTG    TCAGGCCCCCTCAAAGCCGA" > taqman_qPCR_query.txt

#Create second file with just the amplicon information (regular qPCR or PCR)
echo "qPCR-msegment AGATGAGTCTTCTAACCGAGGTCG    TGCAAAAACATCTTCAAGTCTCTG" > qPCR_query.txt

#First evaluate the Taqman assay, using Spackman et al 2002 conditions of 60 degrees C annealing temp; 10 pmol of each primer and 0.3 μM probe in a 20uL reaction 
#Note use of [1-9] to get only positive samples
for file in forward_*.fasta
do
	prefix=${file%.fasta}
	echo $file
	tntblast -i taqman_qPCR_query.txt -d ${prefix}.fasta -e 60 -E 60 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#No positives

#Note use of [1-9] to get only positive samples
#Check with more permissive temperature 50 degrees C
for file in forward_*.fasta
do
	prefix=${file%.fasta}
	echo $file
	tntblast -i taqman_qPCR_query.txt -d ${prefix}.fasta -e 50 -E 50 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#forward_B1_SW_Unfltrd.fasta
#Found 84 (total) matches
#Amplicon:
#	95 <= Amplicon length <= 105
#All others negative

#So 1 positive 1/36 samples

#Note use of [1-9] to get only positive samples
#Check with ultra permissive temperature 40 degrees C
for file in forward_*.fasta
do
	prefix=${file%.fasta}
	echo $file	
	tntblast -i taqman_qPCR_query.txt -d ${prefix}.fasta -e 40 -E 40 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#forward_A1_Fltrd_RXD.fasta
#Found 4 (total) matches
#Amplicon:
#	98 <= Amplicon length <= 104
#forward_B1_SW_Unfltrd.fasta
#Found 176 (total) matches
#Amplicon:
#	94 <= Amplicon length <= 106
#forward_B3_SW_Unfltrd.fasta
#Found 3 (total) matches
#Amplicon:
#	99 <= Amplicon length <= 105
#forward_B4_SW_Unfltrd.fasta
#Found 1 (total) matches
#Amplicon:
#	101 <= Amplicon length <= 101
#forward_B5_SW_Unfltrd.fasta
#Found 1 (total) matches
#Amplicon:
#	99 <= Amplicon length <= 99
#Found 1 (total) matches
#Amplicon:
#	102 <= Amplicon length <= 102
#forward_C5_SW_Unfltrd.fasta
#Found 1 (total) matches
#Amplicon:
#	102 <= Amplicon length <= 102
#forward_PR8_RNA_B.fasta
#Found 714 (total) matches
#Amplicon:
#	92 <= Amplicon length <= 111

#So 7 positivies and one of them is positive control so  6/36 samples for forward reads

#Now redo, but with reverse reads

#First step is to create files with the primer sequences and probes
echo "TaqmanqPCR-msegment-rev    TGCAAAAACATCTTCAAGTCTCTG    TCGGCTTTGAGGGGGCCTGA    CGACCTCGGTTAGAAGACTCATCT" > taqman_qPCR_query_reverse.txt

#Same as before but for reverse reads: First evaluate the Taqman assay, using Spackman et al 2002 conditions of 60 degrees C annealing temp; 10 pmol of each primer and 0.3 μM probe in a 20uL reaction 
#Note use of [1-9] to get only positive samples
for file in reverse_*.fasta
do
	prefix=${file%.fasta}
	echo $file
	tntblast -i taqman_qPCR_query_reverse.txt -d ${prefix}.fasta -e 60 -E 60 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#0 matches

#Note use of [1-9] to get only positive samples
#Same as before but for reverse reads: Check with more permissive temperature 50 degrees C
for file in reverse_*.fasta
do
	prefix=${file%.fasta}
	echo $file
	tntblast -i taqman_qPCR_query_reverse.txt -d ${prefix}.fasta -e 50 -E 50 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#0 matches

#Note use of [1-9] to get only positive samples
#Same as before but for reverse reads: Check with ultra permissive temperature 40 degrees C
for file in reverse_*.fasta
do
	prefix=${file%.fasta}
	echo $file	
	tntblast -i taqman_qPCR_query_reverse.txt -d ${prefix}.fasta -e 40 -E 40 -m 1 -t 0.0000003 -T 0.0000005 | grep "Found.[1-9]*..t" -A 2  
done
#0 matches

#This analysis would suggest no amplification based on published qPCR/Taqman conditions.
#Under more permissive conditions (50 degree C annealing), the analysis suggests an overall success rate of 1/36, with the positive indeed being the sample we experimentally found to be positive B1_SW_Unfltrd
#At the an annealing temperature to 40 degrees C, the analysis suggests an overall success rate of 6/36 = 0.1666667 ~ 16.67%. Despite low annealing temperature predicted amplicons were still in the expected size

#So based on this analysis, M-Segment Taqman qPCR (as published by Spackman et al 2002) only detected the samples we expected based on this analysis


#### 4. Conclusions ####
#Regarding the Spackman et al. 2003 M-segment Taqman assay on our environmental samples:
#This analysis would suggest no amplification in any of the samples, based on exact primer/probe matches. No samples have exact matches to the full-length F/R primers. 7/36 samples had exact matches to the full length probe. 
#Assuming an exact match of the first 16nt (3'-5') of the primers and probe were sufficient for a positive, sample B1_SW_Unfltrd and the PR8 positive control would yield positives  
#This exact match analysis would suggest only sample B1_SW_Unfltrd would amplify out of 36 samples (excluding PR8 postive control and negative controls)
#Using a thermodynamic simulaton of qPCR/Taqman: no amplification is expected based on published qPCR/Taqman conditions.
#Under more permissive conditions (50 degree C annealing), the analysis suggests amplification in one sample, with the positive indeed being the sample we experimentally found to be positive B1_SW_Unfltrd
#At the an annealing temperature to 40 degrees C, the analysis suggests six total samples would amplify. Despite low annealing temperature predicted amplicons were still in the expected size