Skip to content

Commit

Permalink
run_get_phylomarkers_pipeline.sh 2.8.2.0_2024-04-19
Browse files Browse the repository at this point in the history
- added likelihood mapping test of tree-likeness and phylogentic signal
   for -A I (IQ-TREE) and -t DNA (i.e., not yet for proteins)
- top markers for -A I -t DNA need to pass both the lmapping and
    min median support values
- min_supp_val set to 0.65 from 0.7 to relax a bit, due to the stringent test above
- min_bash_vers=4.4 (instead of previous 4.3), as it is required for mapfile -t array <(cmd); see SC2207
  • Loading branch information
vinuesa committed Apr 19, 2024
1 parent ff02b19 commit 1d95393
Showing 1 changed file with 185 additions and 44 deletions.
229 changes: 185 additions & 44 deletions run_get_phylomarkers_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@

# Set Bash strict mode
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
#set -x # enables debugging output to trace the execution flow; Print commands and their arguments as they are executed.
#set -x # enables debugging output to trace the execution flow; Print commands and their arguments as they are executed. Also executed as bash -x script
set -e # NOTES: all "${distrodir}"/* calls fail unless they are wrapped in the following code
# { "${distrodir}"/run_parallel_cmmds.pl faaed 'add_nos2fasta_header.pl $file > ${file}no' "$n_cores" &> /dev/null && return 0; }
# Also added extended error and return 0 calls to the functions in get_phylomarkers_fun_lib and main script
set -u
set -o pipefail

progname=${0##*/} # run_get_phylomarkers_pipeline.sh
VERSION='2.8.1.4_2024-04-17'
VERSION='2.8.2.0_2024-04-19'

# Set GLOBALS
# in Strict mode, need to explicitly set undefined variables to an empty string var=''
Expand All @@ -58,24 +58,31 @@ lmsg=''
gene_tree_ext=''
no_kde_outliers=''
no_kde_ok=''
lmap_sampling=2000

declare -A filtering_results_h
filtering_results_h=()
declare -a filtering_results_kyes_a
filtering_results_kyes_a=()
declare -A filtering_results_h=()
declare -a filtering_results_kyes_a=()

declare -A output_files_h
output_files_h=()
declare -a output_files_a
output_files_a=()
declare -A output_files_h=()
declare -a output_files_a=()

declare -A figs_h=()
declare -a figs_a=()

declare -A aln_models_h=()
declare -a aln_models_a=()

min_bash_vers=4.3 # required for:
# 1. printf '%(%F)T' '-1' in print_start_time; and
# 2. passing an array or hash by name reference to a bash function (since version 4.3+),
declare -A aln_lmappings_h=()
declare -a aln_lmappings_a=()

declare -A alns_passing_lmap_and_suppval_h=()
declare -a alns_passing_lmap_and_suppval_a=()


min_bash_vers=4.4 # required for:
# 1. mapfile -t array <(cmd); see SC2207
# 2. printf '%(%F)T' '-1' in print_start_time; and
# 3. passing an array or hash by name reference to a bash function (since version 4.3+),
# by setting the -n attribute
# see https://stackoverflow.com/questions/16461656/how-to-pass-array-as-an-argument-to-a-function-in-bash

Expand Down Expand Up @@ -598,7 +605,7 @@ function print_help()
-k <real> kde stringency (0.7-1.6 are reasonable values; less is more stringent) [default: $kde_stringency]
-K flag to run molecular clock test on codon alignments [default: $eval_clock]
-l <integer> max. spr length (7-12 are reasonable values) [default: $spr_length]
-m <real> min. average support value (0.7-0.8 are reasonable values) for trees to be selected [default: $min_supp_val]
-m <real> min. average support value (0.6-0.8 are reasonable values) for trees to be selected [default: $min_supp_val]
-M <string> base Model for clock test (use one of: GTR|TrN|HKY|K2P|F81); uses +G in all cases [default: $base_mod]
-n <integer> number of cores/threads to use for parallel computations except IQT searches [default: all cores]
-N <integer> number of IQ-TREE searches to run [only active with -T high] [default: $nrep_IQT_searches]
Expand Down Expand Up @@ -657,7 +664,7 @@ cluster_format=STD
search_thoroughness='high'
IQT_threads=2 # used only with concatenated supermatrix 2|INTEGER; AUTO is too slow
kde_stringency=1.5
min_supp_val=0.7
min_supp_val=0.65
min_no_ext_branches=4
n_cores=''
#VERBOSITY=0
Expand Down Expand Up @@ -1413,7 +1420,7 @@ then
msg " >>>>>>>>>>>>>>> parallel IQ-TREE runs to estimate gene trees <<<<<<<<<<<<<<< " PROGR YELLOW
msg "" PROGR NC
estimate_IQT_gene_trees "$mol_type" "$search_thoroughness" # "$IQT_models"

# 4.1.1 check that IQT computed the expected gene trees
no_gene_trees=$(find . -name "*.treefile" | wc -l)
(( no_gene_trees < 1 )) && print_start_time && msg " >>> ERROR: There are no gene tree to work on in non_recomb_cdn_alns/. will exit now!" ERROR RED && exit 3
Expand All @@ -1426,6 +1433,24 @@ then
filtering_results_h[n_starting_gene_trees]="$no_gene_trees"
filtering_results_kyes_a+=('n_starting_gene_trees')
(( DEBUG > 0 )) && for k in "${filtering_results_kyes_a[@]}"; do echo "$k: ${filtering_results_h[$k]}"; done

# 4.1.3 Populate the aln_models_h hash and aln_models_a array from *stats.tsv generated by compute_IQT_gene_tree_stats
gene_tree_stats_file=$(find . -name \*stats.tsv -printf '%f\n')

ed_dir=$(rm_top_dir_prefix_from_wkdir "$top_dir" "$non_recomb_cdn_alns_dir")
output_files_a+=(gene_tree_stats_file)
output_files_h[gene_tree_stats_file]=$(echo -e "$ed_dir\t$gene_tree_stats_file")

print_start_time && msg "# populating the aln_models_h hash from $gene_tree_stats_file ..." PROGR BLUE

while read -r aln _ _ _ model _; do
aln="${aln/.\//}"
aln="${aln%.log}"
aln_models_a+=("${aln/.\//}")
aln_models_h["${aln/.\//}"]="$model"
done < <(awk 'NR > 1' "$gene_tree_stats_file");

((DEBUG > 1)) && { echo "# DEBUGGING aln_models_h ..."; for a in "${aln_models_a[@]}"; do echo -e "$a\t${aln_models_h[$a]}"; done ; }
fi

#remove trees with < 4 external branches
Expand All @@ -1444,7 +1469,8 @@ then
output_files_a+=(n_branches_per_tree)
output_files_h[n_branches_per_tree]=$(echo -e "$ed_dir\tno_tree_branches.list")

# collect the fasta file names producing "no real trees" with < 4 external branches in the no_tree_counter array
# Collect the fasta file names producing "no real trees" with < 4 external branches in the no_tree_counter array
# and remove trivial trees and alignments
declare -a no_tree_counter_a
no_tree_counter_a=()
#for phy in $(grep -v '^#Tree' no_tree_branches.list | awk -v min_no_ext_branches="$min_no_ext_branches" 'BEGIN{FS="\t"; OFS="\t"}$7 < min_no_ext_branches {print $1}')
Expand Down Expand Up @@ -1542,16 +1568,14 @@ then
print_start_time && msg "# making dir kde_outliers/ and moving $no_kde_outliers outlier files into it ..." PROGR BLUE
mkdir kde_outliers || { msg "ERROR: could not run mkdir kde_outliers" ERROR RED && exit 1 ; }
if [ "$search_algorithm" == "F" ]; then
#for f in $(grep outlier kde_dfr_file_all_gene_trees.tre.tab | cut -f1|sed 's/\.ph//'); do
while read -r f; do
base=${f/_cdnAln*/_cdnAln}
mv "${base}"* kde_outliers
done < <(grep outlier kde_dfr_file_all_gene_trees.tre.tab | cut -f1)
fi

if [ "$search_algorithm" == "I" ]; then
#for f in $(grep outlier kde_dfr_file_all_gene_trees.tre.tab | cut -f1|sed 's/\.treefile//'); do
while read -r f; do
while read -r f; do
mv "${f}"* kde_outliers
done < <(grep outlier kde_dfr_file_all_gene_trees.tre.tab | cut -f1|sed 's/\.treefile//')
fi
Expand Down Expand Up @@ -1590,15 +1614,15 @@ then
#----------------------------------------#
if (( runmode == 1 ))
then
# >>> 5.1 compute average bipartition support values for each gene tree
# >>> 5.1 Compute likelihood mapping (only for IQ-Tree only) and average bipartition support values for each gene tree
# and write them to sorted_aggregated_support_values4loci_ge${min_supp_val_perc}perc.tab
wkdir=$(pwd)

msg "" PROGR NC
msg " >>>>>>>>>>>>>>> filter gene trees by phylogenetic signal content <<<<<<<<<<<<<<< " PROGR YELLOW
msg "" PROGR NC

print_start_time && msg "# computing tree support values ..." PROGR BLUE
print_start_time && msg "# computing tree support values and RF-distances ..." PROGR BLUE
(( DEBUG > 0 )) && msg " > compute_suppValStas_and_RF-dist.R $wkdir 1 fasta ${gene_tree_ext} 1 &> /dev/null" DEBUG NC
compute_suppValStas_and_RF-dist.R "$wkdir" 1 fasta "${gene_tree_ext}" 1 &> /dev/null

Expand All @@ -1614,7 +1638,7 @@ then
awk -v min_supp_val="$min_supp_val" '$2 >= min_supp_val' sorted_aggregated_support_values4loci.tab > "sorted_aggregated_support_values4loci_ge${min_supp_val_perc}perc.tab"
check_output "sorted_aggregated_support_values4loci_ge${min_supp_val_perc}perc.tab" "$parent_PID"

ed_dir=$(rm_top_dir_prefix_from_wkdir "$top_dir" kde_ok)
ed_dir=$(rm_top_dir_prefix_from_wkdir "$top_dir" "$wkdir")
output_files_a+=(sorted_aggregated_support_values_top_DNA_markers)
output_files_h[sorted_aggregated_support_values_top_DNA_markers]=$(echo -e "$ed_dir\tsorted_aggregated_support_values4loci_ge${min_supp_val_perc}perc.tab")

Expand All @@ -1627,31 +1651,148 @@ then

filtering_results_h[n_top_markers]="$no_top_markers"
filtering_results_kyes_a+=('n_top_markers')
(( DEBUG > 0 )) && for k in "${filtering_results_kyes_a[@]}"; do echo "$k: ${filtering_results_h[$k]}"; done
(( DEBUG > 1 )) && for k in "${filtering_results_kyes_a[@]}"; do echo "$k: ${filtering_results_h[$k]}"; done


# Run likelihood mapping, only if algorithm == I (IQ-TREE)
# to asses tree-likeness and phylogenetic signal
if [ "$search_algorithm" == "I" ]
then
print_start_time && msg "# computing likelihood mappings to asses tree-likeness and phylogenetic signal ..." PROGR BLUE

# Popullate the aln_lmappings_a array and aln_lmappings_h hash
# the aln_models_a array and aln_models_h were popullated from *stats.tsv, after running compute_IQT_gene_tree_stats
for a in "${aln_models_a[@]}"; do
if [ -s "$a" ]; then
((DEBUG > 1)) && echo "iqtree -s $a -m ${aln_models_h[$a]} -te ${a}.treefile -lmap $lmap_sampling -T $n_cores --prefix lmapping_test_${a%.*} --quiet"
iqtree -s "$a" -m "${aln_models_h[$a]}" -te "${a}".treefile -lmap "$lmap_sampling" -T "$n_cores" --prefix lmapping_test_"${a%.*}" --quiet

# parse AU test results from iqtree files
aln_lmappings_a+=("$a")
aln_lmappings_h["$a"]=$(grep 'Number of fully resolved' lmapping_test_"${a%.*}".iqtree | sed 's/Number of fully.*=//; s/%)//')
else
continue
fi
done

print_start_time && msg "# parsing likelihood mapping results, and writing summary table ..." PROGR BLUE
# Print the likelihood mapping results of each gene tree to a tsv file
for a in "${aln_lmappings_a[@]}"; do
echo -e "$a\t${aln_lmappings_h[$a]}"
done > geneTree_lmapping_tests.tsv

check_output geneTree_lmapping_tests.tsv "$parent_PID"
ed_dir=$(rm_top_dir_prefix_from_wkdir "$top_dir" "$wkdir")
output_files_a+=(geneTree_lmapping_tests)
output_files_h[geneTree_lmapping_tests]=$(echo -e "$ed_dir\tgeneTree_lmapping_tests.tsv")

# cleanup the lmappint_test_files
[ -s geneTree_lmapping_tests.tsv ] && rm lmapping_test_*

# filter lmapping results >= \$min_support_val_perc
print_start_time && msg "# parsing likelihood mapping results table to identify trees with >= $min_supp_val_perc fully resolved quartets ..." PROGR BLUE
awk -v min="$min_supp_val_perc" 'BEGIN{FS=OFS="\t"}$2 >= min' geneTree_lmapping_tests.tsv > \
geneTrees_passing_lmapping_tests_ge_"${min_supp_val_perc}".tsv

check_output geneTrees_passing_lmapping_tests_ge_"${min_supp_val_perc}".tsv "$parent_PID"
ed_dir=$(rm_top_dir_prefix_from_wkdir "$top_dir" "$wkdir")
output_files_a+=(geneTree_passing_lmapping_tests)
output_files_h[geneTree_passing_lmapping_tests]=$(echo -e "$ed_dir\tgeneTrees_passing_lmapping_tests_ge_${min_supp_val_perc}.tsv")

n_lmapping_tests=$(wc -l < geneTree_lmapping_tests.tsv)
n_alns_passing_lmapping_tests=$(wc -l < geneTrees_passing_lmapping_tests_ge_"${min_supp_val_perc}".tsv)
n_alns_failing_lmapping_tests=$((n_lmapping_tests - n_alns_passing_lmapping_tests))

msg " >>> $n_alns_passing_lmapping_tests alignments passed the likelihood mapping test at >= ${min_supp_val_perc}% out of $n_lmapping_tests ... " PROGR GREEN

filtering_results_kyes_a+=('n_alns_failing_lmapping_tests')
filtering_results_h[n_alns_failing_lmapping_tests]="$n_alns_failing_lmapping_tests"

filtering_results_kyes_a+=('n_alns_passing_lmapping_tests')
filtering_results_h[n_alns_passing_lmapping_tests]="$n_alns_passing_lmapping_tests"


# Filter alignments passing both the likelihood mapping (lmap) AND
# the min. median bipartition support tests >= min_supp_val_perc
# 1. filter by lmap
while read -r id; do
[ -z "$id" ] && continue
[[ "$id" =~ [^0-9]+ ]] && continue
((DEBUG > 1)) && msg "# filter by lmap: running: alns_passing_lmap_and_suppval_h[$id]++" DEBUG NC
alns_passing_lmap_and_suppval_h["$id"]=1
done < <(cut -d_ -f1 geneTrees_passing_lmapping_tests_ge_"${min_supp_val_perc}".tsv)

# 2. filter by min_supp_val
while read -r id; do
[ -z "$id" ] && continue
id=${id//\"/}
[[ "$id" =~ [^0-9]+ ]] && continue
((DEBUG > 1)) && msg "# filter by min_supp_val: running: alns_passing_lmap_and_suppval_h[$id]++" DEBUG NC
((alns_passing_lmap_and_suppval_h["$id"]++)) || alns_passing_lmap_and_suppval_h["$id"]=1 # requires this test if id is unbound
done < <(cut -f1 "$top_markers_tab") # sorted_aggregated_support_values4loci_ge${min_supp_val_perc}perc.tab

if ((DEBUG > 1)); then
for a in "${!alns_passing_lmap_and_suppval_h[@]}"; do
printf "%s\t%d\n" "$a" "${alns_passing_lmap_and_suppval_h[$a]}"
done
fi
# 3. fill the array of alignments passing both phylogenetic signal content tests (i.e. "${alns_passing_lmap_and_suppval_h[$a]}" == 2)
# alns_passing_lmap_and_suppval_a+=($(for a in "${!alns_passing_lmap_and_suppval_h[@]}"; do (("${alns_passing_lmap_and_suppval_h[$a]}" == 2 )) && echo "${a}"; done))
mapfile -t alns_passing_lmap_and_suppval_a < <(for a in "${!alns_passing_lmap_and_suppval_h[@]}"; do (("${alns_passing_lmap_and_suppval_h[$a]}" == 2 )) && echo "${a}"; done)
# 4. Reset the top_markers_dir and no_top_markers when runnig both phylogenetic signal filters under -A I (IQ-TREE)
top_markers_dir=top_"${#alns_passing_lmap_and_suppval_a[@]}"_markers_ge"${min_supp_val_perc}"perc
no_top_markers="${#alns_passing_lmap_and_suppval_a[@]}"

msg " >>> $no_top_markers alignments passed both the lmapping and median bipartition support tests at >= ${min_supp_val_perc}% out of $n_lmapping_tests ..." PROGR GREEN

filtering_results_kyes_a+=('n_alns_failing_lmapping_and_bipart_support_tests')
filtering_results_h[n_alns_failing_lmapping_and_bipart_support_tests]=$(( "$n_alns_failing_lmapping_tests" - "$no_top_markers" ))

# >>> 5.2 move top-ranking markers to $top_markers_dir
filtering_results_kyes_a+=('n_alns_passing_lmapping_and_bipart_support_tests')
filtering_results_h[n_alns_passing_lmapping_and_bipart_support_tests]="$no_top_markers"
fi # [ "$search_algorithm" == "I" ]

# >>> 5.2 move top-ranking markers to $top_markers_dir
print_start_time && msg "# making dir $top_markers_dir and moving $no_top_markers top markers into it ..." PROGR LBLUE
{ mkdir "$top_markers_dir" && cd "$top_markers_dir" ; } || { msg "ERROR: cannot cd into $top_markers_dir" ERROR RED && exit 1 ; }
top_markers_dir=$(pwd)
ln -s ../"$top_markers_tab" .
while read -r id _; do
[[ "$id" =~ loci ]] && continue
id=${id//\"/}

if [ "$search_algorithm" == "F" ] # FastTree
then
ln -s ../"$top_markers_tab" .
while read -r id _; do
[[ "$id" =~ loci ]] && continue
id=${id//\"/}

# -h doesn't work with globs. Use a for loop
#for i in $(find .. -maxdepth 1 -name "${id}"\* -printf "%f\n")
while read -r i; do
(( DEBUG > 0 )) && msg " > reading top_markers_tab:$top_markers_tab in top_markers_dir:$top_markers_dir; running: ln -s ../${i} ." DEBUG NC
if [ ! -h "$i" ]
then
ln -s ../"$i" .
else
continue
fi
done < <(find .. -maxdepth 1 -name "${id}"\* -printf "%f\n")
done < "$top_markers_tab"

(( no_top_markers < 2 )) && print_start_time && msg " >>> Warning: There are less than 2 top markers. Relax your filtering thresholds. will exit now!" ERROR LRED && exit 3
# -h doesn't work with globs. Use a for loop
#for i in $(find .. -maxdepth 1 -name "${id}"\* -printf "%f\n")
while read -r i; do
(( DEBUG > 0 )) && msg " > reading top_markers_tab:$top_markers_tab in top_markers_dir:$top_markers_dir; running: ln -s ../${i} ." DEBUG NC
if [ ! -h "$i" ]
then
ln -s ../"$i" .
else
continue
fi
done < <(find .. -maxdepth 1 -name "${id}"\* -printf "%f\n")
done < "$top_markers_tab"
else # IQ-Tree
for id in "${alns_passing_lmap_and_suppval_a[@]}"
do
while read -r i
do
(( DEBUG > 0 )) && msg " > reading top_markers_tab:$top_markers_tab in top_markers_dir:$top_markers_dir; running: ln -s ../${i} ." DEBUG NC
if [ ! -h "$i" ]
then
ln -s ../"$i" .
else
continue
fi
done < <(find .. -maxdepth 1 -name "${id}"\* -printf "%f\n")
done
fi

(( no_top_markers < 3 )) && print_start_time && msg " >>> Warning: There are less than 3 top markers. Relax your filtering thresholds. will exit now!" ERROR LRED && exit 3

msg "" PROGR NC
msg " >>>>>>>>>>>>>>> generate supermatrix from concatenated, top-ranking alignments <<<<<<<<<<<<<<< " PROGR YELLOW
Expand Down Expand Up @@ -2050,7 +2191,7 @@ then

tar -czf concatenated_alignment_files.tgz concat_cdnAlns.fna concat_cdnAlns.fnainf
[ -s concatenated_alignment_files.tgz ] && rm concat_cdnAlns.fna concat_cdnAlns.fnainf
(( DEBUG == 0 )) && rm gene_trees2_concat_tree_RF_distances.tab ./*cdnAln.fasta ./*.log sorted_aggregated_support_values4loci_*.tab
(( DEBUG == 0 )) && rm gene_trees2_concat_tree_RF_distances.tab ./*cdnAln.fasta ./*.log #sorted_aggregated_support_values4loci_*.tab
(( DEBUG == 0 )) && rm ./*ckp.gz ./*model.gz
(( DEBUG == 0 )) && if ls ./*uniqueseq.phy &> /dev/null; then rm ./*uniqueseq.phy; fi
(( DEBUG == 0 )) && rm list2concat Rplots.pdf ./*cdnAln.ph
Expand Down

0 comments on commit 1d95393

Please sign in to comment.