PIA_inner.pl

#!/usr/bin/perl

######################################################
##################### PIA_inner.pl ###################
######################################################
#########  Phylogenetic Intersection Analysis ########
############## Robin Allaby, UoW 2013 ################
######################################################
############## Version 5.6, 2021-05-07 ###############
######################################################

# Edited by Roselyn Ware, UoW 2015
# Further edited by Becky Cribdon, UoW 2019
		
# A method of metagenomic phylogenetic assignation, designed to be robust to partial representation of organisms in the database		
# Please report any problems to r.cribdon@warwick.ac.uk.

# This usually runs inside PIA.pl. To run independently, PIA.pl must already have made a read_info file containing the name and length of every sequence in the FASTA. 
# Then run PIA_inner.pl giving the read_info file as -r and a BLAST file as -b.
# > perl PIA_inner.pl -r [read_info file] -b [BLAST file]

	use strict;
	use warnings;
	use Getopt::Std;
    use DB_File;
    use Fcntl;
	#use Data::Dumper qw(Dumper); # Only used for testing


######################################################
########### Check arguments and Input Data ###########
######################################################

##### Get arguments from command line #####
	my %options=();
	getopts('h:b:c:C:m:s:r:', \%options);
		

##### Display help text and exit if -h flag called #####
	if ($options{h}){
        print "Usage: perl PIA.pl -r [read_info file] -b [BLAST file] [other options]

Main Arguments
	Option	Description	Input		Explanation
	-r	read_info filename	Y		Read information file generated by PIA.pl. Read names and sequence lengths from the input FASTA.
	-b	BLAST filename	Y		BLAST filename containing entries for all reads in the FASTA (can contain other entries too).


Optional
	Option	Description		Input		Explanation
 	-c 	cap			Optional	Maximum unique BLAST taxa examined. Impacts taxonomic diversity score. Default is 100.
	-C	min % coverage		Optional	Minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
	-h	help			N		Print this help text.
	-s	min diversity score	Optional	Minimum taxonomic diversity score for a read to make it to Summary_Basic.txt and Summary_Reads.txt. Depends on cap. Default is 0.1.
";
        exit;
	}

##### Check read_info filename #####
	# The read_info file is the read names and sequence lengths extracted from a FASTA file by PIA.pl.
	my $read_info_filepath = '';
    if ($options{r}) {
        $read_info_filepath = $options{r};
    } else {
        print "\nSpecify a read information file (produced by PIA.pl) with -r.\n\n";
        exit;
    }
    
##### Derive other useful names #####    
    my $read_info_filename = $read_info_filepath; # Default the name to the path; assume there isn't a path.
    my $read_info_parentpath = ''; # Default the path to empty.
    if (index($read_info_filepath, '/') != -1) { # If there was in fact a path:
        my @read_info_filepath_elements = split('/', $read_info_filepath); # Split $read_info_filepath on "/" symbols to separate elements of the path.
        $read_info_filename = pop @read_info_filepath_elements; # Separate the read_info file name.
        $read_info_parentpath = join('/', @read_info_filepath_elements); # Put the rest of the path back together.
    }
    my @read_info_filename_elements = split('\.', $read_info_filename);
    my $sample_name = join('.', @read_info_filename_elements[0..(scalar(@read_info_filename_elements)-3)]); # The sample name is $read_info_filename minus 'read_info' and whatever FASTA
    
##### Check BLAST filename #####
 	my $blast_filepath = '';
    if ($options{b}) {
        $blast_filepath = $options{b};
    } else {
        print "\nSpecify a BLAST file with -b.\n\n";
        exit;
    }

##### Check names.dmp.dbm and nodes.dmp.dbm file paths #####
    unless (-e 'Reference_files/names.dmp.dbm') {
        print "\nCannot find names DBM at Reference_files/names.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
        exit;
    }
    unless (-e 'Reference_files/nodes.dmp.dbm') {
        print "\nCannot find nodes DBM at Reference_files/nodes.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
        exit;
    }

##### See if cap input #####
	# BLAST hits are all assigned to taxa. $cap is the maximum number of taxa to be looked at. If $cap is hit, no more BLAST hits will be considered. $cap is used to calculate the taxonomic diversity score, so affects the minimum score threshold (option s). Default is 100.	
	my $cap = 100;
	if ($options{c}) { $cap = $options{c}; } # If there is an option, overwrite the default.
    print "\nMax taxa to consider: $cap\n";
    
##### See if min % coverage input #####
	# The minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
	my $min_coverage_perc = 95;
	if ($options{C}) { $min_coverage_perc = $options{C}; } # If there is an option, overwrite the default.
    print "Min coverage %: $min_coverage_perc\n";
    
##### See if min taxonomic diversity score input #####
	# The minimum taxonomic diversity score a read must have to make it to Summary_Basic.txt. Depends on $cap. Defaults to 0.1.
	my $min_taxdiv_score = 0.1;
	if ($options{s}) { $min_taxdiv_score = $options{s}; } # If there is an option, overwrite the default.
    print "Min taxonomic diversity score: $min_taxdiv_score\n\n";


######################################
########### Start log file ###########
######################################

    my $log_filename = $read_info_filepath . '.PIA_log.txt';
    open( my $log_filehandle, '>', $log_filename) or die "Cannot open $log_filename for writing: $!\n"; # This will overwrite old logs.
    use IO::Handle; # Enable autoflush.
    $log_filehandle -> autoflush(1); # Set autoflush to 1 for the log filehandle. This means that Perl won't buffer its output to the log, so the log will be updated in real time.
    print $log_filehandle "\n**** $read_info_filename ****\n\n";


######################################################
######### Make copies of the DBM index files #########
######################################################
    my $nodesfileDBM = 'Reference_files/nodes.dmp.dbm' . "_$read_info_filename";
    system ("cp Reference_files/nodes.dmp.dbm $nodesfileDBM");
    my %nodesfileDBM = ();

    my $namesfileDBM = 'Reference_files/names.dmp.dbm' . "_$read_info_filename";
    system ("cp Reference_files/names.dmp.dbm $namesfileDBM");
    my %namesfileDBM = ();


######################################################
###################### Run PIA #######################
######################################################
	
	my $output_directory_path = PIA($read_info_filepath, $read_info_filename, $read_info_parentpath, $sample_name, $blast_filepath, $cap, $min_coverage_perc); # PIA() returns the path to the main output file, which includes the output directory.
    

######################################################	
################## Summarise Data ####################
######################################################

    my $main_output_filepath = $output_directory_path . '/' . $read_info_filename . '.Full.txt'; # Recreate the main output file path.
    
##### Extract simple summary from the main output file
	my $summary_basic_filename = summary_basic($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath);
    
##### Extract read-by-read summary from the main output file
    my $summary_reads_filename = summary_reads($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath);


######################################################
##################### Tidy Up ########################
######################################################

##### Remove un-necessary files	
    unlink $namesfileDBM;
    unlink $nodesfileDBM;

# Finish the log and move it into the output directory.
    #print "\nThis run of PIA_inner.pl is finished.\n\n";
    print $log_filehandle "\n****This run of PIA_inner.pl is finished.****\n\n\n\n";
    close $log_filehandle;
    my $log_final_destination = $output_directory_path . '/' . $read_info_filename . '.PIA_inner_log.txt';
    system("mv $log_filename $log_final_destination");
    

######################################################	
######################################################
#################### SUBROUTINES #####################
######################################################
######################################################


sub find_taxonomic_intersect {
##### Find intersection of two taxa by comparing their routes down the phlyogenetic tree
	my ($first_route, $second_route) = @_; # Routes are the taxonomic IDs of successive parent taxa.
	my $intersect_ID = 0; # If either the first or second route aren't defined, return the intersect ID 0 (null).
	if (defined $first_route && defined $second_route){
		my @first_route = split (/\t/, $first_route);
		my @second_route = split (/\t/, $second_route);
		my $connected = 0; # The $connected flag activates when the first shared ID is stored as $intersect_ID and prevents it from being overwritten. Because the routes go from lower to higher taxa, higher shared IDs are ignored.
		foreach my $first_ID (@first_route) { # Start with each ID in the first route.
			foreach my $second_ID (@second_route) { # Take the IDs in the second route.
				if ($first_ID == $second_ID) { # If the IDs match,
					unless ($connected == 1) { # And if they're not yet flagged as connected,
						$connected = 1; # Flag them as connected.
						$intersect_ID = $first_ID; # $intersect_ID becomes this shared rank.
					} # If the routes are already flagged as connected, move on to the next ID in the second route. TO DO: stop processing when a connection is found?
				}
			}
		}
	}
	return $intersect_ID;	
}


sub PIA {
##### Run phylogenetic intersection analysis

	# Summary
	#--------
	# Retrieve the name and sequence length for each read.
    #
    # Look at the BLAST file line by line.
    #
    #   If a line is just another hit to the current read,
    #       Skip if we've already seen a hit to this taxon.
    #       Otherwise, list hits by E value.
    #
    #   If a line is the first hit to a new read (the top hit),
    #       First, finish processing the hits from the last read.
    #           Collapse multiple hits with the same E value to their intersection. So, in the end there's just one hit per E value.
    #           If a new "hits" is to an existing taxon, discard all but the hit with the highest E value.
    #           Calculate the taxonomic diversity score. This affects which reads make it to the Summary Basic and Summary Reads files.
    #           Find the intersection between the top and second-top hits. This is the taxon the read is assigned to.
    #           Find the intersection between the top and bottom hits. This is just for interest.
    #           Output all information for this read to the main output file.
    #       Then,
    #           If the new read isn't in our list, skip it.
    #           Check coverage of the top hit. If insufficient, skip the read.
    #           Note the Identities score and E value for the top hit.
    #
	# Return $output_directory_path.
    
	my ($read_info_filepath, $read_info_filename, $read_info_parentpath, $sample_name, $blast_filepath, $cap, $min_coverage_perc) = @_;
    
    my $output_directory_path = $read_info_filename . '.PIA_output'; # Make an output directory for this read_info file.
    if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
        $output_directory_path = $read_info_parentpath . '/' . $output_directory_path; 
    }
    
    mkdir $output_directory_path;
    my $main_output_filepath = $output_directory_path . '/' . $read_info_filename . '.Full.txt'; # This main output file will list information for reads that pass the coverage check.
  
    
	# Retrieve the name and sequence length for each read
	#----------------------------------------------------
	open (my $read_info_filehandle, $read_info_filepath) or die "Cannot open $read_info_filepath: !$ \n";
	my %read_names_and_lengths = (); # Keys are read names, values are sequence lengths.
	
	while (1) {
        my $line = readline ($read_info_filehandle);
        if (! defined $line) { last; } # If you've reached the end of the file, exit the loop.
        
        chomp $line;
        my @line = split ("\t", $line);
        $read_names_and_lengths{$line[0]} = $line[1]; # Element 0 is the name; element 1 is the length.
	}
	close $read_info_filehandle;
    
	my $read_count = scalar(keys %read_names_and_lengths);
    #print "\n$read_count reads to process.\n\n";
	print $log_filehandle "$read_count reads to process.\n\n";

    
	# Search the BLAST file for entries for those reads
	#--------------------------------------------------
    open (my $blast_filehandle, $blast_filepath) or die "Cannot open $blast_filepath: $!";
    open (my $main_output_filehandle, '>', $main_output_filepath) or die "Cannot open output file $main_output_filepath for writing: $!\n"; # Open the output file too, so we can output as we go. Saves a little memory.
    # Print the main output file a header:
    print $main_output_filehandle "Read_name\tCoverage\tTop_hit_name\tTop_hit_ID\tExpect_value\tIdentities_value\tNext_hit_name\tNext_hit_ID\tLast_hit_name\tLast_hit_ID\tPhylogenetic_range_name\tPhylogenetic_range_ID\tRaw_hit_count\tTaxonomic_diversity_(up_to_cap_if_met)\tTaxonomic_diversity_score\tPhylogenetic_intersection_name\tPhylogenetic_intersection_ID\n";

    my $current_read = 'none';
    my $current_read_number = '0'; # Interestingly, the first read will be labelled "1". This is only for humans to read though, so eh.
    my $skip_read = 0; # 1 for skip, 0 for continue.
    my $skip_rest_of_read = 0; # Same as $skip, but allows what data was collected from the read to be outputted.
    my $coverage; my $tophit_identities; my $tophit_e_value; # Calculated for each read but need to have wide scope.
    my %hit_taxa = (); # A list of the unique BLAST taxa. Used to check whether hits are to taxa that have already been seen.
    my %hit_e_values; # A list of the unique E values. Used to check whether hits share E values. E values with multiple hits are "averaged" into one "hit" by taking the intersection of the taxa involved.
    my $raw_hit_count; # A count of hits per read in the BLAST file.
    
    BLASTLINE: while (1) {  # Run this loop until "last" is called.
        
        if (!keys %read_names_and_lengths) { last BLASTLINE; } # If there aren't any read names left to find, stop looking in the BLAST file (note that it does look at one more line before stopping in order to finish off the last read).
        
        my $line = readline($blast_filehandle); # Otherwise, look at every line.
        if (! defined $line) { $line = "swan\tsong"; } # If there is no next line, give $line a stand-in string so we can just finish off the last read.
        my @line = split ("\t", $line);
        
        #======================================================================================================================================================================
        if ($line[0] ne $current_read) { # If this is the top hit for a new read,
            
            
            #------------------------------------------------------------------------------------------------------------------------------------------------------------------
            if (%hit_taxa) { # If there was a previous read, we must now have all of its hits stored in %hit_taxa. Finish processing them before starting the next read:
                
                # Find the intersection of any hits with the same score
                #------------------------------------------------------
                # For each entry in %hit_e_values, if the value contains more than one ID, find the intersection of those IDs and save it as the new value.
                foreach my $query_e_value (keys %hit_e_values) {
                    my @IDs_per_e_value = split ("\t", $hit_e_values{$query_e_value});
                    my $number_of_IDs = @IDs_per_e_value;
                    if ($number_of_IDs > 1) { # If there is more than one ID under this score,
                        my $zeroth_route = retrieve_taxonomic_structure($IDs_per_e_value[0], $nodesfileDBM);
                        my $first_route = retrieve_taxonomic_structure ($IDs_per_e_value[1], $nodesfileDBM);
                        my $intersection_ID = find_taxonomic_intersect ($zeroth_route, $first_route); # This is the initial intersection.

                        if ($number_of_IDs > 2) { # If there are also additional IDs, find their intersection with the initial one.
                            foreach my $next_ID (@IDs_per_e_value[2 .. $#IDs_per_e_value]) { # This is an array slice. We don't want to process elements 0-1 of @IDs_per_score again.
                                my $intersection_route = retrieve_taxonomic_structure($intersection_ID, $nodesfileDBM);
                                my $next_route = retrieve_taxonomic_structure ($next_ID, $nodesfileDBM);
                                my $intersection_next_temp = find_taxonomic_intersect($intersection_route, $next_route);
                                $intersection_ID = find_taxonomic_intersect($intersection_route, $next_route);  # The next intersection will always be at least as high as the current.
                            } 
                        }
                        
                        $hit_e_values{$query_e_value} = $intersection_ID;
                        
                    } # If there was only one ID, leave it alone.
                }
                
                # %hit_e_values now contains a list of unique E values paired with a single taxonomic ID. Each ID represents either a real BLAST hit or an 'average' for hits with the same score.
                # However, some of the IDs might now be repeated. If IDs are represented more than once, remove all but the hit with the best E value.
                my %hit_e_values_IDcheck = (); # Like %hit_e_values, but where keys are IDs and values are E values, instead of the other way around. Just used for checking.
                foreach my $query_e_value (sort keys %hit_e_values) {
                    my $ID = $hit_e_values{$query_e_value};
                    
                    if (exists $hit_e_values_IDcheck{$ID}) {
                        my $previous_e_value = $hit_e_values_IDcheck{$ID};
                        # We only want one hit per taxon, and that hit should have best E value available (remember that the E values are all unique).
                        if ($previous_e_value < $query_e_value) {
                            delete $hit_e_values{$query_e_value}; # If the previous hit E value is smaller than the current one, it has priority, so delete the current hit from %hit_e_values.
                        } else {
                            delete $hit_e_values{$previous_e_value}; # If the current hit E value is smaller than the previous one, it has priority, so delete the previous hit from %hit_e_values.
                        }
       
                    } else {
                        $hit_e_values_IDcheck{$ID} = $query_e_value; # If we haven't noted this ID yet, do so. Pair it with the current E value.
                    }
                    
                }
                
                # To recreate the list of BLAST hits, sort by E value (ascending; score is descending).  We don't need the E values themselves any more.
                my @hit_taxa_finished = ();
                foreach my $ID (sort {$a <=> $b} keys %hit_e_values) {
                    push (@hit_taxa_finished, $hit_e_values{$ID});
                }

                
                # Look up the more information about the finished hits using their IDs
                #---------------------------------------------------------------------
                my @all_hit_info;
                foreach my $finished_hit_ID (@hit_taxa_finished) {
                    my $finished_hit_name = retrieve_name ($finished_hit_ID, $namesfileDBM);
                    my $finished_hit_info = $finished_hit_ID . "\t" . $finished_hit_name;
                    push (@all_hit_info, $finished_hit_info); # Add the [ID\tname] for this finished BLAST hit to the end of @all_hit_info.
                }
                
                my $number_of_finished_blast_hits = @all_hit_info;
                if ($number_of_finished_blast_hits == 0) {
                    #print "\t\tNo hits identified for this read. Something might have gone wrong.\n";
                    print $log_filehandle "\t\tNo hits identified for this read. Something might have gone wrong.\n";
                    next BLASTENTRY; # Move on to the next read (for which there is a BLAST entry)
                }


                # Calculate the taxonomic diversity score
                #----------------------------------------
                my $taxonomic_diversity = keys %hit_taxa; # Note that this is based on the number of taxa in the BLAST hits, not the number of hits after filtering by E value.
                #print "\t\tNumber of taxa used to calculate taxdiv score: $taxonomic_diversity\n";
                my $tax_diversity_score = ($taxonomic_diversity/$cap) - (1/$cap); # Remember, $cap defaults to 100.
                
                my $contrastinghit_ID = 0; my $contrastinghit_name = 'NA'; # contrastinghit is second BLAST hit: number 1 if you're counting from 0. Default the values to null.
                
                if ($number_of_finished_blast_hits > 1) { # If there is more than one finished hit:
                        my @contrastinghit_info = split ("\t", $all_hit_info[1]);
                        $contrastinghit_ID = $contrastinghit_info[0]; # Fetch the contrasting hit ID and name from its info array.
                        $contrastinghit_name = $contrastinghit_info[1]; 
                } else {
                    #print "\t\tOnly one hit in final processed list. Cannot find intersection.\n";
                    print $log_filehandle "\t\tOnly one hit in final processed list. Cannot find intersection.\n";
                }

                
                # Find the intersection between the top and contrasting hits
                #-----------------------------------------------------------
                # This is the taxon the read will be assigned to.
                my $intersect_ID = 0; my $intersect_name = 'NA'; my $tophit_ID = 0; my $tophit_route = 0; # Default the intersect values to null. Note that routes are 
                my @tophit_info = split ("\t", $all_hit_info[0]); # tophit is the top BLAST hit. Unless the top hit wasn't identified, it's also the first BLAST taxon. We ignore unidentified hits.
        
                $tophit_ID = $tophit_info[0];
                if ($tophit_ID == 0) {
                    #print "\t\tCannot identify top hit: ID is 0. Cannot find intersection.\n";
                    print $log_filehandle "\t\tCannot identify top hit: ID is 0. Cannot find intersection.\n";
                }
                
                if ($number_of_finished_blast_hits > 1) { # If there is more than one BLAST hit:
                    unless ($tophit_ID == 0) { # The BLAST taxa might not all be the same, but if the top taxon has ID 0, a proper ID was never found and we can't calculate an intersect.
                        $tophit_route = retrieve_taxonomic_structure ($tophit_ID, $nodesfileDBM); # retrieve_taxonomic_structure() returns the route from this taxon down to the root.
                        my $contrastinghit_route = retrieve_taxonomic_structure ($contrastinghit_ID, $nodesfileDBM);
                        $intersect_ID = find_taxonomic_intersect ($tophit_route, $contrastinghit_route); # find_taxonomic_intersect() returns the lowest shared rank between the two routes. If there wasn't any shared taxon or if one or more routes were undefined, it returns an ID of 0.
                            
                        if ($intersect_ID == 0) {
                                $intersect_name = 'NA';
                        } else { # If there was an intersect, find its name.
                                $intersect_name = retrieve_name ($intersect_ID, $namesfileDBM);
                        }	
                    }
                }
                
         
                # Find the intersection between the top and bottom hits
                #------------------------------------------------------
                # We also find the intersection between the top hit and the bottom (within $cap): bottom_intersect. This isn't used in any calculations, but it is printed in the main output file and might be useful one day.
                my $tophit_name = 'NA'; my $tophit_rank = 'NA'; # Default to null.
                if ($tophit_info[1]) {
                    $tophit_name = $tophit_info[1];
                }  
                
                my $bottom_intersect_ID = 0; my $bottom_intersect_name = 'NA'; my $bottomhit_ID = 0; my $bottomhit_name = 'NA'; # Default to null values.
                
                if ($number_of_finished_blast_hits == 1) { # If there was only one hit in the end, the bottom hit is the same as the top hit, as is the intersect.
                    $bottom_intersect_ID = $tophit_ID; $bottom_intersect_name = $tophit_name;
                    $bottomhit_ID = $tophit_ID; $bottomhit_name = $tophit_name; 
                }

                if ($number_of_finished_blast_hits == 2) { # If there were only two hits in the end, the top intersect is the same as the intersect and the bottom hit is the same as the contrasting hit.
                    $bottom_intersect_ID = $intersect_ID; $bottom_intersect_name = $intersect_name;
                    $bottomhit_ID = $contrastinghit_ID; $bottomhit_name = $contrastinghit_name; 
                }
                
                if ($number_of_finished_blast_hits > 2) { # If there were more than two hits in the end, it gets more complicated.
                        my $bottomhit = pop @all_hit_info; # $bottom is the final BLAST hit after filtering. The least good BLAST match (within our standards).
                        my @bottomhit_info = split ("\t", $bottomhit);
                        $bottomhit_ID = $bottomhit_info[0];
                        $bottomhit_name = $bottomhit_info[1];
        
                        unless ($bottomhit_ID == 0) { # If the bottom hit doesn't have an ID, we can't calculate a top intersection.
                            my $bottomhit_route = retrieve_taxonomic_structure ($bottomhit_ID, $nodesfileDBM);
                            $bottom_intersect_ID = find_taxonomic_intersect ($tophit_route, $bottomhit_route);
                        }
                        
                        if ($bottom_intersect_ID == 0) {			
                            $bottom_intersect_name = 'NA';
                        } else {
                            $bottom_intersect_name = retrieve_name ($bottom_intersect_ID, $namesfileDBM);
                        }
                }
                 
                
                # Print all of this information to the main output file
                #------------------------------------------------------              
                if ($skip_read == 0) { # If $skip_read is 1, this will print mostly null values for a read that wasn't even in the original FASTA. Don't want that.
                    print $main_output_filehandle "$current_read\t$coverage\t$tophit_name\t$tophit_ID\t$tophit_e_value\t$tophit_identities\t$contrastinghit_name\t$contrastinghit_ID\t$bottomhit_name\t$bottomhit_ID\t$bottom_intersect_name\t$bottom_intersect_ID\t$raw_hit_count\t$taxonomic_diversity\t$tax_diversity_score\t$intersect_name\t$intersect_ID\n";
                }
            }
            
            if ($line eq "swan\tsong") { last BLASTLINE; } # If this was the stand-in line, exit the loop now. You've processed the whole file.
            
            delete $read_names_and_lengths{$current_read}; # Otherwise, delete the last read from the reads hash so there's fewer to check against next time.
            #------------------------------------------------------------------------------------------------------------------------------------------------------------------
            
            
            # Back to the new read.
            
            $skip_rest_of_read = 0; # Reset the rest-of-read skip.
            
            unless (exists $read_names_and_lengths{$line[0]}) { # If the qseqid (query sequence ID) for this line doesn't match a read name we're looking for, activate $skip_read.
                $current_read = $line[0]; # Define the current read.
                #print "\tSkipping read $current_read\n";
                $skip_read = 1;
                next BLASTLINE;
            }

            $current_read = $line[0]; # So, this read was in the input FASTA. Define it as the current read.
            
            $current_read_number ++;
            #print "\t$current_read_number of $read_count: $line[0]\n";
            print $log_filehandle "\t$current_read_number of $read_count: $line[0]\n";
            
            $raw_hit_count = 0; %hit_taxa = (); %hit_e_values = (); # Reset $raw_hit_count, %hit_taxa and %hit_e_values for the new read.
            
            # Check coverage of the top hit:
            $coverage = $line[3] / $read_names_and_lengths{$line[0]}; # Coverage = [match length] / [read length]
            my $min_coverage = $min_coverage_perc / 100;
            if ($coverage < $min_coverage) { # If the top BLAST hit doesn't have at least $min_coverage, activate $skip_read.
                #print "\t\tTop hit has insufficient coverage. Skipping.\n";
                print $log_filehandle "\t\tTop hit has insufficient coverage. Skipping.\n";
                print $main_output_filehandle "$current_read\t$coverage" . "\tNA" x 15 . "\n"; # Don't bother calculating the other information.
                
                $skip_read = 1;
                next BLASTLINE;
            } else { $skip_read = 0 }; # Otherwise, turn $skip_read off.
            
            $tophit_identities = $line[2]; # Note the Identities score.
            $tophit_e_value = $line[10]; # Note the E value. These will eventually be exported in the main output file.     
        }
        #======================================================================================================================================================================
        
        # For all lines until the next read:
        
        $raw_hit_count ++; # Even if we end up skipping this hit, add it to the raw hit count.
        
        if ($skip_read == 1 or $skip_rest_of_read == 1) { next BLASTLINE; }
        
        my $ID;
        chomp $line[-1]; # Remove the pesky newline. I'm assuming the final field is staxids.
        
        if (index ($line[-1], ';') != -1) { # If the field contains a ';', it contains multiple taxa.
            #print "\t\tAssigning hit $line[1] to intersection of associated taxa.\n";
            print $log_filehandle "\t\tAssigning hit $line[1] to intersection of associated taxa.\n";
            my @associated_IDs = split (';', $line[-1]); # If there are multiple associated IDs, assign the hit to their phylogenetic intersection. For that, we need routes.
            my $zeroth_route = retrieve_taxonomic_structure($associated_IDs[0], $nodesfileDBM);
            my $first_route = retrieve_taxonomic_structure ($associated_IDs[1], $nodesfileDBM);
            my $intersection_ID = find_taxonomic_intersect ($zeroth_route, $first_route); # This is the initial intersection.

            if ($associated_IDs[2]) { # If there are also additional IDs, find their intersection with the initial one.
                    foreach my $next_ID (@associated_IDs[2 .. $#associated_IDs]) { # This is an array slice. We don't want to process elements 0-1 of @IDs_per_score again.
                        my $intersection_route = retrieve_taxonomic_structure($intersection_ID, $nodesfileDBM);
                        my $next_route = retrieve_taxonomic_structure ($next_ID, $nodesfileDBM);
                        my $intersection_next_temp = find_taxonomic_intersect($intersection_route, $next_route);
                        $intersection_ID = find_taxonomic_intersect($intersection_route, $next_route);  # The next intersection will always be at least as high as the current.
                    } 
            }
            
            $ID = $intersection_ID;
            
        } else {
            $ID = $line[-1]; # If the reference seqeunce ID field is identical to the "associated taxa" field, the reference is only associated with the organism it came from. So just take that as the ID.
        }


        if (exists $hit_taxa{$ID} or $ID eq 'N/A' or $ID == 0) { next BLASTLINE; } # If we already have a hit from this organism, or if the ID for this hit is 'N/A' (some references aren't identified), or if the ID is 0 (tried and failed to find a phylogenetic intersection), skip this hit.

        my $number_of_taxa_before_this_hit = keys %hit_taxa;

        if ($number_of_taxa_before_this_hit == $cap) { # Once $cap taxa have been found, subsequent hits are skipped, because new hits will either be to existing taxa so skipped, or to new taxa that would be above $cap.
            #print "\t\tTaxon cap ($cap) reached. Ignoring subsequent hits.\n";
            print $log_filehandle "\t\tTaxon cap ($cap) reached. Ignoring subsequent hits.\n";
            $skip_rest_of_read = 1;
            next BLASTLINE;
        }
        
        $hit_taxa{$ID} = undef; # Otherwise, note the ID in %hit_taxa.
        
        my $e_value = $line[10]; # Note the E value (Perl recognises that something like 3.14e-10 is a number).
        if (exists $hit_e_values{$e_value}) {
            $hit_e_values{$e_value} = $hit_e_values{$e_value} . "\t" . $ID; # If other hits have had this E value, list this ID with them. We'll find their intersection at the end.
        } else {
            $hit_e_values{$e_value} = $ID; # If the E value is new, make a note.
        }
        
    }
    close $blast_filehandle;
    close $main_output_filehandle;

	return $output_directory_path; # Return the path to the output directory for use with other subroutines.
}


sub retrieve_name {
##### Use taxonomic ID to get name
	my ($query_ID, $namesfileDBM) = @_; # $query_ID is the taxonomic ID in question. We want to find its name.
	my $name = 'NA'; # Default to null.
    
    unless ($query_ID == 0) { # ID 0 is null. It's not in the names file.
        my %namesfileDBM = (); # Set up a fresh hash to hold the names DBM file.
        tie (%namesfileDBM, "DB_File", $namesfileDBM, O_RDONLY, 0666, $DB_BTREE) or die "Cannot open $namesfileDBM: $!\n";
        
        if (exists $namesfileDBM{$query_ID}) {
            $name = $namesfileDBM{$query_ID};
        }
    }
    untie %namesfileDBM;
	return $name; 
}


sub retrieve_taxonomic_structure {
##### Get hierarchy from nodes.dmp file
	my ($query_ID, $nodesfileDBM) = @_; # $query_ID is a non-0 taxonomic ID. $nodesfileDBM is the path to the nodes index file.

    my $route = undef; # Default the route to undef.
    
    unless ($query_ID == 0) { # I've had it before where $query_ID is "N/A".
        my $exit = 0; # When to exit the do loop below.
        my $next_level_ID; # The ID of the parent node.
        my @route = (); # @route is a list of tab-separated taxonomic IDs moving down from the current node.
        my %nodesfileDBM = (); # Set up a fresh hash to hold the nodes DBM file.
        tie (%nodesfileDBM, "DB_File", $nodesfileDBM, O_RDONLY, 0666, $DB_BTREE) or die "Cannot open $nodesfileDBM: $!\n";
        
        do {
            push (@route, $query_ID); # Add the current ID to @route.
            
            if ($query_ID == 1) { # If the current node has ID 1, it's the root. We have a route.
                $exit = 1;
            }
            
            if (exists $nodesfileDBM{$query_ID}) { # Extract the ID of the parent node from the nodes file.
                    my @node_info = split("\t", $nodesfileDBM{$query_ID});
                    $next_level_ID = $node_info[0];
            } else {
                #print "\t\tID $query_ID was not found in nodes file. Truncating route here.\n";
                print $log_filehandle "\t\tID $query_ID was not found in nodes file. Truncating route here.\n";
                $exit = 1;
            }
            $query_ID = $next_level_ID; # Move on to the current parent node.
            
        } until ($exit == 1);
        
        untie %nodesfileDBM;
        $route = join ("\t", @route);
    }
    return $route;
}


sub summary_basic {
#### Create simple summary of the output. The main output file is more informative, but this is what most people will take as the output.
	my ($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath) = @_;
    
	open (my $main_output_filehandle, $main_output_filepath) or die "Cannot open main output file $main_output_filepath for generating Summary Basic: $!\n";			
	readline($main_output_filehandle); # Skip the header line.
    my %taxon_count = (); # Keys are taxon names and IDs in the format "name\tID". Values are the number of times that taxon occurs.
    
    # Get a list of phylogenetic intersections where the taxonomic diversity score was at least $min_taxdiv_score.
    while (1) {
        my $line = readline($main_output_filehandle);
        if (! defined $line) { last; } 
        
        chomp $line;
        my @line = split("\t", $line);
        my $tax_diversity_score = $line[14]; # Taxonomic diversity score is in the 14th column.
        
        if ( ($tax_diversity_score ne 'NA') && ($tax_diversity_score >= $min_taxdiv_score) && ($line[16] != 0) ) { # If the taxonomic diversity score is both not NA and also at least the minimum, and a phylogenetic intersection ID was found:
            my $taxon_ID_and_name = "$line[16]\t$line[15]"; # Join the ID and name with a tab.
                    
            # Add to the hash:
            if (exists $taxon_count{$taxon_ID_and_name}) {
                $taxon_count{$taxon_ID_and_name} = $taxon_count{$taxon_ID_and_name} + 1;
            } else {
                    $taxon_count{$taxon_ID_and_name} = 1;
            }
        }
	}
    close $main_output_filehandle;

    my $summary_basic_filepath = $output_directory_path . '/' . $read_info_filename . '.Summary_Basic.txt'; # This simply lists output taxa and their counts.
    
	open (my $summary_basic_filehandle, '>', $summary_basic_filepath) or die "Cannot write Summary Basic file: $!\n";
	print $summary_basic_filehandle "# Read info file:\t$read_info_filename\n"; # Output the read_info file this was built from as a header. This is only temporary.

    foreach my $taxon_ID_and_name (keys %taxon_count) {
            print $summary_basic_filehandle "$taxon_ID_and_name\t$taxon_count{$taxon_ID_and_name}\n";
    }

	close $summary_basic_filehandle;
	return $summary_basic_filename;
}


sub summary_reads {
#### Create read-by-read summary of output. This lists the names of reads that passed and what taxon each one was assigned to.
	my ($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath) = @_;

    my $summary_reads_filepath = $output_directory_path . '/' . $read_info_filename . '.Summary_Reads.txt'; # This will list the assignment for each read that had one.
    open (my $summary_reads_filehandle, '>', $summary_reads_filepath) or die "Cannot write Summary Reads file: $!\n"; # We can print this output as we go.
    print $summary_reads_filehandle "# Read info file:\t$read_info_filename\n"; # Output the read_info file this was built from as a header. This is only temporary.
    
    open (my $main_output_filehandle, $main_output_filepath) or die "Cannot open main output file $main_output_filepath for generating Summary Reads: $!\n";
    readline($main_output_filehandle); # Skip the header line.
    
    # Get a list of phylogenetic intersections where the taxonomic diversity score was at least $min_taxdiv_score.
	while (1) {
        my $line = readline($main_output_filehandle);
        if (! defined $line) { last; }
        
        chomp $line;
        my @line = split("\t", $line);
        my $tax_diversity_score = $line[14]; # Taxonomic diversity score is in the 14th column.
        
        if ( ($tax_diversity_score ne 'NA') && ($tax_diversity_score >= $min_taxdiv_score) && ($line[16] != 0) ) { # If the taxonomic diversity score is both not NA and also at least the minimum, and a phylogenetic intersection ID was found:
            print $summary_reads_filehandle "$line[0]\t$line[16]\t$line[15]\n"; # Print the read name and phylogenetic intersection name and ID the hash.
        }
	}
    close $main_output_filehandle;
    close $summary_reads_filehandle;
    
	return $summary_reads_filename;
}