-
Notifications
You must be signed in to change notification settings - Fork 0
/
PIA_inner.pl
693 lines (537 loc) · 40.5 KB
/
PIA_inner.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
#!/usr/bin/perl
######################################################
##################### PIA_inner.pl ###################
######################################################
######### Phylogenetic Intersection Analysis ########
############## Robin Allaby, UoW 2013 ################
######################################################
############## Version 5.6, 2021-05-07 ###############
######################################################
# Edited by Roselyn Ware, UoW 2015
# Further edited by Becky Cribdon, UoW 2019
# A method of metagenomic phylogenetic assignation, designed to be robust to partial representation of organisms in the database
# Please report any problems to r.cribdon@warwick.ac.uk.
# This usually runs inside PIA.pl. To run independently, PIA.pl must already have made a read_info file containing the name and length of every sequence in the FASTA.
# Then run PIA_inner.pl giving the read_info file as -r and a BLAST file as -b.
# > perl PIA_inner.pl -r [read_info file] -b [BLAST file]
use strict;
use warnings;
use Getopt::Std;
use DB_File;
use Fcntl;
#use Data::Dumper qw(Dumper); # Only used for testing
######################################################
########### Check arguments and Input Data ###########
######################################################
##### Get arguments from command line #####
my %options=();
getopts('h:b:c:C:m:s:r:', \%options);
##### Display help text and exit if -h flag called #####
if ($options{h}){
print "Usage: perl PIA.pl -r [read_info file] -b [BLAST file] [other options]
Main Arguments
Option Description Input Explanation
-r read_info filename Y Read information file generated by PIA.pl. Read names and sequence lengths from the input FASTA.
-b BLAST filename Y BLAST filename containing entries for all reads in the FASTA (can contain other entries too).
Optional
Option Description Input Explanation
-c cap Optional Maximum unique BLAST taxa examined. Impacts taxonomic diversity score. Default is 100.
-C min % coverage Optional Minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
-h help N Print this help text.
-s min diversity score Optional Minimum taxonomic diversity score for a read to make it to Summary_Basic.txt and Summary_Reads.txt. Depends on cap. Default is 0.1.
";
exit;
}
##### Check read_info filename #####
# The read_info file is the read names and sequence lengths extracted from a FASTA file by PIA.pl.
my $read_info_filepath = '';
if ($options{r}) {
$read_info_filepath = $options{r};
} else {
print "\nSpecify a read information file (produced by PIA.pl) with -r.\n\n";
exit;
}
##### Derive other useful names #####
my $read_info_filename = $read_info_filepath; # Default the name to the path; assume there isn't a path.
my $read_info_parentpath = ''; # Default the path to empty.
if (index($read_info_filepath, '/') != -1) { # If there was in fact a path:
my @read_info_filepath_elements = split('/', $read_info_filepath); # Split $read_info_filepath on "/" symbols to separate elements of the path.
$read_info_filename = pop @read_info_filepath_elements; # Separate the read_info file name.
$read_info_parentpath = join('/', @read_info_filepath_elements); # Put the rest of the path back together.
}
my @read_info_filename_elements = split('\.', $read_info_filename);
my $sample_name = join('.', @read_info_filename_elements[0..(scalar(@read_info_filename_elements)-3)]); # The sample name is $read_info_filename minus 'read_info' and whatever FASTA
##### Check BLAST filename #####
my $blast_filepath = '';
if ($options{b}) {
$blast_filepath = $options{b};
} else {
print "\nSpecify a BLAST file with -b.\n\n";
exit;
}
##### Check names.dmp.dbm and nodes.dmp.dbm file paths #####
unless (-e 'Reference_files/names.dmp.dbm') {
print "\nCannot find names DBM at Reference_files/names.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
exit;
}
unless (-e 'Reference_files/nodes.dmp.dbm') {
print "\nCannot find nodes DBM at Reference_files/nodes.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
exit;
}
##### See if cap input #####
# BLAST hits are all assigned to taxa. $cap is the maximum number of taxa to be looked at. If $cap is hit, no more BLAST hits will be considered. $cap is used to calculate the taxonomic diversity score, so affects the minimum score threshold (option s). Default is 100.
my $cap = 100;
if ($options{c}) { $cap = $options{c}; } # If there is an option, overwrite the default.
print "\nMax taxa to consider: $cap\n";
##### See if min % coverage input #####
# The minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
my $min_coverage_perc = 95;
if ($options{C}) { $min_coverage_perc = $options{C}; } # If there is an option, overwrite the default.
print "Min coverage %: $min_coverage_perc\n";
##### See if min taxonomic diversity score input #####
# The minimum taxonomic diversity score a read must have to make it to Summary_Basic.txt. Depends on $cap. Defaults to 0.1.
my $min_taxdiv_score = 0.1;
if ($options{s}) { $min_taxdiv_score = $options{s}; } # If there is an option, overwrite the default.
print "Min taxonomic diversity score: $min_taxdiv_score\n\n";
######################################
########### Start log file ###########
######################################
my $log_filename = $read_info_filepath . '.PIA_log.txt';
open( my $log_filehandle, '>', $log_filename) or die "Cannot open $log_filename for writing: $!\n"; # This will overwrite old logs.
use IO::Handle; # Enable autoflush.
$log_filehandle -> autoflush(1); # Set autoflush to 1 for the log filehandle. This means that Perl won't buffer its output to the log, so the log will be updated in real time.
print $log_filehandle "\n**** $read_info_filename ****\n\n";
######################################################
######### Make copies of the DBM index files #########
######################################################
my $nodesfileDBM = 'Reference_files/nodes.dmp.dbm' . "_$read_info_filename";
system ("cp Reference_files/nodes.dmp.dbm $nodesfileDBM");
my %nodesfileDBM = ();
my $namesfileDBM = 'Reference_files/names.dmp.dbm' . "_$read_info_filename";
system ("cp Reference_files/names.dmp.dbm $namesfileDBM");
my %namesfileDBM = ();
######################################################
###################### Run PIA #######################
######################################################
my $output_directory_path = PIA($read_info_filepath, $read_info_filename, $read_info_parentpath, $sample_name, $blast_filepath, $cap, $min_coverage_perc); # PIA() returns the path to the main output file, which includes the output directory.
######################################################
################## Summarise Data ####################
######################################################
my $main_output_filepath = $output_directory_path . '/' . $read_info_filename . '.Full.txt'; # Recreate the main output file path.
##### Extract simple summary from the main output file
my $summary_basic_filename = summary_basic($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath);
##### Extract read-by-read summary from the main output file
my $summary_reads_filename = summary_reads($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath);
######################################################
##################### Tidy Up ########################
######################################################
##### Remove un-necessary files
unlink $namesfileDBM;
unlink $nodesfileDBM;
# Finish the log and move it into the output directory.
#print "\nThis run of PIA_inner.pl is finished.\n\n";
print $log_filehandle "\n****This run of PIA_inner.pl is finished.****\n\n\n\n";
close $log_filehandle;
my $log_final_destination = $output_directory_path . '/' . $read_info_filename . '.PIA_inner_log.txt';
system("mv $log_filename $log_final_destination");
######################################################
######################################################
#################### SUBROUTINES #####################
######################################################
######################################################
sub find_taxonomic_intersect {
##### Find intersection of two taxa by comparing their routes down the phlyogenetic tree
my ($first_route, $second_route) = @_; # Routes are the taxonomic IDs of successive parent taxa.
my $intersect_ID = 0; # If either the first or second route aren't defined, return the intersect ID 0 (null).
if (defined $first_route && defined $second_route){
my @first_route = split (/\t/, $first_route);
my @second_route = split (/\t/, $second_route);
my $connected = 0; # The $connected flag activates when the first shared ID is stored as $intersect_ID and prevents it from being overwritten. Because the routes go from lower to higher taxa, higher shared IDs are ignored.
foreach my $first_ID (@first_route) { # Start with each ID in the first route.
foreach my $second_ID (@second_route) { # Take the IDs in the second route.
if ($first_ID == $second_ID) { # If the IDs match,
unless ($connected == 1) { # And if they're not yet flagged as connected,
$connected = 1; # Flag them as connected.
$intersect_ID = $first_ID; # $intersect_ID becomes this shared rank.
} # If the routes are already flagged as connected, move on to the next ID in the second route. TO DO: stop processing when a connection is found?
}
}
}
}
return $intersect_ID;
}
sub PIA {
##### Run phylogenetic intersection analysis
# Summary
#--------
# Retrieve the name and sequence length for each read.
#
# Look at the BLAST file line by line.
#
# If a line is just another hit to the current read,
# Skip if we've already seen a hit to this taxon.
# Otherwise, list hits by E value.
#
# If a line is the first hit to a new read (the top hit),
# First, finish processing the hits from the last read.
# Collapse multiple hits with the same E value to their intersection. So, in the end there's just one hit per E value.
# If a new "hits" is to an existing taxon, discard all but the hit with the highest E value.
# Calculate the taxonomic diversity score. This affects which reads make it to the Summary Basic and Summary Reads files.
# Find the intersection between the top and second-top hits. This is the taxon the read is assigned to.
# Find the intersection between the top and bottom hits. This is just for interest.
# Output all information for this read to the main output file.
# Then,
# If the new read isn't in our list, skip it.
# Check coverage of the top hit. If insufficient, skip the read.
# Note the Identities score and E value for the top hit.
#
# Return $output_directory_path.
my ($read_info_filepath, $read_info_filename, $read_info_parentpath, $sample_name, $blast_filepath, $cap, $min_coverage_perc) = @_;
my $output_directory_path = $read_info_filename . '.PIA_output'; # Make an output directory for this read_info file.
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$output_directory_path = $read_info_parentpath . '/' . $output_directory_path;
}
mkdir $output_directory_path;
my $main_output_filepath = $output_directory_path . '/' . $read_info_filename . '.Full.txt'; # This main output file will list information for reads that pass the coverage check.
# Retrieve the name and sequence length for each read
#----------------------------------------------------
open (my $read_info_filehandle, $read_info_filepath) or die "Cannot open $read_info_filepath: !$ \n";
my %read_names_and_lengths = (); # Keys are read names, values are sequence lengths.
while (1) {
my $line = readline ($read_info_filehandle);
if (! defined $line) { last; } # If you've reached the end of the file, exit the loop.
chomp $line;
my @line = split ("\t", $line);
$read_names_and_lengths{$line[0]} = $line[1]; # Element 0 is the name; element 1 is the length.
}
close $read_info_filehandle;
my $read_count = scalar(keys %read_names_and_lengths);
#print "\n$read_count reads to process.\n\n";
print $log_filehandle "$read_count reads to process.\n\n";
# Search the BLAST file for entries for those reads
#--------------------------------------------------
open (my $blast_filehandle, $blast_filepath) or die "Cannot open $blast_filepath: $!";
open (my $main_output_filehandle, '>', $main_output_filepath) or die "Cannot open output file $main_output_filepath for writing: $!\n"; # Open the output file too, so we can output as we go. Saves a little memory.
# Print the main output file a header:
print $main_output_filehandle "Read_name\tCoverage\tTop_hit_name\tTop_hit_ID\tExpect_value\tIdentities_value\tNext_hit_name\tNext_hit_ID\tLast_hit_name\tLast_hit_ID\tPhylogenetic_range_name\tPhylogenetic_range_ID\tRaw_hit_count\tTaxonomic_diversity_(up_to_cap_if_met)\tTaxonomic_diversity_score\tPhylogenetic_intersection_name\tPhylogenetic_intersection_ID\n";
my $current_read = 'none';
my $current_read_number = '0'; # Interestingly, the first read will be labelled "1". This is only for humans to read though, so eh.
my $skip_read = 0; # 1 for skip, 0 for continue.
my $skip_rest_of_read = 0; # Same as $skip, but allows what data was collected from the read to be outputted.
my $coverage; my $tophit_identities; my $tophit_e_value; # Calculated for each read but need to have wide scope.
my %hit_taxa = (); # A list of the unique BLAST taxa. Used to check whether hits are to taxa that have already been seen.
my %hit_e_values; # A list of the unique E values. Used to check whether hits share E values. E values with multiple hits are "averaged" into one "hit" by taking the intersection of the taxa involved.
my $raw_hit_count; # A count of hits per read in the BLAST file.
BLASTLINE: while (1) { # Run this loop until "last" is called.
if (!keys %read_names_and_lengths) { last BLASTLINE; } # If there aren't any read names left to find, stop looking in the BLAST file (note that it does look at one more line before stopping in order to finish off the last read).
my $line = readline($blast_filehandle); # Otherwise, look at every line.
if (! defined $line) { $line = "swan\tsong"; } # If there is no next line, give $line a stand-in string so we can just finish off the last read.
my @line = split ("\t", $line);
#======================================================================================================================================================================
if ($line[0] ne $current_read) { # If this is the top hit for a new read,
#------------------------------------------------------------------------------------------------------------------------------------------------------------------
if (%hit_taxa) { # If there was a previous read, we must now have all of its hits stored in %hit_taxa. Finish processing them before starting the next read:
# Find the intersection of any hits with the same score
#------------------------------------------------------
# For each entry in %hit_e_values, if the value contains more than one ID, find the intersection of those IDs and save it as the new value.
foreach my $query_e_value (keys %hit_e_values) {
my @IDs_per_e_value = split ("\t", $hit_e_values{$query_e_value});
my $number_of_IDs = @IDs_per_e_value;
if ($number_of_IDs > 1) { # If there is more than one ID under this score,
my $zeroth_route = retrieve_taxonomic_structure($IDs_per_e_value[0], $nodesfileDBM);
my $first_route = retrieve_taxonomic_structure ($IDs_per_e_value[1], $nodesfileDBM);
my $intersection_ID = find_taxonomic_intersect ($zeroth_route, $first_route); # This is the initial intersection.
if ($number_of_IDs > 2) { # If there are also additional IDs, find their intersection with the initial one.
foreach my $next_ID (@IDs_per_e_value[2 .. $#IDs_per_e_value]) { # This is an array slice. We don't want to process elements 0-1 of @IDs_per_score again.
my $intersection_route = retrieve_taxonomic_structure($intersection_ID, $nodesfileDBM);
my $next_route = retrieve_taxonomic_structure ($next_ID, $nodesfileDBM);
my $intersection_next_temp = find_taxonomic_intersect($intersection_route, $next_route);
$intersection_ID = find_taxonomic_intersect($intersection_route, $next_route); # The next intersection will always be at least as high as the current.
}
}
$hit_e_values{$query_e_value} = $intersection_ID;
} # If there was only one ID, leave it alone.
}
# %hit_e_values now contains a list of unique E values paired with a single taxonomic ID. Each ID represents either a real BLAST hit or an 'average' for hits with the same score.
# However, some of the IDs might now be repeated. If IDs are represented more than once, remove all but the hit with the best E value.
my %hit_e_values_IDcheck = (); # Like %hit_e_values, but where keys are IDs and values are E values, instead of the other way around. Just used for checking.
foreach my $query_e_value (sort keys %hit_e_values) {
my $ID = $hit_e_values{$query_e_value};
if (exists $hit_e_values_IDcheck{$ID}) {
my $previous_e_value = $hit_e_values_IDcheck{$ID};
# We only want one hit per taxon, and that hit should have best E value available (remember that the E values are all unique).
if ($previous_e_value < $query_e_value) {
delete $hit_e_values{$query_e_value}; # If the previous hit E value is smaller than the current one, it has priority, so delete the current hit from %hit_e_values.
} else {
delete $hit_e_values{$previous_e_value}; # If the current hit E value is smaller than the previous one, it has priority, so delete the previous hit from %hit_e_values.
}
} else {
$hit_e_values_IDcheck{$ID} = $query_e_value; # If we haven't noted this ID yet, do so. Pair it with the current E value.
}
}
# To recreate the list of BLAST hits, sort by E value (ascending; score is descending). We don't need the E values themselves any more.
my @hit_taxa_finished = ();
foreach my $ID (sort {$a <=> $b} keys %hit_e_values) {
push (@hit_taxa_finished, $hit_e_values{$ID});
}
# Look up the more information about the finished hits using their IDs
#---------------------------------------------------------------------
my @all_hit_info;
foreach my $finished_hit_ID (@hit_taxa_finished) {
my $finished_hit_name = retrieve_name ($finished_hit_ID, $namesfileDBM);
my $finished_hit_info = $finished_hit_ID . "\t" . $finished_hit_name;
push (@all_hit_info, $finished_hit_info); # Add the [ID\tname] for this finished BLAST hit to the end of @all_hit_info.
}
my $number_of_finished_blast_hits = @all_hit_info;
if ($number_of_finished_blast_hits == 0) {
#print "\t\tNo hits identified for this read. Something might have gone wrong.\n";
print $log_filehandle "\t\tNo hits identified for this read. Something might have gone wrong.\n";
next BLASTENTRY; # Move on to the next read (for which there is a BLAST entry)
}
# Calculate the taxonomic diversity score
#----------------------------------------
my $taxonomic_diversity = keys %hit_taxa; # Note that this is based on the number of taxa in the BLAST hits, not the number of hits after filtering by E value.
#print "\t\tNumber of taxa used to calculate taxdiv score: $taxonomic_diversity\n";
my $tax_diversity_score = ($taxonomic_diversity/$cap) - (1/$cap); # Remember, $cap defaults to 100.
my $contrastinghit_ID = 0; my $contrastinghit_name = 'NA'; # contrastinghit is second BLAST hit: number 1 if you're counting from 0. Default the values to null.
if ($number_of_finished_blast_hits > 1) { # If there is more than one finished hit:
my @contrastinghit_info = split ("\t", $all_hit_info[1]);
$contrastinghit_ID = $contrastinghit_info[0]; # Fetch the contrasting hit ID and name from its info array.
$contrastinghit_name = $contrastinghit_info[1];
} else {
#print "\t\tOnly one hit in final processed list. Cannot find intersection.\n";
print $log_filehandle "\t\tOnly one hit in final processed list. Cannot find intersection.\n";
}
# Find the intersection between the top and contrasting hits
#-----------------------------------------------------------
# This is the taxon the read will be assigned to.
my $intersect_ID = 0; my $intersect_name = 'NA'; my $tophit_ID = 0; my $tophit_route = 0; # Default the intersect values to null. Note that routes are
my @tophit_info = split ("\t", $all_hit_info[0]); # tophit is the top BLAST hit. Unless the top hit wasn't identified, it's also the first BLAST taxon. We ignore unidentified hits.
$tophit_ID = $tophit_info[0];
if ($tophit_ID == 0) {
#print "\t\tCannot identify top hit: ID is 0. Cannot find intersection.\n";
print $log_filehandle "\t\tCannot identify top hit: ID is 0. Cannot find intersection.\n";
}
if ($number_of_finished_blast_hits > 1) { # If there is more than one BLAST hit:
unless ($tophit_ID == 0) { # The BLAST taxa might not all be the same, but if the top taxon has ID 0, a proper ID was never found and we can't calculate an intersect.
$tophit_route = retrieve_taxonomic_structure ($tophit_ID, $nodesfileDBM); # retrieve_taxonomic_structure() returns the route from this taxon down to the root.
my $contrastinghit_route = retrieve_taxonomic_structure ($contrastinghit_ID, $nodesfileDBM);
$intersect_ID = find_taxonomic_intersect ($tophit_route, $contrastinghit_route); # find_taxonomic_intersect() returns the lowest shared rank between the two routes. If there wasn't any shared taxon or if one or more routes were undefined, it returns an ID of 0.
if ($intersect_ID == 0) {
$intersect_name = 'NA';
} else { # If there was an intersect, find its name.
$intersect_name = retrieve_name ($intersect_ID, $namesfileDBM);
}
}
}
# Find the intersection between the top and bottom hits
#------------------------------------------------------
# We also find the intersection between the top hit and the bottom (within $cap): bottom_intersect. This isn't used in any calculations, but it is printed in the main output file and might be useful one day.
my $tophit_name = 'NA'; my $tophit_rank = 'NA'; # Default to null.
if ($tophit_info[1]) {
$tophit_name = $tophit_info[1];
}
my $bottom_intersect_ID = 0; my $bottom_intersect_name = 'NA'; my $bottomhit_ID = 0; my $bottomhit_name = 'NA'; # Default to null values.
if ($number_of_finished_blast_hits == 1) { # If there was only one hit in the end, the bottom hit is the same as the top hit, as is the intersect.
$bottom_intersect_ID = $tophit_ID; $bottom_intersect_name = $tophit_name;
$bottomhit_ID = $tophit_ID; $bottomhit_name = $tophit_name;
}
if ($number_of_finished_blast_hits == 2) { # If there were only two hits in the end, the top intersect is the same as the intersect and the bottom hit is the same as the contrasting hit.
$bottom_intersect_ID = $intersect_ID; $bottom_intersect_name = $intersect_name;
$bottomhit_ID = $contrastinghit_ID; $bottomhit_name = $contrastinghit_name;
}
if ($number_of_finished_blast_hits > 2) { # If there were more than two hits in the end, it gets more complicated.
my $bottomhit = pop @all_hit_info; # $bottom is the final BLAST hit after filtering. The least good BLAST match (within our standards).
my @bottomhit_info = split ("\t", $bottomhit);
$bottomhit_ID = $bottomhit_info[0];
$bottomhit_name = $bottomhit_info[1];
unless ($bottomhit_ID == 0) { # If the bottom hit doesn't have an ID, we can't calculate a top intersection.
my $bottomhit_route = retrieve_taxonomic_structure ($bottomhit_ID, $nodesfileDBM);
$bottom_intersect_ID = find_taxonomic_intersect ($tophit_route, $bottomhit_route);
}
if ($bottom_intersect_ID == 0) {
$bottom_intersect_name = 'NA';
} else {
$bottom_intersect_name = retrieve_name ($bottom_intersect_ID, $namesfileDBM);
}
}
# Print all of this information to the main output file
#------------------------------------------------------
if ($skip_read == 0) { # If $skip_read is 1, this will print mostly null values for a read that wasn't even in the original FASTA. Don't want that.
print $main_output_filehandle "$current_read\t$coverage\t$tophit_name\t$tophit_ID\t$tophit_e_value\t$tophit_identities\t$contrastinghit_name\t$contrastinghit_ID\t$bottomhit_name\t$bottomhit_ID\t$bottom_intersect_name\t$bottom_intersect_ID\t$raw_hit_count\t$taxonomic_diversity\t$tax_diversity_score\t$intersect_name\t$intersect_ID\n";
}
}
if ($line eq "swan\tsong") { last BLASTLINE; } # If this was the stand-in line, exit the loop now. You've processed the whole file.
delete $read_names_and_lengths{$current_read}; # Otherwise, delete the last read from the reads hash so there's fewer to check against next time.
#------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Back to the new read.
$skip_rest_of_read = 0; # Reset the rest-of-read skip.
unless (exists $read_names_and_lengths{$line[0]}) { # If the qseqid (query sequence ID) for this line doesn't match a read name we're looking for, activate $skip_read.
$current_read = $line[0]; # Define the current read.
#print "\tSkipping read $current_read\n";
$skip_read = 1;
next BLASTLINE;
}
$current_read = $line[0]; # So, this read was in the input FASTA. Define it as the current read.
$current_read_number ++;
#print "\t$current_read_number of $read_count: $line[0]\n";
print $log_filehandle "\t$current_read_number of $read_count: $line[0]\n";
$raw_hit_count = 0; %hit_taxa = (); %hit_e_values = (); # Reset $raw_hit_count, %hit_taxa and %hit_e_values for the new read.
# Check coverage of the top hit:
$coverage = $line[3] / $read_names_and_lengths{$line[0]}; # Coverage = [match length] / [read length]
my $min_coverage = $min_coverage_perc / 100;
if ($coverage < $min_coverage) { # If the top BLAST hit doesn't have at least $min_coverage, activate $skip_read.
#print "\t\tTop hit has insufficient coverage. Skipping.\n";
print $log_filehandle "\t\tTop hit has insufficient coverage. Skipping.\n";
print $main_output_filehandle "$current_read\t$coverage" . "\tNA" x 15 . "\n"; # Don't bother calculating the other information.
$skip_read = 1;
next BLASTLINE;
} else { $skip_read = 0 }; # Otherwise, turn $skip_read off.
$tophit_identities = $line[2]; # Note the Identities score.
$tophit_e_value = $line[10]; # Note the E value. These will eventually be exported in the main output file.
}
#======================================================================================================================================================================
# For all lines until the next read:
$raw_hit_count ++; # Even if we end up skipping this hit, add it to the raw hit count.
if ($skip_read == 1 or $skip_rest_of_read == 1) { next BLASTLINE; }
my $ID;
chomp $line[-1]; # Remove the pesky newline. I'm assuming the final field is staxids.
if (index ($line[-1], ';') != -1) { # If the field contains a ';', it contains multiple taxa.
#print "\t\tAssigning hit $line[1] to intersection of associated taxa.\n";
print $log_filehandle "\t\tAssigning hit $line[1] to intersection of associated taxa.\n";
my @associated_IDs = split (';', $line[-1]); # If there are multiple associated IDs, assign the hit to their phylogenetic intersection. For that, we need routes.
my $zeroth_route = retrieve_taxonomic_structure($associated_IDs[0], $nodesfileDBM);
my $first_route = retrieve_taxonomic_structure ($associated_IDs[1], $nodesfileDBM);
my $intersection_ID = find_taxonomic_intersect ($zeroth_route, $first_route); # This is the initial intersection.
if ($associated_IDs[2]) { # If there are also additional IDs, find their intersection with the initial one.
foreach my $next_ID (@associated_IDs[2 .. $#associated_IDs]) { # This is an array slice. We don't want to process elements 0-1 of @IDs_per_score again.
my $intersection_route = retrieve_taxonomic_structure($intersection_ID, $nodesfileDBM);
my $next_route = retrieve_taxonomic_structure ($next_ID, $nodesfileDBM);
my $intersection_next_temp = find_taxonomic_intersect($intersection_route, $next_route);
$intersection_ID = find_taxonomic_intersect($intersection_route, $next_route); # The next intersection will always be at least as high as the current.
}
}
$ID = $intersection_ID;
} else {
$ID = $line[-1]; # If the reference seqeunce ID field is identical to the "associated taxa" field, the reference is only associated with the organism it came from. So just take that as the ID.
}
if (exists $hit_taxa{$ID} or $ID eq 'N/A' or $ID == 0) { next BLASTLINE; } # If we already have a hit from this organism, or if the ID for this hit is 'N/A' (some references aren't identified), or if the ID is 0 (tried and failed to find a phylogenetic intersection), skip this hit.
my $number_of_taxa_before_this_hit = keys %hit_taxa;
if ($number_of_taxa_before_this_hit == $cap) { # Once $cap taxa have been found, subsequent hits are skipped, because new hits will either be to existing taxa so skipped, or to new taxa that would be above $cap.
#print "\t\tTaxon cap ($cap) reached. Ignoring subsequent hits.\n";
print $log_filehandle "\t\tTaxon cap ($cap) reached. Ignoring subsequent hits.\n";
$skip_rest_of_read = 1;
next BLASTLINE;
}
$hit_taxa{$ID} = undef; # Otherwise, note the ID in %hit_taxa.
my $e_value = $line[10]; # Note the E value (Perl recognises that something like 3.14e-10 is a number).
if (exists $hit_e_values{$e_value}) {
$hit_e_values{$e_value} = $hit_e_values{$e_value} . "\t" . $ID; # If other hits have had this E value, list this ID with them. We'll find their intersection at the end.
} else {
$hit_e_values{$e_value} = $ID; # If the E value is new, make a note.
}
}
close $blast_filehandle;
close $main_output_filehandle;
return $output_directory_path; # Return the path to the output directory for use with other subroutines.
}
sub retrieve_name {
##### Use taxonomic ID to get name
my ($query_ID, $namesfileDBM) = @_; # $query_ID is the taxonomic ID in question. We want to find its name.
my $name = 'NA'; # Default to null.
unless ($query_ID == 0) { # ID 0 is null. It's not in the names file.
my %namesfileDBM = (); # Set up a fresh hash to hold the names DBM file.
tie (%namesfileDBM, "DB_File", $namesfileDBM, O_RDONLY, 0666, $DB_BTREE) or die "Cannot open $namesfileDBM: $!\n";
if (exists $namesfileDBM{$query_ID}) {
$name = $namesfileDBM{$query_ID};
}
}
untie %namesfileDBM;
return $name;
}
sub retrieve_taxonomic_structure {
##### Get hierarchy from nodes.dmp file
my ($query_ID, $nodesfileDBM) = @_; # $query_ID is a non-0 taxonomic ID. $nodesfileDBM is the path to the nodes index file.
my $route = undef; # Default the route to undef.
unless ($query_ID == 0) { # I've had it before where $query_ID is "N/A".
my $exit = 0; # When to exit the do loop below.
my $next_level_ID; # The ID of the parent node.
my @route = (); # @route is a list of tab-separated taxonomic IDs moving down from the current node.
my %nodesfileDBM = (); # Set up a fresh hash to hold the nodes DBM file.
tie (%nodesfileDBM, "DB_File", $nodesfileDBM, O_RDONLY, 0666, $DB_BTREE) or die "Cannot open $nodesfileDBM: $!\n";
do {
push (@route, $query_ID); # Add the current ID to @route.
if ($query_ID == 1) { # If the current node has ID 1, it's the root. We have a route.
$exit = 1;
}
if (exists $nodesfileDBM{$query_ID}) { # Extract the ID of the parent node from the nodes file.
my @node_info = split("\t", $nodesfileDBM{$query_ID});
$next_level_ID = $node_info[0];
} else {
#print "\t\tID $query_ID was not found in nodes file. Truncating route here.\n";
print $log_filehandle "\t\tID $query_ID was not found in nodes file. Truncating route here.\n";
$exit = 1;
}
$query_ID = $next_level_ID; # Move on to the current parent node.
} until ($exit == 1);
untie %nodesfileDBM;
$route = join ("\t", @route);
}
return $route;
}
sub summary_basic {
#### Create simple summary of the output. The main output file is more informative, but this is what most people will take as the output.
my ($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath) = @_;
open (my $main_output_filehandle, $main_output_filepath) or die "Cannot open main output file $main_output_filepath for generating Summary Basic: $!\n";
readline($main_output_filehandle); # Skip the header line.
my %taxon_count = (); # Keys are taxon names and IDs in the format "name\tID". Values are the number of times that taxon occurs.
# Get a list of phylogenetic intersections where the taxonomic diversity score was at least $min_taxdiv_score.
while (1) {
my $line = readline($main_output_filehandle);
if (! defined $line) { last; }
chomp $line;
my @line = split("\t", $line);
my $tax_diversity_score = $line[14]; # Taxonomic diversity score is in the 14th column.
if ( ($tax_diversity_score ne 'NA') && ($tax_diversity_score >= $min_taxdiv_score) && ($line[16] != 0) ) { # If the taxonomic diversity score is both not NA and also at least the minimum, and a phylogenetic intersection ID was found:
my $taxon_ID_and_name = "$line[16]\t$line[15]"; # Join the ID and name with a tab.
# Add to the hash:
if (exists $taxon_count{$taxon_ID_and_name}) {
$taxon_count{$taxon_ID_and_name} = $taxon_count{$taxon_ID_and_name} + 1;
} else {
$taxon_count{$taxon_ID_and_name} = 1;
}
}
}
close $main_output_filehandle;
my $summary_basic_filepath = $output_directory_path . '/' . $read_info_filename . '.Summary_Basic.txt'; # This simply lists output taxa and their counts.
open (my $summary_basic_filehandle, '>', $summary_basic_filepath) or die "Cannot write Summary Basic file: $!\n";
print $summary_basic_filehandle "# Read info file:\t$read_info_filename\n"; # Output the read_info file this was built from as a header. This is only temporary.
foreach my $taxon_ID_and_name (keys %taxon_count) {
print $summary_basic_filehandle "$taxon_ID_and_name\t$taxon_count{$taxon_ID_and_name}\n";
}
close $summary_basic_filehandle;
return $summary_basic_filename;
}
sub summary_reads {
#### Create read-by-read summary of output. This lists the names of reads that passed and what taxon each one was assigned to.
my ($read_info_filename, $min_taxdiv_score, $output_directory_path, $main_output_filepath) = @_;
my $summary_reads_filepath = $output_directory_path . '/' . $read_info_filename . '.Summary_Reads.txt'; # This will list the assignment for each read that had one.
open (my $summary_reads_filehandle, '>', $summary_reads_filepath) or die "Cannot write Summary Reads file: $!\n"; # We can print this output as we go.
print $summary_reads_filehandle "# Read info file:\t$read_info_filename\n"; # Output the read_info file this was built from as a header. This is only temporary.
open (my $main_output_filehandle, $main_output_filepath) or die "Cannot open main output file $main_output_filepath for generating Summary Reads: $!\n";
readline($main_output_filehandle); # Skip the header line.
# Get a list of phylogenetic intersections where the taxonomic diversity score was at least $min_taxdiv_score.
while (1) {
my $line = readline($main_output_filehandle);
if (! defined $line) { last; }
chomp $line;
my @line = split("\t", $line);
my $tax_diversity_score = $line[14]; # Taxonomic diversity score is in the 14th column.
if ( ($tax_diversity_score ne 'NA') && ($tax_diversity_score >= $min_taxdiv_score) && ($line[16] != 0) ) { # If the taxonomic diversity score is both not NA and also at least the minimum, and a phylogenetic intersection ID was found:
print $summary_reads_filehandle "$line[0]\t$line[16]\t$line[15]\n"; # Print the read name and phylogenetic intersection name and ID the hash.
}
}
close $main_output_filehandle;
close $summary_reads_filehandle;
return $summary_reads_filename;
}