diff --git a/README.md b/README.md index f2d0f8c1..5d736fa4 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b This is usually faster than doing the two steps separately because fewer intermediate files are created. +To output the estimated abundance of every contig, the format of output file is: contig_id \t abundance_value: +``` +strobealign ref.fa reads.fq --aemb > abundance.txt # Single-end reads +strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt # Paired-end reads +``` ## Command-line options @@ -127,6 +132,7 @@ options. Some important ones are: * `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`. * `-x`: Only map reads, do not do no base-level alignment. This switches the output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md). +* `--aemb`: Output the estimated abundance value of every contig, the format of output file is: contig_id \t abundance_value. * `--rg-id=ID`: Add RG tag to each SAM record. * `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`. diff --git a/src/aln.cpp b/src/aln.cpp index b5105bb1..eeabd292 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -862,13 +862,17 @@ std::vector align_paired( return high_scores; } -// Only used for PAF output +// Used for PAF and abundances output inline void get_best_map_location( std::vector &nams1, std::vector &nams2, InsertSizeDistribution &isize_est, Nam &best_nam1, - Nam &best_nam2 + Nam &best_nam2, + int read1_len, + int read2_len, + std::vector &abundances, + bool output_abundance ) { std::vector nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma); best_nam1.ref_start = -1; //Unmapped until proven mapped @@ -903,6 +907,52 @@ inline void get_best_map_location( if (score_joint > score_indiv) { // joint score is better than individual best_nam1 = n1_joint_max; best_nam2 = n2_joint_max; + + if (output_abundance){ + // we loop twice because we need to count the number of best pairs + size_t n_best = 0; + for (auto &[score, n1, n2] : nam_pairs){ + if ((n1.score + n2.score) == score_joint){ + ++n_best; + } else { + break; + } + } + for (auto &[score, n1, n2] : nam_pairs){ + if ((n1.score + n2.score) == score_joint){ + if (n1.ref_start >= 0) { + abundances[n1.ref_id] += float(read1_len) / float(n_best); + } + if (n2.ref_start >= 0) { + abundances[n2.ref_id] += float(read2_len) / float(n_best); + } + } else { + break; + } + } + } + } else if (output_abundance) { + for (auto &[nams, read_len]: { std::make_pair(std::cref(nams1), read1_len), + std::make_pair(std::cref(nams2), read2_len) }) { + size_t best_score = 0; + // We loop twice because we need to count the number of NAMs with best score + for (auto &nam : nams) { + if (nam.score == nams[0].score){ + ++best_score; + } else { + break; + } + } + for (auto &nam: nams) { + if (nam.ref_start < 0) { + continue; + } + if (nam.score != nams[0].score){ + break; + } + abundances[nam.ref_id] += float(read_len) / float(best_score); + } + } } if (isize_est.sample_size < 400 && score_joint > score_indiv) { @@ -957,7 +1007,8 @@ void align_or_map_paired( const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, - std::minstd_rand& random_engine + std::minstd_rand& random_engine, + std::vector &abundances ) { std::array details; std::array, 2> nams_pair; @@ -991,18 +1042,24 @@ void align_or_map_paired( } Timer extend_timer; - if (!map_param.is_sam_out) { + if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance Nam nam_read1; Nam nam_read2; - get_best_map_location(nams_pair[0], nams_pair[1], isize_est, - nam_read1, - nam_read2); - output_hits_paf_PE(outstring, nam_read1, record1.name, - references, - record1.seq.length()); - output_hits_paf_PE(outstring, nam_read2, record2.name, - references, - record2.seq.length()); + get_best_map_location( + nams_pair[0], nams_pair[1], + isize_est, + nam_read1, nam_read2, + record1.seq.length(), record2.seq.length(), + abundances, + map_param.output_format == OutputFormat::Abundance); + if (map_param.output_format == OutputFormat::PAF) { + output_hits_paf_PE(outstring, nam_read1, record1.name, + references, + record1.seq.length()); + output_hits_paf_PE(outstring, nam_read2, record2.name, + references, + record2.seq.length()); + } } else { Read read1(record1.seq); Read read2(record2.seq); @@ -1082,7 +1139,8 @@ void align_or_map_single( const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, - std::minstd_rand& random_engine + std::minstd_rand& random_engine, + std::vector &abundances ) { Details details; Timer strobe_timer; @@ -1111,15 +1169,41 @@ void align_or_map_single( Timer extend_timer; - if (!map_param.is_sam_out) { - output_hits_paf(outstring, nams, record.name, references, - record.seq.length()); - } else { - align_single( - aligner, sam, nams, record, index_parameters.syncmer.k, - references, details, map_param.dropoff_threshold, map_param.max_tries, - map_param.max_secondary, random_engine - ); + size_t n_best = 0; + switch (map_param.output_format) { + case OutputFormat::Abundance: { + if (!nams.empty()){ + for (auto &t : nams){ + if (t.score == nams[0].score){ + ++n_best; + }else{ + break; + } + } + + for (auto &nam: nams) { + if (nam.ref_start < 0) { + continue; + } + if (nam.score != nams[0].score){ + break; + } + abundances[nam.ref_id] += float(record.seq.length()) / float(n_best); + } + } + } + break; + case OutputFormat::PAF: + output_hits_paf(outstring, nams, record.name, references, + record.seq.length()); + break; + case OutputFormat::SAM: + align_single( + aligner, sam, nams, record, index_parameters.syncmer.k, + references, details, map_param.dropoff_threshold, map_param.max_tries, + map_param.max_secondary, random_engine + ); + break; } statistics.tot_extend += extend_timer.duration(); statistics += details; diff --git a/src/aln.hpp b/src/aln.hpp index f8bb69bf..0a60fb41 100644 --- a/src/aln.hpp +++ b/src/aln.hpp @@ -56,6 +56,12 @@ struct AlignmentStatistics { } }; +enum class OutputFormat { + SAM, + PAF, + Abundance +}; + struct MappingParameters { int r { 150 }; int max_secondary { 0 }; @@ -63,7 +69,7 @@ struct MappingParameters { int rescue_level { 2 }; int max_tries { 20 }; int rescue_cutoff; - bool is_sam_out { true }; + OutputFormat output_format {OutputFormat::SAM}; CigarOps cigar_ops{CigarOps::M}; bool output_unmapped { true }; bool details{false}; @@ -88,7 +94,8 @@ void align_or_map_paired( const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, - std::minstd_rand& random_engine + std::minstd_rand& random_engine, + std::vector &abundances ); void align_or_map_single( @@ -101,7 +108,8 @@ void align_or_map_single( const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, - std::minstd_rand& random_engine + std::minstd_rand& random_engine, + std::vector &abundances ); // Private declarations, only needed for tests diff --git a/src/cmdline.cpp b/src/cmdline.cpp index c00c5479..5ad83821 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { args::Flag v(parser, "v", "Verbose output", {'v'}); args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"}); args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'}); + args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"}); args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"}); args::ValueFlag index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"}); args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'}); @@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { if (index_statistics) { opt.logfile_name = args::get(index_statistics); } if (i) { opt.only_gen_index = true; } if (use_index) { opt.use_index = true; } + if (aemb) {opt.is_abundance_out = true; } // SAM output if (eqx) { opt.cigar_eqx = true; } diff --git a/src/cmdline.hpp b/src/cmdline.hpp index 2ae3186e..9f5b4a05 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -18,6 +18,7 @@ struct CommandLineOptions { bool only_gen_index { false }; bool use_index { false }; bool is_sam_out { true }; + bool is_abundance_out {false}; // SAM output bool cigar_eqx { false }; diff --git a/src/main.cpp b/src/main.cpp index f9b8f65a..b2c06004 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) { } } +void output_abundance(const std::vector& abundances, const References& references){ + for (size_t i = 0; i < references.size(); ++i) { + std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << std::endl; + } +} + void show_progress_until_done(std::vector& worker_done, std::vector& stats) { Timer timer; bool reported = false; @@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) { if (opt.c >= 64 || opt.c <= 0) { throw BadParameter("c must be greater than 0 and less than 64"); } + + if (!opt.is_sam_out && opt.is_abundance_out){ + throw BadParameter("Can not use -x and --aemb at the same time"); + } + InputBuffer input_buffer = get_input_buffer(opt); if (!opt.r_set && !opt.reads_filename1.empty()) { opt.r = estimate_read_length(input_buffer); @@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) { map_param.dropoff_threshold = opt.dropoff_threshold; map_param.rescue_level = opt.rescue_level; map_param.max_tries = opt.max_tries; - map_param.is_sam_out = opt.is_sam_out; + map_param.output_format = ( + opt.is_abundance_out ? OutputFormat::Abundance : + opt.is_sam_out ? OutputFormat::SAM : + OutputFormat::PAF); map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M; map_param.output_unmapped = opt.output_unmapped; map_param.details = opt.details; @@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) { } std::ostream out(buf); - - if (map_param.is_sam_out) { - std::stringstream cmd_line; - for(int i = 0; i < argc; ++i) { - cmd_line << argv[i] << " "; - } - - out << sam_header(references, opt.read_group_id, opt.read_group_fields); - if (opt.pg_header) { - out << pg_header(cmd_line.str()); - } + + if (map_param.output_format == OutputFormat::SAM) { + std::stringstream cmd_line; + for(int i = 0; i < argc; ++i) { + cmd_line << argv[i] << " "; + } + out << sam_header(references, opt.read_group_id, opt.read_group_fields); + if (opt.pg_header) { + out << pg_header(cmd_line.str()); + } } std::vector log_stats_vec(opt.n_threads); - + logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl; OutputBuffer output_buffer(out); - std::vector workers; std::vector worker_done(opt.n_threads); // each thread sets its entry to 1 when it’s done + std::vector> worker_abundances(opt.n_threads, std::vector(references.size(), 0)); for (int i = 0; i < opt.n_threads; ++i) { std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer), std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params), std::ref(map_param), std::ref(index_parameters), std::ref(references), - std::ref(index), std::ref(opt.read_group_id)); + std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i])); workers.push_back(std::move(consumer)); } if (opt.show_progress && isatty(2)) { @@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) { tot_statistics += it; } + if (map_param.output_format == OutputFormat::Abundance) { + std::vector abundances(references.size(), 0); + for (size_t i = 0; i < worker_abundances.size(); ++i) { + for (size_t j = 0; j < worker_abundances[i].size(); ++j) { + abundances[j] += worker_abundances[i][j]; + } + } + + // output the abundance file + output_abundance(abundances, references); + } + + logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl << "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl << "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl diff --git a/src/pc.cpp b/src/pc.cpp index 5011bf21..f9f53653 100644 --- a/src/pc.cpp +++ b/src/pc.cpp @@ -139,7 +139,8 @@ void perform_task( const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, - const std::string& read_group_id + const std::string& read_group_id, + std::vector &abundances ) { bool eof = false; Aligner aligner{aln_params}; @@ -170,16 +171,20 @@ void perform_task( to_uppercase(record1.seq); to_uppercase(record2.seq); align_or_map_paired(record1, record2, sam, sam_out, statistics, isize_est, aligner, - map_param, index_parameters, references, index, random_engine); + map_param, index_parameters, references, index, random_engine, abundances); statistics.n_reads += 2; } for (size_t i = 0; i < records3.size(); ++i) { auto record = records3[i]; - align_or_map_single(record, sam, sam_out, statistics, aligner, map_param, index_parameters, references, index, random_engine); + align_or_map_single(record, sam, sam_out, statistics, aligner, map_param, index_parameters, references, index, random_engine, abundances); statistics.n_reads++; } - output_buffer.output_records(std::move(sam_out), chunk_index); - assert(sam_out == ""); + + + if (map_param.output_format != OutputFormat::Abundance) { + output_buffer.output_records(std::move(sam_out), chunk_index); + assert(sam_out == ""); + } } statistics.tot_aligner_calls += aligner.calls_count(); done = true; diff --git a/src/pc.hpp b/src/pc.hpp index 703de209..87e4e873 100644 --- a/src/pc.hpp +++ b/src/pc.hpp @@ -64,7 +64,8 @@ class OutputBuffer { void perform_task(InputBuffer &input_buffer, OutputBuffer &output_buffer, AlignmentStatistics& statistics, int& done, const AlignmentParameters &aln_params, - const MappingParameters &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, const std::string& read_group_id); + const MappingParameters &map_param, const IndexParameters& index_parameters, + const References& references, const StrobemerIndex& index, const std::string& read_group_id, std::vector &abundances); bool same_name(const std::string& n1, const std::string& n2); diff --git a/tests/phix.abun.pe.txt b/tests/phix.abun.pe.txt new file mode 100644 index 00000000..900b9de5 --- /dev/null +++ b/tests/phix.abun.pe.txt @@ -0,0 +1 @@ +NC_001422.1 4.638507 diff --git a/tests/phix.abun.se.txt b/tests/phix.abun.se.txt new file mode 100644 index 00000000..55e31ba6 --- /dev/null +++ b/tests/phix.abun.se.txt @@ -0,0 +1 @@ +NC_001422.1 2.347196 diff --git a/tests/run.sh b/tests/run.sh index f82dbb5b..d7d6f064 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -58,6 +58,16 @@ strobealign -x tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | tail -n diff tests/phix.pe.paf phix.pe.paf rm phix.pe.paf +# Single-end abundance estimation +strobealign --aemb tests/phix.fasta tests/phix.1.fastq > phix.abun.se.txt +diff tests/phix.abun.se.txt phix.abun.se.txt +rm phix.abun.se.txt + +# Paired-end abundance estimation +strobealign --aemb tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq > phix.abun.pe.txt +diff tests/phix.abun.pe.txt phix.abun.pe.txt +rm phix.abun.pe.txt + # Build a separate index strobealign --no-PG -r 150 tests/phix.fasta tests/phix.1.fastq > without-sti.sam strobealign -r 150 -i tests/phix.fasta