Merge pull request #394 from psj1997/main

strobealign-aemb module for metagenomic binning
ksahlin · Mar 4, 2024 · bd183f5 · bd183f5
2 parents 79cdd96 + 5a1ab9e
commit bd183f5
Show file tree

Hide file tree

Showing 11 changed files with 192 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -113,6 +113,11 @@ strobealign ref.fa reads.1.fastq.gz reads.2.fastq.gz | samtools sort -o sorted.b
 This is usually faster than doing the two steps separately because fewer
 intermediate files are created.
 
+To output the estimated abundance of every contig, the format of output file is: contig_id \t abundance_value:
+```
+strobealign ref.fa reads.fq --aemb > abundance.txt # Single-end reads
+strobealign ref.fa reads1.fq reads2.fq --aemb > abundance.txt # Paired-end reads
+```
 
 ## Command-line options
 
@@ -127,6 +132,7 @@ options. Some important ones are:
 * `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`.
 * `-x`: Only map reads, do not do no base-level alignment. This switches the
  output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md).
+* `--aemb`: Output the estimated abundance value of every contig, the format of output file is: contig_id \t abundance_value.
 * `--rg-id=ID`: Add RG tag to each SAM record.
 * `--rg=TAG:VALUE`: Add read group metadata to the SAM header. This can be
  specified multiple times. Example: `--rg-id=1 --rg=SM:mysamle --rg=LB:mylibrary`.

diff --git a/src/aln.cpp b/src/aln.cpp
@@ -862,13 +862,17 @@ std::vector<ScoredAlignmentPair> align_paired(
  return high_scores;
 }
 
-// Only used for PAF output
+// Used for PAF and abundances output
 inline void get_best_map_location(
  std::vector<Nam> &nams1,
  std::vector<Nam> &nams2,
  InsertSizeDistribution &isize_est,
  Nam &best_nam1,
- Nam &best_nam2
+ Nam &best_nam2,
+ int read1_len,
+ int read2_len,
+ std::vector<double> &abundances,
+ bool output_abundance
 ) {
  std::vector<NamPair> nam_pairs = get_best_scoring_nam_pairs(nams1, nams2, isize_est.mu, isize_est.sigma);
  best_nam1.ref_start = -1; //Unmapped until proven mapped
@@ -903,6 +907,52 @@ inline void get_best_map_location(
  if (score_joint > score_indiv) { // joint score is better than individual
  best_nam1 = n1_joint_max;
  best_nam2 = n2_joint_max;
+
+ if (output_abundance){
+ // we loop twice because we need to count the number of best pairs
+ size_t n_best = 0;
+ for (auto &[score, n1, n2] : nam_pairs){
+ if ((n1.score + n2.score) == score_joint){
+ ++n_best;
+ } else {
+ break;
+ }
+ }
+ for (auto &[score, n1, n2] : nam_pairs){
+ if ((n1.score + n2.score) == score_joint){
+ if (n1.ref_start >= 0) {
+ abundances[n1.ref_id] += float(read1_len) / float(n_best);
+ }
+ if (n2.ref_start >= 0) {
+ abundances[n2.ref_id] += float(read2_len) / float(n_best);
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ } else if (output_abundance) {
+ for (auto &[nams, read_len]: { std::make_pair(std::cref(nams1), read1_len),
+ std::make_pair(std::cref(nams2), read2_len) }) {
+ size_t best_score = 0;
+ // We loop twice because we need to count the number of NAMs with best score
+ for (auto &nam : nams) {
+ if (nam.score == nams[0].score){
+ ++best_score;
+ } else {
+ break;
+ }
+ }
+ for (auto &nam: nams) {
+ if (nam.ref_start < 0) {
+ continue;
+ }
+ if (nam.score != nams[0].score){
+ break;
+ }
+ abundances[nam.ref_id] += float(read_len) / float(best_score);
+ }
+ }
  }
 
  if (isize_est.sample_size < 400 && score_joint > score_indiv) {
@@ -957,7 +1007,8 @@ void align_or_map_paired(
  const IndexParameters& index_parameters,
  const References& references,
  const StrobemerIndex& index,
- std::minstd_rand& random_engine
+ std::minstd_rand& random_engine,
+ std::vector<double> &abundances
 ) {
  std::array<Details, 2> details;
  std::array<std::vector<Nam>, 2> nams_pair;
@@ -991,18 +1042,24 @@ void align_or_map_paired(
  }
 
  Timer extend_timer;
- if (!map_param.is_sam_out) {
+ if (map_param.output_format != OutputFormat::SAM) { // PAF or abundance
  Nam nam_read1;
  Nam nam_read2;
- get_best_map_location(nams_pair[0], nams_pair[1], isize_est,
- nam_read1,
- nam_read2);
- output_hits_paf_PE(outstring, nam_read1, record1.name,
- references,
- record1.seq.length());
- output_hits_paf_PE(outstring, nam_read2, record2.name,
- references,
- record2.seq.length());
+ get_best_map_location(
+ nams_pair[0], nams_pair[1],
+ isize_est,
+ nam_read1, nam_read2,
+ record1.seq.length(), record2.seq.length(),
+ abundances,
+ map_param.output_format == OutputFormat::Abundance);
+ if (map_param.output_format == OutputFormat::PAF) {
+ output_hits_paf_PE(outstring, nam_read1, record1.name,
+ references,
+ record1.seq.length());
+ output_hits_paf_PE(outstring, nam_read2, record2.name,
+ references,
+ record2.seq.length());
+ }
  } else {
  Read read1(record1.seq);
  Read read2(record2.seq);
@@ -1082,7 +1139,8 @@ void align_or_map_single(
  const IndexParameters& index_parameters,
  const References& references,
  const StrobemerIndex& index,
- std::minstd_rand& random_engine
+ std::minstd_rand& random_engine,
+ std::vector<double> &abundances
 ) {
  Details details;
  Timer strobe_timer;
@@ -1111,15 +1169,41 @@ void align_or_map_single(
 
 
  Timer extend_timer;
- if (!map_param.is_sam_out) {
- output_hits_paf(outstring, nams, record.name, references,
- record.seq.length());
- } else {
- align_single(
- aligner, sam, nams, record, index_parameters.syncmer.k,
- references, details, map_param.dropoff_threshold, map_param.max_tries,
- map_param.max_secondary, random_engine
- );
+ size_t n_best = 0;
+ switch (map_param.output_format) {
+ case OutputFormat::Abundance: {
+ if (!nams.empty()){
+ for (auto &t : nams){
+ if (t.score == nams[0].score){
+ ++n_best;
+ }else{
+ break;
+ }
+ }
+
+ for (auto &nam: nams) {
+ if (nam.ref_start < 0) {
+ continue;
+ }
+ if (nam.score != nams[0].score){
+ break;
+ }
+ abundances[nam.ref_id] += float(record.seq.length()) / float(n_best);
+ }
+ }
+ }
+ break;
+ case OutputFormat::PAF:
+ output_hits_paf(outstring, nams, record.name, references,
+ record.seq.length());
+ break;
+ case OutputFormat::SAM:
+ align_single(
+ aligner, sam, nams, record, index_parameters.syncmer.k,
+ references, details, map_param.dropoff_threshold, map_param.max_tries,
+ map_param.max_secondary, random_engine
+ );
+ break;
  }
  statistics.tot_extend += extend_timer.duration();
  statistics += details;

diff --git a/src/aln.hpp b/src/aln.hpp
@@ -56,14 +56,20 @@ struct AlignmentStatistics {
  }
 };
 
+enum class OutputFormat {
+ SAM,
+ PAF,
+ Abundance
+};
+
 struct MappingParameters {
  int r { 150 };
  int max_secondary { 0 };
  float dropoff_threshold { 0.5 };
  int rescue_level { 2 };
  int max_tries { 20 };
  int rescue_cutoff;
- bool is_sam_out { true };
+ OutputFormat output_format {OutputFormat::SAM};
  CigarOps cigar_ops{CigarOps::M};
  bool output_unmapped { true };
  bool details{false};
@@ -88,7 +94,8 @@ void align_or_map_paired(
  const IndexParameters& index_parameters,
  const References& references,
  const StrobemerIndex& index,
- std::minstd_rand& random_engine
+ std::minstd_rand& random_engine,
+ std::vector<double> &abundances
 );
 
 void align_or_map_single(
@@ -101,7 +108,8 @@ void align_or_map_single(
  const IndexParameters& index_parameters,
  const References& references,
  const StrobemerIndex& index,
- std::minstd_rand& random_engine
+ std::minstd_rand& random_engine,
+ std::vector<double> &abundances
 );
 
 // Private declarations, only needed for tests

diff --git a/src/cmdline.cpp b/src/cmdline.cpp
@@ -27,6 +27,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
  args::Flag v(parser, "v", "Verbose output", {'v'});
  args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"});
  args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'});
+ args::Flag aemb(parser, "aemb", "Output the estimated abundance value of contigs, the format of output file is: contig_id \t abundance_value", {"aemb"});
  args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"});
  args::ValueFlag<std::string> index_statistics(parser, "PATH", "Print statistics of indexing to PATH", {"index-statistics"});
  args::Flag i(parser, "index", "Do not map reads; only generate the strobemer index and write it to disk. If read files are provided, they are used to estimate read length", {"create-index", 'i'});
@@ -97,6 +98,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
  if (index_statistics) { opt.logfile_name = args::get(index_statistics); }
  if (i) { opt.only_gen_index = true; }
  if (use_index) { opt.use_index = true; }
+ if (aemb) {opt.is_abundance_out = true; }
 
  // SAM output
  if (eqx) { opt.cigar_eqx = true; }

diff --git a/src/cmdline.hpp b/src/cmdline.hpp
@@ -18,6 +18,7 @@ struct CommandLineOptions {
  bool only_gen_index { false };
  bool use_index { false };
  bool is_sam_out { true };
+ bool is_abundance_out {false};
 
  // SAM output
  bool cigar_eqx { false };

diff --git a/src/main.cpp b/src/main.cpp
@@ -105,6 +105,12 @@ InputBuffer get_input_buffer(const CommandLineOptions& opt) {
  }
 }
 
+void output_abundance(const std::vector<double>& abundances, const References& references){
+ for (size_t i = 0; i < references.size(); ++i) {
+ std::cout << references.names[i] << '\t' << std::fixed << std::setprecision(6) << abundances[i] / double(references.sequences[i].size()) << std::endl;
+ }
+}
+
 void show_progress_until_done(std::vector<int>& worker_done, std::vector<AlignmentStatistics>& stats) {
  Timer timer;
  bool reported = false;
@@ -155,6 +161,11 @@ int run_strobealign(int argc, char **argv) {
  if (opt.c >= 64 || opt.c <= 0) {
  throw BadParameter("c must be greater than 0 and less than 64");
  }
+
+ if (!opt.is_sam_out && opt.is_abundance_out){
+ throw BadParameter("Can not use -x and --aemb at the same time");
+ }
+
  InputBuffer input_buffer = get_input_buffer(opt);
  if (!opt.r_set && !opt.reads_filename1.empty()) {
  opt.r = estimate_read_length(input_buffer);
@@ -184,7 +195,10 @@ int run_strobealign(int argc, char **argv) {
  map_param.dropoff_threshold = opt.dropoff_threshold;
  map_param.rescue_level = opt.rescue_level;
  map_param.max_tries = opt.max_tries;
- map_param.is_sam_out = opt.is_sam_out;
+ map_param.output_format = (
+ opt.is_abundance_out ? OutputFormat::Abundance :
+ opt.is_sam_out ? OutputFormat::SAM :
+ OutputFormat::PAF);
  map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M;
  map_param.output_unmapped = opt.output_unmapped;
  map_param.details = opt.details;
@@ -288,32 +302,31 @@ int run_strobealign(int argc, char **argv) {
  }
 
  std::ostream out(buf);
-
- if (map_param.is_sam_out) {
- std::stringstream cmd_line;
- for(int i = 0; i < argc; ++i) {
- cmd_line << argv[i] << " ";
- }
-
- out << sam_header(references, opt.read_group_id, opt.read_group_fields);
- if (opt.pg_header) {
- out << pg_header(cmd_line.str());
- }
+
+ if (map_param.output_format == OutputFormat::SAM) {
+ std::stringstream cmd_line;
+ for(int i = 0; i < argc; ++i) {
+ cmd_line << argv[i] << " ";
+ }
+ out << sam_header(references, opt.read_group_id, opt.read_group_fields);
+ if (opt.pg_header) {
+ out << pg_header(cmd_line.str());
+ }
  }
 
  std::vector<AlignmentStatistics> log_stats_vec(opt.n_threads);
-
+ 
  logger.info() << "Running in " << (opt.is_SE ? "single-end" : "paired-end") << " mode" << std::endl;
 
  OutputBuffer output_buffer(out);
-
  std::vector<std::thread> workers;
  std::vector<int> worker_done(opt.n_threads); // each thread sets its entry to 1 when it’s done
+ std::vector<std::vector<double>> worker_abundances(opt.n_threads, std::vector<double>(references.size(), 0));
  for (int i = 0; i < opt.n_threads; ++i) {
  std::thread consumer(perform_task, std::ref(input_buffer), std::ref(output_buffer),
  std::ref(log_stats_vec[i]), std::ref(worker_done[i]), std::ref(aln_params),
  std::ref(map_param), std::ref(index_parameters), std::ref(references),
- std::ref(index), std::ref(opt.read_group_id));
+ std::ref(index), std::ref(opt.read_group_id), std::ref(worker_abundances[i]));
  workers.push_back(std::move(consumer));
  }
  if (opt.show_progress && isatty(2)) {
@@ -329,6 +342,19 @@ int run_strobealign(int argc, char **argv) {
  tot_statistics += it;
  }
 
+ if (map_param.output_format == OutputFormat::Abundance) {
+ std::vector<double> abundances(references.size(), 0);
+ for (size_t i = 0; i < worker_abundances.size(); ++i) {
+ for (size_t j = 0; j < worker_abundances[i].size(); ++j) {
+ abundances[j] += worker_abundances[i][j];
+ }
+ }
+
+ // output the abundance file
+ output_abundance(abundances, references);
+ }
+
+
  logger.info() << "Total mapping sites tried: " << tot_statistics.tot_all_tried << std::endl
  << "Total calls to ssw: " << tot_statistics.tot_aligner_calls << std::endl
  << "Inconsistent NAM ends: " << tot_statistics.inconsistent_nams << std::endl