diff --git a/src/alignments.cpp b/src/alignments.cpp index ce8525d..848416c 100644 --- a/src/alignments.cpp +++ b/src/alignments.cpp @@ -2,13 +2,28 @@ namespace seqwish { + +uint64_t match_hash(const pos_t& q, const pos_t& t, const uint64_t& l) { + uint64_t seed = q | t | l; + seed ^= q + 0x9e3779b97f4a7c15 + (seed << 17) + (seed >> 9); + seed ^= t + 0x9e3779b97f4a7c15 + (seed << 7) + (seed >> 23); + seed ^= l + 0x9e3779b97f4a7c15 + (seed << 9) + (seed >> 2); + return seed; +} + +bool keep_sparse(const pos_t& q, const pos_t& t, const uint64_t& l, const float f) { + // hash the match and check if it's accepted given our sparsification factor + return match_hash(q, t, l) < std::numeric_limits::max() * f; +} + void paf_worker( igzstream& paf_in, std::atomic& paf_more, std::mutex& paf_in_mutex, mmmulti::iitree& aln_iitree, const seqindex_t& seqidx, - const uint64_t& min_match_len) { + const uint64_t& min_match_len, + const float& sparsification_factor) { while (paf_more.load()) { paf_in_mutex.lock(); std::string line; @@ -45,7 +60,11 @@ void paf_worker( uint64_t match_len = 0; auto add_match = [&](void) { - if (match_len && match_len >= min_match_len) { + if (match_len + && match_len >= min_match_len + && (sparsification_factor == 0 || + keep_sparse(q_pos_match_start, t_pos_match_start, match_len, + sparsification_factor))) { if (is_rev(q_pos)) { pos_t x_pos = q_pos; decr_pos(x_pos); // to guard against underflow when our start is 0-, we need to decr in pos_t space @@ -101,6 +120,7 @@ void unpack_paf_alignments(const std::string& paf_file, mmmulti::iitree& aln_iitree, const seqindex_t& seqidx, const uint64_t& min_match_len, + const float& sparsification_factor, const uint64_t& num_threads) { // go through the PAF file igzstream paf_in(paf_file.c_str()); @@ -112,7 +132,7 @@ void unpack_paf_alignments(const std::string& paf_file, std::atomic paf_more; paf_more.store(true); std::vector workers; workers.reserve(num_threads); for (uint64_t t = 0; t < num_threads; ++t) { - workers.emplace_back(paf_worker, std::ref(paf_in), std::ref(paf_more), std::ref(paf_in_mutex), std::ref(aln_iitree), std::ref(seqidx), std::ref(min_match_len)); + workers.emplace_back(paf_worker, std::ref(paf_in), std::ref(paf_more), std::ref(paf_in_mutex), std::ref(aln_iitree), std::ref(seqidx), std::ref(min_match_len), std::ref(sparsification_factor)); } for (uint64_t t = 0; t < num_threads; ++t) { workers[t].join(); diff --git a/src/alignments.hpp b/src/alignments.hpp index 7f36c9f..108f1e3 100644 --- a/src/alignments.hpp +++ b/src/alignments.hpp @@ -22,15 +22,21 @@ void paf_worker( std::mutex& paf_in_mutex, mmmulti::iitree& aln_iitree, const seqindex_t& seqidx, - const uint64_t& min_match_len); + const uint64_t& min_match_len, + const float& sparsification_factor); void unpack_paf_alignments( const std::string& paf_file, mmmulti::iitree& aln_iitree, const seqindex_t& seqidx, const uint64_t& min_match_len, + const float& sparsification_factor, const uint64_t& num_threads); +uint64_t match_hash(const pos_t& q, const pos_t& t, const uint64_t& l); + +bool keep_sparse(const pos_t& q, const pos_t& t, const uint64_t& l, const float f); + /* void filter_alignments(mmmulti::map& aln_mm, diff --git a/src/main.cpp b/src/main.cpp index e1564b4..b5b8eb4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -38,6 +38,7 @@ int main(int argc, char** argv) { args::ValueFlag repeat_max(parser, "N", "Limit transitive closure to include no more than N copies of a given input base", {'r', "repeat-max"}); args::ValueFlag min_repeat_dist(parser, "N", "Prevent transitive closure for bases at least this far apart in input sequences", {'l', "min-repeat-distance"}); args::ValueFlag min_match_len(parser, "N", "Filter exact matches below this length. This can smooth the graph locally and prevent the formation of complex local graph topologies from forming due to differential alignments.", {'k', "min-match-len"}); + args::ValueFlag match_sparsification(parser, "N", "Sparsify input matches, keeping the fraction that minimize a hash function.", {'f', "sparse-factor"}); args::ValueFlag transclose_batch(parser, "N", "Number of bp to use for transitive closure batch (1k = 1K = 1000, 1m = 1M = 10^6, 1g = 1G = 10^9) [default 1M]", {'B', "transclose-batch"}); //args::ValueFlag num_domains(parser, "N", "number of domains for iitii interpolation", {'D', "domains"}); args::Flag keep_temp_files(parser, "", "keep intermediate files generated during graph induction", {'T', "keep-temp"}); @@ -131,6 +132,7 @@ int main(int argc, char** argv) { auto aln_iitree_ptr = std::make_unique>(aln_idx); auto& aln_iitree = *aln_iitree_ptr; aln_iitree.open_writer(); + float sparse_match = match_sparsification ? args::get(match_sparsification) : 0; if (!pafs_and_min_lengths.empty()) { for (auto& p : pafs_and_min_lengths) { auto& file = p.first; @@ -138,7 +140,7 @@ int main(int argc, char** argv) { if (!min_length && args::get(min_match_len)) { min_length = args::get(min_match_len); } - unpack_paf_alignments(file, aln_iitree, seqidx, min_length, num_threads); + unpack_paf_alignments(file, aln_iitree, seqidx, min_length, sparse_match, num_threads); } } if (args::get(show_progress)) std::cerr << "[seqwish::alignments] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " indexing" << std::endl;