Skip to content

Commit

Permalink
Merge pull request #94 from ekg/sparsify-matches
Browse files Browse the repository at this point in the history
sparsify matches
  • Loading branch information
ekg authored Apr 3, 2022
2 parents a2d85fb + 3d507fa commit bd96fbb
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 5 deletions.
26 changes: 23 additions & 3 deletions src/alignments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,28 @@

namespace seqwish {


uint64_t match_hash(const pos_t& q, const pos_t& t, const uint64_t& l) {
uint64_t seed = q | t | l;
seed ^= q + 0x9e3779b97f4a7c15 + (seed << 17) + (seed >> 9);
seed ^= t + 0x9e3779b97f4a7c15 + (seed << 7) + (seed >> 23);
seed ^= l + 0x9e3779b97f4a7c15 + (seed << 9) + (seed >> 2);
return seed;
}

bool keep_sparse(const pos_t& q, const pos_t& t, const uint64_t& l, const float f) {
// hash the match and check if it's accepted given our sparsification factor
return match_hash(q, t, l) < std::numeric_limits<uint64_t>::max() * f;
}

void paf_worker(
igzstream& paf_in,
std::atomic<bool>& paf_more,
std::mutex& paf_in_mutex,
mmmulti::iitree<uint64_t, pos_t>& aln_iitree,
const seqindex_t& seqidx,
const uint64_t& min_match_len) {
const uint64_t& min_match_len,
const float& sparsification_factor) {
while (paf_more.load()) {
paf_in_mutex.lock();
std::string line;
Expand Down Expand Up @@ -45,7 +60,11 @@ void paf_worker(
uint64_t match_len = 0;
auto add_match =
[&](void) {
if (match_len && match_len >= min_match_len) {
if (match_len
&& match_len >= min_match_len
&& (sparsification_factor == 0 ||
keep_sparse(q_pos_match_start, t_pos_match_start, match_len,
sparsification_factor))) {
if (is_rev(q_pos)) {
pos_t x_pos = q_pos;
decr_pos(x_pos); // to guard against underflow when our start is 0-, we need to decr in pos_t space
Expand Down Expand Up @@ -101,6 +120,7 @@ void unpack_paf_alignments(const std::string& paf_file,
mmmulti::iitree<uint64_t, pos_t>& aln_iitree,
const seqindex_t& seqidx,
const uint64_t& min_match_len,
const float& sparsification_factor,
const uint64_t& num_threads) {
// go through the PAF file
igzstream paf_in(paf_file.c_str());
Expand All @@ -112,7 +132,7 @@ void unpack_paf_alignments(const std::string& paf_file,
std::atomic<bool> paf_more; paf_more.store(true);
std::vector<std::thread> workers; workers.reserve(num_threads);
for (uint64_t t = 0; t < num_threads; ++t) {
workers.emplace_back(paf_worker, std::ref(paf_in), std::ref(paf_more), std::ref(paf_in_mutex), std::ref(aln_iitree), std::ref(seqidx), std::ref(min_match_len));
workers.emplace_back(paf_worker, std::ref(paf_in), std::ref(paf_more), std::ref(paf_in_mutex), std::ref(aln_iitree), std::ref(seqidx), std::ref(min_match_len), std::ref(sparsification_factor));
}
for (uint64_t t = 0; t < num_threads; ++t) {
workers[t].join();
Expand Down
8 changes: 7 additions & 1 deletion src/alignments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,21 @@ void paf_worker(
std::mutex& paf_in_mutex,
mmmulti::iitree<uint64_t, pos_t>& aln_iitree,
const seqindex_t& seqidx,
const uint64_t& min_match_len);
const uint64_t& min_match_len,
const float& sparsification_factor);

void unpack_paf_alignments(
const std::string& paf_file,
mmmulti::iitree<uint64_t, pos_t>& aln_iitree,
const seqindex_t& seqidx,
const uint64_t& min_match_len,
const float& sparsification_factor,
const uint64_t& num_threads);

uint64_t match_hash(const pos_t& q, const pos_t& t, const uint64_t& l);

bool keep_sparse(const pos_t& q, const pos_t& t, const uint64_t& l, const float f);


/*
void filter_alignments(mmmulti::map<pos_t, aln_pos_t>& aln_mm,
Expand Down
4 changes: 3 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ int main(int argc, char** argv) {
args::ValueFlag<uint64_t> repeat_max(parser, "N", "Limit transitive closure to include no more than N copies of a given input base", {'r', "repeat-max"});
args::ValueFlag<uint64_t> min_repeat_dist(parser, "N", "Prevent transitive closure for bases at least this far apart in input sequences", {'l', "min-repeat-distance"});
args::ValueFlag<uint64_t> min_match_len(parser, "N", "Filter exact matches below this length. This can smooth the graph locally and prevent the formation of complex local graph topologies from forming due to differential alignments.", {'k', "min-match-len"});
args::ValueFlag<float> match_sparsification(parser, "N", "Sparsify input matches, keeping the fraction that minimize a hash function.", {'f', "sparse-factor"});
args::ValueFlag<std::string> transclose_batch(parser, "N", "Number of bp to use for transitive closure batch (1k = 1K = 1000, 1m = 1M = 10^6, 1g = 1G = 10^9) [default 1M]", {'B', "transclose-batch"});
//args::ValueFlag<uint64_t> num_domains(parser, "N", "number of domains for iitii interpolation", {'D', "domains"});
args::Flag keep_temp_files(parser, "", "keep intermediate files generated during graph induction", {'T', "keep-temp"});
Expand Down Expand Up @@ -131,14 +132,15 @@ int main(int argc, char** argv) {
auto aln_iitree_ptr = std::make_unique<mmmulti::iitree<uint64_t, pos_t>>(aln_idx);
auto& aln_iitree = *aln_iitree_ptr;
aln_iitree.open_writer();
float sparse_match = match_sparsification ? args::get(match_sparsification) : 0;
if (!pafs_and_min_lengths.empty()) {
for (auto& p : pafs_and_min_lengths) {
auto& file = p.first;
uint64_t min_length = p.second;
if (!min_length && args::get(min_match_len)) {
min_length = args::get(min_match_len);
}
unpack_paf_alignments(file, aln_iitree, seqidx, min_length, num_threads);
unpack_paf_alignments(file, aln_iitree, seqidx, min_length, sparse_match, num_threads);
}
}
if (args::get(show_progress)) std::cerr << "[seqwish::alignments] " << std::fixed << std::showpoint << std::setprecision(3) << seconds_since(start_time) << " indexing" << std::endl;
Expand Down

0 comments on commit bd96fbb

Please sign in to comment.