From d9f8629d7c84ef7bc4ad48fa31f47ab81dedefab Mon Sep 17 00:00:00 2001 From: Lauren Coombe Date: Fri, 3 Jun 2022 09:29:17 -0700 Subject: [PATCH] Add btllib as a dependancy (#152) * Removed packaged btllib, make a dependency * Fix conda install command * add parameter to conda install in CI * Update Common/Makefile.am * Update CI * Fix yml formatting * Activate conda env for all compile steps * Fix for clang-tidy, ensure clang-tidy reads CXXFLAGS * Update README * Add mac to CI * CI format fix * CI format fix * CI fix formatting * Set unused-but-set-variable to be warning, not error --- Arcs/Makefile.am | 2 +- Common/Makefile.am | 2 +- README.md | 9 +- azure-pipelines.yml | 39 +- src/Makefile.am | 17 +- src/include/btllib/bloom_filter.hpp | 575 --- src/include/btllib/counting_bloom_filter.hpp | 354 -- src/include/btllib/cstring.hpp | 97 - src/include/btllib/data_saveload.hpp | 547 --- src/include/btllib/data_stream.hpp | 750 ---- src/include/btllib/graph.hpp | 96 - src/include/btllib/index_queue.hpp | 229 -- src/include/btllib/indexlr.hpp | 545 --- src/include/btllib/indexlr2.cpp | 315 -- src/include/btllib/indexlr2.hpp | 684 ---- src/include/btllib/nthash.hpp | 1555 -------- src/include/btllib/order_queue.hpp | 230 -- src/include/btllib/rolling_hash.hpp | 378 -- src/include/btllib/seq.hpp | 96 - src/include/btllib/seq_reader.hpp | 1239 ------ src/include/btllib/seq_writer.hpp | 116 - src/include/btllib/status.hpp | 91 - src/include/btllib/util.hpp | 81 - src/include/meson.build | 7 - src/include/vendor/cpptoml.hpp | 3668 ------------------ src/long-to-linked-pe.cpp | 13 +- 26 files changed, 54 insertions(+), 11681 deletions(-) delete mode 100644 src/include/btllib/bloom_filter.hpp delete mode 100644 src/include/btllib/counting_bloom_filter.hpp delete mode 100644 src/include/btllib/cstring.hpp delete mode 100644 src/include/btllib/data_saveload.hpp delete mode 100644 src/include/btllib/data_stream.hpp delete mode 100644 src/include/btllib/graph.hpp delete mode 100644 src/include/btllib/index_queue.hpp delete mode 100644 src/include/btllib/indexlr.hpp delete mode 100644 src/include/btllib/indexlr2.cpp delete mode 100644 src/include/btllib/indexlr2.hpp delete mode 100644 src/include/btllib/nthash.hpp delete mode 100644 src/include/btllib/order_queue.hpp delete mode 100644 src/include/btllib/rolling_hash.hpp delete mode 100644 src/include/btllib/seq.hpp delete mode 100644 src/include/btllib/seq_reader.hpp delete mode 100644 src/include/btllib/seq_writer.hpp delete mode 100644 src/include/btllib/status.hpp delete mode 100644 src/include/btllib/util.hpp delete mode 100644 src/include/meson.build delete mode 100644 src/include/vendor/cpptoml.hpp diff --git a/Arcs/Makefile.am b/Arcs/Makefile.am index 9db7591..5343b3e 100644 --- a/Arcs/Makefile.am +++ b/Arcs/Makefile.am @@ -1,6 +1,6 @@ bin_PROGRAMS = arcs -arcs_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS) +arcs_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS) -Wno-error=unused-but-set-variable arcs_CPPFLAGS = -I$(top_srcdir)/Arcs \ -I$(top_srcdir)/Common \ diff --git a/Common/Makefile.am b/Common/Makefile.am index 32503c4..7810059 100644 --- a/Common/Makefile.am +++ b/Common/Makefile.am @@ -1,6 +1,6 @@ noinst_LIBRARIES = libcommon.a -libcommon_a_CPPFLAGS = -I$(top_srcdir) +libcommon_a_CPPFLAGS = -I$(top_srcdir) -Wno-error=unused-result libcommon_a_SOURCES = \ BloomFilter.cpp BloomFilter.h \ diff --git a/README.md b/README.md index ae27b75..b93acd7 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,12 @@ Because ARKS is not dependent on read alignments, it is generally much faster th ### Dependencies * Boost (tested on 1.61) -* GCC (5.1+) +* GCC (6+) * Autotools (if cloning directly from repository) * LINKS (tested on 1.8) * Google SparseHash * ABySS (if using long reads) (tested on 2.2.5) +* [btllib](https://github.com/bcgsc/btllib) (1.4.3+) ### Compilation: If cloning directly from the repository run: @@ -41,6 +42,12 @@ If your boost library headers are not in your PATH you can specify their locatio ``` ./configure –-with-boost=/boost/path --prefix=/ARCS/PATH && make install ``` +If you compiled btllib from source (as opposed to installation using conda), you can specify the location of the btllib library files: +``` +export CXXFLAGS+=" -I /path/to/btllib/include" +export LDFLAGS+=" -L /path/to/btllib/install/lib" +./configure && make +``` ### ARCS+LINKS Pipeline diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fff3cd7..e8c8713 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,15 +7,14 @@ jobs: steps: - script: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - - script: conda create --yes --quiet --name arcs_CI displayName: Create Anaconda environment - - script: | - sudo apt-get update -qq - sudo apt-get install -qq libboost-dev libsparsehash-dev automake - displayName: Install C++ modules + source activate arcs_CI + conda install --yes -c conda-forge -c bioconda compilers btllib boost automake sparsehash + displayName: Install dependencies - script: | + source activate arcs_CI ./autogen.sh ./configure make @@ -30,7 +29,9 @@ jobs: pylint makeTSVfile.py cd ../ displayName: Run pylint - - script: make distcheck + - script: | + source activate arcs_CI + make distcheck displayName: Compile ARCS with make distcheck - script: | curl https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - @@ -39,8 +40,32 @@ jobs: sudo apt-get install -y --no-install-recommends clang-format clang-tidy displayName: Install clang-format and clang-tidy - script: | + source activate arcs_CI make -C Arcs clang-format make -C src lint displayName: Run clang-format - - script: make -C src all + - script: | + source activate arcs_CI + make -C src all displayName: Compile long-to-linked-pe + +- job: + displayName: mac-latest + pool: + vmImage: 'macOS-latest' + + steps: + - script: echo "##vso[task.prependpath]$CONDA/bin" + displayName: Add conda to PATH + - script: conda create --yes --quiet --name arcs_CI + displayName: Create Anaconda environment + - script: | + source activate arcs_CI + conda install --yes -c conda-forge -c bioconda compilers btllib boost automake sparsehash + displayName: Install dependencies + - script: | + source activate arcs_CI + ./autogen.sh + ./configure + make + displayName: Compile ARCS diff --git a/src/Makefile.am b/src/Makefile.am index 52dfe9a..a6ba180 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,24 +1,19 @@ bin_PROGRAMS = long-to-linked-pe -long_to_linked_pe_CXXFLAGS = -O3 -Wall -Werror -Wno-unknown-pragmas -std=c++11 -pthread -I$(top_srcdir)/src/include +long_to_linked_pe_CXXFLAGS = -O3 -Wall -Werror -Wno-unknown-pragmas -std=c++17 -pthread -I$(top_srcdir)/src/include -CXXFLAGS = $(long_to_linked_pe_CXXFLAGS) +CXXFLAGS += $(long_to_linked_pe_CXXFLAGS) -long_to_linked_pe_SOURCES = long-to-linked-pe.cpp \ - include/btllib/seq_reader.hpp \ - include/btllib/status.hpp \ - include/btllib/util.hpp \ - include/btllib/cstring.hpp \ - include/btllib/data_stream.hpp \ - include/btllib/order_queue.hpp \ - include/btllib/seq.hpp +long_to_linked_pe_LDADD = -lbtllib + +long_to_linked_pe_SOURCES = long-to-linked-pe.cpp # Check the C++ source code for errors. lint: clang-format clang-tidy # Check the C++ source code for errors with clang-tidy. clang-tidy: - clang-tidy -warnings-as-errors='*' *.cpp -- -std=c++11 -x c++ -Ibtllib/include -Iinclude + clang-tidy -warnings-as-errors='*' *.cpp -- -std=c++11 -x c++ ${CXXFLAGS} # Check the C++ source code for white-space errors with clang-format. clang-format: diff --git a/src/include/btllib/bloom_filter.hpp b/src/include/btllib/bloom_filter.hpp deleted file mode 100644 index 4fd6157..0000000 --- a/src/include/btllib/bloom_filter.hpp +++ /dev/null @@ -1,575 +0,0 @@ -#ifndef BTLLIB_BLOOM_FILTER_HPP -#define BTLLIB_BLOOM_FILTER_HPP - -#include "nthash.hpp" -#include "status.hpp" - -#include "vendor/cpptoml.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -static const unsigned char BIT_MASKS[CHAR_BIT] = { - // NOLINT - 0x01, 0x02, 0x04, 0x08, // NOLINT - 0x10, 0x20, 0x40, 0x80 // NOLINT -}; - -static const char* const BLOOM_FILTER_MAGIC_HEADER = "BTLBloomFilter_v2"; -static const char* const KMER_BLOOM_FILTER_MAGIC_HEADER = - "BTLKmerBloomFilter_v2"; -static const char* const SEED_BLOOM_FILTER_MAGIC_HEADER = - "BTLSeedBloomFilter_v2"; - -inline unsigned -pop_cnt_byte(uint8_t x) -{ - return ((0x876543210 >> // NOLINT - (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2)) >> // NOLINT - ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & // NOLINT - 0xf; // NOLINT -} - -class BloomFilter -{ - -public: - BloomFilter() {} - BloomFilter(size_t bytes, unsigned hash_num); - explicit BloomFilter(const std::string& path); - - ~BloomFilter() { delete[] array; } - - BloomFilter(const BloomFilter&) = delete; - BloomFilter(BloomFilter&&) = delete; - - BloomFilter& operator=(const BloomFilter&) = delete; - BloomFilter& operator=(BloomFilter&&) = delete; - - void insert(const uint64_t* hashes); - void insert(const std::vector& hashes) { insert(hashes.data()); } - - bool contains(const uint64_t* hashes) const; - bool contains(const std::vector& hashes) const - { - return contains(hashes.data()); - } - - size_t get_bytes() const { return bytes; } - uint64_t get_pop_cnt() const; - double get_occupancy() const; - unsigned get_hash_num() const { return hash_num; } - double get_fpr() const; - - static std::shared_ptr parse_header( - std::ifstream& file, - const std::string& magic_string); - - void write(const std::string& path); - -private: - friend class KmerBloomFilter; - friend class SeedBloomFilter; - - std::atomic* array = nullptr; - size_t bytes = 0; - size_t array_size = - 0; // Should be equal to bytes, but not guaranteed by standard - size_t array_bits = 0; - unsigned hash_num = 0; -}; - -/** - * Bloom filter data structure that kmerizes and hashes given sequences, - * storing the results. - */ -class KmerBloomFilter -{ - -public: - KmerBloomFilter() {} - /** - * Constructor. - * @param k kmer size - * @param bytes bytes to allocate for the filter - * @param hash_num number of hashes - */ - KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k); - explicit KmerBloomFilter(const std::string& path); - - KmerBloomFilter(const KmerBloomFilter&) = delete; - KmerBloomFilter(KmerBloomFilter&&) = delete; - - KmerBloomFilter& operator=(const KmerBloomFilter&) = delete; - KmerBloomFilter& operator=(KmerBloomFilter&&) = delete; - - /** - * Store the kmers of a sequence. - * @param seq sequence to kmerize - * @param seq_len length of seq - */ - void insert(const char* seq, size_t seq_len); - - /** - * Store the kmers of a sequence. - * @param seq sequence to kmerize - */ - void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); } - - /** - * Query the kmers of a sequence. - * @param seq sequence to kmerize - * @param seq_len length of seq - * - * @return number of kmers found in seq - */ - unsigned contains(const char* seq, size_t seq_len) const; - - /** - * Query the kmers of a sequence. - * @param seq sequence to kmerize - * - * @return number of kmers found in seq - */ - unsigned contains(const std::string& seq) const - { - return contains(seq.c_str(), seq.size()); - } - - bool contains(const uint64_t* hashes) const - { - return bloom_filter.contains(hashes); - } - bool contains(const std::vector& hashes) const - { - return bloom_filter.contains(hashes); - } - - size_t get_bytes() const { return bloom_filter.get_bytes(); } - uint64_t get_pop_cnt() const { return bloom_filter.get_pop_cnt(); } - double get_occupancy() const { return bloom_filter.get_occupancy(); } - unsigned get_hash_num() const { return bloom_filter.get_hash_num(); } - double get_fpr() const { return bloom_filter.get_fpr(); } - unsigned get_k() const { return k; } - BloomFilter& get_bloom_filter() { return bloom_filter; } - - void write(const std::string& path); - -private: - friend class SeedBloomFilter; - - BloomFilter bloom_filter; - unsigned k = 0; -}; - -class SeedBloomFilter -{ - -public: - SeedBloomFilter() {} - SeedBloomFilter(size_t bytes, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed); - explicit SeedBloomFilter(const std::string& path); - - SeedBloomFilter(const SeedBloomFilter&) = delete; - SeedBloomFilter(SeedBloomFilter&&) = delete; - - SeedBloomFilter& operator=(const SeedBloomFilter&) = delete; - SeedBloomFilter& operator=(SeedBloomFilter&&) = delete; - - void insert(const char* seq, size_t seq_len); - void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); } - - std::vector> contains(const char* seq, - size_t seq_len) const; - std::vector> contains(const std::string& seq) const - { - return contains(seq.c_str(), seq.size()); - } - - bool contains(const uint64_t* hashes) const - { - return kmer_bloom_filter.contains(hashes); - } - bool contains(const std::vector& hashes) const - { - return kmer_bloom_filter.contains(hashes); - } - - size_t get_bytes() const { return kmer_bloom_filter.get_bytes(); } - uint64_t get_pop_cnt() const { return kmer_bloom_filter.get_pop_cnt(); } - double get_occupancy() const { return kmer_bloom_filter.get_occupancy(); } - unsigned get_hash_num() const - { - return get_hash_num_per_seed() * get_seeds().size(); - } - double get_fpr() const; - unsigned get_k() const { return kmer_bloom_filter.get_k(); } - const std::vector& get_seeds() const { return seeds; } - const std::vector& get_parsed_seeds() const - { - return parsed_seeds; - } - unsigned get_hash_num_per_seed() const - { - return kmer_bloom_filter.get_hash_num(); - } - KmerBloomFilter& get_kmer_bloom_filter() { return kmer_bloom_filter; } - - void write(const std::string& path); - -private: - KmerBloomFilter kmer_bloom_filter; - std::vector seeds; - std::vector parsed_seeds; -}; - -inline BloomFilter::BloomFilter(size_t bytes, unsigned hash_num) - : bytes(std::ceil(bytes / sizeof(uint64_t)) * sizeof(uint64_t)) - , array_size(get_bytes() / sizeof(array[0])) - , array_bits(array_size * CHAR_BIT) - , hash_num(hash_num) -{ - check_warning( - sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. BloomFilter will have less than " + - std::to_string(bytes) + " for bit array."); - array = new std::atomic[array_size]; - std::memset((void*)array, 0, array_size * sizeof(array[0])); -} - -inline void -BloomFilter::insert(const uint64_t* hashes) -{ - for (unsigned i = 0; i < hash_num; ++i) { - const auto normalized = hashes[i] % array_bits; - array[normalized / CHAR_BIT] |= BIT_MASKS[normalized % CHAR_BIT]; - } -} - -inline bool -BloomFilter::contains(const uint64_t* hashes) const -{ - for (unsigned i = 0; i < hash_num; ++i) { - const auto normalized = hashes[i] % array_bits; - const auto mask = BIT_MASKS[normalized % CHAR_BIT]; - if (!bool(array[normalized / CHAR_BIT] & mask)) { - return false; - } - } - return true; -} - -inline uint64_t -BloomFilter::get_pop_cnt() const -{ - uint64_t pop_cnt = 0; -#pragma omp parallel for reduction(+ : pop_cnt) - for (size_t i = 0; i < array_size; ++i) { - pop_cnt += pop_cnt_byte(array[i]); - } - return pop_cnt; -} - -inline double -BloomFilter::get_occupancy() const -{ - return double(get_pop_cnt()) / double(array_bits); -} - -inline double -BloomFilter::get_fpr() const -{ - return std::pow(get_occupancy(), double(hash_num)); -} - -inline std::shared_ptr -BloomFilter::parse_header(std::ifstream& file, const std::string& magic_string) -{ - const std::string magic_with_brackets = std::string("[") + magic_string + "]"; - - std::string line; - std::getline(file, line); - if (line != magic_with_brackets) { - log_error( - std::string("Magic string does not match (likely version mismatch)\n") + - "File magic string:\t" + line + "\n" + "Loader magic string:\t" + - magic_with_brackets); - std::exit(EXIT_FAILURE); - } - - /* Read bloom filter line by line until it sees "[HeaderEnd]" - which is used to mark the end of the header section and - assigns the header to a char array*/ - std::string toml_buffer(line + '\n'); - bool header_end_found = false; - while (bool(std::getline(file, line))) { - toml_buffer.append(line + '\n'); - if (line == "[HeaderEnd]") { - header_end_found = true; - break; - } - } - if (!header_end_found) { - log_error("Pre-built bloom filter does not have the correct header end."); - std::exit(EXIT_FAILURE); - } - - // Send the char array to a stringstream for the cpptoml parser to parse - std::istringstream toml_stream(toml_buffer); - cpptoml::parser toml_parser(toml_stream); - const auto header_config = toml_parser.parse(); - - // Obtain header values from toml parser and assign them to class members - return header_config->get_table(magic_string); -} - -inline BloomFilter::BloomFilter(const std::string& path) -{ - std::ifstream file(path); - - auto table = parse_header(file, BLOOM_FILTER_MAGIC_HEADER); - bytes = *(table->get_as("bytes")); - check_warning( - sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. BloomFilter will have less than " + - std::to_string(bytes) + " for bit array."); - array_size = bytes / sizeof(std::atomic); - array_bits = array_size * CHAR_BIT; - hash_num = *(table->get_as("hash_num")); - - array = new std::atomic[array_size]; - file.read((char*)array, array_size * sizeof(array[0])); -} - -inline void -BloomFilter::write(const std::string& path) -{ - std::ofstream file(path.c_str(), std::ios::out | std::ios::binary); - - /* Initialize cpptoml root table - Note: Tables and fields are unordered - Ordering of table is maintained by directing the table - to the output stream immediately after completion */ - auto root = cpptoml::make_table(); - - /* Initialize bloom filter section and insert fields - and output to ostream */ - auto header = cpptoml::make_table(); - header->insert("bytes", get_bytes()); - header->insert("hash_num", get_hash_num()); - root->insert(BLOOM_FILTER_MAGIC_HEADER, header); - file << *root << "[HeaderEnd]\n"; - - file.write((char*)array, array_size * sizeof(array[0])); -} - -inline KmerBloomFilter::KmerBloomFilter(size_t bytes, - unsigned hash_num, - unsigned k) - : bloom_filter(bytes, hash_num) - , k(k) -{} - -inline void -KmerBloomFilter::insert(const char* seq, size_t seq_len) -{ - NtHash nthash(seq, seq_len, get_k(), get_hash_num()); - while (nthash.roll()) { - bloom_filter.insert(nthash.hashes()); - } -} - -inline unsigned -KmerBloomFilter::contains(const char* seq, size_t seq_len) const -{ - unsigned count = 0; - NtHash nthash(seq, seq_len, get_k(), get_hash_num()); - while (nthash.roll()) { - if (bloom_filter.contains(nthash.hashes())) { - count++; - } - } - return count; -} - -inline KmerBloomFilter::KmerBloomFilter(const std::string& path) -{ - std::ifstream file(path); - - auto table = bloom_filter.parse_header(file, KMER_BLOOM_FILTER_MAGIC_HEADER); - bloom_filter.bytes = *(table->get_as("bytes")); - check_warning( - sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. BloomFilter will have less than " + - std::to_string(get_bytes()) + " for bit array."); - bloom_filter.array_size = get_bytes() / sizeof(bloom_filter.array[0]); - bloom_filter.array_bits = bloom_filter.array_size * CHAR_BIT; - bloom_filter.hash_num = - *(table->get_as("hash_num")); - k = *(table->get_as("k")); - - bloom_filter.array = new std::atomic[bloom_filter.array_size]; - file.read((char*)bloom_filter.array, - bloom_filter.array_size * sizeof(bloom_filter.array[0])); -} - -inline void -KmerBloomFilter::write(const std::string& path) -{ - std::ofstream file(path.c_str(), std::ios::out | std::ios::binary); - - /* Initialize cpptoml root table - Note: Tables and fields are unordered - Ordering of table is maintained by directing the table - to the output stream immediately after completion */ - auto root = cpptoml::make_table(); - - /* Initialize bloom filter section and insert fields - and output to ostream */ - auto header = cpptoml::make_table(); - header->insert("bytes", get_bytes()); - header->insert("hash_num", get_hash_num()); - header->insert("k", get_k()); - root->insert(KMER_BLOOM_FILTER_MAGIC_HEADER, header); - file << *root << "[HeaderEnd]\n"; - - file.write((char*)bloom_filter.array, - bloom_filter.array_size * sizeof(bloom_filter.array[0])); -} - -inline SeedBloomFilter::SeedBloomFilter(size_t bytes, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed) - : kmer_bloom_filter(bytes, hash_num_per_seed, k) - , seeds(seeds) - , parsed_seeds(parse_seeds(seeds)) -{ - for (const auto& seed : seeds) { - check_error(k != seed.size(), - "SeedBloomFilter: passed k (" + std::to_string(k) + - ") not equal to passed spaced seed size (" + - std::to_string(seed.size()) + ")"); - } -} - -inline void -SeedBloomFilter::insert(const char* seq, size_t seq_len) -{ - SeedNtHash nthash( - seq, seq_len, get_k(), parsed_seeds, get_hash_num_per_seed()); - while (nthash.roll()) { - for (size_t s = 0; s < seeds.size(); s++) { - kmer_bloom_filter.bloom_filter.insert(nthash.hashes() + - s * get_hash_num_per_seed()); - } - } -} - -inline std::vector> -SeedBloomFilter::contains(const char* seq, size_t seq_len) const -{ - std::vector> hit_seeds; - SeedNtHash nthash( - seq, seq_len, get_k(), parsed_seeds, get_hash_num_per_seed()); - while (nthash.roll()) { - hit_seeds.emplace_back(); - for (size_t s = 0; s < seeds.size(); s++) { - if (kmer_bloom_filter.bloom_filter.contains( - nthash.hashes() + s * get_hash_num_per_seed())) { - hit_seeds.back().push_back(s); - } - } - } - return hit_seeds; -} - -inline double -SeedBloomFilter::get_fpr() const -{ - const double single_seed_fpr = - std::pow(get_occupancy(), get_hash_num_per_seed()); - return 1 - std::pow(1 - single_seed_fpr, seeds.size()); -} - -inline SeedBloomFilter::SeedBloomFilter(const std::string& path) -{ - std::ifstream file(path); - - auto table = kmer_bloom_filter.bloom_filter.parse_header( - file, SEED_BLOOM_FILTER_MAGIC_HEADER); - kmer_bloom_filter.bloom_filter.bytes = - *(table->get_as("bytes")); - check_warning( - sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. BloomFilter will have less than " + - std::to_string(get_bytes()) + " for bit array."); - kmer_bloom_filter.bloom_filter.array_size = - get_bytes() / sizeof(kmer_bloom_filter.bloom_filter.array[0]); - kmer_bloom_filter.bloom_filter.array_bits = - kmer_bloom_filter.bloom_filter.array_size * CHAR_BIT; - kmer_bloom_filter.bloom_filter.hash_num = - *(table->get_as( - "hash_num_per_seed")); - const auto hash_num = - *(table->get_as( - "hash_num")); - kmer_bloom_filter.k = *(table->get_as("k")); - seeds = *(table->get_array_of("seeds")); - parsed_seeds = parse_seeds(seeds); - check_error(hash_num != get_hash_num_per_seed() * seeds.size(), - "SeedBloomFilter: hash_num, hash_num_per_seed, or number of " - "seeds is wrong."); - - kmer_bloom_filter.bloom_filter.array = - new std::atomic[kmer_bloom_filter.bloom_filter.array_size]; - file.read((char*)kmer_bloom_filter.bloom_filter.array, - kmer_bloom_filter.bloom_filter.array_size * - sizeof(kmer_bloom_filter.bloom_filter.array[0])); -} - -inline void -SeedBloomFilter::write(const std::string& path) -{ - std::ofstream file(path.c_str(), std::ios::out | std::ios::binary); - - /* Initialize cpptoml root table - Note: Tables and fields are unordered - Ordering of table is maintained by directing the table - to the output stream immediately after completion */ - auto root = cpptoml::make_table(); - - /* Initialize bloom filter section and insert fields - and output to ostream */ - auto header = cpptoml::make_table(); - header->insert("bytes", get_bytes()); - header->insert("hash_num", get_hash_num()); - header->insert("hash_num_per_seed", get_hash_num_per_seed()); - header->insert("k", get_k()); - auto seeds_array = cpptoml::make_array(); - for (const auto& seed : seeds) { - seeds_array->push_back(seed); - } - header->insert("seeds", seeds_array); - root->insert(SEED_BLOOM_FILTER_MAGIC_HEADER, header); - file << *root << "[HeaderEnd]\n"; - - file.write((char*)kmer_bloom_filter.bloom_filter.array, - kmer_bloom_filter.bloom_filter.array_size * - sizeof(kmer_bloom_filter.bloom_filter.array[0])); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/counting_bloom_filter.hpp b/src/include/btllib/counting_bloom_filter.hpp deleted file mode 100644 index 4d920b0..0000000 --- a/src/include/btllib/counting_bloom_filter.hpp +++ /dev/null @@ -1,354 +0,0 @@ -#ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP -#define BTLLIB_COUNTING_BLOOM_FILTER_HPP - -#include "bloom_filter.hpp" -#include "nthash.hpp" -#include "status.hpp" - -#include "vendor/cpptoml.hpp" - -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -static const char* const COUNTING_BLOOM_FILTER_MAGIC_HEADER = - "BTLCountingBloomFilter_v2"; -static const char* const KMER_COUNTING_BLOOM_FILTER_MAGIC_HEADER = - "BTLKmerCountingBloomFilter_v2"; - -template -class KmerCountingBloomFilter; - -template -class CountingBloomFilter -{ - -public: - CountingBloomFilter() {} - CountingBloomFilter(size_t bytes, unsigned hash_num); - explicit CountingBloomFilter(const std::string& path); - - ~CountingBloomFilter() { delete[] array; } - - CountingBloomFilter(const CountingBloomFilter&) = delete; - CountingBloomFilter(CountingBloomFilter&&) = delete; - - CountingBloomFilter& operator=(const CountingBloomFilter&) = delete; - CountingBloomFilter& operator=(CountingBloomFilter&&) = delete; - - void insert(const uint64_t* hashes); - void insert(const std::vector& hashes) { insert(hashes.data()); } - - T contains(const uint64_t* hashes) const; - T contains(const std::vector& hashes) const - { - return contains(hashes.data()); - } - - size_t get_bytes() const { return bytes; } - uint64_t get_pop_cnt() const; - double get_occupancy() const; - unsigned get_hash_num() const { return hash_num; } - double get_fpr() const; - - /** - * Write bloom filter data to a file - * @param path output filepath - */ - void write(const std::string& path); - -private: - friend class KmerCountingBloomFilter; - - std::atomic* array = nullptr; - size_t bytes = 0; - size_t array_size = 0; - unsigned hash_num = 0; -}; - -template -class KmerCountingBloomFilter -{ - -public: - KmerCountingBloomFilter(size_t bytes, unsigned hash_num, unsigned k); - explicit KmerCountingBloomFilter(const std::string& path); - - KmerCountingBloomFilter(const KmerCountingBloomFilter&) = delete; - KmerCountingBloomFilter(KmerCountingBloomFilter&&) = delete; - - KmerCountingBloomFilter& operator=(const KmerCountingBloomFilter&) = delete; - KmerCountingBloomFilter& operator=(KmerCountingBloomFilter&&) = delete; - - void insert(const char* seq, size_t seq_len); - void insert(const std::string& seq) { insert(seq.c_str(), seq.size()); } - - uint64_t contains(const char* seq, size_t seq_len) const; - uint64_t contains(const std::string& seq) const - { - return contains(seq.c_str(), seq.size()); - } - - T contains(const uint64_t* hashes) const - { - counting_bloom_filter.contains(hashes); - } - T contains(const std::vector& hashes) const - { - counting_bloom_filter.contains(hashes); - } - - size_t get_bytes() const { return counting_bloom_filter.get_bytes(); } - uint64_t get_pop_cnt() const { return counting_bloom_filter.get_pop_cnt(); } - double get_occupancy() const { return counting_bloom_filter.get_occupancy(); } - unsigned get_hash_num() const { return counting_bloom_filter.get_hash_num(); } - double get_fpr() const { return counting_bloom_filter.get_fpr(); } - unsigned get_k() const { return k; } - - void write(const std::string& path); - -private: - CountingBloomFilter counting_bloom_filter; - unsigned k; -}; - -using CountingBloomFilter8 = CountingBloomFilter; -using CountingBloomFilter16 = CountingBloomFilter; -using CountingBloomFilter32 = CountingBloomFilter; - -using KmerCountingBloomFilter8 = KmerCountingBloomFilter; -using KmerCountingBloomFilter16 = KmerCountingBloomFilter; -using KmerCountingBloomFilter32 = KmerCountingBloomFilter; - -template -inline CountingBloomFilter::CountingBloomFilter(size_t bytes, - unsigned hash_num) - : bytes(std::ceil(bytes / sizeof(uint64_t)) * sizeof(uint64_t)) - , array_size(get_bytes() / sizeof(array[0])) - , hash_num(hash_num) -{ - check_warning(sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. CountingBloomFilter will " - "have less than " + - std::to_string(bytes) + " for bit array."); - array = new std::atomic[array_size]; - std::memset((void*)array, 0, array_size * sizeof(array[0])); -} - -template -inline void -CountingBloomFilter::insert(const uint64_t* hashes) -{ - // Update flag to track if increment is done on at least one counter - bool update_done = false; - T new_val; - T min_val = contains(hashes); - while (!update_done) { - // Simple check to deal with overflow - new_val = min_val + 1; - if (min_val > new_val) { - return; - } - for (size_t i = 0; i < hash_num; ++i) { - decltype(min_val) temp_min_val = min_val; - if (array[hashes[i] % array_size].compare_exchange_strong(temp_min_val, - new_val)) { - update_done = true; - } - } - // Recalculate minval because if increment fails, it needs a new minval to - // use and if it doesnt hava a new one, the while loop runs forever. - if (!update_done) { - min_val = contains(hashes); - } - } -} - -template -inline T -CountingBloomFilter::contains(const uint64_t* hashes) const -{ - T min = array[hashes[0] % array_size]; - for (size_t i = 1; i < hash_num; ++i) { - const size_t idx = hashes[i] % array_size; - if (array[idx] < min) { - min = array[idx]; - } - } - return min; -} - -template -inline uint64_t -CountingBloomFilter::get_pop_cnt() const -{ - uint64_t pop_cnt = 0; -#pragma omp parallel for reduction(+ : pop_cnt) - for (size_t i = 0; i < array_size; ++i) { - if (array[i] > 0) { - ++pop_cnt; - } - } - return pop_cnt; -} - -template -inline double -CountingBloomFilter::get_occupancy() const -{ - return double(get_pop_cnt()) / double(array_size); -} - -template -inline double -CountingBloomFilter::get_fpr() const -{ - return std::pow(get_occupancy(), double(hash_num)); -} - -template -inline CountingBloomFilter::CountingBloomFilter(const std::string& path) -{ - std::ifstream file(path); - - auto table = - BloomFilter::parse_header(file, COUNTING_BLOOM_FILTER_MAGIC_HEADER); - bytes = *table->get_as("bytes"); - check_warning(sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. CountingBloomFilter will " - "have less than " + - std::to_string(bytes) + " for bit array."); - array_size = bytes / sizeof(array[0]); - hash_num = *table->get_as("hash_num"); - check_error( - sizeof(array[0]) * CHAR_BIT != *table->get_as("counter_bits"), - "CountingBloomFilter" + std::to_string(sizeof(array[0]) * CHAR_BIT) + - " tried to load a file of CountingBloomFilter" + - std::to_string(*table->get_as("counter_bits"))); - - array = new std::atomic[array_size]; - file.read((char*)array, array_size * sizeof(array[0])); -} - -template -inline void -CountingBloomFilter::write(const std::string& path) -{ - std::ofstream file(path.c_str(), std::ios::out | std::ios::binary); - - /* Initialize cpptoml root table - Note: Tables and fields are unordered - Ordering of table is maintained by directing the table - to the output stream immediately after completion */ - auto root = cpptoml::make_table(); - - /* Initialize bloom filter section and insert fields - and output to ostream */ - auto header = cpptoml::make_table(); - header->insert("bytes", get_bytes()); - header->insert("hash_num", get_hash_num()); - header->insert("counter_bits", size_t(sizeof(array[0]) * CHAR_BIT)); - root->insert(COUNTING_BLOOM_FILTER_MAGIC_HEADER, header); - file << *root << "[HeaderEnd]\n"; - - file.write((char*)array, array_size * sizeof(array[0])); -} - -template -inline KmerCountingBloomFilter::KmerCountingBloomFilter(size_t bytes, - unsigned hash_num, - unsigned k) - : counting_bloom_filter(bytes, hash_num) - , k(k) -{} - -template -inline void -KmerCountingBloomFilter::insert(const char* seq, size_t seq_len) -{ - NtHash nthash(seq, seq_len, get_k(), get_hash_num()); - while (nthash.roll()) { - counting_bloom_filter.insert(nthash.hashes()); - } -} - -template -inline uint64_t -KmerCountingBloomFilter::contains(const char* seq, size_t seq_len) const -{ - uint64_t count = 0; - NtHash nthash(seq, seq_len, get_k(), get_hash_num()); - while (nthash.roll()) { - count += counting_bloom_filter.contains(nthash.hashes()); - } - return count; -} - -template -inline KmerCountingBloomFilter::KmerCountingBloomFilter( - const std::string& path) -{ - std::ifstream file(path); - - auto table = - BloomFilter::parse_header(file, KMER_COUNTING_BLOOM_FILTER_MAGIC_HEADER); - counting_bloom_filter.bytes = - *table->get_as("bytes"); - check_warning(sizeof(uint8_t) != sizeof(std::atomic), - "Atomic primitives take extra memory. CountingBloomFilter will " - "have less than " + - std::to_string(get_bytes()) + " for bit array."); - counting_bloom_filter.array_size = - get_bytes() / sizeof(counting_bloom_filter.array[0]); - counting_bloom_filter.hash_num = - *table->get_as("hash_num"); - k = *table->get_as("k"); - check_error(sizeof(T) * CHAR_BIT != *table->get_as("counter_bits"), - "CountingBloomFilter" + std::to_string(sizeof(T) * CHAR_BIT) + - " tried to load a file of CountingBloomFilter" + - std::to_string(*table->get_as("counter_bits"))); - - counting_bloom_filter.array = - new std::atomic[counting_bloom_filter.array_size]; - file.read((char*)counting_bloom_filter.array, - counting_bloom_filter.array_size * - sizeof(counting_bloom_filter.array[0])); -} - -template -inline void -KmerCountingBloomFilter::write(const std::string& path) -{ - std::ofstream file(path.c_str(), std::ios::out | std::ios::binary); - - /* Initialize cpptoml root table - Note: Tables and fields are unordered - Ordering of table is maintained by directing the table - to the output stream immediately after completion */ - auto root = cpptoml::make_table(); - - /* Initialize bloom filter section and insert fields - and output to ostream */ - auto header = cpptoml::make_table(); - header->insert("bytes", get_bytes()); - header->insert("hash_num", get_hash_num()); - header->insert("counter_bits", - size_t(sizeof(counting_bloom_filter.array[0]) * CHAR_BIT)); - header->insert("k", k); - root->insert(KMER_COUNTING_BLOOM_FILTER_MAGIC_HEADER, header); - file << *root << "[HeaderEnd]\n"; - - file.write((char*)counting_bloom_filter.array, - counting_bloom_filter.array_size * - sizeof(counting_bloom_filter.array[0])); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/cstring.hpp b/src/include/btllib/cstring.hpp deleted file mode 100644 index a9ec7fc..0000000 --- a/src/include/btllib/cstring.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef BTLLIB_CSTRING_HPP -#define BTLLIB_CSTRING_HPP - -#include -#include -#include - -namespace btllib { - -struct CString -{ - static const size_t CSTRING_DEFAULT_CAP = 4096; - - CString() { s[0] = '\0'; } - - CString(const CString& cstring) - { - if (cstring.s_size > s_cap) { - s = (char*)std::realloc((char*)s, cstring.s_size); // NOLINT - s_cap = cstring.s_size; - } - s_size = cstring.s_size; - memcpy(s, cstring.s, s_size); - } - - CString(CString&& cstring) noexcept - { - std::swap(s, cstring.s); - s_size = cstring.s_size; - cstring.clear(); - std::swap(s_cap, cstring.s_cap); - } - - CString(const std::string& str) - { - if (str.size() + 1 > s_cap) { - s_cap = str.size() + 1; - s = (char*)std::realloc((char*)s, s_cap); // NOLINT - } - s_size = str.size(); - memcpy(s, str.c_str(), s_size + 1); - } - - CString& operator=(const CString& cstring) - { - if (this == &cstring) { - return *this; - } - if (cstring.s_size > s_cap) { - s = (char*)std::realloc((char*)s, cstring.s_size); // NOLINT - s_cap = cstring.s_size; - } - s_size = cstring.s_size; - memcpy(s, cstring.s, s_size); - return *this; - } - - CString& operator=(CString&& cstring) noexcept - { - std::swap(s, cstring.s); - s_size = cstring.s_size; - cstring.clear(); - std::swap(s_cap, cstring.s_cap); - return *this; - } - - CString& operator=(const std::string& str) - { - if (str.size() + 1 > s_cap) { - s_cap = str.size() + 1; - s = (char*)std::realloc((char*)s, s_cap); // NOLINT - } - s_size = str.size(); - memcpy(s, str.c_str(), s_size + 1); - return *this; - } - - ~CString() { free(s); } // NOLINT - - void clear() - { - s[0] = '\0'; - s_size = 0; - } - bool empty() const { return (ssize_t)s_size <= 0; } - size_t size() const { return s_size; } - - operator char*() const { return s; } - - char* s = (char*)std::malloc(CSTRING_DEFAULT_CAP); // NOLINT - size_t s_size = 0; - size_t s_cap = CSTRING_DEFAULT_CAP; -}; - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/data_saveload.hpp b/src/include/btllib/data_saveload.hpp deleted file mode 100644 index 2d5845e..0000000 --- a/src/include/btllib/data_saveload.hpp +++ /dev/null @@ -1,547 +0,0 @@ -#ifndef BTLLIB_DATA_SAVELOAD_HPP -#define BTLLIB_DATA_SAVELOAD_HPP - -#include "status.hpp" -#include "util.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace btllib { - -enum SaveloadOp -{ - READ, - WRITE, - APPEND -}; - -class _Pipeline -{ - -public: - _Pipeline() {} - - _Pipeline(FILE* file, pid_t pid_first, pid_t pid_last) - : file(file) - , pid_first(pid_first) - , pid_last(pid_last) - {} - - FILE* file = nullptr; - pid_t pid_first = -1; - pid_t pid_last = -1; -}; - -inline std::string -get_saveload_cmd(const std::string& path, SaveloadOp op); -inline _Pipeline -run_saveload_cmd(const std::string& cmd, SaveloadOp op); - -class DataSource -{ -public: - DataSource(const std::string& source) - { - if (source == "-") { - pipeline.file = stdin; - pipeline.pid_first = -1; - pipeline.pid_last = -1; - } else { - const auto cmd = get_saveload_cmd(source, READ); - check_error(cmd.empty(), "Error loading from " + source); - pipeline = run_saveload_cmd(cmd, READ); - } - } - - ~DataSource() { close(); } - - void close() - { - if (!closed) { - if (pipeline.file != stdin) { - int status; - kill(pipeline.pid_first, SIGTERM); - waitpid(pipeline.pid_last, &status, 0); - std::fclose(pipeline.file); - } - closed = true; - } - } - - FILE* operator*() { return pipeline.file; } - FILE* operator->() { return pipeline.file; } - operator FILE*() { return pipeline.file; } - - _Pipeline pipeline; - bool closed = false; -}; - -class DataSink -{ -public: - DataSink(const std::string& sink, bool append) - { - if (sink == "-") { - pipeline.file = stdout; - pipeline.pid_first = -1; - pipeline.pid_last = -1; - } else { - const auto cmd = get_saveload_cmd(sink, append ? APPEND : WRITE); - check_error(cmd.empty(), "Error saving to " + sink); - pipeline = run_saveload_cmd(cmd, append ? APPEND : WRITE); - } - } - - ~DataSink() { close(); } - - void close() - { - if (!closed) { - if (pipeline.file != stdout) { - fclose(pipeline.file); - int status; - waitpid(pipeline.pid_last, &status, 0); - } - closed = true; - } - } - - FILE* operator*() { return pipeline.file; } - FILE* operator->() { return pipeline.file; } - operator FILE*() { return pipeline.file; } - - _Pipeline pipeline; - bool closed = false; -}; - -/** SIGCHLD handler. Reap child processes and report an error if any - * fail. */ -inline void -sigchld_handler(const int sig) -{ - assert(sig == SIGCHLD); - (void)sig; - - pid_t pid; - int status; - while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { - if (status != 0) { - if (WIFEXITED(status)) { // NOLINT - std::cerr << "PID " << pid << " exited with status " - << WEXITSTATUS(status) << std::endl; // NOLINT - } else if (WIFSIGNALED(status)) { // NOLINT - if (WTERMSIG(status) == SIGTERM) { - return; - } - std::cerr << "PID " << pid << " killed by signal " - << WTERMSIG(status) // NOLINT - << std::endl; - } else { - std::cerr << "PID " << pid << " exited with code " << status - << std::endl; - } - std::exit(EXIT_FAILURE); - } - } - if (pid == -1 && errno != ECHILD) { - std::perror("waitpid"); - std::exit(EXIT_FAILURE); - } -} - -bool -data_saveload_init(); -static const bool data_saveload_initialized = data_saveload_init(); - -inline bool -data_saveload_init() -{ - (void)data_saveload_initialized; - struct sigaction action; // NOLINT - action.sa_handler = sigchld_handler; - sigemptyset(&action.sa_mask); - action.sa_flags = SA_RESTART; - sigaction(SIGCHLD, &action, nullptr); - return true; -} - -inline std::string -get_saveload_cmd(const std::string& path, SaveloadOp op) -{ - struct Datatype - { - std::vector prefixes; - std::vector suffixes; - std::vector cmds_check_existence; - std::vector read_cmds; - std::vector write_cmds; - std::vector append_cmds; - }; - - // clang-format off - static const Datatype DATATYPES[]{ - { { "http://", "https://", "ftp://" }, {}, { "which wget" }, { "wget -O-" }, { "" }, { "" } }, - { {}, { ".url" }, { "which wget" }, { "wget -O- -i" }, { "" }, { "" } }, - { {}, { ".ar" }, { "which ar" }, { "ar -p" }, { "" }, { "" } }, - { {}, { ".tar" }, { "which tar" }, { "tar -xOf" }, { "" }, { "" } }, - { {}, { ".tgz" }, { "which tar" }, { "tar -zxOf" }, { "" }, { "" } }, - { {}, { ".gz", ".z" }, { "which pigz", "which gzip" }, { "pigz -dc", "gzip -dc" }, { "pigz >", "gzip >" }, { "pigz >>", "gzip >>" } }, - { {}, { ".bz2" }, { "which bzip2" }, { "bunzip2 -dc" }, { "bzip2 >" }, { "bzip2 >>" } }, - { {}, { ".xz" }, { "which xz" }, { "unxz -dc" }, { "xz -T0 >" }, { "xz -T0 >>" } }, - { {}, { ".7z" }, { "which 7z" }, { "7z -so e" }, { "7z -si a" }, { "7z -si a" } }, - { {}, { ".zip" }, { "which zip" }, { "unzip -p" }, { "" }, { "" } }, - { {}, { ".bam", ".cram" }, { "which samtools" }, { "samtools view -h" }, { "samtools -Sb - >" }, { "samtools -Sb - >>" } }, - }; - // clang-format on - std::string default_cmd = "cat"; - if (op == WRITE) { - default_cmd += " >"; - } else if (op == APPEND) { - default_cmd += " >>"; - } - - std::string path_trimmed = path; - std::vector cmd_layers; - for (;;) { - bool found_datatype = false; - for (const auto& datatype : DATATYPES) { - size_t trim_start = 0, trim_end = 0; - bool this_datatype = false; - for (const auto& prefix : datatype.prefixes) { - if (starts_with(path_trimmed, prefix)) { - this_datatype = true; - trim_start += prefix.size(); - break; - } - } - for (const auto& suffix : datatype.suffixes) { - if (ends_with(path_trimmed, suffix)) { - this_datatype = true; - trim_end += suffix.size(); - break; - } - } - - if (this_datatype) { - found_datatype = true; - bool found_cmd = false; - int cmd_idx = 0; - for (const auto& existence_cmd : datatype.cmds_check_existence) { - bool good = true; - auto sub_cmds = split(existence_cmd, "&&"); - std::for_each(sub_cmds.begin(), sub_cmds.end(), trim); - for (const auto& sub_cmd : sub_cmds) { - auto args = split(sub_cmd, " "); - std::for_each(args.begin(), args.end(), trim); - - pid_t pid = fork(); - if (pid == 0) { - int null_fd = open("/dev/null", O_WRONLY, 0); - dup2(null_fd, STDOUT_FILENO); - dup2(null_fd, STDERR_FILENO); - close(null_fd); - - switch (args.size()) { - case 1: - execlp(args[0].c_str(), args[0].c_str(), NULL); - // fall through - case 2: - execlp( - args[0].c_str(), args[0].c_str(), args[1].c_str(), NULL); - // fall through - case 3: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - NULL); - // fall through - case 4: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - args[3].c_str(), - NULL); - // fall through - default: - log_error("Invalid number of arguments supplied to execlp (" + - std::to_string(args.size()) + ")."); - std::exit(EXIT_FAILURE); - } - log_error("execlp failed."); - std::exit(EXIT_FAILURE); - } else { - check_error(pid == -1, "Error on fork."); - int status; - waitpid(pid, &status, 0); - if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { // NOLINT - good = false; - break; - } - } - } - if (good) { - found_cmd = true; - break; - } - cmd_idx++; - } - - if (found_cmd) { - std::string cmd; - switch (op) { - case READ: - cmd = datatype.read_cmds[cmd_idx]; - break; - case WRITE: - cmd = datatype.write_cmds[cmd_idx]; - break; - case APPEND: - cmd = datatype.append_cmds[cmd_idx]; - break; - } - if (cmd.empty()) { - log_warning("Filetype recognized for '" + path + - "', but no tool available to work with it."); - } else { - cmd_layers.push_back(cmd); - } - } else { - log_warning("Filetype recognized for '" + path + - "', but no tool available to work with it."); - } - path_trimmed.erase(0, trim_start); - path_trimmed.erase(path_trimmed.size() - trim_end); - } - } - if (!found_datatype) { - break; - } - } - if (cmd_layers.empty()) { - cmd_layers.push_back(default_cmd); - } - if (op == WRITE || op == APPEND) { - std::reverse(cmd_layers.begin(), cmd_layers.end()); - } - - std::string result_cmd; - for (size_t i = 0; i < cmd_layers.size(); i++) { - auto& cmd = cmd_layers[i]; - if (op == WRITE || op == APPEND) { - if (i == cmd_layers.size() - 1) { - if (cmd.back() == '>') { - cmd += path; - } else { - cmd += " "; - cmd += path; - } - } else { - if (cmd.back() == '>') { - while (cmd.back() == '>' || cmd.back() == ' ') { - cmd.pop_back(); - } - } else { - cmd += " -"; - } - } - } else { - if (i == 0) { - cmd += " "; - cmd += path; - } else { - cmd += " -"; - } - } - if (i > 0) { - result_cmd += " | "; - } - result_cmd += cmd; - } - - return result_cmd; -} - -inline _Pipeline -run_saveload_cmd(const std::string& cmd, SaveloadOp op) -{ - static const int READ_END = 0; - static const int WRITE_END = 1; - - auto individual_cmds = split(cmd, " | "); - assert(!individual_cmds.empty()); - std::reverse(individual_cmds.begin(), individual_cmds.end()); - std::vector pids; - std::vector> fds; - int input_fd[2], output_fd[2]; - input_fd[READ_END] = -1; - input_fd[WRITE_END] = -1; - output_fd[READ_END] = -1; - output_fd[WRITE_END] = -1; - if (op == READ) { - check_error(pipe2(output_fd, O_CLOEXEC) == -1, "Error opening a pipe."); - } - size_t i = 0; - for (const auto& individual_cmd : individual_cmds) { - auto args = split(individual_cmd, " "); - std::for_each(args.begin(), args.end(), trim); - - std::string stdout_to_file; - decltype(args)::iterator it; - for (it = args.begin(); it != args.end(); ++it) { - if (it->front() == '>') { - stdout_to_file = it->substr(1); - break; - } - } - if (it != args.end()) { - args.erase(it); - } - - if (op == READ) { - if (i < individual_cmds.size() - 1) { - check_error(pipe2(input_fd, O_CLOEXEC) == -1, "Error opening a pipe."); - } - } else { - check_error(pipe2(input_fd, O_CLOEXEC) == -1, "Error opening a pipe."); - } - - pid_t pid = fork(); - if (pid == 0) { - if (op == READ) { - dup2(output_fd[WRITE_END], STDOUT_FILENO); - close(output_fd[READ_END]); - close(output_fd[WRITE_END]); - - if (i > 0) { - close(fds.front()[READ_END]); - close(fds.front()[WRITE_END]); - } - - if (i < individual_cmds.size() - 1) { - dup2(input_fd[READ_END], STDIN_FILENO); - close(input_fd[READ_END]); - close(input_fd[WRITE_END]); - } - - switch (args.size()) { - case 1: - execlp(args[0].c_str(), args[0].c_str(), NULL); - // fall through - case 2: - execlp(args[0].c_str(), args[0].c_str(), args[1].c_str(), NULL); - // fall through - case 3: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - NULL); - // fall through - case 4: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - args[3].c_str(), - NULL); - // fall through - default: - log_error("Invalid number of arguments supplied to execlp (" + - std::to_string(args.size()) + ")."); - std::exit(EXIT_FAILURE); - } - log_error("execlp failed."); - std::exit(EXIT_FAILURE); - } else { - dup2(input_fd[READ_END], STDIN_FILENO); - close(input_fd[READ_END]); - close(input_fd[WRITE_END]); - - if (!stdout_to_file.empty()) { - int outfd = - open(stdout_to_file.c_str(), - O_WRONLY | O_CREAT | (op == APPEND ? O_APPEND : 0), - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); - dup2(outfd, STDOUT_FILENO); - close(outfd); - } else if (i > 0) { - dup2(output_fd[WRITE_END], STDOUT_FILENO); - close(output_fd[READ_END]); - close(output_fd[WRITE_END]); - } - - switch (args.size()) { - case 1: - execlp(args[0].c_str(), args[0].c_str(), NULL); - // fall through - case 2: - execlp(args[0].c_str(), args[0].c_str(), args[1].c_str(), NULL); - // fall through - case 3: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - NULL); - // fall through - case 4: - execlp(args[0].c_str(), - args[0].c_str(), - args[1].c_str(), - args[2].c_str(), - args[3].c_str(), - NULL); - // fall through - default: - log_error("Invalid number of arguments supplied to execlp (" + - std::to_string(args.size()) + ")."); - std::exit(EXIT_FAILURE); - } - log_error("execlp failed."); - exit(EXIT_FAILURE); - } - } - check_error(pid == -1, "Error on fork."); - pids.push_back(pid); - if (op == READ) { - fds.push_back({ output_fd[READ_END], output_fd[WRITE_END] }); - } else { - fds.push_back({ input_fd[READ_END], input_fd[WRITE_END] }); - } - if (i > 0) { - close(output_fd[READ_END]); - close(output_fd[WRITE_END]); - } - output_fd[READ_END] = input_fd[READ_END]; - output_fd[WRITE_END] = input_fd[WRITE_END]; - i++; - } - - if (op == READ) { - close(fds.front()[WRITE_END]); - return _Pipeline( - fdopen(fds.front()[READ_END], "r"), pids.back(), pids.front()); - } - close(fds.back()[READ_END]); - return _Pipeline( - fdopen(fds.back()[WRITE_END], "w"), pids.back(), pids.front()); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/data_stream.hpp b/src/include/btllib/data_stream.hpp deleted file mode 100644 index 0ad946d..0000000 --- a/src/include/btllib/data_stream.hpp +++ /dev/null @@ -1,750 +0,0 @@ -#ifndef BTLLIB_DATA_STREAM_HPP -#define BTLLIB_DATA_STREAM_HPP - -#include "status.hpp" -#include "util.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace btllib { - -static const int PIPE_READ_END = 0; -static const int PIPE_WRITE_END = 1; -static const int COMM_BUFFER_SIZE = 1024; -static const mode_t PIPE_PERMISSIONS = 0666; - -using PipeId = unsigned long; -class DataStreamPipeline; - -// clang-format off -inline bool& process_spawner_initialized() { static bool var; return var; } -inline int* process_spawner_parent2child_fd() { static int var[2]; return var; } -inline int* process_spawner_child2parent_fd() { static int var[2]; return var; } -inline std::mutex& process_spawner_comm_mutex() { static std::mutex var; return var; }; -inline PipeId new_pipe_id() { static PipeId last_pipe_id = 0; return last_pipe_id++; } -inline std::map& pipeline_map() { static std::map var; return var; } -// clang-format on - -static inline std::string -get_pipepath(const PipeId id) -{ - return "btllib-" + std::to_string(getpid()) + "-" + std::to_string(id); -} - -static inline void -read_from_child(void* buf, size_t count) -{ - size_t so_far = 0, ret; - while (so_far < count) { - ret = read(process_spawner_child2parent_fd()[PIPE_READ_END], - (uint8_t*)(buf) + so_far, - count - so_far); - check_error(ret <= 0, "Error communicating with helper process."); - so_far += ret; - } -} - -static inline void -write_to_child(const void* buf, size_t count) -{ - size_t so_far = 0, ret; - while (so_far < count) { - ret = write(process_spawner_parent2child_fd()[PIPE_WRITE_END], - (uint8_t*)(buf) + so_far, - count - so_far); - check_error(ret <= 0, "Error communicating with helper process."); - so_far += ret; - } -} - -static inline void -check_children_failures() -{ - // Checks if any children have failed so the caller can be a disappointed - // parent. - int status; - pid_t pid; - while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { - if (status != 0) { - std::cerr << "Helper process failed before data stream was closed:" - << std::endl; - if (WIFEXITED(status)) { // NOLINT - std::cerr << "PID " << pid << " exited with status " - << WEXITSTATUS(status) << std::endl; // NOLINT - } else if (WIFSIGNALED(status)) { // NOLINT - std::cerr << "PID " << pid << " killed by signal " - << WTERMSIG(status) // NOLINT - << std::endl; - } else { - std::cerr << "PID " << pid << " exited with code " << status - << std::endl; - } - std::exit(EXIT_FAILURE); - } - } -} - -static inline void -end_child() -{ - check_children_failures(); - for (PipeId last_id = new_pipe_id(), id = 0; id < last_id; id++) { - const auto fname = get_pipepath(id); - if (access(fname.c_str(), F_OK) != -1) { - unlink(fname.c_str()); - } - } - std::exit(EXIT_SUCCESS); -} - -static inline void -read_from_parent(void* buf, size_t count) -{ - size_t so_far = 0, ret; - while (so_far < count) { - ret = read(process_spawner_parent2child_fd()[PIPE_READ_END], - (uint8_t*)(buf) + so_far, - count - so_far); - if (ret <= 0) { - end_child(); - break; - } - so_far += ret; - } -} - -static inline void -write_to_parent(const void* buf, size_t count) -{ - size_t so_far = 0, ret; - while (so_far < count) { - ret = write(process_spawner_child2parent_fd()[PIPE_WRITE_END], - (uint8_t*)(buf) + so_far, - count - so_far); - if (ret <= 0) { - end_child(); - break; - } - so_far += ret; - } -} - -class DataStream -{ -public: - enum Operation - { - READ, - WRITE, - APPEND, - CLOSE - }; - - DataStream(const std::string& path, Operation op); - ~DataStream() { close(); } - void close(); - - FILE* operator*() const { return file; } - FILE* operator->() const { return file; } - operator FILE*() const { return file; } - -protected: - std::string streampath; - Operation op; - std::string pipepath; - FILE* file = nullptr; - bool closed = false; -}; - -class DataSource : public DataStream -{ - -public: - DataSource(const std::string& path) - : DataStream(path, READ) - {} -}; - -class DataSink : public DataStream -{ - -public: - DataSink(const std::string& path, bool append = false) - : DataStream(path, append ? APPEND : WRITE) - {} -}; - -inline DataStream::DataStream(const std::string& path, Operation op) - : streampath(path) - , op(op) -{ - std::unique_lock lock(process_spawner_comm_mutex()); - - write_to_child(&op, sizeof(op)); - - size_t pathlen = path.size() + 1; - check_error(pathlen > COMM_BUFFER_SIZE, - "Stream path length too large for the buffer."); - write_to_child(&pathlen, sizeof(pathlen)); - write_to_child(path.c_str(), pathlen); - - char buf[COMM_BUFFER_SIZE]; - read_from_child(&pathlen, sizeof(pathlen)); - read_from_child(buf, pathlen); - pipepath = buf; - - char confirmation = 0; - if (op != READ) { - read_from_child(&confirmation, sizeof(confirmation)); - } - int pipe_fd = - open(pipepath.c_str(), (op == READ ? O_RDONLY : O_WRONLY) | O_NONBLOCK); - write_to_child(&confirmation, sizeof(confirmation)); - - if (op == READ) { - read_from_child(&confirmation, sizeof(confirmation)); - } - fcntl(pipe_fd, F_SETFL, fcntl(pipe_fd, F_GETFL) & ~O_NONBLOCK); - file = fdopen(pipe_fd, op == READ ? "r" : "w"); -} - -inline void -DataStream::close() -{ - if (!closed) { - std::unique_lock lock(process_spawner_comm_mutex()); - - char confirmation = 0; - if (op == READ) { - op = CLOSE; - if (file != stdin) { - write_to_child(&op, sizeof(op)); - - size_t pathlen = pipepath.size() + 1; - check_error(pathlen > COMM_BUFFER_SIZE, - "Stream path length too large for the buffer."); - write_to_child(&pathlen, sizeof(pathlen)); - write_to_child(pipepath.c_str(), pathlen); - - read_from_child(&confirmation, sizeof(confirmation)); - - std::fclose(file); - } - } else if (op == WRITE || op == APPEND) { - op = CLOSE; - if (file != stdout) { - std::fclose(file); - - write_to_child(&op, sizeof(op)); - - size_t pathlen = pipepath.size() + 1; - check_error(pathlen > COMM_BUFFER_SIZE, - "Stream path length too large for the buffer."); - write_to_child(&pathlen, sizeof(pathlen)); - write_to_child(pipepath.c_str(), pathlen); - - read_from_child(&confirmation, sizeof(confirmation)); - } - } - - closed = true; - } -} - -class DataStreamPipeline -{ - -public: - enum Direction - { - SOURCE, - SINK - }; - - DataStreamPipeline() {} - - DataStreamPipeline(Direction direction, pid_t pid_first, pid_t pid_last) - : direction(direction) - , pid_first(pid_first) - , pid_last(pid_last) - {} - - void finish(); - - Direction direction = SOURCE; - pid_t pid_first = -1; - pid_t pid_last = -1; - bool closed = false; -}; - -inline void -DataStreamPipeline::finish() -{ - if (!closed) { - int status; - if (direction == SOURCE) { - kill(pid_first, SIGTERM); - waitpid(pid_first, &status, 0); - } - waitpid(pid_last, &status, 0); - - check_children_failures(); - - closed = true; - } -} - -static inline bool -process_spawner_init(); - -static const bool PROCESS_SPAWNER_INITIALIZER = process_spawner_init(); - -static inline std::string -get_pipeline_cmd(const std::string& path, DataStream::Operation op); - -static inline DataStreamPipeline -run_pipeline_cmd(const std::string& cmd, DataStream::Operation op, int pipe_fd); - -static inline bool -process_spawner_init() -{ - (void)PROCESS_SPAWNER_INITIALIZER; - if (!process_spawner_initialized()) { - process_spawner_initialized() = true; - - process_spawner_parent2child_fd()[PIPE_READ_END] = -1; - process_spawner_parent2child_fd()[PIPE_WRITE_END] = -1; - process_spawner_child2parent_fd()[PIPE_READ_END] = -1; - process_spawner_child2parent_fd()[PIPE_WRITE_END] = -1; - check_error(pipe(process_spawner_parent2child_fd()) == -1, - "Error opening a pipe."); - check_error(pipe(process_spawner_child2parent_fd()) == -1, - "Error opening a pipe."); - - pid_t pid = fork(); - if (pid == 0) { - close(process_spawner_parent2child_fd()[PIPE_WRITE_END]); - close(process_spawner_child2parent_fd()[PIPE_READ_END]); - - { - struct sigaction action; // NOLINT - action.sa_handler = [](const int sig) { (void)sig; }; - sigemptyset(&action.sa_mask); - action.sa_flags = SA_RESTART; - sigaction(SIGCHLD, &action, nullptr); - } - - { - struct sigaction action; // NOLINT - action.sa_handler = [](const int sig) { - (void)sig; - for (PipeId last_id = new_pipe_id(), id = 0; id < last_id; id++) { - const auto fname = get_pipepath(id); - if (access(fname.c_str(), F_OK) != -1) { - unlink(fname.c_str()); - } - } - std::exit(EXIT_FAILURE); - }; - sigemptyset(&action.sa_mask); - action.sa_flags = SA_RESTART; - sigaction(SIGHUP, &action, nullptr); - sigaction(SIGQUIT, &action, nullptr); - sigaction(SIGILL, &action, nullptr); - sigaction(SIGABRT, &action, nullptr); - sigaction(SIGBUS, &action, nullptr); - sigaction(SIGSEGV, &action, nullptr); - sigaction(SIGPIPE, &action, nullptr); - sigaction(SIGTERM, &action, nullptr); - } - - DataStream::Operation op; - std::string pipepath; - int pipe_fd; - char buf[COMM_BUFFER_SIZE]; - size_t pathlen; - DataStreamPipeline pipeline; - char confirmation = 0; - for (;;) { - read_from_parent(&op, sizeof(op)); - - read_from_parent(&pathlen, sizeof(pathlen)); - read_from_parent(buf, pathlen); - - switch (op) { - case DataStream::Operation::READ: - case DataStream::Operation::WRITE: - case DataStream::Operation::APPEND: - pipepath = get_pipepath(new_pipe_id()); - if (access(pipepath.c_str(), F_OK) != -1) { - unlink(pipepath.c_str()); - } - mkfifo(pipepath.c_str(), PIPE_PERMISSIONS); - - pathlen = pipepath.size() + 1; - check_error(pathlen > COMM_BUFFER_SIZE, - "Stream path length too large for the buffer."); - write_to_parent(&pathlen, sizeof(pathlen)); - write_to_parent(pipepath.c_str(), pathlen); - - if (op == DataStream::Operation::READ) { - read_from_parent(&confirmation, sizeof(confirmation)); - } - pipe_fd = - open(pipepath.c_str(), - (op == DataStream::Operation::READ ? O_WRONLY : O_RDONLY) | - O_NONBLOCK | O_CLOEXEC); - if (op != DataStream::Operation::READ) { - write_to_parent(&confirmation, sizeof(confirmation)); - read_from_parent(&confirmation, sizeof(confirmation)); - } - - unlink(pipepath.c_str()); - - if (op == DataStream::Operation::READ) { - write_to_parent(&confirmation, sizeof(confirmation)); - } - fcntl(pipe_fd, F_SETFL, fcntl(pipe_fd, F_GETFL) & ~O_NONBLOCK); - pipeline = run_pipeline_cmd(get_pipeline_cmd(buf, op), op, pipe_fd); - close(pipe_fd); - - pipeline_map()[pipepath] = pipeline; - break; - case DataStream::Operation::CLOSE: - pipeline = pipeline_map()[std::string(buf)]; - pipeline.finish(); - pipeline_map().erase(std::string(buf)); - write_to_parent(&confirmation, sizeof(confirmation)); - break; - default: - log_error("Invalid stream operation."); - std::exit(EXIT_FAILURE); - } - } - } - close(process_spawner_parent2child_fd()[PIPE_READ_END]); - close(process_spawner_child2parent_fd()[PIPE_WRITE_END]); - } - return true; -} - -static inline std::string -get_pipeline_cmd(const std::string& path, DataStream::Operation op) -{ - struct Datatype - { - std::vector prefixes; - std::vector suffixes; - std::vector cmds_check_existence; - std::vector read_cmds; - std::vector write_cmds; - std::vector append_cmds; - }; - - // clang-format off - static const Datatype DATATYPES[] { - { { "http://", "https://", "ftp://" }, {}, { "command -v wget" }, { "wget -O-" }, { "" }, { "" } }, - { {}, { ".url" }, { "command -v wget" }, { "wget -O- -i" }, { "" }, { "" } }, - { {}, { ".ar" }, { "command -v ar" }, { "ar -p" }, { "" }, { "" } }, - { {}, { ".tar" }, { "command -v tar" }, { "tar -xOf" }, { "" }, { "" } }, - { {}, { ".tgz" }, { "command -v tar" }, { "tar -zxOf" }, { "" }, { "" } }, - { {}, { ".gz", ".z" }, { "command -v pigz", "command -v gzip" }, { "pigz -dc", "gzip -dc" }, { "pigz >", "gzip >" }, { "pigz >>", "gzip >>" } }, - { {}, { ".bz2" }, { "command -v bzip2" }, { "bunzip2 -dc" }, { "bzip2 >" }, { "bzip2 >>" } }, - { {}, { ".xz" }, { "command -v xz" }, { "unxz -dc" }, { "xz -T0 >" }, { "xz -T0 >>" } }, - { {}, { ".7z" }, { "command -v 7z" }, { "7z -so e" }, { "7z -si a" }, { "7z -si a" } }, - { {}, { ".zip" }, { "command -v zip" }, { "unzip -p" }, { "" }, { "" } }, - { {}, { ".bam", ".cram" }, { "command -v samtools" }, { "samtools view -h" }, { "samtools -Sb - >" }, { "samtools -Sb - >>" } }, - }; - // clang-format on - std::string default_cmd = "cat"; - if (op == DataStream::Operation::WRITE) { - default_cmd += " >"; - } else if (op == DataStream::Operation::APPEND) { - default_cmd += " >>"; - } - - std::string path_trimmed = path; - std::vector cmd_layers; - for (;;) { - bool found_datatype = false; - for (const auto& datatype : DATATYPES) { - size_t trim_start = 0, trim_end = 0; - bool this_datatype = false; - for (const auto& prefix : datatype.prefixes) { - if (starts_with(path_trimmed, prefix)) { - this_datatype = true; - trim_start += prefix.size(); - break; - } - } - for (const auto& suffix : datatype.suffixes) { - if (ends_with(path_trimmed, suffix)) { - this_datatype = true; - trim_end += suffix.size(); - break; - } - } - - if (this_datatype) { - found_datatype = true; - bool found_cmd = false; - int cmd_idx = 0; - for (const auto& existence_cmd : datatype.cmds_check_existence) { - pid_t pid = fork(); - if (pid == 0) { - int null_fd = open("/dev/null", O_WRONLY, 0); - dup2(null_fd, STDOUT_FILENO); - dup2(null_fd, STDERR_FILENO); - close(null_fd); - - execlp("sh", "sh", "-c", existence_cmd.c_str(), NULL); - log_error("exec failed: sh -c \"" + existence_cmd + "\'"); - std::exit(EXIT_FAILURE); - } else { - check_error(pid == -1, "Error on fork."); - int status; - check_error(waitpid(pid, &status, 0) != pid, "waitpid error."); - if (!(WIFSIGNALED(status)) && - ((WIFEXITED(status)) && (WEXITSTATUS(status) == 0))) { // NOLINT - found_cmd = true; - break; - } - } - cmd_idx++; - } - - if (found_cmd) { - std::string cmd; - switch (op) { - case DataStream::Operation::READ: - cmd = datatype.read_cmds[cmd_idx]; - break; - case DataStream::Operation::WRITE: - cmd = datatype.write_cmds[cmd_idx]; - break; - case DataStream::Operation::APPEND: - cmd = datatype.append_cmds[cmd_idx]; - break; - default: - log_error("Invalid operation"); - std::exit(EXIT_FAILURE); - } - if (cmd.empty()) { - log_warning("Filetype recognized for '" + path_trimmed + - "', but no tool available to work with it."); - } else { - cmd_layers.push_back(cmd); - } - } else { - log_warning("Filetype recognized for '" + path_trimmed + - "', but no tool available to work with it."); - } - path_trimmed.erase(0, trim_start); - path_trimmed.erase(path_trimmed.size() - trim_end); - } - } - if (!found_datatype) { - break; - } - } - if (cmd_layers.empty()) { - cmd_layers.push_back(default_cmd); - } - if (op == DataStream::Operation::WRITE || - op == DataStream::Operation::APPEND) { - std::reverse(cmd_layers.begin(), cmd_layers.end()); - } - - std::string result_cmd; - for (size_t i = 0; i < cmd_layers.size(); i++) { - auto& cmd = cmd_layers[i]; - if (op == DataStream::Operation::WRITE || - op == DataStream::Operation::APPEND) { - if (i == cmd_layers.size() - 1) { - if (cmd.back() == '>') { - cmd += path; - } else { - cmd += " "; - cmd += path; - } - } else { - if (cmd.back() == '>') { - while (cmd.back() == '>' || cmd.back() == ' ') { - cmd.pop_back(); - } - } else { - cmd += " -"; - } - } - } else { - if (i == 0) { - cmd += " "; - cmd += path; - } else { - cmd += " -"; - } - } - if (i > 0) { - result_cmd += " | "; - } - result_cmd += cmd; - } - - check_error(result_cmd.empty(), - (op == DataStream::Operation::READ ? "Error loading from " - : "Error saving to ") + - path); - return result_cmd; -} - -static inline DataStreamPipeline -run_pipeline_cmd(const std::string& cmd, DataStream::Operation op, int pipe_fd) -{ - auto individual_cmds = split(cmd, " | "); - check_error(individual_cmds.empty(), - "Error processing data stream commands."); - std::reverse(individual_cmds.begin(), individual_cmds.end()); - - std::vector pids; - - int input_fd[2], output_fd[2]; - input_fd[PIPE_READ_END] = -1; - input_fd[PIPE_WRITE_END] = -1; - output_fd[PIPE_READ_END] = -1; - output_fd[PIPE_WRITE_END] = -1; - - size_t i = 0; - for (const auto& individual_cmd : individual_cmds) { - auto args = split(individual_cmd, " "); - std::for_each(args.begin(), args.end(), trim); - - std::string stdout_to_file; - decltype(args)::iterator it; - for (it = args.begin(); it != args.end(); ++it) { - if (it->front() == '>') { - stdout_to_file = it->substr(1); - break; - } - } - if (it != args.end()) { - args.erase(it); - } - - char* const* argv = new char*[args.size() + 2]; - ((char*&)(argv[0])) = (char*)(args[0].c_str()); - for (size_t i = 0; i < args.size(); i++) { - ((char*&)(argv[i + 1])) = (char*)(args[i].c_str()); - } - ((char*&)(argv[args.size() + 1])) = nullptr; - - if (i < individual_cmds.size() - 1) { - check_error(pipe(input_fd) == -1, "Error opening a pipe."); - fcntl(input_fd[PIPE_READ_END], F_SETFD, FD_CLOEXEC); - fcntl(input_fd[PIPE_WRITE_END], F_SETFD, FD_CLOEXEC); - } - - pid_t pid = fork(); - if (pid == 0) { - if (op == DataStream::Operation::READ) { - if (i == 0) { - dup2(pipe_fd, STDOUT_FILENO); - close(pipe_fd); - } else { - dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO); - close(output_fd[PIPE_READ_END]); - close(output_fd[PIPE_WRITE_END]); - } - - if (i < individual_cmds.size() - 1) { - dup2(input_fd[PIPE_READ_END], STDIN_FILENO); - close(input_fd[PIPE_READ_END]); - close(input_fd[PIPE_WRITE_END]); - } - - execvp(argv[0], argv + 1); - std::string argv_print; - for (int i = 0; argv[i] != nullptr; i++) { - argv_print += " " + std::string(argv[i]); - } - log_error("exec failed: " + argv_print); - std::exit(EXIT_FAILURE); - } else { - if (i == individual_cmds.size() - 1) { - dup2(pipe_fd, STDIN_FILENO); - close(pipe_fd); - } else { - dup2(input_fd[PIPE_READ_END], STDIN_FILENO); - close(input_fd[PIPE_READ_END]); - close(input_fd[PIPE_WRITE_END]); - } - - if (!stdout_to_file.empty()) { - int outfd = - open(stdout_to_file.c_str(), - O_WRONLY | O_CREAT | - (op == DataStream::Operation::APPEND ? O_APPEND : 0), - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); - dup2(outfd, STDOUT_FILENO); - close(outfd); - } else if (i > 0) { - dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO); - close(output_fd[PIPE_READ_END]); - close(output_fd[PIPE_WRITE_END]); - } - - execvp(argv[0], argv + 1); - std::string argv_print = argv[0]; - for (int i = 1; argv[i] != nullptr; i++) { - argv_print += " " + std::string(argv[i]); - } - log_error("exec failed: " + argv_print); - exit(EXIT_FAILURE); - } - } - check_error(pid == -1, "Error on fork."); - - delete[] argv; - - pids.push_back(pid); - - if (i > 0) { - close(output_fd[PIPE_READ_END]); - close(output_fd[PIPE_WRITE_END]); - } - - if (i < individual_cmds.size() - 1) { - output_fd[PIPE_READ_END] = input_fd[PIPE_READ_END]; - output_fd[PIPE_WRITE_END] = input_fd[PIPE_WRITE_END]; - } - - i++; - } - - return DataStreamPipeline(op == DataStream::Operation::READ - ? DataStreamPipeline::Direction::SOURCE - : DataStreamPipeline::Direction::SINK, - pids.back(), - pids.front()); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/graph.hpp b/src/include/btllib/graph.hpp deleted file mode 100644 index 8a98146..0000000 --- a/src/include/btllib/graph.hpp +++ /dev/null @@ -1,96 +0,0 @@ -// #ifndef BTLLIB_GRAPH_HPP -// #define BTLLIB_GRAPH_HPP - -// #include -// #include - -// #include -// #include -// #include - -// namespace btllib { - -// class Graph; - -// class Vertex -// { - -// public: -// private: -// friend class Graph; - -// Vertex(long id) -// : id(id) -// {} - -// long id; -// }; - -// class Edge -// { - -// public: -// private: -// friend class Graph; - -// Edge(Vertex u, Vertex v) -// : u(u) -// , v(v) -// {} - -// Vertex u, v; -// }; - -// class Graph -// { - -// public: -// Graph() {} - -// Vertex add_vertex(); -// void remove_vertex(Vertex v); - -// Edge add_edge(Vertex u, Vertex v); -// void remove_edge(Edge e); - -// std::string to_string(); - -// private: -// boost::adjacency_list -// graph; -// }; - -// inline Vertex -// Graph::add_vertex() -// { -// return boost::add_vertex(graph); -// } - -// inline void -// Graph::remove_vertex(Vertex v) -// {} - -// inline Edge -// Graph::add_edge(Vertex u, Vertex v) -// { -// boost::add_edge(u.id, v.id, graph); -// return Edge(u, v); -// } - -// inline void -// Graph::remove_edge(Edge e) -// { -// boost::remove_edge(e.u.id, e.v.id, graph); -// } - -// inline std::string -// Graph::to_string() -// { -// std::stringstream ss; -// boost::write_graphviz(ss, graph); -// return ss.str(); -// } - -// } // namespace btllib - -// #endif diff --git a/src/include/btllib/index_queue.hpp b/src/include/btllib/index_queue.hpp deleted file mode 100644 index df8e9a8..0000000 --- a/src/include/btllib/index_queue.hpp +++ /dev/null @@ -1,229 +0,0 @@ -#ifndef BTLLIB_INDEX_QUEUE_HPP -#define BTLLIB_INDEX_QUEUE_HPP - -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -template -class IndexQueue -{ - -public: - struct Block - { - - Block() - : data(new T[BLOCK_SIZE]) - {} - - Block(const Block&) = delete; - - Block(Block&& block) noexcept - : current(block.current) - , count(block.count) - , index(block.index) - { - std::swap(data, block.data); - } - - Block& operator=(const Block&) = delete; - - Block& operator=(Block&& block) noexcept - { - std::swap(data, block.data); - current = block.current; - count = block.count; - index = block.index; - return *this; - } - - ~Block() { delete[] data; } - - T* data = nullptr; - size_t current = 0; - size_t count = 0; - size_t index = 0; - }; - - // Surrounds pieces of data in the buffer with a busy mutex - // for exclusive access - struct Slot - { - Slot() = default; - Slot(const Slot& slot) - : block(slot.block) - , occupied(slot.occupied) - , last_tenant(slot.last_tenant) - {} - - typename IndexQueue::Block block; - std::mutex busy; - bool occupied = false; - std::condition_variable occupancy_changed; - size_t last_tenant = -1; // Required to ensure read order - }; - - size_t elements() const { return element_count; } - - void close() - { - closed = true; - for (auto& slot : this->slots) { - slot.occupancy_changed.notify_all(); - } - } - - bool is_closed() const { return closed; } - -protected: - std::vector slots{ QUEUE_SIZE }; - size_t read_counter = 0; - std::atomic element_count{ 0 }; - std::atomic closed{ false }; -}; - -template -class IndexQueueSPMC : public IndexQueue -{ - -public: - using Block = typename IndexQueue::Block; - using Slot = typename IndexQueue::Slot; - - void write(Block& block) - { - auto index = block.index; - auto& target = this->slots[index % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait( - busy_lock, [&] { return !target.occupied || this->closed; }); - if (this->closed) { - return; - } - target.block = std::move(block); - target.occupied = true; - target.occupancy_changed.notify_one(); - ++(this->element_count); - } - - void read(Block& block) - { - std::unique_lock read_lock(read_mutex); - - auto& target = this->slots[this->read_counter % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait( - busy_lock, [&] { return target.occupied || this->closed; }); - if (this->closed) { - return; - } - ++(this->read_counter); - - read_lock.unlock(); - - block = std::move(target.block); - target.occupied = false; - target.occupancy_changed.notify_one(); - --(this->element_count); - } - -private: - std::mutex read_mutex; -}; - -template -class IndexQueueSPSC : public IndexQueue -{ - -public: - using Block = typename IndexQueue::Block; - using Slot = typename IndexQueue::Slot; - - void write(Block& block) - { - auto index = block.index; - auto& target = this->slots[index % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait( - busy_lock, [&] { return !target.occupied || this->closed; }); - if (this->closed) { - return; - } - target.block = std::move(block); - target.occupied = true; - target.occupancy_changed.notify_one(); - ++(this->element_count); - } - - void read(Block& block) - { - auto& target = this->slots[this->read_counter % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait( - busy_lock, [&] { return target.occupied || this->closed; }); - if (this->closed) { - return; - } - ++(this->read_counter); - - block = std::move(target.block); - target.occupied = false; - target.occupancy_changed.notify_one(); - --(this->element_count); - } -}; - -template -class IndexQueueMPSC : public IndexQueue -{ - -public: - using Block = typename IndexQueue::Block; - using Slot = typename IndexQueue::Slot; - - void write(Block& block) - { - auto index = block.index; - auto& target = this->slots[index % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait(busy_lock, [&] { - return (!target.occupied && (index - target.last_tenant <= QUEUE_SIZE)) || - this->closed; - }); - if (this->closed) { - return; - } - target.block = std::move(block); - target.occupied = true; - target.last_tenant = index; - target.occupancy_changed.notify_all(); - ++(this->element_count); - } - - void read(Block& block) - { - auto& target = this->slots[this->read_counter % QUEUE_SIZE]; - std::unique_lock busy_lock(target.busy); - target.occupancy_changed.wait( - busy_lock, [&] { return target.occupied || this->closed; }); - if (this->closed) { - return; - } - ++(this->read_counter); - block = std::move(target.block); - target.occupied = false; - target.occupancy_changed.notify_all(); - --(this->element_count); - } -}; - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/indexlr.hpp b/src/include/btllib/indexlr.hpp deleted file mode 100644 index 452c609..0000000 --- a/src/include/btllib/indexlr.hpp +++ /dev/null @@ -1,545 +0,0 @@ -#ifndef BTLLIB_INDEXLR_HPP -#define BTLLIB_INDEXLR_HPP - -#include "bloom_filter.hpp" -#include "nthash.hpp" -#include "order_queue.hpp" -#include "seq_reader.hpp" -#include "status.hpp" -#include "util.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -// TODO: Allow multiple Indexlr objects to be instantiated (by assigning ID to -// each instance / indexing static members based on ID) -class Indexlr -{ - -public: - /* Has to be a struct and not an enum because: - * 1) Non-class enums are not name qualified and can collide - * 2) class enums can't be implicitly converted into integers - */ - struct Flag - { - static const unsigned ID = 0; - static const unsigned NO_ID = 1; - static const unsigned BX = 2; - static const unsigned NO_BX = 0; - static const unsigned SEQ = 4; - static const unsigned NO_SEQ = 0; - static const unsigned FILTER_IN = 8; - static const unsigned NO_FILTER_IN = 0; - static const unsigned FILTER_OUT = 16; - static const unsigned NO_FILTER_OUT = 0; - static const unsigned SHORT_MODE = 0; - static const unsigned LONG_MODE = 32; - }; - - bool output_id() const { return bool(~flags & Flag::NO_ID); } - bool output_bx() const { return bool(flags & Flag::BX); } - bool output_seq() const { return bool(flags & Flag::SEQ); } - bool filter_in() const { return bool(flags & Flag::FILTER_IN); } - bool filter_out() const { return bool(flags & Flag::FILTER_OUT); } - bool short_mode() const { return bool(~flags & Flag::LONG_MODE); } - bool long_mode() const { return bool(flags & Flag::LONG_MODE); } - - struct Read - { - Read() {} - - Read(size_t num, std::string id, std::string comment, std::string seq) - : num(num) - , id(std::move(id)) - , comment(std::move(comment)) - , seq(std::move(seq)) - {} - - size_t num = 0; - std::string id; - std::string comment; - std::string seq; - }; - - struct Minimizer - { - Minimizer() = default; - - Minimizer(uint64_t min_hash, - uint64_t out_hash, - size_t pos, - bool forward, - std::string seq) - : min_hash(min_hash) - , out_hash(out_hash) - , pos(pos) - , forward(forward) - , seq(std::move(seq)) - {} - - uint64_t min_hash = 0, out_hash = 0; - size_t pos = 0; - bool forward = false; - std::string seq; - }; - - using HashedKmer = Minimizer; - - struct Record - { - Record() {} - - Record(size_t num, - std::string id, - std::string barcode, - std::vector minimizers) - : num(num) - , id(std::move(id)) - , barcode(std::move(barcode)) - , minimizers(std::move(minimizers)) - {} - - size_t num = 0; - std::string id; - std::string barcode; - std::vector minimizers; - - operator bool() const { return !id.empty() || !barcode.empty(); } - }; - - Record get_minimizers(); - - Indexlr(std::string seqfile, - size_t k, - size_t w, - unsigned flags = 0, - unsigned threads = 5, - bool verbose = false, - const btllib::BloomFilter& bf1 = Indexlr::dummy_bf(), - const btllib::BloomFilter& bf2 = Indexlr::dummy_bf()); - - ~Indexlr(); - - static const size_t MAX_SIMULTANEOUS_INDEXLRS = 256; - - static const size_t SHORT_MODE_BUFFER_SIZE = 32; - static const size_t SHORT_MODE_BLOCK_SIZE = 32; - - static const size_t LONG_MODE_BUFFER_SIZE = 4; - static const size_t LONG_MODE_BLOCK_SIZE = 1; - -private: - static std::string extract_barcode(const std::string& id, - const std::string& comment); - std::vector minimize(const std::string& seq) const; - - const std::string seqfile; - const size_t k, w; - const unsigned flags; - const unsigned threads; - const bool verbose; - const long id; - const size_t buffer_size; - const size_t block_size; - - static const BloomFilter& dummy_bf() - { - static const BloomFilter VAR; - return VAR; - } - - const std::reference_wrapper bf1; - const std::reference_wrapper bf2; - bool filter_in_enabled; - bool filter_out_enabled; - - std::atomic fasta{ false }; - OrderQueueSPMC input_queue; - OrderQueueMPSC output_queue; - - using OutputQueueType = decltype(output_queue); - static std::unique_ptr* ready_blocks_array() - { - thread_local static std::unique_ptr - var[MAX_SIMULTANEOUS_INDEXLRS]; - return var; - } - - static long* ready_blocks_owners() - { - thread_local static long var[MAX_SIMULTANEOUS_INDEXLRS]; - return var; - } - - static std::atomic& last_id() - { - static std::atomic var(0); - return var; - } - - class Worker - { - public: - void start() { t = std::thread(do_work, this); } - void join() { t.join(); } - - virtual ~Worker() {} - - Worker& operator=(const Worker& worker) = delete; - Worker& operator=(Worker&& worker) = delete; - - protected: - Worker(Indexlr& indexlr) - : indexlr(indexlr) - {} - - Worker(const Worker& worker) - : Worker(worker.indexlr) - {} - Worker(Worker&& worker) noexcept - : Worker(worker.indexlr) - {} - - Indexlr& indexlr; - - virtual void work() = 0; - static void do_work(Worker* worker) { worker->work(); } - - std::thread t; - }; - - class InputWorker : public Worker - { - public: - InputWorker(Indexlr& indexlr) - : Worker(indexlr) - {} - - InputWorker(const InputWorker& worker) - : InputWorker(worker.indexlr) - {} - InputWorker(InputWorker&& worker) noexcept - : InputWorker(worker.indexlr) - {} - - InputWorker& operator=(const InputWorker& worker) = delete; - InputWorker& operator=(InputWorker&& worker) = delete; - - void work() override; - }; - - class MinimizeWorker : public Worker - { - public: - MinimizeWorker(Indexlr& indexlr) - : Worker(indexlr) - {} - - MinimizeWorker(const MinimizeWorker& worker) - : MinimizeWorker(worker.indexlr) - {} - MinimizeWorker(MinimizeWorker&& worker) noexcept - : MinimizeWorker(worker.indexlr) - {} - - MinimizeWorker& operator=(const MinimizeWorker& worker) = delete; - MinimizeWorker& operator=(MinimizeWorker&& worker) = delete; - - void work() override; - }; - - SeqReader reader; - InputWorker input_worker; - std::vector minimize_workers; -}; - -inline Indexlr::Indexlr(std::string seqfile, - const size_t k, - const size_t w, - const unsigned flags, - const unsigned threads, - const bool verbose, - const BloomFilter& bf1, - const BloomFilter& bf2) - : seqfile(std::move(seqfile)) - , k(k) - , w(w) - , flags(flags) - , threads(threads) - , verbose(verbose) - , id(++last_id()) - , buffer_size(short_mode() ? SHORT_MODE_BUFFER_SIZE : LONG_MODE_BUFFER_SIZE) - , block_size(short_mode() ? SHORT_MODE_BLOCK_SIZE : LONG_MODE_BLOCK_SIZE) - , bf1(bf1) - , bf2(bf2) - , filter_in_enabled(filter_in()) - , filter_out_enabled(filter_out()) - , input_queue(buffer_size, block_size) - , output_queue(buffer_size, block_size) - , reader(this->seqfile, 0, 3, buffer_size, block_size) - , input_worker(*this) - , minimize_workers( - std::vector(threads, MinimizeWorker(*this))) -{ - input_worker.start(); - for (auto& worker : minimize_workers) { - worker.start(); - } -} - -inline Indexlr::~Indexlr() -{ - reader.close(); - for (auto& worker : minimize_workers) { - worker.join(); - } - input_worker.join(); -} - -// Minimerize a sequence: Find the minimizers of a vector of hash values -// representing a sequence. -/* Algorithm -v is a vector of non-negative integers -w is the window size -Invariants - 0 < w <= v.size() - 1 - 0 <= l <= r <= v.size() - 1 -Initial conditions - M = NIL Final set of minimizers, empty initially - min = -1 Minimum element - i = -1 Index of minimum element - prev = -1 Index of previous minimum element - l = 0 Index of left end of window - r = l + w - 1 Index of right end of window -Computation -At each window, if the previous minimum is out of scope, find the new, -right-most, minimum or else, check with only the right-most element to determine -if that is the new minimum. A minimizer is added to the final vector only if -it's index has changed. for each window of v bounded by [l, r] if (i < l) i = -index of minimum element in [l, r], furthest from l. else if (v[r] <= v[i]) i = -r min = v[i] if (i != prev) { prev = i M <- M + m - } - l = l + 1 Move window's left bound by one element - r = l + w - 1 Set window's right bound -}*/ - -inline std::string -Indexlr::extract_barcode(const std::string& id, const std::string& comment) -{ - const static std::string BARCODE_PREFIX = "BX:Z:"; - if (starts_with(comment, BARCODE_PREFIX)) { - const auto space_pos = comment.find(' '); - if (space_pos != std::string::npos) { - return comment.substr(BARCODE_PREFIX.size(), - space_pos - BARCODE_PREFIX.size()); - } - return comment.substr(BARCODE_PREFIX.size()); - } - const auto pound_pos = id.find('#'); - if (pound_pos != std::string::npos) { - const auto slash_pos = id.find('/'); - if (slash_pos > pound_pos) { - return id.substr(pound_pos + 1, slash_pos - (pound_pos + 1)); - } - } - return "NA"; -} - -inline std::vector -Indexlr::minimize(const std::string& seq) const -{ - if ((k > seq.size()) || (w > seq.size() - k + 1)) { - return {}; - } - std::vector minimizers; - minimizers.reserve(2 * (seq.size() - k + 1) / w); - std::vector hashed_kmers_buffer(w + 1); - ssize_t min_idx_left, min_idx_right, min_pos_prev = -1; - const Minimizer* min_current = nullptr; - size_t idx = 0; - for (NtHash nh(seq, k, 2); nh.roll(); ++idx) { - auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()]; - - hk = HashedKmer(nh.hashes()[0], - nh.hashes()[1], - nh.get_pos(), - nh.forward(), - output_seq() ? seq.substr(nh.get_pos(), k) : ""); - - if (filter_in() && filter_out()) { - std::vector tmp; - tmp = { hk.min_hash }; - if (!bf1.get().contains(tmp) || bf2.get().contains(tmp)) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_in()) { - if (!bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_out()) { - if (bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } - - if (idx + 1 >= w) { - min_idx_left = idx + 1 - w; - min_idx_right = idx + 1; - const auto& min_left = - hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()]; - const auto& min_right = - hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()]; - - if (min_current == nullptr || min_current->pos < min_left.pos) { - min_current = &min_left; - // Use of operator '<=' returns the minimum that is furthest from left. - for (ssize_t i = min_idx_left; i < min_idx_right; i++) { - const auto& min_i = - hashed_kmers_buffer[i % hashed_kmers_buffer.size()]; - if (min_i.min_hash <= min_current->min_hash) { - min_current = &min_i; - } - } - } else if (min_right.min_hash <= min_current->min_hash) { - min_current = &min_right; - } - if (ssize_t(min_current->pos) > min_pos_prev && - min_current->min_hash != std::numeric_limits::max()) { - min_pos_prev = min_current->pos; - minimizers.push_back(*min_current); - } - } - } - return minimizers; -} - -inline Indexlr::Record -Indexlr::get_minimizers() -{ - if (ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] != id) { - ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS] = - std::unique_ptr( - new decltype(output_queue)::Block(block_size)); - ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] = id; - } - auto& block = *(ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS]); - if (block.count <= block.current) { - output_queue.read(block); - if (block.count <= block.current) { - output_queue.close(); - block = decltype(output_queue)::Block(block_size); - return Record(); - } - } - return std::move(block.data[block.current++]); -} - -inline void -Indexlr::InputWorker::work() -{ - if (indexlr.reader.get_format() == SeqReader::Format::FASTA) { - indexlr.fasta = true; - } else { - indexlr.fasta = false; - } - - decltype(indexlr.input_queue)::Block block(indexlr.block_size); - size_t current_block_num = 0; - SeqReader::Record record; - Read read; - while ((record = indexlr.reader.read())) { - block.data[block.count++] = Read(record.num, - std::move(record.name), - std::move(record.comment), - std::move(record.seq)); - if (block.count == indexlr.block_size) { - block.num = current_block_num++; - indexlr.input_queue.write(block); - block.count = 0; - } - } - if (block.count > 0) { - block.num = current_block_num++; - indexlr.input_queue.write(block); - } - for (unsigned i = 0; i < indexlr.threads; i++) { - block.num = current_block_num++; - block.current = 0; - block.count = 0; - indexlr.input_queue.write(block); - } -} - -inline void -Indexlr::MinimizeWorker::work() -{ - decltype(indexlr.input_queue)::Block input_block(indexlr.block_size); - decltype(indexlr.output_queue)::Block output_block(indexlr.block_size); - - for (;;) { - if (input_block.current == input_block.count) { - if (output_block.count > 0) { - output_block.num = input_block.num; - indexlr.output_queue.write(output_block); - output_block.current = 0; - output_block.count = 0; - } - indexlr.input_queue.read(input_block); - } - if (input_block.count == 0) { - output_block.num = input_block.num; - output_block.current = 0; - output_block.count = 0; - indexlr.output_queue.write(output_block); - break; - } - Read& read = input_block.data[input_block.current++]; - Record record; - record.num = read.num; - if (indexlr.output_id()) { - record.id = std::move(read.id); - } - if (indexlr.output_bx()) { - record.barcode = indexlr.extract_barcode(record.id, read.comment); - } - - check_info(indexlr.verbose && indexlr.k > read.seq.size(), - "Indexlr: skipped seq " + std::to_string(read.num) + - " on line " + - std::to_string(read.num * (indexlr.fasta ? 2 : 4) + 2) + - "; k (" + std::to_string(indexlr.k) + ") > seq length (" + - std::to_string(read.seq.size()) + ")"); - - check_info(indexlr.verbose && indexlr.w > read.seq.size() - indexlr.k + 1, - "Indexlr: skipped seq " + std::to_string(read.num) + - " on line " + - std::to_string(read.num * (indexlr.fasta ? 2 : 4) + 2) + - "; w (" + std::to_string(indexlr.w) + ") > # of hashes (" + - std::to_string(read.seq.size() - indexlr.k + 1) + ")"); - - if (indexlr.k <= read.seq.size() && - indexlr.w <= read.seq.size() - indexlr.k + 1) { - record.minimizers = indexlr.minimize(read.seq); - } else { - record.minimizers = {}; - } - - output_block.data[output_block.count++] = std::move(record); - } -} - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/indexlr2.cpp b/src/include/btllib/indexlr2.cpp deleted file mode 100644 index 4167853..0000000 --- a/src/include/btllib/indexlr2.cpp +++ /dev/null @@ -1,315 +0,0 @@ -#include "btllib/indexlr2.hpp" -#include "btllib/bloom_filter.hpp" -#include "btllib/status.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -const static std::string PROGNAME = "indexlr"; -const static std::string VERSION = "v1.3"; -const static size_t OUTPUT_PERIOD_SHORT = 512; -const static size_t OUTPUT_PERIOD_LONG = 2; -const static size_t INITIAL_OUTPUT_STREAM_SIZE = 100; -const static size_t QUEUE_SIZE = 64; -const static size_t MAX_THREADS = 5; - -static void -print_error_msg(const std::string& msg) -{ - std::cerr << PROGNAME << ' ' << VERSION << ": " << msg << std::endl; -} - -static void -print_usage() -{ - std::cerr - << "Usage: " << PROGNAME - << " -k K -w W [-r repeat_bf_path] [-s solid_bf_path] [--id] [--bx] [--pos] [--seq] " - "[-o FILE] FILE...\n\n" - " -k K Use K as k-mer size.\n" - " -w W Use W as sliding-window size.\n" - " --id Include read ids in the output.\n" - " --bx Include read barcodes in the output.\n" - " --len Include read length in the output.\n" - " --pos Include minimizer positions in the output (appended with : after " - "minimizer value).\n" - " --strand Include minimizer strands in the output (appended with : after minimizer " - "value).\n" - " --seq Include minimizer sequences in the output (appended with : after " - "minimizer value).\n" - " If a combination of --pos, --strand, and --seq options are provided, " - "they're appended in the --pos, --strand, --seq order after the minimizer value.\n" - " --long Enable long mode which is more efficient for long sequences (e.g. long " - "reads, contigs, reference).\n" - " -r repeat_bf_path Use a Bloom filter to filter out repetitive minimizers.\n" - " -s solid_bf_path Use a Bloom filter to only select solid minimizers.\n" - " -o FILE Write output to FILE, default is stdout.\n" - " -t T Use T number of threads (default 5, max 5) per input file.\n" - " -v Show verbose output.\n" - " --help Display this help and exit.\n" - " --version Display version and exit.\n" - " FILE Space separated list of FASTA/Q files." - << std::endl; -} - -int -main(int argc, char* argv[]) -{ - int c; - int optindex = 0; - int help = 0, version = 0; - bool verbose = false; - unsigned k = 0, w = 0, t = 5; - bool w_set = false; - bool k_set = false; - int with_id = 0, with_bx = 0, with_readlen = 0, with_pos = 0, with_strand = 0, with_seq = 0; - std::unique_ptr repeat_bf, solid_bf; - bool with_repeat = false, with_solid = false; - int long_mode = 0; - std::string outfile("-"); - bool failed = false; - static const struct option longopts[] = { - { "id", no_argument, &with_id, 1 }, { "bx", no_argument, &with_bx, 1 }, - { "len", no_argument, &with_readlen, 1 }, { "pos", no_argument, &with_pos, 1 }, - { "strand", no_argument, &with_strand, 1 }, { "seq", no_argument, &with_seq, 1 }, - { "long", no_argument, &long_mode, 1 }, { "help", no_argument, &help, 1 }, - { "version", no_argument, &version, 1 }, { nullptr, 0, nullptr, 0 } - }; - while ((c = getopt_long(argc, argv, "k:w:o:t:vr:s:", longopts, &optindex)) != -1) { - switch (c) { - case 0: - break; - case 'k': - k_set = true; - k = std::stoul(optarg); - break; - case 'w': - w_set = true; - w = std::stoul(optarg); - break; - case 'o': - outfile = optarg; - break; - case 't': - t = std::stoul(optarg); - break; - case 'v': - verbose = true; - break; - case 'r': { - with_repeat = true; - std::cerr << "Loading repeat Bloom filter from " << optarg << std::endl; - try { - repeat_bf = - std::unique_ptr(new btllib::KmerBloomFilter(optarg)); - } catch (const std::exception& e) { - std::cerr << e.what() << '\n'; - } - std::cerr << "Finished loading repeat Bloom filter" << std::endl; - break; - } - case 's': { - with_solid = true; - std::cerr << "Loading solid Bloom filter from " << optarg << std::endl; - try { - solid_bf = - std::unique_ptr(new btllib::KmerBloomFilter(optarg)); - } catch (const std::exception& e) { - std::cerr << e.what() << '\n'; - } - std::cerr << "Finished loading solid Bloom filter" << std::endl; - break; - } - default: - std::exit(EXIT_FAILURE); - } - } - if (t > MAX_THREADS) { - t = MAX_THREADS; - std::cerr << (PROGNAME + ' ' + VERSION + ": Using more than " + - std::to_string(MAX_THREADS) + " threads does not scale, reverting to 5.\n") - << std::flush; - } - std::vector infiles(&argv[optind], &argv[argc]); - if (argc < 2) { - print_usage(); - std::exit(EXIT_FAILURE); - } - if (help != 0) { - print_usage(); - std::exit(EXIT_SUCCESS); - } else if (version != 0) { - std::cerr << PROGNAME << ' ' << VERSION << std::endl; - std::exit(EXIT_SUCCESS); - } - if (!k_set) { - print_error_msg("missing option -- 'k'"); - failed = true; - } else if (k == 0) { - print_error_msg("option has incorrect value -- 'k'"); - failed = true; - } - if (!w_set) { - print_error_msg("missing option -- 'w'"); - failed = true; - } else if (w == 0) { - print_error_msg("option has incorrect value -- 'w'"); - failed = true; - } - if (infiles.empty()) { - print_error_msg("missing file operand"); - failed = true; - } - if (failed) { - std::cerr << "Try '" << PROGNAME << " --help' for more information.\n"; - std::exit(EXIT_FAILURE); - } - - unsigned flags = 0; - if (with_id) { - flags |= btllib::Indexlr::Flag::ID; - } - if (with_bx) { - flags |= btllib::Indexlr::Flag::BX; - } - if (with_seq) { - flags |= btllib::Indexlr::Flag::SEQ; - } - if (long_mode) { - flags |= btllib::Indexlr::Flag::LONG_MODE; - } - - btllib::Indexlr::Record record; - FILE* out; - if (outfile == "-") { - out = stdout; - } else { - out = fopen(outfile.c_str(), "w"); - } - for (auto& infile : infiles) { - std::unique_ptr indexlr; - if (with_repeat && with_solid) { - flags |= btllib::Indexlr::Flag::FILTER_IN; - flags |= btllib::Indexlr::Flag::FILTER_OUT; - indexlr = std::unique_ptr(new btllib::Indexlr( - infile, - k, - w, - flags, - t, - verbose, - 0, - solid_bf->get_bloom_filter(), - repeat_bf->get_bloom_filter())); - } else if (with_repeat) { - flags |= btllib::Indexlr::Flag::FILTER_OUT; - indexlr = std::unique_ptr(new btllib::Indexlr( - infile, k, w, flags, t, verbose, 0, repeat_bf->get_bloom_filter())); - } else if (with_solid) { - flags |= btllib::Indexlr::Flag::FILTER_IN; - indexlr = std::unique_ptr( - new btllib::Indexlr(infile, k, w, flags, t, verbose, 0, solid_bf->get_bloom_filter())); - } else { - indexlr = std::unique_ptr( - new btllib::Indexlr(infile, k, w, flags, t, verbose)); - } - std::queue output_queue; - std::mutex output_queue_mutex; - std::condition_variable queue_empty, queue_full; - size_t max_seen_output_size = INITIAL_OUTPUT_STREAM_SIZE; - const size_t output_period = long_mode ? OUTPUT_PERIOD_LONG : OUTPUT_PERIOD_SHORT; - std::unique_ptr info_compiler(new std::thread([&]() { - std::stringstream ss; - while ((record = indexlr->get_minimizers())) { - if (with_id || (!with_id && !with_bx)) { - ss << record.id << '\t'; - } - if (with_bx) { - ss << record.barcode << '\t'; - } - if (with_readlen) { - ss << record.readlen << '\t'; - } - int j = 0; - for (const auto& min : record.minimizers) { - if (j > 0) { - ss << ' '; - } - ss << min.out_hash; - if (with_pos) { - ss << ':' << min.pos; - } - if (with_strand) { - ss << ':' << (min.forward ? '+' : '-'); - } - if (with_seq) { - ss << ':' << min.seq; - } - j++; - } - ss << '\n'; - if (record.num % output_period == output_period - 1) { - auto ss_str = ss.str(); - max_seen_output_size = std::max(max_seen_output_size, ss_str.size()); - std::unique_lock lock(output_queue_mutex); - while (output_queue.size() == QUEUE_SIZE) { - queue_full.wait(lock); - } - output_queue.push(std::move(ss_str)); - queue_empty.notify_one(); - lock.unlock(); - std::string newstring; - newstring.reserve(max_seen_output_size); - ss.str(std::move(newstring)); - } - } - { - std::unique_lock lock(output_queue_mutex); - output_queue.push(ss.str()); - output_queue.push(std::string()); - queue_empty.notify_one(); - } - })); - std::unique_ptr output_worker(new std::thread([&]() { - std::string to_write; - for (;;) { - { - std::unique_lock lock(output_queue_mutex); - while (output_queue.empty()) { - queue_empty.wait(lock); - } - to_write = std::move(output_queue.front()); - output_queue.pop(); - queue_full.notify_one(); - } - if (to_write.empty()) { - break; - } - btllib::check_error( - fwrite(to_write.c_str(), 1, to_write.size(), out) != to_write.size(), - "Indexlr: fwrite failed."); - } - })); - info_compiler->join(); - output_worker->join(); - } - if (out != stdout) { - fclose(out); - } - - return 0; -} diff --git a/src/include/btllib/indexlr2.hpp b/src/include/btllib/indexlr2.hpp deleted file mode 100644 index e5ce406..0000000 --- a/src/include/btllib/indexlr2.hpp +++ /dev/null @@ -1,684 +0,0 @@ -#ifndef BTLLIB_INDEXLR_HPP -#define BTLLIB_INDEXLR_HPP - -#include "bloom_filter.hpp" -#include "nthash.hpp" -#include "order_queue.hpp" -#include "seq_reader.hpp" -#include "status.hpp" -#include "util.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -// TODO: Allow multiple Indexlr objects to be instantiated (by assigning ID to -// each instance / indexing static members based on ID) -class Indexlr -{ - -public: - /* Has to be a struct and not an enum because: - * 1) Non-class enums are not name qualified and can collide - * 2) class enums can't be implicitly converted into integers - */ - struct Flag - { - static const unsigned ID = 0; - static const unsigned NO_ID = 1; - static const unsigned BX = 2; - static const unsigned NO_BX = 0; - static const unsigned SEQ = 4; - static const unsigned NO_SEQ = 0; - static const unsigned FILTER_IN = 8; - static const unsigned NO_FILTER_IN = 0; - static const unsigned FILTER_OUT = 16; - static const unsigned NO_FILTER_OUT = 0; - static const unsigned SHORT_MODE = 0; - static const unsigned LONG_MODE = 32; - }; - - bool output_id() const { return bool(~flags & Flag::NO_ID); } - bool output_bx() const { return bool(flags & Flag::BX); } - bool output_seq() const { return bool(flags & Flag::SEQ); } - bool filter_in() const { return bool(flags & Flag::FILTER_IN); } - bool filter_out() const { return bool(flags & Flag::FILTER_OUT); } - bool short_mode() const { return bool(~flags & Flag::LONG_MODE); } - bool long_mode() const { return bool(flags & Flag::LONG_MODE); } - - struct Read - { - Read() {} - - Read(size_t num, std::string id, std::string comment, std::string seq) - : num(num) - , id(std::move(id)) - , comment(std::move(comment)) - , seq(std::move(seq)) - {} - - size_t num = 0; - std::string id; - std::string comment; - std::string seq; - }; - - struct Minimizer - { - Minimizer() = default; - - Minimizer(uint64_t min_hash, - uint64_t out_hash, - size_t pos, - bool forward, - std::string seq) - : min_hash(min_hash) - , out_hash(out_hash) - , pos(pos) - , forward(forward) - , seq(std::move(seq)) - {} - - uint64_t min_hash = 0, out_hash = 0; - size_t pos = 0; - bool forward = false; - std::string seq; - }; - - using HashedKmer = Minimizer; - - struct Record - { - Record() {} - - Record(size_t num, - std::string id, - std::string barcode, - size_t readlen, - std::vector minimizers) - : num(num) - , id(std::move(id)) - , barcode(std::move(barcode)) - , readlen(readlen) - , minimizers(std::move(minimizers)) - {} - - size_t num = 0; - std::string id; - std::string barcode; - size_t readlen = 0; - std::vector minimizers; - - operator bool() const { return !id.empty() || !barcode.empty(); } - }; - - Record get_minimizers(); - - Indexlr(std::string seqfile, - size_t k, - size_t w, - unsigned flags = 0, - unsigned threads = 5, - bool verbose = false, - size_t W = 0, - std::vector ss = std::vector(), - const btllib::BloomFilter& bf1 = Indexlr::dummy_bf(), - const btllib::BloomFilter& bf2 = Indexlr::dummy_bf()); - - ~Indexlr(); - - static const size_t MAX_SIMULTANEOUS_INDEXLRS = 256; - - static const size_t SHORT_MODE_BUFFER_SIZE = 32; - static const size_t SHORT_MODE_BLOCK_SIZE = 32; - - static const size_t LONG_MODE_BUFFER_SIZE = 4; - static const size_t LONG_MODE_BLOCK_SIZE = 1; - -private: - static std::string extract_barcode(const std::string& id, - const std::string& comment); - std::vector minimize(const std::string& seq) const; - - std::vector minimize_with_ss(const std::string& seq) const; - - const std::string seqfile; - const size_t k, w; - const unsigned flags; - const unsigned threads; - const bool verbose; - size_t W = 0; - std::vector ss; - const long id; - const size_t buffer_size; - const size_t block_size; - - - static const BloomFilter& dummy_bf() - { - static const BloomFilter VAR; - return VAR; - } - - const std::reference_wrapper bf1; - const std::reference_wrapper bf2; - bool filter_in_enabled; - bool filter_out_enabled; - - std::atomic fasta{ false }; - OrderQueueSPMC input_queue; - OrderQueueMPSC output_queue; - - using OutputQueueType = decltype(output_queue); - static std::unique_ptr* ready_blocks_array() - { - thread_local static std::unique_ptr - var[MAX_SIMULTANEOUS_INDEXLRS]; - return var; - } - - static long* ready_blocks_owners() - { - thread_local static long var[MAX_SIMULTANEOUS_INDEXLRS]; - return var; - } - - static std::atomic& last_id() - { - static std::atomic var(0); - return var; - } - - class Worker - { - public: - void start() { t = std::thread(do_work, this); } - void join() { t.join(); } - - virtual ~Worker() {} - - Worker& operator=(const Worker& worker) = delete; - Worker& operator=(Worker&& worker) = delete; - - protected: - Worker(Indexlr& indexlr) - : indexlr(indexlr) - {} - - Worker(const Worker& worker) - : Worker(worker.indexlr) - {} - Worker(Worker&& worker) noexcept - : Worker(worker.indexlr) - {} - - Indexlr& indexlr; - - virtual void work() = 0; - static void do_work(Worker* worker) { worker->work(); } - - std::thread t; - }; - - class InputWorker : public Worker - { - public: - InputWorker(Indexlr& indexlr) - : Worker(indexlr) - {} - - InputWorker(const InputWorker& worker) - : InputWorker(worker.indexlr) - {} - InputWorker(InputWorker&& worker) noexcept - : InputWorker(worker.indexlr) - {} - - InputWorker& operator=(const InputWorker& worker) = delete; - InputWorker& operator=(InputWorker&& worker) = delete; - - void work() override; - }; - - class MinimizeWorker : public Worker - { - public: - MinimizeWorker(Indexlr& indexlr) - : Worker(indexlr) - {} - - MinimizeWorker(const MinimizeWorker& worker) - : MinimizeWorker(worker.indexlr) - {} - MinimizeWorker(MinimizeWorker&& worker) noexcept - : MinimizeWorker(worker.indexlr) - {} - - MinimizeWorker& operator=(const MinimizeWorker& worker) = delete; - MinimizeWorker& operator=(MinimizeWorker&& worker) = delete; - - void work() override; - }; - - SeqReader reader; - InputWorker input_worker; - std::vector minimize_workers; -}; - -inline Indexlr::Indexlr(std::string seqfile, - const size_t k, - const size_t w, - const unsigned flags, - const unsigned threads, - const bool verbose, - size_t W, - std::vector ss, - const BloomFilter& bf1, - const BloomFilter& bf2) - : seqfile(std::move(seqfile)) - , k(k) - , w(w) - , flags(flags) - , threads(threads) - , verbose(verbose) - , W(W) - , ss(ss) - , id(++last_id()) - , buffer_size(short_mode() ? SHORT_MODE_BUFFER_SIZE : LONG_MODE_BUFFER_SIZE) - , block_size(short_mode() ? SHORT_MODE_BLOCK_SIZE : LONG_MODE_BLOCK_SIZE) - , bf1(bf1) - , bf2(bf2) - , filter_in_enabled(filter_in()) - , filter_out_enabled(filter_out()) - , input_queue(buffer_size, block_size) - , output_queue(buffer_size, block_size) - , reader(this->seqfile, 0, 3, buffer_size, block_size) - , input_worker(*this) - , minimize_workers( - std::vector(threads, MinimizeWorker(*this))) -{ - input_worker.start(); - for (auto& worker : minimize_workers) { - worker.start(); - } -} - -inline Indexlr::~Indexlr() -{ - reader.close(); - for (auto& worker : minimize_workers) { - worker.join(); - } - input_worker.join(); -} - -// Minimerize a sequence: Find the minimizers of a vector of hash values -// representing a sequence. -/* Algorithm -v is a vector of non-negative integers -w is the window size -Invariants - 0 < w <= v.size() - 1 - 0 <= l <= r <= v.size() - 1 -Initial conditions - M = NIL Final set of minimizers, empty initially - min = -1 Minimum element - i = -1 Index of minimum element - prev = -1 Index of previous minimum element - l = 0 Index of left end of window - r = l + w - 1 Index of right end of window -Computation -At each window, if the previous minimum is out of scope, find the new, -right-most, minimum or else, check with only the right-most element to determine -if that is the new minimum. A minimizer is added to the final vector only if -it's index has changed. for each window of v bounded by [l, r] if (i < l) i = -index of minimum element in [l, r], furthest from l. else if (v[r] <= v[i]) i = -r min = v[i] if (i != prev) { prev = i M <- M + m - } - l = l + 1 Move window's left bound by one element - r = l + w - 1 Set window's right bound -}*/ - -inline std::string -Indexlr::extract_barcode(const std::string& id, const std::string& comment) -{ - const static std::string BARCODE_PREFIX = "BX:Z:"; - if (starts_with(comment, BARCODE_PREFIX)) { - const auto space_pos = comment.find(' '); - if (space_pos != std::string::npos) { - return comment.substr(BARCODE_PREFIX.size(), - space_pos - BARCODE_PREFIX.size()); - } - return comment.substr(BARCODE_PREFIX.size()); - } - const auto pound_pos = id.find('#'); - if (pound_pos != std::string::npos) { - const auto slash_pos = id.find('/'); - if (slash_pos > pound_pos) { - return id.substr(pound_pos + 1, slash_pos - (pound_pos + 1)); - } - } - return "NA"; -} - -inline std::vector -Indexlr::minimize(const std::string& seq) const -{ - if ((k > seq.size()) || (w > seq.size() - k + 1)) { - return {}; - } - if (W > seq.size()) { - return {}; - } - std::vector minimizers; - std::vector partial_minimizers; - std::vector paired_minimizers; - partial_minimizers.reserve(seq.size() - k + 1 - w); - paired_minimizers.reserve(seq.size() - W + 1); - minimizers.reserve(2 * (seq.size() - k + 1) / w); - std::vector hashed_kmers_buffer(w + 1); - ssize_t min_idx_left, min_idx_right, min_pos_prev = -1; - const Minimizer* min_current = nullptr; - size_t idx = 0; - for (NtHash nh(seq, k, 2); nh.roll(); ++idx) { - auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()]; - - hk = HashedKmer(nh.hashes()[0], - nh.hashes()[1], - nh.get_pos(), - nh.forward(), - output_seq() ? seq.substr(nh.get_pos(), k) : ""); - - if (filter_in() && filter_out()) { - std::vector tmp; - tmp = { hk.min_hash }; - if (!bf1.get().contains(tmp) || bf2.get().contains(tmp)) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_in()) { - if (!bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_out()) { - if (bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } - - if (idx + 1 >= w) { - min_idx_left = idx + 1 - w; - min_idx_right = idx + 1; - const auto& min_left = - hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()]; - const auto& min_right = - hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()]; - - if (min_current == nullptr || min_current->pos < min_left.pos) { - min_current = &min_left; - // Use of operator '<=' returns the minimum that is furthest from left. - for (ssize_t i = min_idx_left; i < min_idx_right; i++) { - const auto& min_i = - hashed_kmers_buffer[i % hashed_kmers_buffer.size()]; - if (min_i.min_hash <= min_current->min_hash) { - min_current = &min_i; - } - } - } else if (min_right.min_hash <= min_current->min_hash) { - min_current = &min_right; - } - if (!W) { - if (ssize_t(min_current->pos) > min_pos_prev && - min_current->min_hash != std::numeric_limits::max()) { - min_pos_prev = min_current->pos; - minimizers.push_back(*min_current); - } - } else { - partial_minimizers.push_back(*min_current); - } - } - } - uint64_t prev_min = std::numeric_limits::max(); - for (size_t i = W/2; i < partial_minimizers.size(); ++i) { - uint64_t curr_min = btllib::rolx(partial_minimizers[i].out_hash, k) ^ partial_minimizers[i - (W / 2)].out_hash; - if (curr_min == prev_min) { - continue; - } - prev_min = curr_min; - Minimizer paired_min = HashedKmer (0, curr_min, 0, false, ""); - paired_minimizers.push_back(paired_min); - } - if (W) { - std::cerr << "working345" << std::endl; - return paired_minimizers; - } - return minimizers; -} - -inline std::vector -Indexlr::minimize_with_ss(const std::string& seq) const -{ - if ((k > seq.size()) || (w > seq.size() - k + 1)) { - return {}; - } - if (W > seq.size()) { - return {}; - } - std::vector minimizers; - std::vector partial_minimizers; - std::vector paired_minimizers; - partial_minimizers.reserve(seq.size() - k + 1 - w); - paired_minimizers.reserve(seq.size() - W + 1); - minimizers.reserve(2 * (seq.size() - k + 1) / w); - std::vector hashed_kmers_buffer(w + 1); - ssize_t min_idx_left, min_idx_right, min_pos_prev = -1; - const Minimizer* min_current = nullptr; - size_t idx = 0; - for (SeedNtHash nh(seq, k, ss,2); nh.roll(); ++idx) { - auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()]; - - hk = HashedKmer(nh.hashes()[0], - nh.hashes()[1], - nh.get_pos(), - nh.forward(), - output_seq() ? seq.substr(nh.get_pos(), k) : ""); - - if (filter_in() && filter_out()) { - std::vector tmp; - tmp = { hk.min_hash }; - if (!bf1.get().contains(tmp) || bf2.get().contains(tmp)) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_in()) { - if (!bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } else if (filter_out()) { - if (bf1.get().contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); - } - } - - if (idx + 1 >= w) { - min_idx_left = idx + 1 - w; - min_idx_right = idx + 1; - const auto& min_left = - hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()]; - const auto& min_right = - hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()]; - - if (min_current == nullptr || min_current->pos < min_left.pos) { - min_current = &min_left; - // Use of operator '<=' returns the minimum that is furthest from left. - for (ssize_t i = min_idx_left; i < min_idx_right; i++) { - const auto& min_i = - hashed_kmers_buffer[i % hashed_kmers_buffer.size()]; - if (min_i.min_hash <= min_current->min_hash) { - min_current = &min_i; - } - } - } else if (min_right.min_hash <= min_current->min_hash) { - min_current = &min_right; - } - if (!W) { - if (ssize_t(min_current->pos) > min_pos_prev && - min_current->min_hash != std::numeric_limits::max()) { - min_pos_prev = min_current->pos; - minimizers.push_back(*min_current); - } - } else { - partial_minimizers.push_back(*min_current); - } - } - } - uint64_t prev_min = std::numeric_limits::max(); - for (size_t i = W/2; i < partial_minimizers.size(); ++i) { - uint64_t curr_min = btllib::rolx(partial_minimizers[i].out_hash, k) ^ partial_minimizers[i - (W / 2)].out_hash; - if (curr_min == prev_min) { - continue; - } - prev_min = curr_min; - Minimizer paired_min = HashedKmer (0, curr_min, 0, false, ""); - paired_minimizers.push_back(paired_min); - } - if (W) { - std::cerr << "working" << std::endl; - return paired_minimizers; - } - return minimizers; -} - - -inline Indexlr::Record -Indexlr::get_minimizers() -{ - if (ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] != id) { - ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS] = - std::unique_ptr( - new decltype(output_queue)::Block(block_size)); - ready_blocks_owners()[id % MAX_SIMULTANEOUS_INDEXLRS] = id; - } - auto& block = *(ready_blocks_array()[id % MAX_SIMULTANEOUS_INDEXLRS]); - if (block.count <= block.current) { - output_queue.read(block); - if (block.count <= block.current) { - output_queue.close(); - block = decltype(output_queue)::Block(block_size); - return Record(); - } - } - return std::move(block.data[block.current++]); -} - -inline void -Indexlr::InputWorker::work() -{ - if (indexlr.reader.get_format() == SeqReader::Format::FASTA) { - indexlr.fasta = true; - } else { - indexlr.fasta = false; - } - - decltype(indexlr.input_queue)::Block block(indexlr.block_size); - size_t current_block_num = 0; - SeqReader::Record record; - Read read; - while ((record = indexlr.reader.read())) { - block.data[block.count++] = Read(record.num, - std::move(record.name), - std::move(record.comment), - std::move(record.seq)); - if (block.count == indexlr.block_size) { - block.num = current_block_num++; - indexlr.input_queue.write(block); - block.count = 0; - } - } - if (block.count > 0) { - block.num = current_block_num++; - indexlr.input_queue.write(block); - } - for (unsigned i = 0; i < indexlr.threads; i++) { - block.num = current_block_num++; - block.current = 0; - block.count = 0; - indexlr.input_queue.write(block); - } -} - -inline void -Indexlr::MinimizeWorker::work() -{ - decltype(indexlr.input_queue)::Block input_block(indexlr.block_size); - decltype(indexlr.output_queue)::Block output_block(indexlr.block_size); - - for (;;) { - if (input_block.current == input_block.count) { - if (output_block.count > 0) { - output_block.num = input_block.num; - indexlr.output_queue.write(output_block); - output_block.current = 0; - output_block.count = 0; - } - indexlr.input_queue.read(input_block); - } - if (input_block.count == 0) { - output_block.num = input_block.num; - output_block.current = 0; - output_block.count = 0; - indexlr.output_queue.write(output_block); - break; - } - Read& read = input_block.data[input_block.current++]; - Record record; - record.num = read.num; - if (indexlr.output_id()) { - record.id = std::move(read.id); - } - if (indexlr.output_bx()) { - record.barcode = indexlr.extract_barcode(record.id, read.comment); - } - record.readlen = read.seq.size(); - - check_info(indexlr.verbose && indexlr.k > read.seq.size(), - "Indexlr: skipped seq " + std::to_string(read.num) + - " on line " + - std::to_string(read.num * (indexlr.fasta ? 2 : 4) + 2) + - "; k (" + std::to_string(indexlr.k) + ") > seq length (" + - std::to_string(read.seq.size()) + ")"); - - check_info(indexlr.verbose && indexlr.w > read.seq.size() - indexlr.k + 1, - "Indexlr: skipped seq " + std::to_string(read.num) + - " on line " + - std::to_string(read.num * (indexlr.fasta ? 2 : 4) + 2) + - "; w (" + std::to_string(indexlr.w) + ") > # of hashes (" + - std::to_string(read.seq.size() - indexlr.k + 1) + ")"); - - if (indexlr.k <= read.seq.size() && - indexlr.w <= read.seq.size() - indexlr.k + 1) { - if (indexlr.ss.size() == 0) { - record.minimizers = indexlr.minimize(read.seq); - } else { - std::cerr << "working2" < -#include -#include -#include - -namespace btllib { - -// offset for the complement base in the random seeds table -const uint8_t CP_OFF = 0x07; - -// shift for gerenerating multiple hash values -const int MULTISHIFT = 27; - -// seed for gerenerating multiple hash values -static const uint64_t MULTISEED = 0x90b45d39fb6da1fa; - -// 64-bit random seeds corresponding to bases and their complements -static const uint64_t SEED_A = 0x3c8bfbb395c60474; -static const uint64_t SEED_C = 0x3193c18562a02b4c; -static const uint64_t SEED_G = 0x20323ed082572324; -static const uint64_t SEED_T = 0x295549f54be24456; -static const uint64_t SEED_N = 0x0000000000000000; - -static const int ASCII_SIZE = 256; - -static const uint64_t SEED_TAB[ASCII_SIZE] = { - SEED_N, SEED_T, SEED_N, SEED_G, SEED_A, SEED_A, SEED_N, SEED_C, // 0..7 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 8..15 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 16..23 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 24..31 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 32..39 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 40..47 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 48..55 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 56..63 - SEED_N, SEED_A, SEED_N, SEED_C, SEED_N, SEED_N, SEED_N, SEED_G, // 64..71 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 72..79 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_T, SEED_T, SEED_N, SEED_N, // 80..87 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 88..95 - SEED_N, SEED_A, SEED_N, SEED_C, SEED_N, SEED_N, SEED_N, SEED_G, // 96..103 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 104..111 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_T, SEED_T, SEED_N, SEED_N, // 112..119 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 120..127 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 128..135 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 136..143 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 144..151 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 152..159 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 160..167 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 168..175 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 176..183 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 184..191 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 192..199 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 200..207 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 208..215 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 216..223 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 224..231 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 232..239 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, // 240..247 - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N // 248..255 -}; - -static const uint64_t A33R[33] = { - 0x195c60474, 0x12b8c08e9, 0x571811d3, 0xae3023a6, 0x15c60474c, 0xb8c08e99, - 0x171811d32, 0xe3023a65, 0x1c60474ca, 0x18c08e995, 0x11811d32b, 0x3023a657, - 0x60474cae, 0xc08e995c, 0x1811d32b8, 0x1023a6571, 0x474cae3, 0x8e995c6, - 0x11d32b8c, 0x23a65718, 0x474cae30, 0x8e995c60, 0x11d32b8c0, 0x3a657181, - 0x74cae302, 0xe995c604, 0x1d32b8c08, 0x1a6571811, 0x14cae3023, 0x995c6047, - 0x132b8c08e, 0x6571811d, 0xcae3023a -}; - -static const uint64_t A31L[31] = { - 0x3c8bfbb200000000, 0x7917f76400000000, 0xf22feec800000000, - 0xe45fdd9200000000, 0xc8bfbb2600000000, 0x917f764e00000000, - 0x22feec9e00000000, 0x45fdd93c00000000, 0x8bfbb27800000000, - 0x17f764f200000000, 0x2feec9e400000000, 0x5fdd93c800000000, - 0xbfbb279000000000, 0x7f764f2200000000, 0xfeec9e4400000000, - 0xfdd93c8a00000000, 0xfbb2791600000000, 0xf764f22e00000000, - 0xeec9e45e00000000, 0xdd93c8be00000000, 0xbb27917e00000000, - 0x764f22fe00000000, 0xec9e45fc00000000, 0xd93c8bfa00000000, - 0xb27917f600000000, 0x64f22fee00000000, 0xc9e45fdc00000000, - 0x93c8bfba00000000, 0x27917f7600000000, 0x4f22feec00000000, - 0x9e45fdd800000000 -}; - -static const uint64_t C33R[33] = { - 0x162a02b4c, 0xc5405699, 0x18a80ad32, 0x115015a65, 0x2a02b4cb, 0x54056996, - 0xa80ad32c, 0x15015a658, 0xa02b4cb1, 0x140569962, 0x80ad32c5, 0x1015a658a, - 0x2b4cb15, 0x569962a, 0xad32c54, 0x15a658a8, 0x2b4cb150, 0x569962a0, - 0xad32c540, 0x15a658a80, 0xb4cb1501, 0x169962a02, 0xd32c5405, 0x1a658a80a, - 0x14cb15015, 0x9962a02b, 0x132c54056, 0x658a80ad, 0xcb15015a, 0x1962a02b4, - 0x12c540569, 0x58a80ad3, 0xb15015a6 -}; - -static const uint64_t C31L[31] = { - 0x3193c18400000000, 0x6327830800000000, 0xc64f061000000000, - 0x8c9e0c2200000000, 0x193c184600000000, 0x3278308c00000000, - 0x64f0611800000000, 0xc9e0c23000000000, 0x93c1846200000000, - 0x278308c600000000, 0x4f06118c00000000, 0x9e0c231800000000, - 0x3c18463200000000, 0x78308c6400000000, 0xf06118c800000000, - 0xe0c2319200000000, 0xc184632600000000, 0x8308c64e00000000, - 0x6118c9e00000000, 0xc23193c00000000, 0x1846327800000000, - 0x308c64f000000000, 0x6118c9e000000000, 0xc23193c000000000, - 0x8463278200000000, 0x8c64f0600000000, 0x118c9e0c00000000, - 0x23193c1800000000, 0x4632783000000000, 0x8c64f06000000000, - 0x18c9e0c200000000 -}; - -static const uint64_t G33R[33] = { - 0x82572324, 0x104ae4648, 0x95c8c91, 0x12b91922, 0x25723244, 0x4ae46488, - 0x95c8c910, 0x12b919220, 0x57232441, 0xae464882, 0x15c8c9104, 0xb9192209, - 0x172324412, 0xe4648825, 0x1c8c9104a, 0x191922095, 0x12324412b, 0x46488257, - 0x8c9104ae, 0x11922095c, 0x324412b9, 0x64882572, 0xc9104ae4, 0x1922095c8, - 0x124412b91, 0x48825723, 0x9104ae46, 0x122095c8c, 0x4412b919, 0x88257232, - 0x1104ae464, 0x2095c8c9, 0x412b9192 -}; - -static const uint64_t G31L[31] = { - 0x20323ed000000000, 0x40647da000000000, 0x80c8fb4000000000, - 0x191f68200000000, 0x323ed0400000000, 0x647da0800000000, - 0xc8fb41000000000, 0x191f682000000000, 0x323ed04000000000, - 0x647da08000000000, 0xc8fb410000000000, 0x91f6820200000000, - 0x23ed040600000000, 0x47da080c00000000, 0x8fb4101800000000, - 0x1f68203200000000, 0x3ed0406400000000, 0x7da080c800000000, - 0xfb41019000000000, 0xf682032200000000, 0xed04064600000000, - 0xda080c8e00000000, 0xb410191e00000000, 0x6820323e00000000, - 0xd040647c00000000, 0xa080c8fa00000000, 0x410191f600000000, - 0x820323ec00000000, 0x40647da00000000, 0x80c8fb400000000, - 0x10191f6800000000 -}; - -static const uint64_t T33R[33] = { - 0x14be24456, 0x97c488ad, 0x12f89115a, 0x5f1222b5, 0xbe24456a, 0x17c488ad4, - 0xf89115a9, 0x1f1222b52, 0x1e24456a5, 0x1c488ad4b, 0x189115a97, 0x11222b52f, - 0x24456a5f, 0x488ad4be, 0x9115a97c, 0x1222b52f8, 0x4456a5f1, 0x88ad4be2, - 0x1115a97c4, 0x22b52f89, 0x456a5f12, 0x8ad4be24, 0x115a97c48, 0x2b52f891, - 0x56a5f122, 0xad4be244, 0x15a97c488, 0xb52f8911, 0x16a5f1222, 0xd4be2445, - 0x1a97c488a, 0x152f89115, 0xa5f1222b -}; - -static const uint64_t T31L[31] = { - 0x295549f400000000, 0x52aa93e800000000, 0xa55527d000000000, - 0x4aaa4fa200000000, 0x95549f4400000000, 0x2aa93e8a00000000, - 0x55527d1400000000, 0xaaa4fa2800000000, 0x5549f45200000000, - 0xaa93e8a400000000, 0x5527d14a00000000, 0xaa4fa29400000000, - 0x549f452a00000000, 0xa93e8a5400000000, 0x527d14aa00000000, - 0xa4fa295400000000, 0x49f452aa00000000, 0x93e8a55400000000, - 0x27d14aaa00000000, 0x4fa2955400000000, 0x9f452aa800000000, - 0x3e8a555200000000, 0x7d14aaa400000000, 0xfa29554800000000, - 0xf452aa9200000000, 0xe8a5552600000000, 0xd14aaa4e00000000, - 0xa295549e00000000, 0x452aa93e00000000, 0x8a55527c00000000, - 0x14aaa4fa00000000 -}; - -static const uint64_t N33R[33] = { - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N -}; - -static const uint64_t N31L[31] = { - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, - SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N, SEED_N -}; - -static const uint64_t* const MS_TAB_33R[ASCII_SIZE] = { - N33R, T33R, N33R, G33R, A33R, A33R, N33R, C33R, // 0..7 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 8..15 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 16..23 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 24..31 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 32..39 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 40..47 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 48..55 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 56..63 - N33R, A33R, N33R, C33R, N33R, N33R, N33R, G33R, // 64..71 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 72..79 - N33R, N33R, N33R, N33R, T33R, T33R, N33R, N33R, // 80..87 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 88..95 - N33R, A33R, N33R, C33R, N33R, N33R, N33R, G33R, // 96..103 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 104..111 - N33R, N33R, N33R, N33R, T33R, T33R, N33R, N33R, // 112..119 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 120..127 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 128..135 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 136..143 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 144..151 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 152..159 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 160..167 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 168..175 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 176..183 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 184..191 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 192..199 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 200..207 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 208..215 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 216..223 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 224..231 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 232..239 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R, // 240..247 - N33R, N33R, N33R, N33R, N33R, N33R, N33R, N33R // 248..255 -}; - -static const uint64_t* const MS_TAB_31L[ASCII_SIZE] = { - N31L, T31L, N31L, G31L, A31L, A31L, N31L, C31L, // 0..7 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 8..15 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 16..23 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 24..31 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 32..39 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 40..47 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 48..55 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 56..63 - N31L, A31L, N31L, C31L, N31L, N31L, N31L, G31L, // 64..71 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 72..79 - N31L, N31L, N31L, N31L, T31L, T31L, N31L, N31L, // 80..87 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 88..95 - N31L, A31L, N31L, C31L, N31L, N31L, N31L, G31L, // 96..103 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 104..111 - N31L, N31L, N31L, N31L, T31L, T31L, N31L, N31L, // 112..119 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 120..127 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 128..135 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 136..143 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 144..151 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 152..159 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 160..167 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 168..175 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 176..183 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 184..191 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 192..199 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 200..207 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 208..215 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 216..223 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 224..231 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 232..239 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L, // 240..247 - N31L, N31L, N31L, N31L, N31L, N31L, N31L, N31L // 248..255 -}; - -static const uint8_t RC_CONVERT_TAB[256] = { - 255, 255, 255, 255, 255, 255, 255, 255, // 0..7 - 255, 255, 255, 255, 255, 255, 255, 255, // 8..15 - 255, 255, 255, 255, 255, 255, 255, 255, // 16..23 - 255, 255, 255, 255, 255, 255, 255, 255, // 24..31 - 255, 255, 255, 255, 255, 255, 255, 255, // 32..39 - 255, 255, 255, 255, 255, 255, 255, 255, // 40..47 - 255, 255, 255, 255, 255, 255, 255, 255, // 48..55 - 255, 255, 255, 255, 255, 255, 255, 255, // 56..63 - 255, 3, 255, 2, 255, 255, 255, 1, // 64..71 - 255, 255, 255, 255, 255, 255, 255, 255, // 72..79 - 255, 255, 255, 255, 0, 3, 255, 255, // 80..87 - 255, 255, 255, 255, 255, 255, 255, 255, // 88..95 - 255, 3, 255, 2, 255, 255, 255, 1, // 96..103 - 255, 255, 255, 255, 255, 255, 255, 255, // 104..111 - 255, 255, 255, 255, 0, 3, 255, 255, // 112..119 - 255, 255, 255, 255, 255, 255, 255, 255, // 120..127 - 255, 255, 255, 255, 255, 255, 255, 255, // 128..135 - 255, 255, 255, 255, 255, 255, 255, 255, // 136..143 - 255, 255, 255, 255, 255, 255, 255, 255, // 144..151 - 255, 255, 255, 255, 255, 255, 255, 255, // 152..159 - 255, 255, 255, 255, 255, 255, 255, 255, // 160..167 - 255, 255, 255, 255, 255, 255, 255, 255, // 168..175 - 255, 255, 255, 255, 255, 255, 255, 255, // 176..183 - 255, 255, 255, 255, 255, 255, 255, 255, // 184..191 - 255, 255, 255, 255, 255, 255, 255, 255, // 192..199 - 255, 255, 255, 255, 255, 255, 255, 255, // 200..207 - 255, 255, 255, 255, 255, 255, 255, 255, // 208..215 - 255, 255, 255, 255, 255, 255, 255, 255, // 216..223 - 255, 255, 255, 255, 255, 255, 255, 255, // 224..231 - 255, 255, 255, 255, 255, 255, 255, 255, // 232..239 - 255, 255, 255, 255, 255, 255, 255, 255, // 240..247 - 255, 255, 255, 255, 255, 255, 255, 255 // 248..255 -}; - -static const uint8_t CONVERT_TAB[256] = { - 255, 255, 255, 255, 255, 255, 255, 255, // 0..7 - 255, 255, 255, 255, 255, 255, 255, 255, // 8..15 - 255, 255, 255, 255, 255, 255, 255, 255, // 16..23 - 255, 255, 255, 255, 255, 255, 255, 255, // 24..31 - 255, 255, 255, 255, 255, 255, 255, 255, // 32..39 - 255, 255, 255, 255, 255, 255, 255, 255, // 40..47 - 255, 255, 255, 255, 255, 255, 255, 255, // 48..55 - 255, 255, 255, 255, 255, 255, 255, 255, // 56..63 - 255, 0, 255, 1, 255, 255, 255, 2, // 64..71 - 255, 255, 255, 255, 255, 255, 255, 255, // 72..79 - 255, 255, 255, 255, 3, 0, 255, 255, // 80..87 - 255, 255, 255, 255, 255, 255, 255, 255, // 88..95 - 255, 0, 255, 1, 255, 255, 255, 2, // 96..103 - 255, 255, 255, 255, 255, 255, 255, 255, // 104..111 - 255, 255, 255, 255, 3, 0, 255, 255, // 112..119 - 255, 255, 255, 255, 255, 255, 255, 255, // 120..127 - 255, 255, 255, 255, 255, 255, 255, 255, // 128..135 - 255, 255, 255, 255, 255, 255, 255, 255, // 136..143 - 255, 255, 255, 255, 255, 255, 255, 255, // 144..151 - 255, 255, 255, 255, 255, 255, 255, 255, // 152..159 - 255, 255, 255, 255, 255, 255, 255, 255, // 160..167 - 255, 255, 255, 255, 255, 255, 255, 255, // 168..175 - 255, 255, 255, 255, 255, 255, 255, 255, // 176..183 - 255, 255, 255, 255, 255, 255, 255, 255, // 184..191 - 255, 255, 255, 255, 255, 255, 255, 255, // 192..199 - 255, 255, 255, 255, 255, 255, 255, 255, // 200..207 - 255, 255, 255, 255, 255, 255, 255, 255, // 208..215 - 255, 255, 255, 255, 255, 255, 255, 255, // 216..223 - 255, 255, 255, 255, 255, 255, 255, 255, // 224..231 - 255, 255, 255, 255, 255, 255, 255, 255, // 232..239 - 255, 255, 255, 255, 255, 255, 255, 255, // 240..247 - 255, 255, 255, 255, 255, 255, 255, 255 // 248..255 -}; - -static const uint64_t DIMER_TAB[16] = { - 5015898201438948509U, 5225361804584821669U, 6423762225589857229U, - 5783394398799547583U, 6894017875502584557U, 5959461383092338133U, - 4833978511655400893U, 5364573296520205007U, 9002561594443973180U, - 8212239310050454788U, 6941810030513055084U, 7579897184553533982U, - 7935738758488558809U, 7149836515649299425U, 8257540373175577481U, - 8935100007508790523U -}; - -static const uint64_t TRIMER_TAB[64] = { - 13237172352163388750U, 13451082378889146998U, 12324706752351386142U, - 11704099346423635308U, 12503002411303846718U, 11573033083854154758U, - 12770611021816489070U, 13284814289517544220U, 10286336837755622383U, - 9500434588327378135U, 10554658215321236671U, 11177611689138066381U, - 11245073286936829194U, 10454751004568891954U, 9274956656780491354U, - 9930495270120774952U, 9498947889754972591U, 10289371588586147479U, - 11487222103436658431U, 10812501148518244749U, 11088845979783725023U, - 10735249574334615783U, 9609199230360475791U, 10105458452942995453U, - 13447889238169808654U, 13238535845420384310U, 11968673763542288478U, - 12645600078955589420U, 12136759312206930411U, 11922809957208297171U, - 13031072242070652603U, 13668666814620918217U, 14219262150204358668U, - 14433136993975185204U, 15703263506252408668U, 15026899868095529006U, - 16097136083696541308U, 15167201938128040260U, 14113514427211577644U, - 14608043031429815902U, 18169629015343943341U, 17383691583363408277U, - 16185576633819064829U, 16859734366019948175U, 17215452794964541512U, - 16425095330967072624U, 17460550829194815256U, 18101973914136232042U, - 16197524846324948423U, 17136496960994620159U, 18190301010467109527U, - 17660752969549176293U, 18084590689685816247U, 17861669045228104847U, - 16591430392433501415U, 17233003275094786965U, 15689030113991676774U, - 15321980360070757470U, 14196301091602199606U, 14727918144983470916U, - 14660430141886012803U, 14297932370981794491U, 15550237822687034067U, - 16044915679164358049U -}; - -static const uint64_t TETRAMER_TAB[256] = { - 6047278271377325800U, 6842100033257738704U, 5716751207778949560U, - 5058261232784932554U, 5322212292231585944U, 4955210659836481440U, - 6153481158060361672U, 6630136099103187130U, 7683058811908681801U, - 7460089081761259377U, 8513615477720831769U, 9169618076073996395U, - 8669810821731892908U, 8451393064794886548U, 7271235746105367036U, - 7894785163577458318U, 7461575445318369801U, 7680024275870068017U, - 8878022265940976985U, 8237757801848291883U, 9060296013225843833U, - 8116780716040188737U, 6991106539262573353U, 7521593563379047515U, - 6845292839028968616U, 6045914992845185936U, 4775672622745250808U, - 5413871935584767114U, 5490367161684853325U, 4695435745326017909U, - 5803018666222232861U, 6480400171096490607U, 2381043025085637546U, - 3175899973157948562U, 4445879008075678970U, 3807116472585741192U, - 4268108881087626714U, 3901072061426881250U, 2847008385469766282U, - 3379366782720458232U, 1763336001516006667U, 1540401457157816883U, - 342666797974407771U, 983493939256405289U, 771890739233563630U, - 553508169276984534U, 1589643033626739902U, 2263336780810576844U, - 330722743541775969U, 688712796851212633U, 1742668713148160305U, - 1245320973785726531U, 2208596672445898769U, 1422777727841816361U, - 152919646732699457U, 826464124477841459U, 4460107693596700864U, - 3530055095011467256U, 2403999925630162832U, 2899137386794791138U, - 3398970977768160805U, 2464498338584432925U, 3716128830812494197U, - 4248337413163712007U, 4264326372183459627U, 3906261395711551507U, - 2851952150714671227U, 3383149429014333193U, 2386233046276708699U, - 3172117876357805667U, 4441779805226941963U, 3801926588820052345U, - 170684860043692426U, 1100671402695403186U, 2226926226858061530U, - 1693589575942097320U, 1193606390847620975U, 2128144916583147607U, - 876319371625685055U, 382305650241144653U, 1102545060664966090U, - 168107437338776818U, 1437989166537956506U, 1915072878734195688U, - 1548519783094789562U, 1757891215679916674U, 703889661060612842U, - 46092416782165400U, 3908715595921208683U, 4262294307145226835U, - 3064498623987880507U, 2585134797421409609U, 2661735585529691022U, - 3019760716990469302U, 4055956603131813086U, 3543998858204232620U, - 5317339067591416425U, 4959238909506745681U, 6157334207435046201U, - 6635009461133220427U, 6051307208490845209U, 6837227221258447649U, - 5711490920986878793U, 5054232433096901691U, 8122648135453742280U, - 9052599496358476784U, 7782418148093113240U, 7307023562816214250U, - 7095314801322056237U, 8029818144085865749U, 9137340041034366333U, - 8622472983995947535U, 7806751516869674914U, 7011855109925922970U, - 8137690373747335410U, 8757695200062998400U, 8531879593853721042U, - 8898947385530005226U, 7700757522090507906U, 7186022138009770480U, - 6135219772853324035U, 6358123720871388731U, 5304510851123850835U, - 4682089562405882145U, 5182028715320330214U, 5400512630465816798U, - 6580751683450298550U, 5923625422568720324U, 13124074928584983660U, - 13491146941631638356U, 12293650504952193852U, 11816502978180760654U, - 12399079312662682140U, 11604187204414436644U, 12730450818222161228U, - 13388307479092468286U, 10327209524901530317U, 9388215691182564853U, - 10657868830410829213U, 11137168911054473967U, 11357920004770333736U, - 10414374197647485712U, 9306325182584103800U, 9818342344138146826U, - 9386341947321596045U, 10329786896059045813U, 11455812913355464669U, - 10924692575052363951U, 10984992149858150141U, 10766613702172592581U, - 9568826821541020077U, 10208598699842184927U, 13488692655530571308U, - 13126106942075820308U, 12072096584926548348U, 12605510244625659406U, - 12249677498819492041U, 11882645355480553457U, 13062230760632229785U, - 13556163143878539499U, 14178740190036597038U, 14545847390080448022U, - 15599559227675164286U, 15067834145139579148U, 16065876409530435422U, - 15270949115358734438U, 14000758968863088654U, 14640014089599289212U, - 18281953465151117199U, 17342994818563569847U, 16217267316526477535U, - 16746698532205467565U, 17255653680509032810U, 16312143059561297490U, - 17564497017566543418U, 18061360711745100104U, 16237972021990524133U, - 17023861349393640413U, 18293930539975648181U, 17619893477009409223U, - 18115916316835994261U, 17757855915011241389U, 16704251839199542725U, - 17200966263939144375U, 15576639675766950468U, 15362743113290245500U, - 14164544455910714644U, 14841019967217601126U, 14620295210399335585U, - 14410818688327658393U, 15446357621659116529U, 16085462927495578755U, - 18237799192036655099U, 17294270664133710019U, 16258109964509321387U, - 16773410497518403545U, 16657084189963477387U, 16875519862962278067U, - 18127020052323321563U, 17507580374969491881U, 14153168177888129370U, - 14515696771658964578U, 15624080140268688906U, 15110866744451150200U, - 15466708232756051903U, 15833797605570023559U, 14563810316809509103U, - 14085706539145691037U, 14517711175708869402U, 14150731501263563810U, - 15402451490950456394U, 15899948742203982648U, 15224753927964908906U, - 16019597712369578578U, 14983744703118572090U, 14310050713553640776U, - 17296865610423782843U, 18235907873078829699U, 17055988043521714923U, - 16561000163437350297U, 16340222631939670878U, 17283720110790814822U, - 18338064546595415054U, 17805706452459078524U, 10375933128878629561U, - 9432369415202180481U, 10612588863825479145U, 11105888166746317467U, - 10794790039591648457U, 11013260899437695985U, 9905396050428550041U, - 9228014311730625771U, 13154226096333843480U, 13516719503928509216U, - 12264699899470662472U, 11768891770841246778U, 11836546934201131773U, - 12203601119882644933U, 13328994472388527533U, 12798507759874630367U, - 12277767672444305266U, 12068343612890878026U, 13176021535246260258U, - 13816435502572994384U, 12705517425460601090U, 13640043170446921274U, - 12460006250421962322U, 11929369723008524576U, 10597232027372843475U, - 11387585128312430315U, 10351852510211364483U, 9713802769929286129U, - 9357917249443839798U, 10143859113470169102U, 11342251114164164710U, - 10664720106027613972U -}; - -// rotate "v" to the left 1 position -inline uint64_t -rol1(const uint64_t v) -{ - return (v << 1) | (v >> 63); // NOLINT -} - -// rotate "v" to the left x position -inline uint64_t -rolx(const uint64_t v, const unsigned x) -{ - return (v << x) | (v >> (64 - x)); // NOLINT -} - -// rotate "v" to the right by 1 position -inline uint64_t -ror1(const uint64_t v) -{ - return (v >> 1) | (v << 63); // NOLINT -} - -// rotate 31-left bits of "v" to the left by "s" positions -inline uint64_t -rol31(const uint64_t v, unsigned s) -{ - s %= 31; // NOLINT - return ((v << s) | (v >> (31 - s))) & 0x7FFFFFFF; // NOLINT -} - -// rotate 33-right bits of "v" to the left by "s" positions -inline uint64_t -rol33(const uint64_t v, unsigned s) -{ - s %= 33; // NOLINT - return ((v << s) | (v >> (33 - s))) & 0x1FFFFFFFF; // NOLINT -} - -// swap bit 0 with bit 33 in "v" -inline uint64_t -swapbits033(const uint64_t v) -{ - uint64_t x = (v ^ (v >> 33)) & 1; // NOLINT - return v ^ (x | (x << 33)); // NOLINT -} - -// swap bit 32 with bit 63 in "v" -inline uint64_t -swapbits3263(const uint64_t v) -{ - uint64_t x = ((v >> 32) ^ (v >> 63)) & 1; // NOLINT - return v ^ ((x << 32) | (x << 63)); // NOLINT -} - -inline uint64_t -swapxbits033(const uint64_t v, const unsigned x) -{ - uint64_t y = (v ^ (v >> 33)) & // NOLINT - (std::numeric_limits::max() >> (64 - x)); // NOLINT - return v ^ (y | (y << 33)); // NOLINT -} - -// forward-strand hash value of the base kmer, i.e. fhval(kmer_0) -inline uint64_t -ntf64(const char* kmer_seq, const unsigned k) -{ - uint64_t h_val = 0; - for (unsigned i = 0; i < k / 4; i++) { - h_val = rolx(h_val, 4); - h_val = swapxbits033(h_val, 4); - uint8_t curr_offset = 4 * i; - uint8_t tetramer_loc = - 64 * CONVERT_TAB[(unsigned char)kmer_seq[curr_offset]] + // NOLINT - 16 * CONVERT_TAB[(unsigned char)kmer_seq[curr_offset + 1]] + // NOLINT - 4 * CONVERT_TAB[(unsigned char)kmer_seq[curr_offset + 2]] + - CONVERT_TAB[(unsigned char)kmer_seq[curr_offset + 3]]; - h_val ^= TETRAMER_TAB[tetramer_loc]; - } - unsigned remainder = k % 4; - h_val = rolx(h_val, remainder); - h_val = swapxbits033(h_val, remainder); - if (remainder == 3) { - uint8_t trimer_loc = - 16 * CONVERT_TAB[(unsigned char)kmer_seq[k - 3]] + // NOLINT - 4 * CONVERT_TAB[(unsigned char)kmer_seq[k - 2]] + - CONVERT_TAB[(unsigned char)kmer_seq[k - 1]]; - h_val ^= TRIMER_TAB[trimer_loc]; - } else if (remainder == 2) { - uint8_t dimer_loc = 4 * CONVERT_TAB[(unsigned char)kmer_seq[k - 2]] + - CONVERT_TAB[(unsigned char)kmer_seq[k - 1]]; - h_val ^= DIMER_TAB[dimer_loc]; - } else if (remainder == 1) { - h_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1]]; - } - return h_val; -} - -// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0) -inline uint64_t -ntr64(const char* kmer_seq, const unsigned k) -{ - uint64_t h_val = 0; - unsigned remainder = k % 4; - if (remainder == 3) { - uint8_t trimer_loc = - 16 * RC_CONVERT_TAB[(unsigned char)kmer_seq[k - 1]] + // NOLINT - 4 * RC_CONVERT_TAB[(unsigned char)kmer_seq[k - 2]] + - RC_CONVERT_TAB[(unsigned char)kmer_seq[k - 3]]; - h_val ^= TRIMER_TAB[trimer_loc]; - } else if (remainder == 2) { - uint8_t dimer_loc = 4 * RC_CONVERT_TAB[(unsigned char)kmer_seq[k - 1]] + - RC_CONVERT_TAB[(unsigned char)kmer_seq[k - 2]]; - h_val ^= DIMER_TAB[dimer_loc]; - } else if (remainder == 1) { - h_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1] & CP_OFF]; - } - for (unsigned i = 0; i < k / 4; i++) { - h_val = rolx(h_val, 4); - h_val = swapxbits033(h_val, 4); - uint8_t curr_offset = 4 * (k / 4 - i) - 1; - uint8_t tetramer_loc = - 64 * RC_CONVERT_TAB[(unsigned char)kmer_seq[curr_offset]] + // NOLINT - 16 * RC_CONVERT_TAB[(unsigned char)kmer_seq[curr_offset - 1]] + // NOLINT - 4 * RC_CONVERT_TAB[(unsigned char)kmer_seq[curr_offset - 2]] + - RC_CONVERT_TAB[(unsigned char)kmer_seq[curr_offset - 3]]; - h_val ^= TETRAMER_TAB[tetramer_loc]; - } - return h_val; -} - -// forward-strand ntHash for sliding k-mers -inline uint64_t -ntf64(const uint64_t fh_val, - const unsigned k, - const unsigned char char_out, - const unsigned char char_in) -{ - uint64_t h_val = rol1(fh_val); - h_val = swapbits033(h_val); - h_val ^= SEED_TAB[char_in]; - h_val ^= - (MS_TAB_31L[char_out][k % 31] | MS_TAB_33R[char_out][k % 33]); // NOLINT - return h_val; -} - -// reverse-complement ntHash for sliding k-mers -inline uint64_t -ntr64(const uint64_t rh_val, - const unsigned k, - const unsigned char char_out, - const unsigned char char_in) -{ - uint64_t h_val = rh_val ^ (MS_TAB_31L[char_in & CP_OFF][k % 31] | // NOLINT - MS_TAB_33R[char_in & CP_OFF][k % 33]); // NOLINT - h_val ^= SEED_TAB[char_out & CP_OFF]; - h_val = ror1(h_val); - h_val = swapbits3263(h_val); - return h_val; -} - -// canonical ntBase -inline uint64_t -ntc64(const char* kmer_seq, const unsigned k) -{ - uint64_t fh_val = 0, rh_val = 0; - fh_val = ntf64(kmer_seq, k); - rh_val = ntr64(kmer_seq, k); - return (rh_val < fh_val) ? rh_val : fh_val; -} - -// canonical ntHash -inline uint64_t -ntc64(const char* kmer_seq, - const unsigned k, - uint64_t& fh_val, - uint64_t& rh_val) -{ - fh_val = ntf64(kmer_seq, k); - rh_val = ntr64(kmer_seq, k); - return (rh_val < fh_val) ? rh_val : fh_val; -} - -// canonical ntHash for sliding k-mers -inline uint64_t -ntc64(const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - uint64_t& fh_val, - uint64_t& rh_val) -{ - fh_val = ntf64(fh_val, k, char_out, char_in); - rh_val = ntr64(rh_val, k, char_out, char_in); - return (rh_val < fh_val) ? rh_val : fh_val; -} - -// forward-strand ntHash for sliding k-mers to the left -inline uint64_t -ntf64l(const uint64_t rh_val, - const unsigned k, - const unsigned char char_out, - const unsigned char char_in) -{ - uint64_t h_val = rh_val ^ (MS_TAB_31L[char_in][k % 31] | // NOLINT - MS_TAB_33R[char_in][k % 33]); // NOLINT - h_val ^= SEED_TAB[char_out]; - h_val = ror1(h_val); - h_val = swapbits3263(h_val); - return h_val; -} - -// reverse-complement ntHash for sliding k-mers to the left -inline uint64_t -ntr64l(const uint64_t fh_val, - const unsigned k, - const unsigned char char_out, - const unsigned char char_in) -{ - uint64_t h_val = rol1(fh_val); - h_val = swapbits033(h_val); - h_val ^= SEED_TAB[char_in & CP_OFF]; - h_val ^= (MS_TAB_31L[char_out & CP_OFF][k % 31] | // NOLINT - MS_TAB_33R[char_out & CP_OFF][k % 33]); // NOLINT - return h_val; -} - -// canonical ntHash for sliding k-mers to the left -inline uint64_t -ntc64l(const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - uint64_t& fh_val, - uint64_t& rh_val) -{ - fh_val = ntf64l(fh_val, k, char_out, char_in); - rh_val = ntr64l(rh_val, k, char_out, char_in); - return (rh_val < fh_val) ? rh_val : fh_val; -} - -// ntBase with seeding option -inline uint64_t -ntf64(const char* kmer_seq, const unsigned k, const unsigned seed) -{ - uint64_t h_val = ntf64(kmer_seq, k); - if (seed == 0) { - return h_val; - } - h_val *= seed ^ k * MULTISEED; - h_val ^= h_val >> MULTISHIFT; - return h_val; -} - -// canonical ntBase with seeding option -inline uint64_t -ntc64(const char* kmer_seq, const unsigned k, const unsigned seed) -{ - uint64_t h_val = ntc64(kmer_seq, k); - if (seed == 0) { - return h_val; - } - h_val *= seed ^ k * MULTISEED; - h_val ^= h_val >> MULTISHIFT; - return h_val; -} - -// multihash ntHash, ntBase -inline void -ntm64(const char* kmer_seq, const unsigned k, const unsigned m, uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntf64(kmer_seq, k); - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// one extra hash for given base hash -inline uint64_t -nte64(const uint64_t h_val, const unsigned k, const unsigned i) -{ - uint64_t t_val = h_val; - t_val *= (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - return t_val; -} - -// multihash ntHash for sliding k-mers -inline void -ntm64(const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - const unsigned m, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntf64(h_val[0], k, char_out, char_in); - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// canonical multihash ntBase -inline void -ntmc64(const char* kmer_seq, - const unsigned k, - const unsigned m, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntc64(kmer_seq, k); - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// canonical multihash ntHash -inline void -ntmc64(const char* kmer_seq, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntc64(kmer_seq, k, fh_val, rh_val); - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// canonical multihash ntHash for sliding k-mers -inline void -ntmc64(const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntc64(char_out, char_in, k, fh_val, rh_val); - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -/* - * ignoring k-mers containing nonACGT using ntHash function - */ - -// canonical ntBase -inline bool -ntc64(const char* kmer_seq, const unsigned k, uint64_t& h_val, unsigned& loc_n) -{ - h_val = 0; - loc_n = 0; - uint64_t fh_val = 0, rh_val = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - h_val = (rh_val < fh_val) ? rh_val : fh_val; - return true; -} - -// canonical multihash ntBase -inline bool -ntmc64(const char* kmer_seq, - const unsigned k, - const unsigned m, - unsigned& loc_n, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0, fh_val = 0, rh_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - b_val = (rh_val < fh_val) ? rh_val : fh_val; - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } - return true; -} - -// canonical ntHash -inline bool -ntc64(const char* kmer_seq, - const unsigned k, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t& h_val, - unsigned& loc_n) -{ - h_val = fh_val = rh_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - h_val = (rh_val < fh_val) ? rh_val : fh_val; - return true; -} - -// canonical multihash ntHash -inline bool -ntmc64(const char* kmer_seq, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - unsigned& loc_n, - uint64_t* h_val) -{ - fh_val = rh_val = 0; - uint64_t b_val = 0, t_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - b_val = (rh_val < fh_val) ? rh_val : fh_val; - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } - return true; -} - -// strand-aware canonical multihash ntHash -inline bool -ntmc64(const char* kmer_seq, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - unsigned& loc_n, - uint64_t* h_val, - bool& h_stn) -{ - fh_val = rh_val = 0; - uint64_t b_val = 0, t_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - h_stn = rh_val < fh_val; - b_val = h_stn ? rh_val : fh_val; - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } - return true; -} - -// starnd-aware canonical multihash ntHash for sliding k-mers -inline void -ntmc64(const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t* h_val, - bool& h_stn) -{ - uint64_t b_val = 0, t_val = 0; - b_val = ntc64(char_out, char_in, k, fh_val, rh_val); - h_stn = rh_val < fh_val; - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// masking canonical ntHash using spaced seed pattern -inline uint64_t -mask_hash(uint64_t& fk_val, - uint64_t& rk_val, - const char* seed_seq, - const char* kmer_seq, - const unsigned k) -{ - uint64_t fs_val = fk_val, rs_val = rk_val; - for (unsigned i = 0; i < k; i++) { - if (seed_seq[i] != '1') { - fs_val ^= - (MS_TAB_31L[(unsigned char)kmer_seq[i]][(k - 1 - i) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[i]][(k - 1 - i) % 33]); // NOLINT - rs_val ^= - (MS_TAB_31L[(unsigned char)kmer_seq[i] & CP_OFF][i % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[i] & CP_OFF][i % 33]); // NOLINT - } - } - return (rs_val < fs_val) ? rs_val : fs_val; -} - -// replacing canonical ntHash with a substitution -inline void -sub_hash(uint64_t fh_val, - uint64_t rh_val, - const char* kmer_seq, - const std::vector& positions, - const std::vector& new_bases, - const unsigned k, - const unsigned m, - uint64_t* h_val) -{ - uint64_t b_val = 0, t_val = 0; - - for (size_t i = 0; i < positions.size(); i++) { - const auto pos = positions[i]; - const auto new_base = new_bases[i]; - - fh_val ^= - (MS_TAB_31L[(unsigned char)kmer_seq[pos]][(k - 1 - pos) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[pos]][(k - 1 - pos) % 33]); // NOLINT - fh_val ^= (MS_TAB_31L[new_base][(k - 1 - pos) % 31] | // NOLINT - MS_TAB_33R[new_base][(k - 1 - pos) % 33]); // NOLINT - - rh_val ^= - (MS_TAB_31L[(unsigned char)kmer_seq[pos] & CP_OFF][pos % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[pos] & CP_OFF][pos % 33]); // NOLINT - rh_val ^= (MS_TAB_31L[new_base & CP_OFF][pos % 31] | // NOLINT - MS_TAB_33R[new_base & CP_OFF][pos % 33]); // NOLINT - } - - b_val = rh_val < fh_val ? rh_val : fh_val; - h_val[0] = b_val; - for (unsigned i = 1; i < m; i++) { - t_val = b_val * (i ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[i] = t_val; - } -} - -// spaced seed ntHash for base kmer, i.e. fhval(kmer_0) -inline uint64_t -nts64(const char* kmer_seq, - const std::vector& seed, - const unsigned k, - uint64_t& h_val) -{ - h_val = 0; - uint64_t s_val = 0; - for (unsigned i = 0; i < k; i++) { - h_val = rol1(h_val); - h_val = swapbits033(h_val); - s_val = h_val; - h_val ^= SEED_TAB[(unsigned char)kmer_seq[i]]; - if (seed[i]) { - s_val = h_val; - } - } - return s_val; -} - -// spaced seed ntHash for sliding k-mers -inline uint64_t -nts64(const char* kmer_seq, - const std::vector& seed, - const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - uint64_t& h_val) -{ - h_val = ntf64(h_val, k, char_out, char_in); - uint64_t s_val = h_val; - for (unsigned i = 0; i < k; i++) { - if (!seed[i]) { - s_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[i]][k % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[i]][k % 33]); // NOLINT - } - } - return s_val; -} - -// strand-aware multihash spaced seed ntHash -inline bool -ntms64(const char* kmer_seq, - const std::vector>& seed_seq, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - unsigned& loc_n, - uint64_t* h_val, - bool* h_stn) -{ - fh_val = rh_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - - for (unsigned j = 0; j < m; j++) { - uint64_t fs_val = fh_val, rs_val = rh_val; - for (const auto& seed_pos : seed_seq[j]) { - fs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 33]); // NOLINT - rs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 33]); // NOLINT - } - h_stn[j] = rs_val < fs_val; - h_val[j] = h_stn[j] ? rs_val : fs_val; - } - return true; -} - -// strand-aware multihash spaced seed ntHash for sliding k-mers -inline void -ntms64(const char* kmer_seq, - const std::vector>& seed_seq, - const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - const unsigned m, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t* h_val, - bool* h_stn) -{ - fh_val = ntf64(fh_val, k, char_out, char_in); - rh_val = ntr64(rh_val, k, char_out, char_in); - for (unsigned j = 0; j < m; j++) { - uint64_t fs_val = fh_val, rs_val = rh_val; - for (const auto& seed_pos : seed_seq[j]) { - fs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 33]); // NOLINT - rs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 33]); // NOLINT - ; - } - h_stn[j] = rs_val < fs_val; - h_val[j] = h_stn[j] ? rs_val : fs_val; - } -} - -// Multi spaced seed ntHash with multiple hashes per seed -inline bool -ntmsm64(const char* kmer_seq, - const std::vector>& seed_seq, - const unsigned k, - const unsigned m, - const unsigned m2, - uint64_t& fh_val, - uint64_t& rh_val, - unsigned& loc_n, - uint64_t* h_val) -{ - fh_val = rh_val = 0; - loc_n = 0; - for (int i = int(k - 1); i >= 0; i--) { - if (SEED_TAB[(unsigned char)kmer_seq[i]] == SEED_N) { - loc_n = i; - return false; - } - fh_val = rol1(fh_val); - fh_val = swapbits033(fh_val); - fh_val ^= SEED_TAB[(unsigned char)kmer_seq[k - 1 - i]]; - - rh_val = rol1(rh_val); - rh_val = swapbits033(rh_val); - rh_val ^= SEED_TAB[(unsigned char)kmer_seq[i] & CP_OFF]; - } - - for (unsigned j = 0; j < m; j++) { - uint64_t fs_val = fh_val, rs_val = rh_val; - for (const auto& seed_pos : seed_seq[j]) { - fs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 33]); // NOLINT - rs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 33]); // NOLINT - } - h_val[j * m2] = rs_val < fs_val ? rs_val : fs_val; - for (unsigned j2 = 1; j2 < m2; j2++) { - uint64_t t_val = h_val[j * m2] * (j2 ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[j * m2 + j2] = t_val; - } - } - return true; -} - -// Multi spaced seed ntHash for sliding k-mers with multiple hashes per seed -inline void -ntmsm64(const char* kmer_seq, - const std::vector>& seed_seq, - const unsigned char char_out, - const unsigned char char_in, - const unsigned k, - const unsigned m, - const unsigned m2, - uint64_t& fh_val, - uint64_t& rh_val, - uint64_t* h_val) -{ - fh_val = ntf64(fh_val, k, char_out, char_in); - rh_val = ntr64(rh_val, k, char_out, char_in); - for (unsigned j = 0; j < m; j++) { - uint64_t fs_val = fh_val, rs_val = rh_val; - for (const auto& seed_pos : seed_seq[j]) { - fs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos]] - [(k - 1 - seed_pos) % 33]); // NOLINT - rs_val ^= (MS_TAB_31L[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 31] | // NOLINT - MS_TAB_33R[(unsigned char)kmer_seq[seed_pos] & CP_OFF] - [seed_pos % 33]); // NOLINT - } - h_val[j * m2] = rs_val < fs_val ? rs_val : fs_val; - for (unsigned j2 = 1; j2 < m2; j2++) { - uint64_t t_val = h_val[j * m2] * (j2 ^ k * MULTISEED); - t_val ^= t_val >> MULTISHIFT; - h_val[j * m2 + j2] = t_val; - } - } -} - -class NtHash; -class SeedNtHash; -using SpacedSeed = std::vector; -static std::vector -parse_seeds(const std::vector& seed_strings); - -/** - * Iterate over hash values for k-mers in a - * given DNA sequence. - * - * This implementation uses ntHash - * function to efficiently calculate - * hash values for successive k-mers. - */ -class NtHash -{ - -public: - /** - * Constructor. - * @param seq DNA sequence to be hashed - * @param seq_len length of seq - * @param k k-mer size - * @param hash_num number of hashes - */ - NtHash(const char* seq, - size_t seq_len, - unsigned k, - unsigned hash_num, - size_t pos = 0); - - /** - * Constructor. - * @param seq DNA sequence to be hashed - * @param k k-mer size - * @param hash_num number of hashes - */ - NtHash(const std::string& seq, unsigned k, unsigned hash_num, size_t pos = 0); - - /** - * Calculate the next hash value - * @return true on success and false otherwise - */ - bool roll(); - - void sub(const std::vector& positions, - const std::vector& new_bases); - - const uint64_t* hashes() const { return hashes_vector.data(); } - - size_t get_pos() const { return pos; } - bool forward() const { return forward_hash <= reverse_hash; } - unsigned get_k() const { return k; } - unsigned get_hash_num() const { return hash_num; } - -private: - friend class SeedNtHash; - - /** Initialize internal state of iterator */ - bool init(); - - const char* seq; - const size_t seq_len; - const unsigned k; - const unsigned hash_num; - size_t pos; - bool initialized; - std::vector hashes_vector; - uint64_t forward_hash = 0; - uint64_t reverse_hash = 0; -}; - -class SeedNtHash -{ - -public: - SeedNtHash(const char* seq, - size_t seq_len, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos = 0); - SeedNtHash(const std::string& seq, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos = 0); - SeedNtHash(const char* seq, - size_t seq_len, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos = 0); - SeedNtHash(const std::string& seq, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos = 0); - - const uint64_t* hashes() const { return nthash.hashes(); } - - size_t get_pos() const { return nthash.get_pos(); } - bool forward() const { return nthash.forward(); } - unsigned get_k() const { return nthash.get_k(); } - unsigned get_hash_num() const { return nthash.get_hash_num(); } - unsigned get_hash_num_per_seed() const { return hash_num_per_seed; } - - bool roll(); - -private: - bool init(); - - NtHash nthash; - const unsigned hash_num_per_seed; - std::vector seeds; -}; - -inline NtHash::NtHash(const char* seq, - size_t seq_len, - unsigned k, - unsigned hash_num, - size_t pos) - : seq(seq) - , seq_len(seq_len) - , k(k) - , hash_num(hash_num) - , pos(pos) - , initialized(false) -{ - hashes_vector.resize(hash_num); -} - -inline NtHash::NtHash(const std::string& seq, - unsigned k, - unsigned hash_num, - size_t pos) - : NtHash(seq.c_str(), seq.size(), k, hash_num, pos) -{} - -inline SeedNtHash::SeedNtHash(const char* seq, - size_t seq_len, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos) - : nthash(seq, seq_len, k, seeds.size() * hash_num_per_seed, pos) - , hash_num_per_seed(hash_num_per_seed) - , seeds(seeds) -{} - -inline SeedNtHash::SeedNtHash(const std::string& seq, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos) - : nthash(seq, k, seeds.size() * hash_num_per_seed, pos) - , hash_num_per_seed(hash_num_per_seed) - , seeds(seeds) -{} - -inline SeedNtHash::SeedNtHash(const char* seq, - size_t seq_len, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos) - : nthash(seq, seq_len, k, seeds.size() * hash_num_per_seed, pos) - , hash_num_per_seed(hash_num_per_seed) - , seeds(parse_seeds(seeds)) -{} - -inline SeedNtHash::SeedNtHash(const std::string& seq, - unsigned k, - const std::vector& seeds, - unsigned hash_num_per_seed, - size_t pos) - : nthash(seq, k, seeds.size() * hash_num_per_seed, pos) - , hash_num_per_seed(hash_num_per_seed) - , seeds(parse_seeds(seeds)) -{} - -static std::vector -parse_seeds(const std::vector& seed_strings) -{ - std::vector seed_set; - for (const auto& seed_string : seed_strings) { - SpacedSeed seed; - size_t pos = 0; - for (const auto& c : seed_string) { - if (c != '1') { - seed.push_back(pos); - } - ++pos; - } - seed_set.push_back(seed); - } - return seed_set; -} - -inline void -NtHash::sub(const std::vector& positions, - const std::vector& new_bases) -{ - sub_hash(forward_hash, - reverse_hash, - seq + pos, - positions, - new_bases, - get_k(), - get_hash_num(), - hashes_vector.data()); -} - -// NOLINTNEXTLINE -#define NTHASH_INIT(CLASS, NTHASH_CALL, MEMBER_PREFIX) \ - inline bool CLASS::init() \ - { \ - if (MEMBER_PREFIX k > MEMBER_PREFIX seq_len) { \ - MEMBER_PREFIX pos = std::numeric_limits::max(); \ - return false; \ - } \ - unsigned posN = 0; \ - while ( \ - (MEMBER_PREFIX pos < MEMBER_PREFIX seq_len - MEMBER_PREFIX k + 1) && \ - !(NTHASH_CALL)) { \ - MEMBER_PREFIX pos += posN + 1; \ - } \ - if (MEMBER_PREFIX pos > MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \ - MEMBER_PREFIX pos = std::numeric_limits::max(); \ - return false; \ - } \ - MEMBER_PREFIX initialized = true; \ - return true; \ - } - -// NOLINTNEXTLINE -#define NTHASH_ROLL(CLASS, NTHASH_CALL, MEMBER_PREFIX) \ - inline bool CLASS::roll() \ - { \ - if (!MEMBER_PREFIX initialized) { \ - return init(); \ - } \ - ++MEMBER_PREFIX pos; \ - if (MEMBER_PREFIX pos > MEMBER_PREFIX seq_len - MEMBER_PREFIX k) { \ - return false; \ - } \ - if (SEED_TAB[(unsigned char)(MEMBER_PREFIX seq[MEMBER_PREFIX pos + \ - MEMBER_PREFIX k - 1])] == \ - SEED_N) { \ - MEMBER_PREFIX pos += MEMBER_PREFIX k; \ - return init(); \ - } \ - (NTHASH_CALL); \ - return true; \ - } - -NTHASH_INIT(NtHash, - ntmc64(seq + pos, - k, - hash_num, - forward_hash, - reverse_hash, - posN, - hashes_vector.data()), ) -NTHASH_ROLL(NtHash, - ntmc64(seq[pos - 1], - seq[pos - 1 + k], - k, - hash_num, - forward_hash, - reverse_hash, - hashes_vector.data()), ) - -NTHASH_INIT(SeedNtHash, - ntmsm64(nthash.seq + nthash.pos, - seeds, - nthash.k, - seeds.size(), - hash_num_per_seed, - nthash.forward_hash, - nthash.reverse_hash, - posN, - nthash.hashes_vector.data()), - nthash.) -NTHASH_ROLL(SeedNtHash, - ntmsm64(nthash.seq + nthash.pos, - seeds, - nthash.seq[nthash.pos - 1], - nthash.seq[nthash.pos - 1 + nthash.k], - nthash.k, - seeds.size(), - hash_num_per_seed, - nthash.forward_hash, - nthash.reverse_hash, - nthash.hashes_vector.data()), - nthash.) - -#undef NTHASH_INIT -#undef NTHASH_ROLL - -} // namespace btllib - -#endif diff --git a/src/include/btllib/order_queue.hpp b/src/include/btllib/order_queue.hpp deleted file mode 100644 index 119b281..0000000 --- a/src/include/btllib/order_queue.hpp +++ /dev/null @@ -1,230 +0,0 @@ -#ifndef BTLLIB_ORDER_QUEUE_HPP -#define BTLLIB_ORDER_QUEUE_HPP - -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -template -class OrderQueue -{ - -public: - struct Block - { - - Block(const size_t block_size) - : data(block_size) - {} - - Block(const Block& block) = default; - - Block(Block&& block) noexcept - : current(block.current) - , count(block.count) - , num(block.num) - { - std::swap(data, block.data); - block.current = 0; - block.count = 0; - block.num = 0; - } - - Block& operator=(const Block& block) = default; - - Block& operator=(Block&& block) noexcept - { - std::swap(data, block.data); - current = block.current; - count = block.count; - num = block.num; - block.current = 0; - block.count = 0; - block.num = 0; - return *this; - } - - std::vector data; - size_t current = 0; - size_t count = 0; - size_t num = 0; - }; - - // Surrounds pieces of data in the buffer with a busy mutex - // for exclusive access - struct Slot - { - Slot(size_t block_size) - : block(block_size) - {} - Slot(const Slot& slot) - : block(slot.block) - , occupied(slot.occupied) - , last_tenant(slot.last_tenant) - {} - Slot(Slot&& slot) noexcept - : block(slot.block) - , occupied(slot.occupied) - , last_tenant(slot.last_tenant) - {} - - Slot& operator=(const Slot& slot) - { - if (this == &slot) { - return *this; - } - block = slot.block; - occupied = slot.occupied; - last_tenant = slot.last_tenant; - return *this; - } - Slot& operator=(Slot&& slot) noexcept - { - block = slot.block; - occupied = slot.occupied; - last_tenant = slot.last_tenant; - return *this; - } - - typename OrderQueue::Block block; - std::mutex busy; - bool occupied = false; - std::condition_variable occupancy_changed; - size_t last_tenant = -1; // Required to ensure read order - }; - - size_t elements() const { return element_count; } - - void close() - { - closed = true; - for (auto& slot : this->slots) { - slot.occupancy_changed.notify_all(); - } - } - - bool is_closed() const { return closed; } - - OrderQueue(const size_t queue_size, const size_t block_size) - : slots(queue_size, Slot(block_size)) - , queue_size(queue_size) - , block_size(block_size) - {} - - OrderQueue(const OrderQueue&) = delete; - OrderQueue(OrderQueue&&) = delete; - -protected: - std::vector slots; - size_t queue_size, block_size; - size_t read_counter = 0; - std::atomic element_count{ 0 }; - std::atomic closed{ false }; -}; - -#define ORDER_QUEUE_XPXC(SUFFIX, \ - PRE_WRITE_LOCK, \ - EXTRA_WRITE_LOCK_CONDS, \ - POST_WRITE_LOCK, \ - NOTIFY_WRITE, \ - PRE_READ_LOCK, \ - EXTRA_READ_LOCK_CONDS, \ - POST_READ_LOCK, \ - NOTIFY_READ, \ - MEMBERS) \ - template \ - class OrderQueue##SUFFIX : public OrderQueue \ - { \ - \ - public: \ - OrderQueue##SUFFIX(const size_t queue_size, const size_t block_size) \ - : OrderQueue(queue_size, block_size) \ - {} \ - \ - using Block = typename OrderQueue::Block; \ - using Slot = typename OrderQueue::Slot; \ - \ - void write(Block& block) \ - { \ - PRE_WRITE_LOCK; \ - const auto num = block.num; \ - auto& target = this->slots[num % this->queue_size]; \ - std::unique_lock busy_lock(target.busy); \ - target.occupancy_changed.wait(busy_lock, [&] { \ - return (!target.occupied EXTRA_WRITE_LOCK_CONDS) || this->closed; \ - }); \ - if (this->closed) { \ - return; \ - } \ - POST_WRITE_LOCK; /* NOLINT */ \ - target.block = std::move(block); \ - target.occupied = true; \ - target.occupancy_changed.NOTIFY_WRITE(); \ - ++(this->element_count); \ - } \ - \ - void read(Block& block) \ - { \ - PRE_READ_LOCK; \ - auto& target = this->slots[this->read_counter % this->queue_size]; \ - std::unique_lock busy_lock(target.busy); \ - target.occupancy_changed.wait(busy_lock, [&] { \ - return (target.occupied EXTRA_READ_LOCK_CONDS) || this->closed; \ - }); \ - if (this->closed) { \ - return; \ - } \ - ++(this->read_counter); \ - POST_READ_LOCK; \ - block = std::move(target.block); \ - target.occupied = false; \ - target.occupancy_changed.NOTIFY_READ(); \ - --(this->element_count); \ - } \ - \ - private: \ - MEMBERS; /* NOLINT */ \ - }; - -ORDER_QUEUE_XPXC(SPSC, , , , notify_one, , , , notify_one, ) -ORDER_QUEUE_XPXC(MPSC, - , - &&(num - target.last_tenant <= this->queue_size), - target.last_tenant = num, - notify_all, - , - , - , - notify_all, ) -ORDER_QUEUE_XPXC(SPMC, - , - , - , - notify_one, - std::unique_lock read_lock(read_mutex), - , - read_lock.unlock(), - notify_one, - std::mutex read_mutex) -ORDER_QUEUE_XPXC(MPMC, - , - &&(num - target.last_tenant <= this->queue_size), - target.last_tenant = num, - notify_all, - std::unique_lock read_lock(read_mutex), - , - read_lock.unlock(), - notify_all, - std::mutex read_mutex) - -#undef ORDER_QUEUE_XPXC - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/rolling_hash.hpp b/src/include/btllib/rolling_hash.hpp deleted file mode 100644 index 4c6ba0b..0000000 --- a/src/include/btllib/rolling_hash.hpp +++ /dev/null @@ -1,378 +0,0 @@ -#ifndef BTLLIB_ROLLING_HASH_HPP -#define BTLLIB_ROLLING_HASH_HPP - -#include "nthash.hpp" - -#include -#include -#include -#include - -namespace btllib -{ - -class RollingHash; -class SeedRollingHash; -using SpacedSeed = std::vector; -static std::vector -parse_seeds(const std::vector &seed_strings); - -/** - * Iterate over hash values for k-mers in a - * given DNA sequence. - * - * This implementation uses ntHash - * function to efficiently calculate - * hash values for successive k-mers. - */ -class RollingHash -{ - -public: - /** - * Constructor. - * @param seq DNA sequence to be hashed - * @param seq_len length of seq - * @param k k-mer size - * @param hash_num number of hashes - */ - RollingHash(const char *seq, size_t seq_len, unsigned k, unsigned hash_num); - - /** - * Constructor. - * @param seq DNA sequence to be hashed - * @param k k-mer size - * @param hash_num number of hashes - */ - RollingHash(const std::string &seq, unsigned k, unsigned hash_num); - - /** - * Calculate the next hash value - * @return true on success and false otherwise - */ - bool roll(); - - const uint64_t *hashes() const; - - size_t get_pos() const { return pos; } - unsigned get_k() const { return k; } - unsigned get_hash_num() const { return hash_num; } - -protected: - /** Initialize internal state of iterator */ - bool init(); - - const char *seq; - const size_t seq_len; - const unsigned k; - const unsigned hash_num; - size_t pos = 0; - std::vector hashes_vector; - uint64_t forward_hash = 0; - uint64_t reverse_hash = 0; -}; - -class SeedRollingHash : public RollingHash -{ - -public: - SeedRollingHash(const char *seq, - size_t seq_len, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed); - SeedRollingHash(const std::string &seq, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed); - SeedRollingHash(const char *seq, - size_t seq_len, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed); - SeedRollingHash(const std::string &seq, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed); - - unsigned get_hash_num_per_seed() const { return hash_num_per_seed; } - - std::vector>> hash_components; - - bool roll() - { - init(); - hashes_vector[0] = 0; - return true; - } - -private: - bool init() - { - - if (k > seq_len) - { - pos = std::numeric_limits::max(); - return false; - } - std::cerr << "checkpoint1" << std::endl; - - unsigned max_block_length = 0; - for (unsigned i = 0; i < seeds.size(); ++i) - { - const btllib::SpacedSeed &seed = seeds.at(i); - std::vector> one_blocks; - unsigned block_length = 0; - if (seed.at(0) > max_block_length) - { - max_block_length = seed.at(0); - } - one_blocks.emplace_back(std::make_pair(0, seed.at(0))); - - for (unsigned j = 1; j < seed.size(); ++j) - { - block_length = seed.at(j) - seed.at(j - 1) - 1; - if (seed.at(j) - seed.at(j - 1) > 1) - { - one_blocks.emplace_back(std::make_pair(seed.at(j - 1) + 1, block_length)); - } - if (block_length > max_block_length) - { - max_block_length = seed.at(0); - } - /*std::cerr << "checkpoint11" << std::endl; - - std::cerr << seed.at(j) << std::endl; - if (seed.at(j) == 1) - { - std::cerr << "checkpoint10" << std::endl; - - ++block_length; - if (at_ones == false) - { - at_ones = true; - start_of_block = j; - } - } - if (seed.at(j) == 0 && at_ones == true) - { - at_ones = false; - one_blocks.emplace_back(std::make_pair(start_of_block, block_length)); - if (block_length > max_block_length) - { - max_block_length = block_length; - } - block_length = 0; - }*/ - } - block_length = seq_len - seed.back() - 1; - one_blocks.emplace_back(std::make_pair(seed.back() + 1, block_length)); - - //one_blocks.emplace_back(std::make_pair(start_of_block, block_length)); - if (block_length > max_block_length) - { - max_block_length = block_length; - } - one_blocks_of_seeds.emplace_back(one_blocks); - } - std::cerr << max_block_length << std::endl; - std::cerr << "checkpoint2" << std::endl; - - hash_components.resize(max_block_length + 1); - for (const auto &one_blocks_of_seed : one_blocks_of_seeds) - { - for (const auto &one_block : one_blocks_of_seed) - { - auto &kmer_size = std::get<1>(one_block); - std::cerr << kmer_size << std::endl; - if (hash_components[kmer_size].size() == 0) - { - - pos = 0; - hash_components[kmer_size] = std::vector>(); - unsigned posN = 0; - while ((pos < seq_len - kmer_size + 1) && !(NTC64(seq + pos, kmer_size, posN, hash_components[kmer_size]))) - { - pos += posN + 1; - } - if (pos > seq_len - kmer_size) - { - pos = std::numeric_limits::max(); - return false; - } - ++pos; - - while (pos < seq_len - kmer_size + 1) - { - if (seed_tab[(unsigned char)(seq[pos + kmer_size - 1])] == seedN) - { - pos += kmer_size; - for (unsigned i = 0; i < kmer_size; ++i) - { - hash_components[kmer_size].emplace_back(std::make_pair((uint64_t)0, (uint64_t)0)); - } - while ((pos < seq_len - kmer_size + 1) && !(NTC64(seq + pos, kmer_size, posN, hash_components[kmer_size]))) - { - pos += posN + 1; - hash_components[kmer_size].emplace_back(std::make_pair((uint64_t)0, (uint64_t)0)); - } - } - NTMC64(seq[pos - 1], seq[pos - 1 + kmer_size], kmer_size, hash_components[kmer_size]); - ++pos; - } - } - } - } - return true; - } - - const unsigned hash_num_per_seed; - std::vector seeds; - std::vector>> one_blocks_of_seeds; -}; - -inline RollingHash::RollingHash(const char *seq, - size_t seq_len, - unsigned k, - unsigned hash_num) - : seq(seq), seq_len(seq_len), k(k), hash_num(hash_num) -{ - hashes_vector.resize(hash_num); -} - -inline RollingHash::RollingHash(const std::string &seq, - unsigned k, - unsigned hash_num) - : RollingHash(seq.c_str(), seq.size(), k, hash_num) -{ -} - -inline SeedRollingHash::SeedRollingHash(const char *seq, - size_t seq_len, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed) - : RollingHash(seq, seq_len, k, seeds.size() * hash_num_per_seed), hash_num_per_seed(hash_num_per_seed), seeds(seeds) -{ -} - -inline SeedRollingHash::SeedRollingHash(const std::string &seq, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed) - : RollingHash(seq, k, seeds.size() * hash_num_per_seed), hash_num_per_seed(hash_num_per_seed), seeds(seeds) -{ -} - -inline SeedRollingHash::SeedRollingHash(const char *seq, - size_t seq_len, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed) - : RollingHash(seq, seq_len, k, seeds.size() * hash_num_per_seed), hash_num_per_seed(hash_num_per_seed), seeds(parse_seeds(seeds)) -{ -} - -inline SeedRollingHash::SeedRollingHash(const std::string &seq, - unsigned k, - const std::vector &seeds, - unsigned hash_num_per_seed) - : RollingHash(seq, k, seeds.size() * hash_num_per_seed), hash_num_per_seed(hash_num_per_seed), seeds(parse_seeds(seeds)) -{ -} - -static std::vector -parse_seeds(const std::vector &seed_strings) -{ - std::vector seed_set; - for (const auto &seed_string : seed_strings) - { - SpacedSeed seed; - size_t pos = 0; - for (const auto &c : seed_string) - { - if (c != '1') - { - seed.push_back(pos); - } - ++pos; - } - seed_set.push_back(seed); - } - return seed_set; -} - -// NOLINTNEXTLINE -#define ROLLING_HASH_INIT(CLASS, NTHASH_CALL) \ - inline bool CLASS::init() \ - { \ - if (k > seq_len) \ - { \ - pos = std::numeric_limits::max(); \ - return false; \ - } \ - unsigned posN = 0; \ - while ((pos < seq_len - k + 1) && !(NTHASH_CALL)) \ - { \ - pos += posN + 1; \ - } \ - if (pos > seq_len - k) \ - { \ - pos = std::numeric_limits::max(); \ - return false; \ - } \ - ++pos; \ - return true; \ - } - -// NOLINTNEXTLINE -#define ROLLING_HASH_ROLL(CLASS, NTHASH_CALL) \ - inline bool CLASS::roll() \ - { \ - if (pos == 0) \ - { \ - return init(); \ - } \ - if (pos > seq_len - k) \ - { \ - return false; \ - } \ - if (seed_tab[(unsigned char)(seq[pos + k - 1])] == seedN) \ - { \ - pos += k; \ - return init(); \ - } \ - (NTHASH_CALL); \ - ++pos; \ - return true; \ - } - -ROLLING_HASH_INIT(RollingHash, - NTMC64(seq + pos, - k, - hash_num, - forward_hash, - reverse_hash, - posN, - hashes_vector.data())) -ROLLING_HASH_ROLL(RollingHash, - NTMC64(seq[pos - 1], - seq[pos - 1 + k], - k, - hash_num, - forward_hash, - reverse_hash, - hashes_vector.data())) - -#undef ROLLING_HASH_INIT -#undef ROLLING_HASH_ROLL - -inline const uint64_t * -RollingHash::hashes() const -{ - return hashes_vector.data(); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/seq.hpp b/src/include/btllib/seq.hpp deleted file mode 100644 index a4b40a9..0000000 --- a/src/include/btllib/seq.hpp +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef BTLLIB_SEQ_HPP -#define BTLLIB_SEQ_HPP - -#include "status.hpp" - -#include -#include - -namespace btllib { - -// clang-format off -static const char COMPLEMENTS[256] = { - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - -// ! " # $ % & ' ( ) * + , - . / - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '-' , '.', 0 , - -// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - -// @ A B C D E F G H I J K L M N O - 0 , 'T', 'V', 'G', 'H', 0 , 0 , 'C', 'D', 0 , 0 , 'M', 0 , 'K', 'N', 0 , - -// P Q R S T U V W X Y Z [ \ ] ^ _ - 0 , 0 , 'Y', 'S', 'A', 'U', 'B', 'W', 0 , 'R', 0 , 0 , 0 , 0 , 0 , 0 , - -// ` a b c d e f g h i j k l m n o - 0 , 't', 'v', 'g', 'h', 0 , 0 , 'c', 'd', 0 , 0 , 'm', 0 , 'k', 'n', 0 , - -// p q r s t u v w x y z { | } ~ DEL - 0 , 0 , 'y', 's', 'a', 'u', 'b', 'w', 0 , 'r', 0 , 0 , 0 , 0 , 0 , 0 , - - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 -}; - -static const char CAPITALS[256] = { - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - -// ! " # $ % & ' ( ) * + , - . / - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '-' , '.', 0 , - -// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - -// @ A B C D E F G H I J K L M N O - 0 , 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - -// P Q R S T U V W X Y Z [ \ ] ^ _ - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0 , 0 , 0 , 0 , 0 , - -// ` a b c d e f g h i j k l m n o - 0 , 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - -// p q r s t u v w x y z { | } ~ DEL - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0 , 0 , 0 , 0 , 0 , - - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 -}; -// clang-format on - -inline void -reverse_complement(std::string& seq) -{ - std::reverse(seq.begin(), seq.end()); - std::transform(seq.begin(), seq.end(), seq.begin(), [](char c) { - return COMPLEMENTS[(unsigned char)(c)]; - }); -} - -inline std::string -get_reverse_complement(const std::string& seq) -{ - std::string rc(seq); - reverse_complement(rc); - return rc; -} - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/seq_reader.hpp b/src/include/btllib/seq_reader.hpp deleted file mode 100644 index 8b9eac7..0000000 --- a/src/include/btllib/seq_reader.hpp +++ /dev/null @@ -1,1239 +0,0 @@ -#ifndef BTLLIB_SEQ_READER_HPP -#define BTLLIB_SEQ_READER_HPP - -#include "cstring.hpp" -#include "data_stream.hpp" -#include "order_queue.hpp" -#include "seq.hpp" -#include "status.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace btllib { - -/** - * @example seq_reader.cpp - * An example of reading a gzipped fastq file. - */ - -/** Read a FASTA, FASTQ, SAM, or GFA2 file. Threadsafe. */ -class SeqReader -{ -public: - /* Has to be a struct and not an enum because: - * 1) Non-class enums are not name qualified and can collide - * 2) class enums can't be implicitly converted into integers - */ - struct Flag - { - /** Fold lower-case characters to upper-case. */ - static const unsigned FOLD_CASE = 0; - static const unsigned NO_FOLD_CASE = 1; - /** Trim masked (lower case) characters from the ends of - * sequences. */ - static const unsigned NO_TRIM_MASKED = 0; - static const unsigned TRIM_MASKED = 2; - }; - - SeqReader(const std::string& source_path, - unsigned flags = 0, - unsigned threads = 3, - size_t buffer_size = 32, - size_t block_size = 32); - - SeqReader(const SeqReader&) = delete; - SeqReader(SeqReader&&) = delete; - - SeqReader& operator=(const SeqReader&) = delete; - SeqReader& operator=(SeqReader&&) = delete; - - ~SeqReader(); - - void close() noexcept; - - bool fold_case() const { return bool(~flags & Flag::NO_FOLD_CASE); } - bool trim_masked() const { return bool(flags & Flag::TRIM_MASKED); } - - enum class Format - { - UNDETERMINED, - FASTA, - FASTQ, - SAM, - GFA2, - INVALID - }; - - Format get_format() const { return format; } - - struct Record - { - size_t num = -1; - std::string name; - std::string comment; - std::string seq; - std::string qual; - - operator bool() const { return !seq.empty(); } - }; - - /** Read operator. */ - Record read(); - - static const size_t MAX_SIMULTANEOUS_SEQREADERS = 256; - -private: - const std::string& source_path; - DataSource source; - const unsigned flags; - const unsigned threads; - Format format = Format::UNDETERMINED; // Format of the source file - bool closed = false; - - static const size_t DETERMINE_FORMAT_CHARS = 2048; - static const size_t BUFFER_SIZE = DETERMINE_FORMAT_CHARS; - - std::vector buffer; - size_t buffer_start = 0; - size_t buffer_end = 0; - bool eof_newline_inserted = false; - - struct RecordCString - { - CString header; - CString seq; - CString qual; - }; - - CString tmp; - - std::unique_ptr reader_thread; - std::vector> processor_threads; - std::mutex format_mutex; - std::condition_variable format_cv; - std::atomic reader_end; - RecordCString* reader_record = nullptr; - const size_t buffer_size; - const size_t block_size; - OrderQueueSPMC cstring_queue; - OrderQueueMPMC output_queue; - - // I am crying at this code, but until C++17 compliant compilers are - // widespread, this cannot be a static inline variable - using OutputQueueType = decltype(output_queue); - static std::unique_ptr* ready_records_array() - { - thread_local static std::unique_ptr - var[MAX_SIMULTANEOUS_SEQREADERS]; - return var; - } - - static long* ready_records_owners() - { - thread_local static long var[MAX_SIMULTANEOUS_SEQREADERS]; - return var; - } - - // :( - static std::atomic& last_id() - { - static std::atomic var(0); - return var; - } - - const long id; - - void determine_format(); - void start_reader(); - void start_processor(); - - bool load_buffer(); - - bool is_fasta_buffer(); - bool is_fastq_buffer(); - bool is_sam_buffer(); - bool is_gfa2_buffer(); - - bool readline_buffer_append(CString& s); - void readline_file(CString& s); - void readline_file_append(CString& s); - - enum class ReadStage - { - HEADER, - SEQ, - SEP, - QUAL - }; - - ReadStage read_stage = ReadStage::HEADER; - - /// @cond HIDDEN_SYMBOLS - struct read_fasta_buffer; - struct read_fastq_buffer; - struct read_sam_buffer; - struct read_gfa2_buffer; - - struct read_fasta_transition; - struct read_fastq_transition; - struct read_sam_transition; - struct read_gfa2_transition; - - struct read_fasta_file; - struct read_fastq_file; - struct read_sam_file; - struct read_gfa2_file; - /// @endcond - - template - void read_from_buffer(F f, - OrderQueueSPMC::Block& records, - size_t& counter); - - template - void read_transition(F f, - OrderQueueSPMC::Block& records, - size_t& counter); - - template - void read_from_file(F f, - OrderQueueSPMC::Block& records, - size_t& counter); - - void postprocess(); -}; - -inline SeqReader::SeqReader(const std::string& source_path, - const unsigned flags, - const unsigned threads, - const size_t buffer_size, - const size_t block_size) - : source_path(source_path) - , source(source_path) - , flags(flags) - , threads(threads) - , buffer(std::vector(BUFFER_SIZE)) - , reader_end(false) - , buffer_size(buffer_size) - , block_size(block_size) - , cstring_queue(buffer_size, block_size) - , output_queue(buffer_size, block_size) - , id(++last_id()) -{ - start_processor(); - { - std::unique_lock lock(format_mutex); - start_reader(); - format_cv.wait(lock); - } -} - -inline SeqReader::~SeqReader() -{ - close(); -} - -inline void -SeqReader::close() noexcept -{ - if (!closed) { - try { - closed = true; - reader_end = true; - output_queue.close(); - for (auto& pt : processor_threads) { - pt->join(); - } - cstring_queue.close(); - reader_thread->join(); - source.close(); - } catch (const std::system_error& e) { - log_error("SeqReader thread join failure: " + std::string(e.what())); - std::exit(EXIT_FAILURE); - } - } -} - -inline bool -SeqReader::load_buffer() -{ - buffer_start = 0; - char last = buffer_end > 0 ? buffer[buffer_end - 1] : char(0); - buffer_end = 0; - do { - buffer_end += - fread(buffer.data() + buffer_end, 1, BUFFER_SIZE - buffer_end, source); - } while (buffer_end < BUFFER_SIZE && !bool(std::feof(source))); - - if (bool(std::feof(source)) && !eof_newline_inserted) { - if (buffer_end < BUFFER_SIZE) { - if ((buffer_end == 0 && last != '\n') || - (buffer_end > 0 && buffer[buffer_end - 1] != '\n')) { - buffer[buffer_end++] = '\n'; - } - eof_newline_inserted = true; - } else if (buffer[BUFFER_SIZE - 1] == '\n') { - eof_newline_inserted = true; - } - return true; - } - return bool(buffer_end); -} - -inline bool -SeqReader::is_fasta_buffer() -{ - size_t current = buffer_start; - unsigned char c; - enum State - { - IN_HEADER_1, - IN_HEADER_2, - IN_SEQ - }; - State state = IN_HEADER_1; - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end) { - c = buffer[current]; - switch (state) { - case IN_HEADER_1: - if (c == '>') { - state = IN_HEADER_2; - } else { - return false; - } - break; - case IN_HEADER_2: - if (c == '\n') { - state = IN_SEQ; - } - break; - case IN_SEQ: - if (c == '\n') { - state = IN_HEADER_1; - } else if (!bool(COMPLEMENTS[c])) { - return false; - } - break; - } - current++; - } - return true; -} - -inline bool -SeqReader::is_fastq_buffer() -{ - size_t current = buffer_start; - unsigned char c; - enum State - { - IN_HEADER_1, - IN_HEADER_2, - IN_SEQ, - IN_PLUS_1, - IN_PLUS_2, - IN_QUAL - }; - State state = IN_HEADER_1; - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end) { - c = buffer[current]; - switch (state) { - case IN_HEADER_1: - if (c == '@') { - state = IN_HEADER_2; - } else { - return false; - } - break; - case IN_HEADER_2: - if (c == '\n') { - state = IN_SEQ; - } - break; - case IN_SEQ: - if (c == '\n') { - state = IN_PLUS_1; - } else if (!bool(COMPLEMENTS[c])) { - return false; - } - break; - case IN_PLUS_1: - if (c == '+') { - state = IN_PLUS_2; - } else { - return false; - } - break; - case IN_PLUS_2: - if (c == '\n') { - state = IN_QUAL; - } - break; - case IN_QUAL: - if (c == '\n') { - state = IN_HEADER_1; - } else if (c < '!' || c > '~') { - return false; - } - break; - } - current++; - } - return true; -} - -inline bool -SeqReader::is_sam_buffer() -{ - enum Column - { - QNAME = 1, - FLAG, - RNAME, - POS, - MAPQ, - CIGAR, - RNEXT, - PNEXT, - TLEN, - SEQ, - QUAL - }; - - size_t current = buffer_start; - - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end && buffer[current] == '@') { - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end && buffer[current] != '\n') { - current++; - } - current++; - } - - int column = 1; - unsigned char c; - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end) { - c = buffer[current]; - if (c == '\n') { - break; - } - if (c == '\t') { - if (current > 0 && !bool(std::isspace(buffer[current - 1]))) { - column++; - } else { - return false; - } - } else { - switch (Column(column)) { - case QNAME: - if (bool(std::isspace(c))) { - return false; - } - break; - case FLAG: - if (!bool(std::isdigit(c))) { - return false; - } - break; - case RNAME: - if (bool(std::isspace(c))) { - return false; - } - break; - case POS: - if (!bool(std::isdigit(c))) { - return false; - } - break; - case MAPQ: - if (!bool(std::isdigit(c))) { - return false; - } - break; - case CIGAR: - if (bool(std::isspace(c))) { - return false; - } - break; - case RNEXT: - if (bool(std::isspace(c))) { - return false; - } - break; - case PNEXT: - if (!bool(std::isdigit(c))) { - return false; - } - break; - case TLEN: - if (!bool(std::isdigit(c))) { - return false; - } - break; - case SEQ: - if (!bool(COMPLEMENTS[c])) { - return false; - } - break; - case QUAL: - if (bool(std::isspace(c))) { - return false; - } - break; - default: - break; - } - } - current++; - } - - return current >= buffer_end || column >= QUAL; -} - -inline bool -SeqReader::is_gfa2_buffer() -{ - const unsigned char specs[] = { 'H', 'S', 'F', 'E', 'G', 'O', 'U' }; - - enum State - { - IN_ID, - IN_ID_TAB, - IN_REST, - IN_IGNORED - }; - - auto is_a_spec = [&](unsigned char c) { - bool found = false; - for (unsigned char spec : specs) { - if (c == spec) { - found = true; - break; - } - } - return found; - }; - - State state = is_a_spec(buffer[0]) ? IN_ID : IN_IGNORED; - bool has_id = false; - size_t current = buffer_start; - unsigned char c; - while (current < buffer_start + DETERMINE_FORMAT_CHARS && - current < buffer_end) { - c = buffer[current]; - switch (state) { - case IN_ID: - if (!is_a_spec(c)) { - return false; - } - has_id = true; - state = IN_ID_TAB; - break; - case IN_ID_TAB: - if (c != '\t') { - return false; - } - state = IN_REST; - break; - case IN_REST: - if (c == '\n') { - if (current + 1 < buffer_end) { - state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED; - } - } - break; - case IN_IGNORED: - if (c == '\n') { - if (current + 1 < buffer_end) { - state = is_a_spec(buffer[current + 1]) ? IN_ID : IN_IGNORED; - } - } - break; - default: - break; - } - current++; - } - - return has_id; -} - -inline void -SeqReader::determine_format() -{ - load_buffer(); - bool empty = buffer_end - buffer_start == 1; - check_warning(empty, std::string(source_path) + " is empty."); - - if (empty) { - return; - } - - if (is_fasta_buffer()) { - format = Format::FASTA; - } else if (is_fastq_buffer()) { - format = Format::FASTQ; - } else if (is_sam_buffer()) { - format = Format::SAM; - } else if (is_gfa2_buffer()) { - format = Format::GFA2; - } else { - format = Format::INVALID; - log_error(std::string(source_path) + " source file is in invalid format!"); - std::exit(EXIT_FAILURE); - } -} - -inline bool -SeqReader::readline_buffer_append(CString& s) -{ - char c = char(0); - for (; buffer_start < buffer_end && (c = buffer[buffer_start]) != '\n'; - ++buffer_start) { - if (s.s_size >= s.s_cap) { - s.s_cap *= 2; - s.s = (char*)std::realloc((char*)(s.s), s.s_cap); // NOLINT - } - s.s[s.s_size++] = c; - } - if (s.s_size >= s.s_cap) { - s.s_cap *= 2; - s.s = (char*)std::realloc((char*)(s.s), s.s_cap); // NOLINT - } - s.s[s.s_size] = '\0'; - if (c == '\n') { - ++buffer_start; - return true; - } - return false; -} - -inline void -SeqReader::readline_file(CString& s) -{ - s.s_size = getline(&(s.s), &(s.s_cap), source); -} - -inline void -SeqReader::readline_file_append(CString& s) -{ - readline_file(tmp); - if (s.s_size + tmp.s_size + 1 > s.s_cap) { - s.s_cap = s.s_size + tmp.s_size + 1; - s.s = (char*)std::realloc((char*)(s.s), s.s_cap); // NOLINT - } - memcpy(s.s + s.s_size, tmp.s, tmp.s_size + 1); - s.s_size += tmp.s_size; -} - -// NOLINTNEXTLINE -#define READ_SAM(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \ - enum Column \ - { \ - QNAME = 1, \ - FLAG, \ - RNAME, \ - POS, \ - MAPQ, \ - CIGAR, \ - RNEXT, \ - PNEXT, \ - TLEN, \ - SEQ, \ - QUAL \ - }; \ - for (;;) { \ - READLINE_SECTION \ - std::string tmp_string = seq_reader.tmp.s; \ - if (tmp_string.length() > 0 && tmp_string[0] != '@') { \ - size_t pos = 0, pos2 = 0, pos3 = 0; \ - pos2 = tmp_string.find('\t'); \ - if (tmp_string.size() + 1 > seq_reader.reader_record->header.s_cap) { \ - seq_reader.reader_record->header.s_cap = tmp_string.size() + 1; \ - seq_reader.reader_record->header.s = \ - (char*)std::realloc((char*)(seq_reader.reader_record->header), \ - seq_reader.reader_record->header.s_cap); \ - } \ - seq_reader.reader_record->header = tmp_string.substr(0, pos2); \ - for (int i = 0; i < int(SEQ) - 1; i++) { \ - pos = tmp_string.find('\t', pos + 1); \ - } \ - pos2 = tmp_string.find('\t', pos + 1); \ - pos3 = tmp_string.find('\t', pos2 + 1); \ - if (pos3 == std::string::npos) { \ - pos3 = tmp_string.length(); \ - } \ - if (tmp_string.size() + 1 > seq_reader.reader_record->seq.s_cap) { \ - seq_reader.reader_record->seq.s_cap = tmp_string.size() + 1; \ - seq_reader.reader_record->seq.s = \ - (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \ - seq_reader.reader_record->seq.s_cap); \ - } \ - if (tmp_string.size() + 1 > seq_reader.reader_record->qual.s_cap) { \ - seq_reader.reader_record->qual.s_cap = tmp_string.size() + 1; \ - seq_reader.reader_record->qual.s = \ - (char*)std::realloc((char*)(seq_reader.reader_record->qual.s), \ - seq_reader.reader_record->qual.s_cap); \ - } \ - seq_reader.reader_record->seq = \ - tmp_string.substr(pos + 1, pos2 - pos - 1); \ - seq_reader.reader_record->qual = \ - tmp_string.substr(pos2 + 1, pos3 - pos2 - 1); \ - MIDEND_SECTION \ - } \ - seq_reader.tmp.clear(); \ - END_SECTION \ - } - -// NOLINTNEXTLINE -#define READ_GFA2(READLINE_SECTION, MIDEND_SECTION, END_SECTION) \ - enum Column \ - { \ - S = 1, \ - ID, \ - LEN, \ - SEQ \ - }; \ - for (;;) { \ - READLINE_SECTION \ - std::string tmp_string = seq_reader.tmp.s; \ - if (tmp_string.length() > 0 && tmp_string[0] == 'S') { \ - size_t pos = 0, pos2 = 0; \ - pos2 = tmp_string.find('\t', 1); \ - if (tmp_string.size() + 1 > seq_reader.reader_record->header.s_cap) { \ - seq_reader.reader_record->header.s_cap = tmp_string.size() + 1; \ - seq_reader.reader_record->header.s = \ - (char*)std::realloc((char*)(seq_reader.reader_record->header.s), \ - seq_reader.reader_record->header.s_cap); \ - } \ - seq_reader.reader_record->header = tmp_string.substr(1, pos2 - 1); \ - for (int i = 0; i < int(SEQ) - 1; i++) { \ - pos = tmp_string.find('\t', pos + 1); \ - } \ - pos2 = tmp_string.find('\t', pos + 1); \ - if (pos2 == std::string::npos) { \ - pos2 = tmp_string.length(); \ - } \ - if (tmp_string.size() + 1 > seq_reader.reader_record->seq.s_cap) { \ - seq_reader.reader_record->seq.s_cap = tmp_string.size() + 1; \ - seq_reader.reader_record->seq.s = \ - (char*)std::realloc((char*)(seq_reader.reader_record->seq.s), \ - seq_reader.reader_record->seq.s_cap); \ - } \ - seq_reader.reader_record->seq = \ - tmp_string.substr(pos + 1, pos2 - pos - 1); \ - MIDEND_SECTION \ - } \ - seq_reader.tmp.clear(); \ - END_SECTION \ - } - -/// @cond HIDDEN_SYMBOLS -struct SeqReader::read_fasta_buffer -{ - bool operator()(SeqReader& seq_reader) - { - switch (seq_reader.read_stage) { - case ReadStage::HEADER: { - if (!seq_reader.readline_buffer_append( - seq_reader.reader_record->header)) { - return false; - } - seq_reader.read_stage = ReadStage::SEQ; - } - // fall through - case ReadStage::SEQ: { - if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) { - return false; - } - seq_reader.read_stage = ReadStage::HEADER; - return true; - } - default: { - log_error("SeqReader has entered an invalid state."); - std::exit(EXIT_FAILURE); - } - } - return false; - } -}; - -struct SeqReader::read_fastq_buffer -{ - bool operator()(SeqReader& seq_reader) - { - switch (seq_reader.read_stage) { - case ReadStage::HEADER: { - if (!seq_reader.readline_buffer_append( - seq_reader.reader_record->header)) { - return false; - } - seq_reader.read_stage = ReadStage::SEQ; - } - // fall through - case ReadStage::SEQ: { - if (!seq_reader.readline_buffer_append(seq_reader.reader_record->seq)) { - return false; - } - seq_reader.read_stage = ReadStage::SEP; - } - // fall through - case ReadStage::SEP: { - if (!seq_reader.readline_buffer_append(seq_reader.tmp)) { - return false; - } - seq_reader.read_stage = ReadStage::QUAL; - seq_reader.tmp.clear(); - } - // fall through - case ReadStage::QUAL: { - if (!seq_reader.readline_buffer_append( - seq_reader.reader_record->qual)) { - return false; - } - seq_reader.read_stage = ReadStage::HEADER; - return true; - } - default: { - log_error("SeqReader has entered an invalid state."); - std::exit(EXIT_FAILURE); - } - } - return false; - } -}; - -struct SeqReader::read_sam_buffer -{ - bool operator()(SeqReader& seq_reader) - { - READ_SAM( // NOLINT - if (!seq_reader.readline_buffer_append( // NOLINT - seq_reader.tmp)) { return false; }, // NOLINT - seq_reader.tmp.clear(); // NOLINT - return true; // NOLINT - , - if (seq_reader.buffer_start >= seq_reader.buffer_end) { - return false; - }) // NOLINT - } -}; - -struct SeqReader::read_gfa2_buffer -{ - bool operator()(SeqReader& seq_reader) - { - READ_GFA2( // NOLINT - if (!seq_reader.readline_buffer_append( // NOLINT - seq_reader.tmp)) { return false; }, // NOLINT - seq_reader.tmp.clear(); // NOLINT - return true; // NOLINT - , - if (seq_reader.buffer_start >= seq_reader.buffer_end) { - return false; - }) // NOLINT - } -}; - -struct SeqReader::read_fasta_transition -{ - void operator()(SeqReader& seq_reader) - { - switch (seq_reader.read_stage) { - case ReadStage::HEADER: { - seq_reader.readline_file_append(seq_reader.reader_record->header); - seq_reader.read_stage = ReadStage::SEQ; - } - // fall through - case ReadStage::SEQ: { - seq_reader.readline_file_append(seq_reader.reader_record->seq); - seq_reader.read_stage = ReadStage::HEADER; - return; - } - default: { - log_error("SeqReader has entered an invalid state."); - std::exit(EXIT_FAILURE); - } - } - } -}; - -struct SeqReader::read_fastq_transition -{ - void operator()(SeqReader& seq_reader) - { - switch (seq_reader.read_stage) { - case ReadStage::HEADER: { - seq_reader.readline_file_append(seq_reader.reader_record->header); - seq_reader.read_stage = ReadStage::SEQ; - } - // fall through - case ReadStage::SEQ: { - seq_reader.readline_file_append(seq_reader.reader_record->seq); - seq_reader.read_stage = ReadStage::SEP; - } - // fall through - case ReadStage::SEP: { - seq_reader.readline_file_append(seq_reader.tmp); - seq_reader.read_stage = ReadStage::QUAL; - seq_reader.tmp.clear(); - } - // fall through - case ReadStage::QUAL: { - seq_reader.readline_file_append(seq_reader.reader_record->qual); - seq_reader.read_stage = ReadStage::HEADER; - return; - } - default: { - log_error("SeqReader has entered an invalid state."); - std::exit(EXIT_FAILURE); - } - } - } -}; - -struct SeqReader::read_sam_transition -{ - void operator()(SeqReader& seq_reader) - { - READ_SAM( // NOLINT - seq_reader.readline_file_append(seq_reader.tmp); // NOLINT - , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT - } -}; - -struct SeqReader::read_gfa2_transition -{ - void operator()(SeqReader& seq_reader) - { - READ_GFA2( // NOLINT - seq_reader.readline_file_append(seq_reader.tmp); // NOLINT - , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT - } -}; - -struct SeqReader::read_fasta_file -{ - void operator()(SeqReader& seq_reader) - { - seq_reader.readline_file(seq_reader.reader_record->header); - seq_reader.readline_file(seq_reader.reader_record->seq); - } -}; - -struct SeqReader::read_fastq_file -{ - void operator()(SeqReader& seq_reader) - { - seq_reader.readline_file(seq_reader.reader_record->header); - seq_reader.readline_file(seq_reader.reader_record->seq); - seq_reader.readline_file(seq_reader.tmp); - seq_reader.readline_file(seq_reader.reader_record->qual); - } -}; - -struct SeqReader::read_sam_file -{ - void operator()(SeqReader& seq_reader) - { - READ_SAM( // NOLINT - seq_reader.readline_file(seq_reader.tmp); // NOLINT - , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT - } -}; - -struct SeqReader::read_gfa2_file -{ - void operator()(SeqReader& seq_reader) - { - READ_GFA2( // NOLINT - seq_reader.readline_file(seq_reader.tmp); // NOLINT - , , if (bool(feof(seq_reader.source))) { break; }) // NOLINT - } -}; -/// @endcond - -template -inline void -SeqReader::read_from_buffer(F f, - OrderQueueSPMC::Block& records, - size_t& counter) -{ - for (; buffer_start < buffer_end && !reader_end;) { - reader_record = &(records.data[records.count]); - reader_record->header.clear(); - reader_record->seq.clear(); - reader_record->qual.clear(); - if (!f(*this) || reader_record->seq.empty()) { - break; - } - records.count++; - if (records.count == block_size) { - records.current = 0; - records.num = counter++; - cstring_queue.write(records); - records.num = 0; - records.current = 0; - records.count = 0; - } - } -} - -template -inline void -SeqReader::read_transition(F f, - OrderQueueSPMC::Block& records, - size_t& counter) -{ - if (std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end) { - int p = std::fgetc(source); - if (p != EOF) { - std::ungetc(p, source); - reader_record = &(records.data[records.count]); - f(*this); - if (!reader_record->seq.empty()) { - records.count++; - if (records.count == block_size) { - records.current = 0; - records.num = counter++; - cstring_queue.write(records); - records.num = 0; - records.current = 0; - records.count = 0; - } - } - } - } -} - -template -inline void -SeqReader::read_from_file(F f, - OrderQueueSPMC::Block& records, - size_t& counter) -{ - for (; std::ferror(source) == 0 && std::feof(source) == 0 && !reader_end;) { - reader_record = &(records.data[records.count]); - f(*this); - if (reader_record->seq.empty()) { - break; - } - records.count++; - if (records.count == block_size) { - records.current = 0; - records.num = counter++; - cstring_queue.write(records); - records.num = 0; - records.current = 0; - records.count = 0; - } - } -} - -inline void -SeqReader::start_reader() -{ - reader_thread = std::unique_ptr(new std::thread([this]() { - { - std::unique_lock lock(format_mutex); - determine_format(); - format_cv.notify_all(); - } - - size_t counter = 0; - decltype(cstring_queue)::Block records(block_size); - switch (format) { - case Format::FASTA: { - read_from_buffer(read_fasta_buffer(), records, counter); - read_transition(read_fasta_transition(), records, counter); - read_from_file(read_fasta_file(), records, counter); - break; - } - case Format::FASTQ: { - read_from_buffer(read_fastq_buffer(), records, counter); - read_transition(read_fastq_transition(), records, counter); - read_from_file(read_fastq_file(), records, counter); - break; - } - case Format::SAM: { - read_from_buffer(read_sam_buffer(), records, counter); - read_transition(read_sam_transition(), records, counter); - read_from_file(read_sam_file(), records, counter); - break; - } - case Format::GFA2: { - read_from_buffer(read_gfa2_buffer(), records, counter); - read_transition(read_gfa2_transition(), records, counter); - read_from_file(read_gfa2_file(), records, counter); - break; - } - default: { - break; - } - } - - reader_end = true; - if (records.count > 0) { - records.current = 0; - records.num = counter++; - cstring_queue.write(records); - } - for (unsigned i = 0; i < threads; i++) { - decltype(cstring_queue)::Block dummy(block_size); - dummy.num = counter++; - dummy.current = 0; - dummy.count = 0; - cstring_queue.write(dummy); - } - })); -} - -inline void -SeqReader::start_processor() -{ - processor_threads.reserve(threads); - for (unsigned i = 0; i < threads; i++) { - processor_threads.push_back( - std::unique_ptr(new std::thread([this]() { - decltype(cstring_queue)::Block records_in(block_size); - decltype(output_queue)::Block records_out(block_size); - for (;;) { - cstring_queue.read(records_in); - for (size_t i = 0; i < records_in.count; i++) { - records_out.data[i].seq = std::string( - records_in.data[i].seq, records_in.data[i].seq.size()); - auto& seq = records_out.data[i].seq; - if (!seq.empty() && seq.back() == '\n') { - seq.pop_back(); - } - - records_out.data[i].qual = std::string( - records_in.data[i].qual, records_in.data[i].qual.size()); - auto& qual = records_out.data[i].qual; - if (!qual.empty() && qual.back() == '\n') { - qual.pop_back(); - } - - char *first_whitespace = nullptr, *last_whitespace = nullptr; - for (size_t j = 0; j < records_in.data[i].header.size(); j++) { - if (bool(std::isspace(records_in.data[i].header[j]))) { - if (first_whitespace == nullptr) { - first_whitespace = records_in.data[i].header + j; - } - last_whitespace = records_in.data[i].header + j; - } else if (last_whitespace != nullptr) { - break; - } - } - size_t name_start = - (format == Format::FASTA || format == Format::FASTQ) ? 1 : 0; - - if (first_whitespace == nullptr) { - records_out.data[i].name = - std::string(records_in.data[i].header + name_start, - records_in.data[i].header.size() - name_start); - records_out.data[i].comment = ""; - } else { - records_out.data[i].name = std::string( - records_in.data[i].header + name_start, - first_whitespace - records_in.data[i].header - name_start); - records_out.data[i].comment = std::string( - last_whitespace + 1, - records_in.data[i].header.size() - - (last_whitespace - records_in.data[i].header) - 1); - } - records_in.data[i].header.clear(); - - auto& name = records_out.data[i].name; - auto& comment = records_out.data[i].comment; - if (!name.empty() && name.back() == '\n') { - name.pop_back(); - } - if (!comment.empty() && comment.back() == '\n') { - comment.pop_back(); - } - - if (trim_masked()) { - const auto len = seq.length(); - size_t trim_start = 0, trim_end = seq.length(); - while (trim_start <= len && bool(islower(seq[trim_start]))) { - trim_start++; - } - while (trim_end > 0 && bool(islower(seq[trim_end - 1]))) { - trim_end--; - } - seq.erase(trim_end); - seq.erase(0, trim_start); - if (!qual.empty()) { - qual.erase(trim_end); - qual.erase(0, trim_start); - } - } - if (fold_case()) { - for (auto& c : seq) { - char old = c; - c = CAPITALS[(unsigned char)(c)]; - if (!bool(c)) { - log_error(std::string("A sequence contains invalid " - "IUPAC character: ") + - old); - std::exit(EXIT_FAILURE); - } - } - } - records_out.data[i].num = records_in.num * block_size + i; - } - records_out.count = records_in.count; - records_out.current = records_in.current; - records_out.num = records_in.num; - if (records_out.count == 0) { - output_queue.write(records_out); - break; - } - output_queue.write(records_out); - } - }))); - } -} - -inline SeqReader::Record -SeqReader::read() -{ - if (ready_records_owners()[id % MAX_SIMULTANEOUS_SEQREADERS] != id) { - ready_records_array()[id % MAX_SIMULTANEOUS_SEQREADERS] = - std::unique_ptr( - new decltype(output_queue)::Block(block_size)); - ready_records_owners()[id % MAX_SIMULTANEOUS_SEQREADERS] = id; - } - auto& ready_records = - *(ready_records_array()[id % MAX_SIMULTANEOUS_SEQREADERS]); - if (ready_records.count <= ready_records.current) { - output_queue.read(ready_records); - if (ready_records.count <= ready_records.current) { - close(); - ready_records = decltype(output_queue)::Block(block_size); - return Record(); - } - } - return std::move(ready_records.data[ready_records.current++]); -} - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/seq_writer.hpp b/src/include/btllib/seq_writer.hpp deleted file mode 100644 index 5564c5b..0000000 --- a/src/include/btllib/seq_writer.hpp +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef BTLLIB_SEQ_WRITER_HPP -#define BTLLIB_SEQ_WRITER_HPP - -#include "data_stream.hpp" -#include "seq.hpp" - -#include -#include -#include - -namespace btllib { - -/** - * @example seq_writer.cpp - * An example of writing a gzipped fastq file. - */ - -/** Write FASTA or FASTQ sequences to a file. Threadsafe. */ -class SeqWriter -{ - -public: - enum Format - { - FASTA, - FASTQ - }; - - SeqWriter(const std::string& sink_path, - Format format = FASTA, - bool append = false); - - void close(); - - void write(const std::string& name, - const std::string& comment, - const std::string& seq, - const std::string& qual); - -private: - const std::string sink_path; - DataSink sink; - bool closed; - Format format; - char headerchar; - std::mutex mutex; -}; - -inline SeqWriter::SeqWriter(const std::string& sink_path, - Format format, - bool append) - : sink_path(sink_path) - , sink(sink_path, append) - , closed(false) - , format(format) - , headerchar(format == FASTA ? '>' : '@') -{} - -inline void -SeqWriter::close() -{ - if (!closed) { - sink.close(); - closed = true; - } -} - -inline void -SeqWriter::write(const std::string& name, - const std::string& comment, - const std::string& seq, - const std::string& qual) -{ - check_error(seq.empty(), "Attempted to write empty sequence."); - for (const auto& c : seq) { - if (!bool(COMPLEMENTS[(unsigned char)(c)])) { - log_error(std::string("A sequence contains invalid IUPAC character: ") + - c); - std::exit(EXIT_FAILURE); - } - } - - std::string output; - output.reserve(1 + name.size() + 1 + comment.size() + 1 + seq.size() + 3 + - qual.size() + 1); - output += headerchar; - if (!name.empty()) { - output += name; - } - if (!comment.empty()) { - output += " "; - output += comment; - output += '\n'; - } - - output += seq; - output += '\n'; - - if (format == FASTQ) { - check_error(seq.size() != qual.size(), - "Quality must be the same length as sequence."); - output += "+\n"; - output += qual; - output += '\n'; - } - - { - std::unique_lock lock(mutex); - check_error(fwrite(output.c_str(), 1, output.size(), sink) != output.size(), - "SeqWriter: fwrite failed."); - } -} - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/btllib/status.hpp b/src/include/btllib/status.hpp deleted file mode 100644 index 95efc89..0000000 --- a/src/include/btllib/status.hpp +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef BTLLIB_STATUS_HPP -#define BTLLIB_STATUS_HPP - -#include -#include -#include -#include -#include - -namespace btllib { - -inline std::string -get_time(); -inline void -log_info(const std::string& msg); -inline void -log_warning(const std::string& msg); -inline void -log_error(const std::string& msg); -inline void -check_info(bool condition, const std::string& msg); -inline void -check_warning(bool condition, const std::string& msg); -inline void -check_error(bool condition, const std::string& msg); -inline void -check_stream(const std::ios& stream, const std::string& name); - -inline std::string -get_time() -{ - time_t now; - time(&now); - char buf[sizeof("2011-10-08T07:07:09Z")]; - strftime(buf, sizeof buf, "%F %T", localtime(&now)); - return std::string(buf); -} - -inline void -log_info(const std::string& msg) -{ - std::cerr << ('[' + get_time() + "] [INFO] " + msg + '\n') << std::flush; -} - -inline void -log_warning(const std::string& msg) -{ - std::cerr << ('[' + get_time() + "] [WARNING] " + msg + '\n') << std::flush; -} - -inline void -log_error(const std::string& msg) -{ - std::cerr << ('[' + get_time() + "] [ERROR] " + msg + '\n') << std::flush; -} - -inline void -check_info(bool condition, const std::string& msg) -{ - if (condition) { - log_info(msg); - } -} - -inline void -check_warning(bool condition, const std::string& msg) -{ - if (condition) { - log_warning(msg); - } -} - -inline void -check_error(bool condition, const std::string& msg) -{ - if (condition) { - log_error(msg); - std::exit(EXIT_FAILURE); - } -} - -inline void -check_stream(const std::ios& stream, const std::string& name) -{ - check_error(!stream.good(), - "'" + name + "' stream error: " + std::strerror(errno)); -} - -} // namespace btllib - -#endif diff --git a/src/include/btllib/util.hpp b/src/include/btllib/util.hpp deleted file mode 100644 index 7e6e40a..0000000 --- a/src/include/btllib/util.hpp +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef BTLLIB_UTIL_HPP -#define BTLLIB_UTIL_HPP - -#include -#include -#include - -namespace btllib { - -inline std::vector -split(const std::string& s, const std::string& delim); -inline void -ltrim(std::string& s); -inline void -rtrim(std::string& s); -inline void -trim(std::string& s); -inline bool -starts_with(std::string s, std::string prefix); -inline bool -ends_with(std::string s, std::string suffix); - -inline std::vector -split(const std::string& s, const std::string& delim) -{ - std::vector tokens; - size_t pos1 = 0, pos2 = 0; - while ((pos2 = s.find(delim, pos2)) != std::string::npos) { - tokens.push_back(s.substr(pos1, pos2 - pos1)); - pos2 += delim.size(); - pos1 = pos2; - } - tokens.push_back(s.substr(pos1)); - return tokens; -} - -inline void -ltrim(std::string& s) -{ - s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { - return !bool(std::isspace(ch)); - })); -} - -inline void -rtrim(std::string& s) -{ - s.erase(std::find_if(s.rbegin(), - s.rend(), - [](int ch) { return !bool(std::isspace(ch)); }) - .base(), - s.end()); -} - -inline void -trim(std::string& s) -{ - ltrim(s); - rtrim(s); -} - -inline bool -starts_with(std::string s, std::string prefix) -{ - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - std::transform(prefix.begin(), prefix.end(), prefix.begin(), ::tolower); - return s.find(prefix) == 0; -}; - -inline bool -ends_with(std::string s, std::string suffix) -{ - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - std::transform(suffix.begin(), suffix.end(), suffix.begin(), ::tolower); - auto pos = s.rfind(suffix); - return (pos != std::string::npos) && (pos == s.size() - suffix.size()); -}; - -} // namespace btllib - -#endif \ No newline at end of file diff --git a/src/include/meson.build b/src/include/meson.build deleted file mode 100644 index 2dfeacf..0000000 --- a/src/include/meson.build +++ /dev/null @@ -1,7 +0,0 @@ -header_paths = run_command('../scripts/get_include_files').stdout().strip().split() - -# Do not install -#foreach header : header_paths -# relative = run_command('../scripts/get_include_relative', header).stdout().strip() -# install_headers(files(header), subdir: relative) -#endforeach \ No newline at end of file diff --git a/src/include/vendor/cpptoml.hpp b/src/include/vendor/cpptoml.hpp deleted file mode 100644 index 5cf0c47..0000000 --- a/src/include/vendor/cpptoml.hpp +++ /dev/null @@ -1,3668 +0,0 @@ -/** - * @file cpptoml.hpp - * @author Chase Geigle - * @date May 2013 - */ - -#ifndef CPPTOML_H -#define CPPTOML_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if __cplusplus > 201103L -#define CPPTOML_DEPRECATED(reason) [[deprecated(reason)]] -#elif defined(__clang__) -#define CPPTOML_DEPRECATED(reason) __attribute__((deprecated(reason))) -#elif defined(__GNUG__) -#define CPPTOML_DEPRECATED(reason) __attribute__((deprecated)) -#elif defined(_MSC_VER) -#if _MSC_VER < 1910 -#define CPPTOML_DEPRECATED(reason) __declspec(deprecated) -#else -#define CPPTOML_DEPRECATED(reason) [[deprecated(reason)]] -#endif -#endif - -namespace cpptoml -{ -class writer; // forward declaration -class base; // forward declaration -#if defined(CPPTOML_USE_MAP) -// a std::map will ensure that entries a sorted, albeit at a slight -// performance penalty relative to the (default) unordered_map -using string_to_base_map = std::map>; -#else -// by default an unordered_map is used for best performance as the -// toml specification does not require entries to be sorted -using string_to_base_map - = std::unordered_map>; -#endif - -// if defined, `base` will retain type information in form of an enum class -// such that static_cast can be used instead of dynamic_cast -// #define CPPTOML_NO_RTTI - -template -class option -{ - public: - option() : empty_{true} - { - // nothing - } - - option(T value) : empty_{false}, value_(std::move(value)) - { - // nothing - } - - explicit operator bool() const - { - return !empty_; - } - - const T& operator*() const - { - return value_; - } - - const T* operator->() const - { - return &value_; - } - - template - T value_or(U&& alternative) const - { - if (!empty_) - return value_; - return static_cast(std::forward(alternative)); - } - - private: - bool empty_; - T value_; -}; - -struct local_date -{ - int year = 0; - int month = 0; - int day = 0; -}; - -struct local_time -{ - int hour = 0; - int minute = 0; - int second = 0; - int microsecond = 0; -}; - -struct zone_offset -{ - int hour_offset = 0; - int minute_offset = 0; -}; - -struct local_datetime : local_date, local_time -{ -}; - -struct offset_datetime : local_datetime, zone_offset -{ - static inline struct offset_datetime from_zoned(const struct tm& t) - { - offset_datetime dt; - dt.year = t.tm_year + 1900; - dt.month = t.tm_mon + 1; - dt.day = t.tm_mday; - dt.hour = t.tm_hour; - dt.minute = t.tm_min; - dt.second = t.tm_sec; - - char buf[16]; - strftime(buf, 16, "%z", &t); - - int offset = std::stoi(buf); - dt.hour_offset = offset / 100; - dt.minute_offset = offset % 100; - return dt; - } - - CPPTOML_DEPRECATED("from_local has been renamed to from_zoned") - static inline struct offset_datetime from_local(const struct tm& t) - { - return from_zoned(t); - } - - static inline struct offset_datetime from_utc(const struct tm& t) - { - offset_datetime dt; - dt.year = t.tm_year + 1900; - dt.month = t.tm_mon + 1; - dt.day = t.tm_mday; - dt.hour = t.tm_hour; - dt.minute = t.tm_min; - dt.second = t.tm_sec; - return dt; - } -}; - -CPPTOML_DEPRECATED("datetime has been renamed to offset_datetime") -typedef offset_datetime datetime; - -class fill_guard -{ - public: - fill_guard(std::ostream& os) : os_(os), fill_{os.fill()} - { - // nothing - } - - ~fill_guard() - { - os_.fill(fill_); - } - - private: - std::ostream& os_; - std::ostream::char_type fill_; -}; - -inline std::ostream& operator<<(std::ostream& os, const local_date& dt) -{ - fill_guard g{os}; - os.fill('0'); - - using std::setw; - os << setw(4) << dt.year << "-" << setw(2) << dt.month << "-" << setw(2) - << dt.day; - - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const local_time& ltime) -{ - fill_guard g{os}; - os.fill('0'); - - using std::setw; - os << setw(2) << ltime.hour << ":" << setw(2) << ltime.minute << ":" - << setw(2) << ltime.second; - - if (ltime.microsecond > 0) - { - os << "."; - int power = 100000; - for (int curr_us = ltime.microsecond; curr_us; power /= 10) - { - auto num = curr_us / power; - os << num; - curr_us -= num * power; - } - } - - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const zone_offset& zo) -{ - fill_guard g{os}; - os.fill('0'); - - using std::setw; - - if (zo.hour_offset != 0 || zo.minute_offset != 0) - { - if (zo.hour_offset > 0) - { - os << "+"; - } - else - { - os << "-"; - } - os << setw(2) << std::abs(zo.hour_offset) << ":" << setw(2) - << std::abs(zo.minute_offset); - } - else - { - os << "Z"; - } - - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const local_datetime& dt) -{ - return os << static_cast(dt) << "T" - << static_cast(dt); -} - -inline std::ostream& operator<<(std::ostream& os, const offset_datetime& dt) -{ - return os << static_cast(dt) - << static_cast(dt); -} - -template -struct is_one_of; - -template -struct is_one_of : std::is_same -{ -}; - -template -struct is_one_of -{ - const static bool value - = std::is_same::value || is_one_of::value; -}; - -template -class value; - -template -struct valid_value - : is_one_of -{ -}; - -template -struct value_traits; - -template -struct valid_value_or_string_convertible -{ - - const static bool value = valid_value::type>::value - || std::is_convertible::value; -}; - -template -struct value_traits::value>::type> -{ - using value_type = typename std::conditional< - valid_value::type>::value, - typename std::decay::type, std::string>::type; - - using type = value; - - static value_type construct(T&& val) - { - return value_type(val); - } -}; - -template -struct value_traits< - T, - typename std::enable_if< - !valid_value_or_string_convertible::value - && std::is_floating_point::type>::value>::type> -{ - using value_type = typename std::decay::type; - - using type = value; - - static value_type construct(T&& val) - { - return value_type(val); - } -}; - -template -struct value_traits< - T, typename std::enable_if< - !valid_value_or_string_convertible::value - && !std::is_floating_point::type>::value - && std::is_signed::type>::value>::type> -{ - using value_type = int64_t; - - using type = value; - - static value_type construct(T&& val) - { - if (val < (std::numeric_limits::min)()) - throw std::underflow_error{"constructed value cannot be " - "represented by a 64-bit signed " - "integer"}; - - if (val > (std::numeric_limits::max)()) - throw std::overflow_error{"constructed value cannot be represented " - "by a 64-bit signed integer"}; - - return static_cast(val); - } -}; - -template -struct value_traits< - T, typename std::enable_if< - !valid_value_or_string_convertible::value - && std::is_unsigned::type>::value>::type> -{ - using value_type = int64_t; - - using type = value; - - static value_type construct(T&& val) - { - if (val > static_cast((std::numeric_limits::max)())) - throw std::overflow_error{"constructed value cannot be represented " - "by a 64-bit signed integer"}; - - return static_cast(val); - } -}; - -class array; -class table; -class table_array; - -template -struct array_of_trait -{ - using return_type = option>; -}; - -template <> -struct array_of_trait -{ - using return_type = option>>; -}; - -template -inline std::shared_ptr::type> make_value(T&& val); -inline std::shared_ptr make_array(); - -namespace detail -{ -template -inline std::shared_ptr make_element(); -} - -inline std::shared_ptr make_table(); -inline std::shared_ptr make_table_array(bool is_inline = false); - -#if defined(CPPTOML_NO_RTTI) -/// Base type used to store underlying data type explicitly if RTTI is disabled -enum class base_type -{ - NONE, - STRING, - LOCAL_TIME, - LOCAL_DATE, - LOCAL_DATETIME, - OFFSET_DATETIME, - INT, - FLOAT, - BOOL, - TABLE, - ARRAY, - TABLE_ARRAY -}; - -/// Type traits class to convert C++ types to enum member -template -struct base_type_traits; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::STRING; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::LOCAL_TIME; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::LOCAL_DATE; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::LOCAL_DATETIME; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::OFFSET_DATETIME; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::INT; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::FLOAT; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::BOOL; -}; - -template <> -struct base_type_traits
-{ - static const base_type type = base_type::TABLE; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::ARRAY; -}; - -template <> -struct base_type_traits -{ - static const base_type type = base_type::TABLE_ARRAY; -}; -#endif - -/** - * A generic base TOML value used for type erasure. - */ -class base : public std::enable_shared_from_this -{ - public: - virtual ~base() = default; - - virtual std::shared_ptr clone() const = 0; - - /** - * Determines if the given TOML element is a value. - */ - virtual bool is_value() const - { - return false; - } - - /** - * Determines if the given TOML element is a table. - */ - virtual bool is_table() const - { - return false; - } - - /** - * Converts the TOML element into a table. - */ - std::shared_ptr
as_table() - { - if (is_table()) - return std::static_pointer_cast
(shared_from_this()); - return nullptr; - } - /** - * Determines if the TOML element is an array of "leaf" elements. - */ - virtual bool is_array() const - { - return false; - } - - /** - * Converts the TOML element to an array. - */ - std::shared_ptr as_array() - { - if (is_array()) - return std::static_pointer_cast(shared_from_this()); - return nullptr; - } - - /** - * Determines if the given TOML element is an array of tables. - */ - virtual bool is_table_array() const - { - return false; - } - - /** - * Converts the TOML element into a table array. - */ - std::shared_ptr as_table_array() - { - if (is_table_array()) - return std::static_pointer_cast(shared_from_this()); - return nullptr; - } - - /** - * Attempts to coerce the TOML element into a concrete TOML value - * of type T. - */ - template - std::shared_ptr> as(); - - template - std::shared_ptr> as() const; - - template - void accept(Visitor&& visitor, Args&&... args) const; - -#if defined(CPPTOML_NO_RTTI) - base_type type() const - { - return type_; - } - - protected: - base(const base_type t) : type_(t) - { - // nothing - } - - private: - const base_type type_ = base_type::NONE; - -#else - protected: - base() - { - // nothing - } -#endif -}; - -/** - * A concrete TOML value representing the "leaves" of the "tree". - */ -template -class value : public base -{ - struct make_shared_enabler - { - // nothing; this is a private key accessible only to friends - }; - - template - friend std::shared_ptr::type> - cpptoml::make_value(U&& val); - - public: - static_assert(valid_value::value, "invalid value type"); - - std::shared_ptr clone() const override; - - value(const make_shared_enabler&, const T& val) : value(val) - { - // nothing; note that users cannot actually invoke this function - // because they lack access to the make_shared_enabler. - } - - bool is_value() const override - { - return true; - } - - /** - * Gets the data associated with this value. - */ - T& get() - { - return data_; - } - - /** - * Gets the data associated with this value. Const version. - */ - const T& get() const - { - return data_; - } - - private: - T data_; - - /** - * Constructs a value from the given data. - */ -#if defined(CPPTOML_NO_RTTI) - value(const T& val) : base(base_type_traits::type), data_(val) - { - } -#else - value(const T& val) : data_(val) - { - } -#endif - - value(const value& val) = delete; - value& operator=(const value& val) = delete; -}; - -template -std::shared_ptr::type> make_value(T&& val) -{ - using value_type = typename value_traits::type; - using enabler = typename value_type::make_shared_enabler; - return std::make_shared( - enabler{}, value_traits::construct(std::forward(val))); -} - -template -inline std::shared_ptr> base::as() -{ -#if defined(CPPTOML_NO_RTTI) - if (type() == base_type_traits::type) - return std::static_pointer_cast>(shared_from_this()); - else - return nullptr; -#else - return std::dynamic_pointer_cast>(shared_from_this()); -#endif -} - -// special case value to allow getting an integer parameter as a -// double value -template <> -inline std::shared_ptr> base::as() -{ -#if defined(CPPTOML_NO_RTTI) - if (type() == base_type::FLOAT) - return std::static_pointer_cast>(shared_from_this()); - - if (type() == base_type::INT) - { - auto v = std::static_pointer_cast>(shared_from_this()); - return make_value(static_cast(v->get())); - } -#else - if (auto v = std::dynamic_pointer_cast>(shared_from_this())) - return v; - - if (auto v = std::dynamic_pointer_cast>(shared_from_this())) - return make_value(static_cast(v->get())); -#endif - - return nullptr; -} - -template -inline std::shared_ptr> base::as() const -{ -#if defined(CPPTOML_NO_RTTI) - if (type() == base_type_traits::type) - return std::static_pointer_cast>(shared_from_this()); - else - return nullptr; -#else - return std::dynamic_pointer_cast>(shared_from_this()); -#endif -} - -// special case value to allow getting an integer parameter as a -// double value -template <> -inline std::shared_ptr> base::as() const -{ -#if defined(CPPTOML_NO_RTTI) - if (type() == base_type::FLOAT) - return std::static_pointer_cast>( - shared_from_this()); - - if (type() == base_type::INT) - { - auto v = as(); - // the below has to be a non-const value due to a bug in - // libc++: https://llvm.org/bugs/show_bug.cgi?id=18843 - return make_value(static_cast(v->get())); - } -#else - if (auto v - = std::dynamic_pointer_cast>(shared_from_this())) - return v; - - if (auto v = as()) - { - // the below has to be a non-const value due to a bug in - // libc++: https://llvm.org/bugs/show_bug.cgi?id=18843 - return make_value(static_cast(v->get())); - } -#endif - - return nullptr; -} - -/** - * Exception class for array insertion errors. - */ -class array_exception : public std::runtime_error -{ - public: - array_exception(const std::string& err) : std::runtime_error{err} - { - } -}; - -class array : public base -{ - public: - friend std::shared_ptr make_array(); - - std::shared_ptr clone() const override; - - virtual bool is_array() const override - { - return true; - } - - using size_type = std::size_t; - - /** - * arrays can be iterated over - */ - using iterator = std::vector>::iterator; - - /** - * arrays can be iterated over. Const version. - */ - using const_iterator = std::vector>::const_iterator; - - iterator begin() - { - return values_.begin(); - } - - const_iterator begin() const - { - return values_.begin(); - } - - iterator end() - { - return values_.end(); - } - - const_iterator end() const - { - return values_.end(); - } - - /** - * Obtains the array (vector) of base values. - */ - std::vector>& get() - { - return values_; - } - - /** - * Obtains the array (vector) of base values. Const version. - */ - const std::vector>& get() const - { - return values_; - } - - std::shared_ptr at(size_t idx) const - { - return values_.at(idx); - } - - /** - * Obtains an array of values. Note that elements may be - * nullptr if they cannot be converted to a value. - */ - template - std::vector>> array_of() const - { - std::vector>> result(values_.size()); - - std::transform(values_.begin(), values_.end(), result.begin(), - [&](std::shared_ptr v) { return v->as(); }); - - return result; - } - - /** - * Obtains a option>. The option will be empty if the array - * contains values that are not of type T. - */ - template - inline typename array_of_trait::return_type get_array_of() const - { - std::vector result; - result.reserve(values_.size()); - - for (const auto& val : values_) - { - if (auto v = val->as()) - result.push_back(v->get()); - else - return {}; - } - - return {std::move(result)}; - } - - /** - * Obtains an array of arrays. Note that elements may be nullptr - * if they cannot be converted to a array. - */ - std::vector> nested_array() const - { - std::vector> result(values_.size()); - - std::transform(values_.begin(), values_.end(), result.begin(), - [&](std::shared_ptr v) -> std::shared_ptr { - if (v->is_array()) - return std::static_pointer_cast(v); - return std::shared_ptr{}; - }); - - return result; - } - - /** - * Add a value to the end of the array - */ - template - void push_back(const std::shared_ptr>& val) - { - if (values_.empty() || values_[0]->as()) - { - values_.push_back(val); - } - else - { - throw array_exception{"Arrays must be homogenous."}; - } - } - - /** - * Add an array to the end of the array - */ - void push_back(const std::shared_ptr& val) - { - if (values_.empty() || values_[0]->is_array()) - { - values_.push_back(val); - } - else - { - throw array_exception{"Arrays must be homogenous."}; - } - } - - /** - * Convenience function for adding a simple element to the end - * of the array. - */ - template - void push_back(T&& val, typename value_traits::type* = 0) - { - push_back(make_value(std::forward(val))); - } - - /** - * Insert a value into the array - */ - template - iterator insert(iterator position, const std::shared_ptr>& value) - { - if (values_.empty() || values_[0]->as()) - { - return values_.insert(position, value); - } - else - { - throw array_exception{"Arrays must be homogenous."}; - } - } - - /** - * Insert an array into the array - */ - iterator insert(iterator position, const std::shared_ptr& value) - { - if (values_.empty() || values_[0]->is_array()) - { - return values_.insert(position, value); - } - else - { - throw array_exception{"Arrays must be homogenous."}; - } - } - - /** - * Convenience function for inserting a simple element in the array - */ - template - iterator insert(iterator position, T&& val, - typename value_traits::type* = 0) - { - return insert(position, make_value(std::forward(val))); - } - - /** - * Erase an element from the array - */ - iterator erase(iterator position) - { - return values_.erase(position); - } - - /** - * Clear the array - */ - void clear() - { - values_.clear(); - } - - /** - * Reserve space for n values. - */ - void reserve(size_type n) - { - values_.reserve(n); - } - - private: -#if defined(CPPTOML_NO_RTTI) - array() : base(base_type::ARRAY) - { - // empty - } -#else - array() = default; -#endif - - template - array(InputIterator begin, InputIterator end) : values_{begin, end} - { - // nothing - } - - array(const array& obj) = delete; - array& operator=(const array& obj) = delete; - - std::vector> values_; -}; - -inline std::shared_ptr make_array() -{ - struct make_shared_enabler : public array - { - make_shared_enabler() - { - // nothing - } - }; - - return std::make_shared(); -} - -namespace detail -{ -template <> -inline std::shared_ptr make_element() -{ - return make_array(); -} -} // namespace detail - -/** - * Obtains a option>. The option will be empty if the array - * contains values that are not of type T. - */ -template <> -inline typename array_of_trait::return_type -array::get_array_of() const -{ - std::vector> result; - result.reserve(values_.size()); - - for (const auto& val : values_) - { - if (auto v = val->as_array()) - result.push_back(v); - else - return {}; - } - - return {std::move(result)}; -} - -class table; - -class table_array : public base -{ - friend class table; - friend std::shared_ptr make_table_array(bool); - - public: - std::shared_ptr clone() const override; - - using size_type = std::size_t; - - /** - * arrays can be iterated over - */ - using iterator = std::vector>::iterator; - - /** - * arrays can be iterated over. Const version. - */ - using const_iterator = std::vector>::const_iterator; - - iterator begin() - { - return array_.begin(); - } - - const_iterator begin() const - { - return array_.begin(); - } - - iterator end() - { - return array_.end(); - } - - const_iterator end() const - { - return array_.end(); - } - - virtual bool is_table_array() const override - { - return true; - } - - std::vector>& get() - { - return array_; - } - - const std::vector>& get() const - { - return array_; - } - - /** - * Add a table to the end of the array - */ - void push_back(const std::shared_ptr
& val) - { - array_.push_back(val); - } - - /** - * Insert a table into the array - */ - iterator insert(iterator position, const std::shared_ptr
& value) - { - return array_.insert(position, value); - } - - /** - * Erase an element from the array - */ - iterator erase(iterator position) - { - return array_.erase(position); - } - - /** - * Clear the array - */ - void clear() - { - array_.clear(); - } - - /** - * Reserve space for n tables. - */ - void reserve(size_type n) - { - array_.reserve(n); - } - - /** - * Whether or not the table array is declared inline. This mostly - * matters for parsing, where statically defined arrays cannot be - * appended to using the array-of-table syntax. - */ - bool is_inline() const - { - return is_inline_; - } - - private: -#if defined(CPPTOML_NO_RTTI) - table_array(bool is_inline = false) - : base(base_type::TABLE_ARRAY), is_inline_(is_inline) - { - // nothing - } -#else - table_array(bool is_inline = false) : is_inline_(is_inline) - { - // nothing - } -#endif - - table_array(const table_array& obj) = delete; - table_array& operator=(const table_array& rhs) = delete; - - std::vector> array_; - const bool is_inline_ = false; -}; - -inline std::shared_ptr make_table_array(bool is_inline) -{ - struct make_shared_enabler : public table_array - { - make_shared_enabler(bool mse_is_inline) : table_array(mse_is_inline) - { - // nothing - } - }; - - return std::make_shared(is_inline); -} - -namespace detail -{ -template <> -inline std::shared_ptr make_element() -{ - return make_table_array(true); -} -} // namespace detail - -// The below are overloads for fetching specific value types out of a value -// where special casting behavior (like bounds checking) is desired - -template -typename std::enable_if::value - && std::is_signed::value, - option>::type -get_impl(const std::shared_ptr& elem) -{ - if (auto v = elem->as()) - { - if (v->get() < (std::numeric_limits::min)()) - throw std::underflow_error{ - "T cannot represent the value requested in get"}; - - if (v->get() > (std::numeric_limits::max)()) - throw std::overflow_error{ - "T cannot represent the value requested in get"}; - - return {static_cast(v->get())}; - } - else - { - return {}; - } -} - -template -typename std::enable_if::value - && std::is_unsigned::value, - option>::type -get_impl(const std::shared_ptr& elem) -{ - if (auto v = elem->as()) - { - if (v->get() < 0) - throw std::underflow_error{"T cannot store negative value in get"}; - - if (static_cast(v->get()) > (std::numeric_limits::max)()) - throw std::overflow_error{ - "T cannot represent the value requested in get"}; - - return {static_cast(v->get())}; - } - else - { - return {}; - } -} - -template -typename std::enable_if::value - || std::is_same::value, - option>::type -get_impl(const std::shared_ptr& elem) -{ - if (auto v = elem->as()) - { - return {v->get()}; - } - else - { - return {}; - } -} - -/** - * Represents a TOML keytable. - */ -class table : public base -{ - public: - friend class table_array; - friend std::shared_ptr
make_table(); - - std::shared_ptr clone() const override; - - /** - * tables can be iterated over. - */ - using iterator = string_to_base_map::iterator; - - /** - * tables can be iterated over. Const version. - */ - using const_iterator = string_to_base_map::const_iterator; - - iterator begin() - { - return map_.begin(); - } - - const_iterator begin() const - { - return map_.begin(); - } - - iterator end() - { - return map_.end(); - } - - const_iterator end() const - { - return map_.end(); - } - - bool is_table() const override - { - return true; - } - - bool empty() const - { - return map_.empty(); - } - - /** - * Determines if this key table contains the given key. - */ - bool contains(const std::string& key) const - { - return map_.find(key) != map_.end(); - } - - /** - * Determines if this key table contains the given key. Will - * resolve "qualified keys". Qualified keys are the full access - * path separated with dots like "grandparent.parent.child". - */ - bool contains_qualified(const std::string& key) const - { - return resolve_qualified(key); - } - - /** - * Obtains the base for a given key. - * @throw std::out_of_range if the key does not exist - */ - std::shared_ptr get(const std::string& key) const - { - return map_.at(key); - } - - /** - * Obtains the base for a given key. Will resolve "qualified - * keys". Qualified keys are the full access path separated with - * dots like "grandparent.parent.child". - * - * @throw std::out_of_range if the key does not exist - */ - std::shared_ptr get_qualified(const std::string& key) const - { - std::shared_ptr p; - resolve_qualified(key, &p); - return p; - } - - /** - * Obtains a table for a given key, if possible. - */ - std::shared_ptr
get_table(const std::string& key) const - { - if (contains(key) && get(key)->is_table()) - return std::static_pointer_cast
(get(key)); - return nullptr; - } - - /** - * Obtains a table for a given key, if possible. Will resolve - * "qualified keys". - */ - std::shared_ptr
get_table_qualified(const std::string& key) const - { - if (contains_qualified(key) && get_qualified(key)->is_table()) - return std::static_pointer_cast
(get_qualified(key)); - return nullptr; - } - - /** - * Obtains an array for a given key. - */ - std::shared_ptr get_array(const std::string& key) const - { - if (!contains(key)) - return nullptr; - return get(key)->as_array(); - } - - /** - * Obtains an array for a given key. Will resolve "qualified keys". - */ - std::shared_ptr get_array_qualified(const std::string& key) const - { - if (!contains_qualified(key)) - return nullptr; - return get_qualified(key)->as_array(); - } - - /** - * Obtains a table_array for a given key, if possible. - */ - std::shared_ptr get_table_array(const std::string& key) const - { - if (!contains(key)) - return nullptr; - return get(key)->as_table_array(); - } - - /** - * Obtains a table_array for a given key, if possible. Will resolve - * "qualified keys". - */ - std::shared_ptr - get_table_array_qualified(const std::string& key) const - { - if (!contains_qualified(key)) - return nullptr; - return get_qualified(key)->as_table_array(); - } - - /** - * Helper function that attempts to get a value corresponding - * to the template parameter from a given key. - */ - template - option get_as(const std::string& key) const - { - try - { - return get_impl(get(key)); - } - catch (const std::out_of_range&) - { - return {}; - } - } - - /** - * Helper function that attempts to get a value corresponding - * to the template parameter from a given key. Will resolve "qualified - * keys". - */ - template - option get_qualified_as(const std::string& key) const - { - try - { - return get_impl(get_qualified(key)); - } - catch (const std::out_of_range&) - { - return {}; - } - } - - /** - * Helper function that attempts to get an array of values of a given - * type corresponding to the template parameter for a given key. - * - * If the key doesn't exist, doesn't exist as an array type, or one or - * more keys inside the array type are not of type T, an empty option - * is returned. Otherwise, an option containing a vector of the values - * is returned. - */ - template - inline typename array_of_trait::return_type - get_array_of(const std::string& key) const - { - if (auto v = get_array(key)) - { - std::vector result; - result.reserve(v->get().size()); - - for (const auto& b : v->get()) - { - if (auto val = b->as()) - result.push_back(val->get()); - else - return {}; - } - return {std::move(result)}; - } - - return {}; - } - - /** - * Helper function that attempts to get an array of values of a given - * type corresponding to the template parameter for a given key. Will - * resolve "qualified keys". - * - * If the key doesn't exist, doesn't exist as an array type, or one or - * more keys inside the array type are not of type T, an empty option - * is returned. Otherwise, an option containing a vector of the values - * is returned. - */ - template - inline typename array_of_trait::return_type - get_qualified_array_of(const std::string& key) const - { - if (auto v = get_array_qualified(key)) - { - std::vector result; - result.reserve(v->get().size()); - - for (const auto& b : v->get()) - { - if (auto val = b->as()) - result.push_back(val->get()); - else - return {}; - } - return {std::move(result)}; - } - - return {}; - } - - /** - * Adds an element to the keytable. - */ - void insert(const std::string& key, const std::shared_ptr& value) - { - map_[key] = value; - } - - /** - * Convenience shorthand for adding a simple element to the - * keytable. - */ - template - void insert(const std::string& key, T&& val, - typename value_traits::type* = 0) - { - insert(key, make_value(std::forward(val))); - } - - /** - * Removes an element from the table. - */ - void erase(const std::string& key) - { - map_.erase(key); - } - - private: -#if defined(CPPTOML_NO_RTTI) - table() : base(base_type::TABLE) - { - // nothing - } -#else - table() - { - // nothing - } -#endif - - table(const table& obj) = delete; - table& operator=(const table& rhs) = delete; - - std::vector split(const std::string& value, - char separator) const - { - std::vector result; - std::string::size_type p = 0; - std::string::size_type q; - while ((q = value.find(separator, p)) != std::string::npos) - { - result.emplace_back(value, p, q - p); - p = q + 1; - } - result.emplace_back(value, p); - return result; - } - - // If output parameter p is specified, fill it with the pointer to the - // specified entry and throw std::out_of_range if it couldn't be found. - // - // Otherwise, just return true if the entry could be found or false - // otherwise and do not throw. - bool resolve_qualified(const std::string& key, - std::shared_ptr* p = nullptr) const - { - auto parts = split(key, '.'); - auto last_key = parts.back(); - parts.pop_back(); - - auto cur_table = this; - for (const auto& part : parts) - { - cur_table = cur_table->get_table(part).get(); - if (!cur_table) - { - if (!p) - return false; - - throw std::out_of_range{key + " is not a valid key"}; - } - } - - if (!p) - return cur_table->map_.count(last_key) != 0; - - *p = cur_table->map_.at(last_key); - return true; - } - - string_to_base_map map_; -}; - -/** - * Helper function that attempts to get an array of arrays for a given - * key. - * - * If the key doesn't exist, doesn't exist as an array type, or one or - * more keys inside the array type are not of type T, an empty option - * is returned. Otherwise, an option containing a vector of the values - * is returned. - */ -template <> -inline typename array_of_trait::return_type -table::get_array_of(const std::string& key) const -{ - if (auto v = get_array(key)) - { - std::vector> result; - result.reserve(v->get().size()); - - for (const auto& b : v->get()) - { - if (auto val = b->as_array()) - result.push_back(val); - else - return {}; - } - - return {std::move(result)}; - } - - return {}; -} - -/** - * Helper function that attempts to get an array of arrays for a given - * key. Will resolve "qualified keys". - * - * If the key doesn't exist, doesn't exist as an array type, or one or - * more keys inside the array type are not of type T, an empty option - * is returned. Otherwise, an option containing a vector of the values - * is returned. - */ -template <> -inline typename array_of_trait::return_type -table::get_qualified_array_of(const std::string& key) const -{ - if (auto v = get_array_qualified(key)) - { - std::vector> result; - result.reserve(v->get().size()); - - for (const auto& b : v->get()) - { - if (auto val = b->as_array()) - result.push_back(val); - else - return {}; - } - - return {std::move(result)}; - } - - return {}; -} - -std::shared_ptr
make_table() -{ - struct make_shared_enabler : public table - { - make_shared_enabler() - { - // nothing - } - }; - - return std::make_shared(); -} - -namespace detail -{ -template <> -inline std::shared_ptr
make_element
() -{ - return make_table(); -} -} // namespace detail - -template -std::shared_ptr value::clone() const -{ - return make_value(data_); -} - -inline std::shared_ptr array::clone() const -{ - auto result = make_array(); - result->reserve(values_.size()); - for (const auto& ptr : values_) - result->values_.push_back(ptr->clone()); - return result; -} - -inline std::shared_ptr table_array::clone() const -{ - auto result = make_table_array(is_inline()); - result->reserve(array_.size()); - for (const auto& ptr : array_) - result->array_.push_back(ptr->clone()->as_table()); - return result; -} - -inline std::shared_ptr table::clone() const -{ - auto result = make_table(); - for (const auto& pr : map_) - result->insert(pr.first, pr.second->clone()); - return result; -} - -/** - * Exception class for all TOML parsing errors. - */ -class parse_exception : public std::runtime_error -{ - public: - parse_exception(const std::string& err) : std::runtime_error{err} - { - } - - parse_exception(const std::string& err, std::size_t line_number) - : std::runtime_error{err + " at line " + std::to_string(line_number)} - { - } -}; - -inline bool is_number(char c) -{ - return c >= '0' && c <= '9'; -} - -inline bool is_hex(char c) -{ - return is_number(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); -} - -/** - * Helper object for consuming expected characters. - */ -template -class consumer -{ - public: - consumer(std::string::iterator& it, const std::string::iterator& end, - OnError&& on_error) - : it_(it), end_(end), on_error_(std::forward(on_error)) - { - // nothing - } - - void operator()(char c) - { - if (it_ == end_ || *it_ != c) - on_error_(); - ++it_; - } - - template - void operator()(const char (&str)[N]) - { - std::for_each(std::begin(str), std::end(str) - 1, - [&](char c) { (*this)(c); }); - } - - void eat_or(char a, char b) - { - if (it_ == end_ || (*it_ != a && *it_ != b)) - on_error_(); - ++it_; - } - - int eat_digits(int len) - { - int val = 0; - for (int i = 0; i < len; ++i) - { - if (!is_number(*it_) || it_ == end_) - on_error_(); - val = 10 * val + (*it_++ - '0'); - } - return val; - } - - void error() - { - on_error_(); - } - - private: - std::string::iterator& it_; - const std::string::iterator& end_; - OnError on_error_; -}; - -template -consumer make_consumer(std::string::iterator& it, - const std::string::iterator& end, - OnError&& on_error) -{ - return consumer(it, end, std::forward(on_error)); -} - -// replacement for std::getline to handle incorrectly line-ended files -// https://stackoverflow.com/questions/6089231/getting-std-ifstream-to-handle-lf-cr-and-crlf -namespace detail -{ -inline std::istream& getline(std::istream& input, std::string& line) -{ - line.clear(); - - std::istream::sentry sentry{input, true}; - auto sb = input.rdbuf(); - - while (true) - { - auto c = sb->sbumpc(); - if (c == '\r') - { - if (sb->sgetc() == '\n') - c = sb->sbumpc(); - } - - if (c == '\n') - return input; - - if (c == std::istream::traits_type::eof()) - { - if (line.empty()) - input.setstate(std::ios::eofbit); - return input; - } - - line.push_back(static_cast(c)); - } -} -} // namespace detail - -/** - * The parser class. - */ -class parser -{ - public: - /** - * Parsers are constructed from streams. - */ - parser(std::istream& stream) : input_(stream) - { - // nothing - } - - parser& operator=(const parser& parser) = delete; - - /** - * Parses the stream this parser was created on until EOF. - * @throw parse_exception if there are errors in parsing - */ - std::shared_ptr
parse() - { - std::shared_ptr
root = make_table(); - - table* curr_table = root.get(); - - while (detail::getline(input_, line_)) - { - line_number_++; - auto it = line_.begin(); - auto end = line_.end(); - consume_whitespace(it, end); - if (it == end || *it == '#') - continue; - if (*it == '[') - { - curr_table = root.get(); - parse_table(it, end, curr_table); - } - else - { - parse_key_value(it, end, curr_table); - consume_whitespace(it, end); - eol_or_comment(it, end); - } - } - return root; - } - - private: -#if defined _MSC_VER - __declspec(noreturn) -#elif defined __GNUC__ - __attribute__((noreturn)) -#endif - void throw_parse_exception(const std::string& err) - { - throw parse_exception{err, line_number_}; - } - - void parse_table(std::string::iterator& it, - const std::string::iterator& end, table*& curr_table) - { - // remove the beginning keytable marker - ++it; - if (it == end) - throw_parse_exception("Unexpected end of table"); - if (*it == '[') - parse_table_array(it, end, curr_table); - else - parse_single_table(it, end, curr_table); - } - - void parse_single_table(std::string::iterator& it, - const std::string::iterator& end, - table*& curr_table) - { - if (it == end || *it == ']') - throw_parse_exception("Table name cannot be empty"); - - std::string full_table_name; - bool inserted = false; - - auto key_end = [](char c) { return c == ']'; }; - - auto key_part_handler = [&](const std::string& part) { - if (part.empty()) - throw_parse_exception("Empty component of table name"); - - if (!full_table_name.empty()) - full_table_name += '.'; - full_table_name += part; - - if (curr_table->contains(part)) - { -#if !defined(__PGI) - auto b = curr_table->get(part); -#else - // Workaround for PGI compiler - std::shared_ptr b = curr_table->get(part); -#endif - if (b->is_table()) - curr_table = static_cast(b.get()); - else if (b->is_table_array()) - curr_table = std::static_pointer_cast(b) - ->get() - .back() - .get(); - else - throw_parse_exception("Key " + full_table_name - + "already exists as a value"); - } - else - { - inserted = true; - curr_table->insert(part, make_table()); - curr_table = static_cast(curr_table->get(part).get()); - } - }; - - key_part_handler(parse_key(it, end, key_end, key_part_handler)); - - if (it == end) - throw_parse_exception( - "Unterminated table declaration; did you forget a ']'?"); - - if (*it != ']') - { - std::string errmsg{"Unexpected character in table definition: "}; - errmsg += '"'; - errmsg += *it; - errmsg += '"'; - throw_parse_exception(errmsg); - } - - // table already existed - if (!inserted) - { - auto is_value - = [](const std::pair&>& p) { - return p.second->is_value(); - }; - - // if there are any values, we can't add values to this table - // since it has already been defined. If there aren't any - // values, then it was implicitly created by something like - // [a.b] - if (curr_table->empty() - || std::any_of(curr_table->begin(), curr_table->end(), - is_value)) - { - throw_parse_exception("Redefinition of table " - + full_table_name); - } - } - - ++it; - consume_whitespace(it, end); - eol_or_comment(it, end); - } - - void parse_table_array(std::string::iterator& it, - const std::string::iterator& end, table*& curr_table) - { - ++it; - if (it == end || *it == ']') - throw_parse_exception("Table array name cannot be empty"); - - auto key_end = [](char c) { return c == ']'; }; - - std::string full_ta_name; - auto key_part_handler = [&](const std::string& part) { - if (part.empty()) - throw_parse_exception("Empty component of table array name"); - - if (!full_ta_name.empty()) - full_ta_name += '.'; - full_ta_name += part; - - if (curr_table->contains(part)) - { -#if !defined(__PGI) - auto b = curr_table->get(part); -#else - // Workaround for PGI compiler - std::shared_ptr b = curr_table->get(part); -#endif - - // if this is the end of the table array name, add an - // element to the table array that we just looked up, - // provided it was not declared inline - if (it != end && *it == ']') - { - if (!b->is_table_array()) - { - throw_parse_exception("Key " + full_ta_name - + " is not a table array"); - } - - auto v = b->as_table_array(); - - if (v->is_inline()) - { - throw_parse_exception("Static array " + full_ta_name - + " cannot be appended to"); - } - - v->get().push_back(make_table()); - curr_table = v->get().back().get(); - } - // otherwise, just keep traversing down the key name - else - { - if (b->is_table()) - curr_table = static_cast(b.get()); - else if (b->is_table_array()) - curr_table = std::static_pointer_cast(b) - ->get() - .back() - .get(); - else - throw_parse_exception("Key " + full_ta_name - + " already exists as a value"); - } - } - else - { - // if this is the end of the table array name, add a new - // table array and a new table inside that array for us to - // add keys to next - if (it != end && *it == ']') - { - curr_table->insert(part, make_table_array()); - auto arr = std::static_pointer_cast( - curr_table->get(part)); - arr->get().push_back(make_table()); - curr_table = arr->get().back().get(); - } - // otherwise, create the implicitly defined table and move - // down to it - else - { - curr_table->insert(part, make_table()); - curr_table - = static_cast(curr_table->get(part).get()); - } - } - }; - - key_part_handler(parse_key(it, end, key_end, key_part_handler)); - - // consume the last "]]" - auto eat = make_consumer(it, end, [this]() { - throw_parse_exception("Unterminated table array name"); - }); - eat(']'); - eat(']'); - - consume_whitespace(it, end); - eol_or_comment(it, end); - } - - void parse_key_value(std::string::iterator& it, std::string::iterator& end, - table* curr_table) - { - auto key_end = [](char c) { return c == '='; }; - - auto key_part_handler = [&](const std::string& part) { - // two cases: this key part exists already, in which case it must - // be a table, or it doesn't exist in which case we must create - // an implicitly defined table - if (curr_table->contains(part)) - { - auto val = curr_table->get(part); - if (val->is_table()) - { - curr_table = static_cast(val.get()); - } - else - { - throw_parse_exception("Key " + part - + " already exists as a value"); - } - } - else - { - auto newtable = make_table(); - curr_table->insert(part, newtable); - curr_table = newtable.get(); - } - }; - - auto key = parse_key(it, end, key_end, key_part_handler); - - if (curr_table->contains(key)) - throw_parse_exception("Key " + key + " already present"); - if (it == end || *it != '=') - throw_parse_exception("Value must follow after a '='"); - ++it; - consume_whitespace(it, end); - curr_table->insert(key, parse_value(it, end)); - consume_whitespace(it, end); - } - - template - std::string - parse_key(std::string::iterator& it, const std::string::iterator& end, - KeyEndFinder&& key_end, KeyPartHandler&& key_part_handler) - { - // parse the key as a series of one or more simple-keys joined with '.' - while (it != end && !key_end(*it)) - { - auto part = parse_simple_key(it, end); - consume_whitespace(it, end); - - if (it == end || key_end(*it)) - { - return part; - } - - if (*it != '.') - { - std::string errmsg{"Unexpected character in key: "}; - errmsg += '"'; - errmsg += *it; - errmsg += '"'; - throw_parse_exception(errmsg); - } - - key_part_handler(part); - - // consume the dot - ++it; - } - - throw_parse_exception("Unexpected end of key"); - } - - std::string parse_simple_key(std::string::iterator& it, - const std::string::iterator& end) - { - consume_whitespace(it, end); - - if (it == end) - throw_parse_exception("Unexpected end of key (blank key?)"); - - if (*it == '"' || *it == '\'') - { - return string_literal(it, end, *it); - } - else - { - auto bke = std::find_if(it, end, [](char c) { - return c == '.' || c == '=' || c == ']'; - }); - return parse_bare_key(it, bke); - } - } - - std::string parse_bare_key(std::string::iterator& it, - const std::string::iterator& end) - { - if (it == end) - { - throw_parse_exception("Bare key missing name"); - } - - auto key_end = end; - --key_end; - consume_backwards_whitespace(key_end, it); - ++key_end; - std::string key{it, key_end}; - - if (std::find(it, key_end, '#') != key_end) - { - throw_parse_exception("Bare key " + key + " cannot contain #"); - } - - if (std::find_if(it, key_end, - [](char c) { return c == ' ' || c == '\t'; }) - != key_end) - { - throw_parse_exception("Bare key " + key - + " cannot contain whitespace"); - } - - if (std::find_if(it, key_end, - [](char c) { return c == '[' || c == ']'; }) - != key_end) - { - throw_parse_exception("Bare key " + key - + " cannot contain '[' or ']'"); - } - - it = end; - return key; - } - - enum class parse_type - { - STRING = 1, - LOCAL_TIME, - LOCAL_DATE, - LOCAL_DATETIME, - OFFSET_DATETIME, - INT, - FLOAT, - BOOL, - ARRAY, - INLINE_TABLE - }; - - std::shared_ptr parse_value(std::string::iterator& it, - std::string::iterator& end) - { - parse_type type = determine_value_type(it, end); - switch (type) - { - case parse_type::STRING: - return parse_string(it, end); - case parse_type::LOCAL_TIME: - return parse_time(it, end); - case parse_type::LOCAL_DATE: - case parse_type::LOCAL_DATETIME: - case parse_type::OFFSET_DATETIME: - return parse_date(it, end); - case parse_type::INT: - case parse_type::FLOAT: - return parse_number(it, end); - case parse_type::BOOL: - return parse_bool(it, end); - case parse_type::ARRAY: - return parse_array(it, end); - case parse_type::INLINE_TABLE: - return parse_inline_table(it, end); - default: - throw_parse_exception("Failed to parse value"); - } - } - - parse_type determine_value_type(const std::string::iterator& it, - const std::string::iterator& end) - { - if (it == end) - { - throw_parse_exception("Failed to parse value type"); - } - if (*it == '"' || *it == '\'') - { - return parse_type::STRING; - } - else if (is_time(it, end)) - { - return parse_type::LOCAL_TIME; - } - else if (auto dtype = date_type(it, end)) - { - return *dtype; - } - else if (is_number(*it) || *it == '-' || *it == '+' - || (*it == 'i' && it + 1 != end && it[1] == 'n' - && it + 2 != end && it[2] == 'f') - || (*it == 'n' && it + 1 != end && it[1] == 'a' - && it + 2 != end && it[2] == 'n')) - { - return determine_number_type(it, end); - } - else if (*it == 't' || *it == 'f') - { - return parse_type::BOOL; - } - else if (*it == '[') - { - return parse_type::ARRAY; - } - else if (*it == '{') - { - return parse_type::INLINE_TABLE; - } - throw_parse_exception("Failed to parse value type"); - } - - parse_type determine_number_type(const std::string::iterator& it, - const std::string::iterator& end) - { - // determine if we are an integer or a float - auto check_it = it; - if (*check_it == '-' || *check_it == '+') - ++check_it; - - if (check_it == end) - throw_parse_exception("Malformed number"); - - if (*check_it == 'i' || *check_it == 'n') - return parse_type::FLOAT; - - while (check_it != end && is_number(*check_it)) - ++check_it; - if (check_it != end && *check_it == '.') - { - ++check_it; - while (check_it != end && is_number(*check_it)) - ++check_it; - return parse_type::FLOAT; - } - else - { - return parse_type::INT; - } - } - - std::shared_ptr> parse_string(std::string::iterator& it, - std::string::iterator& end) - { - auto delim = *it; - assert(delim == '"' || delim == '\''); - - // end is non-const here because we have to be able to potentially - // parse multiple lines in a string, not just one - auto check_it = it; - ++check_it; - if (check_it != end && *check_it == delim) - { - ++check_it; - if (check_it != end && *check_it == delim) - { - it = ++check_it; - return parse_multiline_string(it, end, delim); - } - } - return make_value(string_literal(it, end, delim)); - } - - std::shared_ptr> - parse_multiline_string(std::string::iterator& it, - std::string::iterator& end, char delim) - { - std::stringstream ss; - - auto is_ws = [](char c) { return c == ' ' || c == '\t'; }; - - bool consuming = false; - std::shared_ptr> ret; - - auto handle_line = [&](std::string::iterator& local_it, - std::string::iterator& local_end) { - if (consuming) - { - local_it = std::find_if_not(local_it, local_end, is_ws); - - // whole line is whitespace - if (local_it == local_end) - return; - } - - consuming = false; - - while (local_it != local_end) - { - // handle escaped characters - if (delim == '"' && *local_it == '\\') - { - auto check = local_it; - // check if this is an actual escape sequence or a - // whitespace escaping backslash - ++check; - consume_whitespace(check, local_end); - if (check == local_end) - { - consuming = true; - break; - } - - ss << parse_escape_code(local_it, local_end); - continue; - } - - // if we can end the string - if (std::distance(local_it, local_end) >= 3) - { - auto check = local_it; - // check for """ - if (*check++ == delim && *check++ == delim - && *check++ == delim) - { - local_it = check; - ret = make_value(ss.str()); - break; - } - } - - ss << *local_it++; - } - }; - - // handle the remainder of the current line - handle_line(it, end); - if (ret) - return ret; - - // start eating lines - while (detail::getline(input_, line_)) - { - ++line_number_; - - it = line_.begin(); - end = line_.end(); - - handle_line(it, end); - - if (ret) - return ret; - - if (!consuming) - ss << std::endl; - } - - throw_parse_exception("Unterminated multi-line basic string"); - } - - std::string string_literal(std::string::iterator& it, - const std::string::iterator& end, char delim) - { - ++it; - std::string val; - while (it != end) - { - // handle escaped characters - if (delim == '"' && *it == '\\') - { - val += parse_escape_code(it, end); - } - else if (*it == delim) - { - ++it; - consume_whitespace(it, end); - return val; - } - else - { - val += *it++; - } - } - throw_parse_exception("Unterminated string literal"); - } - - std::string parse_escape_code(std::string::iterator& it, - const std::string::iterator& end) - { - ++it; - if (it == end) - throw_parse_exception("Invalid escape sequence"); - char value; - if (*it == 'b') - { - value = '\b'; - } - else if (*it == 't') - { - value = '\t'; - } - else if (*it == 'n') - { - value = '\n'; - } - else if (*it == 'f') - { - value = '\f'; - } - else if (*it == 'r') - { - value = '\r'; - } - else if (*it == '"') - { - value = '"'; - } - else if (*it == '\\') - { - value = '\\'; - } - else if (*it == 'u' || *it == 'U') - { - return parse_unicode(it, end); - } - else - { - throw_parse_exception("Invalid escape sequence"); - } - ++it; - return std::string(1, value); - } - - std::string parse_unicode(std::string::iterator& it, - const std::string::iterator& end) - { - bool large = *it++ == 'U'; - auto codepoint = parse_hex(it, end, large ? 0x10000000 : 0x1000); - - if ((codepoint > 0xd7ff && codepoint < 0xe000) || codepoint > 0x10ffff) - { - throw_parse_exception( - "Unicode escape sequence is not a Unicode scalar value"); - } - - std::string result; - // See Table 3-6 of the Unicode standard - if (codepoint <= 0x7f) - { - // 1-byte codepoints: 00000000 0xxxxxxx - // repr: 0xxxxxxx - result += static_cast(codepoint & 0x7f); - } - else if (codepoint <= 0x7ff) - { - // 2-byte codepoints: 00000yyy yyxxxxxx - // repr: 110yyyyy 10xxxxxx - // - // 0x1f = 00011111 - // 0xc0 = 11000000 - // - result += static_cast(0xc0 | ((codepoint >> 6) & 0x1f)); - // - // 0x80 = 10000000 - // 0x3f = 00111111 - // - result += static_cast(0x80 | (codepoint & 0x3f)); - } - else if (codepoint <= 0xffff) - { - // 3-byte codepoints: zzzzyyyy yyxxxxxx - // repr: 1110zzzz 10yyyyyy 10xxxxxx - // - // 0xe0 = 11100000 - // 0x0f = 00001111 - // - result += static_cast(0xe0 | ((codepoint >> 12) & 0x0f)); - result += static_cast(0x80 | ((codepoint >> 6) & 0x1f)); - result += static_cast(0x80 | (codepoint & 0x3f)); - } - else - { - // 4-byte codepoints: 000uuuuu zzzzyyyy yyxxxxxx - // repr: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx - // - // 0xf0 = 11110000 - // 0x07 = 00000111 - // - result += static_cast(0xf0 | ((codepoint >> 18) & 0x07)); - result += static_cast(0x80 | ((codepoint >> 12) & 0x3f)); - result += static_cast(0x80 | ((codepoint >> 6) & 0x3f)); - result += static_cast(0x80 | (codepoint & 0x3f)); - } - return result; - } - - uint32_t parse_hex(std::string::iterator& it, - const std::string::iterator& end, uint32_t place) - { - uint32_t value = 0; - while (place > 0) - { - if (it == end) - throw_parse_exception("Unexpected end of unicode sequence"); - - if (!is_hex(*it)) - throw_parse_exception("Invalid unicode escape sequence"); - - value += place * hex_to_digit(*it++); - place /= 16; - } - return value; - } - - uint32_t hex_to_digit(char c) - { - if (is_number(c)) - return static_cast(c - '0'); - return 10 - + static_cast(c - - ((c >= 'a' && c <= 'f') ? 'a' : 'A')); - } - - std::shared_ptr parse_number(std::string::iterator& it, - const std::string::iterator& end) - { - auto check_it = it; - auto check_end = find_end_of_number(it, end); - - auto eat_sign = [&]() { - if (check_it != end && (*check_it == '-' || *check_it == '+')) - ++check_it; - }; - - auto check_no_leading_zero = [&]() { - if (check_it != end && *check_it == '0' && check_it + 1 != check_end - && check_it[1] != '.') - { - throw_parse_exception("Numbers may not have leading zeros"); - } - }; - - auto eat_digits = [&](bool (*check_char)(char)) { - auto beg = check_it; - while (check_it != end && check_char(*check_it)) - { - ++check_it; - if (check_it != end && *check_it == '_') - { - ++check_it; - if (check_it == end || !check_char(*check_it)) - throw_parse_exception("Malformed number"); - } - } - - if (check_it == beg) - throw_parse_exception("Malformed number"); - }; - - auto eat_hex = [&]() { eat_digits(&is_hex); }; - - auto eat_numbers = [&]() { eat_digits(&is_number); }; - - if (check_it != end && *check_it == '0' && check_it + 1 != check_end - && (check_it[1] == 'x' || check_it[1] == 'o' || check_it[1] == 'b')) - { - ++check_it; - char base = *check_it; - ++check_it; - if (base == 'x') - { - eat_hex(); - return parse_int(it, check_it, 16); - } - else if (base == 'o') - { - auto start = check_it; - eat_numbers(); - auto val = parse_int(start, check_it, 8, "0"); - it = start; - return val; - } - else // if (base == 'b') - { - auto start = check_it; - eat_numbers(); - auto val = parse_int(start, check_it, 2); - it = start; - return val; - } - } - - eat_sign(); - check_no_leading_zero(); - - if (check_it != end && check_it + 1 != end && check_it + 2 != end) - { - if (check_it[0] == 'i' && check_it[1] == 'n' && check_it[2] == 'f') - { - auto val = std::numeric_limits::infinity(); - if (*it == '-') - val = -val; - it = check_it + 3; - return make_value(val); - } - else if (check_it[0] == 'n' && check_it[1] == 'a' - && check_it[2] == 'n') - { - auto val = std::numeric_limits::quiet_NaN(); - if (*it == '-') - val = -val; - it = check_it + 3; - return make_value(val); - } - } - - eat_numbers(); - - if (check_it != end - && (*check_it == '.' || *check_it == 'e' || *check_it == 'E')) - { - bool is_exp = *check_it == 'e' || *check_it == 'E'; - - ++check_it; - if (check_it == end) - throw_parse_exception("Floats must have trailing digits"); - - auto eat_exp = [&]() { - eat_sign(); - check_no_leading_zero(); - eat_numbers(); - }; - - if (is_exp) - eat_exp(); - else - eat_numbers(); - - if (!is_exp && check_it != end - && (*check_it == 'e' || *check_it == 'E')) - { - ++check_it; - eat_exp(); - } - - return parse_float(it, check_it); - } - else - { - return parse_int(it, check_it); - } - } - - std::shared_ptr> parse_int(std::string::iterator& it, - const std::string::iterator& end, - int base = 10, - const char* prefix = "") - { - std::string v{it, end}; - v = prefix + v; - v.erase(std::remove(v.begin(), v.end(), '_'), v.end()); - it = end; - try - { - return make_value(std::stoll(v, nullptr, base)); - } - catch (const std::invalid_argument& ex) - { - throw_parse_exception("Malformed number (invalid argument: " - + std::string{ex.what()} + ")"); - } - catch (const std::out_of_range& ex) - { - throw_parse_exception("Malformed number (out of range: " - + std::string{ex.what()} + ")"); - } - } - - std::shared_ptr> parse_float(std::string::iterator& it, - const std::string::iterator& end) - { - std::string v{it, end}; - v.erase(std::remove(v.begin(), v.end(), '_'), v.end()); - it = end; - char decimal_point = std::localeconv()->decimal_point[0]; - std::replace(v.begin(), v.end(), '.', decimal_point); - try - { - return make_value(std::stod(v)); - } - catch (const std::invalid_argument& ex) - { - throw_parse_exception("Malformed number (invalid argument: " - + std::string{ex.what()} + ")"); - } - catch (const std::out_of_range& ex) - { - throw_parse_exception("Malformed number (out of range: " - + std::string{ex.what()} + ")"); - } - } - - std::shared_ptr> parse_bool(std::string::iterator& it, - const std::string::iterator& end) - { - auto eat = make_consumer(it, end, [this]() { - throw_parse_exception("Attempted to parse invalid boolean value"); - }); - - if (*it == 't') - { - eat("true"); - return make_value(true); - } - else if (*it == 'f') - { - eat("false"); - return make_value(false); - } - - eat.error(); - return nullptr; - } - - std::string::iterator find_end_of_number(std::string::iterator it, - std::string::iterator end) - { - auto ret = std::find_if(it, end, [](char c) { - return !is_number(c) && c != '_' && c != '.' && c != 'e' && c != 'E' - && c != '-' && c != '+' && c != 'x' && c != 'o' && c != 'b'; - }); - if (ret != end && ret + 1 != end && ret + 2 != end) - { - if ((ret[0] == 'i' && ret[1] == 'n' && ret[2] == 'f') - || (ret[0] == 'n' && ret[1] == 'a' && ret[2] == 'n')) - { - ret = ret + 3; - } - } - return ret; - } - - std::string::iterator find_end_of_date(std::string::iterator it, - std::string::iterator end) - { - auto end_of_date = std::find_if(it, end, [](char c) { - return !is_number(c) && c != '-'; - }); - if (end_of_date != end && *end_of_date == ' ' && end_of_date + 1 != end - && is_number(end_of_date[1])) - end_of_date++; - return std::find_if(end_of_date, end, [](char c) { - return !is_number(c) && c != 'T' && c != 'Z' && c != ':' - && c != '-' && c != '+' && c != '.'; - }); - } - - std::string::iterator find_end_of_time(std::string::iterator it, - std::string::iterator end) - { - return std::find_if(it, end, [](char c) { - return !is_number(c) && c != ':' && c != '.'; - }); - } - - local_time read_time(std::string::iterator& it, - const std::string::iterator& end) - { - auto time_end = find_end_of_time(it, end); - - auto eat = make_consumer( - it, time_end, [&]() { throw_parse_exception("Malformed time"); }); - - local_time ltime; - - ltime.hour = eat.eat_digits(2); - eat(':'); - ltime.minute = eat.eat_digits(2); - eat(':'); - ltime.second = eat.eat_digits(2); - - int power = 100000; - if (it != time_end && *it == '.') - { - ++it; - while (it != time_end && is_number(*it)) - { - ltime.microsecond += power * (*it++ - '0'); - power /= 10; - } - } - - if (it != time_end) - throw_parse_exception("Malformed time"); - - return ltime; - } - - std::shared_ptr> - parse_time(std::string::iterator& it, const std::string::iterator& end) - { - return make_value(read_time(it, end)); - } - - std::shared_ptr parse_date(std::string::iterator& it, - const std::string::iterator& end) - { - auto date_end = find_end_of_date(it, end); - - auto eat = make_consumer( - it, date_end, [&]() { throw_parse_exception("Malformed date"); }); - - local_date ldate; - ldate.year = eat.eat_digits(4); - eat('-'); - ldate.month = eat.eat_digits(2); - eat('-'); - ldate.day = eat.eat_digits(2); - - if (it == date_end) - return make_value(ldate); - - eat.eat_or('T', ' '); - - local_datetime ldt; - static_cast(ldt) = ldate; - static_cast(ldt) = read_time(it, date_end); - - if (it == date_end) - return make_value(ldt); - - offset_datetime dt; - static_cast(dt) = ldt; - - int hoff = 0; - int moff = 0; - if (*it == '+' || *it == '-') - { - auto plus = *it == '+'; - ++it; - - hoff = eat.eat_digits(2); - dt.hour_offset = (plus) ? hoff : -hoff; - eat(':'); - moff = eat.eat_digits(2); - dt.minute_offset = (plus) ? moff : -moff; - } - else if (*it == 'Z') - { - ++it; - } - - if (it != date_end) - throw_parse_exception("Malformed date"); - - return make_value(dt); - } - - std::shared_ptr parse_array(std::string::iterator& it, - std::string::iterator& end) - { - // this gets ugly because of the "homogeneity" restriction: - // arrays can either be of only one type, or contain arrays - // (each of those arrays could be of different types, though) - // - // because of the latter portion, we don't really have a choice - // but to represent them as arrays of base values... - ++it; - - // ugh---have to read the first value to determine array type... - skip_whitespace_and_comments(it, end); - - // edge case---empty array - if (*it == ']') - { - ++it; - return make_array(); - } - - auto val_end = std::find_if( - it, end, [](char c) { return c == ',' || c == ']' || c == '#'; }); - parse_type type = determine_value_type(it, val_end); - switch (type) - { - case parse_type::STRING: - return parse_value_array(it, end); - case parse_type::LOCAL_TIME: - return parse_value_array(it, end); - case parse_type::LOCAL_DATE: - return parse_value_array(it, end); - case parse_type::LOCAL_DATETIME: - return parse_value_array(it, end); - case parse_type::OFFSET_DATETIME: - return parse_value_array(it, end); - case parse_type::INT: - return parse_value_array(it, end); - case parse_type::FLOAT: - return parse_value_array(it, end); - case parse_type::BOOL: - return parse_value_array(it, end); - case parse_type::ARRAY: - return parse_object_array(&parser::parse_array, '[', it, - end); - case parse_type::INLINE_TABLE: - return parse_object_array( - &parser::parse_inline_table, '{', it, end); - default: - throw_parse_exception("Unable to parse array"); - } - } - - template - std::shared_ptr parse_value_array(std::string::iterator& it, - std::string::iterator& end) - { - auto arr = make_array(); - while (it != end && *it != ']') - { - auto val = parse_value(it, end); - if (auto v = val->as()) - arr->get().push_back(val); - else - throw_parse_exception("Arrays must be homogeneous"); - skip_whitespace_and_comments(it, end); - if (*it != ',') - break; - ++it; - skip_whitespace_and_comments(it, end); - } - if (it != end) - ++it; - return arr; - } - - template - std::shared_ptr parse_object_array(Function&& fun, char delim, - std::string::iterator& it, - std::string::iterator& end) - { - auto arr = detail::make_element(); - - while (it != end && *it != ']') - { - if (*it != delim) - throw_parse_exception("Unexpected character in array"); - - arr->get().push_back(((*this).*fun)(it, end)); - skip_whitespace_and_comments(it, end); - - if (it == end || *it != ',') - break; - - ++it; - skip_whitespace_and_comments(it, end); - } - - if (it == end || *it != ']') - throw_parse_exception("Unterminated array"); - - ++it; - return arr; - } - - std::shared_ptr
parse_inline_table(std::string::iterator& it, - std::string::iterator& end) - { - auto tbl = make_table(); - do - { - ++it; - if (it == end) - throw_parse_exception("Unterminated inline table"); - - consume_whitespace(it, end); - if (it != end && *it != '}') - { - parse_key_value(it, end, tbl.get()); - consume_whitespace(it, end); - } - } while (*it == ','); - - if (it == end || *it != '}') - throw_parse_exception("Unterminated inline table"); - - ++it; - consume_whitespace(it, end); - - return tbl; - } - - void skip_whitespace_and_comments(std::string::iterator& start, - std::string::iterator& end) - { - consume_whitespace(start, end); - while (start == end || *start == '#') - { - if (!detail::getline(input_, line_)) - throw_parse_exception("Unclosed array"); - line_number_++; - start = line_.begin(); - end = line_.end(); - consume_whitespace(start, end); - } - } - - void consume_whitespace(std::string::iterator& it, - const std::string::iterator& end) - { - while (it != end && (*it == ' ' || *it == '\t')) - ++it; - } - - void consume_backwards_whitespace(std::string::iterator& back, - const std::string::iterator& front) - { - while (back != front && (*back == ' ' || *back == '\t')) - --back; - } - - void eol_or_comment(const std::string::iterator& it, - const std::string::iterator& end) - { - if (it != end && *it != '#') - throw_parse_exception("Unidentified trailing character '" - + std::string{*it} - + "'---did you forget a '#'?"); - } - - bool is_time(const std::string::iterator& it, - const std::string::iterator& end) - { - auto time_end = find_end_of_time(it, end); - auto len = std::distance(it, time_end); - - if (len < 8) - return false; - - if (it[2] != ':' || it[5] != ':') - return false; - - if (len > 8) - return it[8] == '.' && len > 9; - - return true; - } - - option date_type(const std::string::iterator& it, - const std::string::iterator& end) - { - auto date_end = find_end_of_date(it, end); - auto len = std::distance(it, date_end); - - if (len < 10) - return {}; - - if (it[4] != '-' || it[7] != '-') - return {}; - - if (len >= 19 && (it[10] == 'T' || it[10] == ' ') - && is_time(it + 11, date_end)) - { - // datetime type - auto time_end = find_end_of_time(it + 11, date_end); - if (time_end == date_end) - return {parse_type::LOCAL_DATETIME}; - else - return {parse_type::OFFSET_DATETIME}; - } - else if (len == 10) - { - // just a regular date - return {parse_type::LOCAL_DATE}; - } - - return {}; - } - - std::istream& input_; - std::string line_; - std::size_t line_number_ = 0; -}; - -/** - * Utility function to parse a file as a TOML file. Returns the root table. - * Throws a parse_exception if the file cannot be opened. - */ -inline std::shared_ptr
parse_file(const std::string& filename) -{ -#if defined(BOOST_NOWIDE_FSTREAM_INCLUDED_HPP) - boost::nowide::ifstream file{filename.c_str()}; -#elif defined(NOWIDE_FSTREAM_INCLUDED_HPP) - nowide::ifstream file{filename.c_str()}; -#else - std::ifstream file{filename}; -#endif - if (!file.is_open()) - throw parse_exception{filename + " could not be opened for parsing"}; - parser p{file}; - return p.parse(); -} - -template -struct value_accept; - -template <> -struct value_accept<> -{ - template - static void accept(const base&, Visitor&&, Args&&...) - { - // nothing - } -}; - -template -struct value_accept -{ - template - static void accept(const base& b, Visitor&& visitor, Args&&... args) - { - if (auto v = b.as()) - { - visitor.visit(*v, std::forward(args)...); - } - else - { - value_accept::accept(b, std::forward(visitor), - std::forward(args)...); - } - } -}; - -/** - * base implementation of accept() that calls visitor.visit() on the concrete - * class. - */ -template -void base::accept(Visitor&& visitor, Args&&... args) const -{ - if (is_value()) - { - using value_acceptor - = value_accept; - value_acceptor::accept(*this, std::forward(visitor), - std::forward(args)...); - } - else if (is_table()) - { - visitor.visit(static_cast(*this), - std::forward(args)...); - } - else if (is_array()) - { - visitor.visit(static_cast(*this), - std::forward(args)...); - } - else if (is_table_array()) - { - visitor.visit(static_cast(*this), - std::forward(args)...); - } -} - -/** - * Writer that can be passed to accept() functions of cpptoml objects and - * will output valid TOML to a stream. - */ -class toml_writer -{ - public: - /** - * Construct a toml_writer that will write to the given stream - */ - toml_writer(std::ostream& s, const std::string& indent_space = "\t") - : stream_(s), indent_(indent_space), has_naked_endline_(false) - { - // nothing - } - - public: - /** - * Output a base value of the TOML tree. - */ - template - void visit(const value& v, bool = false) - { - write(v); - } - - /** - * Output a table element of the TOML tree - */ - void visit(const table& t, bool in_array = false) - { - write_table_header(in_array); - std::vector values; - std::vector tables; - - for (const auto& i : t) - { - if (i.second->is_table() || i.second->is_table_array()) - { - tables.push_back(i.first); - } - else - { - values.push_back(i.first); - } - } - - for (unsigned int i = 0; i < values.size(); ++i) - { - path_.push_back(values[i]); - - if (i > 0) - endline(); - - write_table_item_header(*t.get(values[i])); - t.get(values[i])->accept(*this, false); - path_.pop_back(); - } - - for (unsigned int i = 0; i < tables.size(); ++i) - { - path_.push_back(tables[i]); - - if (values.size() > 0 || i > 0) - endline(); - - write_table_item_header(*t.get(tables[i])); - t.get(tables[i])->accept(*this, false); - path_.pop_back(); - } - - endline(); - } - - /** - * Output an array element of the TOML tree - */ - void visit(const array& a, bool = false) - { - write("["); - - for (unsigned int i = 0; i < a.get().size(); ++i) - { - if (i > 0) - write(", "); - - if (a.get()[i]->is_array()) - { - a.get()[i]->as_array()->accept(*this, true); - } - else - { - a.get()[i]->accept(*this, true); - } - } - - write("]"); - } - - /** - * Output a table_array element of the TOML tree - */ - void visit(const table_array& t, bool = false) - { - for (unsigned int j = 0; j < t.get().size(); ++j) - { - if (j > 0) - endline(); - - t.get()[j]->accept(*this, true); - } - - endline(); - } - - /** - * Escape a string for output. - */ - static std::string escape_string(const std::string& str) - { - std::string res; - for (auto it = str.begin(); it != str.end(); ++it) - { - if (*it == '\b') - { - res += "\\b"; - } - else if (*it == '\t') - { - res += "\\t"; - } - else if (*it == '\n') - { - res += "\\n"; - } - else if (*it == '\f') - { - res += "\\f"; - } - else if (*it == '\r') - { - res += "\\r"; - } - else if (*it == '"') - { - res += "\\\""; - } - else if (*it == '\\') - { - res += "\\\\"; - } - else if (static_cast(*it) <= UINT32_C(0x001f)) - { - res += "\\u"; - std::stringstream ss; - ss << std::hex << static_cast(*it); - res += ss.str(); - } - else - { - res += *it; - } - } - return res; - } - - protected: - /** - * Write out a string. - */ - void write(const value& v) - { - write("\""); - write(escape_string(v.get())); - write("\""); - } - - /** - * Write out a double. - */ - void write(const value& v) - { - std::stringstream ss; - ss << std::showpoint - << std::setprecision(std::numeric_limits::max_digits10) - << v.get(); - - auto double_str = ss.str(); - auto pos = double_str.find("e0"); - if (pos != std::string::npos) - double_str.replace(pos, 2, "e"); - pos = double_str.find("e-0"); - if (pos != std::string::npos) - double_str.replace(pos, 3, "e-"); - - stream_ << double_str; - has_naked_endline_ = false; - } - - /** - * Write out an integer, local_date, local_time, local_datetime, or - * offset_datetime. - */ - template - typename std::enable_if< - is_one_of::value>::type - write(const value& v) - { - write(v.get()); - } - - /** - * Write out a boolean. - */ - void write(const value& v) - { - write((v.get() ? "true" : "false")); - } - - /** - * Write out the header of a table. - */ - void write_table_header(bool in_array = false) - { - if (!path_.empty()) - { - indent(); - - write("["); - - if (in_array) - { - write("["); - } - - for (unsigned int i = 0; i < path_.size(); ++i) - { - if (i > 0) - { - write("."); - } - - if (path_[i].find_first_not_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcde" - "fghijklmnopqrstuvwxyz0123456789" - "_-") - == std::string::npos) - { - write(path_[i]); - } - else - { - write("\""); - write(escape_string(path_[i])); - write("\""); - } - } - - if (in_array) - { - write("]"); - } - - write("]"); - endline(); - } - } - - /** - * Write out the identifier for an item in a table. - */ - void write_table_item_header(const base& b) - { - if (!b.is_table() && !b.is_table_array()) - { - indent(); - - if (path_.back().find_first_not_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcde" - "fghijklmnopqrstuvwxyz0123456789" - "_-") - == std::string::npos) - { - write(path_.back()); - } - else - { - write("\""); - write(escape_string(path_.back())); - write("\""); - } - - write(" = "); - } - } - - private: - /** - * Indent the proper number of tabs given the size of - * the path. - */ - void indent() - { - for (std::size_t i = 1; i < path_.size(); ++i) - write(indent_); - } - - /** - * Write a value out to the stream. - */ - template - void write(const T& v) - { - stream_ << v; - has_naked_endline_ = false; - } - - /** - * Write an endline out to the stream - */ - void endline() - { - if (!has_naked_endline_) - { - stream_ << "\n"; - has_naked_endline_ = true; - } - } - - private: - std::ostream& stream_; - const std::string indent_; - std::vector path_; - bool has_naked_endline_; -}; - -inline std::ostream& operator<<(std::ostream& stream, const base& b) -{ - toml_writer writer{stream}; - b.accept(writer); - return stream; -} - -template -std::ostream& operator<<(std::ostream& stream, const value& v) -{ - toml_writer writer{stream}; - v.accept(writer); - return stream; -} - -inline std::ostream& operator<<(std::ostream& stream, const table& t) -{ - toml_writer writer{stream}; - t.accept(writer); - return stream; -} - -inline std::ostream& operator<<(std::ostream& stream, const table_array& t) -{ - toml_writer writer{stream}; - t.accept(writer); - return stream; -} - -inline std::ostream& operator<<(std::ostream& stream, const array& a) -{ - toml_writer writer{stream}; - a.accept(writer); - return stream; -} -} // namespace cpptoml -#endif // CPPTOML_H diff --git a/src/long-to-linked-pe.cpp b/src/long-to-linked-pe.cpp index 5ed531e..30809ee 100644 --- a/src/long-to-linked-pe.cpp +++ b/src/long-to-linked-pe.cpp @@ -184,9 +184,8 @@ main(int argc, char* argv[]) for (auto& infile : infiles) { unsigned flags = 0; - flags |= btllib::SeqReader::Flag::NO_FOLD_CASE; // skip time intensive checking - flags |= btllib::SeqReader::Flag::NO_TRIM_MASKED; - btllib::SeqReader reader(infile, flags, t - 1, 4, 1); + flags |= btllib::SeqReader::Flag::LONG_MODE; + btllib::SeqReader reader(infile, flags, t); btllib::SeqReader::Record record; while ((record = reader.read())) { size_t step = l * 2; @@ -224,7 +223,7 @@ main(int argc, char* argv[]) int read_num = 1; for (size_t i = 0; i <= seq_size - step; i = i + step) { - std::cout << header_symbol << record.name << "_f" << read_num + std::cout << header_symbol << record.id << "_f" << read_num << " BX:Z:" << record.num + 1 << '\n'; std::cout << seq.substr(i, l) << '\n'; if (!with_fasta) { @@ -237,7 +236,7 @@ main(int argc, char* argv[]) } std::string reverse_linked_read = seq.substr(i + l, l); btllib::reverse_complement(reverse_linked_read); - std::cout << header_symbol << record.name << "_f" << read_num + std::cout << header_symbol << record.id << "_f" << read_num << " BX:Z:" << record.num + 1 << '\n'; std::cout << reverse_linked_read << '\n'; if (!with_fasta) { @@ -256,7 +255,7 @@ main(int argc, char* argv[]) size_t remainder = seq_size % step; if (remainder != 0) { size_t curr_i = seq_size - remainder; - std::cout << header_symbol << record.name << "_f" << read_num + std::cout << header_symbol << record.id << "_f" << read_num << " BX:Z:" << record.num + 1 << '\n'; std::string forward_linked_read = seq.substr(curr_i, l); std::cout << forward_linked_read << '\n'; @@ -271,7 +270,7 @@ main(int argc, char* argv[]) std::string reverse_linked_read = seq.substr( seq_size - forward_linked_read.size(), forward_linked_read.size()); btllib::reverse_complement(reverse_linked_read); - std::cout << header_symbol << record.name << "_f" << read_num + std::cout << header_symbol << record.id << "_f" << read_num << " BX:Z:" << record.num + 1 << '\n'; std::cout << reverse_linked_read << '\n'; if (!with_fasta) {