From dcba88e9881c56585123a7665956d3f80421fab2 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 30 Oct 2024 22:37:16 +0300 Subject: [PATCH] Update GFA reader to support GFA v1.2 paths --- src/common/io/graph/CMakeLists.txt | 2 +- src/common/io/graph/cigar.cpp | 33 +++++ src/common/io/graph/cigar.hpp | 97 ++++++++++++++ src/common/io/graph/cigar.inl | 123 +++++++++++++++++ src/common/io/graph/gfa.cpp | 206 ++++++++++++++--------------- src/common/io/graph/gfa.hpp | 190 ++++++++++++++------------ src/common/io/graph/gfa_reader.cpp | 2 +- src/common/io/graph/gfa_reader.hpp | 1 - 8 files changed, 462 insertions(+), 192 deletions(-) create mode 100644 src/common/io/graph/cigar.cpp create mode 100644 src/common/io/graph/cigar.hpp create mode 100644 src/common/io/graph/cigar.inl diff --git a/src/common/io/graph/CMakeLists.txt b/src/common/io/graph/CMakeLists.txt index f3d9cfdc0..edb0029ad 100644 --- a/src/common/io/graph/CMakeLists.txt +++ b/src/common/io/graph/CMakeLists.txt @@ -10,6 +10,6 @@ project(graphio CXX) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_library(graphio STATIC - gfa.cpp gfa_reader.cpp gfa_writer.cpp + gfa.cpp cigar.cpp gfa_reader.cpp gfa_writer.cpp fastg_writer.cpp) target_link_libraries(graphio foonathan::lexy zlibstatic) diff --git a/src/common/io/graph/cigar.cpp b/src/common/io/graph/cigar.cpp new file mode 100644 index 000000000..55230ba3e --- /dev/null +++ b/src/common/io/graph/cigar.cpp @@ -0,0 +1,33 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include // lexy::parse +#include +#include + +#include "cigar.hpp" + +#include "cigar.inl" + +namespace cigar { +std::ostream &operator<<(std::ostream &s, const tag &t) { + s << t.name[0] << t.name[1] << ':'; + return std::visit([&](const auto& value) -> std::ostream& { return s << value; }, t.val); +} + +std::optional parseTag(const char* line, size_t len) { + lexy::visualization_options opts; + opts.max_lexeme_width = 35; + + auto result = lexy::parse(lexy::string_input(line, len), lexy_ext::report_error.opts(opts)); + if (result.has_value()) + return std::make_optional(result.value()); + + return {}; +} + + +} diff --git a/src/common/io/graph/cigar.hpp b/src/common/io/graph/cigar.hpp new file mode 100644 index 000000000..8ba1f4bcb --- /dev/null +++ b/src/common/io/graph/cigar.hpp @@ -0,0 +1,97 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cigar { + struct tag { + char name[2]; + char type; + std::variant val; + + template + tag(std::string_view n, std::string_view t, T v) + : name{n[0], n[1]}, type(t.front()), val(std::move(v)) {} + + friend std::ostream &operator<<(std::ostream &s, const tag &t); + + void print() const { + fprintf(stdout, "%c%c", name[0], name[1]); + fputs(":", stdout); + std::visit([&](const auto& value) { _print(value); }, val); + } + + private: + void _print(int64_t i) const { + std::fprintf(stdout, "%c:%" PRId64, type, i); + } + + void _print(const std::string &str) const { + std::fprintf(stdout, "%c:%s", type, str.c_str()); + } + + void _print(float f) const { + std::fprintf(stdout, "%c:%g", type, f); + } + }; + + struct cigarop { + uint32_t count : 24; + char op : 8; + + void print() const { + std::fprintf(stdout, "%u%c", count, op); + } + }; + + using cigar_string = std::vector; + + static inline std::optional + getTag(const char *name, + const std::vector &tags) { + auto res = std::find_if(tags.begin(), tags.end(), + [=](const tag &tag) { + return (tag.name[0] == name[0] && + tag.name[1] == name[1]); + }); + if (res == tags.end()) + return {}; + + return *res; + } + + template + std::optional getTag(const char *name, + const std::vector &tags) { + auto res = std::find_if(tags.begin(), tags.end(), + [=](const tag &tag) { + return (tag.name[0] == name[0] && + tag.name[1] == name[1]); + }); + if (res == tags.end()) + return {}; + + if (!std::holds_alternative(res->val)) + return {}; + + return std::get(res->val); + } + + std::optional parseTag(const char* line, size_t len); +} diff --git a/src/common/io/graph/cigar.inl b/src/common/io/graph/cigar.inl new file mode 100644 index 000000000..d5e12c386 --- /dev/null +++ b/src/common/io/graph/cigar.inl @@ -0,0 +1,123 @@ +//*************************************************************************** +//* Copyright (c) 2023-2024 SPAdes team +//* All Rights Reserved +//* See file LICENSE for details. +//*************************************************************************** + +#include // lexy::dsl::* +#include // lexy callbacks +#include + +#include + +namespace cigar::grammar { + namespace dsl = lexy::dsl; + + struct tag { + struct tag_character : lexy::token_production { + static constexpr auto rule = dsl::capture(dsl::ascii::alpha_digit); + static constexpr auto value = lexy::as_string; + }; + + struct tag_integer : lexy::token_production { + static constexpr auto rule = + dsl::minus_sign + dsl::integer(dsl::digits<>.no_leading_zero()); + static constexpr auto value = lexy::as_integer; + }; + + struct tag_string : lexy::token_production { + static constexpr auto rule = dsl::identifier(dsl::ascii::print); + static constexpr auto value = lexy::as_string; + }; + + struct tag_float : lexy::token_production { + static constexpr auto rule = [] { + auto integer = dsl::if_(dsl::lit_c < '-' > ) + dsl::digits<>.no_leading_zero(); + auto fraction = dsl::lit_c < '.' > >> dsl::digits<>; + auto exp_char = dsl::lit_c < 'e' > | dsl::lit_c<'E'>; + auto exponent = exp_char >> (dsl::lit_c < '+' > | dsl::lit_c < '-' > ) + dsl::digits<>; + return dsl::peek(dsl::lit_c < '-' > / dsl::digit<>) >> + dsl::position + + integer + + dsl::if_(fraction) + + dsl::if_(exponent) + + dsl::position; + }(); + + static constexpr auto value = lexy::callback( + // std::from_chars(const char*, const char*, float) is only + // available starting from libc++ from LLVM 14 :( + [](const char *first, const char *) { return ::atof(first); } + ); + }; + + struct tag_name : lexy::token_production { + static constexpr auto name = "tag name"; + + static constexpr auto rule = dsl::capture(dsl::token(dsl::ascii::alpha + dsl::ascii::alpha_digit)); + static constexpr auto value = lexy::as_string; + }; + + struct invalid_tag_type { + static constexpr auto name = "invalid tag type"; + }; + + static constexpr auto rule = [] { + auto colon = dsl::lit_c<':'>; + return dsl::p >> colon + + ( + dsl::capture(LEXY_LIT("A")) >> colon + dsl::p < tag_character > | + dsl::capture(LEXY_LIT("i")) >> colon + dsl::p < tag_integer > | + dsl::capture(LEXY_LIT("f")) >> colon + dsl::p < tag_float > | + dsl::capture(LEXY_LIT("Z")) >> colon + dsl::p < tag_string > | + dsl::capture(LEXY_LIT("J")) >> colon + dsl::p < tag_string > | + dsl::capture(LEXY_LIT("H")) >> colon + dsl::p < tag_string > | + dsl::capture(LEXY_LIT("B")) >> colon + dsl::p < tag_string > | + dsl::error + ); + }(); + + static constexpr auto value = lexy::callback( + [](std::string_view name, auto type, auto val) { + return cigar::tag{name, std::string_view{type.data(), type.size()}, val}; + }); + }; + + struct cigar_string { + static constexpr auto name = "CIGAR string"; + + static constexpr auto cigaropcode = + LEXY_CHAR_CLASS("CIGAR opcode", + LEXY_LIT("M") / LEXY_LIT("I") / LEXY_LIT("D") / + LEXY_LIT("N") / LEXY_LIT("S") / LEXY_LIT("H") / + LEXY_LIT("P") / LEXY_LIT("X") / LEXY_LIT("=")) / LEXY_LIT("J"); + + struct cigarop : lexy::transparent_production { + static constexpr auto name = "CIGAR operation"; + + static constexpr auto rule = + dsl::period | + dsl::integer >> dsl::capture(cigaropcode); + static constexpr auto value = lexy::callback( + []() { return cigar::cigarop{0, 0}; }, + [](std::uint32_t cnt, auto lexeme) { + return cigar::cigarop{cnt, lexeme[0]}; + }); + }; + + static constexpr auto rule = dsl::list(dsl::p); + static constexpr auto value = lexy::as_list>; + }; + + static constexpr auto tab = dsl::lit_c<'\t'>; + + struct opt_tags { + static constexpr auto name = "tags"; + + static constexpr auto rule = [] { + auto tags = dsl::list(dsl::p, dsl::trailing_sep(tab)); + return dsl::eof | (tab >> tags + dsl::eof); + }(); + static constexpr auto value = lexy::as_list>; + }; +} diff --git a/src/common/io/graph/gfa.cpp b/src/common/io/graph/gfa.cpp index 728862ce9..850e2497b 100644 --- a/src/common/io/graph/gfa.cpp +++ b/src/common/io/graph/gfa.cpp @@ -5,6 +5,7 @@ //*************************************************************************** #include "gfa.hpp" +#include "cigar.hpp" #include // lexy::parse #include // lexy::trace @@ -16,86 +17,20 @@ #include #include -namespace gfa { +#include "cigar.inl" -std::ostream &operator<<(std::ostream &s, const tag &t) { - s << t.name[0] << t.name[1] << ':'; - return std::visit([&](const auto& value) -> std::ostream& { return s << value; }, t.val); -} +namespace gfa { namespace grammar { namespace dsl = lexy::dsl; -struct tag { - struct tag_character : lexy::token_production { - static constexpr auto rule = dsl::capture(dsl::ascii::alpha_digit); - static constexpr auto value = lexy::as_string; - }; - - struct tag_integer : lexy::token_production { - static constexpr auto rule = - dsl::minus_sign + dsl::integer(dsl::digits<>.no_leading_zero()); - static constexpr auto value = lexy::as_integer; - }; - - struct tag_string : lexy::token_production { - static constexpr auto rule = dsl::identifier(dsl::ascii::print); - static constexpr auto value = lexy::as_string; - }; - - struct tag_float : lexy::token_production { - static constexpr auto rule = [] { - auto integer = dsl::if_(dsl::lit_c<'-'>) + dsl::digits<>.no_leading_zero(); - auto fraction = dsl::lit_c<'.'> >> dsl::digits<>; - auto exp_char = dsl::lit_c<'e'> | dsl::lit_c<'E'>; - auto exponent = exp_char >> (dsl::lit_c<'+'> | dsl::lit_c<'-'>) + dsl::digits<>; - return dsl::peek(dsl::lit_c<'-'> / dsl::digit<>) >> - dsl::position + - integer + - dsl::if_(fraction) + - dsl::if_(exponent) + - dsl::position; - }(); - - static constexpr auto value = lexy::callback( - // std::from_chars(const char*, const char*, float) is only - // available starting from libc++ from LLVM 14 :( - [](const char *first, const char *) { return ::atof(first); } - ); - }; - - struct tag_name : lexy::token_production { - static constexpr auto rule = dsl::capture(dsl::token(dsl::ascii::alpha + dsl::ascii::alpha_digit)); - static constexpr auto value = lexy::as_string; - }; - - static constexpr auto rule = [] { - auto colon = dsl::lit_c<':'>; - return dsl::p + colon + - ( - dsl::capture(LEXY_LIT("A")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("i")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("f")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("Z")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("J")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("H")) >> colon + dsl::p | - dsl::capture(LEXY_LIT("B")) >> colon + dsl::p - ); - }(); - - static constexpr auto value = lexy::callback( - [](std::string_view name, auto type, auto val) { - return gfa::tag{name, std::string_view{type.data(), type.size()}, val}; - }); -}; - -auto tab = dsl::lit_c<'\t'>; +using cigar::grammar::tab; struct segment_name { static constexpr auto name = "segment name"; static constexpr auto rule = dsl::identifier(dsl::ascii::graph - LEXY_LIT("=") - LEXY_LIT("*") - LEXY_LIT(",") - LEXY_LIT(";"), - dsl::ascii::graph - LEXY_LIT(",") - LEXY_LIT(";")); + dsl::ascii::graph - LEXY_LIT(",") - LEXY_LIT(";") - LEXY_LIT("<") - LEXY_LIT(">")); static constexpr auto value = lexy::as_string; }; @@ -105,46 +40,24 @@ struct segment_orientation { static constexpr auto value = lexy::as_string; }; +struct segment_distance { + static constexpr auto rule = + dsl::sign + dsl::integer(dsl::digits<>.no_leading_zero()); + static constexpr auto value = lexy::as_integer; +}; + struct oriented_segment { // Apparently we cannot use segment_name + segment_orientation as GFA grammar is context-dependent // Parse as full segment name and deal with possible invalid input later // static constexpr auto rule = dsl::capture(dsl::token(dsl::p + dsl::p)); static constexpr auto rule = dsl::identifier(dsl::ascii::graph - LEXY_LIT("=") - LEXY_LIT("*") - LEXY_LIT(",") - LEXY_LIT(";"), - dsl::ascii::graph - LEXY_LIT(",") - LEXY_LIT(";")); + dsl::ascii::graph - LEXY_LIT(",") - LEXY_LIT(";") - LEXY_LIT("<") - LEXY_LIT(">")); static constexpr auto value = lexy::as_string; }; -struct cigar_string { - static constexpr auto name = "CIGAR string"; - - static constexpr auto cigaropcode = - LEXY_CHAR_CLASS("CIGAR opcode", - LEXY_LIT("M") / LEXY_LIT("I") / LEXY_LIT("D") / - LEXY_LIT("N") / LEXY_LIT("S") / LEXY_LIT("H") / - LEXY_LIT("P") / LEXY_LIT("X") / LEXY_LIT("=")); - - struct cigarop : lexy::transparent_production { - static constexpr auto name = "CIGAR operation"; - - static constexpr auto rule = dsl::integer >> dsl::capture(cigaropcode); - static constexpr auto value = lexy::callback( - [](std::uint32_t cnt, auto lexeme) { - return gfa::cigarop{cnt, lexeme[0]}; - }); - }; - - static constexpr auto rule = dsl::list(dsl::p); - static constexpr auto value = lexy::as_list>; -}; - -struct opt_tags { - static constexpr auto rule = [] { - auto tags = dsl::list(dsl::p, dsl::sep(tab)); - return dsl::eof | (tab >> tags + dsl::eof); - }(); - static constexpr auto value = lexy::as_list>; -}; +using cigar::grammar::cigar_string; +using cigar::grammar::opt_tags; // Header // ====== @@ -200,6 +113,30 @@ struct link { static constexpr auto value = lexy::construct; }; +// Jump line +// ============= +// Required fields: +// Column Field Type Regexp Description +// 1 RecordType Character J Record type +// 2 From String [!-)+-<>-~][!-~]* Name of segment +// 3 FromOrient String +|- Orientation of From segment +// 4 To String [!-)+-<>-~][!-~]* Name of segment +// 5 ToOrient String +|- Orientation of To segment +// 6 Distance String \*|[-+]?[0-9]+ Optional estimated distance between the segments +struct gaplink { + static constexpr auto name = "GFA jump line"; + + static constexpr auto rule = + LEXY_LIT("J") >> + tab + dsl::p + + tab + dsl::p + + tab + dsl::p + + tab + dsl::p + + tab + (LEXY_LIT("*") | (dsl::else_ >> dsl::p)) + + dsl::p; + static constexpr auto value = lexy::construct; +}; + // Path line // ========= // Required fields @@ -212,7 +149,7 @@ struct path { static constexpr auto name = "GFA path record"; struct segments { - static constexpr auto rule = dsl::list(dsl::p, dsl::sep(dsl::comma)); + static constexpr auto rule = dsl::list(dsl::p, dsl::sep(dsl::comma | dsl::semicolon)); static constexpr auto value = lexy::as_list>; }; @@ -230,6 +167,60 @@ struct path { static constexpr auto value = lexy::construct; }; +// Walk line +// ========= +// Required fields +// Column Field Type Regexp Description +// 1 RecordType Character W Record type +// 2 SampleId String [!-)+-<>-~][!-~]* Sample identifier +// 3 HapIndex Integer [0-9]+ Haplotype index +// 4 SeqId String [!-)+-<>-~][!-~]* Sequence identifier +// 5 SeqStart Integer \*\|[0-9]+ Optional Start position +// 6 SeqEnd Integer \*\|[0-9]+ Optional End position (BED-like half-close-half-open) +// 7 Walk String ([><][!-;=?-~]+)+ Walk +struct walk { + static constexpr auto name = "GFA walk record"; + + struct sample_id : public segment_name { + static constexpr auto name = "sample id"; + }; + + struct sequence_id : public segment_name { + static constexpr auto name = "sequence id"; + }; + + + struct oriented_wsegment { + static constexpr auto rule = dsl::capture(dsl::token(LEXY_ASCII_ONE_OF("<>") + dsl::p)); + static constexpr auto value = lexy::as_string; + }; + + struct wsegments { + static constexpr auto rule = dsl::list(dsl::p); + static constexpr auto value = lexy::as_list>; + }; + + struct opt_uint64_t { + static constexpr auto rule = LEXY_LIT("*") | dsl::integer; + static constexpr auto value = + lexy::callback( + [](uint64_t val) { return val; }, + []() { return std::optional(); }); + }; + + static constexpr auto rule = + LEXY_LIT("W") >> + tab + dsl::p + // SampleId + tab + dsl::integer + // HapIndex + tab + dsl::p + // SeqId + tab + dsl::p + // SeqStart + tab + dsl::p + // SeqEnd + tab + dsl::p + // Walk + dsl::p; + static constexpr auto value = lexy::construct; +}; + + struct record { static constexpr auto name = "GFA record"; @@ -245,7 +236,9 @@ struct record { return dsl::p
| dsl::p | dsl::p | + dsl::p | dsl::p | + dsl::p | comment | // Explicitly ignore all other records (though require proper tab-delimited format) dsl::ascii::alpha >> tab + dsl::until(dsl::newline).or_eof() | @@ -258,10 +251,13 @@ struct record { }; // namespace grammar -std::optional parse_record(const char* line, size_t len) { - auto result = lexy::parse(lexy::string_input(line, len), lexy_ext::report_error); +std::optional parseRecord(const char* line, size_t len) { + lexy::visualization_options opts; + opts.max_lexeme_width = 35; + + auto result = lexy::parse(lexy::string_input(line, len), lexy_ext::report_error.opts(opts)); if (result.has_value()) - return std::make_optional(std::move(result.value())); + return std::make_optional(result.value()); return {}; } diff --git a/src/common/io/graph/gfa.hpp b/src/common/io/graph/gfa.hpp index fc5036948..4878684ed 100644 --- a/src/common/io/graph/gfa.hpp +++ b/src/common/io/graph/gfa.hpp @@ -1,10 +1,13 @@ //*************************************************************************** //* Copyright (c) 2023-2024 SPAdes team -//* Copyright (c) 2022 Saint Petersburg State University //* All Rights Reserved //* See file LICENSE for details. //*************************************************************************** +#pragma once + +#include "cigar.hpp" + #include #include #include @@ -12,6 +15,7 @@ #include #include #include +#include #include #include @@ -19,42 +23,15 @@ #include namespace gfa { -struct tag { - char name[2]; - char type; - std::variant val; - - template - tag(std::string_view n, std::string_view t, T v) - : name{n[0], n[1]}, type(t.front()), val(std::move(v)) - {} - - friend std::ostream &operator<<(std::ostream &s, const tag &t); - - void print() const { - std::fprintf(stdout, "%c%c", name[0], name[1]); - std::fputs(":", stdout); - std::visit([&](const auto& value) { _print(value); }, val); - } - - private: - void _print(int64_t val) const { - std::fprintf(stdout, "%c:%" PRId64, type, val); - } - - void _print(const std::string &str) const { - std::fprintf(stdout, "%c:%s", type, str.c_str()); - } - - void _print(float val) const { - std::fprintf(stdout, "%c:%g", type, val); - } -}; +using cigar::tag; +using cigar::cigarop; +using cigar::getTag; +using cigar::cigar_string; struct header { std::vector tags; - header() {} + header() = default; explicit header(std::vector t) : tags(std::move(t)) {} @@ -74,16 +51,16 @@ struct segment { std::vector tags; explicit segment(std::string_view n, std::vector t) - : name{std::move(n)}, tags(std::move(t)) {} + : name{n}, tags(std::move(t)) {} template explicit segment(std::string_view n, Seq s, std::vector t) - : name{std::move(n)}, seq{s.data(), s.size()}, tags(std::move(t)) {} + : name{n}, seq{s.data(), s.size()}, tags(std::move(t)) {} void print() const { std::fputs("S", stdout); std::fprintf(stdout, "\t%s", std::string(name).c_str()); - std::fprintf(stdout, "\t%s", seq.size() ? std::string(seq).c_str() : "*"); + std::fprintf(stdout, "\t%s", !seq.empty() ? std::string(seq).c_str() : "*"); for (const auto &tag : tags) { fputs("\t", stdout); tag.print(); @@ -91,17 +68,6 @@ struct segment { } }; -struct cigarop { - uint32_t count : 24; - char op : 8; - - void print() const { - std::fprintf(stdout, "%u%c", count, op); - } -}; - -using cigar_string = std::vector; - struct link { std::string_view lhs; bool lhs_revcomp; @@ -112,13 +78,13 @@ struct link { explicit link(std::string_view l, std::string_view lr, std::string_view r, std::string_view rr, std::vector t) - : lhs{std::move(l)}, lhs_revcomp(lr.front() == '-'), rhs{std::move(r)}, rhs_revcomp(rr.front() == '-'), + : lhs{l}, lhs_revcomp(lr.front() == '-'), rhs{r}, rhs_revcomp(rr.front() == '-'), tags(std::move(t)) {} explicit link(std::string_view l, std::string_view lr, std::string_view r, std::string_view rr, cigar_string o, std::vector t) - : lhs{std::move(l)}, lhs_revcomp(lr.front() == '-'), rhs{std::move(r)}, rhs_revcomp(rr.front() == '-'), + : lhs{l}, lhs_revcomp(lr.front() == '-'), rhs{r}, rhs_revcomp(rr.front() == '-'), overlap(std::move(o)), tags(std::move(t)) {} void print() const { @@ -127,7 +93,7 @@ struct link { std::fprintf(stdout, "\t%s\t%c", std::string(rhs).c_str(), rhs_revcomp ? '-' : '+'); std::fputc('\t', stdout); - if (overlap.size() == 0) + if (overlap.empty()) std::fputc('*', stdout); else { for (const auto &ovl : overlap) @@ -141,6 +107,42 @@ struct link { } }; +struct gaplink { + std::string_view lhs; + bool lhs_revcomp; + std::string_view rhs; + bool rhs_revcomp; + int64_t distance; + std::vector tags; + + explicit gaplink(std::string_view l, std::string_view lr, std::string_view r, std::string_view rr, + int64_t d, + std::vector t) + : lhs{l}, lhs_revcomp(lr.front() == '-'), + rhs{r}, rhs_revcomp(rr.front() == '-'), + distance{d}, + tags(std::move(t)) {} + + explicit gaplink(std::string_view l, std::string_view lr, std::string_view r, std::string_view rr, + std::vector t) + : lhs{l}, lhs_revcomp(lr.front() == '-'), + rhs{r}, rhs_revcomp(rr.front() == '-'), + distance{std::numeric_limits::min()}, + tags(std::move(t)) {} + + void print() const { + std::fputs("J", stdout); + std::fprintf(stdout, "\t%s\t%c", std::string(lhs).c_str(), lhs_revcomp ? '-' : '+'); + std::fprintf(stdout, "\t%s\t%c", std::string(rhs).c_str(), rhs_revcomp ? '-' : '+'); + std::fprintf(stdout, "\t%" PRId64, distance); + + for (const auto &tag : tags) { + fputs("\t", stdout); + tag.print(); + } + } +}; + struct path { std::string_view name; std::vector segments; @@ -148,10 +150,10 @@ struct path { std::vector tags; explicit path(std::string_view n, std::vector s, std::vector t) - : name{std::move(n)}, segments(std::move(s)), tags(std::move(t)) {} + : name{n}, segments(std::move(s)), tags(std::move(t)) {} explicit path(std::string_view n, std::vector s, std::vector o, std::vector t) - : name{std::move(n)}, segments(std::move(s)), overlaps(std::move(o)), tags(std::move(t)) {} + : name{n}, segments(std::move(s)), overlaps(std::move(o)), tags(std::move(t)) {} void print() const { std::fputc('P', stdout); @@ -163,7 +165,7 @@ struct path { } std::fputc('\t', stdout); - if (overlaps.size() == 0) + if (overlaps.empty()) std::fputc('*', stdout); else { for (size_t i = 0; i < overlaps.size(); ++i) { @@ -181,35 +183,55 @@ struct path { } }; -using record = std::variant; - -static inline std::optional -getTag(const char *name, - const std::vector &tags) { - auto res = std::find_if(tags.begin(), tags.end(), - [=](const gfa::tag &tag) { - return (tag.name[0] == name[0] && - tag.name[1] == name[1]); - }); - if (res == tags.end()) - return {}; - - return *res; -} - -template -std::optional getTag(const char *name, - const std::vector &tags) { - auto res = std::find_if(tags.begin(), tags.end(), - [=](const gfa::tag &tag) { - return (tag.name[0] == name[0] && - tag.name[1] == name[1]); - }); - if (res == tags.end()) - return {}; - - return std::get(res->val); -} - -std::optional parse_record(const char* line, size_t len); +struct walk { + using opt_uint64_t = std::optional; + + std::string_view SampleId; + unsigned HapIndex; + std::string_view SeqId; + opt_uint64_t SeqStart; + opt_uint64_t SeqEnd; + std::vector Walk; + + std::vector tags; + + explicit walk(std::string_view s, unsigned h, std::string_view seq, + opt_uint64_t ss, opt_uint64_t se, std::vector w, + std::vector t) + : SampleId(s), HapIndex(h), SeqId(seq), + SeqStart(std::move(ss)), SeqEnd(std::move(se)), + Walk(std::move(w)), + tags(std::move(t)) {} + + void print() const { + std::fputc('W', stdout); + std::fprintf(stdout, "\t%s", std::string(SampleId).c_str()); + std::fprintf(stdout, "\t%u", HapIndex); + std::fprintf(stdout, "\t%s", std::string(SeqId).c_str()); + if (SeqStart.has_value()) + std::fprintf(stdout, "\t%" PRIu64, *SeqStart); + else + std::fprintf(stdout, "\t*"); + if (SeqEnd.has_value()) + std::fprintf(stdout, "\t%" PRIu64, *SeqEnd); + else + std::fprintf(stdout, "\t*"); + std::fputc('\t', stdout); + if (Walk.empty()) + std::fputc('*', stdout); + else + for (const auto& seg : Walk) + std::fprintf(stdout, "%s", std::string(seg).c_str()); + + for (const auto &tag : tags) { + std::fputc('\t', stdout); + tag.print(); + } + } + +}; + +using record = std::variant; + +std::optional parseRecord(const char* line, size_t len); } // namespace gfa diff --git a/src/common/io/graph/gfa_reader.cpp b/src/common/io/graph/gfa_reader.cpp index 1de333be8..73f44927f 100644 --- a/src/common/io/graph/gfa_reader.cpp +++ b/src/common/io/graph/gfa_reader.cpp @@ -219,7 +219,7 @@ unsigned GFAReader::to_graph(ConjugateDeBruijnGraph &g, if (read <= 1) continue; // skip empty lines - auto result = gfa::parse_record(line, read - 1); + auto result = gfa::parseRecord(line, read - 1); if (!result) continue; diff --git a/src/common/io/graph/gfa_reader.hpp b/src/common/io/graph/gfa_reader.hpp index 0b8494808..7adc3c906 100644 --- a/src/common/io/graph/gfa_reader.hpp +++ b/src/common/io/graph/gfa_reader.hpp @@ -31,7 +31,6 @@ namespace gfa { struct path; struct segment; struct link; -struct cigarop; class GFAReader { typedef debruijn_graph::DeBruijnGraph Graph;