Skip to content

Commit

Permalink
[BUFIX] fail when passed fastas with duplicate sequence ids (#555)
Browse files Browse the repository at this point in the history
When NvFaidx was used on Fasta files containing duplicate sequence ids,
which violates the FASTA spec, it would silently fail and use the
last-seen sequence as an entry.

This PR fails by default and exposes a parameter to ignore sequence_ids
and use integer indexing instead.
  • Loading branch information
skothenhill-nv authored Dec 25, 2024
1 parent c7281be commit f8db8a2
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
35 changes: 33 additions & 2 deletions sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,38 @@ class NvFaidx:
See Also: bionemo.noodles.nvfaidx.SequenceAccessor
"""

def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = None, ignore_existing_fai=True):
def __init__(
self,
fasta_path: str | Path,
faidx_path: Optional[str | Path] = None,
ignore_existing_fai: bool = True,
allow_duplicate_seqids: bool = False,
):
"""Construct a dict-like object representing a memmapped, indexed FASTA file.
This is an indexed fasta reader. Consequences of this are that the FASTA file must be well formed, meaning
sequence-ids and line-lengths must conform to FASTA standards. Additionally, the order of returned seqid, sequence
pairs when iterating over the index is not guaranteed to be the same order as the underlying fasta file.
Args:
fasta_path (str): Path to the FASTA file.
faidx_path (str): Path to the FAI index file. If None, one will be created.
ignore_existing_fai (bool): If True, ignore any existing FAI file and create an in-memory index. Note that
this will also ignore `faidx_path`.
allow_duplicate_seqids (bool): If true, will produce index for invalid fastas which contain duplicate seqids.
In this scenario, indexing is performed by integer rather than strings.
Example with invalid seqids.
>chr1 dupes|not|allowd
ATGATGATGATG
>chr1 whoops|there|is|dupe
ATGATGATGATG
NvFaidx:
{
0 : SequenceAccessor(chr1 dupes|not|allowed),
1 : SequenceAccessor(chr1 whoops|there|is|dupe)
}
"""
if isinstance(fasta_path, Path):
fasta_path = str(fasta_path)
Expand All @@ -178,7 +202,14 @@ def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = No
case _:
raise ValueError("unreachable condition.")

self.records: Dict[str, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
self.records: Dict[str | int, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
if len(self.records) != len(self.reader.records()):
if not allow_duplicate_seqids:
raise ValueError(
"Non-unique sequence-id detected in FASTA, this is invalid. Correct headers and try again or pass allow_duplicate_seqid'"
)
else:
self.records: Dict[str | int, PyFaidxRecord] = dict(enumerate(self.reader.records()))

def __getitem__(self, seqid: str) -> SequenceAccessor: # noqa: D105
if seqid not in self.records:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>chr1 version|of|seq1
ACTGACTGACTG
>chr1 version|of|seq2
GGTCAAGGTCAA
>chr1 some|random|inputs
AGTCAAGGTCCA
CGTCAAGGTCCC
GGTCAAGGTCCG
TGTCAAGGTCCT
AGTCAAGGTCAA
CGTCAAGGTCAC
GGTCAAGGTCAG
>chr1 why|is|this|done
CCCCCCCCCCCC
ACGT
>chr1 stop|violated|fasta|spec
A
15 changes: 15 additions & 0 deletions sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def sample_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/sample.fasta")


@pytest.fixture
def dupes_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/dupes.fasta")


def test_create_faidx_rustbind():
filename = create_test_fasta(num_seqs=2, seq_length=200)
faidx_filename = PyIndexedMmapFastaReader.create_faidx(filename, force=False)
Expand Down Expand Up @@ -345,6 +350,16 @@ def test_parallel_index_creation_nvfaidx():
assert all(lens_equal), (set(lens), sum(lens_equal))


def test_duplicate_seqids(dupes_fasta):
# Fails since we will get back 1 entry in our dict with 5 in our records list.
with pytest.raises(ValueError):
index = NvFaidx(dupes_fasta, allow_duplicate_seqids=False)

index = NvFaidx(dupes_fasta, allow_duplicate_seqids=True)
assert list(index.records.keys()) == list(range(5))
assert len(index) == 5


def test_file_errors():
# test missing fasta file
# test failure to parse fasta file
Expand Down

0 comments on commit f8db8a2

Please sign in to comment.