Skip to content

Commit

Permalink
fail when passed fastas with duplicate records
Browse files Browse the repository at this point in the history
  • Loading branch information
skothenhill-nv committed Dec 24, 2024
1 parent c28772b commit 3b0b179
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
35 changes: 33 additions & 2 deletions sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,38 @@ class NvFaidx:
See Also: bionemo.noodles.nvfaidx.SequenceAccessor
"""

def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = None, ignore_existing_fai=True):
def __init__(
self,
fasta_path: str | Path,
faidx_path: Optional[str | Path] = None,
ignore_existing_fai: bool = True,
allow_duplicate_seqids: bool = False,
):
"""Construct a dict-like object representing a memmapped, indexed FASTA file.
This is an indexed fasta reader. Consequences of this are that the FASTA file must be well formed, meaning
sequence-ids and line-lengths must conform to FASTA standards. Additionally, the order of returned seqid, sequence
pairs when iterating over the index is not guaranteed to be the same order as the underlying fasta file.
Args:
fasta_path (str): Path to the FASTA file.
faidx_path (str): Path to the FAI index file. If None, one will be created.
ignore_existing_fai (bool): If True, ignore any existing FAI file and create an in-memory index. Note that
this will also ignore `faidx_path`.
allow_duplicate_seqids (bool): If true, will produce index for invalid fastas which contain duplicate seqids.
In this scenario, indexing is performed by integer rather than strings.
Example with invalid seqids.
>chr1 dupes|not|allowd
ATGATGATGATG
>chr1 whoops|there|is|dupe
ATGATGATGATG
NvFaidx:
{
0 : SequenceAccessor(chr1 dupes|not|allowed),
1 : SequenceAccessor(chr1 whoops|there|is|dupe)
}
"""
if isinstance(fasta_path, Path):
fasta_path = str(fasta_path)
Expand All @@ -178,7 +202,14 @@ def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = No
case _:
raise ValueError("unreachable condition.")

self.records: Dict[str, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
self.records: Dict[str | int, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
if len(self.records) != len(self.reader.records()):
if not allow_duplicate_seqids:
raise ValueError(
"Non-unique sequence-id detected in FASTA, this is invalid. Correct headers and try again or pass allow_duplicate_seqid'"
)
else:
self.records: Dict[str | int, PyFaidxRecord] = dict(enumerate(self.reader.records()))

def __getitem__(self, seqid: str) -> SequenceAccessor: # noqa: D105
if seqid not in self.records:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>chr1 version|of|seq1
ACTGACTGACTG
>chr1 version|of|seq2
GGTCAAGGTCAA
>chr1 some|random|inputs
AGTCAAGGTCCA
CGTCAAGGTCCC
GGTCAAGGTCCG
TGTCAAGGTCCT
AGTCAAGGTCAA
CGTCAAGGTCAC
GGTCAAGGTCAG
>chr1 why|is|this|done
CCCCCCCCCCCC
ACGT
>chr1 stop|violated|fasta|spec
A
15 changes: 15 additions & 0 deletions sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def sample_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/sample.fasta")


@pytest.fixture
def dupes_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/dupes.fasta")


def test_create_faidx_rustbind():
filename = create_test_fasta(num_seqs=2, seq_length=200)
faidx_filename = PyIndexedMmapFastaReader.create_faidx(filename, force=False)
Expand Down Expand Up @@ -345,6 +350,16 @@ def test_parallel_index_creation_nvfaidx():
assert all(lens_equal), (set(lens), sum(lens_equal))


def test_duplicate_seqids(dupes_fasta):
# Fails since we will get back 1 entry in our dict with 5 in our records list.
with pytest.raises(ValueError):
index = NvFaidx(dupes_fasta, allow_duplicate_seqids=False)

index = NvFaidx(dupes_fasta, allow_duplicate_seqids=True)
assert list(index.records.keys()) == list(range(5))
assert len(index) == 5


def test_file_errors():
# test missing fasta file
# test failure to parse fasta file
Expand Down

0 comments on commit 3b0b179

Please sign in to comment.