From 3b0b1797f75a913e626f643a3f2475557c7f4e68 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Dec 2024 17:48:15 +0000 Subject: [PATCH] fail when passed fastas with duplicate records --- .../src/bionemo/noodles/nvfaidx.py | 35 +++++++++++++++++-- .../tests/bionemo/noodles/data/dupes.fasta | 17 +++++++++ .../tests/bionemo/noodles/test_nvfaidx.py | 15 ++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta diff --git a/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py b/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py index cd44ab1d4..896a48ead 100644 --- a/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py +++ b/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py @@ -147,14 +147,38 @@ class NvFaidx: See Also: bionemo.noodles.nvfaidx.SequenceAccessor """ - def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = None, ignore_existing_fai=True): + def __init__( + self, + fasta_path: str | Path, + faidx_path: Optional[str | Path] = None, + ignore_existing_fai: bool = True, + allow_duplicate_seqids: bool = False, + ): """Construct a dict-like object representing a memmapped, indexed FASTA file. + This is an indexed fasta reader. Consequences of this are that the FASTA file must be well formed, meaning + sequence-ids and line-lengths must conform to FASTA standards. Additionally, the order of returned seqid, sequence + pairs when iterating over the index is not guaranteed to be the same order as the underlying fasta file. + Args: fasta_path (str): Path to the FASTA file. faidx_path (str): Path to the FAI index file. If None, one will be created. ignore_existing_fai (bool): If True, ignore any existing FAI file and create an in-memory index. Note that this will also ignore `faidx_path`. + allow_duplicate_seqids (bool): If true, will produce index for invalid fastas which contain duplicate seqids. + In this scenario, indexing is performed by integer rather than strings. + + Example with invalid seqids. + >chr1 dupes|not|allowd + ATGATGATGATG + >chr1 whoops|there|is|dupe + ATGATGATGATG + NvFaidx: + { + 0 : SequenceAccessor(chr1 dupes|not|allowed), + 1 : SequenceAccessor(chr1 whoops|there|is|dupe) + } + """ if isinstance(fasta_path, Path): fasta_path = str(fasta_path) @@ -178,7 +202,14 @@ def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = No case _: raise ValueError("unreachable condition.") - self.records: Dict[str, PyFaidxRecord] = {record.name: record for record in self.reader.records()} + self.records: Dict[str | int, PyFaidxRecord] = {record.name: record for record in self.reader.records()} + if len(self.records) != len(self.reader.records()): + if not allow_duplicate_seqids: + raise ValueError( + "Non-unique sequence-id detected in FASTA, this is invalid. Correct headers and try again or pass allow_duplicate_seqid'" + ) + else: + self.records: Dict[str | int, PyFaidxRecord] = dict(enumerate(self.reader.records())) def __getitem__(self, seqid: str) -> SequenceAccessor: # noqa: D105 if seqid not in self.records: diff --git a/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta b/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta new file mode 100644 index 000000000..f201e4c15 --- /dev/null +++ b/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta @@ -0,0 +1,17 @@ +>chr1 version|of|seq1 +ACTGACTGACTG +>chr1 version|of|seq2 +GGTCAAGGTCAA +>chr1 some|random|inputs +AGTCAAGGTCCA +CGTCAAGGTCCC +GGTCAAGGTCCG +TGTCAAGGTCCT +AGTCAAGGTCAA +CGTCAAGGTCAC +GGTCAAGGTCAG +>chr1 why|is|this|done +CCCCCCCCCCCC +ACGT +>chr1 stop|violated|fasta|spec +A diff --git a/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py b/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py index c273370ef..ece33e056 100644 --- a/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py +++ b/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py @@ -32,6 +32,11 @@ def sample_fasta(): return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/sample.fasta") +@pytest.fixture +def dupes_fasta(): + return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/dupes.fasta") + + def test_create_faidx_rustbind(): filename = create_test_fasta(num_seqs=2, seq_length=200) faidx_filename = PyIndexedMmapFastaReader.create_faidx(filename, force=False) @@ -345,6 +350,16 @@ def test_parallel_index_creation_nvfaidx(): assert all(lens_equal), (set(lens), sum(lens_equal)) +def test_duplicate_seqids(dupes_fasta): + # Fails since we will get back 1 entry in our dict with 5 in our records list. + with pytest.raises(ValueError): + index = NvFaidx(dupes_fasta, allow_duplicate_seqids=False) + + index = NvFaidx(dupes_fasta, allow_duplicate_seqids=True) + assert list(index.records.keys()) == list(range(5)) + assert len(index) == 5 + + def test_file_errors(): # test missing fasta file # test failure to parse fasta file