From 3b0b1797f75a913e626f643a3f2475557c7f4e68 Mon Sep 17 00:00:00 2001
From: Steven <skothenhill@nvidia.com>
Date: Tue, 24 Dec 2024 17:48:15 +0000
Subject: [PATCH] fail when passed fastas with duplicate records

---
 .../src/bionemo/noodles/nvfaidx.py            | 35 +++++++++++++++++--
 .../tests/bionemo/noodles/data/dupes.fasta    | 17 +++++++++
 .../tests/bionemo/noodles/test_nvfaidx.py     | 15 ++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)
 create mode 100644 sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta

diff --git a/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py b/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
index cd44ab1d4..896a48ead 100644
--- a/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
+++ b/sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
@@ -147,14 +147,38 @@ class NvFaidx:
     See Also: bionemo.noodles.nvfaidx.SequenceAccessor
     """
 
-    def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = None, ignore_existing_fai=True):
+    def __init__(
+        self,
+        fasta_path: str | Path,
+        faidx_path: Optional[str | Path] = None,
+        ignore_existing_fai: bool = True,
+        allow_duplicate_seqids: bool = False,
+    ):
         """Construct a dict-like object representing a memmapped, indexed FASTA file.
 
+        This is an indexed fasta reader. Consequences of this are that the FASTA file must be well formed, meaning
+        sequence-ids and line-lengths must conform to FASTA standards. Additionally, the order of returned seqid, sequence
+        pairs when iterating over the index is not guaranteed to be the same order as the underlying fasta file.
+
         Args:
             fasta_path (str): Path to the FASTA file.
             faidx_path (str): Path to the FAI index file. If None, one will be created.
             ignore_existing_fai (bool): If True, ignore any existing FAI file and create an in-memory index. Note that
                 this will also ignore `faidx_path`.
+            allow_duplicate_seqids (bool): If true, will produce index for invalid fastas which contain duplicate seqids.
+                In this scenario, indexing is performed by integer rather than strings.
+
+                Example with invalid seqids.
+                    >chr1 dupes|not|allowd
+                    ATGATGATGATG
+                    >chr1 whoops|there|is|dupe
+                    ATGATGATGATG
+                NvFaidx:
+                    {
+                        0 : SequenceAccessor(chr1 dupes|not|allowed),
+                        1 : SequenceAccessor(chr1 whoops|there|is|dupe)
+                    }
+
         """
         if isinstance(fasta_path, Path):
             fasta_path = str(fasta_path)
@@ -178,7 +202,14 @@ def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = No
             case _:
                 raise ValueError("unreachable condition.")
 
-        self.records: Dict[str, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
+        self.records: Dict[str | int, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
+        if len(self.records) != len(self.reader.records()):
+            if not allow_duplicate_seqids:
+                raise ValueError(
+                    "Non-unique sequence-id detected in FASTA, this is invalid. Correct headers and try again or pass allow_duplicate_seqid'"
+                )
+            else:
+                self.records: Dict[str | int, PyFaidxRecord] = dict(enumerate(self.reader.records()))
 
     def __getitem__(self, seqid: str) -> SequenceAccessor:  # noqa: D105
         if seqid not in self.records:
diff --git a/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta b/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta
new file mode 100644
index 000000000..f201e4c15
--- /dev/null
+++ b/sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta
@@ -0,0 +1,17 @@
+>chr1 version|of|seq1
+ACTGACTGACTG
+>chr1 version|of|seq2
+GGTCAAGGTCAA
+>chr1 some|random|inputs
+AGTCAAGGTCCA
+CGTCAAGGTCCC
+GGTCAAGGTCCG
+TGTCAAGGTCCT
+AGTCAAGGTCAA
+CGTCAAGGTCAC
+GGTCAAGGTCAG
+>chr1 why|is|this|done
+CCCCCCCCCCCC
+ACGT
+>chr1 stop|violated|fasta|spec
+A
diff --git a/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py b/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py
index c273370ef..ece33e056 100644
--- a/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py
+++ b/sub-packages/bionemo-noodles/tests/bionemo/noodles/test_nvfaidx.py
@@ -32,6 +32,11 @@ def sample_fasta():
     return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/sample.fasta")
 
 
+@pytest.fixture
+def dupes_fasta():
+    return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/dupes.fasta")
+
+
 def test_create_faidx_rustbind():
     filename = create_test_fasta(num_seqs=2, seq_length=200)
     faidx_filename = PyIndexedMmapFastaReader.create_faidx(filename, force=False)
@@ -345,6 +350,16 @@ def test_parallel_index_creation_nvfaidx():
         assert all(lens_equal), (set(lens), sum(lens_equal))
 
 
+def test_duplicate_seqids(dupes_fasta):
+    # Fails since we will get back 1 entry in our dict with 5 in our records list.
+    with pytest.raises(ValueError):
+        index = NvFaidx(dupes_fasta, allow_duplicate_seqids=False)
+
+    index = NvFaidx(dupes_fasta, allow_duplicate_seqids=True)
+    assert list(index.records.keys()) == list(range(5))
+    assert len(index) == 5
+
+
 def test_file_errors():
     # test missing fasta file
     # test failure to parse fasta file