From 0cab5960a90debc1d76d177a361549f62ebdf34e Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 6 Dec 2024 21:35:18 +0100 Subject: [PATCH] Do not report a match for ambiguous sequences This only applies when an index is used. At index creation time, ambiguous sequences (those that would match two or more adapters equally well), are now *removed* from the index. That is, when they are encountered in a read, they will not be assigned to any adapter. The intention is to reduce bias because we previously would report a match to one of the adapters (but which one was more or less randomly determined at index creation time). --- CHANGES.rst | 6 ++++ doc/guide.rst | 19 ++++++++++++ src/cutadapt/adapters.py | 62 +++++++++++++++++++++++----------------- tests/test_adapters.py | 22 +++++++++++++- 4 files changed, 81 insertions(+), 28 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index b93c1b3e..b9dda6a7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,12 @@ Changelog development version ------------------- +This release increases the major version number because results when +demultiplexing may change with this release. + +* :pr:`827`: When matching multiple adapters (typically when demultiplexing + using barcodes), Cutadapt now no longer assigns ambiguous matches to one + of the adapters/barcodes. * :issue:`808`: Made gzip compression level 1 the default, which improves runtime significantly in many cases. (Compressing the output is often a bottleneck when using multiple threads.) Output files will be larger, but because diff --git a/doc/guide.rst b/doc/guide.rst index 0fda2049..3ef10faf 100644 --- a/doc/guide.rst +++ b/doc/guide.rst @@ -1915,6 +1915,7 @@ To demultiplex this type of data, the .. _speed-up-demultiplexing: +.. _adapter-indexing: Speeding up demultiplexing/adapter indexing ------------------------------------------- @@ -1961,6 +1962,24 @@ Cutadapt’s output:: .. versionadded:: 5.0 An index is created up to three (instead of two) allowed errors. +Ambiguous sequences +~~~~~~~~~~~~~~~~~~~ + +When :ref:`an index is created for multiple anchored adapters `, Cutadapt checks +whether there are any possible input sequences that lead to ambiguous matches, that is, which +would match two or more adapter sequences equally well. +If there are ambiguous sequences, Cutadapt prints a warning like this:: + + WARNING: The adapters are too similar. When creating the index, 31 ambiguous sequences were found that cannot be assigned uniquely. + WARNING: For example, 'TAGTGCTTGA', when found in a read, would result in 10 matches for both bc3 'TAGTGCTTGA' and bc11 'TAGTGCTTGA' + WARNING: Reads with ambiguous sequence will *not* be trimmed. + +If you use ``-no-index``, Cutadapt will not print this warning and instead assign ambiguous reads +to the first of the adapters that match equally well. + +.. versionadded:: 5.0 + Ambiguous sequences were not handled specially in earlier versions. + Demultiplexing paired-end reads in mixed orientation ---------------------------------------------------- diff --git a/src/cutadapt/adapters.py b/src/cutadapt/adapters.py index 3929d36d..08034030 100644 --- a/src/cutadapt/adapters.py +++ b/src/cutadapt/adapters.py @@ -1255,10 +1255,11 @@ def __init__(self, adapters, prefix: bool): for adapter in adapters: self._accept(adapter, prefix) self._adapters = adapters - self._lengths, self._index = self._make_index() + self._lengths, self._index, self._ambiguous = self._make_index() logger.debug( "String lengths in the index: %s", sorted(self._lengths, reverse=True) ) + if len(self._lengths) == 1: self._length = self._lengths[0] self.match_to = self._match_to_one_length @@ -1337,7 +1338,7 @@ def is_acceptable(cls, adapter: SingleAdapter, prefix: bool): return False return True - def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]: + def _make_index(self) -> Tuple[List[int], "AdapterIndexDict", int]: start_time = time.time() max_k = max( ( @@ -1356,7 +1357,7 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]: ) index: Dict[str, Tuple[SingleAdapter, int, int]] = dict() lengths = set() - has_warned = False + ambiguous = [] for adapter in self._adapters: sequence = adapter.sequence k = int(adapter.max_error_rate * len(sequence)) @@ -1367,47 +1368,54 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]: other_adapter, other_errors, other_matches = index[s] if matches < other_matches: continue - if other_matches == matches and not has_warned: - self._warn_similar(adapter, other_adapter, k, s, matches) - has_warned = True + if other_matches == matches: + ambiguous.append((s, adapter, other_adapter, k, matches)) index[s] = (adapter, errors, matches) lengths.add(len(s)) else: n = len(sequence) for errors in range(k + 1): + matches = n - errors for s in hamming_sphere(sequence, errors): - matches = n - errors if s in index: other_adapter, other_errors, other_matches = index[s] if matches < other_matches: continue - if other_matches == matches and not has_warned: - self._warn_similar( - adapter, other_adapter, k, s, matches + if other_matches == matches: + ambiguous.append( + (s, adapter, other_adapter, k, matches) ) - has_warned = True index[s] = (adapter, errors, matches) lengths.add(n) + + if ambiguous: + logger.warning( + "WARNING: The adapters are too similar. When creating the index, " + "%d ambiguous sequences were found that cannot be assigned uniquely.", + len(ambiguous), + ) + s, adapter, other_adapter, k, matches = ambiguous[0] + logger.warning( + "WARNING: For example, %r, when found in a read, would result in " + "%s matches for both %s %r and %s %r", + s, + matches, + other_adapter.name, + other_adapter.sequence, + adapter.name, + adapter.sequence, + ) + logger.warning( + "WARNING: Reads with ambiguous sequence will *not* be trimmed." + ) + for s, adapter, other_adapter, k, matches in ambiguous: + del index[s] + elapsed = time.time() - start_time logger.info("Built an index containing %s strings.", len(index)) logger.debug("Building the index took %.1f s", elapsed) - return sorted(lengths, reverse=True), index - - @staticmethod - def _warn_similar(adapter, other_adapter, k, s, matches): - logger.warning( - "Adapters %s %r and %s %r are very similar. At %s allowed errors, " - "the sequence %r cannot be assigned uniquely because the number of " - "matches is %s compared to both adapters.", - other_adapter.name, - other_adapter.sequence, - adapter.name, - adapter.sequence, - k, - s, - matches, - ) + return sorted(lengths, reverse=True), index, len(ambiguous) def _match_to_one_length(self, sequence: str): """ diff --git a/tests/test_adapters.py b/tests/test_adapters.py index f6fb8dc7..f4eecedb 100644 --- a/tests/test_adapters.py +++ b/tests/test_adapters.py @@ -509,7 +509,7 @@ def test_indexed_very_similar(caplog): PrefixAdapter("GAAG", max_errors=1, indels=False), ] ) - assert "cannot be assigned uniquely" in caplog.text + assert "ambiguous sequences" in caplog.text def test_indexed_too_high_k(): @@ -582,6 +582,26 @@ def test_indexed_prefix_adapters_with_n_collision(sequence): assert result.adapter is a2 +def test_indexed_prefix_adapters_ignore_ambiguous_matches(): + a1 = PrefixAdapter("AAAAA", max_errors=1, indels=False) + a2 = PrefixAdapter("TTAAA", max_errors=1, indels=False) + ipa = IndexedPrefixAdapters([a1, a2]) + + result = ipa.match_to("ATAAA") + + assert result is None + + +def test_indexed_prefix_adapters_ignore_ambiguous_matches_with_indels(): + a1 = PrefixAdapter("AGTACGT", max_errors=1, indels=True) + a2 = PrefixAdapter("ACGTAGT", max_errors=1, indels=True) + ipa = IndexedPrefixAdapters([a1, a2]) + + result = ipa.match_to("ACGTACGT") + + assert result is None + + def test_inosine_wildcard(): adapter = BackAdapter("CTGIAIT", max_errors=0, min_overlap=3) match = adapter.match_to("GGCTGAATTGGG")