From acafc667547631f44aae143d5a709b66963d38cc Mon Sep 17 00:00:00 2001 From: David Prihoda Date: Wed, 7 Aug 2024 09:29:23 +0200 Subject: [PATCH] Add Chain.multiple_domains and Chain.batch(multiple_domains=True) --- abnumber/__version__.py | 2 +- abnumber/chain.py | 36 +++++++++++++++++++++++++++++------- abnumber/common.py | 5 +++-- abnumber/exceptions.py | 5 ++++- test/test_chain.py | 31 +++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 11 deletions(-) diff --git a/abnumber/__version__.py b/abnumber/__version__.py index 4596d03..d93912e 100644 --- a/abnumber/__version__.py +++ b/abnumber/__version__.py @@ -1 +1 @@ -__version__ = '0.3.6' +__version__ = '0.3.7' diff --git a/abnumber/chain.py b/abnumber/chain.py index 0cf3f62..e4dcae8 100644 --- a/abnumber/chain.py +++ b/abnumber/chain.py @@ -1,3 +1,4 @@ +import warnings from collections import OrderedDict from typing import Union, List, Generator, Tuple import pandas as pd @@ -5,7 +6,7 @@ from abnumber.alignment import Alignment from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \ is_integer, SCHEME_BORDERS, _get_unique_chains -from abnumber.exceptions import ChainParseError +from abnumber.exceptions import ChainParseError, MultipleDomainsChainParseError import numpy as np from Bio import SeqIO from Bio.SeqRecord import SeqRecord @@ -99,7 +100,8 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ if not results: raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"') if len(results) > 1: - raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"') + warnings.warn('Use Chain.multiple_domains(seq) to parse ScFvs and other sequences with multiple antibody domains') + raise MultipleDomainsChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"') aa_dict, chain_type, tail, species, v_gene, j_gene = results[0] 
_validate_chain_type(chain_type) @@ -183,7 +185,7 @@ def _init_from_dict(self, aa_dict, allowed_species): regions_list[region_idx][pos] = aa @classmethod - def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None): + def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None, multiple_domains=False): """Create multiple Chain objects from dict of sequences :param seq_dict: Dictionary of sequence strings, keys are sequence identifiers @@ -191,6 +193,7 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default. :param assign_germline: Assign germline name using ANARCI based on best sequence identity :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'`` + :param multiple_domains: Allow parsing multiple domains in a sequence - return dict name -> list of one or more Chain items :return: tuple with (dict of Chain objects, dict of error strings) """ assert isinstance(seq_dict, dict), f'Expected dictionary of sequences, got: {type(seq_dict).__name__}' @@ -205,11 +208,11 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline for sequence, results, name in zip(seq_list, all_results, names): if not results: errors[name] = f'Variable chain sequence not recognized: "{sequence}"' - elif len(results) > 1: + elif len(results) > 1 and not multiple_domains: + warnings.warn('Use multiple_domains=True to allow parsing ScFvs and other sequences with multiple antibody domains') errors[name] = f'Found {len(results)} antibody domains: "{sequence}"' else: - aa_dict, chain_type, tail, species, v_gene, j_gene = results[0] - chains[name] = Chain( + found_chains = [Chain( sequence=None, aa_dict=aa_dict, 
name=name, @@ -220,9 +223,28 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline species=species, v_gene=v_gene, j_gene=j_gene - ) + ) for aa_dict, chain_type, tail, species, v_gene, j_gene in results] + chains[name] = found_chains if multiple_domains else found_chains[0] return chains, errors + @classmethod + def multiple_domains(cls, sequence: str, scheme: str, cdr_definition=None, name=None, assign_germline=False, allowed_species=None) -> 'Chain': + """Parse multi-domain sequence into a list of Chain objects + + :param sequence: Unaligned string sequence + :param scheme: Numbering scheme to align the sequences + :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default. + :param name: Optional sequence identifier + :param assign_germline: Assign germline name using ANARCI based on best sequence identity + :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'`` + :return: list of one or more Chain objects, one for each antibody domain found in the sequence + """ + chains, errors = cls.batch({name: sequence}, scheme=scheme, cdr_definition=cdr_definition, assign_germline=assign_germline, allowed_species=allowed_species, multiple_domains=True) + if error := errors.get(name): + raise ChainParseError(error) + return chains[name] + + def __repr__(self): return self.format() diff --git a/abnumber/common.py b/abnumber/common.py index 944da0f..dbd33dd 100644 --- a/abnumber/common.py +++ b/abnumber/common.py @@ -37,14 +37,15 @@ def _anarci_align(sequences, scheme, allowed_species, assign_germline=False) -> continue assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output' results = [] - for (positions, start, end), ali in zip(seq_numbered, seq_ali): + for i, ((positions, start, end), ali) in enumerate(zip(seq_numbered, seq_ali)): chain_type = ali['chain_type'] species = ali['species'] 
v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa for (num, letter), aa in positions if aa != '-'} - tail = sequence[end+1:] + next_start = None if i == len(seq_numbered) - 1 else seq_numbered[i+1][1] + tail = sequence[end+1:next_start] results.append((aa_dict, chain_type, tail, species, v_gene, j_gene)) all_results.append(results) return all_results diff --git a/abnumber/exceptions.py b/abnumber/exceptions.py index 7546712..ad186b2 100644 --- a/abnumber/exceptions.py +++ b/abnumber/exceptions.py @@ -1,2 +1,5 @@ class ChainParseError(Exception): - pass \ No newline at end of file + pass + +class MultipleDomainsChainParseError(ChainParseError): + pass diff --git a/test/test_chain.py b/test/test_chain.py index cd925fb..e58d479 100644 --- a/test/test_chain.py +++ b/test/test_chain.py @@ -268,3 +268,34 @@ def test_batch(): assert 'D' not in chains assert errors['D'] == 'Found 2 antibody domains: "EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSEVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS"' + +def test_batch_multiple_domains(): + chains, errors = Chain.batch({ + 'A': 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS', + 'B': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS', + 'C': 'FOO', + 'D': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSGGGGSQVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSS' + }, scheme='imgt', multiple_domains=True) + 
assert len(chains) == 3 + assert len(chains['A']) == 1 + assert chains['A'][0].raw[0] == 'Q' + assert len(chains['B']) == 1 + assert chains['B'][0].raw[0] == 'E' + assert 'C' not in chains + assert errors['C'] == 'Variable chain sequence not recognized: "FOO"' + assert len(chains['D']) == 2 + assert chains['D'][0].raw[0] == 'E' + assert chains['D'][0].tail == 'GGGGS' + assert chains['D'][1].raw[0] == 'Q' + assert chains['D'][1].tail == 'S' + + +def test_multiple_domains(): + vh = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS' + vl = 'ELVMTQSPSSLSASVGDRVNIACRASQGISSALAWYQQKPGKAPRLLIYDASNLESGVPSRFSGSGSGTDFTLTISSLQPEDFAIYYCQQFNSYPLTFGGGTKVEIK' + chains = Chain.multiple_domains('MELVIS' + vh + 'GGGS' + vl + 'CCC', scheme='imgt') + assert len(chains) == 2 + assert chains[0].seq == vh + assert chains[0].tail == 'GGGS' + assert chains[1].seq == vl + assert chains[1].tail == 'CCC'