Skip to content

Commit

Permalink
Add Chain.multiple_domains and Chain.batch(multiple_domains=True)
Browse files Browse the repository at this point in the history
  • Loading branch information
prihoda committed Aug 7, 2024
1 parent 55ace7f commit acafc66
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 11 deletions.
2 changes: 1 addition & 1 deletion abnumber/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.6'
__version__ = '0.3.7'
36 changes: 29 additions & 7 deletions abnumber/chain.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import warnings
from collections import OrderedDict
from typing import Union, List, Generator, Tuple
import pandas as pd

from abnumber.alignment import Alignment
from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \
is_integer, SCHEME_BORDERS, _get_unique_chains
from abnumber.exceptions import ChainParseError
from abnumber.exceptions import ChainParseError, MultipleDomainsChainParseError
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -99,7 +100,8 @@ def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germ
if not results:
raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
if len(results) > 1:
raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
warnings.warn('Use Chain.multiple_domains(seq) to parse ScFvs and other sequences with multiple antibody domains')
raise MultipleDomainsChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]

_validate_chain_type(chain_type)
Expand Down Expand Up @@ -183,14 +185,15 @@ def _init_from_dict(self, aa_dict, allowed_species):
regions_list[region_idx][pos] = aa

@classmethod
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None):
def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline=False, allowed_species=None, multiple_domains=False):
"""Create multiple Chain objects from dict of sequences
:param seq_dict: Dictionary of sequence strings, keys are sequence identifiers
:param scheme: Numbering scheme to align the sequences
:param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
:param assign_germline: Assign germline name using ANARCI based on best sequence identity
:param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
:param multiple_domains: Allow parsing multiple domains in a sequence - return dict name -> list of one or more Chain items
:return: tuple with (dict of Chain objects, dict of error strings)
"""
assert isinstance(seq_dict, dict), f'Expected dictionary of sequences, got: {type(seq_dict).__name__}'
Expand All @@ -205,11 +208,11 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline
for sequence, results, name in zip(seq_list, all_results, names):
if not results:
errors[name] = f'Variable chain sequence not recognized: "{sequence}"'
elif len(results) > 1:
elif len(results) > 1 and not multiple_domains:
warnings.warn('Use multiple_domains=True to allow parsing ScFvs and other sequences with multiple antibody domains')
errors[name] = f'Found {len(results)} antibody domains: "{sequence}"'
else:
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
chains[name] = Chain(
found_chains = [Chain(
sequence=None,
aa_dict=aa_dict,
name=name,
Expand All @@ -220,9 +223,28 @@ def batch(cls, seq_dict: dict, scheme: str, cdr_definition=None, assign_germline
species=species,
v_gene=v_gene,
j_gene=j_gene
)
) for aa_dict, chain_type, tail, species, v_gene, j_gene in results]
chains[name] = found_chains if multiple_domains else found_chains[0]
return chains, errors

@classmethod
def multiple_domains(cls, sequence: str, scheme: str, cdr_definition=None, name=None, assign_germline=False, allowed_species=None) -> List['Chain']:
    """Parse multi-domain sequence into a list of Chain objects

    Thin wrapper around :meth:`batch` called with ``multiple_domains=True`` for a single sequence.

    :param sequence: Unaligned string sequence
    :param scheme: Numbering scheme to align the sequences
    :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
    :param name: Optional sequence identifier
    :param assign_germline: Assign germline name using ANARCI based on best sequence identity
    :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
    :return: list of one or more Chain objects, one for each antibody domain found in the sequence
    :raises ChainParseError: if ``batch`` reports an error for the sequence (e.g. no variable chain was recognized)
    """
    chains, errors = cls.batch(
        {name: sequence},
        scheme=scheme,
        cdr_definition=cdr_definition,
        assign_germline=assign_germline,
        allowed_species=allowed_species,
        multiple_domains=True
    )
    # batch() collects failures in the errors dict instead of raising, so surface them as an exception here
    if error := errors.get(name):
        raise ChainParseError(error)
    return chains[name]


def __repr__(self):
return self.format()

Expand Down
5 changes: 3 additions & 2 deletions abnumber/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ def _anarci_align(sequences, scheme, allowed_species, assign_germline=False) ->
continue
assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'
results = []
for (positions, start, end), ali in zip(seq_numbered, seq_ali):
for i, ((positions, start, end), ali) in enumerate(zip(seq_numbered, seq_ali)):
chain_type = ali['chain_type']
species = ali['species']
v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None
j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None
aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa
for (num, letter), aa in positions if aa != '-'}
tail = sequence[end+1:]
next_start = None if i == len(seq_numbered) - 1 else seq_numbered[i+1][1]
tail = sequence[end+1:next_start]
results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
all_results.append(results)
return all_results
Expand Down
5 changes: 4 additions & 1 deletion abnumber/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
class ChainParseError(Exception):
pass
pass

class MultipleDomainsChainParseError(ChainParseError):
    """Chain parsing failed because more than one antibody domain was found in the sequence."""
31 changes: 31 additions & 0 deletions test/test_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,34 @@ def test_batch():
assert 'D' not in chains
assert errors['D'] == 'Found 2 antibody domains: "EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSEVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS"'


def test_batch_multiple_domains():
    """Chain.batch with multiple_domains=True should map each name to a list of Chain objects."""
    sequences = {
        'A': 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS',
        'B': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSS',
        'C': 'FOO',
        'D': 'EVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSGGGGSQVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYSEDDERGHYCLDYWGQGTTLTVSSS'
    }
    chains, errors = Chain.batch(sequences, scheme='imgt', multiple_domains=True)
    assert len(chains) == 3

    # Single-domain inputs still come back wrapped in a one-element list
    for name, first_residue in (('A', 'Q'), ('B', 'E')):
        assert len(chains[name]) == 1
        assert chains[name][0].raw[0] == first_residue

    # Unrecognized input is reported through the errors dict, not the chains dict
    assert 'C' not in chains
    assert errors['C'] == 'Variable chain sequence not recognized: "FOO"'

    # Two-domain input yields two chains; each tail stops where the next domain starts
    assert len(chains['D']) == 2
    first_domain, second_domain = chains['D']
    assert first_domain.raw[0] == 'E'
    assert first_domain.tail == 'GGGGS'
    assert second_domain.raw[0] == 'Q'
    assert second_domain.tail == 'S'


def test_multiple_domains():
    """Chain.multiple_domains should split a VH-linker-VL sequence into two chains with their tails."""
    vh = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTVTVSS'
    vl = 'ELVMTQSPSSLSASVGDRVNIACRASQGISSALAWYQQKPGKAPRLLIYDASNLESGVPSRFSGSGSGTDFTLTISSLQPEDFAIYYCQQFNSYPLTFGGGTKVEIK'
    scfv = 'MELVIS' + vh + 'GGGS' + vl + 'CCC'
    chains = Chain.multiple_domains(scfv, scheme='imgt')
    assert len(chains) == 2
    # Expected (sequence, tail) for each domain, in the order they occur in the input
    expected = [(vh, 'GGGS'), (vl, 'CCC')]
    for chain, (expected_seq, expected_tail) in zip(chains, expected):
        assert chain.seq == expected_seq
        assert chain.tail == expected_tail

0 comments on commit acafc66

Please sign in to comment.