Skip to content

Commit

Permalink
switch to nucleotide CDR3 sequences by default, don't allow gaps when…
Browse files Browse the repository at this point in the history
… computing distance on same-length CDR3s
  • Loading branch information
briney committed Oct 23, 2024
1 parent 898474e commit 063a75d
Showing 1 changed file with 28 additions and 24 deletions.
52 changes: 28 additions & 24 deletions abutils/tools/clonify.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def clonify(
id_key: str = "sequence_id",
vgene_key: str = "v_gene",
jgene_key: str = "j_gene",
cdr3_key: str = "cdr3_aa",
cdr3_key: str = "cdr3",
mutations_key: str = "v_mutations",
preclustering_key: str = "cdr3_nt",
preclustering_key: str = "cdr3",
mutation_delimiter: str = "|",
ignore_likely_allelic_variants: bool = False,
allelic_variant_threshold: float = 0.35,
Expand Down Expand Up @@ -159,13 +159,13 @@ def clonify(
jgene_key : str, default='j_gene'
Annotation field containing the J-gene name.
cdr3_key : str, default='cdr3_aa'
Annotation field containing the CDR3 amino acid sequence.
cdr3_key : str, default='cdr3'
Annotation field containing the CDR3 sequence.
mutations_key : str, default='v_mutations'
Annotation field containing the V-gene mutations.
preclustering_key : str, default='cdr3_nt'
preclustering_key : str, default='cdr3'
Annotation field on which to pre-cluster sequences.
mutation_delimiter : str, default='|'
Expand Down Expand Up @@ -498,7 +498,7 @@ def pairwise_distance(
length_penalty_multiplier: Union[int, float] = 2,
vgene_field: str = "v_gene",
jgene_field: str = "j_gene",
cdr3_field: str = "cdr3_aa",
cdr3_field: str = "cdr3",
mutations_field: str = "mutations",
likely_allelic_variants: Optional[Iterable] = None,
debug: bool = False,
Expand All @@ -521,22 +521,25 @@ def pairwise_distance(
Used to compute the penalty for differences in CDR3 length. The length
difference is multiplied by `length_penalty_multiplier`, by default 2
vgene_field : str, optional
vgene_field : str, default="v_gene"
Name of the field in `s1` and `s2` containing the V-gene name,
by default ``"v_gene"``
jgene_field : str, optional
Name of the field in `s1` and `s2` containing the J-gene name,
by default ``"j_gene"``
jgene_field : str, default="j_gene"
Name of the field in `s1` and `s2` containing the J-gene name
cdr3_field : str, optional
Name of the field in `s1` and `s2` containing the CDR3 sequence,
by default ``"cdr3_aa"``
cdr3_field : str, default="cdr3"
Name of the field in `s1` and `s2` containing the CDR3 sequence
mutations_field : str, optional
Name of the field in `s1` and `s2` containing mutation information,
by default ``"mutations"`` Note that this field should be a list of
mutation strings
mutations_field : str, default="mutations"
Name of the field in `s1` and `s2` containing mutation information
likely_allelic_variants : Optional[Iterable], default=None
List of mutations that are likely allelic variants. If provided, shared
mutations that are also in `likely_allelic_variants` will not be counted
towards the mutation bonus.
debug : bool, default=False
If ``True``, debug information will be printed.
Returns
-------
Expand All @@ -550,15 +553,19 @@ def pairwise_distance(
if s1[jgene_field] != s2[jgene_field]:
germline_penalty += 5

# Levenshtein distance
dist = levenshtein_distance(s1[cdr3_field], s2[cdr3_field])

# CDR3 length
s1_len = len(s1[cdr3_field])
s2_len = len(s2[cdr3_field])
length_penalty = abs(s1_len - s2_len) * length_penalty_multiplier
length = min(s1_len, s2_len)

# Levenshtein distance
if s1_len == s2_len:
# don't allow insertions/deletions if the CDR3s are the same length
dist = sum([a != b for a, b in zip(s1[cdr3_field], s2[cdr3_field])])
else:
dist = levenshtein_distance(s1[cdr3_field], s2[cdr3_field])

# mutations
likely_allelic_variants = likely_allelic_variants or []
mutation_bonus = (
Expand All @@ -568,9 +575,6 @@ def pairwise_distance(
)
* shared_mutation_bonus
)
# mutation_bonus = (
# len(set(s1[mutations_field]) & set(s2[mutations_field])) * shared_mutation_bonus
# )
score = germline_penalty + ((dist + length_penalty - mutation_bonus) / length)
return max(score, 0.001) # distance values can't be negative

Expand Down

0 comments on commit 063a75d

Please sign in to comment.