Skip to content

Commit

Permalink
Structure identification for models
Browse files Browse the repository at this point in the history
  • Loading branch information
thomashopf committed Nov 9, 2024
1 parent fd0572e commit 1e4a9b1
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 4 deletions.
104 changes: 103 additions & 1 deletion evcouplings/compare/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
intra_dists, multimer_dists, remap_chains,
inter_dists, remap_complex_chains
)
from evcouplings.compare.sifts import SIFTS, SIFTSResult
from evcouplings.compare.sifts import SIFTS, SIFTSResult, SIFTSWithDynamicUpdate
from evcouplings.compare.ecs import (
coupling_scores_compared, add_precision
)
Expand Down Expand Up @@ -582,6 +582,108 @@ def _individual_distance_map_config_result(individual_distance_map_table):
return individual_maps_result


def _map_alphafold_hits(modeldb_list_file, relevant_ids):
"""
Turn list of AlphaFoldDB sequence hits into structure table
compatible with SIFTS object
Parameters
----------
modeldb_list_file: str
Path to accession_ids.csv file from AlphaFoldDB
relevant_ids: set(str)
DB entry identifiers to include in table (without leading "AFDB:" prefix)
Returns
-------
pd.DataFrame
Structure table for SIFTS object
"""
with open(modeldb_list_file) as f:
_table = []
for line in f:
uniprot_ac, start, end, model_id, model_version = line.strip().split(",")
model_id_with_prefix = "AFDB:" + model_id
start, end = int(start), int(end)
if model_id_with_prefix in relevant_ids:
_table.append({
"uniprot_ac": model_id_with_prefix,
"pdb_id": f"{model_id}-model_v{model_version}",
"pdb_chain": "A",
"resseq_start": start,
"resseq_end": end,
"coord_start": start,
"coord_end": end,
"uniprot_start": start,
"uniprot_end": end,
})

return pd.DataFrame(_table)


def _identify_predicted_structures(**kwargs):
"""
Find suitable predicted structures for mapping on ECs
Parameters
----------
**kwargs
See check_required in code below
Returns
-------
SIFTSResult
Identified structures and residue index mappings
"""
check_required(
kwargs,
[
"modeldb_type", "modeldb_sequence_file", "modeldb_list_file",
"model_max_num_hits",
]
)

# implement custom logic for different model database types in code to avoid
# complicated configuration setup covering all eventualities
AVAILABLE_DB_TYPES = ["alphafolddb_v4"]
modeldb_type = kwargs["modeldb_type"]
if modeldb_type not in AVAILABLE_DB_TYPES:
raise InvalidParameterError(
f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_DB_TYPES)}"
)

table_callback = None
if modeldb_type == "alphafolddb_v4":
table_callback = lambda ali, hits: (
_map_alphafold_hits(
kwargs["modeldb_list_file"], set(hits.uniprot_ac)
), hits
)

assert table_callback is not None, "table_callback not defined, this indicates logic error above"

# create patched SIFTS object with all model sequences, but only instantiate table
# property once sequences are found to avoid loading gigabytes of tables with pandas
s = SIFTSWithDynamicUpdate(
table_callback=table_callback,
sequence_file=kwargs["modeldb_sequence_file"]
)

# run sequence-based query against model database, always use jackhmmer for this
sifts_map = s.by_alignment(**{
**kwargs,
"pdb_alignment_method": "jackhmmer"
})

sifts_map_full = deepcopy(sifts_map)

# reduce number of structures/hits
if kwargs["model_max_num_hits"] is not None:
sifts_map.hits = sifts_map.hits.iloc[:kwargs["model_max_num_hits"]]

return sifts_map, sifts_map_full


def standard(**kwargs):
"""
Protocol:
Expand Down
35 changes: 32 additions & 3 deletions evcouplings/compare/sifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,12 @@ def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
# read hmmer hittable and simplify
hits = read_hmmer_domtbl(ar["hittable_file"])

hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1])
hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2])
try:
hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1])
hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2])
except IndexError:
hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].copy()
hits.loc[:, "uniprot_id"] = pd.NA

hits = hits.rename(
columns={
Expand Down Expand Up @@ -855,10 +859,15 @@ def _create_mapping(r):
**kwargs
)

if callable(self.table):
table, hits = self.table(ali, hits)
else:
table = self.table

# merge with internal table to identify overlap of
# aligned regions and regions with structural coverage
hits = hits.merge(
self.table, on="uniprot_ac", suffixes=("", "_")
table, on="uniprot_ac", suffixes=("", "_")
)

# add 1 to end of range since overlap function treats
Expand Down Expand Up @@ -1002,3 +1011,23 @@ def _agg_type(x):
}

return SIFTSResult(hits_grouped, mappings)


class SIFTSWithDynamicUpdate(SIFTS):
def __init__(self, table_callback, sequence_file):
"""
Version of SIFTS class that allows to dynamically
instantiate table after searching sequence file
(to avoid loading large tables)
Parameters
----------
table_callback: callable
Function that ingests alignment (evcouplings.align.Alignment) and hit table (pd.DataFrame)
and returns dynamic structure mapping table (pd.DataFrame) and hit table (pd.DataFrame)
sequence_file: str
Path to database with sequences corresponding to structures
"""
# note: do *not* call super init to pass verification of input files
self.table = table_callback
self.sequence_file = sequence_file

0 comments on commit 1e4a9b1

Please sign in to comment.