diff --git a/docs/data/index.html b/docs/data/index.html
index 3b32a5e..e85f005 100644
--- a/docs/data/index.html
+++ b/docs/data/index.html
@@ -180,7 +180,7 @@
proteinflow.data
-def lru_cache()
-
-Make a dummy decorator.
-
-def lru_cache():
-    """Make a dummy decorator."""
-
-    def wrapper(func):
-        return func
-
-    return wrapper
seqs : list of str
crds : list of np.ndarray
-    'numpy' arrays of shape (L, 4, 3),
+    numpy arrays of shape (L, 14, 3),
    in the order of N, C, CA, O
masks : list of np.ndarray
-def combine_multiple_frames(*args, **kwargs)
+def combine_multiple_frames(files, output_path='combined.pdb')

Combine multiple PDB files into a single multiframe PDB file.

+files : list of str
output_path : str, default 'combined.pdb'
def wrapper(*args, **kwargs):
-    if module_name not in sys.modules:
-        raise ImportError(
-            f"{install_name} must be installed to use this function. "
-            f"Install it with `pip install {install_name}` or together with most other optional dependencies with `pip install proteinflow[processing]`."
-        )
-    return func(*args, **kwargs)
+@staticmethod
+@requires_extra("MDAnalysis")
+def combine_multiple_frames(files, output_path="combined.pdb"):
+    """Combine multiple PDB files into a single multiframe PDB file.
+
+    Parameters
+    ----------
+    files : list of str
+        A list of PDB or proteinflow pickle files
+    output_path : str, default 'combined.pdb'
+        Path to the .pdb output file
+
+    """
+    with mda.Writer(output_path, multiframe=True) as writer:
+        for file in files:
+            if file.endswith(".pickle"):
+                file_ = ProteinEntry.from_pickle(file)._temp_pdb_file()
+            else:
+                file_ = file
+            u = mda.Universe(file_)
+            writer.write(u)
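A minimal usage sketch (assuming the method is exposed on ProteinEntry, as the surrounding entries suggest; the input filenames are hypothetical):

# Merge two PDB frames and a pickled proteinflow entry into one trajectory.
ProteinEntry.combine_multiple_frames(
    ["frame1.pdb", "frame2.pdb", "entry.pickle"],
    output_path="combined.pdb",
)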
@@ -6332,7 +6387,9 @@ Returns
        [
            x
            for x in entry.get_chains()
-            if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody
+            if not entry.has_cdr()
+            or not only_antibody
+            or x not in entry.get_chain_type_dict()["antigen"]
        ]
        for entry in entries
    ]
@@ -6363,7 +6420,9 @@ Returns
    chains = [
        x
        for x in entry.get_chains()
-        if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody
+        if not entry.has_cdr()
+        or not only_antibody
+        or x not in entry.get_chain_type_dict()["antigen"]
    ]
    esm_entry = ProteinEntry.from_pdb(path)
    chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)}
@@ -6372,7 +6431,9 @@ Returns
    esm_entry.align_structure(
        reference_pdb_path=temp_file,
        save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb",
-        chain_ids=entry.get_predicted_chains(),
+        chain_ids=entry.get_predicted_chains()
+        if entry.has_predict_mask()
+        else chains,
    )
    rmsds.append(
        entry.ca_rmsd(
@@ -7403,6 +7464,8 @@ Returns
        The CA RMSD between the two proteins

    """
+    if only_predicted and not self.has_predict_mask():
+        only_predicted = False
    chains = [x for x in self.get_chains() if x in entry.get_chains()]
    structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2]
    structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2]
@@ -7882,7 +7945,9 @@ Returns
Get the chain types of the protein.

If the CDRs are not annotated, this function will return None.
+If there is no light or heavy chain, the corresponding key will be missing.
+If there is no antigen chain, the 'antigen' key will map to an empty list.

chains : list of str, default None
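For illustration, a plausible return value for a nanobody-antigen entry (the exact chain IDs and value types here are assumptions based on the description above):

entry.get_chain_type_dict()
# e.g. {"heavy": "H", "antigen": ["C"]}; no "light" key, since a nanobody has no light chain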
+def set_predict_mask(self, mask_dict)
+
+Set the predicted mask.
+
+mask_dict : dict
+    A dictionary mapping from chain IDs to a np.ndarray mask of 0s and 1s of the same length as the chain sequence

+def set_predict_mask(self, mask_dict):
+    """Set the predicted mask.
+
+    Parameters
+    ----------
+    mask_dict : dict
+        A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence
+
+    """
+    for chain in mask_dict:
+        if chain not in self.get_chains():
+            raise PDBError("Chain not found")
+        if len(mask_dict[chain]) != self.get_length([chain]):
+            raise PDBError("Mask length does not match sequence length")
+    self.predict_mask = mask_dict
+
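A short usage sketch (the chain ID "A" is illustrative; get_length is used as in the method body above):

import numpy as np

# Mark the last 10 residues of chain "A" as predicted.
mask = np.zeros(entry.get_length(["A"]))
mask[-10:] = 1
entry.set_predict_mask({"A": mask})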
def sidechain_coordinates(self, chains=None)
rename_chains
retrieve_ligands_from_pickle
secondary_structure
set_predict_mask
sidechain_coordinates
sidechain_orientation
tm_score
proteinflow.data.torch
class ProteinDataset
-(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, debug_verbose=False)
+(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, require_antigen=False, require_light_chain=False, require_heavy_chain=False)
Dataset to load proteinflow data.
@@ -1171,6 +1239,12 @@
    (used if patch_around_mask is True)
antigen_patch_size : int, default 128
    (used if patch_around_mask is True and the dataset is SAbDab)
+require_antigen : bool, default False
+    If True, only entries with an antigen will be included (used if the dataset is SAbDab)
+require_light_chain : bool, default False
+    If True, only entries with a light chain will be included (used if the dataset is SAbDab)
+require_heavy_chain : bool, default False
+    If True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
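A minimal construction sketch using the new filters (the dataset path is hypothetical):

# Keep only SAbDab entries that have both a heavy chain and an antigen.
dataset = ProteinDataset(
    "data/sabdab/train",
    require_antigen=True,
    require_heavy_chain=True,
)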
-def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, *args, **kwargs) ‑> None
+def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, require_antigen=False, require_light_chain=False, require_heavy_chain=False, *args, **kwargs) ‑> None
Create a ProteinLoader instance with a ProteinDataset from the given arguments.

load_ligands : bool, default False
    If True, the ligands will be loaded from the PDB files and added to the features
cut_edges : bool, default False
    If True, missing values at the edges of the sequence will be cut off
+require_antigen : bool, default False
+    If True, only entries with an antigen will be included (used if the dataset is SAbDab)
+require_light_chain : bool, default False
+    If True, only entries with a light chain will be included (used if the dataset is SAbDab)
+require_heavy_chain : bool, default False
+    If True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
*args
    Additional arguments to pass to torch.utils.data.DataLoader
**kwargs
    Additional keyword arguments to pass to torch.utils.data.DataLoader
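A usage sketch (assuming from_args is exposed on ProteinLoader, as the description says; the folder path is hypothetical):

loader = ProteinLoader.from_args(
    "data/sabdab/train",
    require_light_chain=True,
    batch_size=8,  # forwarded to torch.utils.data.DataLoader
)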
docker pull adaptyvbio/proteinflow
By default, installing proteinflow with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with pip install proteinflow[processing] or use the docker image.
-Some metric functions also have separate requirements, see the documentation for details.
+Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image.
python -m pip install prody==2.4.0
before installing proteinflow.
proteinflow.metrics
+def ablang_pll(sequence, predict_mask, ablang_model_name='heavy', average=False)
+
+Compute pseudo log likelihood.
+
+Note that you need to install ablang (see https://github.com/oxpig/AbLang/tree/main).
+
+sequence : str
+predict_mask : np.ndarray
+ablang_model_name : {"heavy", "light"}, default "heavy"
+average : bool, default False
+
+Returns
+pll : float
@requires_extra("ablang")
+def ablang_pll(
+ sequence,
+ predict_mask,
+ ablang_model_name="heavy",
+ average=False,
+):
+ """Compute pseudo log likelihood.
+
+ Note that you need to install `ablang` (see https://github.com/oxpig/AbLang/tree/main).
+
+ Parameters
+ ----------
+ sequence : str
+ Chain sequence (string of amino acid codes)
+ predict_mask : np.ndarray
+ Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
+ ablang_model_name : {"heavy", "light"}, default "heavy"
+ Name of the AbLang model to use
+ average : bool, default False
+ Whether to average the pseudo log likelihood over the residues
+
+ Returns
+ -------
+ pll: float
+ Pseudo log likelihood
+
+ """
+ ablang_model = ablang.pretrained(
+ ablang_model_name
+ ) # Use "light" if you are working with light chains
+ ablang_model.freeze()
+
+ sequences = []
+ sequence = list(sequence)
+ predict_idx = np.where(predict_mask)[0]
+ for i in predict_idx:
+ sequences.append("".join(sequence[:i]) + "*" + "".join(sequence[i + 1 :]))
+
+ logits = ablang_model(sequences, mode="likelihood")[:, 1:]
+ exp_logits = np.exp(logits)
+ prob = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
+ true_idx = [
+ ablang_model.tokenizer.vocab_to_token[x] - 1
+ for x in np.array(sequence)[predict_idx]
+ ]
+
+ prob = prob[range(prob.shape[0]), predict_idx, true_idx]
+ pll = np.log(prob).sum()
+ if average:
+ pll /= len(predict_idx)
+ return pll
+
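A usage sketch (the sequence is an arbitrary illustrative heavy-chain fragment):

import numpy as np

seq = "EVQLVESGGGLVQPGGSLRLSCAAS"
mask = np.zeros(len(seq))
mask[-5:] = 1  # score only the last five residues
pll = ablang_pll(seq, mask, ablang_model_name="heavy", average=True)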
+def blosum62_score(seq_before, seq_after)
+
+Calculate the BLOSUM62 score between two sequences.
+
+seq_before : str
+seq_after : str
+
+Returns
+score : int
@requires_extra("blosum")
+def blosum62_score(seq_before, seq_after):
+ """Calculate the BLOSUM62 score between two sequences.
+
+ Parameters
+ ----------
+ seq_before : str
+ The sequence before the mutation
+ seq_after : str
+ The sequence after the mutation
+
+ Returns
+ -------
+ score : int
+ The BLOSUM62 score between the two sequences
+
+ """
+ assert len(seq_before) == len(seq_after)
+ matrix = bl.BLOSUM(62)
+ score = 0
+ for x_before, x_after in zip(seq_before, seq_after):
+ score += matrix[x_before][x_after]
+ return score
+
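A usage sketch with arbitrary equal-length sequences:

# Single point mutation K -> R; identical positions also contribute
# their diagonal BLOSUM62 scores.
score = blosum62_score("ACDKF", "ACDRF")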
def ca_rmsd(coordinates1, coordinates2)
+def esm_pll(chain_sequences, predict_masks, esm_model_name='esm2_t30_150M_UR50D', esm_model_objects=None, average=False)
+
+Compute pseudo log likelihood.
+
+chain_sequences : list of str
+predict_masks : list of np.ndarray
+esm_model_name : str, default "esm2_t30_150M_UR50D"
+esm_model_objects : tuple, optional
+    Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, esm_model_name will be ignored)
+average : bool, default False
+
+Returns
+pll : float
@requires_extra("esm", install_name="fair-esm")
+def esm_pll(
+ chain_sequences,
+ predict_masks,
+ esm_model_name="esm2_t30_150M_UR50D",
+ esm_model_objects=None,
+ average=False,
+):
+ """Compute pseudo log likelihood.
+
+ Parameters
+ ----------
+ chain_sequences : list of str
+ List of chain sequences (strings of amino acid codes)
+ predict_masks : list of np.ndarray
+ List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
+ esm_model_name : str, default "esm2_t30_150M_UR50D"
+ Name of the ESM-2 model to use
+ esm_model_objects : tuple, optional
+ Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, `esm_model_name` will be ignored)
+ average : bool, default False
+ Whether to average the pseudo log likelihood over the residues
+
+ Returns
+ -------
+ pll: float
+ Pseudo log likelihood
+
+ """
+ predict_mask = []
+ for mask in predict_masks:
+ predict_mask.append(mask)
+ predict_mask.append(np.zeros(2))
+ predict_mask = np.concatenate(predict_mask, axis=0)
+ predict_idx = np.where(predict_mask)[0]
+ sequence = []
+ for i, seq in enumerate(chain_sequences):
+ sequence += list(seq)
+ if i != len(chain_sequences) - 1:
+ sequence += ["<eos>", "<cls>"]
+
+ if esm_model_objects is None:
+ esm_model, batch_converter, tok_to_idx = _get_esm_model(esm_model_name)
+ else:
+ esm_model, batch_converter, tok_to_idx = esm_model_objects
+ pll = 0
+ for i in predict_idx:
+ sequence_ = "".join(sequence[:i]) + "<mask>" + "".join(sequence[i + 1 :])
+ _, _, batch_tokens = batch_converter([(0, sequence_)])
+ if torch.cuda.is_available():
+ batch_tokens = batch_tokens.to("cuda")
+ with torch.no_grad():
+ results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
+ logits = results["logits"][0, i + 1].detach().cpu()
+ tok_idx = tok_to_idx[sequence[i]]
+ prob = F.softmax(logits[4:24], dim=-1)[tok_idx - 4]
+ pll += torch.log(prob).item()
+ if average:
+ pll /= len(predict_idx)
+ return pll
+
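A usage sketch with arbitrary illustrative sequences:

import numpy as np

# Score every residue of the first chain and none of the second.
chain1, chain2 = "MKTAYIAKQR", "GSHMLEDPVE"
masks = [np.ones(len(chain1)), np.zeros(len(chain2))]
pll = esm_pll([chain1, chain2], masks, average=True)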
+def esmfold_generate(sequences, filepaths=None)
+
+Generate PDB structures using ESMFold.
+
+Note that you need to install fair-esm with the esmfold option (see https://github.com/facebookresearch/esm/tree/main).
+The model also requires > 16GB CPU and GPU memory.
+
+sequences : list of str
+    List of sequences to be generated (chains separated with ':')
+filepaths : list of str, default None
@requires_extra("esm", install_name="fair-esm[esmfold]")
+def esmfold_generate(sequences, filepaths=None):
+ """Generate PDB structures using ESMFold.
+
+ Note that you need to install `fair-esm` with the `esmfold` option (see https://github.com/facebookresearch/esm/tree/main).
+ The model also requires > 16GB CPU and GPU memory.
+
+ Parameters
+ ----------
+ sequences : list of str
+ List of sequences to be generated (chains separated with `':'`)
+ filepaths : list of str, default None
+ List of filepaths for the generated structures
+
+ """
+ assert filepaths is None or len(filepaths) == len(sequences)
+ print("Loading the ESMFold model...")
+ model = esm.pretrained.esmfold_v1()
+ model = model.eval().cuda()
+ print("Model loaded.")
+ if filepaths is None:
+ if not os.path.exists("esmfold_output"):
+ os.mkdir("esmfold_output")
+ filepaths = [
+ os.path.join("esmfold_output", f"seq_{i}.pdb")
+ for i in range(len(sequences))
+ ]
+ with torch.no_grad():
+ for sequence, path in tqdm(zip(sequences, filepaths), total=len(sequences)):
+ output = model.infer_pdb(sequence)
+ with open(path, "w") as f:
+ f.write(output)
+
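A usage sketch (the sequence is an arbitrary illustrative two-chain complex):

# Chains are separated with ':'; output paths are hypothetical.
esmfold_generate(
    ["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ:GSHMLEDP"],
    filepaths=["complex.pdb"],
)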
+def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False)
+
+Generate PDB structures using IgFold.
+
+Note that you need to install igfold (see https://github.com/Graylab/IgFold).
+
+sequence_dicts : list of dict
+filepaths : list of str, optional
+use_openmm : bool, default False
@requires_extra("igfold")
+def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False):
+ """Generate PDB structures using IgFold.
+
+ Note that you need to install `igfold` (see https://github.com/Graylab/IgFold).
+
+ Parameters
+ ----------
+ sequence_dicts : list of dict
+ List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
+ filepaths : list of str, optional
+ List of filepaths for the generated structures
+ use_openmm : bool, default False
+ Whether to use refinement with OpenMM
+
+ """
+ assert filepaths is None or len(filepaths) == len(sequence_dicts)
+ igfold = IgFoldRunner()
+ folder = "igfold_refine_output" if use_openmm else "igfold_output"
+ if filepaths is None:
+ if not os.path.exists(folder):
+ os.mkdir(folder)
+ filepaths = [
+ os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
+ ]
+ for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
+ igfold.fold(
+ path, # Output PDB file
+ sequences=seqs, # Antibody sequences
+ do_refine=use_openmm, # Refine the antibody structure
+ use_openmm=use_openmm, # Use OpenMM for refinement
+ do_renum=False, # Renumber predicted antibody structure (Chothia)
+ )
+
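A usage sketch (the truncated sequences and output path are illustrative):

heavy_seq = "EVQLVESGGGLVQPGGSLRLSCAAS"
light_seq = "DIQMTQSPSSLSASVGDRVTITCRAS"
igfold_generate(
    [{"H": heavy_seq, "L": light_seq}],
    filepaths=["antibody.pdb"],
    use_openmm=False,
)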
+def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type='antibody')
+
+Generate PDB structures using ImmuneBuilder.
+
+Note that you need to install immunebuilder (see https://github.com/oxpig/ImmuneBuilder).
+
+sequence_dicts : list of dict
+filepaths : list of str, optional
+protein_type : {"antibody", "nanobody", "tcr"}
@requires_extra("ImmuneBuilder")
+def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"):
+ """Generate PDB structures using ImmuneBuilder.
+
+ Note that you need to install `immunebuilder` (see https://github.com/oxpig/ImmuneBuilder)
+
+ Parameters
+ ----------
+ sequence_dicts : list of dict
+ List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
+ filepaths : list of str, optional
+ List of filepaths for the generated structures
+ protein_type: {"antibody", "nanobody", "tcr"}
+ Type of the structure to generate
+
+ """
+ predictor_classes = {
+ "antibody": ABodyBuilder2,
+ "nanobody": NanoBodyBuilder2,
+ "tcr": TCRBuilder2,
+ }
+ predictor = predictor_classes[protein_type]()
+ folder = "immunebuilder_output"
+ if filepaths is None:
+ if not os.path.exists(folder):
+ os.mkdir(folder)
+ filepaths = [
+ os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
+ ]
+ for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
+ out = predictor.predict(seqs)
+ out.save(path)
+
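A usage sketch (the sequence and output path are illustrative):

heavy_seq = "EVQLVESGGGLVQPGGSLRLSCAAS"
# A nanobody has a heavy chain only, so the "L" key is omitted.
immunebuilder_generate(
    [{"H": heavy_seq}],
    filepaths=["nanobody.pdb"],
    protein_type="nanobody",
)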
def long_repeat_num(seq, thr=5)

-def long_repeat_num(seq, thr=5):
+@requires_extra("blosum")
+def long_repeat_num(seq, thr=5):
    """Calculate the number of long repeats in a sequence.
Parameters
@@ -594,6 +969,56 @@ Returns
    return count
+
+def tm_score(coordinates1, coordinates2, sequence1, sequence2)
+
+Calculate TM-score between two structures.
+
+Parameters
+
+coordinates1 : np.ndarray
+    The CA coordinates array of the first structure, shaped (L, 3)
+coordinates2 : np.ndarray
+    The CA coordinates array of the second structure, shaped (L, 3)
+sequence1 : str
+    The sequence of the first structure
+sequence2 : str
+    The sequence of the second structure
+
+Returns
+
+tm_score : float
+    The TM-score between the two structures
+@requires_extra("tmtools")
+def tm_score(coordinates1, coordinates2, sequence1, sequence2):
+ """Calculate TM-score between two structures.
+
+ Parameters
+ ----------
+ coordinates1 : np.ndarray
+ The CA coordinates array of the first structure, shaped `(L, 3)`
+ coordinates2 : ProteinEntry
+ The CA coordinates array of the second structure, shaped `(L, 3)`
+ sequence1 : str
+ The sequence of the first structure
+ sequence2 : str
+ The sequence of the second structure
+
+ Returns
+ -------
+ tm_score : float
+ The TM-score between the two structures
+
+ """
+ res = tm_align(coordinates1, coordinates2, sequence1, sequence2)
+ return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2
+
+
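A usage sketch (the file paths are hypothetical, and the get_sequence call is an assumption; the CA slice follows the [:, 2] indexing used in ca_rmsd above):

entry1 = ProteinEntry.from_pdb("structure1.pdb")
entry2 = ProteinEntry.from_pdb("structure2.pdb")
coords1 = entry1.get_coordinates(only_known=True)[:, 2]  # CA atoms
coords2 = entry2.get_coordinates(only_known=True)[:, 2]
score = tm_score(coords1, coords2, entry1.get_sequence(), entry2.get_sequence())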