From 1ddba89622ca8337d9da2e8d63dae66105dfde67 Mon Sep 17 00:00:00 2001
From: Liza Kozlova
Date: Thu, 21 Dec 2023 13:56:07 +0000
Subject: [PATCH] chore: fix docs generation

---
 docs/data/index.html            | 186 ++++++++++----
 docs/data/torch.html            | 312 +++++++++++++++++------
 docs/index.html                 |   4 +-
 docs/metrics/index.html         | 436 +++++++++++++++++++++++++++++++-
 proteinflow/__init__.py         |   2 +-
 proteinflow/extra.py            |   2 +
 proteinflow/metrics/__init__.py |   1 +
 7 files changed, 818 insertions(+), 125 deletions(-)

diff --git a/docs/data/index.html b/docs/data/index.html
index 3b32a5e..e85f005 100644
--- a/docs/data/index.html
+++ b/docs/data/index.html
@@ -180,7 +180,7 @@

Module proteinflow.data

seqs : list of str Amino acid sequences of the protein (one-letter code) crds : list of np.ndarray - Coordinates of the protein, `'numpy'` arrays of shape `(L, 4, 3)`, + Coordinates of the protein, `numpy` arrays of shape `(L, 14, 3)`, in the order of `N, C, CA, O` masks : list of np.ndarray Mask arrays where 1 indicates residues with known coordinates and 0 @@ -267,6 +267,8 @@

Module proteinflow.data

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -650,7 +652,7 @@

Module proteinflow.data

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -1696,6 +1698,8 @@

Module proteinflow.data

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -1761,7 +1765,9 @@

Module proteinflow.data

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -1792,7 +1798,9 @@

Module proteinflow.data

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -1801,7 +1809,9 @@

Module proteinflow.data

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -2052,6 +2062,22 @@

Module proteinflow.data

u = mda.Universe(file_) writer.write(u) + def set_predict_mask(self, mask_dict): + """Set the predicted mask. + + Parameters + ---------- + mask_dict : dict + A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence + + """ + for chain in mask_dict: + if chain not in self.get_chains(): + raise PDBError("Chain not found") + if len(mask_dict[chain]) != self.get_length([chain]): + raise PDBError("Mask length does not match sequence length") + self.predict_mask = mask_dict + def apply_mask(self, mask): """Apply a mask to the protein. @@ -2921,24 +2947,6 @@

Returns

return crd, mask -
-def lru_cache() -
-
-

Make a dummy decorator.

-
- -Expand source code - -
def lru_cache():
-    """Make a dummy decorator."""
-
-    def wrapper(func):
-        return func
-
-    return wrapper
-
-
@@ -4207,7 +4215,7 @@

Parameters

seqs : list of str
Amino acid sequences of the protein (one-letter code)
crds : list of np.ndarray
-
Coordinates of the protein, 'numpy' arrays of shape (L, 4, 3), +
Coordinates of the protein, numpy arrays of shape (L, 14, 3), in the order of N, C, CA, O
masks : list of np.ndarray
Mask arrays where 1 indicates residues with known coordinates and 0 @@ -4252,7 +4260,7 @@

Parameters

seqs : list of str Amino acid sequences of the protein (one-letter code) crds : list of np.ndarray - Coordinates of the protein, `'numpy'` arrays of shape `(L, 4, 3)`, + Coordinates of the protein, `numpy` arrays of shape `(L, 14, 3)`, in the order of `N, C, CA, O` masks : list of np.ndarray Mask arrays where 1 indicates residues with known coordinates and 0 @@ -4339,6 +4347,8 @@

Parameters

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -4722,7 +4732,7 @@

Parameters

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -5768,6 +5778,8 @@

Parameters

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -5833,7 +5845,9 @@

Parameters

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -5864,7 +5878,9 @@

Parameters

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -5873,7 +5889,9 @@

Parameters

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -6124,6 +6142,22 @@

Parameters

u = mda.Universe(file_) writer.write(u) + def set_predict_mask(self, mask_dict): + """Set the predicted mask. + + Parameters + ---------- + mask_dict : dict + A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence + + """ + for chain in mask_dict: + if chain not in self.get_chains(): + raise PDBError("Chain not found") + if len(mask_dict[chain]) != self.get_length([chain]): + raise PDBError("Mask length does not match sequence length") + self.predict_mask = mask_dict + def apply_mask(self, mask): """Apply a mask to the protein. @@ -6174,21 +6208,42 @@

Class variables

Static methods

-def combine_multiple_frames(*args, **kwargs) +def combine_multiple_frames(files, output_path='combined.pdb')
-
+

Combine multiple PDB files into a single multiframe PDB file.

+

Parameters

+
+
files : list of str
+
A list of PDB or proteinflow pickle files
+
output_path : str, default 'combined.pdb'
+
Path to the .pdb output file
+
Expand source code -
def wrapper(*args, **kwargs):
-    if module_name not in sys.modules:
-        raise ImportError(
-            f"{install_name} must be installed to use this function. "
-            f"Install it with `pip install {install_name}` or together with most other optional dependencies with `pip install proteinflow[processing]`."
-        )
-    return func(*args, **kwargs)
+
@staticmethod
+@requires_extra("MDAnalysis")
+def combine_multiple_frames(files, output_path="combined.pdb"):
+    """Combine multiple PDB files into a single multiframe PDB file.
+
+    Parameters
+    ----------
+    files : list of str
+        A list of PDB or proteinflow pickle files
+    output_path : str, default 'combined.pdb'
+        Path to the .pdb output file
+
+    """
+    with mda.Writer(output_path, multiframe=True) as writer:
+        for file in files:
+            if file.endswith(".pickle"):
+                file_ = ProteinEntry.from_pickle(file)._temp_pdb_file()
+            else:
+                file_ = file
+            u = mda.Universe(file_)
+            writer.write(u)
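
A minimal usage sketch of the new signature (the input file names here are hypothetical):

    from proteinflow.data import ProteinEntry

    # Merge two PDB files and a proteinflow pickle into one multiframe PDB;
    # pickle inputs are converted to temporary PDB files internally.
    ProteinEntry.combine_multiple_frames(
        ["frame1.pdb", "frame2.pdb", "entry.pickle"],  # hypothetical inputs
        output_path="trajectory.pdb",
    )
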
@@ -6332,7 +6387,9 @@

Returns

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -6363,7 +6420,9 @@

Returns

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -6372,7 +6431,9 @@

Returns

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -7403,6 +7464,8 @@

Returns

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -7882,7 +7945,9 @@

Returns

Get the chain types of the protein.

-

If the CDRs are not annotated, this function will return None.

+

If the CDRs are not annotated, this function will return None. +If there is no light or heavy chain, the corresponding key will be missing. +If there is no antigen chain, the 'antigen' key will map to an empty list.

Parameters

chains : list of str, default None
@@ -7902,6 +7967,8 @@

Returns

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -8346,7 +8413,7 @@

Returns

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -8816,6 +8883,37 @@

Returns

return sse
+
+def set_predict_mask(self, mask_dict) +
+
+

Set the predicted mask.

+

Parameters

+
+
mask_dict : dict
+
A dictionary mapping from chain IDs to a np.ndarray mask of 0s and 1s of the same length as the chain sequence
+
+
+ +Expand source code + +
def set_predict_mask(self, mask_dict):
+    """Set the predicted mask.
+
+    Parameters
+    ----------
+    mask_dict : dict
+        A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence
+
+    """
+    for chain in mask_dict:
+        if chain not in self.get_chains():
+            raise PDBError("Chain not found")
+        if len(mask_dict[chain]) != self.get_length([chain]):
+            raise PDBError("Mask length does not match sequence length")
+    self.predict_mask = mask_dict
+
+
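
A short usage sketch for the new method, assuming an entry that contains a chain "A" (the file name and residue range are made up):

    import numpy as np
    from proteinflow.data import ProteinEntry

    entry = ProteinEntry.from_pickle("entry.pickle")  # hypothetical file
    mask = np.zeros(entry.get_length(["A"]), dtype=int)
    mask[10:25] = 1  # mark residues 10-24 of chain A as predicted
    entry.set_predict_mask({"A": mask})
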
def sidechain_coordinates(self, chains=None)
@@ -9762,7 +9860,6 @@

Index

  • Functions

  • Classes

    @@ -9839,6 +9936,7 @@

    rename_chains

  • retrieve_ligands_from_pickle
  • secondary_structure
  • +
  • set_predict_mask
  • sidechain_coordinates
  • sidechain_orientation
  • tm_score
  • diff --git a/docs/data/torch.html b/docs/data/torch.html index 214edd1..070ea79 100644 --- a/docs/data/torch.html +++ b/docs/data/torch.html @@ -158,6 +158,9 @@

    Module proteinflow.data.torch

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -214,6 +217,12 @@

    Module proteinflow.data.torch

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -244,6 +253,9 @@

    Module proteinflow.data.torch

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, @@ -339,7 +351,9 @@

    Module proteinflow.data.torch

    patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, - debug_verbose=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, ): """Initialize the dataset. @@ -410,9 +424,15 @@

    Module proteinflow.data.torch

        the size of the initial patch (used if `patch_around_mask` is `True`)
    antigen_patch_size : int, default 128
        the size of the antigen patch (used if `patch_around_mask` is `True` and the dataset is SAbDab)
+    require_antigen : bool, default False
+        if `True`, only entries with an antigen will be included (used if the dataset is SAbDab)
+    require_light_chain : bool, default False
+        if `True`, only entries with a light chain will be included (used if the dataset is SAbDab)
+    require_heavy_chain : bool, default False
+        if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab)

    """
-    self.debug = debug_verbose
+    self.debug = False

    if classes_dict_path is None:
        dataset_parent = os.path.dirname(dataset_folder)
@@ -521,51 +541,43 @@

    Module proteinflow.data.torch

                    self.files[id][chain].append(filename)
        if classes_to_exclude is None:
            classes_to_exclude = []
-        elif classes_dict_path is None:
-            raise ValueError(
-                "The classes_to_exclude parameter is not None, but classes_dict_path is None. Please provide a path to a pickled classes dictionary."
-            )
+        classes = None
+        if classes_dict_path is not None:
+            with open(classes_dict_path, "rb") as f:
+                classes = pickle.load(f)
        if clustering_dict_path is not None:
-            if entry_type == "pair":
-                classes_to_exclude = set(classes_to_exclude)
-                classes_to_exclude.add("single_chains")
-                classes_to_exclude = list(classes_to_exclude)
            with open(clustering_dict_path, "rb") as f:
                self.clusters = pickle.load(f)  # list of biounit ids by cluster id
-                try:  # old way of storing class information
-                    classes = pickle.load(f)
-                except EOFError:
-                    if len(classes_to_exclude) > 0:
-                        with open(classes_dict_path, "rb") as f:
-                            classes = pickle.load(f)
-            to_exclude = set()
+                if classes is None:  # old way of storing class information
+                    try:
+                        classes = pickle.load(f)
+                    except EOFError:
+                        pass
+        else:
+            self.clusters = None
+        if classes is None and len(classes_to_exclude) > 0:
+            raise ValueError(
+                "Classes to exclude are given but no classes dictionary is found, please set classes_dict_path to the path of the classes dictionary"
+            )
+        to_exclude = set()
+        if classes is not None:
            for c in classes_to_exclude:
                for key, id_arr in classes.get(c, {}).items():
                    for id, _ in id_arr:
                        to_exclude.add(id)
-            for key in list(self.clusters.keys()):
-                cluster_list = []
-                for x in self.clusters[key]:
-                    if x[0] in to_exclude:
-                        continue
-                    id = x[0].split(".")[0]
-                    chain = x[1]
-                    if id not in self.files:
-                        continue
-                    if chain not in self.files[id]:
-                        continue
-                    if len(self.files[id][chain]) == 0:
-                        continue
-                    cluster_list.append([id, chain])
-                self.clusters[key] = cluster_list
-                if len(self.clusters[key]) == 0:
-                    self.clusters.pop(key)
+        if require_antigen or require_light_chain or require_heavy_chain:
+            to_exclude.update(
+                self._exclude_by_chains(
+                    require_antigen, require_light_chain, require_heavy_chain
+                )
+            )
+        if self.clusters is not None:
+            self._exclude_ids_from_clusters(to_exclude)
            self.data = list(self.clusters.keys())
        else:
-            self.clusters = None
-            self.data = list(self.files.keys())
+            self.data = [x for x in self.files.keys() if x not in to_exclude]
        # create a smaller dataset if necessary (if we have clustering it's applied earlier)
-        if clustering_dict_path is None and use_fraction < 1:
+        if self.clusters is None and use_fraction < 1:
            self.data = sorted(self.data)[: int(len(self.data) * use_fraction)]
        if load_to_ram:
            print("Loading to RAM...")
@@ -585,6 +597,60 @@

    Module proteinflow.data.torch

            self.cdr = 0
        self.set_cdr(None)

+    def _exclude_ids_from_clusters(self, to_exclude):
+        for key in list(self.clusters.keys()):
+            cluster_list = []
+            for x in self.clusters[key]:
+                if x[0] in to_exclude:
+                    continue
+                id = x[0].split(".")[0]
+                chain = x[1]
+                if id not in self.files:
+                    continue
+                if chain not in self.files[id]:
+                    continue
+                if len(self.files[id][chain]) == 0:
+                    continue
+                cluster_list.append([id, chain])
+            self.clusters[key] = cluster_list
+            if len(self.clusters[key]) == 0:
+                self.clusters.pop(key)
+
+    def _check_chain_types(self, file):
+        chain_types = set()
+        with open(file, "rb") as f:
+            data = pickle.load(f)
+        chains = data["chain_dict"].values()
+        for chain in chains:
+            chain_mask = data["chain_encoding_all"] == chain
+            cdr = data["cdr"][chain_mask]
+            cdr_values = cdr.unique()
+            if len(cdr_values) == 1:
+                chain_types.add("antigen")
+            elif CDR_REVERSE["H1"] in cdr_values:
+                chain_types.add("heavy")
+            elif CDR_REVERSE["L1"] in cdr_values:
+                chain_types.add("light")
+        return chain_types
+
+    def _exclude_by_chains(
+        self, require_antigen, require_light_chain, require_heavy_chain
+    ):
+        """Exclude entries that are missing a required chain type (antigen, light or heavy)."""
+        to_exclude = set()
+        for id in self.files:
+            filename = list(self.files[id].values())[0][
+                0
+            ]  # assuming entry type is biounit
+            chain_types = self._check_chain_types(filename)
+            if require_antigen and "antigen" not in chain_types:
+                to_exclude.add(id)
+            if require_light_chain and "light" not in chain_types:
+                to_exclude.add(id)
+            if require_heavy_chain and "heavy" not in chain_types:
+                to_exclude.add(id)
+        return to_exclude
+
    def _get_masked_sequence(
        self,
        data,
@@ -732,6 +798,8 @@

    Module proteinflow.data.torch

    elif self.entry_type == "chain": chain_sets = [[x] for x in chains] elif self.entry_type == "pair": + if len(chains) == 1: + return [] chain_sets = list(combinations(chains, 2)) else: raise RuntimeError( @@ -1058,7 +1126,7 @@

    Classes

    class ProteinDataset -(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, debug_verbose=False) +(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, require_antigen=False, require_light_chain=False, require_heavy_chain=False)

    Dataset to load proteinflow data.

    @@ -1171,6 +1239,12 @@

    Parameters

    the size of the initial patch (used if patch_around_mask is True)
    antigen_patch_size : int, default 128
    the size of the antigen patch (used if patch_around_mask is True and the dataset is SAbDab)
    +
    require_antigen : bool, default False
    +
    if True, only entries with an antigen will be included (used if the dataset is SAbDab)
    +
    require_light_chain : bool, default False
    +
    if True, only entries with a light chain will be included (used if the dataset is SAbDab)
    +
    require_heavy_chain : bool, default False
    +
    if True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
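
For example, a SAbDab-style dataset that keeps only entries with both an antigen and a heavy chain could be built as follows (the folder path is a placeholder):

    from proteinflow.data.torch import ProteinDataset

    dataset = ProteinDataset(
        dataset_folder="./data/proteinflow_sabdab/",  # placeholder path
        entry_type="biounit",  # chain-type filtering assumes biounit entries
        require_antigen=True,
        require_heavy_chain=True,
    )
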
    @@ -1262,7 +1336,9 @@

    Parameters

    patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, - debug_verbose=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, ): """Initialize the dataset. @@ -1333,9 +1409,15 @@

    Parameters

        the size of the initial patch (used if `patch_around_mask` is `True`)
    antigen_patch_size : int, default 128
        the size of the antigen patch (used if `patch_around_mask` is `True` and the dataset is SAbDab)
+    require_antigen : bool, default False
+        if `True`, only entries with an antigen will be included (used if the dataset is SAbDab)
+    require_light_chain : bool, default False
+        if `True`, only entries with a light chain will be included (used if the dataset is SAbDab)
+    require_heavy_chain : bool, default False
+        if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab)

    """
-    self.debug = debug_verbose
+    self.debug = False

    if classes_dict_path is None:
        dataset_parent = os.path.dirname(dataset_folder)
@@ -1444,51 +1526,43 @@

    Parameters

                    self.files[id][chain].append(filename)
        if classes_to_exclude is None:
            classes_to_exclude = []
-        elif classes_dict_path is None:
-            raise ValueError(
-                "The classes_to_exclude parameter is not None, but classes_dict_path is None. Please provide a path to a pickled classes dictionary."
-            )
+        classes = None
+        if classes_dict_path is not None:
+            with open(classes_dict_path, "rb") as f:
+                classes = pickle.load(f)
        if clustering_dict_path is not None:
-            if entry_type == "pair":
-                classes_to_exclude = set(classes_to_exclude)
-                classes_to_exclude.add("single_chains")
-                classes_to_exclude = list(classes_to_exclude)
            with open(clustering_dict_path, "rb") as f:
                self.clusters = pickle.load(f)  # list of biounit ids by cluster id
-                try:  # old way of storing class information
-                    classes = pickle.load(f)
-                except EOFError:
-                    if len(classes_to_exclude) > 0:
-                        with open(classes_dict_path, "rb") as f:
-                            classes = pickle.load(f)
-            to_exclude = set()
+                if classes is None:  # old way of storing class information
+                    try:
+                        classes = pickle.load(f)
+                    except EOFError:
+                        pass
+        else:
+            self.clusters = None
+        if classes is None and len(classes_to_exclude) > 0:
+            raise ValueError(
+                "Classes to exclude are given but no classes dictionary is found, please set classes_dict_path to the path of the classes dictionary"
+            )
+        to_exclude = set()
+        if classes is not None:
            for c in classes_to_exclude:
                for key, id_arr in classes.get(c, {}).items():
                    for id, _ in id_arr:
                        to_exclude.add(id)
-            for key in list(self.clusters.keys()):
-                cluster_list = []
-                for x in self.clusters[key]:
-                    if x[0] in to_exclude:
-                        continue
-                    id = x[0].split(".")[0]
-                    chain = x[1]
-                    if id not in self.files:
-                        continue
-                    if chain not in self.files[id]:
-                        continue
-                    if len(self.files[id][chain]) == 0:
-                        continue
-                    cluster_list.append([id, chain])
-                self.clusters[key] = cluster_list
-                if len(self.clusters[key]) == 0:
-                    self.clusters.pop(key)
+        if require_antigen or require_light_chain or require_heavy_chain:
+            to_exclude.update(
+                self._exclude_by_chains(
+                    require_antigen, require_light_chain, require_heavy_chain
+                )
+            )
+        if self.clusters is not None:
+            self._exclude_ids_from_clusters(to_exclude)
            self.data = list(self.clusters.keys())
        else:
-            self.clusters = None
-            self.data = list(self.files.keys())
+            self.data = [x for x in self.files.keys() if x not in to_exclude]
        # create a smaller dataset if necessary (if we have clustering it's applied earlier)
-        if clustering_dict_path is None and use_fraction < 1:
+        if self.clusters is None and use_fraction < 1:
            self.data = sorted(self.data)[: int(len(self.data) * use_fraction)]
        if load_to_ram:
            print("Loading to RAM...")
@@ -1508,6 +1582,60 @@

    Parameters

            self.cdr = 0
        self.set_cdr(None)

+    def _exclude_ids_from_clusters(self, to_exclude):
+        for key in list(self.clusters.keys()):
+            cluster_list = []
+            for x in self.clusters[key]:
+                if x[0] in to_exclude:
+                    continue
+                id = x[0].split(".")[0]
+                chain = x[1]
+                if id not in self.files:
+                    continue
+                if chain not in self.files[id]:
+                    continue
+                if len(self.files[id][chain]) == 0:
+                    continue
+                cluster_list.append([id, chain])
+            self.clusters[key] = cluster_list
+            if len(self.clusters[key]) == 0:
+                self.clusters.pop(key)
+
+    def _check_chain_types(self, file):
+        chain_types = set()
+        with open(file, "rb") as f:
+            data = pickle.load(f)
+        chains = data["chain_dict"].values()
+        for chain in chains:
+            chain_mask = data["chain_encoding_all"] == chain
+            cdr = data["cdr"][chain_mask]
+            cdr_values = cdr.unique()
+            if len(cdr_values) == 1:
+                chain_types.add("antigen")
+            elif CDR_REVERSE["H1"] in cdr_values:
+                chain_types.add("heavy")
+            elif CDR_REVERSE["L1"] in cdr_values:
+                chain_types.add("light")
+        return chain_types
+
+    def _exclude_by_chains(
+        self, require_antigen, require_light_chain, require_heavy_chain
+    ):
+        """Exclude entries that are missing a required chain type (antigen, light or heavy)."""
+        to_exclude = set()
+        for id in self.files:
+            filename = list(self.files[id].values())[0][
+                0
+            ]  # assuming entry type is biounit
+            chain_types = self._check_chain_types(filename)
+            if require_antigen and "antigen" not in chain_types:
+                to_exclude.add(id)
+            if require_light_chain and "light" not in chain_types:
+                to_exclude.add(id)
+            if require_heavy_chain and "heavy" not in chain_types:
+                to_exclude.add(id)
+        return to_exclude
+
    def _get_masked_sequence(
        self,
        data,
@@ -1655,6 +1783,8 @@

    Parameters

    elif self.entry_type == "chain": chain_sets = [[x] for x in chains] elif self.entry_type == "pair": + if len(chains) == 1: + return [] chain_sets = list(combinations(chains, 2)) else: raise RuntimeError( @@ -2180,6 +2310,9 @@

    Parameters

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -2236,6 +2369,12 @@

    Parameters

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -2266,6 +2405,9 @@

    Parameters

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, @@ -2321,7 +2463,7 @@

    Class variables

    Static methods

    -def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, *args, **kwargs) ‑> None +def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, require_antigen=False, require_light_chain=False, require_heavy_chain=False, *args, **kwargs) ‑> None

    Create a ProteinLoader instance with a ProteinDataset from the given arguments.

    @@ -2376,6 +2518,12 @@

    Parameters

    if True, the ligands will be loaded from the PDB files and added to the features
    cut_edges : bool, default False
    if True, missing values at the edges of the sequence will be cut off
    +
    require_antigen : bool, default False
    +
    if True, only entries with an antigen will be included (used if the dataset is SAbDab)
    +
    require_light_chain : bool, default False
    +
    if True, only entries with a light chain will be included (used if the dataset is SAbDab)
    +
    require_heavy_chain : bool, default False
    +
    if True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
    *args
    additional arguments to torch.utils.data.DataLoader
    **kwargs
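
A sketch of building a loader with the new filters (the folder path and batch size are placeholders):

    from proteinflow.data.torch import ProteinLoader

    loader = ProteinLoader.from_args(
        dataset_folder="./data/proteinflow_sabdab/",  # placeholder path
        batch_size=8,
        require_light_chain=True,  # drop entries without a light chain
    )
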
    @@ -2411,6 +2559,9 @@

    Parameters

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -2467,6 +2618,12 @@

    Parameters

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -2497,6 +2654,9 @@

    Parameters

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, diff --git a/docs/index.html b/docs/index.html index c03055b..20a8ff1 100644 --- a/docs/index.html +++ b/docs/index.html @@ -45,7 +45,7 @@

    Installation

    docker pull adaptyvbio/proteinflow
     

    By default installing proteinflow with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with pip install proteinflow[processing] or use the docker image.

    -

    Some metric functions also have separate requirements, see the documentation for details.

    +

    Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image.

    Troubleshooting

    • If you are using python 3.10 and encountering installation problems, try running python -m pip install prody==2.4.0 before installing proteinflow.
    • @@ -269,7 +269,7 @@

      ProteinFlow Stable Releases

      By default installing `proteinflow` with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with `pip install proteinflow[processing]` or use the docker image. -Some metric functions also have separate requirements, see the documentation for details. +Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image. ### Troubleshooting - If you are using python 3.10 and encountering installation problems, try running `python -m pip install prody==2.4.0` before installing `proteinflow`. diff --git a/docs/metrics/index.html b/docs/metrics/index.html index 9bf5e8c..cae2ecf 100644 --- a/docs/metrics/index.html +++ b/docs/metrics/index.html @@ -90,7 +90,7 @@

      Module proteinflow.metrics

      score += matrix[x_before][x_after] return score - +@requires_extra("blosum") def long_repeat_num(seq, thr=5): """Calculate the number of long repeats in a sequence. @@ -445,6 +445,132 @@

      Module proteinflow.metrics

      Functions

      +
      +def ablang_pll(sequence, predict_mask, ablang_model_name='heavy', average=False) +
      +
      +

      Compute pseudo log likelihood.

      +

      Note that you need to install ablang (see https://github.com/oxpig/AbLang/tree/main).

      +

      Parameters

      +
      +
      sequence : str
      +
      Chain sequence (string of amino acid codes)
      +
      predict_mask : np.ndarray
      +
      Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
      +
      ablang_model_name : {"heavy", "light"}, default "heavy"
      +
      Name of the AbLang model to use
      +
      average : bool, default False
      +
      Whether to average the pseudo log likelihood over the residues
      +
      +

      Returns

      +
      +
      pll : float
      +
      Pseudo log likelihood
      +
      +
      + +Expand source code + +
      @requires_extra("ablang")
      +def ablang_pll(
      +    sequence,
      +    predict_mask,
      +    ablang_model_name="heavy",
      +    average=False,
      +):
      +    """Compute pseudo log likelihood.
      +
      +    Note that you need to install `ablang` (see https://github.com/oxpig/AbLang/tree/main).
      +
      +    Parameters
      +    ----------
      +    sequence : str
      +        Chain sequence (string of amino acid codes)
      +    predict_mask : np.ndarray
      +        Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
      +    ablang_model_name : {"heavy", "light"}, default "heavy"
      +        Name of the AbLang model to use
      +    average : bool, default False
      +        Whether to average the pseudo log likelihood over the residues
      +
      +    Returns
      +    -------
+    pll : float
      +        Pseudo log likelihood
      +
      +    """
      +    ablang_model = ablang.pretrained(
      +        ablang_model_name
      +    )  # Use "light" if you are working with light chains
      +    ablang_model.freeze()
      +
      +    sequences = []
      +    sequence = list(sequence)
      +    predict_idx = np.where(predict_mask)[0]
      +    for i in predict_idx:
      +        sequences.append("".join(sequence[:i]) + "*" + "".join(sequence[i + 1 :]))
      +
      +    logits = ablang_model(sequences, mode="likelihood")[:, 1:]
      +    exp_logits = np.exp(logits)
      +    prob = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
      +    true_idx = [
      +        ablang_model.tokenizer.vocab_to_token[x] - 1
      +        for x in np.array(sequence)[predict_idx]
      +    ]
      +
      +    prob = prob[range(prob.shape[0]), predict_idx, true_idx]
      +    pll = np.log(prob).sum()
      +    if average:
      +        pll /= len(predict_idx)
      +    return pll
      +
      +
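
A usage sketch (the sequence fragment and mask are made up; `ablang` must be installed):

    import numpy as np
    from proteinflow.metrics import ablang_pll

    seq = "EVQLVESGGGLVQPGGSLRLSCAAS"  # toy heavy-chain fragment
    mask = np.zeros(len(seq))
    mask[5:10] = 1  # treat residues 5-9 as the predicted region
    pll = ablang_pll(seq, mask, ablang_model_name="heavy", average=True)
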
      +
      +def blosum62_score(seq_before, seq_after) +
      +
      +

      Calculate the BLOSUM62 score between two sequences.

      +

      Parameters

      +
      +
      seq_before : str
      +
      The sequence before the mutation
      +
      seq_after : str
      +
      The sequence after the mutation
      +
      +

      Returns

      +
      +
      score : int
      +
      The BLOSUM62 score between the two sequences
      +
      +
      + +Expand source code + +
      @requires_extra("blosum")
      +def blosum62_score(seq_before, seq_after):
      +    """Calculate the BLOSUM62 score between two sequences.
      +
      +    Parameters
      +    ----------
      +    seq_before : str
      +        The sequence before the mutation
      +    seq_after : str
      +        The sequence after the mutation
      +
      +    Returns
      +    -------
      +    score : int
      +        The BLOSUM62 score between the two sequences
      +
      +    """
      +    assert len(seq_before) == len(seq_after)
      +    matrix = bl.BLOSUM(62)
      +    score = 0
      +    for x_before, x_after in zip(seq_before, seq_after):
      +        score += matrix[x_before][x_after]
      +    return score
      +
      +
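
For example, with `blosum` installed (toy sequences of equal length, differing by one substitution):

    from proteinflow.metrics import blosum62_score

    score = blosum62_score("ACDEFGH", "ACDEFGY")  # scores H vs Y at the last position
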
      def ca_rmsd(coordinates1, coordinates2)
      @@ -545,6 +671,254 @@

      Returns

      return struct.b_factor.mean()
    +
    +def esm_pll(chain_sequences, predict_masks, esm_model_name='esm2_t30_150M_UR50D', esm_model_objects=None, average=False) +
    +
    +

    Compute pseudo log likelihood.

    +

    Parameters

    +
    +
    chain_sequences : list of str
    +
    List of chain sequences (strings of amino acid codes)
    +
    predict_masks : list of np.ndarray
    +
    List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
    +
    esm_model_name : str, default "esm2_t30_150M_UR50D"
    +
    Name of the ESM-2 model to use
    +
    esm_model_objects : tuple, optional
    +
    Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, esm_model_name will be ignored)
    +
    average : bool, default False
    +
    Whether to average the pseudo log likelihood over the residues
    +
    +

    Returns

    +
    +
    pll : float
    +
    Pseudo log likelihood
    +
    +
    + +Expand source code + +
    @requires_extra("esm", install_name="fair-esm")
    +def esm_pll(
    +    chain_sequences,
    +    predict_masks,
    +    esm_model_name="esm2_t30_150M_UR50D",
    +    esm_model_objects=None,
    +    average=False,
    +):
    +    """Compute pseudo log likelihood.
    +
    +    Parameters
    +    ----------
    +    chain_sequences : list of str
    +        List of chain sequences (strings of amino acid codes)
    +    predict_masks : list of np.ndarray
    +        List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
    +    esm_model_name : str, default "esm2_t30_150M_UR50D"
    +        Name of the ESM-2 model to use
    +    esm_model_objects : tuple, optional
    +        Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, `esm_model_name` will be ignored)
    +    average : bool, default False
    +        Whether to average the pseudo log likelihood over the residues
    +
    +    Returns
    +    -------
+    pll : float
    +        Pseudo log likelihood
    +
    +    """
    +    predict_mask = []
    +    for mask in predict_masks:
    +        predict_mask.append(mask)
    +        predict_mask.append(np.zeros(2))
    +    predict_mask = np.concatenate(predict_mask, axis=0)
    +    predict_idx = np.where(predict_mask)[0]
    +    sequence = []
    +    for i, seq in enumerate(chain_sequences):
    +        sequence += list(seq)
    +        if i != len(chain_sequences) - 1:
    +            sequence += ["<eos>", "<cls>"]
    +
    +    if esm_model_objects is None:
    +        esm_model, batch_converter, tok_to_idx = _get_esm_model(esm_model_name)
    +    else:
    +        esm_model, batch_converter, tok_to_idx = esm_model_objects
    +    pll = 0
    +    for i in predict_idx:
    +        sequence_ = "".join(sequence[:i]) + "<mask>" + "".join(sequence[i + 1 :])
    +        _, _, batch_tokens = batch_converter([(0, sequence_)])
    +        if torch.cuda.is_available():
    +            batch_tokens = batch_tokens.to("cuda")
    +        with torch.no_grad():
    +            results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
    +        logits = results["logits"][0, i + 1].detach().cpu()
    +        tok_idx = tok_to_idx[sequence[i]]
    +        prob = F.softmax(logits[4:24], dim=-1)[tok_idx - 4]
    +        pll += torch.log(prob).item()
    +    if average:
    +        pll /= len(predict_idx)
    +    return pll
    +
    +
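
A usage sketch (the chain sequences and masks are made up; `fair-esm` must be installed):

    import numpy as np
    from proteinflow.metrics import esm_pll

    chain_sequences = ["MKTAYIAKQR", "GSHMLEDPAR"]  # toy chains
    predict_masks = [np.ones(10), np.zeros(10)]  # score only the first chain
    pll = esm_pll(chain_sequences, predict_masks, average=True)
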
    +
    +def esmfold_generate(sequences, filepaths=None) +
    +
    +

    Generate PDB structures using ESMFold.

    +

    Note that you need to install fair-esm with the esmfold option (see https://github.com/facebookresearch/esm/tree/main). +The model also requires > 16GB CPU and GPU memory.

    +

    Parameters

    +
    +
    sequences : list of str
    +
    List of sequences to be generated (chains separated with ':')
    +
    filepaths : list of str, default None
    +
    List of filepaths for the generated structures
    +
    +
    + +Expand source code + +
    @requires_extra("esm", install_name="fair-esm[esmfold]")
    +def esmfold_generate(sequences, filepaths=None):
    +    """Generate PDB structures using ESMFold.
    +
    +    Note that you need to install `fair-esm` with the `esmfold` option (see https://github.com/facebookresearch/esm/tree/main).
    +    The model also requires > 16GB CPU and GPU memory.
    +
    +    Parameters
    +    ----------
    +    sequences : list of str
    +        List of sequences to be generated (chains separated with `':'`)
    +    filepaths : list of str, default None
    +        List of filepaths for the generated structures
    +
    +    """
    +    assert filepaths is None or len(filepaths) == len(sequences)
    +    print("Loading the ESMFold model...")
    +    model = esm.pretrained.esmfold_v1()
    +    model = model.eval().cuda()
    +    print("Model loaded.")
    +    if filepaths is None:
    +        if not os.path.exists("esmfold_output"):
    +            os.mkdir("esmfold_output")
    +        filepaths = [
    +            os.path.join("esmfold_output", f"seq_{i}.pdb")
    +            for i in range(len(sequences))
    +        ]
    +    with torch.no_grad():
    +        for sequence, path in tqdm(zip(sequences, filepaths), total=len(sequences)):
    +            output = model.infer_pdb(sequence)
    +            with open(path, "w") as f:
    +                f.write(output)
    +
    +
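
For instance (the sequence is a toy stand-in; requires `fair-esm[esmfold]` and a large GPU):

    from proteinflow.metrics import esmfold_generate

    # Chains within one structure are separated with ':'
    esmfold_generate(
        ["MKTAYIAKQR:GSHMLEDPAR"],  # toy two-chain complex
        filepaths=["complex.pdb"],
    )
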
    +
    +def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False) +
    +
    +

    Generate PDB structures using IgFold.

    +

    Note that you need to install igfold (see https://github.com/Graylab/IgFold).

    +

    Parameters

    +
    +
    sequence_dicts : list of dict
    +
    List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +
    filepaths : list of str, optional
    +
    List of filepaths for the generated structures
    +
    use_openmm : bool, default False
    +
    Whether to use refinement with OpenMM
    +
    +
    + +Expand source code + +
    @requires_extra("igfold")
    +def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False):
    +    """Generate PDB structures using IgFold.
    +
    +    Note that you need to install `igfold` (see https://github.com/Graylab/IgFold).
    +
    +    Parameters
    +    ----------
    +    sequence_dicts : list of dict
    +        List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +    filepaths : list of str, optional
    +        List of filepaths for the generated structures
    +    use_openmm : bool, default False
    +        Whether to use refinement with OpenMM
    +
    +    """
    +    assert filepaths is None or len(filepaths) == len(sequence_dicts)
    +    igfold = IgFoldRunner()
    +    folder = "igfold_refine_output" if use_openmm else "igfold_output"
    +    if filepaths is None:
    +        if not os.path.exists(folder):
    +            os.mkdir(folder)
    +        filepaths = [
    +            os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
    +        ]
    +    for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
    +        igfold.fold(
    +            path,  # Output PDB file
    +            sequences=seqs,  # Antibody sequences
    +            do_refine=use_openmm,  # Refine the antibody structure
    +            use_openmm=use_openmm,  # Use OpenMM for refinement
    +            do_renum=False,  # Renumber predicted antibody structure (Chothia)
    +        )
    +
    +
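
A usage sketch (the sequences are toy stand-ins, far shorter than real antibody chains; `igfold` must be installed):

    from proteinflow.metrics import igfold_generate

    igfold_generate(
        [{"H": "EVQLVESGGGLVQPGG", "L": "DIQMTQSPSSLSASVG"}],  # toy sequences
        use_openmm=True,  # refine the predicted structures with OpenMM
    )
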
    +
    +def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type='antibody') +
    +
    +

    Generate PDB structures using ImmuneBuilder.

    +

Note that you need to install ImmuneBuilder (see https://github.com/oxpig/ImmuneBuilder)

    +

    Parameters

    +
    +
    sequence_dicts : list of dict
    +
    List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +
    filepaths : list of str, optional
    +
    List of filepaths for the generated structures
    +
    protein_type : {"antibody", "nanobody", "tcr"}
    +
    Type of the structure to generate
    +
    +
    + +Expand source code + +
    @requires_extra("ImmuneBuilder")
    +def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"):
    +    """Generate PDB structures using ImmuneBuilder.
    +
+    Note that you need to install `ImmuneBuilder` (see https://github.com/oxpig/ImmuneBuilder)
    +
    +    Parameters
    +    ----------
    +    sequence_dicts : list of dict
    +        List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +    filepaths : list of str, optional
    +        List of filepaths for the generated structures
+    protein_type : {"antibody", "nanobody", "tcr"}
    +        Type of the structure to generate
    +
    +    """
    +    predictor_classes = {
    +        "antibody": ABodyBuilder2,
    +        "nanobody": NanoBodyBuilder2,
    +        "tcr": TCRBuilder2,
    +    }
    +    predictor = predictor_classes[protein_type]()
    +    folder = "immunebuilder_output"
    +    if filepaths is None:
    +        if not os.path.exists(folder):
    +            os.mkdir(folder)
    +        filepaths = [
    +            os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
    +        ]
    +    for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
    +        out = predictor.predict(seqs)
    +        out.save(path)
    +
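
A usage sketch (toy sequences; `ImmuneBuilder` must be installed):

    from proteinflow.metrics import immunebuilder_generate

    immunebuilder_generate(
        [{"H": "EVQLVESGGGLVQPGG", "L": "DIQMTQSPSSLSASVG"}],  # toy sequences
        protein_type="antibody",
    )
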
    +
    def long_repeat_num(seq, thr=5)
    @@ -566,7 +940,8 @@

    Returns

    Expand source code -
    def long_repeat_num(seq, thr=5):
    +
    @requires_extra("blosum")
    +def long_repeat_num(seq, thr=5):
         """Calculate the number of long repeats in a sequence.
     
         Parameters
    @@ -594,6 +969,56 @@ 

    Returns

    return count
    +
    +def tm_score(coordinates1, coordinates2, sequence1, sequence2) +
    +
    +

    Calculate TM-score between two structures.

    +

    Parameters

    +
    +
    coordinates1 : np.ndarray
    +
    The CA coordinates array of the first structure, shaped (L, 3)
    +
    coordinates2 : np.ndarray
    +
    The CA coordinates array of the second structure, shaped (L, 3)
    +
    sequence1 : str
    +
    The sequence of the first structure
    +
    sequence2 : str
    +
    The sequence of the second structure
    +
    +

    Returns

    +
    +
    tm_score : float
    +
    The TM-score between the two structures
    +
    +
    + +Expand source code + +
    @requires_extra("tmtools")
    +def tm_score(coordinates1, coordinates2, sequence1, sequence2):
    +    """Calculate TM-score between two structures.
    +
    +    Parameters
    +    ----------
    +    coordinates1 : np.ndarray
    +        The CA coordinates array of the first structure, shaped `(L, 3)`
+    coordinates2 : np.ndarray
    +        The CA coordinates array of the second structure, shaped `(L, 3)`
    +    sequence1 : str
    +        The sequence of the first structure
    +    sequence2 : str
    +        The sequence of the second structure
    +
    +    Returns
    +    -------
    +    tm_score : float
    +        The TM-score between the two structures
    +
    +    """
    +    res = tm_align(coordinates1, coordinates2, sequence1, sequence2)
    +    return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2
    +
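
A usage sketch with random stand-in coordinates (`tmtools` must be installed):

    import numpy as np
    from proteinflow.metrics import tm_score

    n = 50
    coords1 = np.random.rand(n, 3) * 10  # stand-in CA coordinates, shape (L, 3)
    coords2 = coords1 + 0.1 * np.random.rand(n, 3)  # slightly perturbed copy
    seq = "A" * n
    score = tm_score(coords1, coords2, seq, seq)
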
    +
    @@ -617,9 +1042,16 @@

    Index

  • Functions

diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py
index ecf9afd..73ec1a1 100644
--- a/proteinflow/__init__.py
+++ b/proteinflow/__init__.py
@@ -31,7 +31,7 @@
 By default installing `proteinflow` with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with `pip install proteinflow[processing]` or use the docker image.

-Some metric functions also have separate requirements, see the documentation for details.
+Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image.

 ### Troubleshooting
 - If you are using python 3.10 and encountering installation problems, try running `python -m pip install prody==2.4.0` before installing `proteinflow`.
diff --git a/proteinflow/extra.py b/proteinflow/extra.py
index c894c8b..1d14a2b 100644
--- a/proteinflow/extra.py
+++ b/proteinflow/extra.py
@@ -6,6 +6,7 @@
     pass

 import sys
+from functools import wraps


 def requires_extra(module_name, install_name=None):
@@ -23,6 +24,7 @@ def requires_extra(module_name, install_name=None):
         install_name = module_name

     def decorator(func):
+        @wraps(func)
         def wrapper(*args, **kwargs):
             if module_name not in sys.modules:
                 raise ImportError(
diff --git a/proteinflow/metrics/__init__.py b/proteinflow/metrics/__init__.py
index 4b83e09..faa8546 100644
--- a/proteinflow/metrics/__init__.py
+++ b/proteinflow/metrics/__init__.py
@@ -61,6 +61,7 @@ def blosum62_score(seq_before, seq_after):
     return score


+@requires_extra("blosum")
 def long_repeat_num(seq, thr=5):
     """Calculate the number of long repeats in a sequence.
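
The `@wraps(func)` addition above is what fixes the docs generation: without it, pdoc documents the generic `wrapper(*args, **kwargs)` instead of the decorated function, which is how the stale `combine_multiple_frames(*args, **kwargs)` signature ended up in the HTML. A minimal illustration (the decorated function is hypothetical):

    from proteinflow.extra import requires_extra

    @requires_extra("tmtools")
    def toy_metric(x):
        """Return x unchanged (stand-in for a metric that needs tmtools)."""
        return x

    # With @wraps in place, the wrapper preserves metadata, so pdoc
    # renders the real signature and docstring:
    print(toy_metric.__name__)  # -> "toy_metric", not "wrapper"
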