From 1ddba89622ca8337d9da2e8d63dae66105dfde67 Mon Sep 17 00:00:00 2001
From: Liza Kozlova
Date: Thu, 21 Dec 2023 13:56:07 +0000
Subject: [PATCH] chore: fix docs generation

---
 docs/data/index.html            | 186 ++++++++++----
 docs/data/torch.html            | 312 +++++++++++++++++------
 docs/index.html                 |   4 +-
 docs/metrics/index.html         | 436 +++++++++++++++++++++++++++++++-
 proteinflow/__init__.py         |   2 +-
 proteinflow/extra.py            |   2 +
 proteinflow/metrics/__init__.py |   1 +
 7 files changed, 818 insertions(+), 125 deletions(-)

diff --git a/docs/data/index.html b/docs/data/index.html
index 3b32a5e..e85f005 100644
--- a/docs/data/index.html
+++ b/docs/data/index.html
@@ -180,7 +180,7 @@

Module proteinflow.data

seqs : list of str Amino acid sequences of the protein (one-letter code) crds : list of np.ndarray - Coordinates of the protein, `'numpy'` arrays of shape `(L, 4, 3)`, + Coordinates of the protein, `numpy` arrays of shape `(L, 14, 3)`, in the order of `N, C, CA, O` masks : list of np.ndarray Mask arrays where 1 indicates residues with known coordinates and 0 @@ -267,6 +267,8 @@

Module proteinflow.data

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -650,7 +652,7 @@

Module proteinflow.data

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -1696,6 +1698,8 @@

Module proteinflow.data

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -1761,7 +1765,9 @@

Module proteinflow.data

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -1792,7 +1798,9 @@

Module proteinflow.data

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -1801,7 +1809,9 @@

Module proteinflow.data

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -2052,6 +2062,22 @@

Module proteinflow.data

u = mda.Universe(file_) writer.write(u) + def set_predict_mask(self, mask_dict): + """Set the predicted mask. + + Parameters + ---------- + mask_dict : dict + A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence + + """ + for chain in mask_dict: + if chain not in self.get_chains(): + raise PDBError("Chain not found") + if len(mask_dict[chain]) != self.get_length([chain]): + raise PDBError("Mask length does not match sequence length") + self.predict_mask = mask_dict + def apply_mask(self, mask): """Apply a mask to the protein. @@ -2921,24 +2947,6 @@

Returns

return crd, mask -
-def lru_cache() -
-
-

Make a dummy decorator.

-
- -Expand source code - -
def lru_cache():
-    """Make a dummy decorator."""
-
-    def wrapper(func):
-        return func
-
-    return wrapper
-
-
@@ -4207,7 +4215,7 @@

Parameters

seqs : list of str
Amino acid sequences of the protein (one-letter code)
crds : list of np.ndarray
-
Coordinates of the protein, 'numpy' arrays of shape (L, 4, 3), +
Coordinates of the protein, numpy arrays of shape (L, 14, 3), in the order of N, C, CA, O
masks : list of np.ndarray
Mask arrays where 1 indicates residues with known coordinates and 0 @@ -4252,7 +4260,7 @@

Parameters

seqs : list of str Amino acid sequences of the protein (one-letter code) crds : list of np.ndarray - Coordinates of the protein, `'numpy'` arrays of shape `(L, 4, 3)`, + Coordinates of the protein, `numpy` arrays of shape `(L, 14, 3)`, in the order of `N, C, CA, O` masks : list of np.ndarray Mask arrays where 1 indicates residues with known coordinates and 0 @@ -4339,6 +4347,8 @@

Parameters

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -4722,7 +4732,7 @@

Parameters

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -5768,6 +5778,8 @@

Parameters

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -5833,7 +5845,9 @@

Parameters

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -5864,7 +5878,9 @@

Parameters

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -5873,7 +5889,9 @@

Parameters

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -6124,6 +6142,22 @@

Parameters

u = mda.Universe(file_) writer.write(u) + def set_predict_mask(self, mask_dict): + """Set the predicted mask. + + Parameters + ---------- + mask_dict : dict + A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence + + """ + for chain in mask_dict: + if chain not in self.get_chains(): + raise PDBError("Chain not found") + if len(mask_dict[chain]) != self.get_length([chain]): + raise PDBError("Mask length does not match sequence length") + self.predict_mask = mask_dict + def apply_mask(self, mask): """Apply a mask to the protein. @@ -6174,21 +6208,42 @@

Class variables

Static methods

-def combine_multiple_frames(*args, **kwargs) +def combine_multiple_frames(files, output_path='combined.pdb')
-
+

Combine multiple PDB files into a single multiframe PDB file.

+

Parameters

+
+
files : list of str
+
A list of PDB or proteinflow pickle files
+
output_path : str, default 'combined.pdb'
+
Path to the .pdb output file
+
Expand source code -
def wrapper(*args, **kwargs):
-    if module_name not in sys.modules:
-        raise ImportError(
-            f"{install_name} must be installed to use this function. "
-            f"Install it with `pip install {install_name}` or together with most other optional dependencies with `pip install proteinflow[processing]`."
-        )
-    return func(*args, **kwargs)
+
@staticmethod
+@requires_extra("MDAnalysis")
+def combine_multiple_frames(files, output_path="combined.pdb"):
+    """Combine multiple PDB files into a single multiframe PDB file.
+
+    Parameters
+    ----------
+    files : list of str
+        A list of PDB or proteinflow pickle files
+    output_path : str, default 'combined.pdb'
+        Path to the .pdb output file
+
+    """
+    with mda.Writer(output_path, multiframe=True) as writer:
+        for file in files:
+            if file.endswith(".pickle"):
+                file_ = ProteinEntry.from_pickle(file)._temp_pdb_file()
+            else:
+                file_ = file
+            u = mda.Universe(file_)
+            writer.write(u)
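
A minimal usage sketch of the new signature (the input file names here are hypothetical):

    from proteinflow.data import ProteinEntry

    # Merge two PDB files and a proteinflow pickle into one multiframe PDB;
    # pickle inputs are converted to temporary PDB files internally.
    ProteinEntry.combine_multiple_frames(
        ["frame1.pdb", "frame2.pdb", "entry.pickle"],  # hypothetical inputs
        output_path="trajectory.pdb",
    )
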
@@ -6332,7 +6387,9 @@

Returns

[ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] for entry in entries ] @@ -6363,7 +6420,9 @@

Returns

chains = [ x for x in entry.get_chains() - if x not in entry.get_chain_type_dict()["antigen"] or not only_antibody + if not entry.has_cdr() + or not only_antibody + or x not in entry.get_chain_type_dict()["antigen"] ] esm_entry = ProteinEntry.from_pdb(path) chain_rename_dict = {k: v for k, v in zip(string.ascii_uppercase, chains)} @@ -6372,7 +6431,9 @@

Returns

esm_entry.align_structure( reference_pdb_path=temp_file, save_pdb_path=path.rsplit(".", 1)[0] + "_aligned.pdb", - chain_ids=entry.get_predicted_chains(), + chain_ids=entry.get_predicted_chains() + if entry.has_predict_mask() + else chains, ) rmsds.append( entry.ca_rmsd( @@ -7403,6 +7464,8 @@

Returns

The CA RMSD between the two proteins """ + if only_predicted and not self.has_predict_mask(): + only_predicted = False chains = [x for x in self.get_chains() if x in entry.get_chains()] structure1 = self.get_coordinates(only_known=True, chains=chains)[:, 2] structure2 = entry.get_coordinates(only_known=True, chains=chains)[:, 2] @@ -7882,7 +7945,9 @@

Returns

Get the chain types of the protein.

-

If the CDRs are not annotated, this function will return None.

+

If the CDRs are not annotated, this function will return None. +If there is no light or heavy chain, the corresponding key will be missing. +If there is no antigen chain, the 'antigen' key will map to an empty list.

Parameters

chains : list of str, default None
@@ -7902,6 +7967,8 @@

Returns

"""Get the chain types of the protein. If the CDRs are not annotated, this function will return `None`. + If there is no light or heavy chain, the corresponding key will be missing. + If there is no antigen chain, the `'antigen'` key will map to an empty list. Parameters ---------- @@ -8346,7 +8413,7 @@

Returns

Chain IDs """ - if self.predict_mask is None: + if not self.has_predict_mask(): raise ValueError("Predicted mask not available") return [k for k, v in self.predict_mask.items() if v.sum() != 0] @@ -8816,6 +8883,37 @@

Returns

return sse
+
+def set_predict_mask(self, mask_dict) +
+
+

Set the predicted mask.

+

Parameters

+
+
mask_dict : dict
+
A dictionary mapping from chain IDs to a np.ndarray mask of 0s and 1s of the same length as the chain sequence
+
+
+ +Expand source code + +
def set_predict_mask(self, mask_dict):
+    """Set the predicted mask.
+
+    Parameters
+    ----------
+    mask_dict : dict
+        A dictionary mapping from chain IDs to a `np.ndarray` mask of 0s and 1s of the same length as the chain sequence
+
+    """
+    for chain in mask_dict:
+        if chain not in self.get_chains():
+            raise PDBError("Chain not found")
+        if len(mask_dict[chain]) != self.get_length([chain]):
+            raise PDBError("Mask length does not match sequence length")
+    self.predict_mask = mask_dict
+
+
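
A short usage sketch for the new method, assuming an entry that contains a chain "A" (the file name and residue range are made up):

    import numpy as np
    from proteinflow.data import ProteinEntry

    entry = ProteinEntry.from_pickle("entry.pickle")  # hypothetical file
    mask = np.zeros(entry.get_length(["A"]), dtype=int)
    mask[10:25] = 1  # mark residues 10-24 of chain A as predicted
    entry.set_predict_mask({"A": mask})
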
def sidechain_coordinates(self, chains=None)
@@ -9762,7 +9860,6 @@

Index

  • Functions

  • Classes

    @@ -9839,6 +9936,7 @@

    rename_chains

  • retrieve_ligands_from_pickle
  • secondary_structure
  • +
  • set_predict_mask
  • sidechain_coordinates
  • sidechain_orientation
  • tm_score
  • diff --git a/docs/data/torch.html b/docs/data/torch.html index 214edd1..070ea79 100644 --- a/docs/data/torch.html +++ b/docs/data/torch.html @@ -158,6 +158,9 @@

    Module proteinflow.data.torch

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -214,6 +217,12 @@

    Module proteinflow.data.torch

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -244,6 +253,9 @@

    Module proteinflow.data.torch

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, @@ -339,7 +351,9 @@

    Module proteinflow.data.torch

    patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, - debug_verbose=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, ): """Initialize the dataset. @@ -410,9 +424,15 @@

    Module proteinflow.data.torch

        the size of the initial patch (used if `patch_around_mask` is `True`)
    antigen_patch_size : int, default 128
        the size of the antigen patch (used if `patch_around_mask` is `True` and the dataset is SAbDab)
+    require_antigen : bool, default False
+        if `True`, only entries with an antigen will be included (used if the dataset is SAbDab)
+    require_light_chain : bool, default False
+        if `True`, only entries with a light chain will be included (used if the dataset is SAbDab)
+    require_heavy_chain : bool, default False
+        if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab)

    """
-    self.debug = debug_verbose
+    self.debug = False

    if classes_dict_path is None:
        dataset_parent = os.path.dirname(dataset_folder)
@@ -521,51 +541,43 @@

    Module proteinflow.data.torch

                    self.files[id][chain].append(filename)
        if classes_to_exclude is None:
            classes_to_exclude = []
-        elif classes_dict_path is None:
-            raise ValueError(
-                "The classes_to_exclude parameter is not None, but classes_dict_path is None. Please provide a path to a pickled classes dictionary."
-            )
+        classes = None
+        if classes_dict_path is not None:
+            with open(classes_dict_path, "rb") as f:
+                classes = pickle.load(f)
        if clustering_dict_path is not None:
-            if entry_type == "pair":
-                classes_to_exclude = set(classes_to_exclude)
-                classes_to_exclude.add("single_chains")
-                classes_to_exclude = list(classes_to_exclude)
            with open(clustering_dict_path, "rb") as f:
                self.clusters = pickle.load(f)  # list of biounit ids by cluster id
-                try:  # old way of storing class information
-                    classes = pickle.load(f)
-                except EOFError:
-                    if len(classes_to_exclude) > 0:
-                        with open(classes_dict_path, "rb") as f:
-                            classes = pickle.load(f)
-            to_exclude = set()
+                if classes is None:  # old way of storing class information
+                    try:
+                        classes = pickle.load(f)
+                    except EOFError:
+                        pass
+        else:
+            self.clusters = None
+        if classes is None and len(classes_to_exclude) > 0:
+            raise ValueError(
+                "Classes to exclude are given but no classes dictionary is found, please set classes_dict_path to the path of the classes dictionary"
+            )
+        to_exclude = set()
+        if classes is not None:
            for c in classes_to_exclude:
                for key, id_arr in classes.get(c, {}).items():
                    for id, _ in id_arr:
                        to_exclude.add(id)
-            for key in list(self.clusters.keys()):
-                cluster_list = []
-                for x in self.clusters[key]:
-                    if x[0] in to_exclude:
-                        continue
-                    id = x[0].split(".")[0]
-                    chain = x[1]
-                    if id not in self.files:
-                        continue
-                    if chain not in self.files[id]:
-                        continue
-                    if len(self.files[id][chain]) == 0:
-                        continue
-                    cluster_list.append([id, chain])
-                self.clusters[key] = cluster_list
-                if len(self.clusters[key]) == 0:
-                    self.clusters.pop(key)
+        if require_antigen or require_light_chain or require_heavy_chain:
+            to_exclude.update(
+                self._exclude_by_chains(
+                    require_antigen, require_light_chain, require_heavy_chain
+                )
+            )
+        if self.clusters is not None:
+            self._exclude_ids_from_clusters(to_exclude)
            self.data = list(self.clusters.keys())
        else:
-            self.clusters = None
-            self.data = list(self.files.keys())
+            self.data = [x for x in self.files.keys() if x not in to_exclude]
        # create a smaller dataset if necessary (if we have clustering it's applied earlier)
-        if clustering_dict_path is None and use_fraction < 1:
+        if self.clusters is None and use_fraction < 1:
            self.data = sorted(self.data)[: int(len(self.data) * use_fraction)]
        if load_to_ram:
            print("Loading to RAM...")
@@ -585,6 +597,60 @@

    Module proteinflow.data.torch

            self.cdr = 0
        self.set_cdr(None)

+    def _exclude_ids_from_clusters(self, to_exclude):
+        for key in list(self.clusters.keys()):
+            cluster_list = []
+            for x in self.clusters[key]:
+                if x[0] in to_exclude:
+                    continue
+                id = x[0].split(".")[0]
+                chain = x[1]
+                if id not in self.files:
+                    continue
+                if chain not in self.files[id]:
+                    continue
+                if len(self.files[id][chain]) == 0:
+                    continue
+                cluster_list.append([id, chain])
+            self.clusters[key] = cluster_list
+            if len(self.clusters[key]) == 0:
+                self.clusters.pop(key)
+
+    def _check_chain_types(self, file):
+        chain_types = set()
+        with open(file, "rb") as f:
+            data = pickle.load(f)
+        chains = data["chain_dict"].values()
+        for chain in chains:
+            chain_mask = data["chain_encoding_all"] == chain
+            cdr = data["cdr"][chain_mask]
+            cdr_values = cdr.unique()
+            if len(cdr_values) == 1:
+                chain_types.add("antigen")
+            elif CDR_REVERSE["H1"] in cdr_values:
+                chain_types.add("heavy")
+            elif CDR_REVERSE["L1"] in cdr_values:
+                chain_types.add("light")
+        return chain_types
+
+    def _exclude_by_chains(
+        self, require_antigen, require_light_chain, require_heavy_chain
+    ):
+        """Exclude entries that are missing a required chain type (antigen, light or heavy)."""
+        to_exclude = set()
+        for id in self.files:
+            filename = list(self.files[id].values())[0][
+                0
+            ]  # assuming entry type is biounit
+            chain_types = self._check_chain_types(filename)
+            if require_antigen and "antigen" not in chain_types:
+                to_exclude.add(id)
+            if require_light_chain and "light" not in chain_types:
+                to_exclude.add(id)
+            if require_heavy_chain and "heavy" not in chain_types:
+                to_exclude.add(id)
+        return to_exclude
+
    def _get_masked_sequence(
        self,
        data,
@@ -732,6 +798,8 @@

    Module proteinflow.data.torch

    elif self.entry_type == "chain": chain_sets = [[x] for x in chains] elif self.entry_type == "pair": + if len(chains) == 1: + return [] chain_sets = list(combinations(chains, 2)) else: raise RuntimeError( @@ -1058,7 +1126,7 @@

    Classes

    class ProteinDataset -(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, debug_verbose=False) +(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type='zeros', debug_file_path=None, entry_type='biounit', classes_to_exclude=None, shuffle_clusters=True, min_cdr_length=None, feature_functions=None, classes_dict_path=None, cut_edges=False, mask_residues=True, lower_limit=15, upper_limit=100, mask_frac=None, mask_whole_chains=False, mask_sequential=False, force_binding_sites_frac=0.15, mask_all_cdrs=False, load_ligands=False, pyg_graph=False, patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, require_antigen=False, require_light_chain=False, require_heavy_chain=False)

    Dataset to load proteinflow data.

    @@ -1171,6 +1239,12 @@

    Parameters

    the size of the initial patch (used if patch_around_mask is True)
    antigen_patch_size : int, default 128
    the size of the antigen patch (used if patch_around_mask is True and the dataset is SAbDab)
    +
    require_antigen : bool, default False
    +
    if True, only entries with an antigen will be included (used if the dataset is SAbDab)
    +
    require_light_chain : bool, default False
    +
    if True, only entries with a light chain will be included (used if the dataset is SAbDab)
    +
    require_heavy_chain : bool, default False
    +
    if True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
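
For example, a SAbDab-style dataset that keeps only entries with both an antigen and a heavy chain could be built as follows (the folder path is a placeholder):

    from proteinflow.data.torch import ProteinDataset

    dataset = ProteinDataset(
        dataset_folder="./data/proteinflow_sabdab/",  # placeholder path
        entry_type="biounit",  # chain-type filtering assumes biounit entries
        require_antigen=True,
        require_heavy_chain=True,
    )
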
    @@ -1262,7 +1336,9 @@

    Parameters

    patch_around_mask=False, initial_patch_size=128, antigen_patch_size=128, - debug_verbose=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, ): """Initialize the dataset. @@ -1333,9 +1409,15 @@

    Parameters

        the size of the initial patch (used if `patch_around_mask` is `True`)
    antigen_patch_size : int, default 128
        the size of the antigen patch (used if `patch_around_mask` is `True` and the dataset is SAbDab)
+    require_antigen : bool, default False
+        if `True`, only entries with an antigen will be included (used if the dataset is SAbDab)
+    require_light_chain : bool, default False
+        if `True`, only entries with a light chain will be included (used if the dataset is SAbDab)
+    require_heavy_chain : bool, default False
+        if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab)

    """
-    self.debug = debug_verbose
+    self.debug = False

    if classes_dict_path is None:
        dataset_parent = os.path.dirname(dataset_folder)
@@ -1444,51 +1526,43 @@

    Parameters

                    self.files[id][chain].append(filename)
        if classes_to_exclude is None:
            classes_to_exclude = []
-        elif classes_dict_path is None:
-            raise ValueError(
-                "The classes_to_exclude parameter is not None, but classes_dict_path is None. Please provide a path to a pickled classes dictionary."
-            )
+        classes = None
+        if classes_dict_path is not None:
+            with open(classes_dict_path, "rb") as f:
+                classes = pickle.load(f)
        if clustering_dict_path is not None:
-            if entry_type == "pair":
-                classes_to_exclude = set(classes_to_exclude)
-                classes_to_exclude.add("single_chains")
-                classes_to_exclude = list(classes_to_exclude)
            with open(clustering_dict_path, "rb") as f:
                self.clusters = pickle.load(f)  # list of biounit ids by cluster id
-                try:  # old way of storing class information
-                    classes = pickle.load(f)
-                except EOFError:
-                    if len(classes_to_exclude) > 0:
-                        with open(classes_dict_path, "rb") as f:
-                            classes = pickle.load(f)
-            to_exclude = set()
+                if classes is None:  # old way of storing class information
+                    try:
+                        classes = pickle.load(f)
+                    except EOFError:
+                        pass
+        else:
+            self.clusters = None
+        if classes is None and len(classes_to_exclude) > 0:
+            raise ValueError(
+                "Classes to exclude are given but no classes dictionary is found, please set classes_dict_path to the path of the classes dictionary"
+            )
+        to_exclude = set()
+        if classes is not None:
            for c in classes_to_exclude:
                for key, id_arr in classes.get(c, {}).items():
                    for id, _ in id_arr:
                        to_exclude.add(id)
-            for key in list(self.clusters.keys()):
-                cluster_list = []
-                for x in self.clusters[key]:
-                    if x[0] in to_exclude:
-                        continue
-                    id = x[0].split(".")[0]
-                    chain = x[1]
-                    if id not in self.files:
-                        continue
-                    if chain not in self.files[id]:
-                        continue
-                    if len(self.files[id][chain]) == 0:
-                        continue
-                    cluster_list.append([id, chain])
-                self.clusters[key] = cluster_list
-                if len(self.clusters[key]) == 0:
-                    self.clusters.pop(key)
+        if require_antigen or require_light_chain or require_heavy_chain:
+            to_exclude.update(
+                self._exclude_by_chains(
+                    require_antigen, require_light_chain, require_heavy_chain
+                )
+            )
+        if self.clusters is not None:
+            self._exclude_ids_from_clusters(to_exclude)
            self.data = list(self.clusters.keys())
        else:
-            self.clusters = None
-            self.data = list(self.files.keys())
+            self.data = [x for x in self.files.keys() if x not in to_exclude]
        # create a smaller dataset if necessary (if we have clustering it's applied earlier)
-        if clustering_dict_path is None and use_fraction < 1:
+        if self.clusters is None and use_fraction < 1:
            self.data = sorted(self.data)[: int(len(self.data) * use_fraction)]
        if load_to_ram:
            print("Loading to RAM...")
@@ -1508,6 +1582,60 @@

    Parameters

            self.cdr = 0
        self.set_cdr(None)

+    def _exclude_ids_from_clusters(self, to_exclude):
+        for key in list(self.clusters.keys()):
+            cluster_list = []
+            for x in self.clusters[key]:
+                if x[0] in to_exclude:
+                    continue
+                id = x[0].split(".")[0]
+                chain = x[1]
+                if id not in self.files:
+                    continue
+                if chain not in self.files[id]:
+                    continue
+                if len(self.files[id][chain]) == 0:
+                    continue
+                cluster_list.append([id, chain])
+            self.clusters[key] = cluster_list
+            if len(self.clusters[key]) == 0:
+                self.clusters.pop(key)
+
+    def _check_chain_types(self, file):
+        chain_types = set()
+        with open(file, "rb") as f:
+            data = pickle.load(f)
+        chains = data["chain_dict"].values()
+        for chain in chains:
+            chain_mask = data["chain_encoding_all"] == chain
+            cdr = data["cdr"][chain_mask]
+            cdr_values = cdr.unique()
+            if len(cdr_values) == 1:
+                chain_types.add("antigen")
+            elif CDR_REVERSE["H1"] in cdr_values:
+                chain_types.add("heavy")
+            elif CDR_REVERSE["L1"] in cdr_values:
+                chain_types.add("light")
+        return chain_types
+
+    def _exclude_by_chains(
+        self, require_antigen, require_light_chain, require_heavy_chain
+    ):
+        """Exclude entries that are missing a required chain type (antigen, light or heavy)."""
+        to_exclude = set()
+        for id in self.files:
+            filename = list(self.files[id].values())[0][
+                0
+            ]  # assuming entry type is biounit
+            chain_types = self._check_chain_types(filename)
+            if require_antigen and "antigen" not in chain_types:
+                to_exclude.add(id)
+            if require_light_chain and "light" not in chain_types:
+                to_exclude.add(id)
+            if require_heavy_chain and "heavy" not in chain_types:
+                to_exclude.add(id)
+        return to_exclude
+
    def _get_masked_sequence(
        self,
        data,
@@ -1655,6 +1783,8 @@

    Parameters

    elif self.entry_type == "chain": chain_sets = [[x] for x in chains] elif self.entry_type == "pair": + if len(chains) == 1: + return [] chain_sets = list(combinations(chains, 2)) else: raise RuntimeError( @@ -2180,6 +2310,9 @@

    Parameters

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -2236,6 +2369,12 @@

    Parameters

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -2266,6 +2405,9 @@

    Parameters

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, @@ -2321,7 +2463,7 @@

    Class variables

    Static methods

    -def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, *args, **kwargs) ‑> None +def from_args(dataset_folder, features_folder='./data/tmp/', clustering_dict_path=None, max_length=None, rewrite=False, use_fraction=1, load_to_ram=False, debug=False, interpolate='none', node_features_type=None, entry_type='biounit', classes_to_exclude=None, lower_limit=15, upper_limit=100, mask_residues=True, mask_whole_chains=False, mask_frac=None, force_binding_sites_frac=0, shuffle_clusters=True, shuffle_batches=True, mask_all_cdrs=False, classes_dict_path=None, load_ligands=False, cut_edges=False, require_antigen=False, require_light_chain=False, require_heavy_chain=False, *args, **kwargs) ‑> None

    Create a ProteinLoader instance with a ProteinDataset from the given arguments.

    @@ -2376,6 +2518,12 @@

    Parameters

    if True, the ligands will be loaded from the PDB files and added to the features
    cut_edges : bool, default False
    if True, missing values at the edges of the sequence will be cut off
    +
    require_antigen : bool, default False
    +
    if True, only entries with an antigen will be included (used if the dataset is SAbDab)
    +
    require_light_chain : bool, default False
    +
    if True, only entries with a light chain will be included (used if the dataset is SAbDab)
    +
    require_heavy_chain : bool, default False
    +
    if True, only entries with a heavy chain will be included (used if the dataset is SAbDab)
    *args
    additional arguments to torch.utils.data.DataLoader
    **kwargs
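
A sketch of building a loader with the new filters (the folder path and batch size are placeholders):

    from proteinflow.data.torch import ProteinLoader

    loader = ProteinLoader.from_args(
        dataset_folder="./data/proteinflow_sabdab/",  # placeholder path
        batch_size=8,
        require_light_chain=True,  # drop entries without a light chain
    )
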
    @@ -2411,6 +2559,9 @@

    Parameters

    classes_dict_path=None, load_ligands=False, cut_edges=False, + require_antigen=False, + require_light_chain=False, + require_heavy_chain=False, *args, **kwargs, ) -> None: @@ -2467,6 +2618,12 @@

    Parameters

    if `True`, the ligands will be loaded from the PDB files and added to the features cut_edges : bool, default False if `True`, missing values at the edges of the sequence will be cut off + require_antigen : bool, default False + if `True`, only entries with an antigen will be included (used if the dataset is SAbDab) + require_light_chain : bool, default False + if `True`, only entries with a light chain will be included (used if the dataset is SAbDab) + require_heavy_chain : bool, default False + if `True`, only entries with a heavy chain will be included (used if the dataset is SAbDab) *args additional arguments to `torch.utils.data.DataLoader` **kwargs @@ -2497,6 +2654,9 @@

    Parameters

    mask_all_cdrs=mask_all_cdrs, load_ligands=load_ligands, cut_edges=cut_edges, + require_antigen=require_antigen, + require_light_chain=require_light_chain, + require_heavy_chain=require_heavy_chain, ) return ProteinLoader( dataset=dataset, diff --git a/docs/index.html b/docs/index.html index c03055b..20a8ff1 100644 --- a/docs/index.html +++ b/docs/index.html @@ -45,7 +45,7 @@

    Installation

    docker pull adaptyvbio/proteinflow
     

    By default installing proteinflow with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with pip install proteinflow[processing] or use the docker image.

    -

    Some metric functions also have separate requirements, see the documentation for details.

    +

    Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image.

    Troubleshooting

    • If you are using python 3.10 and encountering installation problems, try running python -m pip install prody==2.4.0 before installing proteinflow.
    • @@ -269,7 +269,7 @@

      ProteinFlow Stable Releases

      By default installing `proteinflow` with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with `pip install proteinflow[processing]` or use the docker image. -Some metric functions also have separate requirements, see the documentation for details. +Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image. ### Troubleshooting - If you are using python 3.10 and encountering installation problems, try running `python -m pip install prody==2.4.0` before installing `proteinflow`. diff --git a/docs/metrics/index.html b/docs/metrics/index.html index 9bf5e8c..cae2ecf 100644 --- a/docs/metrics/index.html +++ b/docs/metrics/index.html @@ -90,7 +90,7 @@

      Module proteinflow.metrics

      score += matrix[x_before][x_after] return score - +@requires_extra("blosum") def long_repeat_num(seq, thr=5): """Calculate the number of long repeats in a sequence. @@ -445,6 +445,132 @@

      Module proteinflow.metrics

      Functions

      +
      +def ablang_pll(sequence, predict_mask, ablang_model_name='heavy', average=False) +
      +
      +

      Compute pseudo log likelihood.

      +

      Note that you need to install ablang (see https://github.com/oxpig/AbLang/tree/main).

      +

      Parameters

      +
      +
      sequence : str
      +
      Chain sequence (string of amino acid codes)
      +
      predict_mask : np.ndarray
      +
      Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
      +
      ablang_model_name : {"heavy", "light"}, default "heavy"
      +
      Name of the AbLang model to use
      +
      average : bool, default False
      +
      Whether to average the pseudo log likelihood over the residues
      +
      +

      Returns

      +
      +
      pll : float
      +
      Pseudo log likelihood
      +
      +
      + +Expand source code + +
      @requires_extra("ablang")
      +def ablang_pll(
      +    sequence,
      +    predict_mask,
      +    ablang_model_name="heavy",
      +    average=False,
      +):
      +    """Compute pseudo log likelihood.
      +
      +    Note that you need to install `ablang` (see https://github.com/oxpig/AbLang/tree/main).
      +
      +    Parameters
      +    ----------
      +    sequence : str
      +        Chain sequence (string of amino acid codes)
      +    predict_mask : np.ndarray
      +        Predict mask corresponding to the sequence (array of 0 and 1 where 1 indicates a predicted residue)
      +    ablang_model_name : {"heavy", "light"}, default "heavy"
      +        Name of the AbLang model to use
      +    average : bool, default False
      +        Whether to average the pseudo log likelihood over the residues
      +
      +    Returns
      +    -------
+    pll : float
      +        Pseudo log likelihood
      +
      +    """
      +    ablang_model = ablang.pretrained(
      +        ablang_model_name
      +    )  # Use "light" if you are working with light chains
      +    ablang_model.freeze()
      +
      +    sequences = []
      +    sequence = list(sequence)
      +    predict_idx = np.where(predict_mask)[0]
      +    for i in predict_idx:
      +        sequences.append("".join(sequence[:i]) + "*" + "".join(sequence[i + 1 :]))
      +
      +    logits = ablang_model(sequences, mode="likelihood")[:, 1:]
      +    exp_logits = np.exp(logits)
      +    prob = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
      +    true_idx = [
      +        ablang_model.tokenizer.vocab_to_token[x] - 1
      +        for x in np.array(sequence)[predict_idx]
      +    ]
      +
      +    prob = prob[range(prob.shape[0]), predict_idx, true_idx]
      +    pll = np.log(prob).sum()
      +    if average:
      +        pll /= len(predict_idx)
      +    return pll
      +
      +
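
A usage sketch (the sequence fragment and mask are made up; `ablang` must be installed):

    import numpy as np
    from proteinflow.metrics import ablang_pll

    seq = "EVQLVESGGGLVQPGGSLRLSCAAS"  # toy heavy-chain fragment
    mask = np.zeros(len(seq))
    mask[5:10] = 1  # treat residues 5-9 as the predicted region
    pll = ablang_pll(seq, mask, ablang_model_name="heavy", average=True)
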
      +
      +def blosum62_score(seq_before, seq_after) +
      +
      +

      Calculate the BLOSUM62 score between two sequences.

      +

      Parameters

      +
      +
      seq_before : str
      +
      The sequence before the mutation
      +
      seq_after : str
      +
      The sequence after the mutation
      +
      +

      Returns

      +
      +
      score : int
      +
      The BLOSUM62 score between the two sequences
      +
      +
      + +Expand source code + +
      @requires_extra("blosum")
      +def blosum62_score(seq_before, seq_after):
      +    """Calculate the BLOSUM62 score between two sequences.
      +
      +    Parameters
      +    ----------
      +    seq_before : str
      +        The sequence before the mutation
      +    seq_after : str
      +        The sequence after the mutation
      +
      +    Returns
      +    -------
      +    score : int
      +        The BLOSUM62 score between the two sequences
      +
      +    """
      +    assert len(seq_before) == len(seq_after)
      +    matrix = bl.BLOSUM(62)
      +    score = 0
      +    for x_before, x_after in zip(seq_before, seq_after):
      +        score += matrix[x_before][x_after]
      +    return score
      +
      +
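
For example, with `blosum` installed (toy sequences of equal length, differing by one substitution):

    from proteinflow.metrics import blosum62_score

    score = blosum62_score("ACDEFGH", "ACDEFGY")  # scores H vs Y at the last position
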
      def ca_rmsd(coordinates1, coordinates2)
      @@ -545,6 +671,254 @@

      Returns

      return struct.b_factor.mean()
    +
    +def esm_pll(chain_sequences, predict_masks, esm_model_name='esm2_t30_150M_UR50D', esm_model_objects=None, average=False) +
    +
    +

    Compute pseudo log likelihood.

    +

    Parameters

    +
    +
    chain_sequences : list of str
    +
    List of chain sequences (strings of amino acid codes)
    +
    predict_masks : list of np.ndarray
    +
    List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
    +
    esm_model_name : str, default "esm2_t30_150M_UR50D"
    +
    Name of the ESM-2 model to use
    +
    esm_model_objects : tuple, optional
    +
    Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, esm_model_name will be ignored)
    +
    average : bool, default False
    +
    Whether to average the pseudo log likelihood over the residues
    +
    +

    Returns

    +
    +
    pll : float
    +
    Pseudo log likelihood
    +
    +
    + +Expand source code + +
    @requires_extra("esm", install_name="fair-esm")
    +def esm_pll(
    +    chain_sequences,
    +    predict_masks,
    +    esm_model_name="esm2_t30_150M_UR50D",
    +    esm_model_objects=None,
    +    average=False,
    +):
    +    """Compute pseudo log likelihood.
    +
    +    Parameters
    +    ----------
    +    chain_sequences : list of str
    +        List of chain sequences (strings of amino acid codes)
    +    predict_masks : list of np.ndarray
    +        List of predict masks corresponding to the sequences (arrays of 0 and 1 where 1 indicates a predicted residue)
    +    esm_model_name : str, default "esm2_t30_150M_UR50D"
    +        Name of the ESM-2 model to use
    +    esm_model_objects : tuple, optional
    +        Tuple of ESM-2 model, batch converter and tok_to_idx dictionary (if not None, `esm_model_name` will be ignored)
    +    average : bool, default False
    +        Whether to average the pseudo log likelihood over the residues
    +
    +    Returns
    +    -------
+    pll : float
    +        Pseudo log likelihood
    +
    +    """
    +    predict_mask = []
    +    for mask in predict_masks:
    +        predict_mask.append(mask)
    +        predict_mask.append(np.zeros(2))
    +    predict_mask = np.concatenate(predict_mask, axis=0)
    +    predict_idx = np.where(predict_mask)[0]
    +    sequence = []
    +    for i, seq in enumerate(chain_sequences):
    +        sequence += list(seq)
    +        if i != len(chain_sequences) - 1:
    +            sequence += ["<eos>", "<cls>"]
    +
    +    if esm_model_objects is None:
    +        esm_model, batch_converter, tok_to_idx = _get_esm_model(esm_model_name)
    +    else:
    +        esm_model, batch_converter, tok_to_idx = esm_model_objects
    +    pll = 0
    +    for i in predict_idx:
    +        sequence_ = "".join(sequence[:i]) + "<mask>" + "".join(sequence[i + 1 :])
    +        _, _, batch_tokens = batch_converter([(0, sequence_)])
    +        if torch.cuda.is_available():
    +            batch_tokens = batch_tokens.to("cuda")
    +        with torch.no_grad():
    +            results = esm_model(batch_tokens, repr_layers=[6], return_contacts=False)
    +        logits = results["logits"][0, i + 1].detach().cpu()
    +        tok_idx = tok_to_idx[sequence[i]]
    +        prob = F.softmax(logits[4:24], dim=-1)[tok_idx - 4]
    +        pll += torch.log(prob).item()
    +    if average:
    +        pll /= len(predict_idx)
    +    return pll
    +
    +
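
A usage sketch (the chain sequences and masks are made up; `fair-esm` must be installed):

    import numpy as np
    from proteinflow.metrics import esm_pll

    chain_sequences = ["MKTAYIAKQR", "GSHMLEDPAR"]  # toy chains
    predict_masks = [np.ones(10), np.zeros(10)]  # score only the first chain
    pll = esm_pll(chain_sequences, predict_masks, average=True)
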
    +
    +def esmfold_generate(sequences, filepaths=None) +
    +
    +

    Generate PDB structures using ESMFold.

    +

    Note that you need to install fair-esm with the esmfold option (see https://github.com/facebookresearch/esm/tree/main). +The model also requires > 16GB CPU and GPU memory.

    +

    Parameters

    +
    +
    sequences : list of str
    +
    List of sequences to be generated (chains separated with ':')
    +
    filepaths : list of str, default None
    +
    List of filepaths for the generated structures
    +
    +
    + +Expand source code + +
    @requires_extra("esm", install_name="fair-esm[esmfold]")
    +def esmfold_generate(sequences, filepaths=None):
    +    """Generate PDB structures using ESMFold.
    +
    +    Note that you need to install `fair-esm` with the `esmfold` option (see https://github.com/facebookresearch/esm/tree/main).
    +    The model also requires > 16GB CPU and GPU memory.
    +
    +    Parameters
    +    ----------
    +    sequences : list of str
    +        List of sequences to be generated (chains separated with `':'`)
    +    filepaths : list of str, default None
    +        List of filepaths for the generated structures
    +
    +    """
    +    assert filepaths is None or len(filepaths) == len(sequences)
    +    print("Loading the ESMFold model...")
    +    model = esm.pretrained.esmfold_v1()
    +    model = model.eval().cuda()
    +    print("Model loaded.")
    +    if filepaths is None:
    +        if not os.path.exists("esmfold_output"):
    +            os.mkdir("esmfold_output")
    +        filepaths = [
    +            os.path.join("esmfold_output", f"seq_{i}.pdb")
    +            for i in range(len(sequences))
    +        ]
    +    with torch.no_grad():
    +        for sequence, path in tqdm(zip(sequences, filepaths), total=len(sequences)):
    +            output = model.infer_pdb(sequence)
    +            with open(path, "w") as f:
    +                f.write(output)
    +
    +
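
For instance (the sequence is a toy stand-in; requires `fair-esm[esmfold]` and a large GPU):

    from proteinflow.metrics import esmfold_generate

    # Chains within one structure are separated with ':'
    esmfold_generate(
        ["MKTAYIAKQR:GSHMLEDPAR"],  # toy two-chain complex
        filepaths=["complex.pdb"],
    )
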
    +
    +def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False) +
    +
    +

    Generate PDB structures using IgFold.

    +

    Note that you need to install igfold (see https://github.com/Graylab/IgFold).

    +

    Parameters

    +
    +
    sequence_dicts : list of dict
    +
    List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +
    filepaths : list of str, optional
    +
    List of filepaths for the generated structures
    +
    use_openmm : bool, default False
    +
    Whether to use refinement with OpenMM
    +
    +
    + +Expand source code + +
    @requires_extra("igfold")
    +def igfold_generate(sequence_dicts, filepaths=None, use_openmm=False):
    +    """Generate PDB structures using IgFold.
    +
    +    Note that you need to install `igfold` (see https://github.com/Graylab/IgFold).
    +
    +    Parameters
    +    ----------
    +    sequence_dicts : list of dict
    +        List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +    filepaths : list of str, optional
    +        List of filepaths for the generated structures
    +    use_openmm : bool, default False
    +        Whether to use refinement with OpenMM
    +
    +    """
    +    assert filepaths is None or len(filepaths) == len(sequence_dicts)
    +    igfold = IgFoldRunner()
    +    folder = "igfold_refine_output" if use_openmm else "igfold_output"
    +    if filepaths is None:
    +        if not os.path.exists(folder):
    +            os.mkdir(folder)
    +        filepaths = [
    +            os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
    +        ]
    +    for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
    +        igfold.fold(
    +            path,  # Output PDB file
    +            sequences=seqs,  # Antibody sequences
    +            do_refine=use_openmm,  # Refine the antibody structure
    +            use_openmm=use_openmm,  # Use OpenMM for refinement
    +            do_renum=False,  # Renumber predicted antibody structure (Chothia)
    +        )
    +
    +
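
A usage sketch (the sequences are toy stand-ins, far shorter than real antibody chains; `igfold` must be installed):

    from proteinflow.metrics import igfold_generate

    igfold_generate(
        [{"H": "EVQLVESGGGLVQPGG", "L": "DIQMTQSPSSLSASVG"}],  # toy sequences
        use_openmm=True,  # refine the predicted structures with OpenMM
    )
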
    +
    +def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type='antibody') +
    +
    +

    Generate PDB structures using ImmuneBuilder.

    +

Note that you need to install ImmuneBuilder (see https://github.com/oxpig/ImmuneBuilder)

    +

    Parameters

    +
    +
    sequence_dicts : list of dict
    +
    List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +
    filepaths : list of str, optional
    +
    List of filepaths for the generated structures
    +
    protein_type : {"antibody", "nanobody", "tcr"}
    +
    Type of the structure to generate
    +
    +
    + +Expand source code + +
    @requires_extra("ImmuneBuilder")
    +def immunebuilder_generate(sequence_dicts, filepaths=None, protein_type="antibody"):
    +    """Generate PDB structures using ImmuneBuilder.
    +
+    Note that you need to install `ImmuneBuilder` (see https://github.com/oxpig/ImmuneBuilder)
    +
    +    Parameters
    +    ----------
    +    sequence_dicts : list of dict
    +        List of sequence dictionaries (keys: "H", "L" for heavy and light chains)
    +    filepaths : list of str, optional
    +        List of filepaths for the generated structures
+    protein_type : {"antibody", "nanobody", "tcr"}
    +        Type of the structure to generate
    +
    +    """
    +    predictor_classes = {
    +        "antibody": ABodyBuilder2,
    +        "nanobody": NanoBodyBuilder2,
    +        "tcr": TCRBuilder2,
    +    }
    +    predictor = predictor_classes[protein_type]()
    +    folder = "immunebuilder_output"
    +    if filepaths is None:
    +        if not os.path.exists(folder):
    +            os.mkdir(folder)
    +        filepaths = [
    +            os.path.join(folder, f"seq_{i}.pdb") for i in range(len(sequence_dicts))
    +        ]
    +    for seqs, path in tqdm(zip(sequence_dicts, filepaths), total=len(sequence_dicts)):
    +        out = predictor.predict(seqs)
    +        out.save(path)
    +
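
A usage sketch (toy sequences; `ImmuneBuilder` must be installed):

    from proteinflow.metrics import immunebuilder_generate

    immunebuilder_generate(
        [{"H": "EVQLVESGGGLVQPGG", "L": "DIQMTQSPSSLSASVG"}],  # toy sequences
        protein_type="antibody",
    )
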
    +
    def long_repeat_num(seq, thr=5)
    @@ -566,7 +940,8 @@

    Returns

    Expand source code -
    def long_repeat_num(seq, thr=5):
    +
    @requires_extra("blosum")
    +def long_repeat_num(seq, thr=5):
         """Calculate the number of long repeats in a sequence.
     
         Parameters
    @@ -594,6 +969,56 @@ 

    Returns

    return count
    +
    +def tm_score(coordinates1, coordinates2, sequence1, sequence2) +
    +
    +

    Calculate TM-score between two structures.

    +

    Parameters

    +
    +
    coordinates1 : np.ndarray
    +
    The CA coordinates array of the first structure, shaped (L, 3)
    +
    coordinates2 : np.ndarray
    +
    The CA coordinates array of the second structure, shaped (L, 3)
    +
    sequence1 : str
    +
    The sequence of the first structure
    +
    sequence2 : str
    +
    The sequence of the second structure
    +
    +

    Returns

    +
    +
    tm_score : float
    +
    The TM-score between the two structures
    +
    +
    + +Expand source code + +
    @requires_extra("tmtools")
    +def tm_score(coordinates1, coordinates2, sequence1, sequence2):
    +    """Calculate TM-score between two structures.
    +
    +    Parameters
    +    ----------
    +    coordinates1 : np.ndarray
    +        The CA coordinates array of the first structure, shaped `(L, 3)`
+    coordinates2 : np.ndarray
    +        The CA coordinates array of the second structure, shaped `(L, 3)`
    +    sequence1 : str
    +        The sequence of the first structure
    +    sequence2 : str
    +        The sequence of the second structure
    +
    +    Returns
    +    -------
    +    tm_score : float
    +        The TM-score between the two structures
    +
    +    """
    +    res = tm_align(coordinates1, coordinates2, sequence1, sequence2)
    +    return (res.tm_norm_chain1 + res.tm_norm_chain2) / 2
    +
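
A usage sketch with random stand-in coordinates (`tmtools` must be installed):

    import numpy as np
    from proteinflow.metrics import tm_score

    n = 50
    coords1 = np.random.rand(n, 3) * 10  # stand-in CA coordinates, shape (L, 3)
    coords2 = coords1 + 0.1 * np.random.rand(n, 3)  # slightly perturbed copy
    seq = "A" * n
    score = tm_score(coords1, coords2, seq, seq)
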
    +
    @@ -617,9 +1042,16 @@

    Index

  • Functions

diff --git a/proteinflow/__init__.py b/proteinflow/__init__.py
index ecf9afd..73ec1a1 100644
--- a/proteinflow/__init__.py
+++ b/proteinflow/__init__.py
@@ -31,7 +31,7 @@
 By default installing `proteinflow` with conda or pip will only load the dependencies that are required for the main functions of the package: downloading, generating and splitting datasets. If you are interested in using other functions like visualization, metrics and other data processing methods, please install the package with `pip install proteinflow[processing]` or use the docker image.

-Some metric functions also have separate requirements, see the documentation for details.
+Some metric functions also have separate requirements, see the documentation for details. All of them are installed in the docker image.

 ### Troubleshooting
 - If you are using python 3.10 and encountering installation problems, try running `python -m pip install prody==2.4.0` before installing `proteinflow`.
diff --git a/proteinflow/extra.py b/proteinflow/extra.py
index c894c8b..1d14a2b 100644
--- a/proteinflow/extra.py
+++ b/proteinflow/extra.py
@@ -6,6 +6,7 @@
     pass

 import sys
+from functools import wraps


 def requires_extra(module_name, install_name=None):
@@ -23,6 +24,7 @@ def requires_extra(module_name, install_name=None):
         install_name = module_name

     def decorator(func):
+        @wraps(func)
         def wrapper(*args, **kwargs):
             if module_name not in sys.modules:
                 raise ImportError(
diff --git a/proteinflow/metrics/__init__.py b/proteinflow/metrics/__init__.py
index 4b83e09..faa8546 100644
--- a/proteinflow/metrics/__init__.py
+++ b/proteinflow/metrics/__init__.py
@@ -61,6 +61,7 @@ def blosum62_score(seq_before, seq_after):
     return score


+@requires_extra("blosum")
 def long_repeat_num(seq, thr=5):
     """Calculate the number of long repeats in a sequence.
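
The `@wraps(func)` addition above is what fixes the docs generation: without it, pdoc documents the generic `wrapper(*args, **kwargs)` instead of the decorated function, which is how the stale `combine_multiple_frames(*args, **kwargs)` signature ended up in the HTML. A minimal illustration (the decorated function is hypothetical):

    from proteinflow.extra import requires_extra

    @requires_extra("tmtools")
    def toy_metric(x):
        """Return x unchanged (stand-in for a metric that needs tmtools)."""
        return x

    # With @wraps in place, the wrapper preserves metadata, so pdoc
    # renders the real signature and docstring:
    print(toy_metric.__name__)  # -> "toy_metric", not "wrapper"
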