debbiemarkslab · thomashopf · Jun 9, 2022 · Jun 9, 2022 · Jun 9, 2022 · Jun 10, 2022
diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
@@ -5,14 +5,16 @@
 Authors:
   Thomas A. Hopf
 """
-
+import os
 import re
 from collections import namedtuple, OrderedDict, defaultdict
 from copy import deepcopy
 
 import numpy as np
 from numba import jit
 
+from evcouplings.couplings.weights import num_cluster_members_nogaps_parallel, num_cluster_members_legacy, \
+    num_cluster_members_nogaps_serial
 from evcouplings.utils.calculations import entropy
 from evcouplings.utils.helpers import DefaultOrderedDict, wrap
 
@@ -422,7 +424,7 @@ def sequences_to_matrix(sequences):
 
     N = len(sequences)
     L = len(next(iter(sequences)))
-    matrix = np.empty((N, L), dtype=np.str)
+    matrix = np.empty((N, L), dtype=str)
 
     for i, seq in enumerate(sequences):
         if len(seq) != L:
@@ -875,12 +877,14 @@ def __ensure_mapped_matrix(self):
                 self.matrix, self.alphabet_map
             )
 
-    def set_weights(self, identity_threshold=0.8):
+    def set_weights(self, identity_threshold=0.8, weights_calc_method="nogaps", weights_file=None, num_cpus=1):
         """
         Calculate weights for sequences in alignment by
         clustering all sequences with sequence identity
         greater or equal to the given threshold.
 
+        Can select either (nogaps (default), legacy or uniform (no weights calculated)).
+
         .. note::
 
             This method sets self.weights. After this method was called, methods/attributes such as
@@ -895,12 +899,40 @@ def set_weights(self, identity_threshold=0.8):
         ----------
         identity_threshold : float, optional (default: 0.8)
             Sequence identity threshold
+        weights_calc_method : {"nogaps", "legacy", "uniform", "custom"}, optional (default: "nogaps")
         """
         self.__ensure_mapped_matrix()
 
-        self.num_cluster_members = num_cluster_members(
-            self.matrix_mapped, identity_threshold
-        )
+        if weights_calc_method == "nogaps":
+            # Note: need to set numba.set_num_threads(self.num_cpu) before calling this method (and reset it afterwards)
+            #   and if num_cpus == 1, rather use num_cluster_members_nogaps_parallel (or set numba.set_num_threads(1) is fine)
+            #   also, numba sets its maximum number of threads on importing, so if num_cpus > threads we should raise a warning
+            #   https://numba.pydata.org/numba-doc/latest/user/threading-layer.html#setting-the-threading-layer
+            if num_cpus > 1:
+                self.num_cluster_members = num_cluster_members_nogaps_parallel(
+                    self.matrix_mapped, identity_threshold
+                )
+            else:
+                self.num_cluster_members = num_cluster_members_nogaps_serial(
+                    self.matrix_mapped, identity_threshold
+                )
+        elif weights_calc_method == "legacy":
+            self.num_cluster_members = num_cluster_members_legacy(
+                self.matrix_mapped, identity_threshold
+            )
+        elif weights_calc_method == "uniform":
+            self.num_cluster_members = np.ones(self.N)
+        elif weights_calc_method == "custom":
+            if weights_file is None:
+                raise ValueError("weights_file must be provided when weights_calc_method is 'custom'")
+            if not os.path.isfile(weights_file):
+                raise ValueError("Sequence weights file does not exist: {}".format(weights_file))
+
+            print("Loading sequence weights from file:", weights_file)
+            self.weights = np.load(file=weights_file)
+        else:
+            raise ValueError("Invalid weights_calc_method: {}. Must be one of {}".format(weights_calc_method, ["nogaps", "legacy", "uniform", "custom"]))
+
         self.weights = 1.0 / self.num_cluster_members
 
         # reset frequencies, since these were based on
@@ -1166,47 +1198,3 @@ def identities_to_seq(seq, matrix):
         identities[i] = id_i
 
     return identities
-
-
-@jit(nopython=True)
-def num_cluster_members(matrix, identity_threshold):
-    """
-    Calculate number of sequences in alignment
-    within given identity_threshold of each other
-
-    Parameters
-    ----------
-    matrix : np.array
-        N x L matrix containing N sequences of length L.
-        Matrix must be mapped to range(0, num_symbols) using
-        map_matrix function
-    identity_threshold : float
-        Sequences with at least this pairwise identity will be
-        grouped in the same cluster.
-
-    Returns
-    -------
-    np.array
-        Vector of length N containing number of cluster
-        members for each sequence (inverse of sequence
-        weight)
-    """
-    N, L = matrix.shape
-    L = 1.0 * L
-
-    # minimal cluster size is 1 (self)
-    num_neighbors = np.ones((N))
-
-    # compare all pairs of sequences
-    for i in range(N - 1):
-        for j in range(i + 1, N):
-            pair_id = 0
-            for k in range(L):
-                if matrix[i, k] == matrix[j, k]:
-                    pair_id += 1
-
-            if pair_id / L >= identity_threshold:
-                num_neighbors[i] += 1
-                num_neighbors[j] += 1
-
-    return num_neighbors
diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
@@ -8,7 +8,8 @@
 
 """
 
-from collections import OrderedDict, Iterable
+from collections import OrderedDict
+from collections.abc import Iterable
 import re
 from shutil import copy
 import os
@@ -909,7 +910,7 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, *
             min_cov /= 100
 
         keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
-        ali = ali.select(sequences=keep_seqs)
+        ali = ali.select(sequences=keep_seqs)  # TODO this will affect weights output length (make sure to pass to save_weights)
 
     # Calculate frequencies, conservation and identity to query
     # on final alignment (except for lowercase modification)

diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
@@ -5,7 +5,8 @@
   Thomas A. Hopf
 """
 
-from collections import OrderedDict, Iterable
+from collections import OrderedDict
+from collections.abc import Iterable
 from os import path
 from urllib.error import HTTPError
 

diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py
@@ -7,7 +7,7 @@
   Anna G. Green (MultiSegmentCouplingsModel)
 """
 
-from collections import Iterable
+from collections.abc import Iterable
 from copy import deepcopy
 from evcouplings.couplings.model import CouplingsModel
 import pandas as pd

diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py
@@ -6,7 +6,7 @@
 Authors:
   Thomas A. Hopf
 """
-from collections import Iterable
+from collections.abc import Iterable
 from copy import deepcopy
 
 from numba import jit

diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py
@@ -53,6 +53,8 @@
 )
 
 
+# TODO(Lood): Extra config needs to be passed through here:
+#  sequence_weight_strategy, weights_file (if strategy=custom), early_stopping (for just Meff calculation)
 def infer_plmc(**kwargs):
     """
     Run EC computation on alignment. This function contains
@@ -178,12 +180,12 @@ def infer_plmc(**kwargs):
         # finally, scale lambda_J
         lambda_J *= (num_symbols - 1) * (L - 1)
 
-    # run plmc... or reuse pre-exisiting results from previous run
+    # run plmc... or reuse pre-existing results from previous run
     plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"
 
     # determine if to rerun, only possible if previous results
     # were stored in ali_outcfg_file
-    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
+    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):  # TODO(Lood): assume this means we can skip weights calc
         plmc_result = read_config_file(plm_outcfg_file)
 
         # check if the EC/parameter files are there
@@ -199,6 +201,9 @@ def infer_plmc(**kwargs):
         )
 
     else:
+        # TODO Load alignment and calculate weights
+        #  need: alignment file, sequence_weight_strategy, cpus (optional, for numba), theta, alphabet? (for invalid characters)
+
         # run plmc binary
         plmc_result = ct.run_plmc(
             kwargs["alignment_file"],
@@ -214,7 +219,7 @@ def infer_plmc(**kwargs):
             lambda_J=lambda_J,
             lambda_g=kwargs["lambda_group"],
             cpu=kwargs["cpu"],
-            binary=kwargs["plmc"],
+            binary=kwargs["plmc"],  # TODO pass in calculated weights here
         )
 
         # save iteration table to file
@@ -696,6 +701,17 @@ def mean_field(**kwargs):
             format="fasta"
         )
 
+    # TODO hoist calc weights out of dca.fit and into here
+    #  - we could even consider calculating weights in run (top level), but for now we can duplicate it
+    #  then assign weights to the input_alignment object
+
+    # input_alignment.set_weights(
+    #     identity_threshold=kwargs['theta'],
+    #     weights_calc_method=kwargs['weights_calc_method'],
+    #     weights_file=kwargs['weights_file'],   # Optional unless weights_calc_method is 'custom'
+    #     num_cpus=kwargs['cpus'],    # Optional
+    # )
+
     # init mean field direct coupling analysis
     mf_dca = MeanFieldDCA(input_alignment)