Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lood/calc weights #319

Draft
wants to merge 13 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 38 additions & 50 deletions evcouplings/align/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
Authors:
Thomas A. Hopf
"""

import os
import re
from collections import namedtuple, OrderedDict, defaultdict
from copy import deepcopy

import numpy as np
from numba import jit

from evcouplings.couplings.weights import num_cluster_members_nogaps_parallel, num_cluster_members_legacy, \
num_cluster_members_nogaps_serial
from evcouplings.utils.calculations import entropy
from evcouplings.utils.helpers import DefaultOrderedDict, wrap

Expand Down Expand Up @@ -422,7 +424,7 @@ def sequences_to_matrix(sequences):

N = len(sequences)
L = len(next(iter(sequences)))
matrix = np.empty((N, L), dtype=np.str)
matrix = np.empty((N, L), dtype=str)

for i, seq in enumerate(sequences):
if len(seq) != L:
Expand Down Expand Up @@ -875,12 +877,14 @@ def __ensure_mapped_matrix(self):
self.matrix, self.alphabet_map
)

def set_weights(self, identity_threshold=0.8):
def set_weights(self, identity_threshold=0.8, weights_calc_method="nogaps", weights_file=None, num_cpus=1):
"""
Calculate weights for sequences in alignment by
clustering all sequences with sequence identity
greater or equal to the given threshold.

Can select either (nogaps (default), legacy or uniform (no weights calculated)).

.. note::

This method sets self.weights. After this method was called, methods/attributes such as
Expand All @@ -895,12 +899,40 @@ def set_weights(self, identity_threshold=0.8):
----------
identity_threshold : float, optional (default: 0.8)
Sequence identity threshold
weights_calc_method : {"nogaps", "legacy", "uniform", "custom"}, optional (default: "nogaps")
"""
self.__ensure_mapped_matrix()

self.num_cluster_members = num_cluster_members(
self.matrix_mapped, identity_threshold
)
if weights_calc_method == "nogaps":
# Note: need to set numba.set_num_threads(self.num_cpu) before calling this method (and reset it afterwards)
# and if num_cpus == 1, rather use num_cluster_members_nogaps_parallel (or set numba.set_num_threads(1) is fine)
# also, numba sets its maximum number of threads on importing, so if num_cpus > threads we should raise a warning
# https://numba.pydata.org/numba-doc/latest/user/threading-layer.html#setting-the-threading-layer
if num_cpus > 1:
self.num_cluster_members = num_cluster_members_nogaps_parallel(
self.matrix_mapped, identity_threshold
)
else:
self.num_cluster_members = num_cluster_members_nogaps_serial(
self.matrix_mapped, identity_threshold
)
elif weights_calc_method == "legacy":
self.num_cluster_members = num_cluster_members_legacy(
self.matrix_mapped, identity_threshold
)
elif weights_calc_method == "uniform":
self.num_cluster_members = np.ones(self.N)
elif weights_calc_method == "custom":
if weights_file is None:
raise ValueError("weights_file must be provided when weights_calc_method is 'custom'")
if not os.path.isfile(weights_file):
raise ValueError("Sequence weights file does not exist: {}".format(weights_file))

print("Loading sequence weights from file:", weights_file)
self.weights = np.load(file=weights_file)
else:
raise ValueError("Invalid weights_calc_method: {}. Must be one of {}".format(weights_calc_method, ["nogaps", "legacy", "uniform", "custom"]))

self.weights = 1.0 / self.num_cluster_members

# reset frequencies, since these were based on
Expand Down Expand Up @@ -1166,47 +1198,3 @@ def identities_to_seq(seq, matrix):
identities[i] = id_i

return identities


@jit(nopython=True)
def num_cluster_members(matrix, identity_threshold):
"""
Calculate number of sequences in alignment
within given identity_threshold of each other

Parameters
----------
matrix : np.array
N x L matrix containing N sequences of length L.
Matrix must be mapped to range(0, num_symbols) using
map_matrix function
identity_threshold : float
Sequences with at least this pairwise identity will be
grouped in the same cluster.

Returns
-------
np.array
Vector of length N containing number of cluster
members for each sequence (inverse of sequence
weight)
"""
N, L = matrix.shape
L = 1.0 * L

# minimal cluster size is 1 (self)
num_neighbors = np.ones((N))

# compare all pairs of sequences
for i in range(N - 1):
for j in range(i + 1, N):
pair_id = 0
for k in range(L):
if matrix[i, k] == matrix[j, k]:
pair_id += 1

if pair_id / L >= identity_threshold:
num_neighbors[i] += 1
num_neighbors[j] += 1

return num_neighbors
5 changes: 3 additions & 2 deletions evcouplings/align/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

"""

from collections import OrderedDict, Iterable
from collections import OrderedDict
from collections.abc import Iterable
import re
from shutil import copy
import os
Expand Down Expand Up @@ -909,7 +910,7 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, *
min_cov /= 100

keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
ali = ali.select(sequences=keep_seqs)
ali = ali.select(sequences=keep_seqs) # TODO this will affect weights output length (make sure to pass to save_weights)

# Calculate frequencies, conservation and identity to query
# on final alignment (except for lowercase modification)
Expand Down
3 changes: 2 additions & 1 deletion evcouplings/compare/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
Thomas A. Hopf
"""

from collections import OrderedDict, Iterable
from collections import OrderedDict
from collections.abc import Iterable
from os import path
from urllib.error import HTTPError

Expand Down
2 changes: 1 addition & 1 deletion evcouplings/couplings/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
Anna G. Green (MultiSegmentCouplingsModel)
"""

from collections import Iterable
from collections.abc import Iterable
from copy import deepcopy
from evcouplings.couplings.model import CouplingsModel
import pandas as pd
Expand Down
2 changes: 1 addition & 1 deletion evcouplings/couplings/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Authors:
Thomas A. Hopf
"""
from collections import Iterable
from collections.abc import Iterable
from copy import deepcopy

from numba import jit
Expand Down
22 changes: 19 additions & 3 deletions evcouplings/couplings/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
)


# TODO(Lood): Extra config needs to be passed through here:
# sequence_weight_strategy, weights_file (if strategy=custom), early_stopping (for just Meff calculation)
def infer_plmc(**kwargs):
"""
Run EC computation on alignment. This function contains
Expand Down Expand Up @@ -178,12 +180,12 @@ def infer_plmc(**kwargs):
# finally, scale lambda_J
lambda_J *= (num_symbols - 1) * (L - 1)

# run plmc... or reuse pre-exisiting results from previous run
# run plmc... or reuse pre-existing results from previous run
plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

# determine if to rerun, only possible if previous results
# were stored in ali_outcfg_file
if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file): # TODO(Lood): assume this means we can skip weights calc
plmc_result = read_config_file(plm_outcfg_file)

# check if the EC/parameter files are there
Expand All @@ -199,6 +201,9 @@ def infer_plmc(**kwargs):
)

else:
# TODO Load alignment and calculate weights
# need: alignment file, sequence_weight_strategy, cpus (optional, for numba), theta, alphabet? (for invalid characters)

# run plmc binary
plmc_result = ct.run_plmc(
kwargs["alignment_file"],
Expand All @@ -214,7 +219,7 @@ def infer_plmc(**kwargs):
lambda_J=lambda_J,
lambda_g=kwargs["lambda_group"],
cpu=kwargs["cpu"],
binary=kwargs["plmc"],
binary=kwargs["plmc"], # TODO pass in calculated weights here
)

# save iteration table to file
Expand Down Expand Up @@ -696,6 +701,17 @@ def mean_field(**kwargs):
format="fasta"
)

# TODO hoist calc weights out of dca.fit and into here
# - we could even consider calculating weights in run (top level), but for now we can duplicate it
# then assign weights to the input_alignment object

# input_alignment.set_weights(
# identity_threshold=kwargs['theta'],
# weights_calc_method=kwargs['weights_calc_method'],
# weights_file=kwargs['weights_file'], # Optional unless weights_calc_method is 'custom'
# num_cpus=kwargs['cpus'], # Optional
# )

# init mean field direct coupling analysis
mf_dca = MeanFieldDCA(input_alignment)

Expand Down
Loading