-
Notifications
You must be signed in to change notification settings - Fork 75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added mean field protocol for complexes #223
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -480,6 +480,234 @@ def complex(**kwargs): | |
# used for the EC table above | ||
|
||
return outcfg | ||
|
||
def complex_mean_field(**kwargs): | ||
""" | ||
Protocol: | ||
|
||
Infer ECs for protein complexes from alignment using mean field. | ||
Allows user to select scoring protocol. | ||
|
||
For now, mean field DCA can only be run in focus mode, gaps | ||
included. | ||
|
||
Parameters | ||
---------- | ||
Mandatory kwargs arguments: | ||
See list below in code where calling check_required. | ||
|
||
Returns | ||
------- | ||
outcfg : dict | ||
Output configuration of the pipeline, including | ||
the following fields: | ||
|
||
raw_ec_file | ||
model_file | ||
num_sites | ||
num_sequences | ||
effective_sequences | ||
|
||
focus_mode (passed through) | ||
focus_sequence (passed through) | ||
segments (passed through) | ||
""" | ||
# for additional required parameters, see mean field info | ||
check_required( | ||
kwargs, | ||
[ | ||
"prefix", "min_sequence_distance", | ||
"scoring_model", "use_all_ecs_for_scoring", | ||
"alignment_file","segments","focus_mode", | ||
"focus_sequence","theta","pseudo_count", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. spaces |
||
"alphabet" | ||
] | ||
) | ||
|
||
if not kwargs["focus_mode"]: | ||
raise InvalidParameterError( | ||
"For now, mean field DCA can only be run in focus mode." | ||
) | ||
|
||
|
||
prefix = kwargs["prefix"] | ||
|
||
# infer ECs and load them with plmc | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete these obsolete lines |
||
#outcfg, ecs, segments = infer_plmc(**kwargs) | ||
#model = CouplingsModel(outcfg["model_file"]) | ||
|
||
#infer ECs with mean field/DCA | ||
|
||
# Below here is added material from mean_field protocol: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete commented line |
||
model = prefix+".model" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prefix + ".model" |
||
|
||
outcfg = { | ||
"model_file": model, | ||
"raw_ec_file": prefix + "_ECs.txt", | ||
"ec_file": prefix + "_CouplingScores.csv", | ||
"focus_mode": kwargs["focus_mode"], | ||
"focus_sequence": kwargs["focus_sequence"], | ||
"segments": kwargs["segments"] | ||
} | ||
|
||
# make sure input alignment exists | ||
alignment_file = kwargs["alignment_file"] | ||
verify_resources( | ||
"Input alignment does not exist", | ||
kwargs["alignment_file"] | ||
) | ||
|
||
# make sure output directory exists | ||
create_prefix_folders(prefix) | ||
segments = kwargs["segments"] | ||
if segments is not None: | ||
segments = [ | ||
mapping.Segment.from_list(s) for s in segments | ||
] | ||
|
||
# determine alphabet, default is protein | ||
if kwargs["alphabet"] is None: | ||
alphabet = ALPHABET_PROTEIN | ||
else: | ||
alphabet = kwargs["alphabet"] | ||
|
||
# allow shortcuts for protein, DNA, RNA | ||
if alphabet in ALPHABET_MAP: | ||
alphabet = ALPHABET_MAP[alphabet] | ||
|
||
# read in a2m alignment | ||
with open(alignment_file) as f: | ||
input_alignment = Alignment.from_file( | ||
f, alphabet=alphabet, | ||
format="fasta" | ||
) | ||
|
||
# init mean field DCA | ||
mf_dca = MeanFieldDCA(input_alignment) | ||
|
||
# run mean field approximation | ||
model = mf_dca.fit( | ||
theta=kwargs["theta"], | ||
pseudo_count=kwargs["pseudo_count"] | ||
) | ||
|
||
# write ECs to file | ||
model.to_raw_ec_file( | ||
outcfg["raw_ec_file"] | ||
) | ||
|
||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. limit to one whitespace line |
||
# store useful information about model in outcfg | ||
outcfg.update({ | ||
"num_sites": model.L, | ||
"num_valid_sequences": model.N_valid, | ||
"effective_sequences": float(round(model.N_eff, 1)), | ||
#"region_start": int(model.index_list[0]), | ||
}) | ||
|
||
ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"]) | ||
|
||
if segments is not None: | ||
# create index mapping | ||
seg_mapper = mapping.SegmentIndexMapper( | ||
# kwargs["focus_mode"], outcfg["region_start"], *segments | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete commented line |
||
kwargs["focus_mode"], kwargs["region_start"], *segments | ||
) | ||
|
||
# apply to EC table | ||
ecs = mapping.segment_map_ecs(ecs,seg_mapper) | ||
|
||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. limit to one whitespace line |
||
# Add dummy probability value for downstream code | ||
#ecs.loc[:,"probability"] = np.nan | ||
|
||
# write model file | ||
#if outcfg["model_file"] is not None: | ||
# model.to_file( | ||
# outcfg["model_file"], | ||
# file_format="plmc_v2" | ||
# ) | ||
|
||
###### End added material | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. delete commented lines |
||
|
||
## Add in segment_i, segment_j, and write ec file | ||
|
||
# following computations are mostly specific to complex pipeline | ||
|
||
# add mixture model probability | ||
if kwargs["scoring_model"] in SCORING_MODELS: | ||
if kwargs["use_all_ecs_for_scoring"] is not None: | ||
use_all_ecs = kwargs["use_all_ecs_for_scoring"] | ||
else: | ||
use_all_ecs = False | ||
|
||
ecs = complex_probability( | ||
ecs, kwargs["scoring_model"], use_all_ecs | ||
) | ||
|
||
else: | ||
raise InvalidParameterError( | ||
"Invalid scoring_model parameter: " + | ||
"{}. Valid options are: {}".format( | ||
kwargs["protocol"], ", ".join(SCORING_MODELS) | ||
) | ||
) | ||
|
||
# also create line-drawing script (for multiple chains) | ||
# by convention, we map first segment to chain A, | ||
# second to B, a.s.f. | ||
chain_mapping = dict( | ||
zip( | ||
[s.segment_id for s in segments], | ||
string.ascii_uppercase, | ||
) | ||
) | ||
|
||
is_single_segment = segments is None or len(segments) == 1 | ||
outcfg = { | ||
**outcfg, | ||
**_postprocess_inference( | ||
ecs, kwargs, model, outcfg, prefix, | ||
generate_line_plot=is_single_segment, | ||
generate_enrichment=is_single_segment, | ||
ec_filter="segment_i != segment_j or abs(i - j) >= {}", | ||
chain=chain_mapping | ||
) | ||
} | ||
|
||
# save just the inter protein ECs | ||
## TODO: eventually have this accomplished by _postprocess_inference | ||
## right now avoiding a second call with a different ec_filter | ||
ecs = pd.read_csv(outcfg["ec_file"]) | ||
outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv" | ||
inter_ecs = ecs.query("segment_i != segment_j") | ||
inter_ecs.to_csv(outcfg["inter_ec_file"], index=False) | ||
|
||
# dump output config to YAML file for debugging/logging | ||
write_config_file(prefix + ".couplings_complex.outcfg", outcfg) | ||
|
||
# TODO: make the following complex-ready | ||
# EC enrichment: | ||
# | ||
# 1) think about making EC enrichment complex-ready and add | ||
# it back here - so far it only makes sense if all ECs are | ||
# on one segment | ||
# | ||
# EVzoom: | ||
# | ||
# 1) at the moment, EVzoom will use numbering before remapping | ||
# we should eventually get this to a point where segments + residue | ||
# index are displayed on EVzoom | ||
# | ||
# 2) note that this will currently use the default mixture model | ||
# selection for determining the EC cutoff, rather than the selection | ||
# used for the EC table above | ||
|
||
return outcfg | ||
|
||
|
||
|
||
|
||
def mean_field(**kwargs): | ||
|
@@ -755,6 +983,9 @@ def _postprocess_inference(ecs, kwargs, model, outcfg, prefix, generate_line_plo | |
|
||
# inference protocol using mean field approximation | ||
"mean_field": mean_field, | ||
|
||
# runs DCA/meanfield for complexes | ||
"complex_mean_field": complex_mean_field | ||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
put spaces after commas in list