Skip to content

Commit

Permalink
Version 1.0.1: exons classified as Deleted or Missing are excluded fr…
Browse files Browse the repository at this point in the history
…om codon.fasta
  • Loading branch information
kirilenkobm committed Sep 8, 2022
1 parent df502d3 commit 61ae39f
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 6 deletions.
85 changes: 83 additions & 2 deletions CESAR_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -1770,13 +1770,62 @@ def check_codon(codon):
return codon


def extract_codon_data(codon_table):
def extract_codon_data(codon_table, excl_exons=None):
"""Get codon alignment."""
t_codons = []
q_codons = []
_excl_exons = excl_exons if excl_exons is not None else set()
prev_exon_was_del = None

for codon in codon_table:
ref_codon = codon["ref_codon"]
que_codon = codon["que_codon"]
# needed if we filter D/M exons out
t_exon_num = codon["t_exon_num"]

# check whether we need to delete the codon or not
this_exon_to_del = t_exon_num in _excl_exons
prev_exon_to_del_this_codon_split = prev_exon_was_del and codon["split_"] > 0

if this_exon_to_del or prev_exon_to_del_this_codon_split:
# this codon to be deleted in query
if len(ref_codon) == 3:
ref_codon = check_codon(ref_codon)
t_codons.append(ref_codon)
q_codons.append(GAP_CODON)
elif len(que_codon) % 3 == 0:
# Frame-preserving ins
ref_subcodons = [check_codon(x) for x in parts(ref_codon, 3)]
que_subcodons = [GAP_CODON for x in range(len(ref_subcodons))]
t_codons.extend(ref_subcodons)
q_codons.extend(que_subcodons)
else:
# something strange
ref_int = check_codon(ref_codon[-3:])
t_codons.extend(ref_int)
q_codons.extend(GAP_CODON)

if this_exon_to_del:
prev_exon_was_del = True
else:
# meaning, prev_exon_to_del_this_codon_split is True
prev_exon_was_del = False
continue

if t_exon_num in _excl_exons:
prev_exon_was_del = True
if len(ref_codon) == 3:
ref_codon = check_codon(ref_codon)
continue

elif prev_exon_was_del and codon["split_"] > 0:
# split codon from prev exon
prev_exon_was_del = False
continue
# this codon is added for sure
prev_exon_was_del = False

# ordinary branch
if len(ref_codon) == len(que_codon) == 3:
# the simplest case
ref_codon = check_codon(ref_codon)
Expand Down Expand Up @@ -1817,6 +1866,7 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
)
exon_num_corr = {} # query exon num -> target exon nums
aa_sat_seq = {} # exon num -> no dels
chain_id_to_codon_table = {} # chain id -> raw codon table

for part in raw_data:
# parse output
Expand All @@ -1826,9 +1876,11 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
verbose(f"Processing query ID {chain_id}")

codons_data = parse_cesar_out(target_raw, query_raw, v=VERBOSE)
chain_id_to_codon_table[chain_id] = codons_data
# extract protein sequences also here, just to do it in one place
part_pIDs, part_blosums, prot_seqs_part = compute_score(codons_data)
prot_seqs[chain_id] = prot_seqs_part
# convert codon list to {"ref": [t_codons], "que": [q_codons]}
codon_seqs_part = extract_codon_data(codons_data)
codon_seqs[chain_id] = codon_seqs_part
(
Expand Down Expand Up @@ -1883,6 +1935,7 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
prot_seqs,
codon_seqs,
aa_sat_seq,
chain_id_to_codon_table,
)
return ret

Expand All @@ -1900,8 +1953,10 @@ def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts)
cesar_raw_lines = cesar_raw_out.split("\n")
target_seq_raw = cesar_raw_lines[1]
query_seq_raw = cesar_raw_lines[3]
chain_id_to_codon_table = {}

codons_data = parse_cesar_out(target_seq_raw, query_seq_raw, v=VERBOSE)
chain_id_to_codon_table[FRAGMENT] = codons_data
# extract protein sequences also here, just to do it in one place
part_pIDs, part_blosums, prot_seqs_part = compute_score(codons_data)
prot_seqs[FRAGMENT] = prot_seqs_part
Expand Down Expand Up @@ -1993,6 +2048,7 @@ def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts)
prot_seqs,
codon_seqs,
aa_sat_seq,
chain_id_to_codon_table
)
return ret

Expand Down Expand Up @@ -2312,6 +2368,17 @@ def read_predefined_regions(arg):
return ret


def redo_codon_sequences(codon_tables, del_mis_exons):
"""Rebuild codon alignments: now excluding deleted and missing exons."""
ret = {}
for chain_id, codon_table in codon_tables.items():
excl_exons = del_mis_exons.get(str(chain_id), set())
codon_seqs_upd = extract_codon_data(codon_table, excl_exons=excl_exons)
ret[chain_id] = codon_seqs_upd
return ret



def realign_exons(args):
"""Entry point."""
memlim = float(args["memlim"]) if args["memlim"] != "Auto" else None
Expand Down Expand Up @@ -2617,6 +2684,11 @@ def realign_exons(args):
prot_s = proc_out[6] # protein sequences in query
codon_s = proc_out[7]
aa_cesar_sat = proc_out[8] # says whether an exon has outstanding quality
# raw codon table \ superset of "codon_s" basically
# after changes in TOGA1.1 is needed again
# TODO: needs refactoring
codon_tables = proc_out[9]

aa_eq_len = aa_eq_len_check(exon_sequences, query_exon_sequences)

if chains:
Expand Down Expand Up @@ -2654,7 +2726,7 @@ def realign_exons(args):
)
verbose(f"Chain to exon to properties = {chain_to_exon_to_properties}")
if args["check_loss"]: # call inact mutations scanner,
loss_report = inact_mut_check(
loss_report, del_mis_exons = inact_mut_check(
cesar_raw_out,
v=VERBOSE,
gene=args["gene"],
Expand All @@ -2665,6 +2737,15 @@ def realign_exons(args):
)
else: # do not call inact mut scanner
loss_report = None
del_mis_exons = None

# del_mis_exons contains:
# chain_id(string) = [0-based exon nums]
if del_mis_exons is not None and len(del_mis_exons.keys()) > 0:
# if exists, need to filter codon alignment accordingly
# chain id is numeric in "codon_table"
codon_s = redo_codon_sequences(codon_tables, del_mis_exons)


# save protein/codon ali and text output
save_prot(args["gene"], prot_s, args["prot_out"])
Expand Down
16 changes: 12 additions & 4 deletions modules/inact_mut_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,8 @@ def classify_exons(
exon_stat = [
"X",
] # exon status, start with 1, X - placeholder
del_miss_nums = []

for exon_num in exon_nums:
# 0-based to 1-based
ex_num_ = exon_num + 1
Expand All @@ -721,6 +723,7 @@ def classify_exons(
miss_num += 1
exons_report.append(mut)
exon_stat.append("M")
del_miss_nums.append(exon_num)
continue
# parse data from CESAR wrapper output
ex_class = exon_class.get(exon_num, None) # exon class
Expand Down Expand Up @@ -750,6 +753,7 @@ def classify_exons(
# add to mut list, add new exon status
exons_report.append(mut)
exon_stat.append("M")
del_miss_nums.append(exon_num)
continue
elif del_ is False:
# exon is deleted: need to write about this
Expand All @@ -767,13 +771,14 @@ def classify_exons(
# add to mut list, append new exon status
exons_report.append(mut)
exon_stat.append("D")
del_miss_nums.append(exon_num)
continue
else:
# something else -> exon is not deleted
exon_stat.append("I") # add I status to exon
pass
# return list of mutation objects + list of exon statuses
return exons_report, exon_stat
return exons_report, exon_stat, del_miss_nums


def muts_to_text(
Expand Down Expand Up @@ -1305,6 +1310,7 @@ def inact_mut_check(
out_of_b_vals = {}
mutations = []
ex_prop_provided = ex_prop is not None
del_miss_exons = {}

for cesar_fraction in cesar_fractions:
# analyze cesar fractions one-by-one
Expand Down Expand Up @@ -1363,7 +1369,7 @@ def inact_mut_check(

# next loop -> for deleted/missed exons
if ex_prop: # if extra data provided by CESAR wrapper we can classify exons
exon_del_miss_, exon_stat_ = classify_exons(
exon_del_miss_, exon_stat_, dm_list = classify_exons(
gene,
q_name,
codon_table,
Expand All @@ -1386,12 +1392,14 @@ def inact_mut_check(
# we don't have extra exons data
# will just skip this part
exon_stat = None
dm_list = []
pass

# big indels may be classified as inactivating mutations
# but the bigger the exon: the bigger an indel should be
# define the thresholds
big_indel_thrs = infer_big_indel_thresholds(ex_lens)
del_miss_exons[q_name] = set(dm_list)

# scan reading frame (codon table) for the rest on inact mutations
inact_muts = scan_rf(
Expand Down Expand Up @@ -1450,7 +1458,7 @@ def inact_mut_check(
middle_80_present,
gene,
)
return report
return report, del_miss_exons


def main():
Expand All @@ -1460,7 +1468,7 @@ def main():
# inact_mut_check() accepts a string containing raw CESAR output
cesar_line = f.read()
f.close()
report = inact_mut_check(
report, _ = inact_mut_check(
cesar_line, u12_introns=args.u12, gene=args.gene, v=args.verbose
)
print(report)
Expand Down
2 changes: 2 additions & 0 deletions toga.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def __init__(self, args):
self.__check_nf_config()
# to avoid crash on filesystem without locks:
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # otherwise it could crash
# temporary fix for DSL error in recent NF versions
os.environ["NXF_DEFAULT_DSL"] = "1"

chain_basename = os.path.basename(args.chain_input)

Expand Down

0 comments on commit 61ae39f

Please sign in to comment.