diff --git a/CESAR_wrapper.py b/CESAR_wrapper.py
index d16c058..90fc6ed 100755
--- a/CESAR_wrapper.py
+++ b/CESAR_wrapper.py
@@ -60,6 +60,8 @@
EXP_REG_EXTRA_FLANK = 50
EXON_SEQ_FLANK = 10
+ERR_CODE_FRAGM_ERR = 2
+
# alias; works for Hillerlab-only
two_bit_templ = "/projects/hillerlab/genome/gbdb-HL/{0}/{0}.2bit"
chain_alias_template = "/projects/hillerlab/genome/gbdb-HL/{0}/lastz/vs_{1}/axtChain/{0}.{1}.allfilled.chain.gz"
@@ -82,6 +84,8 @@
LOCATION, "modules", "extract_subchain_slib.so"
)
DEFAULT_CESAR = os.path.join(LOCATION, "CESAR2.0", "cesar")
+OPT_CESAR_LOCATION = os.path.join(LOCATION, "cesar_input_optimiser.py")
+
ex_lib = ctypes.CDLL(extract_subchain_lib_path)
ex_lib.extract_subchain.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]
ex_lib.extract_subchain.restype = ctypes.POINTER(ctypes.c_char_p)
@@ -176,6 +180,8 @@
EQ_ACC_PROFILE = os.path.join(LOCATION, "supply", "eq_acc_profile.txt")
EQ_DO_PROFILE = os.path.join(LOCATION, "supply", "eq_donor_profile.txt")
+ORTH_LOC_LINE_SUFFIX = "#ORTHLOC"
+
FRAGMENT = -1
@@ -359,7 +365,8 @@ def parse_args():
app.add_argument(
"--exon_flanks_file",
default=None,
- help="Temporary file containing exon flanks ",
+ help=("Temporary file containing exon flanks; do use only if you run "
+ "the optimized CESAR version"),
)
app.add_argument(
"--predefined_regions",
@@ -369,7 +376,44 @@ def parse_args():
"for CESAR optimisation. If set: call optimised CESAR without LASTZ",
)
app.add_argument(
- "--opt_precompute", action="store_true", dest="opt_precompute", help="later"
+ "--opt_precompute",
+ action="store_true",
+ dest="opt_precompute",
+ help="Do only precompute memory consumption for the optimized CESAR version"
+ )
+ app.add_argument(
+ "--save_orth_locus",
+ action="store_true",
+ dest="save_orth_locus",
+ help="Do only precompute orth regions for the optimized CESAR version"
+ )
+ app.add_argument(
+ "--precomputed_orth_loci",
+ default=None,
+ help="Path to loci saved with --save_orth_locus"
+ )
+ app.add_argument(
+ "--do_not_check_exon_chain_intersection",
+ default=False,
+ action="store_true",
+ dest="do_not_check_exon_chain_intersection",
+ help="Do not extract chain blocks (not recommended)"
+ )
+ app.add_argument(
+ "--alt_frame_del",
+ "--lfd",
+ default=False,
+ action="store_true",
+ dest="alt_frame_del",
+ help="Consider codons in alternative frame (between compensated FS) deleted"
+ )
+ app.add_argument(
+ "--mask_all_first_10p",
+ "--m_f10p",
+ action="store_true",
+ dest="mask_all_first_10p",
+ help="Automatically mask all inactivating mutations in first 10% of "
+ "the reading frame, ignoring ATG codons distribution."
)
if len(sys.argv) == 1:
@@ -1415,6 +1459,8 @@ def run_cesar(
cesar_cmd += " --no_lastz"
if opt_precompute:
cesar_cmd += " --memory_consumption"
+
+ verbose(f"Calling CESAR command:\n{cesar_cmd}")
p = subprocess.Popen(
cesar_cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
)
@@ -1880,9 +1926,10 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
# extract protein sequences also here, just to do it in one place
part_pIDs, part_blosums, prot_seqs_part = compute_score(codons_data)
prot_seqs[chain_id] = prot_seqs_part
- # convert codon list to {"ref": [t_codons], "que": [q_codons]}
+ # convert codon list to {"ref": [t_codons], "que": [q_codons]}
codon_seqs_part = extract_codon_data(codons_data)
codon_seqs[chain_id] = codon_seqs_part
+
(
exon_query_seqs,
exon_ref_seqs,
@@ -1925,6 +1972,7 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
percIDs[chain_id] = part_pIDs
blosums[chain_id] = part_blosums
query_coords[chain_id] = abs_coords
+
ret = (
exon_queries,
exon_refs,
@@ -1940,6 +1988,18 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
return ret
+def __check_fragm_coords_intersect_in_q(query_coords):
+ """Check whether coordinates of predicted exons intersect in the query."""
+ exon_ranges_w_chrom = list(query_coords.values())
+ exon_ranges = [x.split(":")[1].split("-") for x in exon_ranges_w_chrom]
+ exon_starts = [int(x[0]) for x in exon_ranges]
+ exon_ends = [int(x[1]) for x in exon_ranges]
+ exon_intervals = list(zip(exon_starts, exon_ends))
+ exon_intervals_sorted = sorted(exon_intervals, key=lambda x: x[0])
+ intersecting_intervals = _check_seq_of_intervals_intersect(exon_intervals_sorted)
+ return intersecting_intervals
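+# illustration: {1: "chrX:100-200", 2: "chrX:150-300"} -> (100, 200);
+# fully disjoint coordinates give None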
+
+
def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts):
"""Process CESAR output for assembled from fragments gene."""
exon_queries, exon_refs, percIDs, blosums, query_coords, = (
@@ -1953,7 +2013,7 @@ def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts)
cesar_raw_lines = cesar_raw_out.split("\n")
target_seq_raw = cesar_raw_lines[1]
query_seq_raw = cesar_raw_lines[3]
- chain_id_to_codon_table = {}
+ chain_id_to_codon_table = {} # chain id -> raw codon table
codons_data = parse_cesar_out(target_seq_raw, query_seq_raw, v=VERBOSE)
chain_id_to_codon_table[FRAGMENT] = codons_data
@@ -2031,6 +2091,14 @@ def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts)
# this is completely OK
exon_grange = f"{q_chrom}:{exon_abs_start}-{exon_abs_end}"
abs_coords[exon_num] = exon_grange
+
+ # check that abs coords do not intersect
+ coords_intersect = __check_fragm_coords_intersect_in_q(abs_coords)
+ if coords_intersect: # error, cannot go further
+ print("Error! Cannot stitch fragments properly, exon intervals intersect after merge")
+ print(f"Intersecting intervals are: {coords_intersect}")
+ print("Abort")
+ sys.exit(ERR_CODE_FRAGM_ERR)
# add to the global dicts
exon_queries[FRAGMENT] = exon_query_seqs
exon_refs[FRAGMENT] = exon_ref_seqs
@@ -2048,7 +2116,7 @@ def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts)
prot_seqs,
codon_seqs,
aa_sat_seq,
- chain_id_to_codon_table
+ chain_id_to_codon_table,
)
return ret
@@ -2185,10 +2253,12 @@ def invert_complement(seq):
return reverse_complement
-def save_prot(gene_name, prot_seq, prot_out):
+def save_prot(gene_name, prot_seq, prot_out, del_mis_exons_):
"""Save protein sequences."""
if prot_out is None:
return
+    del_mis_exons = {} if del_mis_exons_ is None else del_mis_exons_
+
f = open(prot_out, "w") if prot_out != "stdout" else sys.stdout
for chain_id, seq_collection in prot_seq.items():
proj_name = f"{gene_name}.{chain_id}"
@@ -2197,8 +2267,11 @@ def save_prot(gene_name, prot_seq, prot_out):
que_seqs = []
for exon in exons:
ref_part = seq_collection[exon]["ref"]
- que_part = seq_collection[exon]["que"]
ref_seqs.append(ref_part)
+            if exon in del_mis_exons.get(str(chain_id), set()):
+                que_part = "-" * len(ref_part)
+ else:
+ que_part = seq_collection[exon]["que"]
que_seqs.append(que_part)
ref_aa_seq = "".join(ref_seqs)
que_aa_seq = "".join(que_seqs)
@@ -2353,7 +2426,7 @@ def extend_rel_regions(extended_regions, query_len):
return ext_regions
-def read_predefined_regions(arg):
+def read_predefined_regions(arg, gene):
"""Read predefined regions."""
ret = defaultdict(list)
if arg is None:
@@ -2361,23 +2434,67 @@ def read_predefined_regions(arg):
f = open(arg, "r")
for line in f:
line_data = line.rstrip().split("\t")
- line_strip = line.rstrip()
- chain_id = int(line_data[0])
- ret[chain_id].append(line_strip)
+ line_gene = line_data[0]
+ if line_gene != gene:
+ continue
+ chain_id = int(line_data[1])
+ start = line_data[2]
+ end = line_data[3]
+ line_to_out = f"{chain_id}\t{start}\t{end}"
+ ret[chain_id].append(line_to_out)
f.close()
return ret
+def _check_seq_of_intervals_intersect(intervals):
+    """In a start-sorted interval list, return the first interval overlapping the next one, or None."""
+ intervals_num = len(intervals)
+ for i in range(intervals_num - 1):
+ # (start, end)
+ curr_one = intervals[i]
+ next_one = intervals[i + 1]
+ # sorted by beginning
+ # if start of the next < end of the curr
+ # -> they intersect
+ if next_one[0] < curr_one[1]:
+ return (curr_one[0], curr_one[1])
+ return None # nothing suspicious found
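+# illustration (input must be sorted by start):
+#   _check_seq_of_intervals_intersect([(1, 10), (5, 20)]) -> (1, 10)
+#   _check_seq_of_intervals_intersect([(1, 10), (10, 20)]) -> None (touching is fine)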
+
+
+def parse_precomp_orth_loci(transcript_name, path):
+ """Read precomputed orthologous loci from a file."""
+ ret_1 = {} # chain to search locus
+ ret_2 = {} # chain to subchain locus
+
+ f = open(path, "r")
+ # sample file line:
+ # #ORTHLOC ENST00000262455 1169 JH567521:462931-522892 JH567521:462931-522892
+    # fields: marker - transcript - chain - search locus - subchain locus
+ for line in f:
+ ld = line.rstrip().split("\t")
+ if ld[1] != transcript_name:
+ continue
+ chain_id = int(ld[2])
+ ret_1[chain_id] = ld[3]
+ ret_2[chain_id] = ld[4]
+ f.close()
+
+ return ret_1, ret_2
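+# illustration, using the sample line above:
+#   search_loci, subch_loci = parse_precomp_orth_loci("ENST00000262455", path)
+#   search_loci[1169] == "JH567521:462931-522892"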
+
+
def redo_codon_sequences(codon_tables, del_mis_exons):
"""Rebuild codon alignments: now excluding deleted and missing exons."""
- ret = {}
+ codon_ret = {}
for chain_id, codon_table in codon_tables.items():
excl_exons = del_mis_exons.get(str(chain_id), set())
codon_seqs_upd = extract_codon_data(codon_table, excl_exons=excl_exons)
- ret[chain_id] = codon_seqs_upd
- return ret
+ codon_ret[chain_id] = codon_seqs_upd
+ return codon_ret
def realign_exons(args):
"""Entry point."""
@@ -2419,11 +2536,26 @@ def realign_exons(args):
aa_block_sat_chain = {} # one of dicts to mark exceptionally good exon predictions
fragments_data = [] # required for fragmented genomes
chains_in_input = True
- force_include_regions_opt_v = (
- []
- ) # regions we force to include in the optimized version
+
+    # regions we force to include in the optimized version;
+    # these are not the orthologous loci in the query: the optimised CESAR
+    # versions use them as sub-regions within the orthologous locus that
+    # must be included in the CESAR run itself
+    force_include_regions_opt_v = []
+
verbose("Reading query regions")
- chain_to_predefined_regions = read_predefined_regions(args["predefined_regions"])
+    chain_to_predefined_regions = read_predefined_regions(
+        args["predefined_regions"], args["gene"]
+    )
+
+    # no need to extract chain blocks and compute where the orthologous locus is:
+    # it is already precomputed
+ chain_to_precomp_search_loci = {}
+ chain_to_precomp_subch_loci = {}
+ if args["precomputed_orth_loci"]:
+ plc_ = parse_precomp_orth_loci(args["gene"], args["precomputed_orth_loci"])
+ chain_to_precomp_search_loci = plc_[0]
+ chain_to_precomp_subch_loci = plc_[1]
+
# if chains and args["fragments"]:
# the normal branch: call CESAR vs 1+ query sequences
for chain_id in chains: # in region more this part is skipped
@@ -2435,9 +2567,34 @@ def realign_exons(args):
# most likely we need only the chain part that intersects the gene
# and skip the rest:
verbose("Cutting the chain...")
- search_locus, subch_locus, chain_data = chain_cut(
- chain_str, gene_range, args["gene_flank"], args["extra_flank"]
- )
+
+ if args["precomputed_orth_loci"]:
+ search_locus = chain_to_precomp_search_loci[chain_id]
+ subch_locus = chain_to_precomp_subch_loci[chain_id]
+ # TODO: chain_data contains non-necessary information
+ # it's already present in the chain_header variable
+ # can be optimised (historical reasons)
+ _t_strand = True if chain_header[2] == "+" else False
+ _q_strand = True if chain_header[7] == "+" else False
+ _t_size = int(chain_header[3])
+ _q_size = int(chain_header[8])
+                chain_data = (_t_strand, _t_size, _q_strand, _q_size)
+ else:
+ search_locus, subch_locus, chain_data = chain_cut(
+ chain_str, gene_range, args["gene_flank"], args["extra_flank"]
+ )
+
+ if args["save_orth_locus"]:
+                # write a line about the orthologous locus to stdout
+                # warning: to be used only at the precompute step,
+                # not for main CESAR calls (those that actually run CESAR)
+                # if you still want to do this in the main pipeline: exclude lines
+                # starting with ORTH_LOC_LINE_SUFFIX from the CESAR wrapper output
+ g_ = args["gene"]
+ line = f"{ORTH_LOC_LINE_SUFFIX}\t{g_}\t{chain_id}\t{search_locus}\t{subch_locus}\n"
+ sys.stdout.write(line)
+
# chain data: t_strand, t_size, q_strand, q_size
chain_qStrand = chain_data[2]
chain_qSize = chain_data[3]
@@ -2470,6 +2627,8 @@ def realign_exons(args):
_query_seq_ext, subch_locus, args["gap_size"], directed
)
# blocks are [target_start, target_end, query_start, query_end]
+        # TODO: can be optimised here; the result could also be cached or logged,
+        # since the same chain & bed always produce the same output
subchain_blocks_raw = extract_subchain(chain_str, subch_locus)
# swap blocks in correct orientation and fill interblock ranges
subchain_blocks = orient_blocks(subchain_blocks_raw, chain_data)
@@ -2581,6 +2740,7 @@ def realign_exons(args):
fragments_data = sorted(fragments_data, key=lambda x: x[5])
else: # gene is - -> reverse sort of chains
fragments_data = sorted(fragments_data, key=lambda x: x[6], reverse=True)
+
# merge query feat dictionaries
exon_gap = merge_dicts(chain_exon_gap.values())
exon_class = merge_dicts(chain_exon_class.values())
@@ -2588,6 +2748,7 @@ def realign_exons(args):
aa_block_sat = merge_dicts(aa_block_sat_chain.values())
missing_exons = intersect_lists(chain_missed.values())
+
query_seq_chunks = []
for elem in fragments_data:
# stitch query seq in a proper order; elem[0] -> chain_id
@@ -2643,7 +2804,18 @@ def realign_exons(args):
append_u12(args["u12"], args["gene"], ref_ss_data)
make_cesar_in(prepared_exons, query_sequences, cesar_in_filename, ref_ss_data)
# run cesar itself
-    cesar_bin = args.get("cesar_binary") if args.get("cesar_binary") else DEFAULT_CESAR
+    if args.get("cesar_binary"):
+        cesar_bin = args.get("cesar_binary")
+    elif args["opt_cesar"] or args["opt_precompute"]:
+        cesar_bin = OPT_CESAR_LOCATION
+    else:
+        cesar_bin = DEFAULT_CESAR
+
if not args["cesar_output"]:
cesar_raw_out = run_cesar(
cesar_in_filename,
@@ -2662,8 +2834,6 @@ def realign_exons(args):
else: # very specific case, load already saved CESAR output
with open(args["cesar_output"], "r") as f:
cesar_raw_out = f.read()
- # if force_include_reg_file:
- # os.remove(force_include_reg_file) if os.path.isfile(force_include_reg_file) else None
os.remove(cesar_in_filename) if is_temp else None # wipe temp if temp
# save raw CESAR output and close if required
save(cesar_raw_out, args["raw_output"], t0) if args["raw_output"] else None
@@ -2682,13 +2852,12 @@ def realign_exons(args):
query_coords = proc_out[4] # genomic coordinates in the query
exon_num_corr = proc_out[5] # in case of intron del: ref/que correspondence
prot_s = proc_out[6] # protein sequences in query
- codon_s = proc_out[7]
+ codon_s = proc_out[7] # dict containing sequences of ref and query codons
aa_cesar_sat = proc_out[8] # says whether an exon has outstanding quality
- # raw codon table \ superset of "codon_s" basically
+    # raw codon table; basically a superset of "codon_s"
# after changes in TOGA1.1 is needed again
# TODO: needs refactoring
codon_tables = proc_out[9]
-
aa_eq_len = aa_eq_len_check(exon_sequences, query_exon_sequences)
if chains:
@@ -2697,6 +2866,7 @@ def realign_exons(args):
chain_exon_class = get_a_plus(
chain_exon_class, aa_cesar_sat, aa_block_sat_chain, aa_eq_len
)
+
# time to arrange all these data altogether
final_output, chain_ex_inc = arrange_output(
args["gene"],
@@ -2734,21 +2904,25 @@ def realign_exons(args):
ref_ss=ref_ss_data,
sec_codons=sec_codons,
no_fpi=args["no_fpi"],
+ alt_f_del=args["alt_frame_del"],
+ mask_all_first_10p=args["mask_all_first_10p"],
)
else: # do not call inact mut scanner
loss_report = None
del_mis_exons = None
-
+
# del_mis_exons contains:
# chain_id(string) = [0-based exon nums]
- if del_mis_exons is not None and len(del_mis_exons.keys()) > 0:
+ need_correct_codon_and_prot = del_mis_exons is not None and len(del_mis_exons.keys()) > 0
+ if need_correct_codon_and_prot:
# if exists, need to filter codon alignment accordingly
# chain id is numeric in "codon_table"
codon_s = redo_codon_sequences(codon_tables, del_mis_exons)
    # save protein/codon ali and text output
-    save_prot(args["gene"], prot_s, args["prot_out"])
+    save_prot(args["gene"], prot_s, args["prot_out"], del_mis_exons)
save_codons(args["gene"], codon_s, args["codon_out"])
save(final_output, args["output"], t0, loss_report)
sys.exit(0)
diff --git a/cesar_runner.py b/cesar_runner.py
index fcf0b72..f59999a 100755
--- a/cesar_runner.py
+++ b/cesar_runner.py
@@ -12,6 +12,9 @@
__credits__ = ["Michael Hiller", "Virag Sharma", "David Jebb"]
MAX_ATTEMPTS = 2
+ZERO_CODE = 0
+ERR_CODE = 1
+FRAGM_CHAIN_ISSUE_CODE = 2
def parse_args():
@@ -23,6 +26,7 @@ def parse_args():
"--check_loss", default=None, help="File to save gene loss data if requested"
)
app.add_argument("--rejected_log", default=None, help="Log gene rejection events")
+ app.add_argument("--unproc_log", "--ul", default=None, help="Log unprocessed genes")
# print help if there are no args
if len(sys.argv) < 2:
app.print_help()
@@ -43,13 +47,22 @@ def call_job(cmd):
rc = p.returncode
cmd_out = b_stdout.decode("utf-8")
err_msg = b_stderr.decode("utf-8").replace("\n", " ")
- if rc == 0:
- return cmd_out, 0
+ if rc == ZERO_CODE:
+ return cmd_out, ZERO_CODE
+ elif rc == FRAGM_CHAIN_ISSUE_CODE:
+ err_msg = f"CESAR_wrapper.py detected that fragments overlap for {cmd}, abort"
+ return err_msg, FRAGM_CHAIN_ISSUE_CODE
else:
eprint(err_msg)
eprint(f"\n{cmd} FAILED")
attempts += 1
- return err_msg, 1 # send failure signal
+ return err_msg, ERR_CODE # send failure signal
+
+
+def __job_to_transcript(job):
+ """Extract transcript ID from job."""
+ fields = job.split()
+ return fields[1]
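+# illustration: a job "/path/CESAR_wrapper.py ENST00000262455 1169,1170 ..." -> "ENST00000262455"
+# (per WRAPPER_TEMPLATE, the transcript is the first argument after the wrapper path)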
def main():
@@ -60,6 +73,7 @@ def main():
# text file, a command per line
jobs = [x.rstrip() for x in f.readlines()]
jobs_num = len(jobs)
+ unprocessed_genes = []
out = open(args.output, "w") # handle output file
gene_loss_data = [] # list to keep gene loss detector out
@@ -69,6 +83,12 @@ def main():
eprint(f"Calling:\n{job}")
# catch job stdout
job_out, rc = call_job(job)
+ if rc == FRAGM_CHAIN_ISSUE_CODE:
+                # very special case -> nothing we can do
+                # mark as missing, I guess
+                rejected.append(f"{job}\tfragment chains overlap\n")
+ unprocessed_genes.append(__job_to_transcript(job))
+ continue
-            if rc == 1:
+            if rc == ERR_CODE:
# a job failed with code 1 -> send the signal upstream
# abort execution, write what job exactly failed
@@ -136,6 +156,11 @@ def main():
f.write("".join(rejected))
f.close()
+ if args.unproc_log and len(unprocessed_genes) > 0:
+ f = open(args.unproc_log, "w")
+ for elem in unprocessed_genes:
+ f.write(f"{elem}\n")
+ f.close()
if __name__ == "__main__":
main()
diff --git a/chain_runner.py b/chain_runner.py
index 013b790..8cfcd28 100755
--- a/chain_runner.py
+++ b/chain_runner.py
@@ -560,12 +560,14 @@ def main():
# 2) an argument: "chain ,-sep list of genes"
batch = read_input(args.input_file)
task_size = len(batch)
+    # TODO: check whether the .bst file is needed at all
# load chains dict; it would be much faster to load chain_ID: (start_byte, offset)
# python dict once than ask HDF5 database each time TOGA needs another chain
index_file = args.chain_file.replace(".chain", ".chain_ID_position")
chain_dict = load_chain_dict(index_file)
# call main processing tool
+    # TODO: rename genes to transcripts where appropriate
for job_num, (chain, genes) in enumerate(batch.items(), 1):
# one unit: one chain + intersected genes
# call routine that extracts chain feature
diff --git a/modules/GLP_values.py b/modules/GLP_values.py
index b633036..a152587 100644
--- a/modules/GLP_values.py
+++ b/modules/GLP_values.py
@@ -6,7 +6,12 @@
DEL_MISS = {MISS_EXON, DEL_EXON}
COMPENSATION = "COMPENSATION"
SSM = "SSM"
+# (ag)acceptor-EXON-donor(gt)
+SSM_D = "SSMD" # Donor, right, GT,GC
+SSM_A = "SSMA" # Acceptor, left, AG
+
START_MISSING = "START_MISSING"
+ATG = "ATG"
FS_DEL = "FS_DEL"
FS_INS = "FS_INS"
BIG_DEL = "BIG_DEL"
diff --git a/modules/classify_chains.py b/modules/classify_chains.py
index 3ec1255..e7e080d 100755
--- a/modules/classify_chains.py
+++ b/modules/classify_chains.py
@@ -91,6 +91,7 @@ def classify_chains(
# -> then this is a proc pseudogene
# move trans chains to a different dataframe
+ # TODO: rename trans -> spanning
    # trans chain -> a syntenic chain that passes through the gene body
# but has no aligning bases in the CDS
trans_lines = df[(df["exon_cover"] == 0) & (df["synt"] > 1)]
diff --git a/modules/gene_losses_summary.py b/modules/gene_losses_summary.py
index fee28a9..7d21130 100755
--- a/modules/gene_losses_summary.py
+++ b/modules/gene_losses_summary.py
@@ -144,13 +144,13 @@ def read_loss_data(loss_dir):
perc = float(line_data[2].split()[1])
projection_to_p_intact_M_intact[projection_id] = perc
continue
- elif line_data[2].startswith("MIDDLE_80%_INTACT"):
- # flag: are there inact mutations in the middle 80% of CDS?
+ elif line_data[2].startswith("MIDDLE_IS_INTACT"):
+ # flag: are there inact mutations in the first 90%/mid 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
proj_to_80_p_intact[projection_id] = val
continue
- elif line_data[2].startswith("MIDDLE_80%_PRESENT"):
+ elif line_data[2].startswith("MIDDLE_IS_PRESENT"):
        # flag: any missing fragment in the middle 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
@@ -246,6 +246,7 @@ def get_projection_classes(
p_intact_M_ign = p_to_pint_m_ign.get(projection, -1)
p_intact_M_int = p_to_pint_m_int.get(projection, -1)
p_i_codons = p_to_i_codon_prop.get(projection, -1)
+ # TODO: rename to NO LOSS IN FIRST 90%
no_loss_in_80_p = p_80_int.get(projection, None)
m_80_present = p_80_pre.get(projection, None)
frame_oub = p_to_p_out_of_bord.get(projection, 0.0)
diff --git a/modules/inact_mut_check.py b/modules/inact_mut_check.py
index c4809d2..059fa8d 100755
--- a/modules/inact_mut_check.py
+++ b/modules/inact_mut_check.py
@@ -124,7 +124,17 @@ def mask_mut(mut):
return upd_mut
-def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None):
+def analyse_splice_sites(
+ ref,
+ query,
+ gene,
+ chain,
+ codon_table,
+ atg_codons_data,
+ mask_all_first_10p=False,
+ u12_data=None,
+ v=None,
+):
"""Check correctness of the splice sites."""
u12_data = set() if not u12_data else u12_data # load U12 data if provided
mut_counter = 1 # mut counter -> for mutations IDs
@@ -172,57 +182,99 @@ def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None):
# so we can get splice site coordinates
# if there is N in the splice site -> we don't know what's there
# -> we are not sure -> mask this mutation
- left_splice_site = query[start - 2 : start]
- left_splice_site_N = "n" in left_splice_site or "N" in left_splice_site
- right_splice_site = query[end + 1 : end + 3]
- right_splice_size_N = "n" in right_splice_site or "N" in right_splice_site
+        acceptor_splice_site = query[start - 2 : start]  # left side: AG expected
+        acceptor_splice_site_N = (
+            "n" in acceptor_splice_site or "N" in acceptor_splice_site
+        )
+        donor_splice_site = query[end + 1 : end + 3]  # right side: GT/GC expected
+        donor_splice_site_N = "n" in donor_splice_site or "N" in donor_splice_site
+
+        # assign to the last / first codon of the exon (depending on which splice site is affected)
+        codon_pos_at_exon = [
+            n for n, c in enumerate(codon_table, 1) if c["t_exon_num"] == exon_num_
+        ]
+ acceptor_codon_num = (
+ codon_pos_at_exon[0] if len(codon_pos_at_exon) > 0 else None
+ )
+ donor_codon_num = codon_pos_at_exon[-1] if len(codon_pos_at_exon) > 0 else None
# check that splice sites are canonical
- left_site_wrong = left_splice_site not in LEFT_SPLICE_CORR
- right_site_wrong = right_splice_site not in RIGHT_SPLICE_CORR
+ acceptor_site_wrong = acceptor_splice_site not in LEFT_SPLICE_CORR
+ donor_site_wrong = donor_splice_site not in RIGHT_SPLICE_CORR
+
eprint(
- f"Exon {exon_num}; L_SPS: {left_splice_site}; R_SPS: {right_splice_site}"
+ f"Exon {exon_num}; L_SPS: {acceptor_splice_site}; R_SPS: {donor_splice_site}"
) if v else None
- if exon_num != 1 and left_site_wrong:
+ if exon_num != 1 and acceptor_site_wrong and acceptor_codon_num:
# add mutation for left (acceptor) splice site
# doesn't apply to the first exon obviously
# mask this mutation if it's suspected to be U12 splice site
mask = True if (exon_num, 0) in u12_data else False
# if N in the splice site -> also mask it
- mask = True if left_splice_site_N else mask
+ mask = True if acceptor_splice_site_N else mask
+
+            # mask if the splice site mutation is in the first 10% and is followed by an ATG still within the first 10%
+ atg_mask = _define_whether_mask(
+ acceptor_codon_num,
+ atg_codons_data["left_t"],
+ atg_codons_data["right_t"],
+ atg_codons_data["atg_codon_nums"],
+ mask_all_first_10p=mask_all_first_10p,
+ )
+
+ mask = True if atg_mask is True else mask
+
# create mutation object, describe what happened
- mut_ = f"{LEFT_SPLICE_CORR}->{left_splice_site}"
- mut_id = f"SSM_{mut_counter}"
+ mut_ = f"{LEFT_SPLICE_CORR}->{acceptor_splice_site}"
+ mut_id = f"{SSM_A}_{mut_counter}"
mut_counter += 1
mut = Mutation(
gene=gene,
chain=chain,
exon=exon_num,
- position=0,
- mclass=SSM,
+ position=acceptor_codon_num,
+ mclass=SSM_A,
mut=mut_,
masked=mask,
mut_id=mut_id,
)
sps_report.append(mut) # add mutation to the list
- if exon_num != exons_num and right_site_wrong:
+ if exon_num != exons_num and donor_site_wrong and donor_codon_num:
# add mutation for right (donor) splice site
# doesn't apply to the last exon
# mask this mutation if it's suspected to be U12 splice site
mask = True if (exon_num, 1) in u12_data else False
- mask = True if right_splice_size_N else mask # if N in mutation -> mask it
+            mask = True if donor_splice_site_N else mask  # if N in mutation -> mask it
+
+            # mask if the splice site mutation is in the first 10% and is followed by an ATG still within the first 10%
+ atg_mask = _define_whether_mask(
+ donor_codon_num,
+ atg_codons_data["left_t"],
+ atg_codons_data["right_t"],
+ atg_codons_data["atg_codon_nums"],
+ mask_all_first_10p=mask_all_first_10p,
+ )
+
+ mask = True if atg_mask is True else mask
+
# create mutation object
- mut_ = f"{RIGHT_SPLICE_CORR}->{right_splice_site}"
- mut_id = f"SSM_{mut_counter}"
+ mut_ = f"{RIGHT_SPLICE_CORR}->{donor_splice_site}"
+ mut_id = f"{SSM_D}_{mut_counter}"
mut_counter += 1
mut = Mutation(
gene=gene,
chain=chain,
exon=exon_num,
- position=1,
- mclass=SSM,
+ position=donor_codon_num,
+ mclass=SSM_D,
mut=mut_,
masked=mask,
mut_id=mut_id,
@@ -242,16 +294,22 @@ def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None):
curr = sps_report[i]
curr_exon = curr.exon
prev_exon = prev.exon
+
if curr_exon != prev_exon + 1:
# if current exon doesn't follow the previous immediately -> not the case
# like prev mut exon is 3 and current is 6
continue
+
# if exons follow each other (like 3 and 4) then continue
curr_pos = curr.position
prev_pos = prev.position
+ curr_type = curr.mclass
+        prev_type = prev.mclass
+
# they must belong to the same intron, check this
- if not (curr_pos == 0 and prev_pos == 1):
+ if not (prev_type == SSM_D and curr_type == SSM_A):
continue
+
prev_to_what = prev.mut.split("->")[1]
curr_to_what = curr.mut.split("->")[1]
# if it was -- in both cases: this is intron deletion
@@ -323,15 +381,78 @@ def corr_exon_num_or_no_fs(codon, exon_num):
return exon_num
+def _find_atg_codons(codon_table):
+ """Find reference codons aligned to ATG."""
+ atg_codon_nums = []
+ for num, codon in enumerate(codon_table, 1):
+ que_codon = codon["que_codon"]
+ que_codon_no_gap = que_codon.replace("-", "")
+ if not que_codon_no_gap:
+ # there are only gaps -> nothing to catch
+ continue
+ triplets = parts(que_codon_no_gap, n=3)
+ start_triplets = [x for x in triplets if x == "ATG"]
+ if len(start_triplets) > 0:
+ atg_codon_nums.append(num)
+ return atg_codon_nums
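+# illustration: a query codon "AT-G" becomes "ATG" after gap removal, so the
+# returned list holds 1-based positions of reference codons aligned to an ATG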
+
+
+def _get_next_bigger_num(num, lst):
+    """Return the first element of the sorted list that is >= num."""
+ for elem in lst:
+ if elem >= num:
+ return elem
+ return 999999999 # TODO: ideally, last codon position
+
+
+def _define_whether_mask(
+ num, left_t, right_t, atg_codon_nums, mask_all_first_10p=False
+):
+ """Check whether the mutation is going to be masked due to first/last 10% or not."""
+ # TODO: optimise this part, the only ATG position needed is the closest to 10%
+ # which is not above 10%, that's it.
+ if num >= right_t:
+ return True
+ elif left_t < num < right_t:
+ return False
+ if num <= left_t and mask_all_first_10p is True:
+ # automatically mask mut in first 10%
+ # don't account for ATG codons distribution
+ return True
+ # num in first 10%, need to find the next start
+ next_atg_pos = _get_next_bigger_num(num, atg_codon_nums)
+ return next_atg_pos <= left_t
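+# illustration with 100 codons (left_t=10, right_t=90) and ATGs at [1, 8, 40]:
+#   _define_whether_mask(5, 10, 90, [1, 8, 40]) -> True (next ATG at 8, still in first 10%)
+#   _define_whether_mask(9, 10, 90, [1, 8, 40]) -> False (next ATG at 40 lies beyond left_t)
+#   _define_whether_mask(95, 10, 90, [1, 8, 40]) -> True (in the last 10%)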
+
+
+def make_atg_data(codon_table):
+    """Collect ATG codon positions and the first/last 10% thresholds for masking."""
+    codons_num = len(codon_table)
+
+    # get first and last 10% of CDS: mask mutations in these regions:
+ perc_10 = codons_num // 10
+ left_t = perc_10
+ right_t = codons_num - perc_10
+
+ atg_codon_nums = _find_atg_codons(codon_table)
+
+ # TODO: refactor this
+ atg_codon_nums_data = {
+ "left_t": left_t,
+ "right_t": right_t,
+ "atg_codon_nums": atg_codon_nums,
+ }
+ return atg_codon_nums_data
+
+
def scan_rf(
codon_table,
gene,
chain,
+ atg_codon_nums_data,
exon_stat=None,
v=False,
big_indel_thrs=None,
sec_codons=None,
no_fpi=False,
+ mask_all_first_10p=False,
):
"""Scan codon table for inactivating mutations."""
# sec_codons -> selenocysteine-coding codons in reference
@@ -339,13 +460,9 @@ def scan_rf(
codons_num = len(codon_table)
in_mut_report = [] # save mutations here
- # get first or last 10% if CDS: mask mutations in this region:
- perc_10 = codons_num // 10
- left_t = perc_10
- right_t = codons_num - perc_10
-
# init mutation counters: for IDs
mut_number_stop = 1
+ mut_number_atg = 1 # not inact, but need to track
mut_number_fs = 1
mut_number_big_indel = 1
q_dels_in_a_row = 0 # number of deletions in a row: for big indel detection
@@ -353,7 +470,14 @@ def scan_rf(
for num, codon in enumerate(codon_table, 1):
# go codon-by-codon
# if in first/last 10%: it will be masked
- mask = True if num <= left_t or num >= right_t else False
+ mask = _define_whether_mask(
+ num,
+ atg_codon_nums_data["left_t"],
+ atg_codon_nums_data["right_t"],
+ atg_codon_nums_data["atg_codon_nums"],
+ mask_all_first_10p=mask_all_first_10p,
+ )
# determine whether it's the first or last exon:
last_codon = True if num == codons_num else False
first_codon = True if num == 1 else False
@@ -407,7 +531,10 @@ def scan_rf(
position=1,
mclass=START_MISSING,
mut=que_codon_,
- masked=mask,
+ # this mutation should not affect the classification
+ # so it's always masked, but saved to indicate that
+ # 1st ATG is missing:
+ masked=True,
mut_id=mut_id,
)
in_mut_report.append(mut)
@@ -487,6 +614,8 @@ def scan_rf(
# need to split query sequence in triplets
# check that any of them is a stop-codon
stop_triplets = [x for x in triplets if x in STOPS]
+ start_triplets = [x for x in triplets if x == "ATG"]
+
if len(stop_triplets) > 0 and not last_codon:
# we have premature stop codon
mut_ = f"{ref_codon.replace('-', '')}->{stop_triplets[0]}"
@@ -532,6 +661,27 @@ def scan_rf(
if v:
eprint("Detected STOP")
eprint(codon)
+ if len(start_triplets) > 0:
+ # not an inactivating mutation but needs to be saved
+ mut_ = f"{ref_codon.replace('-', '')}->ATG"
+ mclass = ATG
+ mut_id = f"ATG_{mut_number_atg}"
+ st_ex_num = ex_num
+ mut_number_atg += 1
+ mut = Mutation(
+ gene=gene,
+ chain=chain,
+ exon=st_ex_num,
+ position=num,
+ mclass=mclass,
+ mut=mut_,
+ masked=True,
+ mut_id=mut_id,
+ )
+ in_mut_report.append(mut)
+ if v:
+ eprint("Detected ATG (non-inactivating)")
+ eprint(codon)
# I can infer number of codons in each exon directly from codon table
return in_mut_report
@@ -550,7 +700,7 @@ def detect_compensations(inact_mut, codon_table):
fs_num = len(fs)
if fs_num <= 1:
# need at least 2 frameshifs, otherwise there is nothing to compensate
- return answer
+ return answer, []
# first iteration: detect potential compensatory events, based only
# on their sizes
potent_compensations = []
@@ -583,10 +733,12 @@ def detect_compensations(inact_mut, codon_table):
potent_compensations.append(fs_ids)
break
if len(potent_compensations) == 0:
- return [] # no potential compensations, skip this
+ return [], [] # no potential compensations, skip this
# verify potential compensations, check for stop codons in alt frame
what_is_compensated = set() # to avoid twice compensated FS
+    alt_frame_codons = []  # (start, end) codon positions of alt frame stretches
+
for comp in potent_compensations:
# comp -> a list of potentially compensated FS IDs
if len(what_is_compensated.intersection(comp)) > 0:
@@ -610,7 +762,7 @@ def detect_compensations(inact_mut, codon_table):
# add compensation track
# no stop codons
# to create a mut object we need gene name, chain id, exon num etc
- # I just use the first mut in the row for that (ethalon)
+ alt_frame_codons.append((start_pos - 1, end_pos))
ethalon_mut = comp_muts[0]
gene = ethalon_mut.gene
chain = ethalon_mut.chain
@@ -620,7 +772,11 @@ def detect_compensations(inact_mut, codon_table):
mut_id = f"C_{m_counter}"
# mutation -> comma-separated list of compensated mutation IDs
fs_ids = [x.split("_")[1] for x in comp]
- mut = "FS_{}".format(",".join(fs_ids))
+ # mut = "FS_{}".format(",".join(fs_ids))
+ # changed format FEB 2022: FS_{start_num}-{end_num}
+ # just a comma-separated list can be too long
+ _ids_range = f"{fs_ids[0]}-{fs_ids[-1]}"
+ mut = f"FS_{_ids_range}"
comp_mut = Mutation(
gene=gene,
chain=chain,
@@ -632,11 +788,12 @@ def detect_compensations(inact_mut, codon_table):
mut_id=mut_id,
)
answer.append(comp_mut)
+        # the alt frame codon range for this compensation was recorded above
for c in comp: # add compensated muts to comp muts set
# to avoid adding comp mutations twice
what_is_compensated.add(c)
m_counter += 1
- return answer
+ return answer, alt_frame_codons
def mask_compensated_fs(mut_list):
@@ -648,7 +805,11 @@ def mask_compensated_fs(mut_list):
comp_fs_ids = [] # there are compensatory events
for cmp in comp_muts:
# get list of comp FS IDs
- fs_nums = cmp.mut.split("_")[1].split(",")
+ comp_ids_range_str = cmp.mut.split("_")[1].split("-")
+ _comp_start = int(comp_ids_range_str[0])
+ _comp_end = int(comp_ids_range_str[1])
+ fs_nums = list(range(_comp_start, _comp_end + 1))
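+        # e.g. cmp.mut "FS_4-7" -> fs_nums [4, 5, 6, 7] -> IDs FS_4..FS_7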
fs_ids = [f"FS_{x}" for x in fs_nums]
comp_fs_ids.extend(fs_ids)
comp_fs_ids = set(comp_fs_ids)
@@ -680,6 +841,13 @@ def compute_percent_id(seq_1, seq_2):
return pid
+def _get_last_codon_for_each_exon(codon_table):
+    """Map 1-based exon number to the 1-based number of its last codon."""
+ ret = {}
+ for num, elem in enumerate(codon_table, 1):
+ ret[elem["t_exon_num"] + 1] = num
+ return ret
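+# illustration: t_exon_num fields [0, 0, 1, 1, 1] -> {1: 2, 2: 5}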
+
+
def classify_exons(
gene,
que,
@@ -690,16 +858,20 @@ def classify_exons(
exon_blosum,
missing_exons,
ex_inc,
+ atg_codons_data,
+ mask_all_first_10p=False,
v=False,
):
"""Classify exons as intact, deleted and missing."""
del_num, miss_num = 1, 1 # counters for mutation IDs
# get a list of exon numbers:
exon_nums = list(range(codon_table[-1]["t_exon_num"] + 1))
+ exon_to_last_codon_of_exon = _get_last_codon_for_each_exon(codon_table)
    exons_report = []  # save data here
exon_stat = [
"X",
] # exon status, start with 1, X - placeholder
+
del_miss_nums = []
for exon_num in exon_nums:
@@ -734,7 +906,7 @@ def classify_exons(
# classify whether it's deleted or not:
# print(f"Exon {exon_num} classification with the following params: ") if v else None
# print(ex_class, exon_excl, ex_pid, ex_blosum) if v else None
- del_, q = classify_exon(ex_class, exon_excl, ex_pid, ex_blosum, v=v)
+ ex_non_del, q = classify_exon(ex_class, exon_excl, ex_pid, ex_blosum, v=v)
# print(f"Results are: {del_} {q}") if v else None
if ex_class == "M" or ex_gap:
@@ -755,8 +927,18 @@ def classify_exons(
exon_stat.append("M")
del_miss_nums.append(exon_num)
continue
- elif del_ is False:
+ elif ex_non_del is False:
# exon is deleted: need to write about this
+ last_codon_num = exon_to_last_codon_of_exon.get(ex_num_, 0)
+
+ atg_mask = _define_whether_mask(
+ last_codon_num,
+ atg_codons_data["left_t"],
+ atg_codons_data["right_t"],
+ atg_codons_data["atg_codon_nums"],
+ mask_all_first_10p=mask_all_first_10p,
+ )
+
mut = Mutation(
gene=gene,
chain=que,
@@ -764,7 +946,7 @@ def classify_exons(
position=0,
mut="-",
mclass=DEL_EXON,
- masked=False,
+ masked=atg_mask,
mut_id=f"DEL_{del_num}",
)
del_num += 1
@@ -817,11 +999,11 @@ def muts_to_text(
strings.append(out_of_ch_line)
for q, val in m_80_i.items():
val_str = "TRUE" if val is True else "FALSE"
- m_80_line = f"# {gene}\t{q}\tMIDDLE_80%_INTACT {val_str}"
+ m_80_line = f"# {gene}\t{q}\tMIDDLE_IS_INTACT {val_str}"
strings.append(m_80_line)
for q, val in m_80_p.items():
val_str = "TRUE" if val is True else "FALSE"
- m_80_line = f"# {gene}\t{q}\tMIDDLE_80%_PRESENT {val_str}"
+ m_80_line = f"# {gene}\t{q}\tMIDDLE_IS_PRESENT {val_str}"
strings.append(m_80_line)
return "\n".join(strings) + "\n"
@@ -836,7 +1018,15 @@ def get_exon_num_corr(codons_data):
return ans
-def compute_intact_perc(codon_table, mutations, q_name, v=False):
+def compute_intact_perc(
+ codon_table,
+ mutations,
+ q_name,
+ alt_frame_ranges,
+ alt_f_del=False,
+ v=False,
+ mask_all_first_10p=False,
+):
"""Compute intact %ID-related features."""
# compute per query
query_muts = [m for m in mutations if m.chain == q_name]
@@ -855,6 +1045,8 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
del_codon_nums = [
n for n, c in enumerate(codon_table) if c["t_exon_num"] in del_exons
]
safe_del_codon_nums = [
n for n, c in enumerate(codon_table) if c["t_exon_num"] in safe_del_exons
]
@@ -862,6 +1054,14 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
n for n, c in enumerate(codon_table) if c["t_exon_num"] in miss_exons
]
+ if alt_f_del is True:
+ # if so, we consider codons in alternative frame deleted
+ # alt frame -> between compensated frameshifts
+ for elem in alt_frame_ranges:
+ s_, e_ = elem
+ for codon_num in range(s_, e_):
+ safe_del_codon_nums.append(codon_num)
+
# update codons status -> what is Missing, Deleted or Lost (not-safely deleted exons)
for del_codon in del_codon_nums:
codon_status[del_codon] = "L"
@@ -871,9 +1071,26 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
codon_status[miss_codon] = "M"
# get IDs of compensated FS
- compensations = [m.mut for m in query_muts if m.mclass == COMPENSATION]
- comp_nums = ",".join([c.split("_")[1] for c in compensations]).split(",")
- comp_fs = {f"FS_{c}" for c in comp_nums}
+ compensations = [m for m in query_muts if m.mclass == COMPENSATION]
+ comp_fs = [] # list to keep compensated frameshifts
+ for comp in compensations:
+ comp_field = comp.mut
+ comp_ids_range_str = comp_field.split("_")[1].split("-")
+ # # fmt: FS_{start}-{end}
+ _comp_start = int(comp_ids_range_str[0])
+ _comp_end = int(comp_ids_range_str[1])
+ comp_ids = list(range(_comp_start, _comp_end + 1))
+ comp_fs_strings = [f"FS_{i}" for i in comp_ids]
+        comp_fs.extend(comp_fs_strings)  # keep a flat list of FS_{num} IDs
+
+ # comp_nums = ",".join([c.split("_")[1] for c in compensations]).split(",")
+ # comp_ids_range_str = comp_field.split("_")[1].split("-")
+ # # fmt: FS_{start}-{end}
+ # _comp_start = int(comp_ids_range_str[0])
+ # _comp_end = int(comp_ids_range_str[1])
+ # comp_ids = list(range(_comp_start, _comp_end + 1))
+ # comp_pairs = get_comp_pairs(comp_ids)
+ # comp_fs = {f"FS_{c}" for c in comp_nums}
# put inactivating mutations coordinates in codon status table
for m in query_muts:
@@ -890,7 +1107,7 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
continue
elif m.masked is True:
continue
- elif m.mclass == SSM:
+ elif m.mclass == SSM_A or m.mclass == SSM_D:
# deal with splice site mutations
if m.masked is True:
# U12 or N-containing splice site -> do not consider
@@ -972,8 +1189,14 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
ten_perc = gene_len // 10
# cut middle 80% codons states:
middle = codon_status_string[ten_perc:-ten_perc]
- middle_80_intact = False if "L" in middle else True
- middle_80_present = False if "M" in middle else True
+ first_90 = codon_status_string[:-ten_perc]
+
+ if mask_all_first_10p is True:
+ middle_is_intact = False if "L" in middle else True
+ else:
+ middle_is_intact = False if "L" in first_90 else True
+ middle_is_present = False if "M" in middle else True
+
# compute non-missing sequence
not_m = len(codon_status) - codon_status.count("M")
if not_m > 0: # beware of zero division error
@@ -985,8 +1208,9 @@ def compute_intact_perc(codon_table, mutations, q_name, v=False):
p_intact_ignore_m,
p_intact_intact_m,
num_of_I_codons,
- middle_80_intact,
- middle_80_present,
+ middle_is_intact,
+ middle_is_present,
)
@@ -1198,13 +1422,17 @@ def get_exon_pairs(exon_stat):
return pairs
-def detect_split_stops(codon_table, gene, q_name, exon_stat):
+def detect_split_stops(
+ codon_table, gene, q_name, exon_stat, atg_codons_data, mask_all_first_10p=False
+):
"""Considering all exon deletions find all split stop codons."""
# we need to get pairs of intact (not deleted or missing) exons
# between what there is a row of Deleted exons (but not missing)
# if there is a missing exon in between -> do not make any conclusions
# split stop codons may occur between these pairs
i_exon_pairs = get_exon_pairs(exon_stat)
+ exon_to_last_codon_of_exon = _get_last_codon_for_each_exon(codon_table)
+
if len(i_exon_pairs) == 0:
# no such pairs -> no way to detect split stop
return [] # return nothing
@@ -1212,6 +1440,7 @@ def detect_split_stops(codon_table, gene, q_name, exon_stat):
mut_num = 1
muts = []
for pair in i_exon_pairs:
+ # those are 1-based
first_exon = pair[0]
second_exon = pair[1]
# in codon table numbers are 0-based, so correct
@@ -1248,14 +1477,23 @@ def detect_split_stops(codon_table, gene, q_name, exon_stat):
# ex_num = second_exon if s_part_len > f_part_len else first_exon
mut_id = f"SP_STOP_{mut_num}"
mut_num += 1
+ position = exon_to_last_codon_of_exon[first_exon]
+ atg_mask = _define_whether_mask(
+ position,
+ atg_codons_data["left_t"],
+ atg_codons_data["right_t"],
+ atg_codons_data["atg_codon_nums"],
+ mask_all_first_10p=mask_all_first_10p,
+ )
+
mut = Mutation(
gene=gene,
chain=q_name,
exon=second_exon,
- position=0,
+ position=position,
mclass=STOP,
mut=mut_,
- masked=False,
+ masked=atg_mask,
mut_id=mut_id,
)
muts.append(mut)
@@ -1285,6 +1523,8 @@ def inact_mut_check(
ref_ss=None,
sec_codons=None,
no_fpi=False,
+ alt_f_del=False,
+ mask_all_first_10p=False, # mask all inact mutations in 1st 10% regardless on ATGs
):
"""Detect inactivating mutations in the CESAR output."""
# read cesar output
@@ -1304,12 +1544,11 @@ def inact_mut_check(
# initiate lists/ dicts for them
p_intact_ignore_M = {}
p_intact_intact_M = {}
- middle_80_intact = {}
- middle_80_present = {}
+ middle_is_intact = {}
+ middle_is_present = {}
i_codons_prop = {}
out_of_b_vals = {}
mutations = []
- ex_prop_provided = ex_prop is not None
del_miss_exons = {}
for cesar_fraction in cesar_fractions:
@@ -1354,21 +1593,15 @@ def inact_mut_check(
# err_msg = f"Cannot find CESAR wrapper features for query {q_name}"
# raise ValueError(err_msg)
- # now we extract inactivation mutations
- # then add them to fraction_mutations list
-
- # extract splice site mutations
- sps_mutations = analyse_splice_sites(
- ref, query, gene, q_name, u12_introns_data, v=v
- )
- fraction_mutations.extend(sps_mutations)
- # create codon table to extract other mutations
+ # create codon table to extract mutations
# codon table: list of objects, describing a codon
# such as sequence in reference and query, exon number and so on
codon_table = parse_cesar_out(ref, query)
+ atg_codons_data = make_atg_data(codon_table)
- # next loop -> for deleted/missed exons
+ # next loop -> for deleted/missed exons
if ex_prop: # if extra data provided by CESAR wrapper we can classify exons
+ # dm list -> list of 0-based exon nums which are del or missing
exon_del_miss_, exon_stat_, dm_list = classify_exons(
gene,
q_name,
@@ -1379,6 +1612,8 @@ def inact_mut_check(
exon_blosum,
missing_exons,
ex_inc,
+ atg_codons_data,
+ mask_all_first_10p=mask_all_first_10p,
v=v,
)
# get lists of deleted/missing exons
@@ -1398,32 +1633,59 @@ def inact_mut_check(
# big indels may be classified as inactivating mutations
# but the bigger the exon: the bigger an indel should be
# define the thresholds
- big_indel_thrs = infer_big_indel_thresholds(ex_lens)
del_miss_exons[q_name] = set(dm_list)
+ big_indel_thrs = infer_big_indel_thresholds(ex_lens)
# scan reading frame (codon table) for the rest on inact mutations
inact_muts = scan_rf(
codon_table,
gene,
q_name,
+ atg_codons_data,
exon_stat=exon_stat,
v=v,
big_indel_thrs=big_indel_thrs,
sec_codons=sec_codons,
no_fpi=no_fpi,
+ mask_all_first_10p=mask_all_first_10p,
)
# save this data
fraction_mutations.extend(inact_muts)
+        # extract splice site mutations and add them to the fraction_mutations list
+ sps_mutations = analyse_splice_sites(
+ ref,
+ query,
+ gene,
+ q_name,
+ codon_table,
+ atg_codons_data,
+ mask_all_first_10p,
+ u12_introns_data,
+ v=v,
+ )
+ fraction_mutations.extend(sps_mutations)
+
# get a list of split stop codons: stop codons that appear after exon deletions
# such as:
# GCAAACGCAGCt-------------[DELETED EXON]-------agTCCCATTTCCAACTGATC
# exon deletion raises an inframe stop codon: t + ag
- split_stop_codons = detect_split_stops(codon_table, gene, q_name, exon_stat)
+ split_stop_codons = detect_split_stops(
+ codon_table,
+ gene,
+ q_name,
+ exon_stat,
+ atg_codons_data,
+ mask_all_first_10p=mask_all_first_10p,
+ )
fraction_mutations.extend(split_stop_codons)
# detect compensated frameshifts
- compensations = detect_compensations(inact_muts, codon_table)
+ compensations, alt_frame_ranges = detect_compensations(inact_muts, codon_table)
fraction_mutations.extend(compensations)
# also mask compensated frameshifts:
fraction_mutations = mask_compensated_fs(fraction_mutations)
@@ -1434,13 +1696,19 @@ def inact_mut_check(
# extract and save %intact-related features
pintact_features = compute_intact_perc(
- codon_table, fraction_mutations, q_name, v=v
+ codon_table,
+ fraction_mutations,
+ q_name,
+ alt_frame_ranges,
+ alt_f_del=alt_f_del,
+ v=v,
+ mask_all_first_10p=mask_all_first_10p,
)
p_intact_ignore_M[q_name] = pintact_features[0]
p_intact_intact_M[q_name] = pintact_features[1]
i_codons_prop[q_name] = pintact_features[2]
- middle_80_intact[q_name] = pintact_features[3]
- middle_80_present[q_name] = pintact_features[4]
+ middle_is_intact[q_name] = pintact_features[3]
+ middle_is_present[q_name] = pintact_features[4]
# compute %of gene that lies outside chain borders
out_of_borders_prop = get_out_of_borders_prop(codon_table, missing_exons)
@@ -1454,8 +1722,8 @@ def inact_mut_check(
p_intact_intact_M,
i_codons_prop,
out_of_b_vals,
- middle_80_intact,
- middle_80_present,
+ middle_is_intact,
+ middle_is_present,
gene,
)
return report, del_miss_exons
diff --git a/split_exon_realign_jobs.py b/split_exon_realign_jobs.py
index f6e6901..e100ca1 100755
--- a/split_exon_realign_jobs.py
+++ b/split_exon_realign_jobs.py
@@ -31,7 +31,7 @@
WRAPPER_TEMPLATE = (
WRAPPER_ABSPATH
+ " {0} {1} {2} {3} {4} {5} --cesar_binary {6}"
- + " --uhq_flank {7} --memlim {8}"
+ + " --uhq_flank {7}"
)
CESAR_RUNNER = os.path.abspath(
os.path.join(LOCATION, "cesar_runner.py")
@@ -40,7 +40,7 @@
"GGLOB",
"TRANS",
} # chain classes that could lead to very long query loci
-BIGMEM_LIM = 1000 # mem limit for bigmem partition
+BIGMEM_LIM = 500 # mem limit for bigmem partition
REL_LENGTH_THR = 50
ABS_LENGTH_TRH = 1000000
EXTRA_MEM = 100000 # extra memory "just in case"
@@ -59,6 +59,10 @@
PROJECTION = "PROJECTION"
TRANSCRIPT = "TRANSCRIPT"
+MEM_FIT = "MEM_FIT"
+MEM_BIGMEM = "MEM_BIGMEM"
+MEM_DONTFIT = "MEM_DONTFIT"
+
# connect shared lib; define input and output data types
chain_coords_conv_lib_path = os.path.join(
LOCATION, "modules", "chain_coords_converter_slib.so"
@@ -191,6 +195,12 @@ def parse_args():
default=None,
help="Memory consumption was already precomputed",
)
+ app.add_argument(
+ "--precomp_regions_data_dir",
+ default=None,
+ help=("Directory containing files with precomputed regions. "
+ "Likely is not needed for standalone script scenario. ")
+ )
app.add_argument(
"--predefined_glp_class_path",
default=None,
@@ -198,6 +208,28 @@ def parse_args():
"(i) Projections with too short query region (L or M) and "
"(ii) Projections with very long query region (M)",
)
+ app.add_argument(
+ "--unprocessed_log",
+ "--unp",
+ default=None,
+ help="Store unprocessed genes in a separate file."
+ )
+ app.add_argument(
+ "--debug",
+ "-d",
+ action="store_true",
+ dest="debug",
+ help="Log debugging data"
+ )
+ app.add_argument(
+ "--mask_all_first_10p",
+ "--m_f10p",
+ action="store_true",
+ dest="mask_all_first_10p",
+ help="Automatically mask all inactivating mutations in first 10% of "
+ "the reading frame, ignoring ATG codons distribution. "
+ "(Default mode in V1.0, not recommended to use in later versions)"
+ )
# print help if there are no args
if len(sys.argv) < 2:
app.print_help()
@@ -495,7 +527,8 @@ def fill_buckets(buckets, all_jobs):
# memlim[5] -> jobs that require <= 5Gb
# memlim[10] -> jobs that require > 5Gb AND <= 10Gb
buckets[memlim] = [
- job for job, job_mem in all_jobs.items() if prev_lim < job_mem <= memlim
+ f"{job} --memlim {memlim}" for job, job_mem in all_jobs.items()
+ if prev_lim < job_mem <= memlim
]
prev_lim = memlim
# remove empty
@@ -507,10 +540,17 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir):
"""Save cesar calls in the dir assigned."""
os.mkdir(jobs_dir) if not os.path.isdir(jobs_dir) else None
file_num, to_combine = 0, []
+ bucket_saved = {k: True for k in filled_buckets.keys()}
+
for bucket_id, jobs in filled_buckets.items():
num_of_files = bucket_jobs_num[bucket_id]
# just in case
num_of_files = len(jobs) if num_of_files >= len(jobs) else num_of_files
+ if num_of_files == 0:
+ # avoiding zero division error
+ bucket_saved[bucket_id] = False
+ print(f"Warning! No files to save jobs for bucket {bucket_id}")
+ continue
size_of_file = len(jobs) // num_of_files
# size_of_file = size_of_file + 1 if len(jobs) % num_of_files != 0 else size_of_file
jobs_split = parts(jobs, n=size_of_file)
@@ -522,6 +562,11 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir):
f.write("\n".join(part) + "\n")
f.close()
to_combine.append(file_path)
+ # check if anything saved
+ if all(x is False for x in bucket_saved.values()):
+ print("Could not create any CESAR job. Probably, ALL genes require much more memory than the available.")
+ print("If this result is unexpected, please contact the developers.")
+ sys.exit(1)
return to_combine
@@ -551,7 +596,13 @@ def save_bigmem_jobs(bigmem_joblist, jobs_dir):
def save_combined_joblist(
- to_combine, combined_file, results_dir, inact_mut_dat, rejected_log, name=""
+ to_combine,
+ combined_file,
+ results_dir,
+ inact_mut_dat,
+ rejected_log,
+ unproc_log,
+ name=""
):
"""Save joblist of joblists (combined joblist)."""
f = open(combined_file, "w")
@@ -565,6 +616,9 @@ def save_combined_joblist(
if rejected_log:
log_path = os.path.join(rejected_log, f"{basename}.txt")
combined_command += f" --rejected_log {log_path}"
+ if unproc_log:
+ log_path = os.path.join(unproc_log, f"{basename}.txt")
+ combined_command += f" --unproc_log {log_path}"
f.write(combined_command + "\n")
f.close()
@@ -603,6 +657,133 @@ def read_precomp_mem(precomp_file):
return ret
+def get_trans_to_regions_file(precomp_regions_data_dir):
+    """Map each transcript to the precomputed-regions file that mentions it."""
+ ret = {}
+ if precomp_regions_data_dir is None:
+ return ret
+ files_in = os.listdir(precomp_regions_data_dir)
+ for filename in files_in:
+ path = os.path.abspath(os.path.join(precomp_regions_data_dir, filename))
+ f = open(path, "r")
+ transcripts_in_file = set(x.split("\t")[0] for x in f)
+ f.close()
+ for t in transcripts_in_file:
+ ret[t] = path
+ return ret
+
+
+def build_job(
+    gene,
+    chains_arg,
+    args,
+    gene_fragments,
+    trans_to_reg_precomp_file,
+    u12_this_gene,
+    mask_all_first_10p=False,
+):
+    """Build a CESAR job command."""
+    # 0 gene; 1 chains; 2 bed file; 3 chain file; 4 tDB; 5 qDB; 6 cesar binary; 7 uhq flank
+ job = WRAPPER_TEMPLATE.format(
+ gene,
+ chains_arg,
+ os.path.abspath(args.bdb_bed_file),
+ os.path.abspath(args.bdb_chain_file),
+ os.path.abspath(args.tDB),
+ os.path.abspath(args.qDB),
+ os.path.abspath(args.cesar_binary),
+ args.uhq_flank,
+ # gig,
+ )
+ # add some flags if required
+ job = job + " --mask_stops" if args.mask_stops else job
+ job = job + " --check_loss" if args.check_loss else job
+ job = job + " --no_fpi" if args.no_fpi else job
+ job = job + " --fragments" if gene_fragments else job
+ job = job + " --opt_cesar" if args.opt_cesar else job
+ job = job + " --alt_frame_del" # TODO: toga master script parameter
+
+ precomp_file = trans_to_reg_precomp_file.get(gene)
+ job = job + f" --predefined_regions {precomp_file}" if precomp_file else job
+
+ # add U12 introns data if this gene has them:
+ job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
+
+ # add mask_all_first_10p flag if needed
+ job = job + f" --mask_all_first_10p" if mask_all_first_10p else job
+ return job
+
+
+def compute_ref_part_of_mem(block_sizes):
+ """Compute num_states and r_length."""
+ num_states, r_length = 0, 0
+ for block_size in block_sizes:
+ # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
+ # /* 22 and 6 for acc and donor states */
+ num_codons = block_size // 3
+ num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
+ # r_length += 11 + 6 * fasta.references[i]->length
+ # + donors[i]->length + acceptors[i]->length;
+ r_length += block_size
+ return num_states, r_length
+
+
+def compute_memory_gig_for_qlen(num_states, r_length, q_length_max):
+ memory = (
+ (num_states * 4 * 8)
+ + (num_states * q_length_max * 4)
+ + (num_states * 304)
+ + (2 * q_length_max + r_length) * 8
+ + (q_length_max + r_length) * 2 * 1
+ + EXTRA_MEM
+ )
+ gig = math.ceil(memory / 1000000000) + 0.25
+ return gig
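+# rough illustration: a single 300 bp exon gives num_states = 6 + 6*100 + 33 = 639
+# and r_length = 300; with q_length_max = 1_000_000 the dominant term is
+# num_states * q_length_max * 4 ~ 2.56e9 bytes, so gig = ceil(2.57) + 0.25 = 3.25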
+
+
+def _get_chain_arg_and_gig_arg(chains_list, chain_to_mem):
+    """Return the maximal memory consumption among the given chains."""
+    gig_arg = max([chain_to_mem[c] for c in chains_list])
+    return gig_arg
+
+
+def compute_memory(chains, precomp_gig, block_sizes, gene_chains_data, gene_fragments, mem_limit):
+ """Compute memory requirements for different chains."""
+ if precomp_gig:
+ # chains arg: just all chains, everything is precomputed
+ return {tuple(chains): (precomp_gig, MEM_FIT)}
+ # proceed to memory estimation
+ # the same procedure as inside CESAR2.0 code
+ # required memory depends on numerous params
+ # first, we need reference transcript-related parameters
+ # query-related parameters will be later
+ num_states, r_length = compute_ref_part_of_mem(block_sizes)
+
+    # branch 2: fragmented gene, here we have to use the sum of query lengths
+ if gene_fragments:
+ # in case of fragmented genome: we stitch queries together
+ # so query length = sum of all queries
+ q_length_max = sum([v for v in gene_chains_data.values()])
+ gig = compute_memory_gig_for_qlen(num_states, r_length, q_length_max)
+ return {tuple(chains): (gig, MEM_FIT)}
+
+ # branch 3, maybe the most common one
+ ret = {}
+ chain_to_mem_consumption = {}
+ for chain_id, q_length in gene_chains_data.items():
+        chain_gig = compute_memory_gig_for_qlen(num_states, r_length, q_length)
+ chain_to_mem_consumption[chain_id] = chain_gig
+ # for every chain, we have the memory consumption
+ chains_that_fit = [c_id for c_id, gig in chain_to_mem_consumption.items() if gig <= mem_limit]
+ chain_that_goto_bigmem = [c_id for c_id, gig in chain_to_mem_consumption.items() if mem_limit < gig <= BIGMEM_LIM]
+ chains_that_dont_fit = [c_id for c_id, gig in chain_to_mem_consumption.items() if gig > BIGMEM_LIM]
+
+ # process each bucket of chains individually
+ if len(chains_that_fit) > 0:
+ # good, there are some chains that can be easily processed
+ mem = _get_chain_arg_and_gig_arg(chains_that_fit, chain_to_mem_consumption)
+ ret[tuple(chains_that_fit)] = (mem, MEM_FIT)
+ if len(chain_that_goto_bigmem) > 0:
+ # there are some bigmem jobs -> to be executed separately
+ # Deprecated branch, TODO: check whether it makes sense nowadays
+ mem = _get_chain_arg_and_gig_arg(chain_that_goto_bigmem, chain_to_mem_consumption)
+ ret[tuple(chain_that_goto_bigmem)] = (mem, MEM_BIGMEM)
+ if len(chains_that_dont_fit) > 0:
+ mem = _get_chain_arg_and_gig_arg(chains_that_dont_fit, chain_to_mem_consumption)
+ ret[tuple(chains_that_dont_fit)] = (mem, MEM_DONTFIT)
+ return ret
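+# illustration of the return value (chain ids and numbers are made up):
+#   {("1169", "2021"): (3.25, MEM_FIT), ("777",): (600.0, MEM_DONTFIT)}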
+
+
def main():
"""Entry point."""
t0 = dt.now()
@@ -618,6 +799,10 @@ def main():
# if memory is precomputed: use it
precomp_mem = read_precomp_mem(args.precomp_memory_data)
+
+ # TODO: to optimize later
+ trans_to_reg_precomp_file = get_trans_to_regions_file(args.precomp_regions_data_dir)
# get lists of orthologous chains per each gene
# skipped_1 - no chains found -> log them
predefined_glp_class = {} # for projections which are M and L without CESAR
@@ -658,6 +843,7 @@ def main():
args.qDB,
)
predefined_glp_class.update(predef_glp)
+ predef_glp_class__chains_step = {}
# start making the jobs
all_jobs = {}
@@ -667,8 +853,8 @@ def main():
for gene in batch.keys():
u12_this_gene = u12_data.get(gene)
block_sizes = bed_data[gene][3]
-
gene_chains_data = regions.get(gene)
+
# check that there is something for this gene
if not gene_chains_data:
continue
@@ -681,94 +867,47 @@ def main():
gene_chains_data = {
k: v for k, v in gene_chains_data.items() if k in gene_fragments
}
+
chains = gene_chains_data.keys()
if len(chains) == 0:
continue
- chains_arg = ",".join(chains) # chain ids -> one of the cmd args
-
- # if memory is precomputed then use it
precomp_gig = precomp_mem.get(gene, None)
- if precomp_gig is None:
- # proceed to memory estimation
- # the same procedure as inside CESAR2.0 code
- num_states, r_length = 0, 0
-
- # required memory depends on numerous params
- # first, we need reference transcript-related parameters
- # query-related parameters will be later
- for block_size in block_sizes:
- # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
- # /* 22 and 6 for acc and donor states */
- num_codons = block_size // 3
- num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
- # r_length += 11 + 6 * fasta.references[i]->length
- # + donors[i]->length + acceptors[i]->length;
- r_length += block_size
-
- # now compute query sequence-related parameters
- query_lens = [v for v in gene_chains_data.values()]
- if (
- gene_fragments
- ): # in case of fragmented genome: we stitch queries together
- # so query length = sum of all queries
- q_length_max = sum(query_lens)
- else: # not fragmented genome: processins queries separately
- # thus we need only the max length
- q_length_max = max(query_lens)
- # and now compute the amount of required memory
- memory = (
- (num_states * 4 * 8)
- + (num_states * q_length_max * 4)
- + (num_states * 304)
- + (2 * q_length_max + r_length) * 8
- + (q_length_max + r_length) * 2 * 1
- + EXTRA_MEM
- )
- gig = math.ceil(memory / 1000000000) + 0.25
- else:
- # memory was precomputed
- gig = precomp_gig
-
- # gig = compute_amount_of_memory(block_sizes, q_length_max, args.opt_cesar)
- # # 0 gene; 1 chains; 2 bed_file; 3 bdb chain_file; 4 tDB; 5 qDB; 6 output; 7 cesar_bin
- job = WRAPPER_TEMPLATE.format(
- gene,
- chains_arg,
- os.path.abspath(args.bdb_bed_file),
- os.path.abspath(args.bdb_chain_file),
- os.path.abspath(args.tDB),
- os.path.abspath(args.qDB),
- os.path.abspath(args.cesar_binary),
- args.uhq_flank,
- gig,
- )
- # add some flags if required
- job = job + " --mask_stops" if args.mask_stops else job
- job = job + " --check_loss" if args.check_loss else job
- job = job + " --no_fpi" if args.no_fpi else job
- job = job + " --fragments" if gene_fragments else job
- job = job + " --opt_cesar" if args.opt_cesar else job
-
- # add U12 introns data if this gene has them:
- job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
-
- # define whether it's an ordinary or a bigmem job
- # depending on the memory requirements
- if gig <= mem_limit: # ordinary job
- all_jobs[job] = gig
- elif gig <= BIGMEM_LIM:
- skipped_3.append((gene, ",".join(chains), f"requires {gig}) -> bigmem job"))
- predef_glp[gene] = f"{TRANSCRIPT}\tM"
- bigmem_jobs.append(job)
- else:
- skipped_3.append(
- (
- gene,
- ",".join(chains),
- f"big mem limit ({BIGMEM_LIM} gig) exceeded (needs {gig})",
- )
- )
- predef_glp[gene] = f"{TRANSCRIPT}\tM"
+ chain_arg_to_gig = compute_memory(chains,
+ precomp_gig,
+ block_sizes,
+ gene_chains_data,
+ gene_fragments,
+ mem_limit)
+ for chains_tup, (gig, stat) in chain_arg_to_gig.items():
+ chains_arg = ",".join(chains_tup)
+ job = build_job(gene,
+ chains_arg,
+ args,
+ gene_fragments,
+ trans_to_reg_precomp_file,
+ u12_this_gene,
+ mask_all_first_10p=args.mask_all_first_10p)
+
+ # define whether it's an ordinary or a bigmem job
+ # depending on the memory requirements
+ if stat == MEM_FIT: # ordinary job
+ all_jobs[job] = gig
+ elif stat == MEM_BIGMEM:
+ to_app = (gene, chains_arg, f"requires {gig}) -> bigmem job")
+ skipped_3.append(to_app)
+ bigmem_jobs.append(job)
+ for chain_id in chains_tup:
+ proj_id = f"{gene}.{chain_id}"
+ predef_glp_class__chains_step[proj_id] = f"{PROJECTION}\tM"
+ else:
+ to_app = (gene, chains_arg, f"big mem limit ({BIGMEM_LIM} gig) exceeded (needs {gig})",)
+ skipped_3.append(to_app)
+ for chain_id in chains_tup:
+ proj_id = f"{gene}.{chain_id}"
+ predef_glp_class__chains_step[proj_id] = f"{PROJECTION}\tM"
+
+ # TODO: predefined GLP classes to be refactored
+ predefined_glp_class.update(predef_glp_class__chains_step)
eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
eprint("Splitting the jobs.")
@@ -796,7 +935,12 @@ def main():
# save joblist of joblists
save_combined_joblist(
- to_combine, args.combined, args.results, args.check_loss, args.rejected_log
+ to_combine,
+ args.combined,
+ args.results,
+ args.check_loss,
+ args.rejected_log,
+ args.unprocessed_log
)
# save bigmem jobs, a bit different logic
@@ -808,6 +952,7 @@ def main():
args.results,
args.check_loss,
args.rejected_log,
+ None, # TODO: decide what we do with this branch
name="bigmem",
)
diff --git a/supply/extract_codon_alignment.py b/supply/extract_codon_alignment.py
index e87b0ee..bb7016c 100644
--- a/supply/extract_codon_alignment.py
+++ b/supply/extract_codon_alignment.py
@@ -27,7 +27,6 @@
# actually, it was nearly random CESAR alignment for this exon
# v2.2 fixed a bug: wrong handling of completely deleted exons
-
SEQ_NUMBER_LIMIT = 1500
CESAR_RESULTS_FILE = "cesar_results.txt"
EXON_SEQ_CLASSES = {"query_exon", "reference_exon"}
@@ -217,6 +216,22 @@ def parse_args():
action="store_true",
help="Do not consider UL projections as orthologous (NOT IMPLEMENTED YET)",
)
+ app.add_argument(
+ "--force_repair",
+ "--fr",
+ dest="force_repair",
+ action="store_true",
+ help=("Force repair missing parts of the alignment. "
+ "Please use in case the script continuously fails to produce "
+ "the result. Can be needed in case of massive alignments with "
+ "abundant missing/corrupted sequence.")
+ )
+ app.add_argument(
+ "--save_aligner_commands",
+ default=None,
+ help="Save a sequence of MACSE commands to the specified location. "
+ "Temporary files will not be deleted!"
+ )
# if no args: print help message
if len(sys.argv) < MIN_ARG_NUM:
@@ -610,13 +625,16 @@ def macse_alignment(in_fasta, temp_dir, macse_caller, v=False):
)
cmd = f"{macse_caller} -prog alignSequences -seq {STDIN} -out_NT {out_file} -out_AA {to_del_file}"
if v:
- print_stderr("Aligning:")
- print_stderr(in_fasta)
+ print_stderr("Calling:\n")
+ print_stderr(cmd)
+ # print_stderr("Aligning:")
+ # print_stderr(in_fasta)
p = subprocess.Popen(cmd, stdin=PIPE, stderr=PIPE, stdout=PIPE, shell=True)
_, stderr_ = p.communicate(input=in_fasta.encode())
rc = p.returncode
if rc != 0: # MACSE crashed
print_stderr("# Error! Macse CRASHED")
+ # print_stderr(f"Input file: {in_fasta}")
err_msg = stderr_.decode(UTF_8)
print_stderr(err_msg)
sys.exit(1)
@@ -742,19 +760,20 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False)
ret = {}
ali_id_to_seqs = defaultdict(dict)
codon_tot_num_ = len(codon_to_exon)
- if debug:
- print_stderr("Codon to exon_dict")
- print_stderr(codon_to_exon)
- print_stderr(f"In total {codon_tot_num_} codons")
+ # if debug:
+ # TODO: add to level 2 verbosity
+ # print_stderr("Codon to exon_dict")
+ # print_stderr(codon_to_exon)
+ # print_stderr(f"In total {codon_tot_num_} codons")
- print_stderr(f"Codon sequences:\n{codon_to_seq}")
+ # print_stderr(f"Codon sequences:\n{codon_to_seq}")
for seq_id, seq in sp_to_codon_alis.items():
sp, proj_id, is_ref = seq_id
key_ = (sp, proj_id)
ali_id_to_seqs[key_][is_ref] = seq
for k, v in ali_id_to_seqs.items():
- print_stderr(f"processing {k}") if debug else None
+ # print_stderr(f"processing {k}") if debug else None
ref_codons = v[True].split()
que_codons = v[False].split()
codon_num = len(ref_codons)
@@ -767,13 +786,13 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False)
for i_num, (r_c, q_c) in enumerate(zip(ref_codons, que_codons)):
exon_num = codon_to_exon[ref_codon_num]
debug_line = f"{k}: extracting codon {ref_codon_num} out of {codon_tot_num_} | exon {exon_num}"
- print_stderr(debug_line) if debug else None
+ # print_stderr(debug_line) if debug else None
expected_ref_codon = codon_to_seq[ref_codon_num]
debug_line = (
f"{r_c} -> {expected_ref_codon} {q_c} {ref_codon_num} {exon_num}"
)
- print_stderr(debug_line) if debug else None
+ # print_stderr(debug_line) if debug else None
ref_is_gap = r_c == CODON_GAP
ref_is_XXX = r_c == CODON_XXX
ref_has_N = "N" in r_c
@@ -781,7 +800,7 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False)
ref_is_undefined = ref_is_gap or ref_is_XXX or ref_has_N
debug_line = f"# ref gap: {ref_is_gap} ref unknown: {ref_is_XXX} ref expected: {ref_is_exp}"
- print_stderr(debug_line) if debug else None
+ # print_stderr(debug_line) if debug else None
if not ref_is_gap:
ref_codon_num += 1
@@ -790,12 +809,12 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False)
and ref_is_undefined is False
and exp_codon_lock is False
):
- print_stderr(f"Warning! {k}: Fixing ref codon_num..")
+ # print_stderr(f"Warning! {k}: Fixing ref codon_num..")
wrong_codon_num = ref_codon_num
ref_codon_num = fix_codon_num(
wrong_codon_num, expected_ref_codon, codon_to_seq
)
- print_stderr(f"Corrected from {wrong_codon_num} to {ref_codon_num}")
+ # print_stderr(f"Corrected from {wrong_codon_num} to {ref_codon_num}")
codon_pair = (r_c, q_c)
exon_to_sequences[exon_num].append(codon_pair)
@@ -804,7 +823,7 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False)
# this will cause an index error
# need to handle this overflow correctly
exp_codon_lock = True
- print_stderr(f"Warning! {k}: Codon sequence overflow...")
+ # print_stderr(f"Warning! {k}: Codon sequence overflow...")
ref_codon_num = codon_tot_num_ - 1
ret[k] = exon_to_sequences
@@ -816,19 +835,28 @@ def reformat_codon_alis(sp_to_exon_codon_alis, all_exons):
ret = defaultdict(dict)
for (sp, proj_id), v in sp_to_exon_codon_alis.items():
for exon in all_exons:
- exon_related_seq = v[exon]
- ref_codons = [x[0] for x in exon_related_seq]
- que_codons = [x[1] for x in exon_related_seq]
- ref_seq = "".join(ref_codons)
- que_seq = "".join(que_codons)
+ exon_related_seq = v.get(exon, None)
ref_seq_key = (sp, proj_id, True)
que_seq_key = (sp, proj_id, False)
- ret[exon][ref_seq_key] = ref_seq
- ret[exon][que_seq_key] = que_seq
+
+ if exon_related_seq:
+ ref_codons = [x[0] for x in exon_related_seq]
+ que_codons = [x[1] for x in exon_related_seq]
+ ref_seq = "".join(ref_codons)
+ que_seq = "".join(que_codons)
+ ret[exon][ref_seq_key] = ref_seq
+ ret[exon][que_seq_key] = que_seq
+ else:
+ # print(sp_to_exon_codon_alis)
+ # print(f"Error here, cannot find key: {exon}")
+ # print(v)
+ # sys.exit(1)
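+                # exon alignment missing entirely -> pad both sequences with NNN placeholders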
+ ret[exon][ref_seq_key] = "NNN"
+ ret[exon][que_seq_key] = "NNN"
return ret
-def merge_exon_fastas(fastas_list, fasta_headers):
+def merge_exon_fastas(fastas_list, fasta_headers, force_repair=False):
"""Merge per-exon fastas."""
id_to_chunks = defaultdict(list)
fasta_lines = []
@@ -1221,6 +1249,7 @@ def main():
)
# ALIGNING EXON-BY-EXON
else:
+ commands_sequence = []
fasta_headers_all = set([f"{x[0]}\t{x[1]}" for x in sp_to_codon_alis.keys()])
# we like to align exons separately, first of all get
# codon number -> exon number mapping
@@ -1260,7 +1289,9 @@ def main():
aligned_fasta_exon, f"aligned_exon_{exon}", args.intermediate_data
)
aligned_fastas.append(aligned_fasta_exon)
- aligned_fasta = merge_exon_fastas(aligned_fastas, fasta_headers_all)
+ aligned_fasta = merge_exon_fastas(aligned_fastas,
+ fasta_headers_all,
+ force_repair=args.force_repair)
f = open(args.output, "w") if args.output else sys.stdout
f.write(aligned_fasta)
diff --git a/toga.py b/toga.py
index c61df23..e68931c 100755
--- a/toga.py
+++ b/toga.py
@@ -23,8 +23,8 @@
from modules.merge_chains_output import merge_chains_output
from modules.make_pr_pseudogenes_anno import create_ppgene_track
from modules.merge_cesar_output import merge_cesar_output
-from modules.gene_losses_summary import I, gene_losses_summary
-from modules.orthology_type_map import orthology_type_map, trim_prefix
+from modules.gene_losses_summary import gene_losses_summary
+from modules.orthology_type_map import orthology_type_map
from modules.classify_chains import classify_chains
from modules.get_transcripts_quality import classify_transcripts
from modules.make_query_isoforms import get_query_isoforms_data
@@ -57,10 +57,19 @@
CESAR_RUNNER_TMP = "{0} {1} {2} --check_loss {3} --rejected_log {4}"
CESAR_PRECOMPUTED_REGIONS_DIRNAME = "cesar_precomputed_regions"
CESAR_PRECOMPUTED_MEMORY_DIRNAME = "cesar_precomputed_memory"
+CESAR_PRECOMPUTED_ORTHO_LOCI_DIRNAME = "cesar_precomputed_orthologous_loci"
+
CESAR_PRECOMPUTED_MEMORY_DATA = "cesar_precomputed_memory.tsv"
+CESAR_PRECOMPUTED_REGIONS_DATA = "cesar_precomputed_regions.tsv"
+CESAR_PRECOMPUTED_ORTHO_LOCI_DATA = "cesar_precomputed_orthologous_loci.tsv"
+
+NUM_CESAR_MEM_PRECOMP_JOBS = 500
+
+
TEMP_CHAIN_CLASS = "temp_chain_trans_class"
MODULES_DIR = "modules"
RUNNING = "RUNNING"
+CRASHED = "CRASHED"
TEMP = "temp"
# automatically enable flush
@@ -92,10 +101,11 @@ def __init__(self, args):
else: # if none: we cannot call os.path.abspath method
self.nextflow_bigmem_config = None
self.__check_nf_config()
+
# to avoid crash on filesystem without locks:
- os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # otherwise it could crash
+ os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
# temporary fix for DSL error in recent NF versions
- os.environ["NXF_DEFAULT_DSL"] = "1"
+ os.environ["NXF_DEFAULT_DSL"] = "1"
chain_basename = os.path.basename(args.chain_input)
@@ -184,7 +194,11 @@ def __init__(self, args):
self.cesar_binary = (
self.DEFAULT_CESAR if not args.cesar_binary else args.cesar_binary
)
+ self.opt_cesar_binary = os.path.abspath(
+ os.path.join(LOCATION, "cesar_input_optimiser.py")
+ )
self.cesar_is_opt = args.using_optimized_cesar
+ self.output_opt_cesar_regions = args.output_opt_cesar_regions
self.time_log = args.time_marks
self.stop_at_chain_class = args.stop_at_chain_class
self.rejected_log = os.path.join(self.wd, "genes_rejection_reason.tsv")
@@ -207,6 +221,7 @@ def __init__(self, args):
self.keep_nf_logs = args.do_not_del_nf_logs
self.exec_cesar_parts_sequentially = args.cesar_exec_seq
self.ld_model_arg = args.ld_model
+ self.mask_all_first_10p = args.mask_all_first_10p
self.cesar_ok_merged = (
None # Flag: indicates whether any cesar job BATCHES crashed
@@ -237,12 +252,20 @@ def __init__(self, args):
self.gene_loss_data = os.path.join(self.temp_wd, "inact_mut_data")
self.query_annotation = os.path.join(self.wd, "query_annotation.bed")
self.loss_summ = os.path.join(self.wd, "loss_summ_data.tsv")
+ # directory to store intermediate files with technically unprocessable transcripts:
+ self.technical_cesar_err = os.path.join(self.temp_wd, "technical_cesar_err")
+ # unprocessed transcripts to be considered Missing:
+ self.technical_cesar_err_merged = os.path.join(
+ self.temp_wd, "technical_cesar_err.txt"
+ )
+
self.bed_fragm_exons_data = os.path.join(
self.temp_wd, "bed_fragments_to_exons.tsv"
)
self.precomp_mem_cesar = os.path.join(
self.temp_wd, CESAR_PRECOMPUTED_MEMORY_DATA
)
+ self.precomp_reg_dir = None
self.cesar_mem_was_precomputed = False
self.u12_arg = args.u12
self.u12 = None # assign after U12 file check
@@ -517,14 +540,14 @@ def __check_isoforms_file(self, t_in_bed):
f.write(f"{gene}\t{trans}\n")
print("Isoforms file is OK")
- @staticmethod
- def die(msg, rc=1):
+ def die(self, msg, rc=1):
"""Show msg in stderr, exit with the rc given."""
print(msg)
print(f"Program finished with exit code {rc}\n")
# for t_file in self.temp_files: # remove temp files if required
# os.remove(t_file) if os.path.isfile(t_file) and not self.keep_temp else None
# shutil.rmtree(t_file) if os.path.isdir(t_file) and not self.keep_temp else None
+ self.__mark_crashed()
sys.exit(rc)
def __modules_addr(self):
@@ -695,7 +718,7 @@ def __call_proc(self, cmd, extra_msg=None):
rc = subprocess.call(cmd, shell=True)
if rc != 0:
print(extra_msg) if extra_msg else None
- self.die(f"Error! Process {cmd} died! Abort.")
+ self.die(f"Error! Process:\n{cmd}\ndied! Abort.")
print(f"{cmd} done with code 0")
def __find_two_bit(self, db):
@@ -770,7 +793,7 @@ def run(self):
# 9) classify projections/genes as lost/intact
# also measure projections confidence levels
print("#### STEP 9: Gene loss pipeline classification\n")
- self.__transcript_quality()
+ self.__transcript_quality() # maybe remove -> not used anywhere
self.__gene_loss_summary()
self.__time_mark("Got gene loss summary")
@@ -793,7 +816,6 @@ def run(self):
print(f"{self.cesar_crashed_batches_log}\n")
print(f"Saved results to {self.wd}")
self.__left_done_mark()
- self.die(f"Done! Estimated time: {dt.now() - self.t0}", rc=0)
def __mark_start(self):
"""Indicate that TOGA process have started."""
@@ -803,6 +825,16 @@ def __mark_start(self):
f.write(f"TOGA process started at {now_}\n")
f.close()
+ def __mark_crashed(self):
+ """Indicate that TOGA process died."""
+ running_f = os.path.join(self.wd, RUNNING)
+ crashed_f = os.path.join(self.wd, CRASHED)
+ os.remove(running_f) if os.path.isfile(running_f) else None
+ f = open(crashed_f, "w")
+ now_ = str(dt.now())
+ f.write(f"TOGA CRASHED AT {now_}\n")
+ f.close()
+
def __make_indexed_chain(self):
"""Make chain index file."""
# make *.bb file
@@ -920,7 +952,9 @@ def __classify_chains(self):
self.pred_scores = os.path.join(self.temp_wd, "orthology_scores.tsv")
self.se_model = os.path.join(self.LOCATION, "models", "se_model.dat")
self.me_model = os.path.join(self.LOCATION, "models", "me_model.dat")
- self.ld_model = os.path.join(self.LOCATION, "long_distance_model", "long_dist_model.dat")
+ self.ld_model = os.path.join(
+ self.LOCATION, "long_distance_model", "long_dist_model.dat"
+ )
cl_rej_log = os.path.join(self.rejected_dir, "classify_chains_rejected.txt")
ld_arg_ = self.ld_model if self.ld_model_arg else None
if not os.path.isfile(self.se_model) or not os.path.isfile(self.me_model):
@@ -933,7 +967,7 @@ def __classify_chains(self):
rejected=cl_rej_log,
raw_out=self.pred_scores,
annot_threshold=self.orth_score_threshold,
- ld_model=ld_arg_
+ ld_model=ld_arg_,
)
# extract not classified transcripts
# first column in the rejected log
@@ -968,6 +1002,128 @@ def __split_file(src, dst_dir, pieces_num):
f.write("".join(piece))
f.close()
return paths_to_pieces
+
+ def __get_transcript_to_strand(self):
+ """Get """
+ ret = {}
+ f = open(self.ref_bed, 'r')
+ for line in f:
+ ld = line.rstrip().split("\t")
+ trans = ld[3]
+ direction = ld[5]
+ ret[trans] = direction
+ f.close()
+ return ret
+
+ def __get_chain_to_qstrand(self):
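+        """Get chain_id: query strand mapping from the chain file."""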
+ ret = {}
+ f = open(self.chain_file, "r")
+ for line in f:
+ if not line.startswith("chain"):
+ continue
+ fields = line.rstrip().split()
+ chain_id = int(fields[-1])
+ q_strand = fields[9]
+ ret[chain_id] = q_strand
+ f.close()
+ return ret
+
+ def __fold_exon_data(self, exons_data, out_bed):
+ """Convert exon data into bed12."""
+ projection_to_exons = defaultdict(list)
+ # 1: make projection:
+ f = open(exons_data, "r")
+ for line in f:
+ ld = line.rstrip().split("\t")
+ transcript, _chain, _start, _end = ld
+ chain = int(_chain)
+ start = int(_start)
+ end = int(_end)
+ region = (start, end)
+ projection_to_exons[(transcript, chain)].append(region)
+ # 2 - get search loci for each projection
+ projection_to_search_loc = {}
+
+ # CESAR_PRECOMPUTED_ORTHO_LOCI_DATA = "cesar_precomputed_orthologous_loci.tsv"
+ f = open(self.precomp_query_loci_path, "r")
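+        # assumed line layout: <key>\t<transcript>\t<chain>\t<chrom>:<start>-<end>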
+ for elem in f:
+ elem_data = elem.split("\t")
+ transcript = elem_data[1]
+ chain = int(elem_data[2])
+ projection = f"{transcript}.{chain}"
+ search_locus = elem_data[3]
+ chrom, s_e = search_locus.split(":")
+ s_e_split = s_e.split("-")
+ start, end = int(s_e_split[0]), int(s_e_split[1])
+ projection_to_search_loc[(transcript, chain)] = (chrom, start, end)
+ f.close()
+ trans_to_strand = self.__get_transcript_to_strand()
+ chain_to_qstrand = self.__get_chain_to_qstrand()
+ # 3 - save bed 12
+ f = open(out_bed, "w")
+ # print(projection_to_search_loc)
+ for (transcript, chain), exons in projection_to_exons.items():
+ # print(transcript, chain)
+ projection = f"{transcript}.{chain}"
+ trans_strand = trans_to_strand[transcript]
+ chain_strand = chain_to_qstrand[chain]
+ to_invert = trans_strand != chain_strand
+
+ exons_sort = sorted(exons, key=lambda x: x[0])
+ if to_invert:
+ exons_sort = exons_sort[::-1]
+
+            # a KeyError here means the projection has no precomputed search locus
+ search_locus = projection_to_search_loc[(transcript, chain)]
+ chrom = search_locus[0]
+ search_start = search_locus[1]
+ search_end = search_locus[2]
+
+ if to_invert:
+ bed_start = search_end - exons_sort[0][1]
+ bed_end = search_end - exons_sort[-1][0]
+ else:
+ bed_start = exons_sort[0][0] + search_start
+ bed_end = exons_sort[-1][1] + search_start
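+            # e.g. (hypothetical) search locus chr1:1000-2000 with one relative
+            # exon (100, 200): direct strand -> bed 1100-1200; inverted strand
+            # mirrors the coordinates: 2000 - 200 = 1800 .. 2000 - 100 = 1900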
+
+ block_sizes, block_starts = [], []
+ for exon in exons_sort:
+ abs_start_in_s, abs_end_in_s = exon
+ # print(exon)
+
+ if to_invert:
+ abs_start = search_end - abs_end_in_s
+ abs_end = search_end - abs_start_in_s
+ else:
+ abs_start = abs_start_in_s + search_start
+ abs_end = abs_end_in_s + search_start
+
+ # print(abs_start, abs_end)
+
+ rel_start = abs_start - bed_start
+ block_size = abs_end - abs_start
+ block_sizes.append(block_size)
+ block_starts.append(rel_start)
+ block_sizes_field = ",".join(map(str, block_sizes))
+ block_starts_field = ",".join(map(str, block_starts))
+ all_fields = (
+ chrom,
+ bed_start,
+ bed_end,
+ projection,
+ 0,
+ 0,
+ bed_start,
+ bed_end,
+ "0,0,0",
+ len(exons),
+ block_sizes_field,
+ block_starts_field,
+ )
+ f.write("\t".join(map(str, all_fields)))
+ f.write("\n")
+        f.close()
def __precompute_data_for_opt_cesar(self):
"""Precompute memory data for optimised CESAR.
@@ -977,28 +1133,56 @@ def __precompute_data_for_opt_cesar(self):
"""
if not self.cesar_is_opt:
return # in case we use standard CESAR this procedure is not needed
+
+        # Needs the gene: chains mapping.
+        # Open question: build an artificial bed file with all possible exons
+        # per gene, or only the longest per intersection? Note that a giant
+        # exon that does not align may still have smaller versions that do.
print("COMPUTING MEMORY REQUIREMENTS FOR OPTIMISED CESAR")
mem_dir = os.path.join(self.wd, CESAR_PRECOMPUTED_MEMORY_DIRNAME)
+ self.precomp_reg_dir = os.path.join(self.wd, CESAR_PRECOMPUTED_REGIONS_DIRNAME)
chain_class_temp_dir = os.path.join(self.wd, TEMP_CHAIN_CLASS)
+ self.precomp_query_loci_dir = os.path.join(
+ self.wd, CESAR_PRECOMPUTED_ORTHO_LOCI_DIRNAME
+ )
+ self.precomp_query_loci_path = os.path.join(
+ self.temp_wd, CESAR_PRECOMPUTED_ORTHO_LOCI_DATA
+ )
+
os.mkdir(mem_dir) if not os.path.isdir(mem_dir) else None
os.mkdir(chain_class_temp_dir) if not os.path.isdir(
chain_class_temp_dir
) else None
+ os.mkdir(self.precomp_reg_dir) if not os.path.isdir(
+ self.precomp_reg_dir
+ ) else None
+ os.mkdir(self.precomp_query_loci_dir) if not os.path.isdir(
+ self.precomp_query_loci_dir
+ ) else None
+
self.temp_files.append(mem_dir)
self.temp_files.append(chain_class_temp_dir)
+ self.temp_files.append(self.precomp_reg_dir)
+ self.temp_files.append(self.precomp_query_loci_dir)
+
-        # split file containing gene: orthologous/paralogous chains into 100 pieces
+        # split file containing gene: orthologous/paralogous chains into pieces
- class_pieces = self.__split_file(self.orthologs, chain_class_temp_dir, 100)
+ class_pieces = self.__split_file(
+            self.orthologs, chain_class_temp_dir, NUM_CESAR_MEM_PRECOMP_JOBS
+ )
precompute_jobs = []
# for nextflow abspath is better:
precomp_abspath = os.path.abspath(self.PRECOMPUTE_OPT_CESAR_DATA)
- cesar_bin_abspath = os.path.abspath(self.cesar_binary)
+ # cesar_bin_abspath = os.path.abspath(self.cesar_binary)
+
# create joblist: one job per piece
for num, class_piece in enumerate(class_pieces, 1):
- out_path = os.path.join(mem_dir, f"part_{num}")
+ out_m_path = os.path.join(mem_dir, f"part_{num}")
+ out_reg_path = os.path.join(self.precomp_reg_dir, f"part_{num}")
+            out_ol_path = os.path.join(self.precomp_query_loci_dir, f"part_{num}")
+
cmd = (
- f"{precomp_abspath} {class_piece} {self.wd} {cesar_bin_abspath} "
- f"{self.t_2bit} {self.q_2bit} {out_path}"
+ f"{precomp_abspath} {class_piece} {self.wd} {self.opt_cesar_binary} "
+ f"{self.t_2bit} {self.q_2bit} {out_m_path} --ro {out_reg_path} --ol {out_ol_path}"
)
precompute_jobs.append(cmd)
# save joblist
@@ -1016,7 +1200,7 @@ def __precompute_data_for_opt_cesar(self):
print(f"Project path: {project_path}")
if self.para: # run jobs with para, skip nextflow
- cmd = f'para make {project_name} {precomp_mem_joblist} -q="short"'
+ cmd = f'para make {project_name} {precomp_mem_joblist} -q="shortmed"'
print(f"Calling {cmd}")
rc = subprocess.call(cmd, shell=True)
else: # calling jobs with nextflow
@@ -1038,8 +1222,18 @@ def __precompute_data_for_opt_cesar(self):
shutil.rmtree(project_path) if os.path.isdir(project_path) else None
# merge files and quit
self.__merge_dir(mem_dir, self.precomp_mem_cesar)
+ # self.__merge_dir(precomp_reg_dir, self.precomp_reg_cesar)
self.cesar_mem_was_precomputed = True
+ if self.output_opt_cesar_regions:
+ save_to = os.path.join(self.wd, "precomp_regions_for_cesar.bed")
+ exon_regions_df = os.path.join(self.wd, "precomp_regions_exons.tsv")
+ self.__merge_dir(self.precomp_reg_dir, exon_regions_df)
+ self.__merge_dir(self.precomp_query_loci_dir, self.precomp_query_loci_path)
+ self.__fold_exon_data(exon_regions_df, save_to)
+ print(f"Bed file containing precomputed regions is saved to: {save_to}")
+ sys.exit(0)
+
def __split_cesar_jobs(self):
"""Call split_exon_realign_jobs.py."""
if not self.t_2bit or not self.q_2bit:
@@ -1073,9 +1267,14 @@ def __split_cesar_jobs(self):
self.cesar_combined = os.path.join(self.temp_wd, "cesar_combined")
self.cesar_results = os.path.join(self.temp_wd, "cesar_results")
self.cesar_bigmem_jobs = os.path.join(self.temp_wd, "cesar_bigmem")
+
self.temp_files.append(self.cesar_jobs_dir)
self.temp_files.append(self.cesar_combined)
self.temp_files.append(self.predefined_glp_cesar_split)
+ self.temp_files.append(self.technical_cesar_err)
+ os.mkdir(self.technical_cesar_err) if not os.path.isdir(
+ self.technical_cesar_err
+ ) else None
# different field names depending on --ml flag
@@ -1083,6 +1282,9 @@ def __split_cesar_jobs(self):
self.temp_files.append(self.gene_loss_data)
skipped_path = os.path.join(self.rejected_dir, "SPLIT_CESAR.txt")
self.paralogs_log = os.path.join(self.temp_wd, "paralogs.txt")
+ cesar_binary_to_use = (
+ self.opt_cesar_binary if self.cesar_is_opt else self.cesar_binary
+ )
split_cesar_cmd = (
f"{self.SPLIT_EXON_REALIGN_JOBS} "
@@ -1099,15 +1301,17 @@ def __split_cesar_jobs(self):
f"--chains_limit {self.cesar_chain_limit} "
f"--skipped_genes {skipped_path} "
f"--rejected_log {self.rejected_dir} "
- f"--cesar_binary {self.cesar_binary} "
+ f"--cesar_binary {cesar_binary_to_use} "
f"--paralogs_log {self.paralogs_log} "
f"--uhq_flank {self.uhq_flank} "
- f"--predefined_glp_class_path {self.predefined_glp_cesar_split}"
+ f"--predefined_glp_class_path {self.predefined_glp_cesar_split} "
+ f"--unprocessed_log {self.technical_cesar_err}"
)
if self.annotate_paralogs: # very rare occasion
split_cesar_cmd += f" --annotate_paralogs"
-
+ if self.mask_all_first_10p:
+ split_cesar_cmd += f" --mask_all_first_10p"
# split_cesar_cmd = split_cesar_cmd + f" --cesar_binary {self.cesar_binary}" \
# if self.cesar_binary else split_cesar_cmd
split_cesar_cmd = (
@@ -1131,6 +1335,7 @@ def __split_cesar_jobs(self):
split_cesar_cmd += f" --fragments_data {fragm_dict_file}"
if self.cesar_mem_was_precomputed:
split_cesar_cmd += f" --precomp_memory_data {self.precomp_mem_cesar}"
+ split_cesar_cmd += f" --precomp_regions_data_dir {self.precomp_reg_dir}"
self.__call_proc(split_cesar_cmd, "Could not split CESAR jobs!")
def __get_cesar_jobs_for_bucket(self, comb_file, bucket_req):
@@ -1255,7 +1460,7 @@ def __run_cesar_jobs(self):
nf_project_path = os.path.join(self.nextflow_dir, nf_project_name)
project_paths.append(nf_project_path)
- # create subprocess object
+ # create subprocess object
if not self.para: # create nextflow cmd
os.mkdir(nf_project_path) if not os.path.isdir(
nf_project_path
@@ -1271,7 +1476,7 @@ def __run_cesar_jobs(self):
if memory_mb > 0:
cmd += f" --memoryMb={memory_mb}"
p = subprocess.Popen(cmd, shell=True)
-
+
sys.stderr.write(f"Pushed cluster jobs with {cmd}\n")
# wait for the process if calling processes sequentially
@@ -1389,8 +1594,11 @@ def __rebuild_crashed_jobs(self, crashed_jobs):
chain_ids = self.__read_chain_arg(chains_arg)
if chain_ids is None:
continue
- memlim_arg_ind = elem_args.index(MEMLIM_ARG) + 1
- mem_val = float(elem_args[memlim_arg_ind])
+ try:
+ memlim_arg_ind = elem_args.index(MEMLIM_ARG) + 1
+ mem_val = float(elem_args[memlim_arg_ind])
+ except ValueError:
+ mem_val = self.cesar_mem_limit
bucket_lim = self.__get_bucket_val(mem_val, buckets)
if FRAGM_ARG in elem_args:
@@ -1502,6 +1710,8 @@ def __check_cesar_completeness(self):
p_objects.append(p)
time.sleep(CESAR_PUSH_INTERVAL)
self.__monitor_jobs(p_objects, project_paths, die_if_sc_1=True)
+ # TODO: maybe some extra sanity check
+
# need to check whether anything crashed again
crashed_twice = []
for elem in err_log_files:
@@ -1521,6 +1731,19 @@ def __check_cesar_completeness(self):
err_msg = f"Some CESAR jobs crashed twice, please check {crashed_log}; Abort"
self.die(err_msg, 1)
+    def __append_technical_err_to_predef_class(self, transcripts_path, out_path):
+        """Append technically unprocessable transcripts to the predefined classifications file."""
+ if not os.path.isfile(transcripts_path):
+ # in this case, we don't have transcripts with tech error
+ # can simply quit the function
+ return
+ with open(transcripts_path, "r") as f:
+ transcripts_list = [x.rstrip() for x in f]
+ f = open(out_path, "a")
+ for elem in transcripts_list:
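+            # each unprocessed transcript is classified as Missing ("M")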
+ f.write(f"TRANSCRIPT\t{elem}\tM\n")
+ f.close()
+
def __merge_cesar_output(self):
"""Merge CESAR output, save final fasta and bed."""
print("Merging CESAR output to make fasta and bed files.")
@@ -1538,6 +1761,17 @@ def __merge_cesar_output(self):
self.trash_exons,
fragm_data=self.bed_fragm_exons_data,
)
+
+ # need to merge files containing transcripts that were not processed
+ # for some technical reason, such as intersecting fragments in the query
+ self.__merge_dir(
+ self.technical_cesar_err, self.technical_cesar_err_merged, ignore_empty=True
+ )
+
+        self.__append_technical_err_to_predef_class(
+ self.technical_cesar_err_merged, self.predefined_glp_cesar_split
+ )
+
if len(all_ok) == 0:
# there are no empty output files -> parsed without errors
print("CESAR results merged")
@@ -1617,11 +1851,16 @@ def __orthology_type_map(self):
)
@staticmethod
- def __merge_dir(dir_name, output):
+ def __merge_dir(dir_name, output, ignore_empty=False):
"""Merge all files in a directory into one."""
files_list = os.listdir(dir_name)
- if len(files_list) == 0:
+ if len(files_list) == 0 and ignore_empty is False:
sys.exit(f"Error! {dir_name} is empty")
+ elif len(files_list) == 0 and ignore_empty is True:
+ # in this case we allow empty directories
+ # just remove the directory and return
+ shutil.rmtree(dir_name)
+ return
buffer = open(output, "w")
for filename in files_list:
path = os.path.join(dir_name, filename)
@@ -1632,7 +1871,7 @@ def __merge_dir(dir_name, output):
# buffer.write(lines)
buffer.write(content)
buffer.close()
- shutil.rmtree(dir_name)
+ # shutil.rmtree(dir_name)
def __check_crashed_cesar_jobs(self):
"""Check whether any CESAR jobs crashed.
@@ -1649,6 +1888,11 @@ def __check_crashed_cesar_jobs(self):
if "CESAR" not in line:
# not related to CESAR
continue
+ if "fragment chains oevrlap" in line:
+                # such jobs did not really "crash": with intersecting exons
+                # only, they cannot produce a meaningful result
+ continue
# extract cesar wrapper command from the log
cesar_cmd = line.split("\t")[0]
self.crashed_cesar_jobs.append(cesar_cmd)
@@ -1771,9 +2015,6 @@ def parse_args():
default=None,
help="Find orthologs " "for a single reference chromosome only",
)
- # app.add_argument("--limit_to_query_chrom", default=None, help="Annotate "
- # "a particular query scaffold/chromosome only")
- # nextflow related
app.add_argument(
"--nextflow_dir",
"--nd",
@@ -1857,6 +2098,14 @@ def parse_args():
dest="using_optimized_cesar",
help="Instead of CESAR, use lastz-based optimized version",
)
+ app.add_argument(
+ "--output_opt_cesar_regions",
+ "--oocr",
+ action="store_true",
+ dest="output_opt_cesar_regions",
+ help="If optimised CESAR is used, save the included regions, "
+ "and quit. The parameter is defined for debugging purposes only.",
+ )
app.add_argument(
"--mask_stops",
"--ms",
@@ -1880,7 +2129,7 @@ def parse_args():
action="store_true",
dest="cesar_exec_seq",
help="Execute different CESAR jobs partitions sequentially, "
- "not in parallel."
+ "not in parallel.",
)
app.add_argument(
"--cesar_chain_limit",
@@ -1946,13 +2195,30 @@ def parse_args():
dest="annotate_paralogs",
action="store_true",
help="Annotate paralogs instead of orthologs. "
- "(experimental feature for very specific needs)",
+ "(experimental feature for very specific needs)",
+ )
+ app.add_argument(
+ "--mask_all_first_10p",
+ "--m_f10p",
+ action="store_true",
+ dest="mask_all_first_10p",
+ help="Automatically mask all inactivating mutations in first 10 percent of "
+ "the reading frame, ignoring ATG codons distribution. "
+ "(Default mode in V1.0, not recommended to use in later versions)",
)
# print help if there are no args
if len(sys.argv) < 2:
app.print_help()
sys.exit(0)
args = app.parse_args()
+
+ # some sanity checks
+ if args.output_opt_cesar_regions and not args.using_optimized_cesar:
+        err_msg = (
+            "Error! Please use the --output_opt_cesar_regions parameter "
+            "with the --using_optimized_cesar flag only"
+        )
+ sys.exit(err_msg)
return args
diff --git a/ucsc_browser_visualisation/bb_schema.as b/ucsc_browser_visualisation/bb_schema.as
new file mode 100644
index 0000000..d867477
--- /dev/null
+++ b/ucsc_browser_visualisation/bb_schema.as
@@ -0,0 +1,43 @@
+table togaBigBed
+"TOGA predicted gene model"
+(
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Name or ID of item, ideally both human readable and unique"
+ uint score; "Score (0-1000)"
+ char[1] strand; "+ or - for strand"
+ uint thickStart; "Start of where display should be thick (start codon)"
+ uint thickEnd; "End of where display should be thick (stop codon)"
+ uint itemRgb; "RGB value (use R,G,B string in input file)"
+ int blockCount; "Number of blocks"
+ int[blockCount] blockSizes; "Comma separated list of block sizes"
+ int[blockCount] chromStarts; "Start positions relative to chromStart"
+
+ string ref_trans_id; "Reference transcript ID"
+ string ref_region; "Transcript region in the reference"
+ string query_region; "Region in the query"
+ float chain_score; "Chain orthology probability score"
+
+ float chain_synteny; "Chain synteny log10 value"
+ float chain_flank; "Chain flank feature"
+ float chain_gl_cds_fract; "Chain global CDS fraction value"
+ float chain_loc_cds_fract; "Chain local CDS fraction value"
+ float chain_exon_cov; "Chain exon coverage value"
+
+ float chain_intron_cov; "Chain intron coverage value"
+ string status; "Gene loss classification"
+ float perc_intact_ign_M; "% intact ignoring missing"
+ float perc_intact_int_M; "% intact considering missing as intact"
+ float intact_codon_prop; "% intact codons"
+
+ float ouf_prop; "% out of chain"
+ string mid_intact; "Is middle 80% intact"
+ string mid_pres; "Is middle 80% fully present"
+ lstring prot_alignment; "HTML-formatted protein alignment"
+ lstring svg_line; "SVG inactivating mutations visualization"
+
+ lstring ref_link; "Reference transcript link"
+ lstring inact_mut_html_table; "HTML-formatted inactivating mutations table"
+ lstring exon_ali_html; "HTML-formatted exon alignment"
+)
diff --git a/ucsc_browser_visualisation/generate_tab_files.py b/ucsc_browser_visualisation/generate_tab_files.py
index 6933d88..c17c583 100755
--- a/ucsc_browser_visualisation/generate_tab_files.py
+++ b/ucsc_browser_visualisation/generate_tab_files.py
@@ -16,8 +16,8 @@
"INTACT_PERC_INTACT_M",
"INTACT_CODONS_PROP",
"OUT_OF_CHAIN_PROP",
- "MIDDLE_80%_INTACT",
- "MIDDLE_80%_PRESENT",
+ "MIDDLE_IS_INTACT",
+ "MIDDLE_IS_PRESENT",
]
# for assembled from fragments: we cannot get chain class features
FRAGM_FEATS = (0, 0.0, 0.0, 0.0, 0.0, 0.0)
@@ -61,6 +61,11 @@ def parse_args():
"data from default output fasta files. Will be "
"the default behaviour in the future"),
)
+ app.add_argument("--no_plots",
+ "--np",
+ dest="no_plots",
+ action="store_true",
+ help="If inactivating mutation plots are already generated, do not recreate them")
if len(sys.argv) < 3:
app.print_help()
sys.exit(0)
@@ -449,7 +454,7 @@ def get_seq_data_from_fasta(wd, all_projections):
return [], projection_to_prot_ali, projection_to_codon_ali
-def get_sequence_data(wd, all_projections):
+def get_sequence_data(wd, all_projections, exon_to_stat):
"""Parse nucleotide and protein sequence data."""
print("Reading sequence data")
gigafasta = os.path.join(wd, TEMP, CESAR_RESULTS)
@@ -525,6 +530,7 @@ def get_sequence_data(wd, all_projections):
ali_class = header_data[7]
exp_range = header_data[8]
in_exp = ONE_S if header_data[9] == "INC" else ZERO_S
+ is_del_or_no = exon_to_stat.get(exon_id, "I")
exon_data = (
location,
pid,
@@ -533,6 +539,7 @@ def get_sequence_data(wd, all_projections):
ali_class,
exp_range,
in_exp,
+ is_del_or_no, # added 26 Aug 2022
sequence,
)
query_exon_to_nucl_data[exon_id] = exon_data
@@ -604,6 +611,7 @@ def get_inact_data(wd, all_projections):
projection_to_inact_muts = []
proj_to_inact_features = defaultdict(dict)
proj_to_inact_features_rows = []
+ exon_to_del_miss = {}
f = open(inact_mut_file, "r")
for line in f:
if not line.startswith("#"):
@@ -614,25 +622,35 @@ def get_inact_data(wd, all_projections):
projection = f"{trans}.{chain}"
if projection not in all_projections:
continue
+
if len(line_data) == 8:
# inactivating mutation data
+
+ # IF Deleted exon OR Missing exon -> also save the data
+ # needed in exon ali visualizations
mask_field = line_data[6]
is_inact = ZERO_S if mask_field == "masked" else ONE_S
line_data[6] = is_inact
+ mut_trimmed = line_data[5][:20]
+ line_data[5] = mut_trimmed
mut_track = line_data[2:]
sql_row = (projection, *mut_track)
projection_to_inact_muts.append(sql_row)
+ if line_data[4] == "Deleted exon":
+ exon_to_del_miss[(projection, int(line_data[2]))] = "D"
+ elif line_data[4] == "Missing exon":
+ exon_to_del_miss[(projection, int(line_data[2]))] = "M"
continue
else:
# features
feat, val_ = line_data[2].split()
# boolean features to num (for SQL table)
- if feat == "MIDDLE_80%_PRESENT":
+ if feat == "MIDDLE_IS_PRESENT":
if val_ == "TRUE":
val = ONE_S
else:
val = ZERO_S
- elif feat == "MIDDLE_80%_INTACT":
+ elif feat == "MIDDLE_IS_INTACT":
if val_ == "TRUE":
val = ONE_S
else:
@@ -657,7 +675,7 @@ def get_inact_data(wd, all_projections):
vals.append(val)
sql_row = (proj, *vals)
proj_to_inact_features_rows.append(sql_row)
- return proj_to_inact_features_rows, projection_to_inact_muts
+ return proj_to_inact_features_rows, projection_to_inact_muts, exon_to_del_miss
def save_toga_inact_mut_tab(out_dir, inact_mut_data):
@@ -694,16 +712,17 @@ def main():
proj_to_chain_score = get_chain_scores(args.wd, all_projections)
proj_to_chain_features = get_chain_features(args.wd, all_projections)
projection_to_loss_class = get_projection_class(args.wd)
+ proj_to_inact_feat, proj_to_inact_mut, exon_to_stat = get_inact_data(args.wd, all_projections)
+
if args.no_raw_cesar_output:
raise NotImplementedError("Cancelled branch")
seq_data = get_seq_data_from_fasta(args.wd, all_projections)
exit()
else:
- seq_data = get_sequence_data(args.wd, all_projections)
+ seq_data = get_sequence_data(args.wd, all_projections, exon_to_stat)
projection_exon_data = seq_data[0]
projection_to_prot_ali = seq_data[1]
projection_to_codon_ali = seq_data[2]
- proj_to_inact_feat, proj_to_inact_mut = get_inact_data(args.wd, all_projections)
save_toga_info_tab(
args.output,
all_projections,
diff --git a/ucsc_browser_visualisation/get_names_from_bed.py b/ucsc_browser_visualisation/get_names_from_bed.py
new file mode 100755
index 0000000..1cc316e
--- /dev/null
+++ b/ucsc_browser_visualisation/get_names_from_bed.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+"""Just extract names from toga output bed file.
+
+Works like xenoRefGenelx.pl"""
+import sys
+
+if len(sys.argv) != 2:
+    sys.exit(f"Usage: {sys.argv[0]} [query_annotation.bed] | sort -u > ix.txt")
+to_read = sys.argv[1]
+
+f = open(to_read, "r")
+for line in f:
+ id_field = line.rstrip().split("\t")[3]
+ no_chain_id = ".".join(id_field.split(".")[:-1])
+ dot_split = no_chain_id.split(".")
+ if len(dot_split) > 1:
+ to_out = [id_field, no_chain_id] + dot_split
+ else:
+ to_out = [id_field, no_chain_id]
+ line = "\t".join(to_out)
+ print(line)
+f.close()
diff --git a/ucsc_browser_visualisation/make_bigbed_data_public.py b/ucsc_browser_visualisation/make_bigbed_data_public.py
new file mode 100755
index 0000000..8a2a118
--- /dev/null
+++ b/ucsc_browser_visualisation/make_bigbed_data_public.py
@@ -0,0 +1,608 @@
+#!/usr/bin/env python3
+"""Perform all operations to create BigBed file for UCSC browser."""
+import os
+from re import sub
+import sys
+import argparse
+import subprocess
+from collections import defaultdict
+import getpass
+
+
+GENERATE_TAB_FILES = "generate_tab_files.py"
+GENERATE_PLOTS = "make_togaPlot.py"
+LOCATION = os.path.dirname(__file__)
+SCHEMA_LOCATION = os.path.join(LOCATION, "bb_schema.as")
+
+IFEAT_PLACEHOLDER = ["0.0" for _ in range(6)]
+PROT_PLACEHOLDER = ["NO_DATA "]
+UNDEF = "UNDEFINED"
+SVG_PLACEHOLDER = """
+
+
+
+
+"""
+PLOT_PLACEHOLDER = [SVG_PLACEHOLDER, ]
+
+
+def parse_args():
+ """Parse args."""
+ app = argparse.ArgumentParser()
+ app.add_argument("project_dir", help="Directory containing TOGA output")
+ app.add_argument("--do_not_cleanup",
+ action="store_true",
+ dest="do_not_cleanup",
+ help="Do not clean tabs dir up.")
+ app.add_argument("--bb_version",
+ default="v1",
+ help="Version tag")
+ app.add_argument("--no_plots",
+ "--np",
+ dest="no_plots",
+ action="store_true",
+ help="If inactivating mutation plots are already generated, do not recreate them")
+ # TODO: chroms sizes if not standard or not HL
+ if len(sys.argv) < 2:
+ app.print_help()
+ sys.exit(0)
+ args = app.parse_args()
+ return args
+
+
+def call_gen_tab_files(project_dir):
+ """Call Generate tab files subprocess."""
+ print(f"Calling {GENERATE_TAB_FILES}")
+ out_dir = os.path.join(project_dir, "tabs")
+ os.mkdir(out_dir) if not os.path.isdir(out_dir) else None
+ exe_ = os.path.join(LOCATION, GENERATE_TAB_FILES)
+ cmd = f"{exe_} {project_dir} {out_dir}"
+ rc = subprocess.call(cmd, shell=True)
+ if rc != 0:
+ raise ValueError(f"Command {cmd} died - cannot generate tab files")
+ print("5 of 6 tab files generated")
+
+
+def call_gen_plot_files(project_dir, no_plots=False):
+ """Generate svg plots."""
+ print(f"Calling {GENERATE_PLOTS}")
+ out_file = os.path.join(project_dir, "tabs", "togaPlot.tab")
+ if no_plots is True and os.path.isfile(out_file):
+ print(f"--no_plots flag is on, found {out_file} --> skip making plots")
+ return
+ elif no_plots is True and not os.path.isfile(out_file):
+ print(f"Error! --no_plots flag is on, but {out_file} not found\nAbort")
+ sys.exit(1)
+ exe_ = os.path.join(LOCATION, GENERATE_PLOTS)
+ cmd = f"{exe_} {project_dir} {out_file}"
+ rc = subprocess.call(cmd, shell=True)
+ if rc != 0:
+ raise ValueError(f"Command {cmd} died - cannot generate plots")
+ print("All tab files generated")
+
+
+def read_tab_file(tab_file):
+ """Make trans_id: data track."""
+ trans_to_dat = {}
+ f = open(tab_file, "r")
+ for line in f:
+ line_data = line.rstrip().split("\t")
+ trans = line_data[0]
+ data = line_data[1:]
+ trans_to_dat[trans] = data
+ f.close()
+ return trans_to_dat
+
+
+
+def __gen_info_placeholder(p_name):
+ """Generate info placeholder."""
+ t_name = ".".join(p_name.split(".")[:-1])
+ placeholder = [t_name, UNDEF, UNDEF, "0.0", "0", "0.0", "0.0", "0.0", "0.0", "0.0", UNDEF]
+ return placeholder
+
+
+def make_toga_data(project_dir):
+ """Merge togaPlot, togaProt, togaInfo and togaInactFeat tables into togaData."""
+ tabs_dir = os.path.join(project_dir, "tabs")
+ toga_info_file = os.path.join(tabs_dir, "togaInfo.tab")
+ toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab")
+ toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab")
+ toga_prot_file = os.path.join(tabs_dir, "togaProt.tab")
+ out_file = os.path.join(tabs_dir, "togaData.tab")
+
+ print("Reading tab files to merge")
+ trans_to_info = read_tab_file(toga_info_file)
+ trans_to_ifeat = read_tab_file(toga_inact_feat_file)
+ trans_to_plot = read_tab_file(toga_plot_file)
+ trans_to_prot = read_tab_file(toga_prot_file)
+
+ transcripts = trans_to_info.keys()
+ print(f"Got data for {len(transcripts)} transcripts")
+
+ f = open(out_file, "w")
+ for t in transcripts:
+ info = trans_to_info.get(t)
+ if info is None:
+ info = __gen_info_placeholder(t)
+ ifeat = trans_to_ifeat.get(t, IFEAT_PLACEHOLDER.copy())
+ prot = trans_to_prot.get(t, PROT_PLACEHOLDER.copy())
+ plot = trans_to_plot.get(t, PLOT_PLACEHOLDER.copy())
+ data_lst = [t] + info + ifeat + prot + plot
+ tab_str = "\t".join(data_lst)
+ f.write(tab_str)
+ f.write("\n")
+ f.close()
+ # do cleanup
+ os.remove(toga_info_file)
+ os.remove(toga_inact_feat_file)
+ os.remove(toga_plot_file)
+ os.remove(toga_prot_file)
+ print("Done")
+
+
+def _check_seq_of_intervals_intersect(intervals):
+ """Check whether any interval in the seq intersects"""
+ intervals_num = len(intervals)
+ for i in range(intervals_num - 1):
+ # (start, end)
+ curr_one = intervals[i]
+ next_one = intervals[i + 1]
+ # sorted by beginning
+ # if start of the next < end of the curr
+ # -> they intersect
+ if next_one[0] < curr_one[1]:
+ return (curr_one[0], curr_one[1])
+ return None # nothing suspicious found
+
+
+def _filter_bed(in_bed, out_bed):
+ """Filter out bed entries with:
+ - None chromosome
+ - Intersecting blocks."""
+ _in = open(in_bed, "r")
+ _out = open(out_bed, "w")
+ for line in _in:
+ ld = line.rstrip().split("\t")
+ chrom = ld[0]
+ if chrom == "None":
+ continue
+ block_sizes = [int(x) for x in ld[10].split(",") if x != ""]
+ block_starts = [int(x) for x in ld[11].split(",") if x != ""]
+ block_count = len(block_starts)
+ if block_count == 1:
+ # no need to check whether exons intersect if there is only 1 exon
+ _out.write(line)
+ continue
+ block_ends = [block_starts[i] + block_sizes[i] for i in range(block_count)]
+ exon_ranges = sorted(zip(block_starts, block_ends), key=lambda x: x[0])
+ ranges_intersect = _check_seq_of_intervals_intersect(exon_ranges)
+ if ranges_intersect:
+ continue
+ _out.write(line)
+ _in.close()
+ _out.close()
+
+
+def make_sorted_bed(project_dir):
+ """For bigBed, we need properly sorted bed file."""
+ non_sorted_bed = os.path.join(project_dir, "query_annotation.bed")
+ tabs_dir = os.path.join(project_dir, "tabs")
+ bed_no_none = os.path.join(project_dir, "query_annotation_intact_chroms.bed")
+ _filter_bed(non_sorted_bed, bed_no_none)
+ sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed")
+ cmd = f"sort -k1,1 -k2,2n {bed_no_none} > {sorted_bed}"
+ subprocess.call(cmd, shell=True)
+ # sorted bed file, good
+
+
+def __get_ts_from_bed(bed):
+ """Get transcript IDs from bed file."""
+ f = open(bed, 'r')
+ ts = []
+ ts_to_bed_line = {}
+ for line in f:
+ ld = line.rstrip().split("\t")
+ t = ld[3]
+ ts.append(t)
+ ts_to_bed_line[t] = ld
+ f.close()
+ return ts, ts_to_bed_line
+
+
+def __get_ts_to_toga_data(toga_data_file):
+ """Get transID: togaData fields."""
+ ret = {}
+ f = open(toga_data_file, "r")
+ for line in f:
+ ld = line.rstrip().split("\t")
+ t = ld[0]
+ fields = ld[1:]
+ ret[t] = fields
+ f.close()
+ return ret
+
+
+def __get_ts_to_toga_inact(toga_inact_file):
+ """Read inact mut data."""
+ ts_to_exon_to_data = defaultdict(dict)
+ f = open(toga_inact_file, 'r')
+ for line in f:
+ line_data = line.rstrip().split("\t")
+ ts = line_data[0]
+ exon_num = int(line_data[1])
+ rest = line_data[2:]
+ ts_to_exon_to_data[ts][exon_num] = rest
+ f.close()
+ # now: TS to HTML_formatted line
+ trans_to_inact_line = {}
+ for t in ts_to_exon_to_data.keys():
+ exon_nums_unsort = ts_to_exon_to_data[t].keys()
+ exon_nums = sorted(exon_nums_unsort)
+ t_lines = []
+ for exon_num in exon_nums:
+ inact_mut_data = ts_to_exon_to_data[t][exon_num]
+ """
+ transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
+ exon_num int unsigned not null, -- exon number
+                    position int unsigned not null, -- position where mutation happened
+ mut_class varchar(15) not null, -- mutation class such as FS deletion
+ mutation varchar(20) not null, -- what exactly happened
+ is_inact tinyint unsigned not null, -- is this mutation inactivating, yes 1 or not 0
+ mut_id varchar(10) not null -- mut identifier
+ """
+ # template:
+ """
+ struct togaInactMut *info = NULL;
+ info = togaInactMutLoad(row);
+ printf("
\n");
+ printf("%s \n", info->exon_num);
+ printf("%s \n", info->position);
+ printf("%s \n", info->mut_class);
+ printf("%s \n", info->mutation);
+ if (sameWord(info->is_inact, ONE_)){
+ printf("%s \n", YES_);
+ } else {
+ printf("%s \n", NO_);
+ }
+ printf("%s \n", info->mut_id);
+ printf(" \n");
+ togaInactMutFree(&info);
+ """
+ position = inact_mut_data[0]
+ mut_class = inact_mut_data[1]
+ mutation = inact_mut_data[2]
+ is_inact = "YES" if inact_mut_data[3] == "1" else "NO"
+ mut_id = inact_mut_data[4]
+ lines = []
+ lines.append("")
+ lines.append(f"{exon_num} ")
+ lines.append(f"{position} ")
+ lines.append(f"{mut_class} ")
+ lines.append(f"{mutation} ")
+ lines.append(f"{is_inact} ")
+ lines.append(f"{mut_id} ")
+ lines.append(" ")
+ inact_mut_line = "".join(lines)
+ t_lines.append(inact_mut_line)
+ t_line = "".join(t_lines)
+ trans_to_inact_line[t] = t_line
+ return trans_to_inact_line
+
+
+def __get_ts_to_toga_exons(toga_exons_file):
+ """Table schema
+
+ transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
+ exon_num int unsigned not null, -- exon number
+ exon_region varchar(100) not null, -- region where exon was detected
+ pid float not null, -- nucleotide %id
+ blosum float not null, -- normalized blosum score
+ gaps tinyint unsigned not null, -- are there any asm gaps near? 1 - yes 0 - no
+    ali_class varchar(4) not null, -- alignment class: A, B, C, A+
+    exp_region varchar(50) not null, -- where exon was expected
+    in_exp_region tinyint unsigned not null, -- detected in expected region or not 1 yes 0 no
+    is_del_or_no char, -- D - deleted, M - missing, I - intact
+ alignment longblob not null -- exon sequence in query
+ """
+ ts_to_exon_to_data = defaultdict(dict)
+ f = open(toga_exons_file, "r")
+ for line in f:
+ ld = line.rstrip().split("\t")
+ t = ld[0]
+ ex_num = int(ld[1])
+ datum = ld[2:]
+ ts_to_exon_to_data[t][ex_num] = datum
+ f.close()
+
+
+ for transcript in ts_to_exon_to_data.keys():
+ exons_unsorted = ts_to_exon_to_data[transcript].keys()
+ exon_nums = sorted(exons_unsorted)
+ transcript_lines = []
+ for exon_num in exon_nums:
+ exon_lines = []
+ datum = ts_to_exon_to_data[transcript][exon_num]
+ """To convert this:
+ struct togaNucl *info = NULL;
+ info = togaNuclLoad(row);
+ printf("Exon number: %s \n", info->exon_num);
+ printf("Exon region: %s \n", info->exon_region);
+ printf("Nucleotide percent identity: %s | BLOSUM: %s \n", info->pid, info->blosum);
+ if (sameWord(info->gaps, ONE_)){
+ printf("Intersects assembly gaps: %s \n", YES_);
+ } else {
+ printf("Intersects assembly gaps: %s \n", NO_);
+ }
+ printf("Exon alignment class: %s \n", info->ali_class);
+ if (sameWord(info->in_exp_region, ONE_)){
+ printf("Detected within expected region (%s): %s \n", info->exp_region, YES_);
+ } else {
+ printf("Detected within expected region (%s): %s \n", info->exp_region, NO_);
+ }
+ // printf("Expected region: %s \n", info->exp_region);
+ printf(" \n");
+ printf("Sequence alignment between reference and query exon: \n");
+ printf("%s \n", info->alignment);
+ togaNuclFree(&info);
+ """
+ exon_region = datum[0]
+ pid = datum[1]
+ blosum = datum[2]
+ gaps = "YES" if datum[3] == "1" else "NO"
+ ali_class = datum[4]
+ exp_reg = datum[5]
+ in_exp_region = "YES" if datum[6] == "1" else "NO"
+ is_del_or_no = datum[7]
+ ali = datum[8]
+
+ if is_del_or_no == "D":
+ exon_lines.append(f"Exon number: {exon_num} - Deleted ")
+ elif is_del_or_no == "M":
+ exon_lines.append(f"Exon number: {exon_num} - Missing ")
+ else:
+ exon_lines.append(f"Exon number: {exon_num} ")
+
+ exon_lines.append(f"Exon region: {exon_region} ")
+ exon_lines.append(f"Nucleotide percent identity: {pid} | BLOSUM: {blosum} ")
+ exon_lines.append(f"Intersects assembly gaps: {gaps} ")
+ exon_lines.append(f"Exon alignment class: {ali_class} ")
+ exon_lines.append(f"Detected within expected region ({exp_reg}): {in_exp_region} ")
+ exon_lines.append(f" ")
+ exon_lines.append(f"Sequence alignment between reference and query exon: ")
+ exon_lines.append(f"{ali} ")
+ exon_line = "".join(exon_lines)
+ transcript_lines.append(exon_line)
+ transcript_line = "".join(transcript_lines)
+ trans_to_exon_line[transcript] = transcript_line
+ return trans_to_exon_line
+
+
+def __get_ref_transcript(q_trans):
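+    """Strip the chain id, e.g. a projection "ENST00000367770.1169" -> "ENST00000367770"."""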
+ return ".".join(q_trans.split(".")[:-1])
+
+
+def get_ref_ts_to_link(p_dir):
+ """Get ref_transcript: link dict."""
+ ret = {}
+ ref_2bit_link = os.path.join(p_dir, "t2bit.link")
+ ref_2bit_path = os.readlink(ref_2bit_link)
+ ref_2bit_basename = os.path.basename(ref_2bit_path)
+ ref_genome_gname = ".".join(ref_2bit_basename.split(".")[:-1])
+ ref_toga_dir = f"/projects/hillerlab/genome/gbdb-HL/{ref_genome_gname}/TOGA/"
+ ref_ts_to_link_path = os.path.join(ref_toga_dir, "toga.transcript2Link.txt")
+ # dirname = os.path.dirname(ref_2bit_path)
+ # chrom_sizes_path = os.path.join(dirname, "chrom.sizes")
+ if not os.path.isfile(ref_ts_to_link_path):
+ return ret, ref_genome_gname
+ f = open(ref_ts_to_link_path, "r")
+ for line in f:
+ ld = line.rstrip().split("\t")
+ ret[ld[0]] = ld[1]
+ f.close()
+ return ret, ref_genome_gname
+
+
+def get_que_chrom_sizes(p_dir):
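+    """Resolve the query chrom.sizes path and assembly name via the q2bit link."""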
+ que_2bit_link = os.path.join(p_dir, "q2bit.link")
+ que_2bit_path = os.readlink(que_2bit_link)
+ dirname = os.path.dirname(que_2bit_path)
+ chrom_sizes_path = os.path.join(dirname, "chrom.sizes")
+ que_2bit_basename = os.path.basename(que_2bit_path)
+ que_genome_gname = ".".join(que_2bit_basename.split(".")[:-1])
+ return chrom_sizes_path, que_genome_gname
+
+
+def merge_all_tables(project_dir, r_ts_to_link):
+ """Merge all tabs in ready-to-merge bed file."""
+ print("Joining tsv for bigbed")
+ tabs_dir = os.path.join(project_dir, "tabs")
+ sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed")
+ toga_data_file = os.path.join(tabs_dir, "togaData.tab")
+ toga_inact_file = os.path.join(tabs_dir, "togaInactMut.tab")
+ toga_nucl_file = os.path.join(tabs_dir, "togaNucl.tab")
+ transcripts_ordered, t_to_bed = __get_ts_from_bed(sorted_bed)
+ ts_to_data_track = __get_ts_to_toga_data(toga_data_file)
+ ts_to_inact_data = __get_ts_to_toga_inact(toga_inact_file)
+ ts_to_exon_data = __get_ts_to_toga_exons(toga_nucl_file)
+
+
+ bigbed_material = os.path.join(tabs_dir, "query_annot_for_big.tsv")
+ f = open(bigbed_material, "w")
+ for transcript in transcripts_ordered:
+ ref_trans = __get_ref_transcript(transcript)
+ bed_fields = t_to_bed[transcript]
+ data_track = ts_to_data_track[transcript]
+ inact_line = ts_to_inact_data.get(transcript, " ")
+ exon_line = ts_to_exon_data[transcript]
+ link = r_ts_to_link.get(ref_trans, ref_trans) # just reference transcript if no link
+ combined_datum_lst = bed_fields + data_track + [link] + [inact_line] + [exon_line]
+ combined_datum = "\t".join(combined_datum_lst)
+ f.write(f"{combined_datum}\n")
+ f.close()
+
+
+def sort_and_make_bb(project_dir, chrom_sizes):
+ """bed12+22 as the result
+ sort -k1,1 -k2,2n query_annot_for_big.tsv > query_annot_for_big_sorta.tsv
+ bedToBigBed -type=bed12+22 query_annot_for_big_sorta.tsv
+ /projects/hillerlab/genome/gbdb-HL/mm10/chrom.sizes query_annot.bb -tab
+ """
+ print("Making bigbed...")
+ tabs_dir = os.path.join(project_dir, "tabs")
+ bigbed_material_not_sorted = os.path.join(tabs_dir, "query_annot_for_big.tsv")
+ bigbed_material = os.path.join(tabs_dir, "query_annot_for_big__sorted.tsv")
+ sort_cmd = f"sort -k1,1 -k2,2n {bigbed_material_not_sorted} > {bigbed_material}"
+ rc = subprocess.call(sort_cmd, shell=True)
+ if rc != 0:
+ sys.exit(f"Error! Command:\n{sort_cmd}\ncrashed")
+ output = os.path.join(tabs_dir, "query_annotation.bb")
+ # now transform into bigbed
+ print("Calling:")
+ bb_cmd = f"bedToBigBed -type=bed12+22 {bigbed_material} {chrom_sizes} {output} -tab -extraIndex=name -as={SCHEMA_LOCATION}"
+ print(bb_cmd)
+ rc = subprocess.call(bb_cmd, shell=True)
+ if rc != 0:
+ sys.exit(f"Error! Command:\n{bb_cmd}\ncrashed")
+
+    # make ix.txt: one line per item ("name search-terms..."), the input for the trix index
+ query_annot = os.path.join(project_dir, "query_annotation.bed")
+ make_ix_script = os.path.join(LOCATION, "get_names_from_bed.py")
+ ix_txt = os.path.join(tabs_dir, "query_annotation.ix.txt")
+ ix_cmd = f"{make_ix_script} {query_annot} | sort -u > {ix_txt}"
+ rc = subprocess.call(ix_cmd, shell=True)
+ if rc != 0:
+ sys.exit(f"Error! Command:\n{ix_cmd}\ncrashed")
+
+ # make .ix and .ixx indexes
+ ix_path = os.path.join(tabs_dir, "query_annotation.bb.ix")
+ ixx_path = os.path.join(tabs_dir, "query_annotation.bb.ixx")
+ ixixx_cmd = f"ixIxx {ix_txt} {ix_path} {ixx_path}"
+ rc = subprocess.call(ixixx_cmd, shell=True)
+ if rc != 0:
+ sys.exit(f"Error! Command:\n{ixixx_cmd}\ncrashed")
+
+
+def cleanup(project_dir, dont):
+ """Clean project dir up."""
+ if dont: # do not cleanup
+ return
+ tabs_dir = os.path.join(project_dir, "tabs")
+ bigbed_material_not_sorted = os.path.join(tabs_dir, "query_annot_for_big.tsv")
+ bigbed_material = os.path.join(tabs_dir, "query_annot_for_big__sorted.tsv")
+ toga_info_file = os.path.join(tabs_dir, "togaInfo.tab")
+ toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab")
+ toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab")
+ toga_prot_file = os.path.join(tabs_dir, "togaProt.tab")
+ sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed")
+ toga_data_file = os.path.join(tabs_dir, "togaData.tab")
+ toga_inact_file = os.path.join(tabs_dir, "togaInactMut.tab")
+ toga_nucl_file = os.path.join(tabs_dir, "togaNucl.tab")
+ to_rm = [bigbed_material_not_sorted, bigbed_material, toga_info_file,
+ toga_inact_feat_file, toga_plot_file, toga_prot_file,
+ sorted_bed, toga_data_file, toga_inact_file, toga_nucl_file]
+    for path in to_rm:
+        if os.path.isfile(path):
+            os.remove(path)
+
+
+def __ssh_mkdir(uname, dirname):
+    """Create a directory on the genome host via ssh."""
+    cmd = f"ssh {uname}@genome mkdir -p {dirname}"
+ print(f"Calling {cmd}")
+ subprocess.call(cmd, shell=True)
+
+
+def __ssh_ln(uname, src, dest):
+ """Call ln on delta"""
+ cmd = f"ssh {uname}@genome ln -sf {src} {dest}"
+ print(f"Calling {cmd}")
+ subprocess.call(cmd, shell=True)
+
+
+def __get_ccase_refname(refname):
+    """Return the capitalized genome name used in track file names."""
+    known = {"hg38": "Hg38", "mm10": "Mm10", "mm39": "Mm39"}
+    return known.get(refname, refname)
+
+
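+# NB: the "genome" host and the /genome/gbdb-HL and /var/www/data paths below are
+# specific to the Hillerlab setup; adjust them for other deployments.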
+def load_bb_track(project_dir, dont, ref, que, bb_ver):
+    """rsync the bigBed and trix index files to the genome host and link them for the browser."""
+    if dont:  # do not load anything
+        return
+    tabs_dir = os.path.join(project_dir, "tabs")
+    uname = getpass.getuser()
+    # target: genome:/genome/gbdb-HL/$ref/TOGA/vs_$que/HLTOGAannotVs${Ref}${bb_ver}.bb
+    ref_dir__genome = f"/genome/gbdb-HL/{ref}"
+ toga_ref_dir__genome = f"{ref_dir__genome}/TOGA"
+ # make if needed
+ __ssh_mkdir(uname, toga_ref_dir__genome)
+ vs_que_dir__genome = f"{toga_ref_dir__genome}/vs_{que}"
+ __ssh_mkdir(uname, vs_que_dir__genome)
+ # transfer data
+ bb_file = os.path.join(tabs_dir, "query_annotation.bb")
+ ix_file = os.path.join(tabs_dir, "query_annotation.bb.ix")
+ ixx_file = os.path.join(tabs_dir, "query_annotation.bb.ixx")
+
+ ref_camel_case = __get_ccase_refname(ref)
+ bb_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.bb"
+ ix_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.ix"
+ ixx_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.ixx"
+
+ bb_path_genome = f"{vs_que_dir__genome}/{bb_filename__genome}"
+ ix_path_genome = f"{vs_que_dir__genome}/{ix_filename__genome}"
+ ixx_path_genome = f"{vs_que_dir__genome}/{ixx_filename__genome}"
+
+    transfers = (
+        (bb_file, bb_path_genome),
+        (ix_file, ix_path_genome),
+        (ixx_file, ixx_path_genome),
+    )
+    for src, dest in transfers:
+        rsync_cmd = f"rsync -av {src} {uname}@genome:{dest}"
+        subprocess.call(rsync_cmd, shell=True)
+
+ # make link in /var/www
+ www_data__genome = "/var/www/data"
+ www_que__genome = f"{www_data__genome}/{que}"
+ __ssh_mkdir(uname, www_que__genome)
+
+ www_bb_link_dest = f"{www_que__genome}/{bb_filename__genome}"
+ www_ix_link_dest = f"{www_que__genome}/{ix_filename__genome}"
+ www_ixx_link_dest = f"{www_que__genome}/{ixx_filename__genome}"
+
+ __ssh_ln(uname, bb_path_genome, www_bb_link_dest)
+ __ssh_ln(uname, ix_path_genome, www_ix_link_dest)
+ __ssh_ln(uname, ixx_path_genome, www_ixx_link_dest)
+
+
+def main():
+ """Entry point."""
+ args = parse_args()
+ r_ts_to_link, ref_name = get_ref_ts_to_link(args.project_dir)
+ chrom_sizes, que_name = get_que_chrom_sizes(args.project_dir)
+ # call generate_tab_files.py and make_togaPlot.py
+ call_gen_tab_files(args.project_dir)
+ call_gen_plot_files(args.project_dir, no_plots=args.no_plots)
+ # merge 4 transcriptID-related tables into a single one
+ make_toga_data(args.project_dir)
+
+    # now make the bigBed:
+    # togaNucl.tab and togaInactMut.tab are merged into HTML-formatted strings,
+    # added to togaData, and written as a tab file for bedToBigBed
+ make_sorted_bed(args.project_dir)
+
+ merge_all_tables(args.project_dir, r_ts_to_link)
+ sort_and_make_bb(args.project_dir, chrom_sizes)
+
+ cleanup(args.project_dir, args.do_not_cleanup)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/ucsc_browser_visualisation/make_sql_data.py b/ucsc_browser_visualisation/make_sql_data.py
deleted file mode 100755
index 41eba17..0000000
--- a/ucsc_browser_visualisation/make_sql_data.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-"""Perform all operations to create SQL tables for UCSC browser."""
-import os
-import sys
-import argparse
-import subprocess
-
-GENERATE_TAB_FILES = "generate_tab_files.py"
-GENERATE_PLOTS = "make_togaPlot.py"
-LOCATION = os.path.dirname(__file__)
-
-IFEAT_PLACEHOLDER = ["0.0" for _ in range(6)]
-PROT_PLACEHOLDER = ["NO_DATA "]
-UNDEF = "UNDEFINED"
-SVG_PLACEHOLDER = """
-
-
-
-
-"""
-PLOT_PLACEHOLDER = [SVG_PLACEHOLDER, ]
-
-
-def parse_args():
- """Parse args."""
- app = argparse.ArgumentParser()
- app.add_argument("project_dir", help="Directory containing TOGA output")
- if len(sys.argv) < 2:
- app.print_help()
- sys.exit(0)
- args = app.parse_args()
- return args
-
-
-def call_gen_tab_files(project_dir):
- """Call Generate tab files subprocess."""
- print(f"Calling {GENERATE_TAB_FILES}")
- out_dir = os.path.join(project_dir, "tabs")
- os.mkdir(out_dir) if not os.path.isdir(out_dir) else None
- exe_ = os.path.join(LOCATION, GENERATE_TAB_FILES)
- cmd = f"{exe_} {project_dir} {out_dir}"
- rc = subprocess.call(cmd, shell=True)
- if rc != 0:
- raise ValueError(f"Command {cmd} died - cannot generate tab files")
- print("5 of 6 tab files generated")
-
-
-def call_gen_plot_files(project_dir):
- """Generate svg plots."""
- print(f"Calling {GENERATE_PLOTS}")
- out_file = os.path.join(project_dir, "tabs", "togaPlot.tab")
- exe_ = os.path.join(LOCATION, GENERATE_PLOTS)
- cmd = f"{exe_} {project_dir} {out_file}"
- rc = subprocess.call(cmd, shell=True)
- if rc != 0:
- raise ValueError(f"Command {cmd} died - cannot generate plots")
- print("All tab files generated")
-
-
-def read_tab_file(tab_file):
- """Make trans_id: data track."""
- trans_to_dat = {}
- f = open(tab_file, "r")
- for line in f:
- line_data = line.rstrip().split("\t")
- trans = line_data[0]
- data = line_data[1:]
- trans_to_dat[trans] = data
- f.close()
- return trans_to_dat
-
-
-
-def __gen_info_placeholder(p_name):
- """Generate info placeholder."""
- t_name = ".".join(p_name.split(".")[:-1])
- placeholder = [t_name, UNDEF, UNDEF, "0.0", "0", "0.0", "0.0", "0.0", "0.0", "0.0", UNDEF]
- return placeholder
-
-
-def make_toga_data(project_dir):
- """Merge togaPlot, togaProt, togaInfo and togaInactFeat tables into togaData."""
- tabs_dir = os.path.join(project_dir, "tabs")
- toga_info_file = os.path.join(tabs_dir, "togaInfo.tab")
- toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab")
- toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab")
- toga_prot_file = os.path.join(tabs_dir, "togaProt.tab")
- out_file = os.path.join(tabs_dir, "togaData.tab")
-
- print("Reading tab files to merge")
- trans_to_info = read_tab_file(toga_info_file)
- trans_to_ifeat = read_tab_file(toga_inact_feat_file)
- trans_to_plot = read_tab_file(toga_plot_file)
- trans_to_prot = read_tab_file(toga_prot_file)
-
- transcripts = trans_to_info.keys()
- print(f"Got data for {len(transcripts)} transcripts")
-
- f = open(out_file, "w")
- for t in transcripts:
- info = trans_to_info.get(t)
- if info is None:
- info = __gen_info_placeholder(t)
- ifeat = trans_to_ifeat.get(t, IFEAT_PLACEHOLDER.copy())
- prot = trans_to_prot.get(t, PROT_PLACEHOLDER.copy())
- plot = trans_to_plot.get(t, PLOT_PLACEHOLDER.copy())
- data_lst = [t] + info + ifeat + prot + plot
- tab_str = "\t".join(data_lst)
- f.write(tab_str)
- f.write("\n")
- f.close()
- # do cleanup
- os.remove(toga_info_file)
- os.remove(toga_inact_feat_file)
- os.remove(toga_plot_file)
- os.remove(toga_prot_file)
- print("Done")
-
-
-def main():
- """Entry point."""
- args = parse_args()
- # call generate_tab_files.py and make_togaPlot.py
- call_gen_tab_files(args.project_dir)
- call_gen_plot_files(args.project_dir)
- # merge 4 transcriptID-related tables into a single one
- make_toga_data(args.project_dir)
-
-
-if __name__ == "__main__":
- main()
diff --git a/ucsc_browser_visualisation/readme.md b/ucsc_browser_visualisation/readme.md
index 4241c7e..c2f53d2 100644
--- a/ucsc_browser_visualisation/readme.md
+++ b/ucsc_browser_visualisation/readme.md
@@ -76,13 +76,22 @@ We will add another condition to handle TOGA tracks.
Put another condition as shown here:
+
```c
-else if (startsWith("HLTOGA", table) && hTableExists(database, "TOGAData"))
-{
- doHillerLabTOGAGene(tdb, item);
-}
+else if (startsWith("HLTOGAannot", trackHubSkipHubName(table)))
+ {
+ doHillerLabTOGAGene(database, tdb, item, table);
+ }
```
+
Then re-build your browser.
### Loading TOGA tables
@@ -92,26 +101,18 @@ Then re-build your browser.
${project_dir}: directory containing TOGA results and intermediate data.
```shell
-./ucsc_browser_visualisation/make_sql_data.py ${project_dir}```
+./ucsc_browser_visualisation/make_bigbed_data_public.py ${project_dir}
# Wait, it will take a few minutes, most likely less than an hour
```
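+
+Note: the script calls the UCSC utilities bedToBigBed and ixIxx, so both must be available in your $PATH.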
-#### Load tab files to browser database
+#### Load track files to the browser
-${tab_files_dir} = ${project_dir}/tabs
-${query} - annotated genome identifier.
-Use *.sql files located in the ucsc_browser_visualisation directory.
-Call the following commands:
+Transfer the created *.bb, *.ix, and *.ixx files to the machine hosting your UCSC genome browser instance, if necessary.
+Create a trackDb entry for the bigBed track: use the bigDataUrl field to point to the bigBed file and the searchTrix field to point to the *.ix file (the *.ixx file does not need to be listed separately; it only has to reside in the same directory as the *.ix file).
-```shell
-hgLoadSqlTab ${query} TOGAData togaData.sql ${tab_files_dir}/togaInfo.tab
-hgLoadSqlTab ${query} TOGANucl togaNucl.sql ${tab_files_dir}/togaNucl.tab
-hgLoadSqlTab ${query} TOGAInactMut togaInactMut.sql ${tab_files_dir}/togaInactMut.tab
+Please also specify the following fields:
```
-
-Also load bed file using hgLoadBed command.
-Create a track starting with HLTOGA, for instance HLTOGAannotation.
-
-```shell script
-hgLoadBed ${query} HLTOGAannotation ${project_dir}/query_annotation.bed
+type bigBed 12 +
+labelFields name
+searchIndex name
```
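+
+For example, a complete stanza could look like this (track name, labels, and file
+locations are placeholders; adjust them to your assembly and directory layout):
+
+```
+track HLTOGAannotVsHg38v1
+bigDataUrl /gbdb/queryAssembly/TOGA/HLTOGAannotVsHg38v1.bb
+searchTrix /gbdb/queryAssembly/TOGA/HLTOGAannotVsHg38v1.ix
+shortLabel TOGA vs. hg38
+longLabel TOGA annotations and gene-loss predictions vs. hg38
+type bigBed 12 +
+labelFields name
+searchIndex name
+visibility pack
+```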
diff --git a/ucsc_browser_visualisation/togaClick.c b/ucsc_browser_visualisation/togaClick.c
index e2f10e6..e3d53c8 100644
--- a/ucsc_browser_visualisation/togaClick.c
+++ b/ucsc_browser_visualisation/togaClick.c
@@ -3,6 +3,44 @@
#include "hgc.h"
#include "togaClick.h"
#include "string.h"
+#include "htmshell.h"
+#include "chromAlias.h"
+
+
+struct togaDataBB *togaDataBBLoad(char **row)
+/* Load a togaDataBB from a row of bigBed extra fields.
+ * Dispose of this with togaDataBBFree(). */
+{
+ struct togaDataBB *ret;
+ AllocVar(ret);
+ ret->projection = cloneString(row[0]);
+ ret->ref_trans_id = cloneString(row[1]);
+ ret->ref_region = cloneString(row[2]);
+ ret->query_region = cloneString(row[3]);
+ ret->chain_score = cloneString(row[4]);
+
+ ret->chain_synteny = cloneString(row[5]);
+ ret->chain_flank = cloneString(row[6]);
+ ret->chain_gl_cds_fract = cloneString(row[7]);
+ ret->chain_loc_cds_fract = cloneString(row[8]);
+ ret->chain_exon_cov = cloneString(row[9]);
+
+ ret->chain_intron_cov = cloneString(row[10]);
+ ret->status = cloneString(row[11]);
+ ret->perc_intact_ign_M = cloneString(row[12]);
+ ret->perc_intact_int_M = cloneString(row[13]);
+ ret->intact_codon_prop = cloneString(row[14]);
+
+ ret->ouf_prop = cloneString(row[15]);
+ ret->mid_intact = cloneString(row[16]);
+ ret->mid_pres = cloneString(row[17]);
+ ret->prot_alignment = cloneString(row[18]);
+ ret->svg_line = cloneString(row[19]);
+ ret->ref_link = cloneString(row[20]);
+ ret->inact_mut_html_table = cloneString(row[21]);
+ ret->exon_ali_html = cloneString(row[22]);
+ return ret;
+}
struct togaData *togaDataLoad(char **row)
@@ -38,6 +76,43 @@ struct togaData *togaDataLoad(char **row)
}
+void togaDataBBFree(struct togaDataBB **pEl)
+/* Free a single dynamically allocated togaDataBB such as created
+ * with togaDataBBLoad(). */
+{
+ struct togaDataBB *el;
+
+ if ((el = *pEl) == NULL) return;
+ freeMem(el->projection);
+ freeMem(el->ref_trans_id);
+ freeMem(el->ref_region);
+ freeMem(el->query_region);
+ freeMem(el->chain_score);
+
+ freeMem(el->chain_synteny);
+ freeMem(el->chain_flank);
+ freeMem(el->chain_gl_cds_fract);
+ freeMem(el->chain_loc_cds_fract);
+ freeMem(el->chain_exon_cov);
+
+ freeMem(el->chain_intron_cov);
+ freeMem(el->status);
+ freeMem(el->perc_intact_ign_M);
+ freeMem(el->perc_intact_int_M);
+ freeMem(el->intact_codon_prop);
+
+ freeMem(el->ouf_prop);
+ freeMem(el->mid_intact);
+ freeMem(el->mid_pres);
+ freeMem(el->prot_alignment);
+ freeMem(el->svg_line);
+ freeMem(el->ref_link);
+ freeMem(el->inact_mut_html_table);
+ freeMem(el->exon_ali_html);
+ freez(pEl);
+}
+
+
void togaDataFree(struct togaData **pEl)
/* Free a single dynamically allocated togaDatasuch as created
* with togaDataLoad(). */
@@ -168,10 +243,207 @@ Prefix must be HLTOGAannot */
}
-void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
+void HLprintQueryProtSeqForAli(char *proteinAlignment) {
+    // Take the HTML-formatted protein sequence alignment and print only the
+    // query sequence: find each "que:" row, skip its label and markup, drop
+    // alignment gaps ('-'), and break the output every 80 residues.
+    char *str = proteinAlignment;
+    int printed_char_num = 0;
+    while ((str = strstr(str, "que:")) != NULL)
+    {
+        str += 10;  // skip the "que:" label and the fixed-width markup after it
+        char ch;
+        while ((ch = *str++) != '<')  // stop at the next HTML tag
+        {
+            if (ch != '-') {  // skip alignment gaps
+                putchar(ch);
+                ++printed_char_num;
+            }
+            if (printed_char_num == 80) {  // wrap after 80 printed residues
+                printed_char_num = 0;
+                printf(" ");
+            }
+        }
+    }
+}
+
+
+
+void doHillerLabTOGAGeneBig(char *database, struct trackDb *tdb, char *item, char *table_name)
+/* Put up TOGA gene track info for a bigBed-based track.
+ * The bigBed record carries, among its extra columns, pre-formatted HTML
+ * strings for the inactivating-mutation table and the exon alignment section. */
+{
+int start = cartInt(cart, "o");
+int end = cartInt(cart, "t");
+char *chrom = cartString(cart, "c");
+char *fileName = bbiNameFromSettingOrTable(tdb, NULL, tdb->table);
+struct bbiFile *bbi = bigBedFileOpenAlias(hReplaceGbdb(fileName), chromAliasFindAliases);
+struct lm *lm = lmInit(0);
+struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
+struct bigBedInterval *bb;
+char *fields[bbi->fieldCount];
+for (bb = bbList; bb != NULL; bb = bb->next)
+ {
+ if (!(bb->start == start && bb->end == end))
+ continue;
+
+ // our names are unique
+ char *name = cloneFirstWordByDelimiterNoSkip(bb->rest, '\t');
+ boolean match = (isEmpty(name) && isEmpty(item)) || sameOk(name, item);
+ if (!match)
+ continue;
+
+ char startBuf[16], endBuf[16];
+ bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bbi->fieldCount);
+ break;
+ }
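+// fields[] now holds the clicked item's record: standard bed columns followed by the TOGA extras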
+
+printf("Projection %s \n", item);
+struct togaDataBB *info = togaDataBBLoad(&fields[11]);  // TODO: verify the offset; fields[0..11] should be the standard bed12 columns
+
+printf("Reference transcript: %s ", info->ref_link);
+printf("Genomic locus in reference: %s \n", info->ref_region);
+printf("Genomic locus in query: %s \n", info->query_region);
+
+printf("Projection classification: %s \n", info->status);
+printf("Probability that query locus is orthologous: %s \n", info->chain_score);
+// list of chain features (for orthology classification)
+printf("Show features used for ortholog probability \n");
+printf("\n");
+printf("
\n");
+printf("Synteny (log10 value): %s \n", info->chain_synteny);
+printf("Global CDS fraction: %s \n", info->chain_gl_cds_fract);
+printf("Local CDS fraction: %s \n", info->chain_loc_cds_fract);
+printf("Local intron fraction: %s \n", info->chain_intron_cov);
+printf("Local CDS coverage: %s \n", info->chain_exon_cov);
+printf("Flank fraction: %s \n", info->chain_flank);
+printf(" \n");
+
+printf("
\n
Feature description: \n");
+printf("For each projection (one reference transcript and one overlapping chain),\n");
+printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n");
+printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n");
+printf("and the respective intergenic regions.\n
\n");
+
+printf("We define the following variables:\n
\n");
+printf("c: number of reference bases in the intersection between chain blocks and coding exons of the gene under consideration. \n");
+printf("C: number of reference bases in the intersection between chain blocks and coding exons of all genes. \n");
+printf("a: number of reference bases in the intersection between chain blocks and coding exons and introns of the gene under consideration. \n");
+printf("A: number of reference bases in the intersection between chain blocks and coding exons and introns of all genes and the intersection\n");
+printf("between chain blocks and intergenic regions (excludes UTRs). \n");
+printf("f: number of reference bases in chain blocks overlapping the 10 kb flanks of the gene under consideration.\n");
+printf("Alignment blocks overlapping exons of another gene that is located in these 10 kb flanks are ignored. \n");
+printf("i: number of reference bases in the intersection between chain blocks and introns of the gene under consideration. \n");
+printf("CDS (coding sequence): length of the coding region of the gene under consideration. \n");
+printf("I: sum of all intron lengths of the gene under consideration. \n");
+printf(" \n");
+printf("Using these variables, TOGA computes the following features:\n");
+printf("
\n");
+printf("“global CDS fraction” as C / A. Chains with a high value have alignments that largely overlap coding exons,");
+printf("which is a hallmark of paralogous or processed pseudogene chains. In contrast, chains with a low value also align many ");
+printf("intronic and intergenic regions, which is a hallmark of orthologous chains. \n");
+printf("“local CDS fraction” as c / a. Orthologous chains tend to have a lower value, as intronic ");
+printf("regions partially align. This feature is not computed for single-exon genes. \n");
+printf("“local intron fraction” as i / I. Orthologous chains tend to have a higher value.");
+printf("This feature is not computed for single-exon genes. \n");
+printf("“flank fraction” as f / 20,000. Orthologous chains tend to have higher values,");
+printf("as flanking intergenic regions partially align. This feature is important to detect orthologous loci of single-exon genes. \n");
+printf("“synteny” as log10 of the number of genes, whose coding exons overlap by at least one base aligning");
+printf("blocks of this chain. Orthologous chains tend to cover several genes located in a conserved order, resulting in higher synteny values. \n");
+printf("“local CDS coverage” as c / CDS, which is only used for single-exon genes. \n");
+printf(" \n");
+
+
+printf("\n
\n \n");
+htmlHorizontalLine();
+
+// show inact mut plot
+printf("Visualization of inactivating mutations on exon-intron structure \n");
+printf("%s \n", info->svg_line);
+printf(" Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in");
+printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion ");
+printf("shifts the reading frame, while blue indicates that exon deletion(s) are framepreserving. \n");
+
+// GLP features
+printf("Show features used for transcript classification \n");
+printf("\n");
+printf("
\n");
+printf("Percent intact, ignoring missing sequence: %s \n", info->perc_intact_ign_M);
+printf("Percent intact, treating missing as intact sequence: %s \n", info->perc_intact_int_M);
+printf("Proportion of intact codons: %s \n", info->intact_codon_prop);
+printf("Percent of CDS not covered by this chain (0 unless the chain covers only a part of the gene): %s \n", info->ouf_prop);
+if (sameWord(info->mid_intact, ONE_))
+{
+ printf("Middle 80 percent of CDS intact: %s \n", YES_);
+} else {
+ printf("Middle 80 percent of CDS intact: %s \n", NO_);
+}
+if (sameWord(info->mid_pres, ONE_))
+{
+ printf("Middle 80 percent of CDS present: %s \n", YES_);
+} else {
+ printf("Middle 80 percent of CDS present: %s \n", NO_);
+}
+printf(" \n
\n \n");
+
+
+htmlHorizontalLine();
+
+printf("Predicted protein sequence \n");
+
+printf("Show protein sequence of query \n");
+printf("\n");
+// printf("{protein seq of the query without dashes or other things. Should end with *}\n");
+printf("");
+HLprintQueryProtSeqForAli(info->prot_alignment);
+printf("\n \n \n
\n");
+
+// and show protein sequence
+htmlHorizontalLine();
+printf("Protein sequence alignment \n");
+printf("Show alignment between reference and query \n");
+printf("\n");
+printf("%s \n", info->prot_alignment);
+printf("
\n \n");
+
+// show inactivating mutations if required
+printf("List of inactivating mutations \n");
+
+printf("Show inactivating mutations \n");
+printf("\n");
+printf("
\n"); // init table
+printf("Exon number Codon number Mutation class Mutation Treated as inactivating Mutation ID \n");
+printf(" \n");
+printf("%s\n", info->inact_mut_html_table);
+printf("
\n");
+printf("
\n \n");
+
+// show exons data
+htmlHorizontalLine();
+printf("Exon alignments \n");
+
+printf("Show exon sequences and features \n");
+printf("\n");
+// printf("%s\n", info->exon_ali_string);
+printf("%s\n", info->exon_ali_html);
+
+htmlHorizontalLine();
+
+// TODO: check whether I need this
+printf("%s", hgTracksPathAndSettings());
+hPrintf("
");
+hPrintf("");
+hPrintf("");
+
+
+printTrackHtml(tdb); // and do I need this?
+}
+
+
+void doHillerLabTOGAGene(char *database, struct trackDb *tdb, char *item, char *table_name)
/* Put up TOGA Gene track info. */
{
- int start = cartInt(cart, "o");
+ //int start = cartInt(cart, "o");
char headerTitle[512];
char suffix[512];
strcpy(suffix, table_name);
@@ -180,6 +452,13 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
genericHeader(tdb, headerTitle);
printf("
TOGA gene annotation \n");
// htmlHorizontalLine();
+
+ if (startsWith("bigBed", tdb->type))
+ {
+ doHillerLabTOGAGeneBig(database, tdb, item, table_name);
+ return;
+ }
+
struct sqlConnection *conn = hAllocConn(database);
// define TOGA table names: initate with pre-defined prefixes
@@ -209,57 +488,103 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
if ((row = sqlNextRow(sr)) != NULL) {
info = togaDataLoad(row); // parse sql output
// fill HTML template:
- printf("
Projected via: %s ",
+ printf("
Reference transcript: %s ",
info->ref_trans_id, info->ref_trans_id);
- printf("
Region in reference: %s
\n", info->ref_region);
- printf("
Region in query: %s
\n", info->query_region);
+ printf("
Genomic locus in reference: %s
\n", info->ref_region);
+ printf("
Genomic locus in query: %s
\n", info->query_region);
- printf("
Projection class: %s
\n", info->status);
- printf("
Chain score: %s
\n", info->chain_score);
+ printf("
Projection classification: %s
\n", info->status);
+ printf("
Probability that query locus is orthologous: %s
\n", info->chain_score);
// list of chain features (for orthology classification)
- printf("
Show chain features for classification \n");
+ printf("
Show features used for ortholog probability \n");
printf("
\n");
printf("
\n");
- printf("Synteny: %s \n", info->chain_synteny);
+ printf("Synteny (log10 value): %s \n", info->chain_synteny);
printf("Global CDS fraction: %s \n", info->chain_gl_cds_fract);
printf("Local CDS fraction: %s \n", info->chain_loc_cds_fract);
printf("Local intron fraction: %s \n", info->chain_intron_cov);
printf("Local CDS coverage: %s \n", info->chain_exon_cov);
printf("Flank fraction: %s \n", info->chain_flank);
+ printf(" \n");
+
+ printf("
\n
Feature description: \n");
+ printf("For each projection (one reference transcript and one overlapping chain),\n");
+ printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n");
+ printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n");
+ printf("and the respective intergenic regions.\n
\n");
+
+ printf("We define the following variables:\n
\n");
+ printf("c: number of reference bases in the intersection between chain blocks and coding exons of the gene under consideration. \n");
+ printf("C: number of reference bases in the intersection between chain blocks and coding exons of all genes. \n");
+ printf("a: number of reference bases in the intersection between chain blocks and coding exons and introns of the gene under consideration. \n");
+ printf("A: number of reference bases in the intersection between chain blocks and coding exons and introns of all genes and the intersection\n");
+ printf("between chain blocks and intergenic regions (excludes UTRs). \n");
+ printf("f: number of reference bases in chain blocks overlapping the 10 kb flanks of the gene under consideration.\n");
+ printf("Alignment blocks overlapping exons of another gene that is located in these 10 kb flanks are ignored. \n");
+ printf("i: number of reference bases in the intersection between chain blocks and introns of the gene under consideration. \n");
+ printf("CDS (coding sequence): length of the coding region of the gene under consideration. \n");
+ printf("I: sum of all intron lengths of the gene under consideration. \n");
+ printf(" \n");
+ printf("Using these variables, TOGA computes the following features:\n");
+ printf("
\n");
+ printf("“global CDS fraction” as C / A. Chains with a high value have alignments that largely overlap coding exons,");
+ printf("which is a hallmark of paralogous or processed pseudogene chains. In contrast, chains with a low value also align many ");
+ printf("intronic and intergenic regions, which is a hallmark of orthologous chains. \n");
+ printf("“local CDS fraction” as c / a. Orthologous chains tend to have a lower value, as intronic ");
+ printf("regions partially align. This feature is not computed for single-exon genes. \n");
+ printf("“local intron fraction” as i / I. Orthologous chains tend to have a higher value.");
+ printf("This feature is not computed for single-exon genes. \n");
+ printf("“flank fraction” as f / 20,000. Orthologous chains tend to have higher values,");
+ printf("as flanking intergenic regions partially align. This feature is important to detect orthologous loci of single-exon genes. \n");
+ printf("“synteny” as log10 of the number of genes, whose coding exons overlap by at least one base aligning");
+ printf("blocks of this chain. Orthologous chains tend to cover several genes located in a conserved order, resulting in higher synteny values. \n");
+ printf("“local CDS coverage” as c / CDS, which is only used for single-exon genes. \n");
+ printf(" \n");
+
+
printf("\n
\n
\n");
htmlHorizontalLine();
// show inact mut plot
- printf("
Inactivating mutations plot \n");
+ printf("
Visualization of inactivating mutations on exon-intron structure \n");
printf("%s
\n", info->svg_line);
+ printf("
Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in");
+ printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion ");
+ printf("shifts the reading frame, while blue indicates that exon deletion(s) are framepreserving.
\n");
// GLP features
- printf("
Show GLP features \n");
+ printf("
Show features used for transcript classification \n");
printf("
\n");
printf("
\n");
- printf("Percent intact ignoring missing seq: %s \n", info->perc_intact_ign_M);
- printf("Percent intact (miss == intact): %s \n", info->perc_intact_int_M);
- printf("Intact codon proportion %s \n", info->intact_codon_prop);
- printf("Out of chain proportion: %s \n", info->ouf_prop);
+ printf("Percent intact, ignoring missing sequence: %s \n", info->perc_intact_ign_M);
+ printf("Percent intact, treating missing as intact sequence: %s \n", info->perc_intact_int_M);
+ printf("Proportion of intact codons: %s \n", info->intact_codon_prop);
+ printf("Percent of CDS not covered by this chain (0 unless the chain covers only a part of the gene): %s \n", info->ouf_prop);
if (sameWord(info->mid_intact, ONE_))
{
- printf("Middle 80 percent intact: %s \n", YES_);
+ printf("Middle 80 percent of CDS intact: %s \n", YES_);
} else {
- printf("Middle 80 percent intact: %s \n", NO_);
+ printf("Middle 80 percent of CDS intact: %s \n", NO_);
}
if (sameWord(info->mid_pres, ONE_))
{
- printf("Middle 80 percent present: %s \n", YES_);
+ printf("Middle 80 percent of CDS present: %s \n", YES_);
} else {
- printf("Middle 80 percent present: %s \n", NO_);
+ printf("Middle 80 percent of CDS present: %s \n", NO_);
}
printf(" \n
\n
\n");
+ printf("
Query protein sequence ");
+
+ printf("
Show protein sequence of query \n");
+ printf("
\n");
+ printf("{protein seq of the query without dashes or other things. Should end with *}\n");
+ printf(" \n \n
\n");
// and show protein sequence
htmlHorizontalLine();
- printf("
Protein sequence \n");
- printf("
Show protein alignment \n");
- printf("
\n");
+ printf("
Protein sequence alignment \n");
+ printf("
Show alignment between reference and query \n");
+ printf("
\n");
printf("%s \n", info->prot_alignment);
printf("
\n
\n");
@@ -273,7 +598,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
}
// show inactivating mutations if required
- printf("
Inactivating mutations \n");
+ printf("
List of inactivating mutations \n");
if (hTableExists(database, togaInactMutTableName))
{
@@ -285,7 +610,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
printf("
Show inactivating mutations \n");
printf("
\n");
printf("
\n"); // init table
- printf("exon pos m_class mut is_inact mut_id \n");
+ printf("Exon number Codon number Mutation class Mutation Treated as inactivating Mutation ID \n");
printf(" \n");
while ((row = sqlNextRow(sr)) != NULL)
{
@@ -314,7 +639,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
// show exons data
htmlHorizontalLine();
- printf("Exons data \n");
+ printf("Exon alignments \n");
if (hTableExists(database, togaNuclTableName))
{
@@ -323,8 +648,6 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
char **row;
printf("Show exon sequences and features \n");
printf("\n");
- // TODO: make sure this clause is not necessary
- // order by exon_num
sqlSafef(query, sizeof(query), "select * from %s where transcript='%s'", togaNuclTableName, item);
sr = sqlGetResult(conn, query);
@@ -342,13 +665,13 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name)
}
printf("Exon alignment class: %s \n", info->ali_class);
if (sameWord(info->in_exp_region, ONE_)){
- printf("Detected within expected region: %s \n", YES_);
+ printf("Detected within expected region (%s): %s \n", info->exp_region, YES_);
} else {
- printf("Detected within expected region: %s \n", NO_);
+ printf("Detected within expected region (%s): %s \n", info->exp_region, NO_);
}
- printf("Expected region: %s \n", info->exp_region);
+ // printf("Expected region: %s \n", info->exp_region);
printf(" \n");
- printf("Sequence alignment: \n");
+ printf("Sequence alignment between reference and query exon: \n");
printf("%s \n", info->alignment);
togaNuclFree(&info);
}
diff --git a/ucsc_browser_visualisation/togaClick.h b/ucsc_browser_visualisation/togaClick.h
index 00cecee..c94e80b 100644
--- a/ucsc_browser_visualisation/togaClick.h
+++ b/ucsc_browser_visualisation/togaClick.h
@@ -15,6 +15,35 @@
#define HLTOGA_MAXCHAR 255
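+
+/* Extra (non-bed) fields of a TOGA bigBed record, in the order written by
+ * make_bigbed_data_public.py and consumed by togaDataBBLoad(). */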
+struct togaDataBB
+{
+ char *projection;
+ char *ref_trans_id;
+ char *ref_region;
+ char *query_region;
+ char *chain_score;
+ char *chain_synteny;
+ char *chain_flank;
+ char *chain_gl_cds_fract;
+ char *chain_loc_cds_fract;
+ char *chain_exon_cov;
+ char *chain_intron_cov;
+ char *status;
+ char *perc_intact_ign_M;
+ char *perc_intact_int_M;
+ char *intact_codon_prop;
+ char *ouf_prop;
+ char *mid_intact;
+ char *mid_pres;
+ char *prot_alignment;
+ char *svg_line;
+ char *ref_link;
+ char *inact_mut_html_table;
+ char *exon_ali_html;
+};
+
+
+
struct togaData
{
char *projection;
@@ -67,10 +96,19 @@ struct togaInactMut
};
+struct togaDataBB *togaDataBBLoad(char **row);
+/* Load a togaDataBB from a row of bigBed extra fields.
+ * Dispose of this with togaDataBBFree(). */
+
+
struct togaData *togaDataLoad(char **row);
/* Load a togaData from row fetched with select * from togaData
* from database. Dispose of this with togaDataFree(). */
+void togaDataBBFree(struct togaDataBB **pEl);
+/* Free a single dynamically allocated togaDataBB such as created
+ * with togaDataBBLoad(). */
+
void togaDataFree(struct togaData **pEl);
/* Free a single dynamically allocated togaDatasuch as created
* with togaDataLoad(). */
@@ -95,7 +133,8 @@ void extractHLTOGAsuffix(char *suffix);
/* Extract suffix from TOGA table name.
Prefix must be HLTOGAannot */
-void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name);
+void doHillerLabTOGAGene(char *database, struct trackDb *tdb, char *item, char *table_name);
/* Put up TOGA Gene track info. */
#endif // TOGACLICK_H
diff --git a/ucsc_browser_visualisation/togaData.sql b/ucsc_browser_visualisation/togaData.sql
deleted file mode 100644
index ab4415e..0000000
--- a/ucsc_browser_visualisation/togaData.sql
+++ /dev/null
@@ -1,23 +0,0 @@
-CREATE TABLE TOGAData (
- transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
- ref_transcript_ID varchar(50) not null, -- transcript ID in reference
- ref_region varchar(100) not null, -- transcript region in reference
- query_region varchar(100) not null, -- projection region in query
- chain_score float not null, -- chain orthology score
- chain_synteny int unsigned not null, -- chain synteny
- chain_flank float not null, -- flank coverage
- chain_gl_cds_fract float not null, -- global CDS fraction
- chain_loc_cds_fract float not null, -- local CDS fraction
- chain_exon_cov float not null, -- local CDS coverage
- chain_intron_cov float not null, -- local intron coverage
- status varchar(24) not null, -- projection GLP status: loss, intact, etc
- perc_intact_ign_M float not null, -- %intact ignoring Missing
- perc_intact_int_M float not null, -- %intact considering missing seq intact
- intact_codon_prop float not null, -- % of intact codons
- ouf_prop float not null, -- out of chain proportion
- mid_intact tinyint unsigned not null, -- middle 80% intact? 1 - True 0 - False, else - undefined
- mid_pres tinyint unsigned not null, -- middle 80% present? 1 - True 0 - False, else - undefined
- prot_sequence longblob not null, -- protein sequence
- svg_plot longblob not null, -- svg string
- PRIMARY KEY(transcript)
-);
diff --git a/ucsc_browser_visualisation/togaInactMut.sql b/ucsc_browser_visualisation/togaInactMut.sql
deleted file mode 100644
index eb53d35..0000000
--- a/ucsc_browser_visualisation/togaInactMut.sql
+++ /dev/null
@@ -1,9 +0,0 @@
-CREATE TABLE TOGAInactMut (
- transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
- exon_num int unsigned not null, -- exon number
- position int unsigned not null, -- possition where mutation happened
- mut_class varchar(15) not null, -- mutation class such as FS deletion
- mutation varchar(20) not null, -- what exactly happened
- is_inact tinyint unsigned not null, -- is this mutation inactivating, yes 1 or not 0
- mut_id varchar(10) not null -- mut identifier
-);
diff --git a/ucsc_browser_visualisation/togaNucl.sql b/ucsc_browser_visualisation/togaNucl.sql
deleted file mode 100644
index 9d2e189..0000000
--- a/ucsc_browser_visualisation/togaNucl.sql
+++ /dev/null
@@ -1,12 +0,0 @@
-CREATE TABLE TOGANucl (
- transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
- exon_num int unsigned not null, -- exon number
- exon_region varchar(100) not null, -- region where exon was detected
- pid float not null, -- nucleotide %id
- blosum float not null, -- normalized blosum score
- gaps tinyint unsigned not null, -- are there any asm gaps near? 1 - yes 0 - no
- ali_class varchar(4) not null, -- alignemnt class: A, B, C, A+
- exp_region varchar(50) not null, -- where exon was expected
- in_exp_region tinyint unsigned not null, -- detected in expected region or not 1 yes 0 no
- alignment longblob not null -- exon sequence in query
-);