diff --git a/CESAR_wrapper.py b/CESAR_wrapper.py
index d16c058..90fc6ed 100755
--- a/CESAR_wrapper.py
+++ b/CESAR_wrapper.py
@@ -60,6 +60,8 @@
 EXP_REG_EXTRA_FLANK = 50
 EXON_SEQ_FLANK = 10
 
+ERR_CODE_FRAGM_ERR = 2
+
 # alias; works for Hillerlab-only
 two_bit_templ = "/projects/hillerlab/genome/gbdb-HL/{0}/{0}.2bit"
 chain_alias_template = "/projects/hillerlab/genome/gbdb-HL/{0}/lastz/vs_{1}/axtChain/{0}.{1}.allfilled.chain.gz"
@@ -82,6 +84,8 @@
     LOCATION, "modules", "extract_subchain_slib.so"
 )
 DEFAULT_CESAR = os.path.join(LOCATION, "CESAR2.0", "cesar")
+OPT_CESAR_LOCATION = os.path.join(LOCATION, "cesar_input_optimiser.py")
+
 ex_lib = ctypes.CDLL(extract_subchain_lib_path)
 ex_lib.extract_subchain.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]
 ex_lib.extract_subchain.restype = ctypes.POINTER(ctypes.c_char_p)
@@ -176,6 +180,8 @@
 EQ_ACC_PROFILE = os.path.join(LOCATION, "supply", "eq_acc_profile.txt")
 EQ_DO_PROFILE = os.path.join(LOCATION, "supply", "eq_donor_profile.txt")
 
+ORTH_LOC_LINE_SUFFIX = "#ORTHLOC"
+
 FRAGMENT = -1
@@ -359,7 +365,8 @@ def parse_args():
     app.add_argument(
         "--exon_flanks_file",
         default=None,
-        help="Temporary file containing exon flanks ",
+        help=("Temporary file containing exon flanks; use only if you run "
+              "the optimized CESAR version"),
     )
     app.add_argument(
         "--predefined_regions",
@@ -369,7 +376,44 @@
         "for CESAR optimisation. If set: call optimised CESAR without LASTZ",
     )
     app.add_argument(
-        "--opt_precompute", action="store_true", dest="opt_precompute", help="later"
+        "--opt_precompute",
+        action="store_true",
+        dest="opt_precompute",
+        help="Only precompute memory consumption for the optimized CESAR version"
+    )
+    app.add_argument(
+        "--save_orth_locus",
+        action="store_true",
+        dest="save_orth_locus",
+        help="Only precompute orthologous regions for the optimized CESAR version"
+    )
+    app.add_argument(
+        "--precomputed_orth_loci",
+        default=None,
+        help="Path to loci saved with --save_orth_locus"
+    )
+    app.add_argument(
+        "--do_not_check_exon_chain_intersection",
+        default=False,
+        action="store_true",
+        dest="do_not_check_exon_chain_intersection",
+        help="Do not extract chain blocks (not recommended)"
+    )
+    app.add_argument(
+        "--alt_frame_del",
+        "--lfd",
+        default=False,
+        action="store_true",
+        dest="alt_frame_del",
+        help="Consider codons in alternative frame (between compensated FS) deleted"
+    )
+    app.add_argument(
+        "--mask_all_first_10p",
+        "--m_f10p",
+        action="store_true",
+        dest="mask_all_first_10p",
+        help="Automatically mask all inactivating mutations in the first 10%% of "
+             "the reading frame, ignoring the ATG codons distribution."
     )

     if len(sys.argv) == 1:
@@ -1415,6 +1459,8 @@ def run_cesar(
         cesar_cmd += " --no_lastz"
     if opt_precompute:
         cesar_cmd += " --memory_consumption"
+
+    verbose(f"Calling CESAR command:\n{cesar_cmd}")
     p = subprocess.Popen(
         cesar_cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
     )
@@ -1880,9 +1926,10 @@ def process_cesar_out(cesar_raw_out, query_loci, inverts):
         # extract protein sequences also here, just to do it in one place
         part_pIDs, part_blosums, prot_seqs_part = compute_score(codons_data)
         prot_seqs[chain_id] = prot_seqs_part
-        # convert codon list to {"ref": [t_codons], "que": [q_codons]}
+        # convert codon list to {"ref": [t_codons], "que": [q_codons]}
         codon_seqs_part = extract_codon_data(codons_data)
         codon_seqs[chain_id] = codon_seqs_part
+
         (
             exon_query_seqs,
             exon_ref_seqs,
@@ -1925,6 +1972,7 @@
         percIDs[chain_id] = part_pIDs
         blosums[chain_id] = part_blosums
         query_coords[chain_id] = abs_coords
+
     ret = (
         exon_queries,
         exon_refs,
@@ -1940,6 +1988,18 @@
     return ret


+def __check_fragm_coords_intersect_in_q(query_coords):
+    """Check whether coordinates of predicted exons intersect in the query."""
+    exon_ranges_w_chrom = list(query_coords.values())
+    exon_ranges = [x.split(":")[1].split("-") for x in exon_ranges_w_chrom]
+    exon_starts = [int(x[0]) for x in exon_ranges]
+    exon_ends = [int(x[1]) for x in exon_ranges]
+    exon_intervals = list(zip(exon_starts, exon_ends))
+    exon_intervals_sorted = sorted(exon_intervals, key=lambda x: x[0])
+    intersecting_intervals = _check_seq_of_intervals_intersect(exon_intervals_sorted)
+    return intersecting_intervals
+
+
 def process_cesar_out__fragments(cesar_raw_out, fragm_data, query_loci, inverts):
     """Process CESAR output for a gene assembled from fragments."""
     exon_queries, exon_refs, percIDs, blosums, query_coords, = (
@@ -1953,7 +2013,7 @@
     cesar_raw_lines = cesar_raw_out.split("\n")
     target_seq_raw = cesar_raw_lines[1]
     query_seq_raw = cesar_raw_lines[3]
-    chain_id_to_codon_table = {}
+    chain_id_to_codon_table = {}  # chain id -> raw codon table
     codons_data = parse_cesar_out(target_seq_raw, query_seq_raw, v=VERBOSE)
     chain_id_to_codon_table[FRAGMENT] = codons_data
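A note on the overlap check introduced here: `__check_fragm_coords_intersect_in_q` sorts the predicted exon intervals by start coordinate and delegates to `_check_seq_of_intervals_intersect` (added further down in this patch), which only needs to compare each interval with its immediate successor. A minimal sketch with invented query coordinates:

    def _check_seq_of_intervals_intersect(intervals):
        # assumes intervals are pre-sorted by start coordinate
        for i in range(len(intervals) - 1):
            curr_one = intervals[i]
            next_one = intervals[i + 1]
            if next_one[0] < curr_one[1]:  # next starts before current ends
                return (curr_one[0], curr_one[1])
        return None

    exons = sorted([(462931, 463100), (463050, 463400)])  # toy intervals
    assert _check_seq_of_intervals_intersect(exons) == (462931, 463100)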
@@ -2031,6 +2091,14 @@
             # this is completely OK
             exon_grange = f"{q_chrom}:{exon_abs_start}-{exon_abs_end}"
             abs_coords[exon_num] = exon_grange
+
+    # check that abs coords do not intersect
+    coords_intersect = __check_fragm_coords_intersect_in_q(abs_coords)
+    if coords_intersect:  # error, cannot go further
+        print("Error! Cannot stitch fragments properly, exon intervals intersect after merge")
+        print(f"Intersecting intervals are: {coords_intersect}")
+        print("Abort")
+        sys.exit(ERR_CODE_FRAGM_ERR)
     # add to the global dicts
     exon_queries[FRAGMENT] = exon_query_seqs
     exon_refs[FRAGMENT] = exon_ref_seqs
@@ -2048,7 +2116,7 @@
         prot_seqs,
         codon_seqs,
         aa_sat_seq,
-        chain_id_to_codon_table
+        chain_id_to_codon_table,
     )
     return ret
@@ -2185,10 +2253,12 @@ def invert_complement(seq):
     return reverse_complement


-def save_prot(gene_name, prot_seq, prot_out):
+def save_prot(gene_name, prot_seq, prot_out, del_mis_exons_):
     """Save protein sequences."""
     if prot_out is None:
         return
+    del_mis_exons = set() if del_mis_exons_ is None else set(del_mis_exons_)
+
     f = open(prot_out, "w") if prot_out != "stdout" else sys.stdout
     for chain_id, seq_collection in prot_seq.items():
         proj_name = f"{gene_name}.{chain_id}"
@@ -2197,8 +2267,11 @@
         que_seqs = []
         for exon in exons:
             ref_part = seq_collection[exon]["ref"]
-            que_part = seq_collection[exon]["que"]
             ref_seqs.append(ref_part)
+            if exon in del_mis_exons:
+                que_part = ["-" for _ in ref_part]
+            else:
+                que_part = seq_collection[exon]["que"]
             que_seqs.append(que_part)
         ref_aa_seq = "".join(ref_seqs)
         que_aa_seq = "".join(que_seqs)
@@ -2353,7 +2426,7 @@ def extend_rel_regions(extended_regions, query_len):
     return ext_regions


-def read_predefined_regions(arg):
+def read_predefined_regions(arg, gene):
     """Read predefined regions."""
     ret = defaultdict(list)
     if arg is None:
@@ -2361,23 +2434,67 @@
     f = open(arg, "r")
     for line in f:
         line_data = line.rstrip().split("\t")
-        line_strip = line.rstrip()
-        chain_id = int(line_data[0])
-        ret[chain_id].append(line_strip)
+        line_gene = line_data[0]
+        if line_gene != gene:
+            continue
+        chain_id = int(line_data[1])
+        start = line_data[2]
+        end = line_data[3]
+        line_to_out = f"{chain_id}\t{start}\t{end}"
+        ret[chain_id].append(line_to_out)
     f.close()
     return ret


+def _check_seq_of_intervals_intersect(intervals):
+    intervals_num = len(intervals)
+    for i in range(intervals_num - 1):
+        # (start, end)
+        curr_one = intervals[i]
+        next_one = intervals[i + 1]
+        # sorted by beginning
+        # if start of the next < end of the curr
+        # -> they intersect
+        if next_one[0] < curr_one[1]:
+            return (curr_one[0], curr_one[1])
+    return None  # nothing suspicious found
+
+
+def parse_precomp_orth_loci(transcript_name, path):
+    """Read precomputed orthologous loci from a file."""
+    ret_1 = {}  # chain to search locus
+    ret_2 = {}  # chain to subchain locus
+
+    f = open(path, "r")
+    # sample file line:
+    # #ORTHLOC	ENST00000262455	1169	JH567521:462931-522892	JH567521:462931-522892
+    # marker - transcript - chain - search locus - subchain locus
+    for line in f:
+        ld = line.rstrip().split("\t")
+        if ld[1] != transcript_name:
+            continue
+        chain_id = int(ld[2])
+        ret_1[chain_id] = ld[3]
+        ret_2[chain_id] = ld[4]
+    f.close()
+
+    return ret_1, ret_2
+
+
 def redo_codon_sequences(codon_tables, del_mis_exons):
     """Rebuild codon alignments: now excluding deleted and missing exons."""
-    ret = {}
+    codon_ret = {}
     for chain_id, codon_table in codon_tables.items():
         excl_exons = del_mis_exons.get(str(chain_id), set())
         codon_seqs_upd = extract_codon_data(codon_table, excl_exons=excl_exons)
-        ret[chain_id] = codon_seqs_upd
-    return ret
+        codon_ret[chain_id] = codon_seqs_upd
+    return codon_ret


+def extract_prot_sequences_from_codon(codon_s):
+    """Extract protein sequences from codon sequences."""
+    # TODO: not implemented yet, returns an empty placeholder
+    return []
+
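`parse_precomp_orth_loci` above and the `--save_orth_locus` branch inside `realign_exons` below are two halves of one round trip: the precompute step prints `#ORTHLOC`-prefixed lines to stdout, and later runs read them back. A sketch of the format, reusing the sample line from the comment above:

    line = "#ORTHLOC\tENST00000262455\t1169\tJH567521:462931-522892\tJH567521:462931-522892"
    ld = line.rstrip().split("\t")
    # ld[0] is the ORTH_LOC_LINE_SUFFIX marker; the rest follows the documented order
    transcript_name, chain_id = ld[1], int(ld[2])
    search_locus, subch_locus = ld[3], ld[4]
    assert search_locus == "JH567521:462931-522892"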
 def realign_exons(args):
     """Entry point."""
@@ -2419,11 +2536,26 @@
     aa_block_sat_chain = {}  # one of dicts to mark exceptionally good exon predictions
     fragments_data = []  # required for fragmented genomes
     chains_in_input = True
-    force_include_regions_opt_v = (
-        []
-    )  # regions we force to include in the optimized version
+
+    # regions we force to include in the optimized version
+    # these regions != orthologous loci in the query; they are used in the optimised
+    # CESAR version as regions within the orthologous loci that must be included
+    # in the CESAR run itself
+    force_include_regions_opt_v = []
+
+    verbose("Reading query regions")
-    chain_to_predefined_regions = read_predefined_regions(args["predefined_regions"])
+    chain_to_predefined_regions = read_predefined_regions(args["predefined_regions"], args["gene"])
+
+    # no need to extract chain blocks and compute where the orthologous locus is:
+    # it's already precomputed
+    chain_to_precomp_search_loci = {}
+    chain_to_precomp_subch_loci = {}
+    if args["precomputed_orth_loci"]:
+        plc_ = parse_precomp_orth_loci(args["gene"], args["precomputed_orth_loci"])
+        chain_to_precomp_search_loci = plc_[0]
+        chain_to_precomp_subch_loci = plc_[1]
+
     # if chains and args["fragments"]:
     # the normal branch: call CESAR vs 1+ query sequences
     for chain_id in chains:  # in region mode this part is skipped
@@ -2435,9 +2567,34 @@
         # most likely we need only the chain part that intersects the gene
         # and skip the rest:
         verbose("Cutting the chain...")
-        search_locus, subch_locus, chain_data = chain_cut(
-            chain_str, gene_range, args["gene_flank"], args["extra_flank"]
-        )
+
+        if args["precomputed_orth_loci"]:
+            search_locus = chain_to_precomp_search_loci[chain_id]
+            subch_locus = chain_to_precomp_subch_loci[chain_id]
+            # TODO: chain_data contains unnecessary information
+            # it's already present in the chain_header variable
+            # can be optimised (historical reasons)
+            _t_strand = True if chain_header[2] == "+" else False
+            _q_strand = True if chain_header[7] == "+" else False
+            _t_size = int(chain_header[3])
+            _q_size = int(chain_header[8])
+            chain_data = (_t_strand, _t_size, _q_strand, _q_size)
+        else:
+            search_locus, subch_locus, chain_data = chain_cut(
+                chain_str, gene_range, args["gene_flank"], args["extra_flank"]
+            )
+
+        if args["save_orth_locus"]:
+            # write STDOUT LINE about orthologous locus
+            # warning: to be used only on the precompute step
+            # do not use it for main CESAR calls (those that actually run CESAR)
+            # if you still want to do this in the main pipeline: exclude lines starting
+            # with ORTH_LOC_LINE_SUFFIX from the CESAR wrapper output
+            g_ = args["gene"]
+            line = f"{ORTH_LOC_LINE_SUFFIX}\t{g_}\t{chain_id}\t{search_locus}\t{subch_locus}\n"
+            sys.stdout.write(line)
+
+        # chain data: t_strand, t_size, q_strand, q_size
         chain_qStrand = chain_data[2]
         chain_qSize = chain_data[3]
@@ -2470,6 +2627,8 @@
             _query_seq_ext, subch_locus, args["gap_size"], directed
         )
         # blocks are [target_start, target_end, query_start, query_end]
+        # TODO: can be optimised here
+        # and also can be written to log - if same chain & bed - same output
         subchain_blocks_raw = extract_subchain(chain_str, subch_locus)
         # swap blocks in correct orientation and fill interblock ranges
         subchain_blocks = orient_blocks(subchain_blocks_raw, chain_data)
@@ -2581,6 +2740,7 @@
         fragments_data = sorted(fragments_data, key=lambda x: x[5])
     else:  # gene is - -> reverse sort of chains
         fragments_data = sorted(fragments_data, key=lambda x: x[6], reverse=True)
+
     # merge query feat dictionaries
     exon_gap = merge_dicts(chain_exon_gap.values())
     exon_class = merge_dicts(chain_exon_class.values())
@@ -2588,6 +2748,7 @@
     aa_block_sat = merge_dicts(aa_block_sat_chain.values())
     missing_exons = intersect_lists(chain_missed.values())
+
     query_seq_chunks = []
     for elem in fragments_data:
         # stitch query seq in a proper order; elem[0] -> chain_id
@@ -2643,7 +2804,18 @@
     append_u12(args["u12"], args["gene"], ref_ss_data)
     make_cesar_in(prepared_exons, query_sequences, cesar_in_filename, ref_ss_data)
     # run cesar itself
+    # TODO: exclude later
+    # print(force_include_regions_opt_v)
     cesar_bin = args.get("cesar_binary") if args.get("cesar_binary") else DEFAULT_CESAR
+    if args.get("cesar_binary"):
+        cesar_bin = args.get("cesar_binary")
+    elif args["opt_cesar"] is True:
+        cesar_bin = OPT_CESAR_LOCATION
+    elif args["opt_precompute"]:
+        cesar_bin = OPT_CESAR_LOCATION
+    else:
+        cesar_bin = DEFAULT_CESAR
+
     if not args["cesar_output"]:
         cesar_raw_out = run_cesar(
             cesar_in_filename,
@@ -2662,8 +2834,6 @@
     else:  # very specific case, load already saved CESAR output
         with open(args["cesar_output"], "r") as f:
             cesar_raw_out = f.read()
-    # if force_include_reg_file:
-    #     os.remove(force_include_reg_file) if os.path.isfile(force_include_reg_file) else None
     os.remove(cesar_in_filename) if is_temp else None  # wipe temp if temp
     # save raw CESAR output and close if required
     save(cesar_raw_out, args["raw_output"], t0) if args["raw_output"] else None
@@ -2682,13 +2852,12 @@
     query_coords = proc_out[4]  # genomic coordinates in the query
     exon_num_corr = proc_out[5]  # in case of intron del: ref/que correspondence
     prot_s = proc_out[6]  # protein sequences in query
-    codon_s = proc_out[7]
+    codon_s = proc_out[7]  # dict containing sequences of ref and query codons
     aa_cesar_sat = proc_out[8]  # says whether an exon has outstanding quality
-    # raw codon table \ superset of "codon_s" basically
+    # raw codon table \ superset of "codon_s" basically
     # after changes in TOGA1.1 is needed again
     # TODO: needs refactoring
     codon_tables = proc_out[9]
-
     aa_eq_len = aa_eq_len_check(exon_sequences, query_exon_sequences)

     if chains:
@@ -2697,6 +2866,7 @@
         chain_exon_class = get_a_plus(
             chain_exon_class, aa_cesar_sat, aa_block_sat_chain, aa_eq_len
         )
+
     # time to arrange all these data altogether
     final_output, chain_ex_inc = arrange_output(
         args["gene"],
@@ -2734,21 +2904,25 @@
             ref_ss=ref_ss_data,
             sec_codons=sec_codons,
             no_fpi=args["no_fpi"],
+            alt_f_del=args["alt_frame_del"],
+            mask_all_first_10p=args["mask_all_first_10p"],
         )
     else:  # do not call inact mut scanner
         loss_report = None
         del_mis_exons = None
-
+
     # del_mis_exons contains:
     # chain_id(string) = [0-based exon nums]
-    if del_mis_exons is not None and len(del_mis_exons.keys()) > 0:
+    need_correct_codon_and_prot = del_mis_exons is not None and len(del_mis_exons.keys()) > 0
+    if need_correct_codon_and_prot:
         # if exists, need to filter codon alignment accordingly
         # chain id is numeric in "codon_table"
         codon_s = redo_codon_sequences(codon_tables, del_mis_exons)
+        prot_s = extract_prot_sequences_from_codon(codon_s)

     # save protein/codon ali and text output
-    save_prot(args["gene"], prot_s, args["prot_out"])
+    # save_prot(args["gene"], prot_s, args["prot_out"], del_mis_exons)
     save_codons(args["gene"], codon_s, args["codon_out"])
     save(final_output, args["output"], t0, loss_report)
     sys.exit(0)
diff --git a/cesar_runner.py b/cesar_runner.py
index fcf0b72..f59999a 100755
--- a/cesar_runner.py
+++ b/cesar_runner.py
@@ -12,6 +12,9 @@
 __credits__ = ["Michael Hiller", "Virag Sharma", "David Jebb"]

 MAX_ATTEMPTS = 2
+ZERO_CODE = 0
+ERR_CODE = 1
+FRAGM_CHAIN_ISSUE_CODE = 2


 def parse_args():
@@ -23,6 +26,7 @@
         "--check_loss", default=None, help="File to save gene loss data if requested"
     )
     app.add_argument("--rejected_log", default=None, help="Log gene rejection events")
+    app.add_argument("--unproc_log", "--ul", default=None, help="Log unprocessed genes")
     # print help if there are no args
     if len(sys.argv) < 2:
         app.print_help()
@@ -43,13 +47,22 @@ def call_job(cmd):
         rc = p.returncode
         cmd_out = b_stdout.decode("utf-8")
         err_msg = b_stderr.decode("utf-8").replace("\n", " ")
-        if rc == 0:
-            return cmd_out, 0
+        if rc == ZERO_CODE:
+            return cmd_out, ZERO_CODE
+        elif rc == FRAGM_CHAIN_ISSUE_CODE:
+            err_msg = f"CESAR_wrapper.py detected that fragments overlap for {cmd}, abort"
+            return err_msg, FRAGM_CHAIN_ISSUE_CODE
         else:
             eprint(err_msg)
             eprint(f"\n{cmd} FAILED")
             attempts += 1
-    return err_msg, 1  # send failure signal
+    return err_msg, ERR_CODE  # send failure signal
+
+
+def __job_to_transcript(job):
+    """Extract transcript ID from job."""
+    fields = job.split()
+    return fields[1]


 def main():
@@ -60,6 +73,7 @@
         # text file, a command per line
         jobs = [x.rstrip() for x in f.readlines()]
     jobs_num = len(jobs)
+    unprocessed_genes = []

     out = open(args.output, "w")  # handle output file
     gene_loss_data = []  # list to keep gene loss detector out
@@ -69,6 +83,12 @@
         eprint(f"Calling:\n{job}")
         # catch job stdout
         job_out, rc = call_job(job)
+        if rc == FRAGM_CHAIN_ISSUE_CODE:
+            # very special case -> nothing we can do
+            # mark as missing, I guess
+            rejected.append(f"{job}\tfragment chains overlap\n")
+            unprocessed_genes.append(__job_to_transcript(job))
+            continue
         if rc == 1:
             # a job failed with code 1 -> send the signal upstream
             # abort execution, write what job exactly failed
@@ -136,6 +156,11 @@
         f.write("".join(rejected))
         f.close()

+    if args.unproc_log and len(unprocessed_genes) > 0:
+        f = open(args.unproc_log, "w")
+        for elem in unprocessed_genes:
+            f.write(f"{elem}\n")
+        f.close()

 if __name__ == "__main__":
     main()
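The three constants added here spell out the exit-code contract with CESAR_wrapper.py, which now exits with ERR_CODE_FRAGM_ERR (2) when stitched fragment exons overlap. For illustration, a compact summary of how `main()` reacts to each code; this condenses logic visible in the hunks above rather than adding new behaviour:

    EXIT_CODE_TO_ACTION = {
        0: "ZERO_CODE: job succeeded, keep stdout, write results",
        1: "ERR_CODE: hard failure, abort and signal upstream",
        2: "FRAGM_CHAIN_ISSUE_CODE: fragment chains overlap, log the "
           "transcript to --rejected_log / --unproc_log and continue",
    }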
"FS_DEL" FS_INS = "FS_INS" BIG_DEL = "BIG_DEL" diff --git a/modules/classify_chains.py b/modules/classify_chains.py index 3ec1255..e7e080d 100755 --- a/modules/classify_chains.py +++ b/modules/classify_chains.py @@ -91,6 +91,7 @@ def classify_chains( # -> then this is a proc pseudogene # move trans chains to a different dataframe + # TODO: rename trans -> spanning # trans chain -> a syntenic chain that passes throw the gene body # but has no aligning bases in the CDS trans_lines = df[(df["exon_cover"] == 0) & (df["synt"] > 1)] diff --git a/modules/gene_losses_summary.py b/modules/gene_losses_summary.py index fee28a9..7d21130 100755 --- a/modules/gene_losses_summary.py +++ b/modules/gene_losses_summary.py @@ -144,13 +144,13 @@ def read_loss_data(loss_dir): perc = float(line_data[2].split()[1]) projection_to_p_intact_M_intact[projection_id] = perc continue - elif line_data[2].startswith("MIDDLE_80%_INTACT"): - # flag: are there inact mutations in the middle 80% of CDS? + elif line_data[2].startswith("MIDDLE_IS_INTACT"): + # flag: are there inact mutations in the first 90%/mid 80% of CDS? raw_val = line_data[2].split()[1] val = True if raw_val == "TRUE" else False proj_to_80_p_intact[projection_id] = val continue - elif line_data[2].startswith("MIDDLE_80%_PRESENT"): + elif line_data[2].startswith("MIDDLE_IS_PRESENT"): # flag: any missing fragment in the middle 80% if CDS? raw_val = line_data[2].split()[1] val = True if raw_val == "TRUE" else False @@ -246,6 +246,7 @@ def get_projection_classes( p_intact_M_ign = p_to_pint_m_ign.get(projection, -1) p_intact_M_int = p_to_pint_m_int.get(projection, -1) p_i_codons = p_to_i_codon_prop.get(projection, -1) + # TODO: rename to NO LOSS IN FIRST 90% no_loss_in_80_p = p_80_int.get(projection, None) m_80_present = p_80_pre.get(projection, None) frame_oub = p_to_p_out_of_bord.get(projection, 0.0) diff --git a/modules/inact_mut_check.py b/modules/inact_mut_check.py index c4809d2..059fa8d 100755 --- a/modules/inact_mut_check.py +++ b/modules/inact_mut_check.py @@ -124,7 +124,17 @@ def mask_mut(mut): return upd_mut -def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None): +def analyse_splice_sites( + ref, + query, + gene, + chain, + codon_table, + atg_codons_data, + mask_all_first_10p=False, + u12_data=None, + v=None, +): """Check correctness of the splice sites.""" u12_data = set() if not u12_data else u12_data # load U12 data if provided mut_counter = 1 # mut counter -> for mutations IDs @@ -172,57 +182,99 @@ def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None): # so we can get splice site coordinates # if there is N in the splice site -> we don't know what's there # -> we are not sure -> mask this mutation - left_splice_site = query[start - 2 : start] - left_splice_site_N = "n" in left_splice_site or "N" in left_splice_site - right_splice_site = query[end + 1 : end + 3] - right_splice_size_N = "n" in right_splice_site or "N" in right_splice_site + acceptor_splice_site = query[start - 2 : start] # donor + acceptor_splice_site_N = ( + "n" in acceptor_splice_site or "N" in acceptor_splice_site + ) + donor_splice_site = query[end + 1 : end + 3] # acceptor + donor_splice_size_N = "n" in donor_splice_site or "N" in donor_splice_site + + # assign to the last / first codon of the exon (depends on what splice site is affected) + codon_pos_at_exon = [ + n for n, c in enumerate(codon_table) if c["t_exon_num"] == exon_num_ + ] + acceptor_codon_num = ( + codon_pos_at_exon[0] if len(codon_pos_at_exon) > 0 else None + ) + 
diff --git a/modules/inact_mut_check.py b/modules/inact_mut_check.py
index c4809d2..059fa8d 100755
--- a/modules/inact_mut_check.py
+++ b/modules/inact_mut_check.py
@@ -124,7 +124,17 @@ def mask_mut(mut):
     return upd_mut


-def analyse_splice_sites(ref, query, gene, chain, u12_data=None, v=None):
+def analyse_splice_sites(
+    ref,
+    query,
+    gene,
+    chain,
+    codon_table,
+    atg_codons_data,
+    mask_all_first_10p=False,
+    u12_data=None,
+    v=None,
+):
     """Check correctness of the splice sites."""
     u12_data = set() if not u12_data else u12_data  # load U12 data if provided
     mut_counter = 1  # mut counter -> for mutations IDs
@@ -172,57 +182,99 @@
         # so we can get splice site coordinates
         # if there is N in the splice site -> we don't know what's there
         # -> we are not sure -> mask this mutation
-        left_splice_site = query[start - 2 : start]
-        left_splice_site_N = "n" in left_splice_site or "N" in left_splice_site
-        right_splice_site = query[end + 1 : end + 3]
-        right_splice_size_N = "n" in right_splice_site or "N" in right_splice_site
+        acceptor_splice_site = query[start - 2 : start]  # acceptor, left, AG
+        acceptor_splice_site_N = (
+            "n" in acceptor_splice_site or "N" in acceptor_splice_site
+        )
+        donor_splice_site = query[end + 1 : end + 3]  # donor, right, GT/GC
+        donor_splice_site_N = "n" in donor_splice_site or "N" in donor_splice_site
+
+        # assign to the last / first codon of the exon (depending on which splice site is affected)
+        codon_pos_at_exon = [
+            n for n, c in enumerate(codon_table) if c["t_exon_num"] == exon_num_
+        ]
+        acceptor_codon_num = (
+            codon_pos_at_exon[0] if len(codon_pos_at_exon) > 0 else None
+        )
+        donor_codon_num = codon_pos_at_exon[-1] if len(codon_pos_at_exon) > 0 else None

         # check that splice sites are canonical
-        left_site_wrong = left_splice_site not in LEFT_SPLICE_CORR
-        right_site_wrong = right_splice_site not in RIGHT_SPLICE_CORR
+        acceptor_site_wrong = acceptor_splice_site not in LEFT_SPLICE_CORR
+        donor_site_wrong = donor_splice_site not in RIGHT_SPLICE_CORR
+
         eprint(
-            f"Exon {exon_num}; L_SPS: {left_splice_site}; R_SPS: {right_splice_site}"
+            f"Exon {exon_num}; L_SPS: {acceptor_splice_site}; R_SPS: {donor_splice_site}"
         ) if v else None

-        if exon_num != 1 and left_site_wrong:
+        # print(exon_num)
+        # print(acceptor_site_wrong)
+        # print(donor_site_wrong)
+
+        if exon_num != 1 and acceptor_site_wrong and acceptor_codon_num:
             # add mutation for left (acceptor) splice site
             # doesn't apply to the first exon obviously
             # mask this mutation if it's suspected to be U12 splice site
             mask = True if (exon_num, 0) in u12_data else False
             # if N in the splice site -> also mask it
-            mask = True if left_splice_site_N else mask
+            mask = True if acceptor_splice_site_N else mask
+
+            # if the splice site mutation is in the first 10% but followed by an ATG
+            # in the first 10%, then mask it
+            atg_mask = _define_whether_mask(
+                acceptor_codon_num,
+                atg_codons_data["left_t"],
+                atg_codons_data["right_t"],
+                atg_codons_data["atg_codon_nums"],
+                mask_all_first_10p=mask_all_first_10p,
+            )
+
+            mask = True if atg_mask is True else mask
+
             # create mutation object, describe what happened
-            mut_ = f"{LEFT_SPLICE_CORR}->{left_splice_site}"
-            mut_id = f"SSM_{mut_counter}"
+            mut_ = f"{LEFT_SPLICE_CORR}->{acceptor_splice_site}"
+            mut_id = f"{SSM_A}_{mut_counter}"
+            # print(mut_id)
             mut_counter += 1
             mut = Mutation(
                 gene=gene,
                 chain=chain,
                 exon=exon_num,
-                position=0,
-                mclass=SSM,
+                position=acceptor_codon_num,
+                mclass=SSM_A,
                 mut=mut_,
                 masked=mask,
                 mut_id=mut_id,
             )
             sps_report.append(mut)  # add mutation to the list
-        if exon_num != exons_num and right_site_wrong:
+        if exon_num != exons_num and donor_site_wrong and donor_codon_num:
             # add mutation for right (donor) splice site
             # doesn't apply to the last exon
             # mask this mutation if it's suspected to be U12 splice site
             mask = True if (exon_num, 1) in u12_data else False
-            mask = True if right_splice_size_N else mask  # if N in mutation -> mask it
+            mask = True if donor_splice_site_N else mask  # if N in mutation -> mask it
+
+            # if the splice site mutation is in the first 10% but followed by an ATG
+            # in the first 10%, then mask it
+            atg_mask = _define_whether_mask(
+                donor_codon_num,
+                atg_codons_data["left_t"],
+                atg_codons_data["right_t"],
+                atg_codons_data["atg_codon_nums"],
+                mask_all_first_10p=mask_all_first_10p,
+            )
+
+            mask = True if atg_mask is True else mask
+
             # create mutation object
-            mut_ = f"{RIGHT_SPLICE_CORR}->{right_splice_site}"
-            mut_id = f"SSM_{mut_counter}"
+            mut_ = f"{RIGHT_SPLICE_CORR}->{donor_splice_site}"
+            mut_id = f"{SSM_D}_{mut_counter}"
+            # print(mut_id)
             mut_counter += 1
             mut = Mutation(
                 gene=gene,
                 chain=chain,
                 exon=exon_num,
-                position=1,
-                mclass=SSM,
+                position=donor_codon_num,
+                mclass=SSM_D,
                 mut=mut_,
                 masked=mask,
                 mut_id=mut_id,
@@ -242,16 +294,22 @@
         curr = sps_report[i]
         curr_exon = curr.exon
         prev_exon = prev.exon
+
         if curr_exon != prev_exon + 1:
             # if current exon doesn't follow the previous immediately -> not the case
             # like prev mut exon is 3 and current is 6
             continue
+        # if exons follow each other (like 3 and 4) then continue
         curr_pos = curr.position
         prev_pos = prev.position
+        curr_type = curr.mclass
+        prev_type = prev.mclass
         # they must belong to the same intron, check this
-        if not (curr_pos == 0 and prev_pos == 1):
+        if not (prev_type == SSM_D and curr_type == SSM_A):
             continue
+
         prev_to_what = prev.mut.split("->")[1]
         curr_to_what = curr.mut.split("->")[1]
         # if it was -- in both cases: this is intron deletion
@@ -323,15 +381,78 @@ def corr_exon_num_or_no_fs(codon, exon_num):
     return exon_num


+def _find_atg_codons(codon_table):
+    """Find codon positions whose query triplets contain an ATG."""
+    atg_codon_nums = []
+    for num, codon in enumerate(codon_table, 1):
+        que_codon = codon["que_codon"]
+        que_codon_no_gap = que_codon.replace("-", "")
+        if not que_codon_no_gap:
+            # there are only gaps -> nothing to catch
+            continue
+        triplets = parts(que_codon_no_gap, n=3)
+        start_triplets = [x for x in triplets if x == "ATG"]
+        if len(start_triplets) > 0:
+            atg_codon_nums.append(num)
+    return atg_codon_nums
+
+
+def _get_next_bigger_num(num, lst):
+    for elem in lst:
+        if elem >= num:
+            return elem
+    return 999999999  # TODO: ideally, last codon position
+
+
+def _define_whether_mask(
+    num, left_t, right_t, atg_codon_nums, mask_all_first_10p=False
+):
+    """Check whether the mutation is going to be masked due to first/last 10% or not."""
+    # TODO: optimise this part, the only ATG position needed is the closest to 10%
+    # which is not above 10%, that's it.
+    if num >= right_t:
+        return True
+    elif left_t < num < right_t:
+        return False
+    if num <= left_t and mask_all_first_10p is True:
+        # automatically mask mut in first 10%
+        # don't account for ATG codons distribution
+        return True
+    # num is in the first 10%, need to find the next start
+    next_atg_pos = _get_next_bigger_num(num, atg_codon_nums)
+    return next_atg_pos <= left_t
+
+
+def make_atg_data(codon_table):
+    codons_num = len(codon_table)
+
+    # get first or last 10% of CDS: mask mutations in these regions:
+    perc_10 = codons_num // 10
+    left_t = perc_10
+    right_t = codons_num - perc_10
+
+    atg_codon_nums = _find_atg_codons(codon_table)
+
+    # TODO: refactor this
+    atg_codon_nums_data = {
+        "left_t": left_t,
+        "right_t": right_t,
+        "atg_codon_nums": atg_codon_nums,
+    }
+    return atg_codon_nums_data
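A worked example of `_define_whether_mask` under the new ATG-aware rule, with toy thresholds for a 100-codon reading frame:

    atg_data = {"left_t": 10, "right_t": 90, "atg_codon_nums": [4, 25]}
    # last 10% -> always masked:
    assert _define_whether_mask(95, **atg_data) is True
    # middle 80% -> never masked here:
    assert _define_whether_mask(50, **atg_data) is False
    # first 10% with a downstream ATG (codon 4) still inside the first 10%:
    # translation could restart, so the mutation is masked
    assert _define_whether_mask(3, **atg_data) is True
    # first 10% but the next ATG (codon 25) lies beyond left_t -> not masked,
    # unless mask_all_first_10p=True is passed
    assert _define_whether_mask(5, left_t=10, right_t=90, atg_codon_nums=[25]) is False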
 def scan_rf(
     codon_table,
     gene,
     chain,
+    atg_codon_nums_data,
     exon_stat=None,
     v=False,
     big_indel_thrs=None,
     sec_codons=None,
     no_fpi=False,
+    mask_all_first_10p=False,
 ):
     """Scan codon table for inactivating mutations."""
     # sec_codons -> selenocysteine-coding codons in reference
@@ -339,13 +460,9 @@
     codons_num = len(codon_table)
     in_mut_report = []  # save mutations here

-    # get first or last 10% of CDS: mask mutations in this region:
-    perc_10 = codons_num // 10
-    left_t = perc_10
-    right_t = codons_num - perc_10
-
     # init mutation counters: for IDs
     mut_number_stop = 1
+    mut_number_atg = 1  # not inact, but need to track
     mut_number_fs = 1
     mut_number_big_indel = 1
     q_dels_in_a_row = 0  # number of deletions in a row: for big indel detection
@@ -353,7 +470,14 @@
     for num, codon in enumerate(codon_table, 1):
         # go codon-by-codon
         # if in first/last 10%: it will be masked
-        mask = True if num <= left_t or num >= right_t else False
+        # mask = True if num <= left_t or num >= right_t else False
+        mask = _define_whether_mask(
+            num,
+            atg_codon_nums_data["left_t"],
+            atg_codon_nums_data["right_t"],
+            atg_codon_nums_data["atg_codon_nums"],
+            mask_all_first_10p=mask_all_first_10p,
+        )
         # determine whether it's the first or last exon:
         last_codon = True if num == codons_num else False
         first_codon = True if num == 1 else False
@@ -407,7 +531,10 @@
                 position=1,
                 mclass=START_MISSING,
                 mut=que_codon_,
-                masked=mask,
+                # this mutation should not affect the classification
+                # so it's always masked, but saved to indicate that
+                # the 1st ATG is missing:
+                masked=True,
                 mut_id=mut_id,
             )
             in_mut_report.append(mut)
@@ -487,6 +614,8 @@
             # need to split query sequence in triplets
             # check that any of them is a stop-codon
             stop_triplets = [x for x in triplets if x in STOPS]
+            start_triplets = [x for x in triplets if x == "ATG"]
+
             if len(stop_triplets) > 0 and not last_codon:
                 # we have premature stop codon
                 mut_ = f"{ref_codon.replace('-', '')}->{stop_triplets[0]}"
@@ -532,6 +661,27 @@
                 if v:
                     eprint("Detected STOP")
                     eprint(codon)
+            if len(start_triplets) > 0:
+                # not an inactivating mutation but needs to be saved
+                mut_ = f"{ref_codon.replace('-', '')}->ATG"
+                mclass = ATG
+                mut_id = f"ATG_{mut_number_atg}"
+                st_ex_num = ex_num
+                mut_number_atg += 1
+                mut = Mutation(
+                    gene=gene,
+                    chain=chain,
+                    exon=st_ex_num,
+                    position=num,
+                    mclass=mclass,
+                    mut=mut_,
+                    masked=True,
+                    mut_id=mut_id,
+                )
+                in_mut_report.append(mut)
+                if v:
+                    eprint("Detected ATG (non-inactivating)")
+                    eprint(codon)

     # I can infer number of codons in each exon directly from codon table
     return in_mut_report
@@ -550,7 +700,7 @@ def detect_compensations(inact_mut, codon_table):
     fs_num = len(fs)
     if fs_num <= 1:
         # need at least 2 frameshifts, otherwise there is nothing to compensate
-        return answer
+        return answer, []
     # first iteration: detect potential compensatory events, based only
     # on their sizes
     potent_compensations = []
@@ -583,10 +733,12 @@
                 potent_compensations.append(fs_ids)
                 break
     if len(potent_compensations) == 0:
-        return []  # no potential compensations, skip this
+        return [], []  # no potential compensations, skip this

     # verify potential compensations, check for stop codons in alt frame
     what_is_compensated = set()  # to avoid twice compensated FS
+    alt_frame_codons = []  # lengths of codons in alt frame
+
     for comp in potent_compensations:
         # comp -> a list of potentially compensated FS IDs
         if len(what_is_compensated.intersection(comp)) > 0:
@@ -610,7 +762,7 @@
             # add compensation track
             # no stop codons
             # to create a mut object we need gene name, chain id, exon num etc
-            # I just use the first mut in the row for that (ethalon)
+            alt_frame_codons.append((start_pos - 1, end_pos))
             ethalon_mut = comp_muts[0]
             gene = ethalon_mut.gene
             chain = ethalon_mut.chain
@@ -620,7 +772,11 @@
             mut_id = f"C_{m_counter}"
             # mutation -> comma-separated list of compensated mutation IDs
             fs_ids = [x.split("_")[1] for x in comp]
-            mut = "FS_{}".format(",".join(fs_ids))
+            # mut = "FS_{}".format(",".join(fs_ids))
+            # changed format FEB 2022: FS_{start_num}-{end_num}
+            # just a comma-separated list can be too long
+            _ids_range = f"{fs_ids[0]}-{fs_ids[-1]}"
+            mut = f"FS_{_ids_range}"
             comp_mut = Mutation(
                 gene=gene,
                 chain=chain,
@@ -632,11 +788,12 @@
                 mut_id=mut_id,
             )
             answer.append(comp_mut)
+            # compute positions of the first and last alt-frame codons
             for c in comp:
                 # add compensated muts to comp muts set
                 # to avoid adding comp mutations twice
                 what_is_compensated.add(c)
             m_counter += 1
-    return answer
+    return answer, alt_frame_codons
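Since the FEB 2022 format change, a COMPENSATION record stores its frameshift IDs as a range rather than a comma-separated list. Encoding (in detect_compensations above) and decoding (in mask_compensated_fs and compute_intact_perc below) work like this; note that the decode re-expands the full contiguous range, so the format assumes the compensated FS IDs have no gaps:

    fs_ids = ["12", "13", "15"]            # numeric parts of the compensated FS IDs
    mut = f"FS_{fs_ids[0]}-{fs_ids[-1]}"   # encoded as "FS_12-15"
    start_, end_ = (int(x) for x in mut.split("_")[1].split("-"))
    comp_fs = [f"FS_{i}" for i in range(start_, end_ + 1)]
    # -> ["FS_12", "FS_13", "FS_14", "FS_15"]; FS_14 is included here although
    #    it was never part of the compensation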
 def mask_compensated_fs(mut_list):
@@ -648,7 +805,11 @@
     comp_fs_ids = []  # there are compensatory events
     for cmp in comp_muts:  # get list of comp FS IDs
-        fs_nums = cmp.mut.split("_")[1].split(",")
+        # fs_nums = cmp.mut.split("_")[1].split(",")
+        comp_ids_range_str = cmp.mut.split("_")[1].split("-")
+        _comp_start = int(comp_ids_range_str[0])
+        _comp_end = int(comp_ids_range_str[1])
+        fs_nums = list(range(_comp_start, _comp_end + 1))
         fs_ids = [f"FS_{x}" for x in fs_nums]
         comp_fs_ids.extend(fs_ids)
     comp_fs_ids = set(comp_fs_ids)
@@ -680,6 +841,13 @@ def compute_percent_id(seq_1, seq_2):
     return pid


+def _get_last_codon_for_each_exon(codon_table):
+    ret = {}
+    for num, elem in enumerate(codon_table, 1):
+        ret[elem["t_exon_num"] + 1] = num
+    return ret
+
+
 def classify_exons(
     gene,
     que,
@@ -690,16 +858,20 @@
     exon_blosum,
     missing_exons,
     ex_inc,
+    atg_codons_data,
+    mask_all_first_10p=False,
     v=False,
 ):
     """Classify exons as intact, deleted and missing."""
     del_num, miss_num = 1, 1  # counters for mutation IDs
     # get a list of exon numbers:
     exon_nums = list(range(codon_table[-1]["t_exon_num"] + 1))
+    exon_to_last_codon_of_exon = _get_last_codon_for_each_exon(codon_table)

     exons_report = []  # save data here
     exon_stat = [
         "X",
     ]  # exon status, start with 1, X - placeholder
+
     del_miss_nums = []

     for exon_num in exon_nums:
@@ -734,7 +906,7 @@
         # classify whether it's deleted or not:
         # print(f"Exon {exon_num} classification with the following params: ") if v else None
         # print(ex_class, exon_excl, ex_pid, ex_blosum) if v else None
-        del_, q = classify_exon(ex_class, exon_excl, ex_pid, ex_blosum, v=v)
+        ex_non_del, q = classify_exon(ex_class, exon_excl, ex_pid, ex_blosum, v=v)
         # print(f"Results are: {del_} {q}") if v else None

         if ex_class == "M" or ex_gap:
@@ -755,8 +927,18 @@
             exon_stat.append("M")
             del_miss_nums.append(exon_num)
             continue
-        elif del_ is False:
+        elif ex_non_del is False:
             # exon is deleted: need to write about this
+            last_codon_num = exon_to_last_codon_of_exon.get(ex_num_, 0)
+
+            atg_mask = _define_whether_mask(
+                last_codon_num,
+                atg_codons_data["left_t"],
+                atg_codons_data["right_t"],
+                atg_codons_data["atg_codon_nums"],
+                mask_all_first_10p=mask_all_first_10p,
+            )
+
             mut = Mutation(
                 gene=gene,
                 chain=que,
@@ -764,7 +946,7 @@
                 position=0,
                 mut="-",
                 mclass=DEL_EXON,
-                masked=False,
+                masked=atg_mask,
                 mut_id=f"DEL_{del_num}",
             )
             del_num += 1
@@ -817,11 +999,11 @@ def muts_to_text(
         strings.append(out_of_ch_line)
     for q, val in m_80_i.items():
         val_str = "TRUE" if val is True else "FALSE"
-        m_80_line = f"# {gene}\t{q}\tMIDDLE_80%_INTACT {val_str}"
+        m_80_line = f"# {gene}\t{q}\tMIDDLE_IS_INTACT {val_str}"
         strings.append(m_80_line)
     for q, val in m_80_p.items():
         val_str = "TRUE" if val is True else "FALSE"
-        m_80_line = f"# {gene}\t{q}\tMIDDLE_80%_PRESENT {val_str}"
+        m_80_line = f"# {gene}\t{q}\tMIDDLE_IS_PRESENT {val_str}"
         strings.append(m_80_line)
     return "\n".join(strings) + "\n"
@@ -836,7 +1018,15 @@ def get_exon_num_corr(codons_data):
     return ans


-def compute_intact_perc(codon_table, mutations, q_name, v=False):
+def compute_intact_perc(
+    codon_table,
+    mutations,
+    q_name,
+    alt_frame_ranges,
+    alt_f_del=False,
+    v=False,
+    mask_all_first_10p=False,
+):
     """Compute intact %ID-related features."""
     # compute per query
     query_muts = [m for m in mutations if m.chain == q_name]
@@ -855,6 +1045,8 @@
     del_codon_nums = [
         n for n, c in enumerate(codon_table) if c["t_exon_num"] in del_exons
     ]
+    # read codons in alt frame as deleted
+
     safe_del_codon_nums = [
         n for n, c in enumerate(codon_table) if c["t_exon_num"] in safe_del_exons
     ]
@@ -862,6 +1054,14 @@
         n for n, c in enumerate(codon_table) if c["t_exon_num"] in miss_exons
     ]

+    if alt_f_del is True:
+        # if so, we consider codons in alternative frame deleted
+        # alt frame -> between compensated frameshifts
+        for elem in alt_frame_ranges:
+            s_, e_ = elem
+            for codon_num in range(s_, e_):
+                safe_del_codon_nums.append(codon_num)
+
     # update codons status -> what is Missing, Deleted or Lost (not-safely deleted exons)
     for del_codon in del_codon_nums:
         codon_status[del_codon] = "L"
@@ -871,9 +1071,26 @@
         codon_status[miss_codon] = "M"

     # get IDs of compensated FS
-    compensations = [m.mut for m in query_muts if m.mclass == COMPENSATION]
-    comp_nums = ",".join([c.split("_")[1] for c in compensations]).split(",")
-    comp_fs = {f"FS_{c}" for c in comp_nums}
+    compensations = [m for m in query_muts if m.mclass == COMPENSATION]
+    comp_fs = []  # list to keep compensated frameshifts
+    for comp in compensations:
+        comp_field = comp.mut
+        comp_ids_range_str = comp_field.split("_")[1].split("-")
+        # fmt: FS_{start}-{end}
+        _comp_start = int(comp_ids_range_str[0])
+        _comp_end = int(comp_ids_range_str[1])
+        comp_ids = list(range(_comp_start, _comp_end + 1))
+        comp_fs_strings = [f"FS_{i}" for i in comp_ids]
+        comp_fs.extend(comp_fs_strings)
+
+    # comp_nums = ",".join([c.split("_")[1] for c in compensations]).split(",")
+    # comp_ids_range_str = comp_field.split("_")[1].split("-")
+    # # fmt: FS_{start}-{end}
+    # _comp_start = int(comp_ids_range_str[0])
+    # _comp_end = int(comp_ids_range_str[1])
+    # comp_ids = list(range(_comp_start, _comp_end + 1))
+    # comp_pairs = get_comp_pairs(comp_ids)
+    # comp_fs = {f"FS_{c}" for c in comp_nums}

     # put inactivating mutations coordinates in codon status table
     for m in query_muts:
@@ -890,7 +1107,7 @@
             continue
         elif m.masked is True:
             continue
-        elif m.mclass == SSM:
+        elif m.mclass == SSM_A or m.mclass == SSM_D:
             # deal with splice site mutations
             if m.masked is True:
                 # U12 or N-containing splice site -> do not consider
@@ -972,8 +1189,14 @@
     ten_perc = gene_len // 10
     # cut middle 80% codons states:
     middle = codon_status_string[ten_perc:-ten_perc]
-    middle_80_intact = False if "L" in middle else True
-    middle_80_present = False if "M" in middle else True
+    first_90 = codon_status_string[:-ten_perc]
+
+    if mask_all_first_10p is True:
+        middle_is_intact = False if "L" in middle else True
+    else:
+        middle_is_intact = False if "L" in first_90 else True
+    middle_is_present = False if "M" in middle else True
+
     # compute non-missing sequence
     not_m = len(codon_status) - codon_status.count("M")
     if not_m > 0:  # beware of zero division error
@@ -985,8 +1208,9 @@
         p_intact_ignore_m,
         p_intact_intact_m,
         num_of_I_codons,
-        middle_80_intact,
-        middle_80_present,
+        middle_is_intact,
+        # first_90_intact,
+        middle_is_present,
     )
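With --alt_frame_del enabled, the codons lying between two compensated frameshifts (the alternative-frame stretch) are folded into the safely-deleted set before the intact-percentage math. A minimal sketch using the (start - 1, end) pairs that detect_compensations produces:

    alt_f_del = True                 # --alt_frame_del / --lfd
    alt_frame_ranges = [(14, 17)]    # toy pair from detect_compensations()
    safe_del_codon_nums = []
    if alt_f_del:
        for s_, e_ in alt_frame_ranges:
            # codons 14..16 will be marked "D" (safely deleted) downstream
            safe_del_codon_nums.extend(range(s_, e_))
    assert safe_del_codon_nums == [14, 15, 16]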
@@ -1198,13 +1422,17 @@ def get_exon_pairs(exon_stat):
     return pairs


-def detect_split_stops(codon_table, gene, q_name, exon_stat):
+def detect_split_stops(
+    codon_table, gene, q_name, exon_stat, atg_codons_data, mask_all_first_10p=False
+):
     """Considering all exon deletions find all split stop codons."""
     # we need to get pairs of intact (not deleted or missing) exons
     # between which there is a row of Deleted exons (but not missing)
     # if there is a missing exon in between -> do not make any conclusions
     # split stop codons may occur between these pairs
     i_exon_pairs = get_exon_pairs(exon_stat)
+    exon_to_last_codon_of_exon = _get_last_codon_for_each_exon(codon_table)
+
     if len(i_exon_pairs) == 0:
         # no such pairs -> no way to detect split stop
         return []  # return nothing
@@ -1212,6 +1440,7 @@
     mut_num = 1
     muts = []
     for pair in i_exon_pairs:
+        # those are 1-based
         first_exon = pair[0]
         second_exon = pair[1]
         # in codon table numbers are 0-based, so correct
@@ -1248,14 +1477,23 @@
         # ex_num = second_exon if s_part_len > f_part_len else first_exon
         mut_id = f"SP_STOP_{mut_num}"
         mut_num += 1
+        position = exon_to_last_codon_of_exon[first_exon]
+        atg_mask = _define_whether_mask(
+            position,
+            atg_codons_data["left_t"],
+            atg_codons_data["right_t"],
+            atg_codons_data["atg_codon_nums"],
+            mask_all_first_10p=mask_all_first_10p,
+        )
+
         mut = Mutation(
             gene=gene,
             chain=q_name,
             exon=second_exon,
-            position=0,
+            position=position,
             mclass=STOP,
             mut=mut_,
-            masked=False,
+            masked=atg_mask,
             mut_id=mut_id,
         )
         muts.append(mut)
@@ -1285,6 +1523,8 @@ def inact_mut_check(
     ref_ss=None,
     sec_codons=None,
     no_fpi=False,
+    alt_f_del=False,
+    mask_all_first_10p=False,  # mask all inact mutations in 1st 10% regardless of ATGs
 ):
     """Detect inactivating mutations in the CESAR output."""
     # read cesar output
@@ -1304,12 +1544,11 @@
     # initiate lists/ dicts for them
     p_intact_ignore_M = {}
     p_intact_intact_M = {}
-    middle_80_intact = {}
-    middle_80_present = {}
+    middle_is_intact = {}
+    middle_is_present = {}
     i_codons_prop = {}
     out_of_b_vals = {}
     mutations = []
-
     ex_prop_provided = ex_prop is not None
     del_miss_exons = {}
     for cesar_fraction in cesar_fractions:
@@ -1354,21 +1593,15 @@
         #     err_msg = f"Cannot find CESAR wrapper features for query {q_name}"
         #     raise ValueError(err_msg)

-        # now we extract inactivation mutations
-        # then add them to fraction_mutations list
-
-        # extract splice site mutations
-        sps_mutations = analyse_splice_sites(
-            ref, query, gene, q_name, u12_introns_data, v=v
-        )
-        fraction_mutations.extend(sps_mutations)
-        # create codon table to extract other mutations
+        # create codon table to extract mutations
         # codon table: list of objects, describing a codon
         # such as sequence in reference and query, exon number and so on
         codon_table = parse_cesar_out(ref, query)
+        atg_codons_data = make_atg_data(codon_table)

-        # next loop -> for deleted/missed exons
+        # next loop -> for deleted/missed exons
         if ex_prop:
             # if extra data provided by CESAR wrapper we can classify exons
+            # dm list -> list of 0-based exon nums which are del or missing
             exon_del_miss_, exon_stat_, dm_list = classify_exons(
                 gene,
                 q_name,
@@ -1379,6 +1612,8 @@
                 exon_blosum,
                 missing_exons,
                 ex_inc,
+                atg_codons_data,
+                mask_all_first_10p=mask_all_first_10p,
                 v=v,
             )
             # get lists of deleted/missing exons
@@ -1398,32 +1633,59 @@
         # big indels may be classified as inactivating mutations
         # but the bigger the exon: the bigger an indel should be
         # define the thresholds
-        big_indel_thrs = infer_big_indel_thresholds(ex_lens)
+        del_miss_exons[q_name] = set(dm_list)
+        big_indel_thrs = infer_big_indel_thresholds(ex_lens)

         # scan reading frame (codon table) for the rest of inact mutations
         inact_muts = scan_rf(
             codon_table,
             gene,
             q_name,
+            atg_codons_data,
             exon_stat=exon_stat,
             v=v,
             big_indel_thrs=big_indel_thrs,
             sec_codons=sec_codons,
             no_fpi=no_fpi,
+            mask_all_first_10p=mask_all_first_10p,
         )
         # save this data
         fraction_mutations.extend(inact_muts)

+        # now we extract inactivating mutations
+        # then add them to the fraction_mutations list
+        # extract splice site mutations
+        sps_mutations = analyse_splice_sites(
+            ref,
+            query,
+            gene,
+            q_name,
+            codon_table,
+            atg_codons_data,
+            mask_all_first_10p,
+            u12_introns_data,
+            v=v,
+        )
+        fraction_mutations.extend(sps_mutations)
+
+
         # get a list of split stop codons: stop codons that appear after exon deletions
         # such as:
         # GCAAACGCAGCt-------------[DELETED EXON]-------agTCCCATTTCCAACTGATC
         # exon deletion raises an inframe stop codon: t + ag
-        split_stop_codons = detect_split_stops(codon_table, gene, q_name, exon_stat)
+        split_stop_codons = detect_split_stops(
+            codon_table,
+            gene,
+            q_name,
+            exon_stat,
+            atg_codons_data,
+            mask_all_first_10p=mask_all_first_10p,
+        )
         fraction_mutations.extend(split_stop_codons)

         # detect compensated frameshifts
-        compensations = detect_compensations(inact_muts, codon_table)
+        compensations, alt_frame_ranges = detect_compensations(inact_muts, codon_table)
         fraction_mutations.extend(compensations)
         # also mask compensated frameshifts:
         fraction_mutations = mask_compensated_fs(fraction_mutations)
@@ -1434,13 +1696,19 @@
         # extract and save %intact-related features
         pintact_features = compute_intact_perc(
-            codon_table, fraction_mutations, q_name, v=v
+            codon_table,
+            fraction_mutations,
+            q_name,
+            alt_frame_ranges,
+            alt_f_del=alt_f_del,
+            v=v,
+            mask_all_first_10p=mask_all_first_10p,
         )
         p_intact_ignore_M[q_name] = pintact_features[0]
         p_intact_intact_M[q_name] = pintact_features[1]
         i_codons_prop[q_name] = pintact_features[2]
-        middle_80_intact[q_name] = pintact_features[3]
-        middle_80_present[q_name] = pintact_features[4]
+        middle_is_intact[q_name] = pintact_features[3]
+        middle_is_present[q_name] = pintact_features[4]

         # compute % of gene that lies outside chain borders
         out_of_borders_prop = get_out_of_borders_prop(codon_table, missing_exons)
@@ -1454,8 +1722,8 @@
         p_intact_intact_M,
         i_codons_prop,
         out_of_b_vals,
-        middle_80_intact,
-        middle_80_present,
+        middle_is_intact,
+        middle_is_present,
         gene,
     )
     return report, del_mis_exons
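For reference, a sketch of the caller side after this change, as wired up in realign_exons in CESAR_wrapper.py (the first file in this diff); the surrounding variable names are taken from that caller and other positional or keyword arguments are omitted here:

    loss_report, del_mis_exons = inact_mut_check(
        cesar_raw_out,
        ref_ss=ref_ss_data,
        sec_codons=sec_codons,
        no_fpi=args["no_fpi"],
        alt_f_del=args["alt_frame_del"],                # --alt_frame_del / --lfd
        mask_all_first_10p=args["mask_all_first_10p"],  # --mask_all_first_10p / --m_f10p
    )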
" + "Likely is not needed for standalone script scenario. ") + ) app.add_argument( "--predefined_glp_class_path", default=None, @@ -198,6 +208,28 @@ def parse_args(): "(i) Projections with too short query region (L or M) and " "(ii) Projections with very long query region (M)", ) + app.add_argument( + "--unprocessed_log", + "--unp", + default=None, + help="Store unprocessed genes in a separate file." + ) + app.add_argument( + "--debug", + "-d", + action="store_true", + dest="debug", + help="Log debugging data" + ) + app.add_argument( + "--mask_all_first_10p", + "--m_f10p", + action="store_true", + dest="mask_all_first_10p", + help="Automatically mask all inactivating mutations in first 10% of " + "the reading frame, ignoring ATG codons distribution. " + "(Default mode in V1.0, not recommended to use in later versions)" + ) # print help if there are no args if len(sys.argv) < 2: app.print_help() @@ -495,7 +527,8 @@ def fill_buckets(buckets, all_jobs): # memlim[5] -> jobs that require <= 5Gb # memlim[10] -> jobs that require > 5Gb AND <= 10Gb buckets[memlim] = [ - job for job, job_mem in all_jobs.items() if prev_lim < job_mem <= memlim + f"{job} --memlim {memlim}" for job, job_mem in all_jobs.items() + if prev_lim < job_mem <= memlim ] prev_lim = memlim # remove empty @@ -507,10 +540,17 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir): """Save cesar calls in the dir assigned.""" os.mkdir(jobs_dir) if not os.path.isdir(jobs_dir) else None file_num, to_combine = 0, [] + bucket_saved = {k: True for k in filled_buckets.keys()} + for bucket_id, jobs in filled_buckets.items(): num_of_files = bucket_jobs_num[bucket_id] # just in case num_of_files = len(jobs) if num_of_files >= len(jobs) else num_of_files + if num_of_files == 0: + # avoiding zero division error + bucket_saved[bucket_id] = False + print(f"Warning! No files to save jobs for bucket {bucket_id}") + continue size_of_file = len(jobs) // num_of_files # size_of_file = size_of_file + 1 if len(jobs) % num_of_files != 0 else size_of_file jobs_split = parts(jobs, n=size_of_file) @@ -522,6 +562,11 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir): f.write("\n".join(part) + "\n") f.close() to_combine.append(file_path) + # check if anything saved + if all(x is False for x in bucket_saved.values()): + print("Could not create any CESAR job. 
@@ -507,10 +540,17 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir):
     """Save cesar calls in the dir assigned."""
     os.mkdir(jobs_dir) if not os.path.isdir(jobs_dir) else None
     file_num, to_combine = 0, []
+    bucket_saved = {k: True for k in filled_buckets.keys()}
+
     for bucket_id, jobs in filled_buckets.items():
         num_of_files = bucket_jobs_num[bucket_id]
         # just in case
         num_of_files = len(jobs) if num_of_files >= len(jobs) else num_of_files
+        if num_of_files == 0:
+            # avoiding zero division error
+            bucket_saved[bucket_id] = False
+            print(f"Warning! No files to save jobs for bucket {bucket_id}")
+            continue
         size_of_file = len(jobs) // num_of_files
         # size_of_file = size_of_file + 1 if len(jobs) % num_of_files != 0 else size_of_file
         jobs_split = parts(jobs, n=size_of_file)
@@ -522,6 +562,11 @@
             f.write("\n".join(part) + "\n")
             f.close()
             to_combine.append(file_path)
+    # check whether anything was saved
+    if all(x is False for x in bucket_saved.values()):
+        print("Could not create any CESAR job. Probably, ALL genes require much more memory than is available.")
+        print("If this result is unexpected, please contact the developers.")
+        sys.exit(1)
     return to_combine
@@ -551,7 +596,13 @@

 def save_combined_joblist(
-    to_combine, combined_file, results_dir, inact_mut_dat, rejected_log, name=""
+    to_combine,
+    combined_file,
+    results_dir,
+    inact_mut_dat,
+    rejected_log,
+    unproc_log,
+    name=""
 ):
     """Save joblist of joblists (combined joblist)."""
     f = open(combined_file, "w")
@@ -565,6 +616,9 @@
         if rejected_log:
             log_path = os.path.join(rejected_log, f"{basename}.txt")
             combined_command += f" --rejected_log {log_path}"
+        if unproc_log:
+            log_path = os.path.join(unproc_log, f"{basename}.txt")
+            combined_command += f" --unproc_log {log_path}"
         f.write(combined_command + "\n")
     f.close()
@@ -603,6 +657,133 @@ def read_precomp_mem(precomp_file):
     return ret


+def get_trans_to_regions_file(precomp_regions_data_dir):
+    ret = {}
+    if precomp_regions_data_dir is None:
+        return ret
+    files_in = os.listdir(precomp_regions_data_dir)
+    for filename in files_in:
+        path = os.path.abspath(os.path.join(precomp_regions_data_dir, filename))
+        f = open(path, "r")
+        transcripts_in_file = set(x.split("\t")[0] for x in f)
+        f.close()
+        for t in transcripts_in_file:
+            ret[t] = path
+    return ret
+
+
+def build_job(gene, chains_arg, args, gene_fragments, trans_to_reg_precomp_file, u12_this_gene, mask_all_first_10p=False):
+    """Build CESAR job."""
+    # 0 gene; 1 chains; 2 bed_file; 3 bdb chain_file; 4 tDB; 5 qDB; 6 output; 7 cesar_bin
+    job = WRAPPER_TEMPLATE.format(
+        gene,
+        chains_arg,
+        os.path.abspath(args.bdb_bed_file),
+        os.path.abspath(args.bdb_chain_file),
+        os.path.abspath(args.tDB),
+        os.path.abspath(args.qDB),
+        os.path.abspath(args.cesar_binary),
+        args.uhq_flank,
+        # gig,
+    )
+    # add some flags if required
+    job = job + " --mask_stops" if args.mask_stops else job
+    job = job + " --check_loss" if args.check_loss else job
+    job = job + " --no_fpi" if args.no_fpi else job
+    job = job + " --fragments" if gene_fragments else job
+    job = job + " --opt_cesar" if args.opt_cesar else job
+    job = job + " --alt_frame_del"  # TODO: toga master script parameter
+
+    precomp_file = trans_to_reg_precomp_file.get(gene)
+    job = job + f" --predefined_regions {precomp_file}" if precomp_file else job
+
+    # add U12 introns data if this gene has them:
+    job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
+
+    # add the mask_all_first_10p flag if needed
+    job = job + " --mask_all_first_10p" if mask_all_first_10p else job
+    return job
+
+
+def compute_ref_part_of_mem(block_sizes):
+    """Compute num_states and r_length."""
+    num_states, r_length = 0, 0
+    for block_size in block_sizes:
+        # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
+        # /* 22 and 6 for acc and donor states */
+        num_codons = block_size // 3
+        num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
+        # r_length += 11 + 6 * fasta.references[i]->length
+        #          + donors[i]->length + acceptors[i]->length;
+        r_length += block_size
+    return num_states, r_length
+
+
+def compute_memory_gig_for_qlen(num_states, r_length, q_length_max):
+    memory = (
+        (num_states * 4 * 8)
+        + (num_states * q_length_max * 4)
+        + (num_states * 304)
+        + (2 * q_length_max + r_length) * 8
+        + (q_length_max + r_length) * 2 * 1
+        + EXTRA_MEM
+    )
+    gig = math.ceil(memory / 1000000000) + 0.25
+    return gig
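A worked example of the two helpers above, with two toy reference exons and a 100 kb query locus; EXTRA_MEM is the module-level constant (100000 bytes):

    block_sizes = [150, 300]  # reference exon lengths in bp
    num_states, r_length = compute_ref_part_of_mem(block_sizes)
    # 150 bp -> 50 codons -> 6 + 6*50 + 1 + 2 + 2 + 22 + 6 = 339 states
    # 300 bp -> 100 codons -> 639 states; total 978 states, r_length = 450
    gig = compute_memory_gig_for_qlen(num_states, r_length, q_length_max=100_000)
    # memory ~ 978 * 100000 * 4 plus smaller terms ~ 0.39e9 bytes,
    # so gig = math.ceil(0.39) + 0.25 = 1.25
    assert gig == 1.25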
+def _get_chain_arg_and_gig_arg(chains_list, chain_to_mem):
+    gig_arg = max([chain_to_mem[c] for c in chains_list])
+    return gig_arg
+
+
+def compute_memory(chains, precomp_gig, block_sizes, gene_chains_data, gene_fragments, mem_limit):
+    """Compute memory requirements for different chains."""
+    if precomp_gig:
+        # chains arg: just all chains, everything is precomputed
+        return {tuple(chains): (precomp_gig, MEM_FIT)}
+    # proceed to memory estimation
+    # the same procedure as inside the CESAR2.0 code
+    # required memory depends on numerous params
+    # first, we need reference transcript-related parameters
+    # query-related parameters come later
+    num_states, r_length = compute_ref_part_of_mem(block_sizes)
+
+    # branch 2: fragmented gene, here we have to use the sum of query lengths
+    if gene_fragments:
+        # in case of a fragmented genome: we stitch queries together
+        # so query length = sum of all queries
+        q_length_max = sum([v for v in gene_chains_data.values()])
+        gig = compute_memory_gig_for_qlen(num_states, r_length, q_length_max)
+        return {tuple(chains): (gig, MEM_FIT)}
+
+    # branch 3, maybe the most common one
+    ret = {}
+    chain_to_mem_consumption = {}
+    for chain_id, q_length in gene_chains_data.items():
+        chain_gig = compute_memory_gig_for_qlen(num_states, r_length, q_length)
+        chain_to_mem_consumption[chain_id] = chain_gig
+    # for every chain, we have the memory consumption
+    chains_that_fit = [c_id for c_id, gig in chain_to_mem_consumption.items() if gig <= mem_limit]
+    chains_that_goto_bigmem = [c_id for c_id, gig in chain_to_mem_consumption.items() if mem_limit < gig <= BIGMEM_LIM]
+    chains_that_dont_fit = [c_id for c_id, gig in chain_to_mem_consumption.items() if gig > BIGMEM_LIM]
+
+    # process each bucket of chains individually
+    if len(chains_that_fit) > 0:
+        # good, there are some chains that can be easily processed
+        mem = _get_chain_arg_and_gig_arg(chains_that_fit, chain_to_mem_consumption)
+        ret[tuple(chains_that_fit)] = (mem, MEM_FIT)
+    if len(chains_that_goto_bigmem) > 0:
+        # there are some bigmem jobs -> to be executed separately
+        # Deprecated branch, TODO: check whether it makes sense nowadays
+        mem = _get_chain_arg_and_gig_arg(chains_that_goto_bigmem, chain_to_mem_consumption)
+        ret[tuple(chains_that_goto_bigmem)] = (mem, MEM_BIGMEM)
+    if len(chains_that_dont_fit) > 0:
+        mem = _get_chain_arg_and_gig_arg(chains_that_dont_fit, chain_to_mem_consumption)
+        ret[tuple(chains_that_dont_fit)] = (mem, MEM_DONTFIT)
+    return ret
+
+
 def main():
     """Entry point."""
     t0 = dt.now()
@@ -618,6 +799,10 @@

     # if memory is precomputed: use it
     precomp_mem = read_precomp_mem(args.precomp_memory_data)
+    # precomp_regions = read_precomp_reg(args.precomp_regions_data)
+
+    # TODO: to optimize later
+    trans_to_reg_precomp_file = get_trans_to_regions_file(args.precomp_regions_data_dir)
     # get lists of orthologous chains per each gene
     # skipped_1 - no chains found -> log them
     predefined_glp_class = {}  # for projections which are M and L without CESAR
@@ -658,6 +843,7 @@
         args.qDB,
     )
     predefined_glp_class.update(predef_glp)
+    predef_glp_class__chains_step = {}

     # start making the jobs
     all_jobs = {}
@@ -667,8 +853,8 @@
     for gene in batch.keys():
         u12_this_gene = u12_data.get(gene)
         block_sizes = bed_data[gene][3]
-
         gene_chains_data = regions.get(gene)
+
         # check that there is something for this gene
         if not gene_chains_data:
             continue
@@ -681,94 +867,47 @@
             gene_chains_data = {
                 k: v for k, v in gene_chains_data.items() if k in gene_fragments
             }
+
         chains = gene_chains_data.keys()
         if len(chains) == 0:
             continue
+
+
 def main():
     """Entry point."""
     t0 = dt.now()
@@ -618,6 +799,10 @@ def main():
     # if memory is precomputed: use it
     precomp_mem = read_precomp_mem(args.precomp_memory_data)
+    # precomp_regions = read_precomp_reg(args.precomp_regions_data)
+
+    # TODO: to optimize later
+    trans_to_reg_precomp_file = get_trans_to_regions_file(args.precomp_regions_data_dir)
     # get lists of orthologous chains per each gene
     # skipped_1 - no chains found -> log them
     predefined_glp_class = {}  # for projections which are M and L without CESAR
@@ -658,6 +843,7 @@ def main():
         args.qDB,
     )
     predefined_glp_class.update(predef_glp)
+    predef_glp_class__chains_step = {}

     # start making the jobs
     all_jobs = {}
@@ -667,8 +853,8 @@ def main():
     for gene in batch.keys():
         u12_this_gene = u12_data.get(gene)
         block_sizes = bed_data[gene][3]
-
         gene_chains_data = regions.get(gene)
+
+        # check that there is something for this gene
         if not gene_chains_data:
             continue
@@ -681,94 +867,47 @@ def main():
             gene_chains_data = {
                 k: v for k, v in gene_chains_data.items() if k in gene_fragments
             }
+
         chains = gene_chains_data.keys()
         if len(chains) == 0:
             continue
-        chains_arg = ",".join(chains)  # chain ids -> one of the cmd args
-
-        # if memory is precomputed then use it
         precomp_gig = precomp_mem.get(gene, None)
-        if precomp_gig is None:
-            # proceed to memory estimation
-            # the same procedure as inside CESAR2.0 code
-            num_states, r_length = 0, 0
-
-            # required memory depends on numerous params
-            # first, we need reference transcript-related parameters
-            # query-related parameters will be later
-            for block_size in block_sizes:
-                # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
-                # /* 22 and 6 for acc and donor states */
-                num_codons = block_size // 3
-                num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
-                # r_length += 11 + 6 * fasta.references[i]->length
-                # + donors[i]->length + acceptors[i]->length;
-                r_length += block_size
-
-            # now compute query sequence-related parameters
-            query_lens = [v for v in gene_chains_data.values()]
-            if (
-                gene_fragments
-            ):  # in case of fragmented genome: we stitch queries together
-                # so query length = sum of all queries
-                q_length_max = sum(query_lens)
-            else:  # not fragmented genome: processins queries separately
-                # thus we need only the max length
-                q_length_max = max(query_lens)
-            # and now compute the amount of required memory
-            memory = (
-                (num_states * 4 * 8)
-                + (num_states * q_length_max * 4)
-                + (num_states * 304)
-                + (2 * q_length_max + r_length) * 8
-                + (q_length_max + r_length) * 2 * 1
-                + EXTRA_MEM
-            )
-            gig = math.ceil(memory / 1000000000) + 0.25
-        else:
-            # memory was precomputed
-            gig = precomp_gig
-
-        # gig = compute_amount_of_memory(block_sizes, q_length_max, args.opt_cesar)
-        # # 0 gene; 1 chains; 2 bed_file; 3 bdb chain_file; 4 tDB; 5 qDB; 6 output; 7 cesar_bin
-        job = WRAPPER_TEMPLATE.format(
-            gene,
-            chains_arg,
-            os.path.abspath(args.bdb_bed_file),
-            os.path.abspath(args.bdb_chain_file),
-            os.path.abspath(args.tDB),
-            os.path.abspath(args.qDB),
-            os.path.abspath(args.cesar_binary),
-            args.uhq_flank,
-            gig,
-        )
-        # add some flags if required
-        job = job + " --mask_stops" if args.mask_stops else job
-        job = job + " --check_loss" if args.check_loss else job
-        job = job + " --no_fpi" if args.no_fpi else job
-        job = job + " --fragments" if gene_fragments else job
-        job = job + " --opt_cesar" if args.opt_cesar else job
-
-        # add U12 introns data if this gene has them:
-        job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job
-
-        # define whether it's an ordinary or a bigmem job
-        # depending on the memory requirements
-        if gig <= mem_limit:  # ordinary job
-            all_jobs[job] = gig
-        elif gig <= BIGMEM_LIM:
-            skipped_3.append((gene, ",".join(chains), f"requires {gig}) -> bigmem job"))
-            predef_glp[gene] = f"{TRANSCRIPT}\tM"
-            bigmem_jobs.append(job)
-        else:
-            skipped_3.append(
-                (
-                    gene,
-                    ",".join(chains),
-                    f"big mem limit ({BIGMEM_LIM} gig) exceeded (needs {gig})",
-                )
-            )
-            predef_glp[gene] = f"{TRANSCRIPT}\tM"
+        chain_arg_to_gig = compute_memory(chains,
+                                          precomp_gig,
+                                          block_sizes,
+                                          gene_chains_data,
+                                          gene_fragments,
+                                          mem_limit)
+        for chains_tup, (gig, stat) in chain_arg_to_gig.items():
+            chains_arg = ",".join(chains_tup)
+            job = build_job(gene,
+                            chains_arg,
+                            args,
+                            gene_fragments,
+                            trans_to_reg_precomp_file,
+                            u12_this_gene,
+                            mask_all_first_10p=args.mask_all_first_10p)
+
+            # define whether it's an ordinary or a bigmem job
+            # depending on the memory requirements
+            if stat == MEM_FIT:  # ordinary job
+                all_jobs[job] = gig
+            elif stat == MEM_BIGMEM:
+                to_app = (gene, chains_arg, f"requires {gig} gig -> bigmem job")
+                skipped_3.append(to_app)
+                bigmem_jobs.append(job)
+                for chain_id in chains_tup:
+                    proj_id = f"{gene}.{chain_id}"
+                    predef_glp_class__chains_step[proj_id] = f"{PROJECTION}\tM"
+            else:
+                to_app = (
+                    gene,
+                    chains_arg,
+                    f"big mem limit ({BIGMEM_LIM} gig) exceeded (needs {gig})",
                )
+                skipped_3.append(to_app)
+                for chain_id in chains_tup:
+                    proj_id = f"{gene}.{chain_id}"
+                    predef_glp_class__chains_step[proj_id] = f"{PROJECTION}\tM"
+
+    # TODO: predefined GLP classes to be refactored
+    predefined_glp_class.update(predef_glp_class__chains_step)

     eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
     eprint("Splitting the jobs.")
@@ -796,7 +935,12 @@ def main():

     # save joblist of joblists
     save_combined_joblist(
-        to_combine, args.combined, args.results, args.check_loss, args.rejected_log
+        to_combine,
+        args.combined,
+        args.results,
+        args.check_loss,
+        args.rejected_log,
+        args.unprocessed_log
     )

     # save bigmem jobs, a bit different logic
@@ -808,6 +952,7 @@ def main():
         args.results,
         args.check_loss,
         args.rejected_log,
+        None,  # TODO: decide what we do with this branch
         name="bigmem",
     )
diff --git a/supply/extract_codon_alignment.py b/supply/extract_codon_alignment.py
index e87b0ee..bb7016c 100644
--- a/supply/extract_codon_alignment.py
+++ b/supply/extract_codon_alignment.py
@@ -27,7 +27,6 @@
 # actually, it was nearly random CESAR alignment for this exon
 # v2.2 fixed a bug: wrong handling of completely deleted exons

-
 SEQ_NUMBER_LIMIT = 1500
 CESAR_RESULTS_FILE = "cesar_results.txt"
 EXON_SEQ_CLASSES = {"query_exon", "reference_exon"}
@@ -217,6 +216,22 @@ def parse_args():
         action="store_true",
         help="Do not consider UL projections as orthologous (NOT IMPLEMENTED YET)",
     )
+    app.add_argument(
+        "--force_repair",
+        "--fr",
+        dest="force_repair",
+        action="store_true",
+        help=("Force repair of missing parts of the alignment. "
+              "Please use it if the script continuously fails to produce "
+              "a result, which can happen for massive alignments with "
+              "abundant missing or corrupted sequence.")
+    )
+    app.add_argument(
+        "--save_aligner_commands",
+        default=None,
+        help="Save the sequence of MACSE commands to the specified location. "
+        "Temporary files will not be deleted!"
+    )

     # if no args: print help message
     if len(sys.argv) < MIN_ARG_NUM:
@@ -610,13 +625,16 @@ def macse_alignment(in_fasta, temp_dir, macse_caller, v=False):
     )
     cmd = f"{macse_caller} -prog alignSequences -seq {STDIN} -out_NT {out_file} -out_AA {to_del_file}"
     if v:
-        print_stderr("Aligning:")
-        print_stderr(in_fasta)
+        print_stderr("Calling:\n")
+        print_stderr(cmd)
+        # print_stderr("Aligning:")
+        # print_stderr(in_fasta)
     p = subprocess.Popen(cmd, stdin=PIPE, stderr=PIPE, stdout=PIPE, shell=True)
     _, stderr_ = p.communicate(input=in_fasta.encode())
     rc = p.returncode
     if rc != 0:  # MACSE crashed
         print_stderr("# Error! 
Macse CRASHED") + # print_stderr(f"Input file: {in_fasta}") err_msg = stderr_.decode(UTF_8) print_stderr(err_msg) sys.exit(1) @@ -742,19 +760,20 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False) ret = {} ali_id_to_seqs = defaultdict(dict) codon_tot_num_ = len(codon_to_exon) - if debug: - print_stderr("Codon to exon_dict") - print_stderr(codon_to_exon) - print_stderr(f"In total {codon_tot_num_} codons") + # if debug: + # TODO: add to level 2 verbosity + # print_stderr("Codon to exon_dict") + # print_stderr(codon_to_exon) + # print_stderr(f"In total {codon_tot_num_} codons") - print_stderr(f"Codon sequences:\n{codon_to_seq}") + # print_stderr(f"Codon sequences:\n{codon_to_seq}") for seq_id, seq in sp_to_codon_alis.items(): sp, proj_id, is_ref = seq_id key_ = (sp, proj_id) ali_id_to_seqs[key_][is_ref] = seq for k, v in ali_id_to_seqs.items(): - print_stderr(f"processing {k}") if debug else None + # print_stderr(f"processing {k}") if debug else None ref_codons = v[True].split() que_codons = v[False].split() codon_num = len(ref_codons) @@ -767,13 +786,13 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False) for i_num, (r_c, q_c) in enumerate(zip(ref_codons, que_codons)): exon_num = codon_to_exon[ref_codon_num] debug_line = f"{k}: extracting codon {ref_codon_num} out of {codon_tot_num_} | exon {exon_num}" - print_stderr(debug_line) if debug else None + # print_stderr(debug_line) if debug else None expected_ref_codon = codon_to_seq[ref_codon_num] debug_line = ( f"{r_c} -> {expected_ref_codon} {q_c} {ref_codon_num} {exon_num}" ) - print_stderr(debug_line) if debug else None + # print_stderr(debug_line) if debug else None ref_is_gap = r_c == CODON_GAP ref_is_XXX = r_c == CODON_XXX ref_has_N = "N" in r_c @@ -781,7 +800,7 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False) ref_is_undefined = ref_is_gap or ref_is_XXX or ref_has_N debug_line = f"# ref gap: {ref_is_gap} ref unknown: {ref_is_XXX} ref expected: {ref_is_exp}" - print_stderr(debug_line) if debug else None + # print_stderr(debug_line) if debug else None if not ref_is_gap: ref_codon_num += 1 @@ -790,12 +809,12 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False) and ref_is_undefined is False and exp_codon_lock is False ): - print_stderr(f"Warning! {k}: Fixing ref codon_num..") + # print_stderr(f"Warning! {k}: Fixing ref codon_num..") wrong_codon_num = ref_codon_num ref_codon_num = fix_codon_num( wrong_codon_num, expected_ref_codon, codon_to_seq ) - print_stderr(f"Corrected from {wrong_codon_num} to {ref_codon_num}") + # print_stderr(f"Corrected from {wrong_codon_num} to {ref_codon_num}") codon_pair = (r_c, q_c) exon_to_sequences[exon_num].append(codon_pair) @@ -804,7 +823,7 @@ def split_into_exons(sp_to_codon_alis, codon_to_exon, codon_to_seq, debug=False) # this will cause an index error # need to handle this overflow correctrly exp_codon_lock = True - print_stderr(f"Warning! {k}: Codon sequence overflow...") + # print_stderr(f"Warning! 
{k}: Codon sequence overflow...") ref_codon_num = codon_tot_num_ - 1 ret[k] = exon_to_sequences @@ -816,19 +835,28 @@ def reformat_codon_alis(sp_to_exon_codon_alis, all_exons): ret = defaultdict(dict) for (sp, proj_id), v in sp_to_exon_codon_alis.items(): for exon in all_exons: - exon_related_seq = v[exon] - ref_codons = [x[0] for x in exon_related_seq] - que_codons = [x[1] for x in exon_related_seq] - ref_seq = "".join(ref_codons) - que_seq = "".join(que_codons) + exon_related_seq = v.get(exon, None) ref_seq_key = (sp, proj_id, True) que_seq_key = (sp, proj_id, False) - ret[exon][ref_seq_key] = ref_seq - ret[exon][que_seq_key] = que_seq + + if exon_related_seq: + ref_codons = [x[0] for x in exon_related_seq] + que_codons = [x[1] for x in exon_related_seq] + ref_seq = "".join(ref_codons) + que_seq = "".join(que_codons) + ret[exon][ref_seq_key] = ref_seq + ret[exon][que_seq_key] = que_seq + else: + # print(sp_to_exon_codon_alis) + # print(f"Error here, cannot find key: {exon}") + # print(v) + # sys.exit(1) + ret[exon][ref_seq_key] = "NNN" + ret[exon][que_seq_key] = "NNN" return ret -def merge_exon_fastas(fastas_list, fasta_headers): +def merge_exon_fastas(fastas_list, fasta_headers, force_repair=False): """Merge per-exon fastas.""" id_to_chunks = defaultdict(list) fasta_lines = [] @@ -1221,6 +1249,7 @@ def main(): ) # ALIGNING EXON-BY-EXON else: + commands_sequence = [] fasta_headers_all = set([f"{x[0]}\t{x[1]}" for x in sp_to_codon_alis.keys()]) # we like to align exons separately, first of all get # codon number -> exon number mapping @@ -1260,7 +1289,9 @@ def main(): aligned_fasta_exon, f"aligned_exon_{exon}", args.intermediate_data ) aligned_fastas.append(aligned_fasta_exon) - aligned_fasta = merge_exon_fastas(aligned_fastas, fasta_headers_all) + aligned_fasta = merge_exon_fastas(aligned_fastas, + fasta_headers_all, + force_repair=args.force_repair) f = open(args.output, "w") if args.output else sys.stdout f.write(aligned_fasta) diff --git a/toga.py b/toga.py index c61df23..e68931c 100755 --- a/toga.py +++ b/toga.py @@ -23,8 +23,8 @@ from modules.merge_chains_output import merge_chains_output from modules.make_pr_pseudogenes_anno import create_ppgene_track from modules.merge_cesar_output import merge_cesar_output -from modules.gene_losses_summary import I, gene_losses_summary -from modules.orthology_type_map import orthology_type_map, trim_prefix +from modules.gene_losses_summary import gene_losses_summary +from modules.orthology_type_map import orthology_type_map from modules.classify_chains import classify_chains from modules.get_transcripts_quality import classify_transcripts from modules.make_query_isoforms import get_query_isoforms_data @@ -57,10 +57,19 @@ CESAR_RUNNER_TMP = "{0} {1} {2} --check_loss {3} --rejected_log {4}" CESAR_PRECOMPUTED_REGIONS_DIRNAME = "cesar_precomputed_regions" CESAR_PRECOMPUTED_MEMORY_DIRNAME = "cesar_precomputed_memory" +CESAR_PRECOMPUTED_ORTHO_LOCI_DIRNAME = "cesar_precomputed_orthologous_loci" + CESAR_PRECOMPUTED_MEMORY_DATA = "cesar_precomputed_memory.tsv" +CESAR_PRECOMPUTED_REGIONS_DATA = "cesar_precomputed_regions.tsv" +CESAR_PRECOMPUTED_ORTHO_LOCI_DATA = "cesar_precomputed_orthologous_loci.tsv" + +NUM_CESAR_MEM_PRECOMP_JUBS = 500 + + TEMP_CHAIN_CLASS = "temp_chain_trans_class" MODULES_DIR = "modules" RUNNING = "RUNNING" +CRASHED = "CRASHED" TEMP = "temp" # automatically enable flush @@ -92,10 +101,11 @@ def __init__(self, args): else: # if none: we cannot call os.path.abspath method self.nextflow_bigmem_config = None self.__check_nf_config() + # to 
avoid crash on filesystem without locks: - os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # otherwise it could crash + os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" # temporary fix for DSL error in recent NF versions - os.environ["NXF_DEFAULT_DSL"] = "1" + os.environ["NXF_DEFAULT_DSL"] = "1" chain_basename = os.path.basename(args.chain_input) @@ -184,7 +194,11 @@ def __init__(self, args): self.cesar_binary = ( self.DEFAULT_CESAR if not args.cesar_binary else args.cesar_binary ) + self.opt_cesar_binary = os.path.abspath( + os.path.join(LOCATION, "cesar_input_optimiser.py") + ) self.cesar_is_opt = args.using_optimized_cesar + self.output_opt_cesar_regions = args.output_opt_cesar_regions self.time_log = args.time_marks self.stop_at_chain_class = args.stop_at_chain_class self.rejected_log = os.path.join(self.wd, "genes_rejection_reason.tsv") @@ -207,6 +221,7 @@ def __init__(self, args): self.keep_nf_logs = args.do_not_del_nf_logs self.exec_cesar_parts_sequentially = args.cesar_exec_seq self.ld_model_arg = args.ld_model + self.mask_all_first_10p = args.mask_all_first_10p self.cesar_ok_merged = ( None # Flag: indicates whether any cesar job BATCHES crashed @@ -237,12 +252,20 @@ def __init__(self, args): self.gene_loss_data = os.path.join(self.temp_wd, "inact_mut_data") self.query_annotation = os.path.join(self.wd, "query_annotation.bed") self.loss_summ = os.path.join(self.wd, "loss_summ_data.tsv") + # directory to store intermediate files with technically unprocessable transcripts: + self.technical_cesar_err = os.path.join(self.temp_wd, "technical_cesar_err") + # unprocessed transcripts to be considered Missing: + self.technical_cesar_err_merged = os.path.join( + self.temp_wd, "technical_cesar_err.txt" + ) + self.bed_fragm_exons_data = os.path.join( self.temp_wd, "bed_fragments_to_exons.tsv" ) self.precomp_mem_cesar = os.path.join( self.temp_wd, CESAR_PRECOMPUTED_MEMORY_DATA ) + self.precomp_reg_dir = None self.cesar_mem_was_precomputed = False self.u12_arg = args.u12 self.u12 = None # assign after U12 file check @@ -517,14 +540,14 @@ def __check_isoforms_file(self, t_in_bed): f.write(f"{gene}\t{trans}\n") print("Isoforms file is OK") - @staticmethod - def die(msg, rc=1): + def die(self, msg, rc=1): """Show msg in stderr, exit with the rc given.""" print(msg) print(f"Program finished with exit code {rc}\n") # for t_file in self.temp_files: # remove temp files if required # os.remove(t_file) if os.path.isfile(t_file) and not self.keep_temp else None # shutil.rmtree(t_file) if os.path.isdir(t_file) and not self.keep_temp else None + self.__mark_crashed() sys.exit(rc) def __modules_addr(self): @@ -695,7 +718,7 @@ def __call_proc(self, cmd, extra_msg=None): rc = subprocess.call(cmd, shell=True) if rc != 0: print(extra_msg) if extra_msg else None - self.die(f"Error! Process {cmd} died! Abort.") + self.die(f"Error! Process:\n{cmd}\ndied! Abort.") print(f"{cmd} done with code 0") def __find_two_bit(self, db): @@ -770,7 +793,7 @@ def run(self): # 9) classify projections/genes as lost/intact # also measure projections confidence levels print("#### STEP 9: Gene loss pipeline classification\n") - self.__transcript_quality() + self.__transcript_quality() # maybe remove -> not used anywhere self.__gene_loss_summary() self.__time_mark("Got gene loss summary") @@ -793,7 +816,6 @@ def run(self): print(f"{self.cesar_crashed_batches_log}\n") print(f"Saved results to {self.wd}") self.__left_done_mark() - self.die(f"Done! 
Estimated time: {dt.now() - self.t0}", rc=0) def __mark_start(self): """Indicate that TOGA process have started.""" @@ -803,6 +825,16 @@ def __mark_start(self): f.write(f"TOGA process started at {now_}\n") f.close() + def __mark_crashed(self): + """Indicate that TOGA process died.""" + running_f = os.path.join(self.wd, RUNNING) + crashed_f = os.path.join(self.wd, CRASHED) + os.remove(running_f) if os.path.isfile(running_f) else None + f = open(crashed_f, "w") + now_ = str(dt.now()) + f.write(f"TOGA CRASHED AT {now_}\n") + f.close() + def __make_indexed_chain(self): """Make chain index file.""" # make *.bb file @@ -920,7 +952,9 @@ def __classify_chains(self): self.pred_scores = os.path.join(self.temp_wd, "orthology_scores.tsv") self.se_model = os.path.join(self.LOCATION, "models", "se_model.dat") self.me_model = os.path.join(self.LOCATION, "models", "me_model.dat") - self.ld_model = os.path.join(self.LOCATION, "long_distance_model", "long_dist_model.dat") + self.ld_model = os.path.join( + self.LOCATION, "long_distance_model", "long_dist_model.dat" + ) cl_rej_log = os.path.join(self.rejected_dir, "classify_chains_rejected.txt") ld_arg_ = self.ld_model if self.ld_model_arg else None if not os.path.isfile(self.se_model) or not os.path.isfile(self.me_model): @@ -933,7 +967,7 @@ def __classify_chains(self): rejected=cl_rej_log, raw_out=self.pred_scores, annot_threshold=self.orth_score_threshold, - ld_model=ld_arg_ + ld_model=ld_arg_, ) # extract not classified transcripts # first column in the rejected log @@ -968,6 +1002,128 @@ def __split_file(src, dst_dir, pieces_num): f.write("".join(piece)) f.close() return paths_to_pieces + + def __get_transcript_to_strand(self): + """Get """ + ret = {} + f = open(self.ref_bed, 'r') + for line in f: + ld = line.rstrip().split("\t") + trans = ld[3] + direction = ld[5] + ret[trans] = direction + f.close() + return ret + + def __get_chain_to_qstrand(self): + ret = {} + f = open(self.chain_file, "r") + for line in f: + if not line.startswith("chain"): + continue + fields = line.rstrip().split() + chain_id = int(fields[-1]) + q_strand = fields[9] + ret[chain_id] = q_strand + f.close() + return ret + + def __fold_exon_data(self, exons_data, out_bed): + """Convert exon data into bed12.""" + projection_to_exons = defaultdict(list) + # 1: make projection: + f = open(exons_data, "r") + for line in f: + ld = line.rstrip().split("\t") + transcript, _chain, _start, _end = ld + chain = int(_chain) + start = int(_start) + end = int(_end) + region = (start, end) + projection_to_exons[(transcript, chain)].append(region) + # 2 - get search loci for each projection + projection_to_search_loc = {} + + # CESAR_PRECOMPUTED_ORTHO_LOCI_DATA = "cesar_precomputed_orthologous_loci.tsv" + f = open(self.precomp_query_loci_path, "r") + for elem in f: + elem_data = elem.split("\t") + transcript = elem_data[1] + chain = int(elem_data[2]) + projection = f"{transcript}.{chain}" + search_locus = elem_data[3] + chrom, s_e = search_locus.split(":") + s_e_split = s_e.split("-") + start, end = int(s_e_split[0]), int(s_e_split[1]) + projection_to_search_loc[(transcript, chain)] = (chrom, start, end) + f.close() + trans_to_strand = self.__get_transcript_to_strand() + chain_to_qstrand = self.__get_chain_to_qstrand() + # 3 - save bed 12 + f = open(out_bed, "w") + # print(projection_to_search_loc) + for (transcript, chain), exons in projection_to_exons.items(): + # print(transcript, chain) + projection = f"{transcript}.{chain}" + trans_strand = trans_to_strand[transcript] + chain_strand = 
chain_to_qstrand[chain] + to_invert = trans_strand != chain_strand + + exons_sort = sorted(exons, key=lambda x: x[0]) + if to_invert: + exons_sort = exons_sort[::-1] + + # crash here + search_locus = projection_to_search_loc[(transcript, chain)] + chrom = search_locus[0] + search_start = search_locus[1] + search_end = search_locus[2] + + if to_invert: + bed_start = search_end - exons_sort[0][1] + bed_end = search_end - exons_sort[-1][0] + else: + bed_start = exons_sort[0][0] + search_start + bed_end = exons_sort[-1][1] + search_start + + block_sizes, block_starts = [], [] + for exon in exons_sort: + abs_start_in_s, abs_end_in_s = exon + # print(exon) + + if to_invert: + abs_start = search_end - abs_end_in_s + abs_end = search_end - abs_start_in_s + else: + abs_start = abs_start_in_s + search_start + abs_end = abs_end_in_s + search_start + + # print(abs_start, abs_end) + + rel_start = abs_start - bed_start + block_size = abs_end - abs_start + block_sizes.append(block_size) + block_starts.append(rel_start) + block_sizes_field = ",".join(map(str, block_sizes)) + block_starts_field = ",".join(map(str, block_starts)) + all_fields = ( + chrom, + bed_start, + bed_end, + projection, + 0, + 0, + bed_start, + bed_end, + "0,0,0", + len(exons), + block_sizes_field, + block_starts_field, + ) + f.write("\t".join(map(str, all_fields))) + f.write("\n") + f.close() + f.close() def __precompute_data_for_opt_cesar(self): """Precompute memory data for optimised CESAR. @@ -977,28 +1133,56 @@ def __precompute_data_for_opt_cesar(self): """ if not self.cesar_is_opt: return # in case we use standard CESAR this procedure is not needed + + # need gene: chains + # artificial bed file with all possible exons per gene, maybe the longest if intersection... or not? + # what if there is a giant exon that does not align but smaller versions do? 
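+        # What follows, in order (descriptive summary of the code below):
+        #   1) create dirs for per-part memory, precomputed-region and
+        #      orthologous-locus files (all registered as temp files);
+        #   2) split the gene-to-chains table into NUM_CESAR_MEM_PRECOMP_JUBS pieces;
+        #   3) build one precompute command per piece (--ro / --ol outputs) and
+        #      push the joblist via para or nextflow;
+        #   4) merge the per-part outputs; with --output_opt_cesar_regions,
+        #      additionally fold the exon regions into a bed file and exit.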
print("COMPUTING MEMORY REQUIREMENTS FOR OPTIMISED CESAR") mem_dir = os.path.join(self.wd, CESAR_PRECOMPUTED_MEMORY_DIRNAME) + self.precomp_reg_dir = os.path.join(self.wd, CESAR_PRECOMPUTED_REGIONS_DIRNAME) chain_class_temp_dir = os.path.join(self.wd, TEMP_CHAIN_CLASS) + self.precomp_query_loci_dir = os.path.join( + self.wd, CESAR_PRECOMPUTED_ORTHO_LOCI_DIRNAME + ) + self.precomp_query_loci_path = os.path.join( + self.temp_wd, CESAR_PRECOMPUTED_ORTHO_LOCI_DATA + ) + os.mkdir(mem_dir) if not os.path.isdir(mem_dir) else None os.mkdir(chain_class_temp_dir) if not os.path.isdir( chain_class_temp_dir ) else None + os.mkdir(self.precomp_reg_dir) if not os.path.isdir( + self.precomp_reg_dir + ) else None + os.mkdir(self.precomp_query_loci_dir) if not os.path.isdir( + self.precomp_query_loci_dir + ) else None + self.temp_files.append(mem_dir) self.temp_files.append(chain_class_temp_dir) + self.temp_files.append(self.precomp_reg_dir) + self.temp_files.append(self.precomp_query_loci_dir) + # split file containing gene: orthologous/paralogous chains into 100 pieces - class_pieces = self.__split_file(self.orthologs, chain_class_temp_dir, 100) + class_pieces = self.__split_file( + self.orthologs, chain_class_temp_dir, NUM_CESAR_MEM_PRECOMP_JUBS + ) precompute_jobs = [] # for nextflow abspath is better: precomp_abspath = os.path.abspath(self.PRECOMPUTE_OPT_CESAR_DATA) - cesar_bin_abspath = os.path.abspath(self.cesar_binary) + # cesar_bin_abspath = os.path.abspath(self.cesar_binary) + # create joblist: one job per piece for num, class_piece in enumerate(class_pieces, 1): - out_path = os.path.join(mem_dir, f"part_{num}") + out_m_path = os.path.join(mem_dir, f"part_{num}") + out_reg_path = os.path.join(self.precomp_reg_dir, f"part_{num}") + out_ol_path = os.path.join(self.precomp_query_loci_dir, f"path_{num}") + cmd = ( - f"{precomp_abspath} {class_piece} {self.wd} {cesar_bin_abspath} " - f"{self.t_2bit} {self.q_2bit} {out_path}" + f"{precomp_abspath} {class_piece} {self.wd} {self.opt_cesar_binary} " + f"{self.t_2bit} {self.q_2bit} {out_m_path} --ro {out_reg_path} --ol {out_ol_path}" ) precompute_jobs.append(cmd) # save joblist @@ -1016,7 +1200,7 @@ def __precompute_data_for_opt_cesar(self): print(f"Project path: {project_path}") if self.para: # run jobs with para, skip nextflow - cmd = f'para make {project_name} {precomp_mem_joblist} -q="short"' + cmd = f'para make {project_name} {precomp_mem_joblist} -q="shortmed"' print(f"Calling {cmd}") rc = subprocess.call(cmd, shell=True) else: # calling jobs with nextflow @@ -1038,8 +1222,18 @@ def __precompute_data_for_opt_cesar(self): shutil.rmtree(project_path) if os.path.isdir(project_path) else None # merge files and quit self.__merge_dir(mem_dir, self.precomp_mem_cesar) + # self.__merge_dir(precomp_reg_dir, self.precomp_reg_cesar) self.cesar_mem_was_precomputed = True + if self.output_opt_cesar_regions: + save_to = os.path.join(self.wd, "precomp_regions_for_cesar.bed") + exon_regions_df = os.path.join(self.wd, "precomp_regions_exons.tsv") + self.__merge_dir(self.precomp_reg_dir, exon_regions_df) + self.__merge_dir(self.precomp_query_loci_dir, self.precomp_query_loci_path) + self.__fold_exon_data(exon_regions_df, save_to) + print(f"Bed file containing precomputed regions is saved to: {save_to}") + sys.exit(0) + def __split_cesar_jobs(self): """Call split_exon_realign_jobs.py.""" if not self.t_2bit or not self.q_2bit: @@ -1073,9 +1267,14 @@ def __split_cesar_jobs(self): self.cesar_combined = os.path.join(self.temp_wd, "cesar_combined") self.cesar_results = 
os.path.join(self.temp_wd, "cesar_results") self.cesar_bigmem_jobs = os.path.join(self.temp_wd, "cesar_bigmem") + self.temp_files.append(self.cesar_jobs_dir) self.temp_files.append(self.cesar_combined) self.temp_files.append(self.predefined_glp_cesar_split) + self.temp_files.append(self.technical_cesar_err) + os.mkdir(self.technical_cesar_err) if not os.path.isdir( + self.technical_cesar_err + ) else None # different field names depending on --ml flag @@ -1083,6 +1282,9 @@ def __split_cesar_jobs(self): self.temp_files.append(self.gene_loss_data) skipped_path = os.path.join(self.rejected_dir, "SPLIT_CESAR.txt") self.paralogs_log = os.path.join(self.temp_wd, "paralogs.txt") + cesar_binary_to_use = ( + self.opt_cesar_binary if self.cesar_is_opt else self.cesar_binary + ) split_cesar_cmd = ( f"{self.SPLIT_EXON_REALIGN_JOBS} " @@ -1099,15 +1301,17 @@ def __split_cesar_jobs(self): f"--chains_limit {self.cesar_chain_limit} " f"--skipped_genes {skipped_path} " f"--rejected_log {self.rejected_dir} " - f"--cesar_binary {self.cesar_binary} " + f"--cesar_binary {cesar_binary_to_use} " f"--paralogs_log {self.paralogs_log} " f"--uhq_flank {self.uhq_flank} " - f"--predefined_glp_class_path {self.predefined_glp_cesar_split}" + f"--predefined_glp_class_path {self.predefined_glp_cesar_split} " + f"--unprocessed_log {self.technical_cesar_err}" ) if self.annotate_paralogs: # very rare occasion split_cesar_cmd += f" --annotate_paralogs" - + if self.mask_all_first_10p: + split_cesar_cmd += f" --mask_all_first_10p" # split_cesar_cmd = split_cesar_cmd + f" --cesar_binary {self.cesar_binary}" \ # if self.cesar_binary else split_cesar_cmd split_cesar_cmd = ( @@ -1131,6 +1335,7 @@ def __split_cesar_jobs(self): split_cesar_cmd += f" --fragments_data {fragm_dict_file}" if self.cesar_mem_was_precomputed: split_cesar_cmd += f" --precomp_memory_data {self.precomp_mem_cesar}" + split_cesar_cmd += f" --precomp_regions_data_dir {self.precomp_reg_dir}" self.__call_proc(split_cesar_cmd, "Could not split CESAR jobs!") def __get_cesar_jobs_for_bucket(self, comb_file, bucket_req): @@ -1255,7 +1460,7 @@ def __run_cesar_jobs(self): nf_project_path = os.path.join(self.nextflow_dir, nf_project_name) project_paths.append(nf_project_path) - # create subprocess object + # create subprocess object if not self.para: # create nextflow cmd os.mkdir(nf_project_path) if not os.path.isdir( nf_project_path @@ -1271,7 +1476,7 @@ def __run_cesar_jobs(self): if memory_mb > 0: cmd += f" --memoryMb={memory_mb}" p = subprocess.Popen(cmd, shell=True) - + sys.stderr.write(f"Pushed cluster jobs with {cmd}\n") # wait for the process if calling processes sequentially @@ -1389,8 +1594,11 @@ def __rebuild_crashed_jobs(self, crashed_jobs): chain_ids = self.__read_chain_arg(chains_arg) if chain_ids is None: continue - memlim_arg_ind = elem_args.index(MEMLIM_ARG) + 1 - mem_val = float(elem_args[memlim_arg_ind]) + try: + memlim_arg_ind = elem_args.index(MEMLIM_ARG) + 1 + mem_val = float(elem_args[memlim_arg_ind]) + except ValueError: + mem_val = self.cesar_mem_limit bucket_lim = self.__get_bucket_val(mem_val, buckets) if FRAGM_ARG in elem_args: @@ -1502,6 +1710,8 @@ def __check_cesar_completeness(self): p_objects.append(p) time.sleep(CESAR_PUSH_INTERVAL) self.__monitor_jobs(p_objects, project_paths, die_if_sc_1=True) + # TODO: maybe some extra sanity check + # need to check whether anything crashed again crashed_twice = [] for elem in err_log_files: @@ -1521,6 +1731,19 @@ def __check_cesar_completeness(self): err_msg = f"Some CESAR jobs crashed twice, please 
check {crashed_log}; Abort"
             self.die(err_msg, 1)

+    def __append_technical_err_to_predef_class(self, transcripts_path, out_path):
+        """Append technically unprocessable transcripts to the predefined classifications file."""
+        if not os.path.isfile(transcripts_path):
+            # in this case, we don't have transcripts with tech error
+            # can simply quit the function
+            return
+        with open(transcripts_path, "r") as f:
+            transcripts_list = [x.rstrip() for x in f]
+        f = open(out_path, "a")
+        for elem in transcripts_list:
+            f.write(f"TRANSCRIPT\t{elem}\tM\n")
+        f.close()
+
     def __merge_cesar_output(self):
         """Merge CESAR output, save final fasta and bed."""
         print("Merging CESAR output to make fasta and bed files.")
@@ -1538,6 +1761,17 @@
             self.trash_exons,
             fragm_data=self.bed_fragm_exons_data,
         )
+
+        # need to merge files containing transcripts that were not processed
+        # for some technical reason, such as intersecting fragments in the query
+        self.__merge_dir(
+            self.technical_cesar_err, self.technical_cesar_err_merged, ignore_empty=True
+        )
+
+        self.__append_technical_err_to_predef_class(
+            self.technical_cesar_err_merged, self.predefined_glp_cesar_split
+        )
+
         if len(all_ok) == 0:
             # there are no empty output files -> parsed without errors
             print("CESAR results merged")
@@ -1617,11 +1851,16 @@ def __orthology_type_map(self):
         )

     @staticmethod
-    def __merge_dir(dir_name, output):
+    def __merge_dir(dir_name, output, ignore_empty=False):
         """Merge all files in a directory into one."""
         files_list = os.listdir(dir_name)
-        if len(files_list) == 0:
+        if len(files_list) == 0 and ignore_empty is False:
             sys.exit(f"Error! {dir_name} is empty")
+        elif len(files_list) == 0 and ignore_empty is True:
+            # in this case we allow empty directories
+            # just remove the directory and return
+            shutil.rmtree(dir_name)
+            return
         buffer = open(output, "w")
         for filename in files_list:
             path = os.path.join(dir_name, filename)
@@ -1632,7 +1871,7 @@
             # buffer.write(lines)
             buffer.write(content)
         buffer.close()
-        shutil.rmtree(dir_name)
+        # shutil.rmtree(dir_name)

     def __check_crashed_cesar_jobs(self):
         """Check whether any CESAR jobs crashed.
@@ -1649,6 +1888,11 @@
             if "CESAR" not in line:
                 # not related to CESAR
                 continue
+            if "fragment chains oevrlap" in line:
+                # they are not really "crashed"
+                # those jobs could not produce a meaningful result
+                # only intersecting exons
+                continue
             # extract cesar wrapper command from the log
             cesar_cmd = line.split("\t")[0]
             self.crashed_cesar_jobs.append(cesar_cmd)
@@ -1771,9 +2015,6 @@ def parse_args():
         default=None,
         help="Find orthologs " "for a single reference chromosome only",
     )
-    # app.add_argument("--limit_to_query_chrom", default=None, help="Annotate "
-    #                  "a particular query scaffold/chromosome only")
-
     # nextflow related
     app.add_argument(
         "--nextflow_dir",
         "--nd",
@@ -1857,6 +2098,14 @@
         dest="using_optimized_cesar",
         help="Instead of CESAR, use lastz-based optimized version",
     )
+    app.add_argument(
+        "--output_opt_cesar_regions",
+        "--oocr",
+        action="store_true",
+        dest="output_opt_cesar_regions",
+        help="If optimised CESAR is used, save the regions it would process "
+        "and quit. This parameter is intended for debugging only.",
+    )
     app.add_argument(
         "--mask_stops",
         "--ms",
@@ -1880,7 +2129,7 @@
         action="store_true",
         dest="cesar_exec_seq",
         help="Execute different CESAR jobs partitions sequentially, "
-        "not in parallel."
+ "not in parallel.", ) app.add_argument( "--cesar_chain_limit", @@ -1946,13 +2195,30 @@ def parse_args(): dest="annotate_paralogs", action="store_true", help="Annotate paralogs instead of orthologs. " - "(experimental feature for very specific needs)", + "(experimental feature for very specific needs)", + ) + app.add_argument( + "--mask_all_first_10p", + "--m_f10p", + action="store_true", + dest="mask_all_first_10p", + help="Automatically mask all inactivating mutations in first 10 percent of " + "the reading frame, ignoring ATG codons distribution. " + "(Default mode in V1.0, not recommended to use in later versions)", ) # print help if there are no args if len(sys.argv) < 2: app.print_help() sys.exit(0) args = app.parse_args() + + # some sanity checks + if args.output_opt_cesar_regions and not args.using_optimized_cesar: + err_msg = ( + "Error! Please use --output_opt_cesar_regions parameter " + " with --using_optimized_cesar flag only" + ) + sys.exit(err_msg) return args diff --git a/ucsc_browser_visualisation/bb_schema.as b/ucsc_browser_visualisation/bb_schema.as new file mode 100644 index 0000000..d867477 --- /dev/null +++ b/ucsc_browser_visualisation/bb_schema.as @@ -0,0 +1,43 @@ +table togaBigBed +"TOGA predicted gene model" +( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Name or ID of item, ideally both human readable and unique" + uint score; "Score (0-1000)" + char[1] strand; "+ or - for strand" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint itemRgb; "RGB value (use R,G,B string in input file)" + int blockCount; "Number of blocks" + int[blockCount] blockSizes; "Comma separated list of block sizes" + int[blockCount] chromStarts; "Start positions relative to chromStart" + + string ref_trans_id; "Reference transcript ID" + string ref_region; "Transcript region in the reference" + string query_region; "Region in the query" + float chain_score; "Chain orthology probability score" + + float chain_synteny; "Chain synteny log10 value" + float chain_flank; "Chain flank feature" + float chain_gl_cds_fract; "Chain global CDS fraction value" + float chain_loc_cds_fract; "Chain local CDS fraction value" + float chain_exon_cov; "Chain exon coverage value" + + float chain_intron_cov; "Chain intron coverage value" + string status; "Gene loss classification" + float perc_intact_ign_M; "% intact ignoring missing" + float perc_intact_int_M; "% intact considering missing as intact" + float intact_codon_prop; "% intact codons" + + float ouf_prop; "% out of chain" + string mid_intact; "Is middle 80% intact" + string mid_pres; "Is middle 80% fully present" + lstring prot_alignment; "HTML-formatted protein alignment" + lstring svg_line; "SVG inactivating mutations visualization" + + lstring ref_link; "Reference transcript link" + lstring inact_mut_html_table; "HTML-formatted inactivating mutations table" + lstring exon_ali_html; "HTML-formatted exon alignment" +) diff --git a/ucsc_browser_visualisation/generate_tab_files.py b/ucsc_browser_visualisation/generate_tab_files.py index 6933d88..c17c583 100755 --- a/ucsc_browser_visualisation/generate_tab_files.py +++ b/ucsc_browser_visualisation/generate_tab_files.py @@ -16,8 +16,8 @@ "INTACT_PERC_INTACT_M", "INTACT_CODONS_PROP", "OUT_OF_CHAIN_PROP", - "MIDDLE_80%_INTACT", - "MIDDLE_80%_PRESENT", + "MIDDLE_IS_INTACT", + 
"MIDDLE_IS_PRESENT", ] # for assembled from fragments: we cannot get chain class features FRAGM_FEATS = (0, 0.0, 0.0, 0.0, 0.0, 0.0) @@ -61,6 +61,11 @@ def parse_args(): "data from default output fasta files. Will be " "the default behaviour in the future"), ) + app.add_argument("--no_plots", + "--np", + dest="no_plots", + action="store_true", + help="If inactivating mutation plots are already generated, do not recreate them") if len(sys.argv) < 3: app.print_help() sys.exit(0) @@ -449,7 +454,7 @@ def get_seq_data_from_fasta(wd, all_projections): return [], projection_to_prot_ali, projection_to_codon_ali -def get_sequence_data(wd, all_projections): +def get_sequence_data(wd, all_projections, exon_to_stat): """Parse nucleotide and protein sequence data.""" print("Reading sequence data") gigafasta = os.path.join(wd, TEMP, CESAR_RESULTS) @@ -525,6 +530,7 @@ def get_sequence_data(wd, all_projections): ali_class = header_data[7] exp_range = header_data[8] in_exp = ONE_S if header_data[9] == "INC" else ZERO_S + is_del_or_no = exon_to_stat.get(exon_id, "I") exon_data = ( location, pid, @@ -533,6 +539,7 @@ def get_sequence_data(wd, all_projections): ali_class, exp_range, in_exp, + is_del_or_no, # added 26 Aug 2022 sequence, ) query_exon_to_nucl_data[exon_id] = exon_data @@ -604,6 +611,7 @@ def get_inact_data(wd, all_projections): projection_to_inact_muts = [] proj_to_inact_features = defaultdict(dict) proj_to_inact_features_rows = [] + exon_to_del_miss = {} f = open(inact_mut_file, "r") for line in f: if not line.startswith("#"): @@ -614,25 +622,35 @@ def get_inact_data(wd, all_projections): projection = f"{trans}.{chain}" if projection not in all_projections: continue + if len(line_data) == 8: # inactivating mutation data + + # IF Deleted exon OR Missing exon -> also save the data + # needed in exon ali visualizations mask_field = line_data[6] is_inact = ZERO_S if mask_field == "masked" else ONE_S line_data[6] = is_inact + mut_trimmed = line_data[5][:20] + line_data[5] = mut_trimmed mut_track = line_data[2:] sql_row = (projection, *mut_track) projection_to_inact_muts.append(sql_row) + if line_data[4] == "Deleted exon": + exon_to_del_miss[(projection, int(line_data[2]))] = "D" + elif line_data[4] == "Missing exon": + exon_to_del_miss[(projection, int(line_data[2]))] = "M" continue else: # features feat, val_ = line_data[2].split() # boolean features to num (for SQL table) - if feat == "MIDDLE_80%_PRESENT": + if feat == "MIDDLE_IS_PRESENT": if val_ == "TRUE": val = ONE_S else: val = ZERO_S - elif feat == "MIDDLE_80%_INTACT": + elif feat == "MIDDLE_IS_INTACT": if val_ == "TRUE": val = ONE_S else: @@ -657,7 +675,7 @@ def get_inact_data(wd, all_projections): vals.append(val) sql_row = (proj, *vals) proj_to_inact_features_rows.append(sql_row) - return proj_to_inact_features_rows, projection_to_inact_muts + return proj_to_inact_features_rows, projection_to_inact_muts, exon_to_del_miss def save_toga_inact_mut_tab(out_dir, inact_mut_data): @@ -694,16 +712,17 @@ def main(): proj_to_chain_score = get_chain_scores(args.wd, all_projections) proj_to_chain_features = get_chain_features(args.wd, all_projections) projection_to_loss_class = get_projection_class(args.wd) + proj_to_inact_feat, proj_to_inact_mut, exon_to_stat = get_inact_data(args.wd, all_projections) + if args.no_raw_cesar_output: raise NotImplementedError("Cancelled branch") seq_data = get_seq_data_from_fasta(args.wd, all_projections) exit() else: - seq_data = get_sequence_data(args.wd, all_projections) + seq_data = get_sequence_data(args.wd, 
all_projections, exon_to_stat) projection_exon_data = seq_data[0] projection_to_prot_ali = seq_data[1] projection_to_codon_ali = seq_data[2] - proj_to_inact_feat, proj_to_inact_mut = get_inact_data(args.wd, all_projections) save_toga_info_tab( args.output, all_projections, diff --git a/ucsc_browser_visualisation/get_names_from_bed.py b/ucsc_browser_visualisation/get_names_from_bed.py new file mode 100755 index 0000000..1cc316e --- /dev/null +++ b/ucsc_browser_visualisation/get_names_from_bed.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +"""Just extract names from toga output bed file. + +Works like xenoRefGenelx.pl""" +import sys + +if len(sys.argv) != 2: + to_read = None + sys.exit(f"Usage: {sys.argv[0]} [query_annotation.bed] | sort -u > ix.txt") +else: + to_read = sys.argv[1] + +f = open(to_read, "r") +for line in f: + id_field = line.rstrip().split("\t")[3] + no_chain_id = ".".join(id_field.split(".")[:-1]) + dot_split = no_chain_id.split(".") + if len(dot_split) > 1: + to_out = [id_field, no_chain_id] + dot_split + else: + to_out = [id_field, no_chain_id] + line = "\t".join(to_out) + print(line) +f.close() diff --git a/ucsc_browser_visualisation/make_bigbed_data_public.py b/ucsc_browser_visualisation/make_bigbed_data_public.py new file mode 100755 index 0000000..8a2a118 --- /dev/null +++ b/ucsc_browser_visualisation/make_bigbed_data_public.py @@ -0,0 +1,608 @@ +#!/usr/bin/env python3 +"""Perform all operations to create BigBed file for UCSC browser.""" +import os +from re import sub +import sys +import argparse +import subprocess +from collections import defaultdict +import getpass + + +GENERATE_TAB_FILES = "generate_tab_files.py" +GENERATE_PLOTS = "make_togaPlot.py" +LOCATION = os.path.dirname(__file__) +SCHEMA_LOCATION = os.path.join(LOCATION, "bb_schema.as") + +IFEAT_PLACEHOLDER = ["0.0" for _ in range(6)] +PROT_PLACEHOLDER = ["NO_DATA"] +UNDEF = "UNDEFINED" +SVG_PLACEHOLDER = """ + + + + +""" +PLOT_PLACEHOLDER = [SVG_PLACEHOLDER, ] + + +def parse_args(): + """Parse args.""" + app = argparse.ArgumentParser() + app.add_argument("project_dir", help="Directory containing TOGA output") + app.add_argument("--do_not_cleanup", + action="store_true", + dest="do_not_cleanup", + help="Do not clean tabs dir up.") + app.add_argument("--bb_version", + default="v1", + help="Version tag") + app.add_argument("--no_plots", + "--np", + dest="no_plots", + action="store_true", + help="If inactivating mutation plots are already generated, do not recreate them") + # TODO: chroms sizes if not standard or not HL + if len(sys.argv) < 2: + app.print_help() + sys.exit(0) + args = app.parse_args() + return args + + +def call_gen_tab_files(project_dir): + """Call Generate tab files subprocess.""" + print(f"Calling {GENERATE_TAB_FILES}") + out_dir = os.path.join(project_dir, "tabs") + os.mkdir(out_dir) if not os.path.isdir(out_dir) else None + exe_ = os.path.join(LOCATION, GENERATE_TAB_FILES) + cmd = f"{exe_} {project_dir} {out_dir}" + rc = subprocess.call(cmd, shell=True) + if rc != 0: + raise ValueError(f"Command {cmd} died - cannot generate tab files") + print("5 of 6 tab files generated") + + +def call_gen_plot_files(project_dir, no_plots=False): + """Generate svg plots.""" + print(f"Calling {GENERATE_PLOTS}") + out_file = os.path.join(project_dir, "tabs", "togaPlot.tab") + if no_plots is True and os.path.isfile(out_file): + print(f"--no_plots flag is on, found {out_file} --> skip making plots") + return + elif no_plots is True and not os.path.isfile(out_file): + print(f"Error! 
--no_plots flag is on, but {out_file} not found\nAbort") + sys.exit(1) + exe_ = os.path.join(LOCATION, GENERATE_PLOTS) + cmd = f"{exe_} {project_dir} {out_file}" + rc = subprocess.call(cmd, shell=True) + if rc != 0: + raise ValueError(f"Command {cmd} died - cannot generate plots") + print("All tab files generated") + + +def read_tab_file(tab_file): + """Make trans_id: data track.""" + trans_to_dat = {} + f = open(tab_file, "r") + for line in f: + line_data = line.rstrip().split("\t") + trans = line_data[0] + data = line_data[1:] + trans_to_dat[trans] = data + f.close() + return trans_to_dat + + + +def __gen_info_placeholder(p_name): + """Generate info placeholder.""" + t_name = ".".join(p_name.split(".")[:-1]) + placeholder = [t_name, UNDEF, UNDEF, "0.0", "0", "0.0", "0.0", "0.0", "0.0", "0.0", UNDEF] + return placeholder + + +def make_toga_data(project_dir): + """Merge togaPlot, togaProt, togaInfo and togaInactFeat tables into togaData.""" + tabs_dir = os.path.join(project_dir, "tabs") + toga_info_file = os.path.join(tabs_dir, "togaInfo.tab") + toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab") + toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab") + toga_prot_file = os.path.join(tabs_dir, "togaProt.tab") + out_file = os.path.join(tabs_dir, "togaData.tab") + + print("Reading tab files to merge") + trans_to_info = read_tab_file(toga_info_file) + trans_to_ifeat = read_tab_file(toga_inact_feat_file) + trans_to_plot = read_tab_file(toga_plot_file) + trans_to_prot = read_tab_file(toga_prot_file) + + transcripts = trans_to_info.keys() + print(f"Got data for {len(transcripts)} transcripts") + + f = open(out_file, "w") + for t in transcripts: + info = trans_to_info.get(t) + if info is None: + info = __gen_info_placeholder(t) + ifeat = trans_to_ifeat.get(t, IFEAT_PLACEHOLDER.copy()) + prot = trans_to_prot.get(t, PROT_PLACEHOLDER.copy()) + plot = trans_to_plot.get(t, PLOT_PLACEHOLDER.copy()) + data_lst = [t] + info + ifeat + prot + plot + tab_str = "\t".join(data_lst) + f.write(tab_str) + f.write("\n") + f.close() + # do cleanup + os.remove(toga_info_file) + os.remove(toga_inact_feat_file) + os.remove(toga_plot_file) + os.remove(toga_prot_file) + print("Done") + + +def _check_seq_of_intervals_intersect(intervals): + """Check whether any interval in the seq intersects""" + intervals_num = len(intervals) + for i in range(intervals_num - 1): + # (start, end) + curr_one = intervals[i] + next_one = intervals[i + 1] + # sorted by beginning + # if start of the next < end of the curr + # -> they intersect + if next_one[0] < curr_one[1]: + return (curr_one[0], curr_one[1]) + return None # nothing suspicious found + + +def _filter_bed(in_bed, out_bed): + """Filter out bed entries with: + - None chromosome + - Intersecting blocks.""" + _in = open(in_bed, "r") + _out = open(out_bed, "w") + for line in _in: + ld = line.rstrip().split("\t") + chrom = ld[0] + if chrom == "None": + continue + block_sizes = [int(x) for x in ld[10].split(",") if x != ""] + block_starts = [int(x) for x in ld[11].split(",") if x != ""] + block_count = len(block_starts) + if block_count == 1: + # no need to check whether exons intersect if there is only 1 exon + _out.write(line) + continue + block_ends = [block_starts[i] + block_sizes[i] for i in range(block_count)] + exon_ranges = sorted(zip(block_starts, block_ends), key=lambda x: x[0]) + ranges_intersect = _check_seq_of_intervals_intersect(exon_ranges) + if ranges_intersect: + continue + _out.write(line) + _in.close() + _out.close() + + +def 
make_sorted_bed(project_dir):
+    """For bigBed, we need a properly sorted bed file."""
+    non_sorted_bed = os.path.join(project_dir, "query_annotation.bed")
+    tabs_dir = os.path.join(project_dir, "tabs")
+    bed_no_none = os.path.join(project_dir, "query_annotation_intact_chroms.bed")
+    _filter_bed(non_sorted_bed, bed_no_none)
+    sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed")
+    cmd = f"sort -k1,1 -k2,2n {bed_no_none} > {sorted_bed}"
+    subprocess.call(cmd, shell=True)
+    # sorted bed file, good
+
+
+def __get_ts_from_bed(bed):
+    """Get transcript IDs from bed file."""
+    f = open(bed, 'r')
+    ts = []
+    ts_to_bed_line = {}
+    for line in f:
+        ld = line.rstrip().split("\t")
+        t = ld[3]
+        ts.append(t)
+        ts_to_bed_line[t] = ld
+    f.close()
+    return ts, ts_to_bed_line
+
+
+def __get_ts_to_toga_data(toga_data_file):
+    """Get transID: togaData fields."""
+    ret = {}
+    f = open(toga_data_file, "r")
+    for line in f:
+        ld = line.rstrip().split("\t")
+        t = ld[0]
+        fields = ld[1:]
+        ret[t] = fields
+    f.close()
+    return ret
+
+
+def __get_ts_to_toga_inact(toga_inact_file):
+    """Read inact mut data."""
+    ts_to_exon_to_data = defaultdict(dict)
+    f = open(toga_inact_file, 'r')
+    for line in f:
+        line_data = line.rstrip().split("\t")
+        ts = line_data[0]
+        exon_num = int(line_data[1])
+        rest = line_data[2:]
+        ts_to_exon_to_data[ts][exon_num] = rest
+    f.close()
+    # now: TS to HTML_formatted line
+    trans_to_inact_line = {}
+    for t in ts_to_exon_to_data.keys():
+        exon_nums_unsort = ts_to_exon_to_data[t].keys()
+        exon_nums = sorted(exon_nums_unsort)
+        t_lines = []
+        for exon_num in exon_nums:
+            inact_mut_data = ts_to_exon_to_data[t][exon_num]
+            """
+            transcript varchar(50) not null,  -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
+            exon_num int unsigned not null,  -- exon number
+            position int unsigned not null,  -- position where mutation happened
+            mut_class varchar(15) not null,  -- mutation class such as FS deletion
+            mutation varchar(20) not null,  -- what exactly happened
+            is_inact tinyint unsigned not null,  -- is this mutation inactivating, yes 1 or not 0
+            mut_id varchar(10) not null  -- mut identifier
+            """
+            # template:
+            """
+            struct togaInactMut *info = NULL;
+            info = togaInactMutLoad(row);
+            printf("<TR>\n");
+            printf("<TD>%s</TD>\n", info->exon_num);
+            printf("<TD>%s</TD>\n", info->position);
+            printf("<TD>%s</TD>\n", info->mut_class);
+            printf("<TD>%s</TD>\n", info->mutation);
+            if (sameWord(info->is_inact, ONE_)){
+                printf("<TD>%s</TD>\n", YES_);
+            } else {
+                printf("<TD>%s</TD>\n", NO_);
+            }
+            printf("<TD>%s</TD>\n", info->mut_id);
+            printf("</TR>\n");
+            togaInactMutFree(&info);
+            """
+            position = inact_mut_data[0]
+            mut_class = inact_mut_data[1]
+            mutation = inact_mut_data[2]
+            is_inact = "YES" if inact_mut_data[3] == "1" else "NO"
+            mut_id = inact_mut_data[4]
+            lines = []
+            lines.append("<TR>")
+            lines.append(f"<TD>{exon_num}</TD>")
+            lines.append(f"<TD>{position}</TD>")
+            lines.append(f"<TD>{mut_class}</TD>")
+            lines.append(f"<TD>{mutation}</TD>")
+            lines.append(f"<TD>{is_inact}</TD>")
+            lines.append(f"<TD>{mut_id}</TD>")
+            lines.append("</TR>")
+            inact_mut_line = "".join(lines)
+            t_lines.append(inact_mut_line)
+        t_line = "".join(t_lines)
+        trans_to_inact_line[t] = t_line
+    return trans_to_inact_line
+
+
+def __get_ts_to_toga_exons(toga_exons_file):
+    """Get transID: HTML-formatted exon data line."""
+    trans_to_exon_line = {}
+    """Table schema
+
+    transcript varchar(50) not null,  -- unique projection ID: ${ref_transcript_ID}.${chain_ID}
+    exon_num int unsigned not null,  -- exon number
+    exon_region varchar(100) not null,  -- region where exon was detected
+    pid float not null,  -- nucleotide %id
+    blosum float not null,  -- normalized blosum score
+    gaps tinyint unsigned not null,  -- are there any asm gaps near? 1 - yes 0 - no
+    ali_class varchar(4) not null,  -- alignment class: A, B, C, A+
+    exp_region varchar(50) not null,  -- where exon was expected
+    in_exp_region tinyint unsigned not null,  -- detected in expected region or not 1 yes 0 no
+    is_del_or_no char  -- D - deleted, M - missing, I - intact
+    alignment longblob not null  -- exon sequence in query
+    """
+    ts_to_exon_to_data = defaultdict(dict)
+    f = open(toga_exons_file, "r")
+    for line in f:
+        ld = line.rstrip().split("\t")
+        t = ld[0]
+        ex_num = int(ld[1])
+        datum = ld[2:]
+        ts_to_exon_to_data[t][ex_num] = datum
+    f.close()
+
+    for transcript in ts_to_exon_to_data.keys():
+        exons_unsorted = ts_to_exon_to_data[transcript].keys()
+        exon_nums = sorted(exons_unsorted)
+        transcript_lines = []
+        for exon_num in exon_nums:
+            exon_lines = []
+            datum = ts_to_exon_to_data[transcript][exon_num]
+            """To convert this:
+            struct togaNucl *info = NULL;
+            info = togaNuclLoad(row);
+            printf("<B>Exon number: %s</B><BR>\n", info->exon_num);
+            printf("Exon region: %s<BR>\n", info->exon_region);
+            printf("Nucleotide percent identity: %s | BLOSUM: %s<BR>\n", info->pid, info->blosum);
+            if (sameWord(info->gaps, ONE_)){
+                printf("Intersects assembly gaps: %s<BR>\n", YES_);
+            } else {
+                printf("Intersects assembly gaps: %s<BR>\n", NO_);
+            }
+            printf("Exon alignment class: %s<BR>\n", info->ali_class);
+            if (sameWord(info->in_exp_region, ONE_)){
+                printf("Detected within expected region (%s): %s<BR>\n", info->exp_region, YES_);
+            } else {
+                printf("Detected within expected region (%s): %s<BR>\n", info->exp_region, NO_);
+            }
+            // printf("Expected region: %s<BR>\n", info->exp_region);
+            printf("<BR>\n");
+            printf("Sequence alignment between reference and query exon:<BR>\n");
+            printf("%s<BR>\n", info->alignment);
+            togaNuclFree(&info);
+            """
+            exon_region = datum[0]
+            pid = datum[1]
+            blosum = datum[2]
+            gaps = "YES" if datum[3] == "1" else "NO"
+            ali_class = datum[4]
+            exp_reg = datum[5]
+            in_exp_region = "YES" if datum[6] == "1" else "NO"
+            is_del_or_no = datum[7]
+            ali = datum[8]
+
+            if is_del_or_no == "D":
+                exon_lines.append(f"<B>Exon number: {exon_num} - Deleted</B><BR>")
+            elif is_del_or_no == "M":
+                exon_lines.append(f"<B>Exon number: {exon_num} - Missing</B><BR>")
+            else:
+                exon_lines.append(f"<B>Exon number: {exon_num}</B><BR>")
+
+            exon_lines.append(f"Exon region: {exon_region}<BR>")
+            exon_lines.append(f"Nucleotide percent identity: {pid} | BLOSUM: {blosum}<BR>")
+            exon_lines.append(f"Intersects assembly gaps: {gaps}<BR>")
+            exon_lines.append(f"Exon alignment class: {ali_class}<BR>")
+            exon_lines.append(f"Detected within expected region ({exp_reg}): {in_exp_region}<BR>")
+            exon_lines.append("<BR>")
+            exon_lines.append("Sequence alignment between reference and query exon:<BR>")
+            exon_lines.append(f"{ali}<BR>")
+            exon_line = "".join(exon_lines)
+            transcript_lines.append(exon_line)
+        transcript_line = "".join(transcript_lines)
+        trans_to_exon_line[transcript] = transcript_line
+    return trans_to_exon_line
+
+
+def __get_ref_transcript(q_trans):
+    """Trim the chain id from a projection ID to get the reference transcript."""
+    return ".".join(q_trans.split(".")[:-1])
+
+
+def get_ref_ts_to_link(p_dir):
+    """Get ref_transcript: link dict."""
+    ret = {}
+    ref_2bit_link = os.path.join(p_dir, "t2bit.link")
+    ref_2bit_path = os.readlink(ref_2bit_link)
+    ref_2bit_basename = os.path.basename(ref_2bit_path)
+    ref_genome_gname = ".".join(ref_2bit_basename.split(".")[:-1])
+    ref_toga_dir = f"/projects/hillerlab/genome/gbdb-HL/{ref_genome_gname}/TOGA/"
+    ref_ts_to_link_path = os.path.join(ref_toga_dir, "toga.transcript2Link.txt")
+    # dirname = os.path.dirname(ref_2bit_path)
+    # chrom_sizes_path = os.path.join(dirname, "chrom.sizes")
+    if not os.path.isfile(ref_ts_to_link_path):
+        return ret, ref_genome_gname
+    f = open(ref_ts_to_link_path, "r")
+    for line in f:
+        ld = line.rstrip().split("\t")
+        ret[ld[0]] = ld[1]
+    f.close()
+    return ret, ref_genome_gname
+
+
+def get_que_chrom_sizes(p_dir):
+    """Locate the query chrom.sizes file next to the linked query 2bit."""
+    que_2bit_link = os.path.join(p_dir, "q2bit.link")
+    que_2bit_path = os.readlink(que_2bit_link)
+    dirname = os.path.dirname(que_2bit_path)
+    chrom_sizes_path = os.path.join(dirname, "chrom.sizes")
+    que_2bit_basename = os.path.basename(que_2bit_path)
+    que_genome_gname = ".".join(que_2bit_basename.split(".")[:-1])
+    return chrom_sizes_path, que_genome_gname
+
+
+def merge_all_tables(project_dir, r_ts_to_link):
+    """Merge all tab files into the ready-to-merge bed material."""
+    print("Joining tsv for bigbed")
+    tabs_dir = os.path.join(project_dir, "tabs")
+    sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed")
+    toga_data_file = os.path.join(tabs_dir, "togaData.tab")
+    toga_inact_file = os.path.join(tabs_dir, "togaInactMut.tab")
+    toga_nucl_file = os.path.join(tabs_dir, "togaNucl.tab")
+    transcripts_ordered, t_to_bed = __get_ts_from_bed(sorted_bed)
+    ts_to_data_track = __get_ts_to_toga_data(toga_data_file)
+    ts_to_inact_data = __get_ts_to_toga_inact(toga_inact_file)
+    ts_to_exon_data = __get_ts_to_toga_exons(toga_nucl_file)
+
+    bigbed_material = os.path.join(tabs_dir, "query_annot_for_big.tsv")
+    f = open(bigbed_material, "w")
+    for transcript in transcripts_ordered:
+        ref_trans = __get_ref_transcript(transcript)
+        bed_fields = t_to_bed[transcript]
+        data_track = ts_to_data_track[transcript]
+        inact_line = ts_to_inact_data.get(transcript, "")
+        exon_line = ts_to_exon_data[transcript]
+        link = r_ts_to_link.get(ref_trans, ref_trans)  # just reference transcript if no link
+        combined_datum_lst = bed_fields + data_track + [link] + [inact_line] + [exon_line]
+        combined_datum = "\t".join(combined_datum_lst)
+        f.write(f"{combined_datum}\n")
+    f.close()
+
+
+def sort_and_make_bb(project_dir, chrom_sizes):
+    """bed12+22 as the result.
+
+    sort -k1,1 -k2,2n query_annot_for_big.tsv > query_annot_for_big__sorted.tsv
+    bedToBigBed -type=bed12+22 query_annot_for_big__sorted.tsv
+    /projects/hillerlab/genome/gbdb-HL/mm10/chrom.sizes query_annot.bb -tab
+    """
+    print("Making bigbed...")
+    tabs_dir = os.path.join(project_dir, "tabs")
+    bigbed_material_not_sorted = os.path.join(tabs_dir, "query_annot_for_big.tsv")
+    bigbed_material = os.path.join(tabs_dir, "query_annot_for_big__sorted.tsv")
+    sort_cmd = f"sort -k1,1 -k2,2n {bigbed_material_not_sorted} > {bigbed_material}"
+    rc = subprocess.call(sort_cmd, shell=True)
+    if rc != 0:
+        sys.exit(f"Error! Command:\n{sort_cmd}\ncrashed")
+    output = os.path.join(tabs_dir, "query_annotation.bb")
+    # now transform into bigbed
+    print("Calling:")
+    bb_cmd = f"bedToBigBed -type=bed12+22 {bigbed_material} {chrom_sizes} {output} -tab -extraIndex=name -as={SCHEMA_LOCATION}"
+    print(bb_cmd)
+    rc = subprocess.call(bb_cmd, shell=True)
+    if rc != 0:
+        sys.exit(f"Error! Command:\n{bb_cmd}\ncrashed")
+
+    # make ix.txt
+    query_annot = os.path.join(project_dir, "query_annotation.bed")
+    make_ix_script = os.path.join(LOCATION, "get_names_from_bed.py")
+    ix_txt = os.path.join(tabs_dir, "query_annotation.ix.txt")
+    ix_cmd = f"{make_ix_script} {query_annot} | sort -u > {ix_txt}"
+    rc = subprocess.call(ix_cmd, shell=True)
+    if rc != 0:
+        sys.exit(f"Error! Command:\n{ix_cmd}\ncrashed")
+
+    # make .ix and .ixx indexes
+    ix_path = os.path.join(tabs_dir, "query_annotation.bb.ix")
+    ixx_path = os.path.join(tabs_dir, "query_annotation.bb.ixx")
+    ixixx_cmd = f"ixIxx {ix_txt} {ix_path} {ixx_path}"
+    rc = subprocess.call(ixixx_cmd, shell=True)
+    if rc != 0:
+        sys.exit(f"Error! 
Command:\n{ixixx_cmd}\ncrashed") + + +def cleanup(project_dir, dont): + """Clean project dir up.""" + if dont: # do not cleanup + return + tabs_dir = os.path.join(project_dir, "tabs") + bigbed_material_not_sorted = os.path.join(tabs_dir, "query_annot_for_big.tsv") + bigbed_material = os.path.join(tabs_dir, "query_annot_for_big__sorted.tsv") + toga_info_file = os.path.join(tabs_dir, "togaInfo.tab") + toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab") + toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab") + toga_prot_file = os.path.join(tabs_dir, "togaProt.tab") + sorted_bed = os.path.join(tabs_dir, "query_annot_sorted.bed") + toga_data_file = os.path.join(tabs_dir, "togaData.tab") + toga_inact_file = os.path.join(tabs_dir, "togaInactMut.tab") + toga_nucl_file = os.path.join(tabs_dir, "togaNucl.tab") + to_rm = [bigbed_material_not_sorted, bigbed_material, toga_info_file, + toga_inact_feat_file, toga_plot_file, toga_prot_file, + sorted_bed, toga_data_file, toga_inact_file, toga_nucl_file] + for path in to_rm: + os.remove(path) if os.path.isfile(path) else None + + +def __ssh_mkdir(uname, dirname): + cmd = f"ssh {uname}@genome mkdir -p {dirname}" + print(f"Calling {cmd}") + subprocess.call(cmd, shell=True) + + +def __ssh_ln(uname, src, dest): + """Call ln on delta""" + cmd = f"ssh {uname}@genome ln -sf {src} {dest}" + print(f"Calling {cmd}") + subprocess.call(cmd, shell=True) + + +def __get_ccase_refname(refname): + if refname == "hg38": + return "Hg38" + elif refname == "mm10": + return "Mm10" + elif refname == "mm39": + return "Mm39" + else: + return refname + + +def load_bb_track(project_dir, dont, ref, que, bb_ver): + if dont: # do not load anything + return + tabs_dir = os.path.join(project_dir, "tabs") + bb_file = os.path.join(tabs_dir, "query_annotation.bb") + uname = getpass.getuser() + # scp query_annotation.bb genome:/genome/gbdb-HL/$ref/TOGA/vs_$query/HLTOGAannotVs$refv1.bb + ref_dir__genome = f"/genome/gbdb-HL/{ref}" + toga_ref_dir__genome = f"{ref_dir__genome}/TOGA" + # make if needed + __ssh_mkdir(uname, toga_ref_dir__genome) + vs_que_dir__genome = f"{toga_ref_dir__genome}/vs_{que}" + __ssh_mkdir(uname, vs_que_dir__genome) + # transfer data + bb_file = os.path.join(tabs_dir, "query_annotation.bb") + ix_file = os.path.join(tabs_dir, "query_annotation.bb.ix") + ixx_file = os.path.join(tabs_dir, "query_annotation.bb.ixx") + + ref_camel_case = __get_ccase_refname(ref) + bb_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.bb" + ix_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.ix" + ixx_filename__genome = f"HLTOGAannotVs{ref_camel_case}{bb_ver}.ixx" + + bb_path_genome = f"{vs_que_dir__genome}/{bb_filename__genome}" + ix_path_genome = f"{vs_que_dir__genome}/{ix_filename__genome}" + ixx_path_genome = f"{vs_que_dir__genome}/{ixx_filename__genome}" + + rsync_cmd = f"rsync -av {bb_file} {uname}@genome:{bb_path_genome}" + subprocess.call(rsync_cmd, shell=True) + + rsync_cmd = f"rsync -av {ix_file} {uname}@genome:{ix_path_genome}" + subprocess.call(rsync_cmd, shell=True) + + rsync_cmd = f"rsync -av {ixx_file} {uname}@genome:{ixx_path_genome}" + subprocess.call(rsync_cmd, shell=True) + + # make link in /var/www + www_data__genome = "/var/www/data" + www_que__genome = f"{www_data__genome}/{que}" + __ssh_mkdir(uname, www_que__genome) + + www_bb_link_dest = f"{www_que__genome}/{bb_filename__genome}" + www_ix_link_dest = f"{www_que__genome}/{ix_filename__genome}" + www_ixx_link_dest = f"{www_que__genome}/{ixx_filename__genome}" + + __ssh_ln(uname, 
bb_path_genome, www_bb_link_dest) + __ssh_ln(uname, ix_path_genome, www_ix_link_dest) + __ssh_ln(uname, ixx_path_genome, www_ixx_link_dest) + + +def main(): + """Entry point.""" + args = parse_args() + r_ts_to_link, ref_name = get_ref_ts_to_link(args.project_dir) + chrom_sizes, que_name = get_que_chrom_sizes(args.project_dir) + # call generate_tab_files.py and make_togaPlot.py + call_gen_tab_files(args.project_dir) + call_gen_plot_files(args.project_dir, no_plots=args.no_plots) + # merge 4 transcriptID-related tables into a single one + make_toga_data(args.project_dir) + + # OK, now making bigBed + # togaNucl.tab and togaInactMut.tab + # -> merge into HTML_formatted strings + # -> add to togaData + # save to as.tab for bedToBigBed + make_sorted_bed(args.project_dir) + + merge_all_tables(args.project_dir, r_ts_to_link) + sort_and_make_bb(args.project_dir, chrom_sizes) + + cleanup(args.project_dir, args.do_not_cleanup) + + +if __name__ == "__main__": + main() diff --git a/ucsc_browser_visualisation/make_sql_data.py b/ucsc_browser_visualisation/make_sql_data.py deleted file mode 100755 index 41eba17..0000000 --- a/ucsc_browser_visualisation/make_sql_data.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -"""Perform all operations to create SQL tables for UCSC browser.""" -import os -import sys -import argparse -import subprocess - -GENERATE_TAB_FILES = "generate_tab_files.py" -GENERATE_PLOTS = "make_togaPlot.py" -LOCATION = os.path.dirname(__file__) - -IFEAT_PLACEHOLDER = ["0.0" for _ in range(6)] -PROT_PLACEHOLDER = ["NO_DATA"] -UNDEF = "UNDEFINED" -SVG_PLACEHOLDER = """ - - - - -""" -PLOT_PLACEHOLDER = [SVG_PLACEHOLDER, ] - - -def parse_args(): - """Parse args.""" - app = argparse.ArgumentParser() - app.add_argument("project_dir", help="Directory containing TOGA output") - if len(sys.argv) < 2: - app.print_help() - sys.exit(0) - args = app.parse_args() - return args - - -def call_gen_tab_files(project_dir): - """Call Generate tab files subprocess.""" - print(f"Calling {GENERATE_TAB_FILES}") - out_dir = os.path.join(project_dir, "tabs") - os.mkdir(out_dir) if not os.path.isdir(out_dir) else None - exe_ = os.path.join(LOCATION, GENERATE_TAB_FILES) - cmd = f"{exe_} {project_dir} {out_dir}" - rc = subprocess.call(cmd, shell=True) - if rc != 0: - raise ValueError(f"Command {cmd} died - cannot generate tab files") - print("5 of 6 tab files generated") - - -def call_gen_plot_files(project_dir): - """Generate svg plots.""" - print(f"Calling {GENERATE_PLOTS}") - out_file = os.path.join(project_dir, "tabs", "togaPlot.tab") - exe_ = os.path.join(LOCATION, GENERATE_PLOTS) - cmd = f"{exe_} {project_dir} {out_file}" - rc = subprocess.call(cmd, shell=True) - if rc != 0: - raise ValueError(f"Command {cmd} died - cannot generate plots") - print("All tab files generated") - - -def read_tab_file(tab_file): - """Make trans_id: data track.""" - trans_to_dat = {} - f = open(tab_file, "r") - for line in f: - line_data = line.rstrip().split("\t") - trans = line_data[0] - data = line_data[1:] - trans_to_dat[trans] = data - f.close() - return trans_to_dat - - - -def __gen_info_placeholder(p_name): - """Generate info placeholder.""" - t_name = ".".join(p_name.split(".")[:-1]) - placeholder = [t_name, UNDEF, UNDEF, "0.0", "0", "0.0", "0.0", "0.0", "0.0", "0.0", UNDEF] - return placeholder - - -def make_toga_data(project_dir): - """Merge togaPlot, togaProt, togaInfo and togaInactFeat tables into togaData.""" - tabs_dir = os.path.join(project_dir, "tabs") - toga_info_file = os.path.join(tabs_dir, 
"togaInfo.tab") - toga_inact_feat_file = os.path.join(tabs_dir, "togaInactFeat.tab") - toga_plot_file = os.path.join(tabs_dir, "togaPlot.tab") - toga_prot_file = os.path.join(tabs_dir, "togaProt.tab") - out_file = os.path.join(tabs_dir, "togaData.tab") - - print("Reading tab files to merge") - trans_to_info = read_tab_file(toga_info_file) - trans_to_ifeat = read_tab_file(toga_inact_feat_file) - trans_to_plot = read_tab_file(toga_plot_file) - trans_to_prot = read_tab_file(toga_prot_file) - - transcripts = trans_to_info.keys() - print(f"Got data for {len(transcripts)} transcripts") - - f = open(out_file, "w") - for t in transcripts: - info = trans_to_info.get(t) - if info is None: - info = __gen_info_placeholder(t) - ifeat = trans_to_ifeat.get(t, IFEAT_PLACEHOLDER.copy()) - prot = trans_to_prot.get(t, PROT_PLACEHOLDER.copy()) - plot = trans_to_plot.get(t, PLOT_PLACEHOLDER.copy()) - data_lst = [t] + info + ifeat + prot + plot - tab_str = "\t".join(data_lst) - f.write(tab_str) - f.write("\n") - f.close() - # do cleanup - os.remove(toga_info_file) - os.remove(toga_inact_feat_file) - os.remove(toga_plot_file) - os.remove(toga_prot_file) - print("Done") - - -def main(): - """Entry point.""" - args = parse_args() - # call generate_tab_files.py and make_togaPlot.py - call_gen_tab_files(args.project_dir) - call_gen_plot_files(args.project_dir) - # merge 4 transcriptID-related tables into a single one - make_toga_data(args.project_dir) - - -if __name__ == "__main__": - main() diff --git a/ucsc_browser_visualisation/readme.md b/ucsc_browser_visualisation/readme.md index 4241c7e..c2f53d2 100644 --- a/ucsc_browser_visualisation/readme.md +++ b/ucsc_browser_visualisation/readme.md @@ -76,13 +76,22 @@ We will add another condition to handle TOGA tracks. Put another condition as shown here: + ```c -else if (startsWith("HLTOGA", table) && hTableExists(database, "TOGAData")) -{ - doHillerLabTOGAGene(tdb, item); -} +else if (startsWith("HLTOGAannot", trackHubSkipHubName(table))) + { + doHillerLabTOGAGene(database, tdb, item, table); + } ``` + +[//]: # (```c) +[//]: # (else if (startsWith("HLTOGA", table) && hTableExists(database, "TOGAData"))) +[//]: # ({) +[//]: # ( doHillerLabTOGAGene(database, tdb, item, table);) +[//]: # (} +[//]: # (```) + Then re-build your browser. ### Loading TOGA tables @@ -92,26 +101,18 @@ Then re-build your browser. ${project_dir}: directory containing TOGA results and intermediate data. ```shell -./ucsc_browser_visualisation/make_sql_data.py ${project_dir}``` +./ucsc_browser_visualisation/make_bigbed_data_public.py ${project_dir}``` # Wait, it will take a few minutes, most likely less than an hour ``` #### Load tab files to browser database -${tab_files_dir} = ${project_dir}/tabs -${query} - annotated genome identifier. -Use *.sql files located in the ucsc_browser_visualisation directory. -Call the following commands: +Transfer created *.bb, *.ix, and *.ixx files to the machine hosting your instance of UCSC genome browser, if need be. +Create a table schema for bigBed tracks, using bigDataUrl field to specify the bigBed URL, and searchTrix field to specify the *.ix file URL (no need to specify *.ixx file separately, it just should be located in the same directory with the *.ix file.) 
-```shell -hgLoadSqlTab ${query} TOGAData togaData.sql ${tab_files_dir}/togaInfo.tab -hgLoadSqlTab ${query} TOGANucl togaNucl.sql ${tab_files_dir}/togaNucl.tab -hgLoadSqlTab ${query} TOGAInactMut togaInactMut.sql ${tab_files_dir}/togaInactMut.tab +Please also specify the following fields: ``` - -Also load bed file using hgLoadBed command. -Create a track starting with HLTOGA, for instance HLTOGAannotation. - -```shell script -hgLoadBed ${query} HLTOGAannotation ${project_dir}/query_annotation.bed +type bigBed 12 + +labelFields name +searchIndex name ``` diff --git a/ucsc_browser_visualisation/togaClick.c b/ucsc_browser_visualisation/togaClick.c index e2f10e6..e3d53c8 100644 --- a/ucsc_browser_visualisation/togaClick.c +++ b/ucsc_browser_visualisation/togaClick.c @@ -3,6 +3,44 @@ #include "hgc.h" #include "togaClick.h" #include "string.h" +#include "htmshell.h" +#include "chromAlias.h" + + +struct togaDataBB *togaDataBBLoad(char **row) +/* Load a togaData from row fetched with select * from togaData + * from database. Dispose of this with togaDataFree(). */ +{ + struct togaDataBB *ret; + AllocVar(ret); + ret->projection = cloneString(row[0]); + ret->ref_trans_id = cloneString(row[1]); + ret->ref_region = cloneString(row[2]); + ret->query_region = cloneString(row[3]); + ret->chain_score = cloneString(row[4]); + + ret->chain_synteny = cloneString(row[5]); + ret->chain_flank = cloneString(row[6]); + ret->chain_gl_cds_fract = cloneString(row[7]); + ret->chain_loc_cds_fract = cloneString(row[8]); + ret->chain_exon_cov = cloneString(row[9]); + + ret->chain_intron_cov = cloneString(row[10]); + ret->status = cloneString(row[11]); + ret->perc_intact_ign_M = cloneString(row[12]); + ret->perc_intact_int_M = cloneString(row[13]); + ret->intact_codon_prop = cloneString(row[14]); + + ret->ouf_prop = cloneString(row[15]); + ret->mid_intact = cloneString(row[16]); + ret->mid_pres = cloneString(row[17]); + ret->prot_alignment = cloneString(row[18]); + ret->svg_line = cloneString(row[19]); + ret->ref_link = cloneString(row[20]); + ret->inact_mut_html_table = cloneString(row[21]); + ret->exon_ali_html = cloneString(row[22]); + return ret; +} struct togaData *togaDataLoad(char **row) @@ -38,6 +76,43 @@ struct togaData *togaDataLoad(char **row) } +void togaDataBBFree(struct togaDataBB **pEl) +/* Free a single dynamically allocated togaDatasuch as created + * with togaDataLoad(). */ +{ + struct togaDataBB *el; + + if ((el = *pEl) == NULL) return; + freeMem(el->projection); + freeMem(el->ref_trans_id); + freeMem(el->ref_region); + freeMem(el->query_region); + freeMem(el->chain_score); + + freeMem(el->chain_synteny); + freeMem(el->chain_flank); + freeMem(el->chain_gl_cds_fract); + freeMem(el->chain_loc_cds_fract); + freeMem(el->chain_exon_cov); + + freeMem(el->chain_intron_cov); + freeMem(el->status); + freeMem(el->perc_intact_ign_M); + freeMem(el->perc_intact_int_M); + freeMem(el->intact_codon_prop); + + freeMem(el->ouf_prop); + freeMem(el->mid_intact); + freeMem(el->mid_pres); + freeMem(el->prot_alignment); + freeMem(el->svg_line); + freeMem(el->ref_link); + freeMem(el->inact_mut_html_table); + freeMem(el->exon_ali_html); + freez(pEl); +} + + void togaDataFree(struct togaData **pEl) /* Free a single dynamically allocated togaDatasuch as created * with togaDataLoad(). 
*/ @@ -168,10 +243,207 @@ Prefix must be HLTOGAannot */ } -void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) +void HLprintQueryProtSeqForAli(char *proteinAlignment) { + // take protein sequence alignment + // print only the query sequence + char *str = proteinAlignment; + int printed_char_num = 0; + while ((str = strstr(str, "que:")) != NULL) + { + str += 10; + char ch; + while ((ch = *str++) != '<') { + if (ch != '-') { + putchar(ch); + ++printed_char_num; + } + if (printed_char_num == 80) { + printed_char_num = 0; + printf("
"); + } + } + } +} + + + +void doHillerLabTOGAGeneBig(char *database, struct trackDb *tdb, char *item, char *table_name) +/* Put up TOGA Gene track info. */ +// To think about -> put into a single bigBed +// string: HTML formatted inact mut +// string: HTML formatted exon ali section +{ +int start = cartInt(cart, "o"); +int end = cartInt(cart, "t"); +char *chrom = cartString(cart, "c"); +char *fileName = bbiNameFromSettingOrTable(tdb, NULL, tdb->table); +struct bbiFile *bbi = bigBedFileOpenAlias(hReplaceGbdb(fileName), chromAliasFindAliases); +struct lm *lm = lmInit(0); +struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); +struct bigBedInterval *bb; +char *fields[bbi->fieldCount]; +for (bb = bbList; bb != NULL; bb = bb->next) + { + if (!(bb->start == start && bb->end == end)) + continue; + + // our names are unique + char *name = cloneFirstWordByDelimiterNoSkip(bb->rest, '\t'); + boolean match = (isEmpty(name) && isEmpty(item)) || sameOk(name, item); + if (!match) + continue; + + char startBuf[16], endBuf[16]; + bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bbi->fieldCount); + break; + } + +printf("

Projection %s


\n", item); +struct togaDataBB *info = togaDataBBLoad(&fields[11]); // Bogdan: why 11? 0-11 are bed-like fields likely + +printf("Reference transcript: %s
", info->ref_link); +printf("Genomic locus in reference: %s
\n", info->ref_region); +printf("Genomic locus in query: %s
\n", info->query_region); + +printf("Projection classification: %s
\n", info->status); +printf("Probability that query locus is orthologous: %s
\n", info->chain_score); +// list of chain features (for orthology classification) +printf("Show features used for ortholog probability\n"); +printf("
\n"); +printf("\n"); + +printf("
\nFeature description:\n"); +printf("For each projection (one reference transcript and one overlapping chain),\n"); +printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n"); +printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n"); +printf("and the respective intergenic regions.\n
\n"); + +printf("We define the following variables:\n\n"); +printf("Using these variables, TOGA computes the following features:\n"); +printf("\n"); + + +printf("\n
\n
\n"); +htmlHorizontalLine(); + +// show inact mut plot +printf("

Visualization of inactivating mutations on exon-intron structure

\n"); +printf("%s
\n", info->svg_line); +printf("
Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in");
+printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion ");
+printf("shifts the reading frame, while blue indicates that exon deletion(s) are frame-preserving.
\n"); + +// GLP features +printf("Show features used for transcript classification\n"); +printf("
\n"); +printf("\n
\n
\n"); + + +htmlHorizontalLine(); + +printf("

Predicted protein sequence


\n"); + +printf("Show protein sequence of query\n"); +printf("
\n"); +// printf("{protein seq of the query without dashes or other things. Should end with *}\n"); +printf(""); +HLprintQueryProtSeqForAli(info->prot_alignment); +printf("\n
\n
\n
\n"); + +// and show protein sequence +htmlHorizontalLine(); +printf("

Protein sequence alignment


\n"); +printf("Show alignment between reference and query\n"); +printf("
\n"); +printf("%s
\n", info->prot_alignment); +printf("
\n

\n"); + +// show inactivating mutations if required +printf("

List of inactivating mutations


\n"); + +printf("Show inactivating mutations\n"); +printf("
\n"); +printf("\n"); // init table +printf("\n"); +printf("\n"); +printf("%s\n", info->inact_mut_html_table); +printf("
<TR><TH>Exon number</TH><TH>Codon number</TH><TH>Mutation class</TH><TH>Mutation</TH><TH>Treated as inactivating</TH><TH>Mutation ID</TH></TR>
\n"); +printf("
\n
\n"); + +// show exons data +htmlHorizontalLine(); +printf("

Exon alignments


\n"); + +printf("Show exon sequences and features

\n"); +printf("
\n"); +// printf("%s\n", info->exon_ali_string); +printf("%s\n", info->exon_ali_html); + +htmlHorizontalLine(); + +// TODO: check whether I need this +printf("%s", hgTracksPathAndSettings()); +hPrintf(""); +hPrintf(""); +hPrintf(""); + + +printTrackHtml(tdb); // and do I need this? +} + + +void doHillerLabTOGAGene(char *database, struct trackDb *tdb, char *item, char *table_name) /* Put up TOGA Gene track info. */ { - int start = cartInt(cart, "o"); + //int start = cartInt(cart, "o"); char headerTitle[512]; char suffix[512]; strcpy(suffix, table_name); @@ -180,6 +452,13 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) genericHeader(tdb, headerTitle); printf("

TOGA gene annotation

\n"); // htmlHorizontalLine(); + + if (startsWith("bigBed", tdb->type)) + { + doHillerLabTOGAGeneBig(database, tdb, item, table_name); + return; + } + struct sqlConnection *conn = hAllocConn(database); // define TOGA table names: initate with pre-defined prefixes @@ -209,57 +488,103 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) if ((row = sqlNextRow(sr)) != NULL) { info = togaDataLoad(row); // parse sql output // fill HTML template: - printf("Projected via: %s
", + printf("Reference transcript: %s
", info->ref_trans_id, info->ref_trans_id); - printf("Region in reference: %s
\n", info->ref_region); - printf("Region in query: %s
\n", info->query_region); + printf("Genomic locus in reference: %s
\n", info->ref_region); + printf("Genomic locus in query: %s
\n", info->query_region); - printf("Projection class: %s
\n", info->status); - printf("Chain score: %s
\n", info->chain_score); + printf("Projection classification: %s
\n", info->status); + printf("Probability that query locus is orthologous: %s
\n", info->chain_score); // list of chain features (for orthology classification) - printf("Show chain features for classification\n"); + printf("Show features used for ortholog probability\n"); printf("
\n"); printf("
    \n"); - printf("
  • Synteny: %s
  • \n", info->chain_synteny); + printf("
  • Synteny (log10 value): %s
  • \n", info->chain_synteny); printf("
  • Global CDS fraction: %s
  • \n", info->chain_gl_cds_fract); printf("
  • Local CDS fraction: %s
  • \n", info->chain_loc_cds_fract); printf("
  • Local intron fraction: %s
  • \n", info->chain_intron_cov); printf("
  • Local CDS coverage: %s
  • \n", info->chain_exon_cov); printf("
  • Flank fraction: %s
  • \n", info->chain_flank); + printf("
\n"); + + printf("
\nFeature description:\n"); + printf("For each projection (one reference transcript and one overlapping chain),\n"); + printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n"); + printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n"); + printf("and the respective intergenic regions.\n
\n"); + + printf("We define the following variables:\n
    \n"); + printf("
  • c: number of reference bases in the intersection between chain blocks and coding exons of the gene under consideration.
  • \n"); + printf("
  • C: number of reference bases in the intersection between chain blocks and coding exons of all genes.
  • \n"); + printf("
  • a: number of reference bases in the intersection between chain blocks and coding exons and introns of the gene under consideration.
  • \n"); + printf("
  • A: number of reference bases in the intersection between chain blocks and coding exons and introns of all genes and the intersection\n"); + printf("between chain blocks and intergenic regions (excludes UTRs).
  • \n"); + printf("
  • f: number of reference bases in chain blocks overlapping the 10 kb flanks of the gene under consideration.\n"); + printf("Alignment blocks overlapping exons of another gene that is located in these 10 kb flanks are ignored.
  • \n"); + printf("
  • i: number of reference bases in the intersection between chain blocks and introns of the gene under consideration.
  • \n"); + printf("
  • CDS (coding sequence): length of the coding region of the gene under consideration.
  • \n"); + printf("
  • I: sum of all intron lengths of the gene under consideration.
  • \n"); + printf("
\n"); + printf("Using these variables, TOGA computes the following features:\n"); + printf("
    \n"); + printf("
  • “global CDS fraction” as C / A. Chains with a high value have alignments that largely overlap coding exons,"); + printf("which is a hallmark of paralogous or processed pseudogene chains. In contrast, chains with a low value also align many "); + printf("intronic and intergenic regions, which is a hallmark of orthologous chains.
  • \n"); + printf("
  • “local CDS fraction” as c / a. Orthologous chains tend to have a lower value, as intronic "); + printf("regions partially align. This feature is not computed for single-exon genes.
  • \n"); + printf("
  • “local intron fraction” as i / I. Orthologous chains tend to have a higher value."); + printf("This feature is not computed for single-exon genes.
  • \n"); + printf("
  • “flank fraction” as f / 20,000. Orthologous chains tend to have higher values,"); + printf("as flanking intergenic regions partially align. This feature is important to detect orthologous loci of single-exon genes.
  • \n"); + printf("
  • “synteny” as log10 of the number of genes, whose coding exons overlap by at least one base aligning"); + printf("blocks of this chain. Orthologous chains tend to cover several genes located in a conserved order, resulting in higher synteny values.
  • \n"); + printf("
  • “local CDS coverage” as c / CDS, which is only used for single-exon genes.
  • \n"); + printf("
\n"); + + printf("\n
\n
\n"); htmlHorizontalLine(); // show inact mut plot - printf("

Inactivating mutations plot

\n"); + printf("

Visualization of inactivating mutations on exon-intron structure

\n"); printf("%s
\n", info->svg_line); + printf("
Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in");
+    printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion ");
+    printf("shifts the reading frame, while blue indicates that exon deletion(s) are frame-preserving.
\n"); // GLP features - printf("Show GLP features\n"); + printf("Show features used for transcript classification\n"); printf("
\n"); printf("
    \n"); - printf("
  • Percent intact ignoring missing seq: %s
  • \n", info->perc_intact_ign_M); - printf("
  • Percent intact (miss == intact): %s
  • \n", info->perc_intact_int_M); - printf("
  • Intact codon proportion %s
  • \n", info->intact_codon_prop); - printf("
  • Out of chain proportion: %s
  • \n", info->ouf_prop); + printf("
  • Percent intact, ignoring missing sequence: %s
  • \n", info->perc_intact_ign_M); + printf("
  • Percent intact, treating missing as intact sequence: %s
  • \n", info->perc_intact_int_M); + printf("
  • Proportion of intact codons: %s
  • \n", info->intact_codon_prop); + printf("
  • Percent of CDS not covered by this chain (0 unless the chain covers only a part of the gene): %s
  • \n", info->ouf_prop); if (sameWord(info->mid_intact, ONE_)) { - printf("
  • Middle 80 percent intact: %s
  • \n", YES_); + printf("
  • Middle 80 percent of CDS intact: %s
  • \n", YES_); } else { - printf("
  • Middle 80 percent intact: %s
  • \n", NO_); + printf("
  • Middle 80 percent of CDS intact: %s
  • \n", NO_); } if (sameWord(info->mid_pres, ONE_)) { - printf("
  • Middle 80 percent present: %s
  • \n", YES_); + printf("
  • Middle 80 percent of CDS present: %s
  • \n", YES_); } else { - printf("
  • Middle 80 percent present: %s
  • \n", NO_); + printf("
  • Middle 80 percent of CDS present: %s
  • \n", NO_); } printf("
\n
\n
\n"); + printf("

Query protein sequence


"); + + printf("Show protein sequence of query\n"); + printf("
\n"); + printf("{protein seq of the query without dashes or other things. Should end with *}\n"); + printf("
\n
\n
\n"); // and show protein sequence htmlHorizontalLine(); - printf("

Protein sequence


\n"); - printf("Show protein alignment\n"); - printf("
\n"); + printf("

Protein sequence alignment


\n"); + printf("Show alignment between reference and query\n"); + printf("
\n"); printf("%s
\n", info->prot_alignment); printf("
\n

\n"); @@ -273,7 +598,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) } // show inactivating mutations if required - printf("

Inactivating mutations


\n"); + printf("

List of inactivating mutations


\n"); if (hTableExists(database, togaInactMutTableName)) { @@ -285,7 +610,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) printf("Show inactivating mutations\n"); printf("
\n"); printf("\n"); // init table - printf("\n"); + printf("\n"); printf("\n"); while ((row = sqlNextRow(sr)) != NULL) { @@ -314,7 +639,7 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) // show exons data htmlHorizontalLine(); - printf("

Exons data


\n"); + printf("

Exon alignments


\n"); if (hTableExists(database, togaNuclTableName)) { @@ -323,8 +648,6 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) char **row; printf("Show exon sequences and features\n"); printf("
\n"); - // TODO: make sure this clause is not necessary - // order by exon_num sqlSafef(query, sizeof(query), "select * from %s where transcript='%s'", togaNuclTableName, item); sr = sqlGetResult(conn, query); @@ -342,13 +665,13 @@ void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name) } printf("Exon alignment class: %s
\n", info->ali_class); if (sameWord(info->in_exp_region, ONE_)){ - printf("Detected within expected region: %s
\n", YES_); + printf("Detected within expected region (%s): %s
\n", info->exp_region, YES_); } else { - printf("Detected within expected region: %s
\n", NO_); + printf("Detected within expected region (%s): %s
\n", info->exp_region, NO_); } - printf("Expected region: %s
\n", info->exp_region); + // printf("Expected region: %s
\n", info->exp_region); printf("
\n"); - printf("Sequence alignment:
\n"); + printf("Sequence alignment between reference and query exon:
\n"); printf("%s
\n", info->alignment); togaNuclFree(&info); } diff --git a/ucsc_browser_visualisation/togaClick.h b/ucsc_browser_visualisation/togaClick.h index 00cecee..c94e80b 100644 --- a/ucsc_browser_visualisation/togaClick.h +++ b/ucsc_browser_visualisation/togaClick.h @@ -15,6 +15,35 @@ #define HLTOGA_MAXCHAR 255 +struct togaDataBB +{ + char *projection; + char *ref_trans_id; + char *ref_region; + char *query_region; + char *chain_score; + char *chain_synteny; + char *chain_flank; + char *chain_gl_cds_fract; + char *chain_loc_cds_fract; + char *chain_exon_cov; + char *chain_intron_cov; + char *status; + char *perc_intact_ign_M; + char *perc_intact_int_M; + char *intact_codon_prop; + char *ouf_prop; + char *mid_intact; + char *mid_pres; + char *prot_alignment; + char *svg_line; + char *ref_link; + char *inact_mut_html_table; + char *exon_ali_html; +}; + + + struct togaData { char *projection; @@ -67,10 +96,19 @@ struct togaInactMut }; +struct togaDataBB *togaDataBBLoad(char **row); +/* Load a togaData from row fetched with select * from togaData + * from database. Dispose of this with togaDataFree(). */ + + struct togaData *togaDataLoad(char **row); /* Load a togaData from row fetched with select * from togaData * from database. Dispose of this with togaDataFree(). */ +void togaDataBBFree(struct togaDataBB **pEl); +/* Free a single dynamically allocated togaDatasuch as created + * with togaDataLoad(). */ + void togaDataFree(struct togaData **pEl); /* Free a single dynamically allocated togaDatasuch as created * with togaDataLoad(). */ @@ -95,7 +133,8 @@ void extractHLTOGAsuffix(char *suffix); /* Extract suffix from TOGA table name. Prefix must be HLTOGAannot */ -void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name); +// void doHillerLabTOGAGene(struct trackDb *tdb, char *item, char *table_name); // old +void doHillerLabTOGAGene(char *database, struct trackDb *tdb, char *item, char *table_name); /* Put up TOGA Gene track info. */ #endif // TOGACLICK_H diff --git a/ucsc_browser_visualisation/togaData.sql b/ucsc_browser_visualisation/togaData.sql deleted file mode 100644 index ab4415e..0000000 --- a/ucsc_browser_visualisation/togaData.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE TABLE TOGAData ( - transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID} - ref_transcript_ID varchar(50) not null, -- transcript ID in reference - ref_region varchar(100) not null, -- transcript region in reference - query_region varchar(100) not null, -- projection region in query - chain_score float not null, -- chain orthology score - chain_synteny int unsigned not null, -- chain synteny - chain_flank float not null, -- flank coverage - chain_gl_cds_fract float not null, -- global CDS fraction - chain_loc_cds_fract float not null, -- local CDS fraction - chain_exon_cov float not null, -- local CDS coverage - chain_intron_cov float not null, -- local intron coverage - status varchar(24) not null, -- projection GLP status: loss, intact, etc - perc_intact_ign_M float not null, -- %intact ignoring Missing - perc_intact_int_M float not null, -- %intact considering missing seq intact - intact_codon_prop float not null, -- % of intact codons - ouf_prop float not null, -- out of chain proportion - mid_intact tinyint unsigned not null, -- middle 80% intact? 1 - True 0 - False, else - undefined - mid_pres tinyint unsigned not null, -- middle 80% present? 
1 - True 0 - False, else - undefined - prot_sequence longblob not null, -- protein sequence - svg_plot longblob not null, -- svg string - PRIMARY KEY(transcript) -); diff --git a/ucsc_browser_visualisation/togaInactMut.sql b/ucsc_browser_visualisation/togaInactMut.sql deleted file mode 100644 index eb53d35..0000000 --- a/ucsc_browser_visualisation/togaInactMut.sql +++ /dev/null @@ -1,9 +0,0 @@ -CREATE TABLE TOGAInactMut ( - transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID} - exon_num int unsigned not null, -- exon number - position int unsigned not null, -- possition where mutation happened - mut_class varchar(15) not null, -- mutation class such as FS deletion - mutation varchar(20) not null, -- what exactly happened - is_inact tinyint unsigned not null, -- is this mutation inactivating, yes 1 or not 0 - mut_id varchar(10) not null -- mut identifier -); diff --git a/ucsc_browser_visualisation/togaNucl.sql b/ucsc_browser_visualisation/togaNucl.sql deleted file mode 100644 index 9d2e189..0000000 --- a/ucsc_browser_visualisation/togaNucl.sql +++ /dev/null @@ -1,12 +0,0 @@ -CREATE TABLE TOGANucl ( - transcript varchar(50) not null, -- unique projection ID: ${ref_transcript_ID}.${chain_ID} - exon_num int unsigned not null, -- exon number - exon_region varchar(100) not null, -- region where exon was detected - pid float not null, -- nucleotide %id - blosum float not null, -- normalized blosum score - gaps tinyint unsigned not null, -- are there any asm gaps near? 1 - yes 0 - no - ali_class varchar(4) not null, -- alignemnt class: A, B, C, A+ - exp_region varchar(50) not null, -- where exon was expected - in_exp_region tinyint unsigned not null, -- detected in expected region or not 1 yes 0 no - alignment longblob not null -- exon sequence in query -);
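
The three SQL schemas deleted above are superseded by the extra columns of the bed12+22 bigBed that make_bigbed_data_public.py assembles. As a reference for the column order, here is a sketch naming the extra fields in the order struct togaDataBB (togaClick.h) reads them; the helper and its trailing-offset convention are illustrative, not part of the patch:

```python
from collections import namedtuple

# Extra columns appended after the standard BED fields, in the order
# struct togaDataBB consumes them (see togaClick.h above).
TOGA_BB_EXTRA_FIELDS = [
    "projection", "ref_trans_id", "ref_region", "query_region", "chain_score",
    "chain_synteny", "chain_flank", "chain_gl_cds_fract", "chain_loc_cds_fract",
    "chain_exon_cov", "chain_intron_cov", "status", "perc_intact_ign_M",
    "perc_intact_int_M", "intact_codon_prop", "ouf_prop", "mid_intact",
    "mid_pres", "prot_alignment", "svg_line", "ref_link",
    "inact_mut_html_table", "exon_ali_html",
]
TogaDataBB = namedtuple("TogaDataBB", TOGA_BB_EXTRA_FIELDS)


def parse_extra_fields(bed_plus_row):
    """Map the trailing columns of one tab-split bigBed row onto field names."""
    return TogaDataBB(*bed_plus_row[-len(TOGA_BB_EXTRA_FIELDS):])
```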