From ef651f4345698570cab1598376a59a6dde118ee4 Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 15 Nov 2024 16:51:12 +0000 Subject: [PATCH] Better read merging for long reads. New clustering methods for long reads. Improved runtime. --- dysgu/call_component.pyx | 268 +++++++++++++------ dysgu/cluster.pyx | 25 +- dysgu/consensus.pyx | 180 ++++++++++--- dysgu/coverage.pyx | 17 +- dysgu/extra_metrics.pxd | 2 +- dysgu/extra_metrics.pyx | 2 +- dysgu/find_reads.hpp | 2 +- dysgu/graph.pyx | 360 ++++++++++++++----------- dysgu/io_funcs.pyx | 31 +-- dysgu/main.py | 15 +- dysgu/map_set_utils.pxd | 9 +- dysgu/map_set_utils.pyx | 99 ++++--- dysgu/merge_svs.pyx | 554 +++++++++++++++++++++++++-------------- dysgu/post_call.py | 42 ++- dysgu/re_map.py | 12 +- dysgu/sv2bam.pyx | 2 +- pyproject.toml | 2 +- setup.py | 4 +- 18 files changed, 1075 insertions(+), 551 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 1c13a34..280dd01 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -2,6 +2,7 @@ from __future__ import absolute_import from collections import Counter, defaultdict, namedtuple import logging +import math import numpy as np cimport numpy as np from numpy.random import normal @@ -24,6 +25,12 @@ import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.simplefilter(action='ignore', category=FutureWarning) + +from sklearn.cluster import KMeans, DBSCAN +from sklearn.exceptions import ConvergenceWarning +warnings.filterwarnings('ignore', category=ConvergenceWarning) + + np.random.seed(1) ctypedef EventResult EventResult_t @@ -60,11 +67,12 @@ cpdef n_aligned_bases(AlignedSegment aln): cdef base_quals_aligned_clipped(AlignedSegment a): - cdef int left_clip, right_clip cdef float aligned_base_quals = 0 cdef float aligned_bases = 0 cdef float clipped_base_quals = 0 - left_clip, right_clip = clip_sizes(a) + cdef int left_clip = 0 + cdef int right_clip = 0 + clip_sizes(a, left_clip, right_clip) clipped_bases = left_clip + right_clip cdef const unsigned char[:] quals = a.query_qualities cdef int i @@ -95,6 +103,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef float clipped_base_quals = 0 cdef float clipped_bases = 0 cdef int abq, ab, cbq, cb + cdef int left_clip, right_clip paired_end = set([]) seen = set([]) er.spanning = len(spanning) @@ -110,7 +119,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.n_small_tlen += 1 if paired_end_reads and paired_end and flag & 8: er.n_unmapped_mates += 1 - left_clip, right_clip = clip_sizes_hard(a) + left_clip = 0 + right_clip = 0 + clip_sizes_hard(a, left_clip, right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 has_sa = a.has_tag("SA") @@ -150,8 +161,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.minus += 1 else: er.plus += 1 - ct = a.cigartuples - if ct[0][0] == 4 or ct[-1][0] == 4: + + left_clip = 0 + right_clip = 0 + clip_sizes(a, left_clip, right_clip) + if left_clip or right_clip: er.sc += 1 if a.flag & 1: # paired read abq, ab, cbq, cb = base_quals_aligned_clipped(a) @@ -164,7 +178,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, flag = a.flag if flag & 2: er.NP += 1 - left_clip, right_clip = clip_sizes_hard(a) + left_clip = 0 + right_clip = 0 + clip_sizes_hard(a, left_clip, right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 if a.has_tag("SA"): @@ -403,11 +419,15 @@ cdef 
make_generic_insertion_item(aln, int insert_size, int insert_std): v_item.svtype = "INS" aln_span = aln.reference_end - aln.pos v_item.size_inferred = 1 + cdef int left_clip, right_clip if insert_std > 0: rand_insert_pos = abs(insert_size - aln_span + int(normal(0, insert_std))) else: # single read mode v_item.svtype = "BND" - clip_s = max(clip_sizes(aln)) + left_clip = 0 + right_clip = 0 + clip_sizes(aln, left_clip, right_clip) + clip_s = max(left_clip, right_clip) rand_insert_pos = 100 if not clip_s else clip_s v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos return v_item @@ -417,20 +437,19 @@ def consensus_matches_gap(target_gap, target_svlen, cigar, threshold=0.9): if not cigar or target_svlen < 20: return True for op, l in cigar: - if l < 20: - continue if op == 1 and target_gap == "INS" and min(l, target_svlen) / max(l, target_svlen) > threshold: return True if op == 2 and target_gap == "DEL" and min(l, target_svlen) / max(l, target_svlen) > threshold: return True return False + cpdef int assign_contig_to_break(asmb, EventResult_t er, side, spanning): if not asmb: return 0 cdef int ref_bases = 0 if spanning: - if not consensus_matches_gap(er.svtype, er.svlen, asmb["cigar"]): + if "cigar" in asmb and not consensus_matches_gap(er.svtype, er.svlen, asmb["cigar"]): return 0 er.contig = asmb["contig"] er.contig_cigar = asmb["cigar"] @@ -543,6 +562,7 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi as2 = None ref_bases = 0 if to_assemble or len(spanning_alignments) > 0: + # echo('MAKE SINGLE CALL') if er.preciseA: as1 = consensus.base_assemble(u_reads, er.posA, 500) if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))): @@ -638,6 +658,7 @@ cdef partition_single(informative, insert_size, insert_stdev, insert_ppf, spanni idx += 1 sub_cluster_calls = [] cdef EventResult_t er + if try_cluster: try: Z = linkage(coords, 'single') @@ -685,6 +706,7 @@ cdef partition_single(informative, insert_size, insert_stdev, insert_ppf, spanni er = make_single_call(informative, insert_size, insert_stdev, insert_ppf, min_support, to_assemble, spanning_alignments, 1, generic_insertions, st, paired_end) sub_cluster_calls.append(er) + return sub_cluster_calls @@ -738,32 +760,37 @@ cdef group_read_subsets(rds, insert_ppf, insert_size, insert_stdev): return spanning_alignments, informative, generic_insertions -def bicluster_spanning_lr(spanning, informative): - if len(spanning) <= 2: +def dbscan_spanning(spanning, informative): + if len(spanning) <= 3: return [(spanning, informative)] - srt = sorted(spanning, key=lambda x: x.len) - if srt[0].len / srt[-1].len > 0.5: + + X = np.array([s.len for s in spanning]).reshape(-1, 1) + X_max = X.max() + if X.min() == X_max: return [(spanning, informative)] - part_a = [srt[0]] - sum_a = srt[0].len - part_b = [srt[-1]] - sum_b = srt[-1].len - for i in range(1, len(srt) - 2): - avg_a = sum_a / len(part_a) - avg_b = sum_b / len(part_b) - t = srt[i] - if abs(t.len - avg_a) < abs(t.len - avg_b): - part_a.append(t) - sum_a += t.len - else: - part_b.append(t) - sum_b += t.len - return [(part_a, []), (part_b, [])] + eps = min(int(X_max * 0.03), int(math.pow(X_max, 0.45))) + eps = max(1, eps) + + cl = DBSCAN(eps=eps, min_samples=2) + labels = cl.fit_predict(X) + + m = int(max(labels)) + if len(labels) == 0 or m == -1: + return [(spanning, informative)] + + result = [[[], []] for i in range(m + 1)] + for idx, l in enumerate(labels): + if l == -1: + continue + 
result[l][0].append(spanning[idx]) + + return result def process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, generic_insertions, insert_ppf, to_assemble): + # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype cdef bint passed @@ -818,7 +845,7 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, # 1.6.8 posA_95 = abs(posA - posA_adjusted) posB_95 = abs(posB - posB_adjusted) - + # echo([sp.len for sp in spanning_alignments], "pos adjusted:", [b[0] for b in size_pos_bounded]) svlen = int(np.median([sp.len for sp in spanning_alignments])) posA = spanning_alignments[best_index].pos posB = spanning_alignments[best_index].end @@ -832,6 +859,10 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, er.preciseA = True er.preciseB = True + er.qnames = set([]) + for item in spanning_alignments: + er.qnames.add(hash(item.align.qname)) + ab = abs(posB - posA) if svlen > 0: jitter = ((posA_95 + posB_95) / 2) / svlen @@ -851,7 +882,7 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, er.query_overlap = 0 er.jitter = jitter u_reads = [i.align for i in spanning_alignments] - v_reads = [] + min_found_support = len(spanning_alignments) if len(generic_insertions) > 0: min_found_support += len(generic_insertions) @@ -874,19 +905,21 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, as1 = None as2 = None ref_bases = 0 - if to_assemble: - if er.preciseA: + if to_assemble and er.preciseA: + + if not paired_end and er.svtype == "INS" and er.svlen > 50: + # echo('Making contig from', best_align.qname, spanning_alignments[best_index].cigar_index) + as1 = consensus.contig_from_read_cigar(best_align, spanning_alignments[best_index].cigar_index) + # as1 = consensus.base_assemble([best_align.align], er.posA, 500) + else: as1 = consensus.base_assemble(u_reads, er.posA, 500) - if as1: - ref_bases += assign_contig_to_break(as1, er, "A", spanning_alignments) - if er.preciseB: - as2 = consensus.base_assemble(v_reads, er.posB, 500) - if as2: - ref_bases += assign_contig_to_break(as2, er, "B", 0) - if not as1 and len(generic_insertions) > 0: - as1 = consensus.base_assemble([item.read_a for item in generic_insertions], er.posA, 500) - if as1: - ref_bases += assign_contig_to_break(as1, er, "A", 0) + + if as1: + ref_bases += assign_contig_to_break(as1, er, "A", spanning_alignments) + # elif len(generic_insertions) > 0: + # as1 = consensus.base_assemble([item.read_a for item in generic_insertions], er.posA, 500) + # if as1: + # ref_bases += assign_contig_to_break(as1, er, "A", 0) er.linked = 0 er.block_edge = 0 er.ref_bases = ref_bases @@ -895,15 +928,12 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, start_ins = 0 ct = best_align.cigartuples target_len = svlen - # if er.preciseA: - # target_len = svlen - # else: - # target_len = ct[cigar_index][1] for i in range(spanning_alignments[best_index].cigar_index): if ct[i][0] in {0, 1, 4, 7, 8}: start_ins += ct[i][1] er.variant_seq = best_align.seq[start_ins:start_ins + target_len] er.ref_seq = best_align.seq[start_ins - 1] + # echo("contig is ", er.contig) return er @@ -923,17 +953,24 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l # Use spanning if available, otherwise informative, otherwise generic spanning_alignments, informative, generic_insertions = group_read_subsets(rds, insert_ppf, insert_size, insert_stdev) + 
n_spanning = len(spanning_alignments) + + if n_spanning and len(generic_insertions) > n_spanning * 10: + generic_insertions = [] + if n_spanning and len(informative) > n_spanning * 20: + informative = [] + if len(spanning_alignments) > 0: - if not paired_end: + # if not paired_end: candidates = [] - for spanning_alignments, informative in bicluster_spanning_lr(spanning_alignments, informative): + for spanning_alignments, informative in dbscan_spanning(spanning_alignments, informative): candidates.append(process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, generic_insertions, insert_ppf, to_assemble)) return candidates - else: + # else: # single event - return process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, - generic_insertions, insert_ppf, to_assemble) + # return process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, + # generic_insertions, insert_ppf, to_assemble) elif len(informative) > 0: @@ -1291,29 +1328,83 @@ cdef tuple mask_soft_clips(int aflag, int bflag, a_ct, b_ct): return left_clipA, right_clipA, left_clipB, right_clipB -cdef query_start_end_from_cigartuples(r): - cdef int end = 0 - cdef int start = 0 - cdef int query_length = r.infer_read_length() # Note, this also counts hard-clips - end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: - start += r.cigartuples[0][1] - if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: - end -= r.cigartuples[-1][1] - return start, end, query_length +# cdef query_start_end_from_cigartuples(r): +# cdef int end = 0 +# cdef int start = 0 +# cdef int query_length = r.infer_read_length() # Note, this also counts hard-clips +# end = query_length +# if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: +# start += r.cigartuples[0][1] +# if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: +# end -= r.cigartuples[-1][1] +# return start, end, query_length +# +# +# cdef start_end_query_pair(r1, r2): +# cdef int query_length, s1, e1, s2, e2, start_temp, r1l, r2l +# # r1 and r2 might be on same read, if this is case infer the query position on the read +# s1, e1, r1l = query_start_end_from_cigartuples(r1) +# s2, e2, r2l = query_start_end_from_cigartuples(r2) +# if r1.flag & 64 == r2.flag & 64: # same read +# if r2.flag & 16 != r1.flag & 16: # different strand, count from end +# query_length = r1l # Note, this also counts hard-clips +# start_temp = query_length - e2 +# e2 = start_temp + e2 - s2 +# s2 = start_temp +# return s1, e1, s2, e2, r1l, r2l + + +cdef int query_start_end_from_cigar(AlignedSegment r, int *start, int *end): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + cdef int query_length = 0 + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return 0 + + # Calculate query length and handle starting clip + start[0] = 0 + for i in range(cigar_l): + cigar_value = cigar_p[i] + opp = cigar_value & 15 # Get operation + length = cigar_value >> 4 # Get length + + if opp in (0, 1, 4, 7, 8): # M, I, S, =, X + query_length += length + elif opp == 5: # H + query_length += length + + if i == 0 and (opp == 4 or opp == 5): # S or H + start[0] = length + + # Set initial end position + end[0] = query_length + + # Handle ending clip + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + length = cigar_value >> 4 + if opp == 4 or opp == 5: # S or H + end[0] -= length + + return query_length + +cdef 
start_end_query_pair(AlignedSegment r1, AlignedSegment r2): + cdef int s1 = 0, e1 = 0, s2 = 0, e2 = 0 + cdef int r1l, r2l, start_temp + r1l = query_start_end_from_cigar(r1, &s1, &e1) + r2l = query_start_end_from_cigar(r2, &s2, &e2) -cdef start_end_query_pair(r1, r2): - cdef int query_length, s1, e1, s2, e2, start_temp, r1l, r2l - # r1 and r2 might be on same read, if this is case infer the query position on the read - s1, e1, r1l = query_start_end_from_cigartuples(r1) - s2, e2, r2l = query_start_end_from_cigartuples(r2) if r1.flag & 64 == r2.flag & 64: # same read - if r2.flag & 16 != r1.flag & 16: # different strand, count from end - query_length = r1l # Note, this also counts hard-clips - start_temp = query_length - e2 + if r2.flag & 16 != r1.flag & 16: # different strand + start_temp = r1l - e2 e2 = start_temp + e2 - s2 s2 = start_temp + return s1, e1, s2, e2, r1l, r2l @@ -1324,17 +1415,18 @@ def sort_by_length(x): cdef void assemble_partitioned_reads(EventResult_t er, u_reads, v_reads, int block_edge, int assemble): as1 = None as2 = None - if assemble: - if er.preciseA: - as1 = consensus.base_assemble(u_reads, er.posA, 500) - if as1: - if er.spanning == 0 and not (as1['left_clips'] or as1['right_clips']): - as1 = None - if (er.spanning == 0 or as1 is None) and er.preciseB: - as2 = consensus.base_assemble(v_reads, er.posB, 500) - if as2 : - if not (as2['left_clips'] or as2['right_clips']): - as2 = None + # todo + # if assemble: + # if er.preciseA: + # as1 = consensus.base_assemble(u_reads, er.posA, 500) + # if as1: + # if er.spanning == 0 and not (as1['left_clips'] or as1['right_clips']): + # as1 = None + # if (er.spanning == 0 or as1 is None) and er.preciseB: + # as2 = consensus.base_assemble(v_reads, er.posB, 500) + # if as2 : + # if not (as2['left_clips'] or as2['right_clips']): + # as2 = None er.linked = 0 er.block_edge = block_edge er.contig = None @@ -1720,7 +1812,8 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, for (u, v), d in data.s_between: #.items(): rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] 
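
# --- Illustrative sketch (not part of the patch): how the rewritten
# query_start_end_from_cigar above recovers query-space coordinates from
# htslib's packed CIGAR. Each operation is stored in a uint32 with the op code
# in the low 4 bits and the length in the high 28 bits, hence the repeated
# `opp = cigar_value & 15` and `length = cigar_value >> 4` in this patch.
# The helper names below (decode_cigar, query_start_end) are hypothetical,
# pure-Python stand-ins assuming that encoding.

def decode_cigar(packed):
    # packed uint32 values -> list of (op, length) tuples
    return [(v & 15, v >> 4) for v in packed]

def query_start_end(cigar, query_length):
    # Leading/trailing soft (4) or hard (5) clips shrink the aligned interval
    start, end = 0, query_length
    if cigar and cigar[0][0] in (4, 5):
        start += cigar[0][1]
    if cigar and cigar[-1][0] in (4, 5):
        end -= cigar[-1][1]
    return start, end

# 10S90M5S packed as (length << 4) | op
packed = [(10 << 4) | 4, (90 << 4) | 0, (5 << 4) | 4]
cigar = decode_cigar(packed)
assert cigar == [(4, 10), (0, 90), (4, 5)]
assert query_start_end(cigar, 105) == (10, 100)
# --- end sketch
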
rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - + # echo("rd_u", [rr[1].qname for rr in rd_u]) + # echo("rd_v", [rr[1].qname for rr in rd_v]) total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -1804,6 +1897,9 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std n_parts = len(data.parts) if data.parts else 0 events = [] info = data.info + # echo(data.parts) + # echo(data.s_between) + # echo(data.s_within) if data.reads is None: data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge @@ -1825,8 +1921,8 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std else: events.append(ev) events = [e for e in events if e and (e.svlen > 0 or e.svtype == "TRA")] - #for e in events: - # echo("call_component svlen", e.svlen, f" support={e.su}, {e.chrA}:{e.posA}-{e.posB}, {e.chrB}") - # if e.svlen_precise: - # set_ins_seq(e) + for e in events: + # echo("call_component svlen", e.svlen, f" support={e.su}, {e.chrA}:{e.posA}-{e.posB}, {e.chrB}") + if e.svlen_precise: + set_ins_seq(e) return events diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 5d77be2..97d6982 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -1,4 +1,5 @@ # cython: language_level=3 + from __future__ import absolute_import import datetime import time @@ -184,15 +185,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N if args["mode"] == "pacbio-sequel2": max_dist, max_clust_dist = 35, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 elif args["mode"] == "pacbio-revio": max_dist, max_clust_dist = 50, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 elif args["mode"] == "nanopore-r9" or args["mode"] == "nanopore-r10": max_dist, max_clust_dist = 100, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 # set upper bound on single-partition size max_single_size = min(max(args["max_cov"] * 50, 10000), 100000) # limited between 5000 - 50,000 reads @@ -436,6 +437,10 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N os.remove(f"{tdir}/job_{p}.done.pkl") if len(block_edge_events) == 0: return [], None + + # for item1 in block_edge_events: + # echo(item1.svtype, item1.su, item1.svlen) + logging.info("Number of components {}. 
N candidates {}".format(components_seen, len(block_edge_events))) keeps = len([i for i in block_edge_events if i.site_info]) if keeps: @@ -452,12 +457,17 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N # Merge across calls if args["merge_within"] == "True": merged = merge_svs.merge_events(block_edge_events, args["merge_dist"], regions, bool(paired_end), try_rev=False, pick_best=False, - debug=True, min_size=args["min_size"], - max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100) + debug=True, min_size=args["min_size"], + max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100, + procs=args['procs']) else: merged = block_edge_events logging.info("Number of candidate SVs merged: {}".format(len(block_edge_events) - len(merged))) logging.info("Number of candidate SVs after merge: {}".format(len(merged))) + + # echo("no--->") + # for item1 in merged: + # echo(item1.svtype, item1.su, item1.svlen) before = len(merged) if auto_support: @@ -490,12 +500,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True") preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome) preliminaries = post_call.strand_binom_t(preliminaries) - # preliminaries = consensus.contig_info(preliminaries) # GC info, repetitiveness + preliminaries = extra_metrics.find_repeat_expansions(preliminaries, insert_stdev) preliminaries = post_call.compressability(preliminaries) preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], True) + preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"]) + # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries) + preliminaries = extra_metrics.sample_level_density(preliminaries, regions) preliminaries = coverage_analyser.normalize_coverage_values(preliminaries) diff --git a/dysgu/consensus.pyx b/dysgu/consensus.pyx index 093e13d..7708f2b 100644 --- a/dysgu/consensus.pyx +++ b/dysgu/consensus.pyx @@ -1,11 +1,12 @@ -#distutils: language = c++ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True + """ -A basic assembler/consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style. +A basic consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style. 
""" import warnings import array +from re import match warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.simplefilter(action='ignore', category=FutureWarning) @@ -140,7 +141,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T cdef str seq cdef tuple k - cdef bint done = 0 + cdef cpp_vector[int] vv = [0, 0, 0, 0] cdef int r_end = r.reference_end @@ -164,8 +165,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T if opp == 4 and length > 250: i = length - 250 length = 250 - if done: - break + if opp == 4: if start: for o in range(length, 0, -1): @@ -245,12 +245,11 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T if current_pos < approx_position and approx_position - current_pos > max_distance: i += 1 continue - elif current_pos > approx_position and current_pos - approx_position > max_distance: - break + # elif current_pos > approx_position and current_pos - approx_position > max_distance: + # break ref_bases += 1 if ref_bases > target_bases: - done = 1 - break + return qual = quals[i] base = bam_seqi(char_ptr_rseq, i) i += 1 @@ -556,6 +555,11 @@ cdef dict get_consensus(rd, int position, int max_distance): else: cigar = [tuple(ct) for ct in cigar] + # if len(seq) < len(rd[0].seq) and longest_left_sc > 10 and longest_right_sc > 10: + # return {} + # Not a good consensus, use first read instead + # return trim_sequence_from_cigar(rd[0], position, max_distance) + return {"contig": seq, "left_clips": longest_left_sc, "right_clips": longest_right_sc, @@ -582,13 +586,17 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance): cdef int pos = start_pos # current genome position cdef int end_index = len(ct) - 1 cdef bint started = False - cdef int opp, length + cdef int opp, length, keep_start, keep_end cdef int longest_left_sc = 0 cdef int longest_right_sc = 0 + cdef int ref_bases = 0 parts = [] + pos_index = -1 for opp, length in ct: + if ref_bases > 300 and pos > approx_pos + 150: + break if opp == 4 and index == 0: if abs(pos - approx_pos) < 50: @@ -616,29 +624,30 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance): elif opp == 0 or opp == 7 or opp == 8 or opp == 3: - if not started: - if abs(pos + length - approx_pos) < 300: - parts.append(r.seq[seq_index:seq_index + length]) # upper implied + op_end = pos + length - started = True - start_pos = pos - pos += length - start_index = index - seq_start = seq_index + # If we're completely before the region of interest + if op_end < approx_pos - 250: + pos += length + seq_index += length + continue - else: - pos += length + # If we're completely after the region of interest + if pos > approx_pos + 250: + break - seq_index += length + # Calculate which portion of this match we want to keep + keep_start = max(0, approx_pos - 250 - pos) + keep_end = min(length, approx_pos + 250 - pos) - else: - if abs(pos + length - approx_pos) > 300: - end_index = index + 1 - break - else: - parts.append(r.seq[seq_index:seq_index + length]) - pos += length - seq_index += length + # If this match overlaps our region of interest + if keep_end > keep_start: + started = True + ref_bases += keep_end - keep_start + parts.append(seq[seq_index + keep_start:seq_index + keep_end]) + + pos += length + seq_index += length index += 1 @@ -693,6 +702,103 @@ cpdef dict base_assemble(rd, int position, int max_distance): return get_consensus(rd, position, max_distance) +cpdef contig_from_read_cigar(r, int cigar_index): + ct = 
r.cigartuples
+    seq = r.seq
+    query_pos = 0  # Position in query sequence
+    ref_pos = r.pos  # Position in reference
+    window_size = 500  # Size of sequence context to include
+
+    # First pass: find the target position in query coordinates
+    target_query_start = 0
+    for i in range(cigar_index):
+        op, length = ct[i]
+        if op in {0, 1, 4, 7, 8}:  # Operations that consume query sequence
+            target_query_start += length
+
+    # Calculate window boundaries in query coordinates
+    target_length = ct[cigar_index][1]
+    window_start = max(0, target_query_start - window_size)
+    window_end = min(len(seq), target_query_start + target_length + window_size)
+
+    parts = []
+    query_pos = 0
+    ref_pos = r.pos
+    longest_left_sc = 0
+    longest_right_sc = 0
+    ref_bases = 0
+
+    cigar_blocks = []
+    started = False
+    # Second pass: build the sequence
+    for i, (op, length) in enumerate(ct):
+        if query_pos > window_end:
+            break
+
+        if op == 0 or op == 7 or op == 8:  # Match/mismatch
+            if query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(length, window_end - query_pos)
+                match_seq = seq[query_pos + start_idx:query_pos + end_idx]
+                parts.append(match_seq.upper())
+                started = True
+                ref_bases += len(match_seq)
+                if cigar_blocks and cigar_blocks[-1][0] == 0:  # elide 7 and 8 ops
+                    cigar_blocks[-1][1] += len(match_seq)
+                else:
+                    cigar_blocks.append([0, len(match_seq)])
+            query_pos += length
+            ref_pos += length
+
+        elif op == 4:  # Soft clip
+            clip_length = min(length, 250)
+            if query_pos < window_end and query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(clip_length, window_end - query_pos)
+                clip_seq = seq[query_pos + start_idx:query_pos + end_idx].lower()
+                parts.append(clip_seq)
+                cigar_blocks.append([4, len(clip_seq)])
+                if query_pos < target_query_start:  # Left clip
+                    longest_left_sc = len(clip_seq)
+                else:  # Right clip
+                    longest_right_sc = len(clip_seq)
+            query_pos += length
+
+        elif op == 1:  # Insertion
+            if query_pos + length > window_start and query_pos < window_end:
+                if i < cigar_index:
+                    parts.append(seq[max(window_start, query_pos):query_pos + length].lower())
+                if i == cigar_index:
+                    # Full insertion if it's the target
+                    parts.append(seq[query_pos:query_pos + length].lower())
+                elif i > cigar_index:
+                    parts.append(seq[query_pos:min(window_end, query_pos + length)].lower())
+                cigar_blocks.append([1, len(parts[-1])])
+                if query_pos + length >= window_end:
+                    break
+            query_pos += length
+
+        elif op == 2:  # Deletion
+            ref_pos += length
+            if started:
+                cigar_blocks.append([2, length])
+
+    contig = "".join(parts)
+
+    return {
+        "contig": contig,
+        "cigar": cigar_blocks,
+        "left_clips": longest_left_sc,
+        "right_clips": longest_right_sc,
+        "ref_bases": ref_bases,
+        "ref_start": r.pos,
+        "ref_end": r.reference_end,
+        "bamrname": r.rname,
+        "left_weight": 0,
+        "right_weight": 0,
+    }
+
+
 cpdef float compute_rep(seq):
 
     cdef unordered_map[float, int] last_visited
@@ -731,6 +837,7 @@ cpdef float compute_rep(seq):
 
     return tot_amount / total_seen
 
+
 cdef tuple get_rep(contig_seq):
 
     cdef int left_clip_end = 0
@@ -822,10 +929,17 @@ def contig_info(events):
 
 
 def check_contig_match(a, b, rel_diffs=False, diffs=8, ol_length=21, supress_seq=True, return_int=False):
-    if not a or not b or len(a) > 5000 or len(b) > 5000:
+    if not a or not b:
         return 0
-
-    query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq)
+    if len(a) > 10000:
+        a = a[:10000]
+    if len(b) > 10000:
+        b = b[:10000]
+
+    query = StripedSmithWaterman(str(a), 
suppress_sequences=supress_seq, + match_score=2, mismatch_score=-3, gap_open_penalty=10, + gap_extend_penalty=1 + ) alignment = query(str(b)) if not return_int: return alignment diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 7486b2e..5265096 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -15,6 +15,7 @@ from libc.stdint cimport uint32_t from pysam.libcalignedsegment cimport AlignedSegment from pysam.libchtslib cimport bam_get_cigar + def index_stats(f, rl=None): if rl is None: rl = [] @@ -147,7 +148,9 @@ cdef class GenomeScanner: # when 'run' command is invoked, run this block. cov track already exists from find-reads if self.cov_track_path is None and self.bam_iter is None: for aln in self.input_bam: - if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: # not primary, duplicate or unmapped? + cigar_l = aln._delegate.core.n_cigar + + if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped? continue self._add_to_bin_buffer(aln, tell) tell = 0 if self.no_tell else self.input_bam.tell() @@ -248,7 +251,9 @@ cdef class GenomeScanner: name = aln.qname.__hash__(), aln.flag, aln.pos if name in seen_reads: continue - if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: + + cigar_l = aln._delegate.core.n_cigar + if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: continue if aln.rname != self.current_tid: if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: @@ -342,7 +347,8 @@ cdef class GenomeScanner: prev_alignment = a if ibam is None: - if a.flag & 1284 or a.mapq < self.mapq_threshold or a.cigartuples is None: + cigar_l = a._delegate.core.n_cigar + if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0: continue tell = 0 if self.no_tell else self.input_bam.tell() if self.no_tell: @@ -458,10 +464,11 @@ cdef class GenomeScanner: elif self.no_tell: raise BufferError("Read buffer has overflowed, increase --buffer-size") - def _add_to_bin_buffer(self, a, tell): + def _add_to_bin_buffer(self, AlignedSegment a, tell): # Calculates coverage information on fly, drops high coverage regions, buffers reads cdef int flag = a.flag - if flag & 1540 or a.cigartuples is None or a.seq is None: + cdef uint32_t cigar_l = a._delegate.core.n_cigar + if flag & 1540 or cigar_l == 0: # or a.seq is None: return cdef int rname = a.rname cdef int apos = a.pos diff --git a/dysgu/extra_metrics.pxd b/dysgu/extra_metrics.pxd index d4bae53..c481e9f 100644 --- a/dysgu/extra_metrics.pxd +++ b/dysgu/extra_metrics.pxd @@ -1,4 +1,4 @@ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8 from libc.stdint cimport uint32_t diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index d4bef24..f6aa31f 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -1,4 +1,4 @@ -#cython: language_level=3, boundscheck=True, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +#cython: language_level=3, c_string_type=unicode, c_string_encoding=utf8 import numpy as np cimport numpy as np from dysgu.map_set_utils cimport unordered_map, EventResult diff --git a/dysgu/find_reads.hpp b/dysgu/find_reads.hpp index 7138af1..8a56410 100644 --- a/dysgu/find_reads.hpp +++ b/dysgu/find_reads.hpp @@ -38,7 +38,7 @@ class CoverageTrack ~CoverageTrack() {} std::vector cov_array; // Assume coverage never overflows int32 - int max_coverage; + 
int max_coverage = 250; int index = 0; void set_max_cov(int m) { diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 5d641ba..27a0779 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -1,4 +1,5 @@ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True + from __future__ import absolute_import from collections import defaultdict, deque, namedtuple import numpy as np @@ -12,7 +13,7 @@ from dysgu.map_set_utils cimport unordered_map as robin_map, Py_SimpleGraph from dysgu.map_set_utils cimport multimap as cpp_map from dysgu cimport map_set_utils from dysgu.io_funcs import intersecter -from dysgu.map_set_utils cimport unordered_set, cigar_clip, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance +from dysgu.map_set_utils cimport unordered_set, cigar_clip, clip_sizes, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance from dysgu.map_set_utils cimport hash as xxhasher from dysgu.map_set_utils cimport MinimizerTable from dysgu.extra_metrics import BadClipCounter @@ -248,7 +249,9 @@ cdef class ClipScoper: cdef void update(self, r, int input_read, int chrom, int position, unordered_set[int]& clustered_nodes): - clip_left, clip_right = map_set_utils.clip_sizes(r) + cdef int clip_left = 0 + cdef int clip_right = 0 + clip_sizes(r, clip_left, clip_right) if chrom != self.current_chrom: self.scope_left.clear() self.scope_right.clear() @@ -282,10 +285,12 @@ cdef class PairedEndScoper: cdef int clst_dist, max_dist, local_chrom, max_search_depth cdef cpp_map[int, LocalVal] loci # Track the local breaks and mapping locations cdef vector[cpp_map[int, LocalVal]] chrom_scope # Track the mate-pair breaks and locations + cdef public vector[int] found_exact, found2 cdef float norm - cdef float thresh # spd + cdef float thresh # spd cdef float position_distance_thresh cdef bint paired_end + def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh, max_search_depth): self.clst_dist = clst_dist self.max_dist = max_dist @@ -298,135 +303,126 @@ cdef class PairedEndScoper: cdef cpp_map[int, LocalVal] scope for n in range(n_references + 1): # Add one for special 'insertion chromosome' self.chrom_scope.push_back(scope) + cdef void empty_scopes(self) nogil: for idx in range(self.chrom_scope.size()): if not self.chrom_scope[idx].empty(): self.chrom_scope[idx].clear() self.loci.clear() - cdef vector[int] find_other_nodes(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, - ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len) nogil: - # todo make this code less nested and more readable - cdef int idx, i, count_back, steps, node_name2 - cdef int sep = 0 - cdef int sep2 = 0 - cdef vector[int] found2 - cdef vector[int] found_exact - cdef cpp_map[int, LocalVal]* forward_scope = &self.chrom_scope[chrom2] + cdef void erase_items_out_of_range(self, int current_pos) nogil: + cdef cpp_map[int, LocalVal].iterator local_it, local_it2 + cdef cpp_pair[int, LocalVal] vitem + local_it = self.loci.lower_bound(current_pos - self.clst_dist) + local_it2 = self.loci.begin() + while local_it2 != local_it: + vitem = dereference(local_it2) + self.chrom_scope[vitem.second.chrom2].erase(vitem.second.pos2) + preincrement(local_it2) + if local_it != self.loci.begin(): + self.loci.erase(self.loci.begin(), local_it) + + cdef bint process_vitem(self, cpp_pair[int, 
LocalVal] vitem, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len): # nogil: + cdef int node_name2 = vitem.second.node_name + cdef int sep, sep2 + cdef float max_span, span_distance + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or \ + (read_enum == INSERTION and vitem.second.read_enum == DELETION): + return True + if node_name2 == node_name: + return True + + if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): + sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + return False # break the loop + sep2 = c_abs(vitem.second.pos2 - current_pos) + if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150: + if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: + max_span = max(length_from_cigar, vitem.second.length_from_cigar) + span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span + if span_distance < self.position_distance_thresh: + self.found_exact.push_back(node_name2) + else: + self.found2.push_back(node_name2) + # self.found_exact.push_back(node_name2) + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + return True # continue the loop + + cdef void search_forward(self, cpp_map[int, LocalVal]* forward_scope, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len):# nogil: + cdef cpp_map[int, LocalVal].iterator local_it + cdef cpp_pair[int, LocalVal] vitem + cdef int steps = 0 + local_it = forward_scope.lower_bound(pos2) + while local_it != forward_scope.end() and steps < self.max_search_depth: + vitem = dereference(local_it) + steps += 1 + if not self.process_vitem(vitem, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len): + break + preincrement(local_it) + + cdef void search_backward(self, cpp_map[int, LocalVal]* forward_scope, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len):# nogil: + cdef cpp_map[int, LocalVal].iterator local_it + cdef cpp_pair[int, LocalVal] vitem + cdef int steps = 0 + local_it = forward_scope.lower_bound(pos2) + if local_it != forward_scope.begin(): + predecrement(local_it) + else: + return + + while True: + vitem = dereference(local_it) + steps += 1 + if not self.process_vitem(vitem, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len): + break + if local_it == forward_scope.begin() or steps >= self.max_search_depth: + break + predecrement(local_it) + + cdef void find_other_nodes(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, + ReadEnum_t read_enum, int length_from_cigar, bint 
trust_ins_len): # nogil: + if not self.found2.empty(): + self.found2.clear() + if not self.found_exact.empty(): + self.found_exact.clear() + + cdef cpp_map[int, LocalVal]* forward_scope if chrom2 == 10000000: forward_scope = &self.chrom_scope.back() else: forward_scope = &self.chrom_scope[chrom2] - cdef cpp_map[int, LocalVal].iterator local_it, local_it2 - cdef cpp_pair[int, LocalVal] vitem - cdef float max_span, span_distance - # Debug - # echo("current_chrom ", current_chrom, "chrom2", chrom2, "current_pos", current_pos, "pos2", pos2) - # echo("Forward scope len", forward_scope.size(), "Loci scope len", self.loci.size()) - # local_it = forward_scope.begin() - # while local_it != forward_scope.end(): - # vitem = dereference(local_it) - # echo(vitem.first, vitem.second) - # preincrement(local_it) - - # Re-initialize empty + + # Re-initialize if chromosome has changed if current_chrom != self.local_chrom: self.local_chrom = current_chrom self.empty_scopes() - if not self.loci.empty(): - # Erase items out of range in forward scope - local_it = self.loci.lower_bound(current_pos - self.clst_dist) - local_it2 = self.loci.begin() - while local_it2 != local_it: - vitem = dereference(local_it2) - self.chrom_scope[vitem.second.chrom2].erase(vitem.second.pos2) - preincrement(local_it2) - if local_it != self.loci.begin(): - self.loci.erase(self.loci.begin(), local_it) - - local_it = forward_scope.lower_bound(pos2) - steps = 0 - if local_it != forward_scope.end(): - while local_it != forward_scope.end() and steps < self.max_search_depth: - vitem = dereference(local_it) - preincrement(local_it) - steps += 1 - - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - continue - node_name2 = vitem.second.node_name - if node_name2 == node_name: # Can happen due to within-read events - continue - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - if sep >= self.max_dist: - break - sep2 = c_abs(vitem.second.pos2 - current_pos) - if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: - if sep < 150: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - if span_distance < self.position_distance_thresh: - found_exact.push_back(node_name2) - else: - found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - if not found_exact.empty(): - return found_exact - - local_it = forward_scope.lower_bound(pos2) - vitem = dereference(local_it) - if local_it != forward_scope.begin(): - predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice - steps = 0 - while local_it != 
forward_scope.begin() and steps < self.max_search_depth: - vitem = dereference(local_it) - predecrement(local_it) - steps += 1 - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - continue - node_name2 = vitem.second.node_name - if node_name2 == node_name: - continue - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - if sep >= self.max_dist: - break - sep2 = c_abs(vitem.second.pos2 - current_pos) - if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: - if sep < 150: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - if span_distance < self.position_distance_thresh: - found_exact.push_back(node_name2) - else: - found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - - if not found_exact.empty(): - return found_exact - else: - return found2 + if not self.loci.empty(): + self.erase_items_out_of_range(current_pos) + self.search_forward(forward_scope, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len) + if not self.found_exact.empty(): + return + self.search_backward(forward_scope, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len) cdef void add_item(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, - ReadEnum_t read_enum, int length_from_cigar): # nogil: - - # Add to scopes, if event is within read, add two references to forward scope. Otherwise when the - # first break point drops from the local scope, the forward scope break will not connect with other - # events that are added later. This is a fix for long read deletions mainly + ReadEnum_t read_enum, int length_from_cigar): # nogil: if chrom2 == -1: return # No chrom2 was set, single-end? 
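
# --- Illustrative sketch (not part of the patch): the search pattern that
# search_forward/search_backward above implement over the C++ sorted map.
# Candidates are keyed by breakpoint position; we seek lower_bound(pos2), then
# scan outwards in both directions, capping each direction at max_search_depth
# steps and stopping a direction early once the separation reaches max_dist.
# SortedDict is a Python stand-in for cpp_map; find_nearby is a hypothetical name.
from sortedcontainers import SortedDict

def find_nearby(scope, pos2, max_dist, max_depth):
    hits = []
    keys = scope.keys()
    start = scope.bisect_left(pos2)  # ~ cpp_map.lower_bound
    i, steps = start, 0
    while i < len(keys) and steps < max_depth:  # forward scan
        if abs(keys[i] - pos2) >= max_dist:
            break
        hits.append(scope[keys[i]])
        i += 1
        steps += 1
    i, steps = start - 1, 0
    while i >= 0 and steps < max_depth:  # backward scan, one before lower bound
        if abs(keys[i] - pos2) >= max_dist:
            break
        hits.append(scope[keys[i]])
        i -= 1
        steps += 1
    return hits

scope = SortedDict({100: "a", 180: "b", 900: "c"})
assert find_nearby(scope, 150, max_dist=200, max_depth=10) == ["b", "a"]
# --- end sketch
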
cdef cpp_map[int, LocalVal]* forward_scope @@ -434,6 +430,7 @@ cdef class PairedEndScoper: forward_scope = &self.chrom_scope.back() else: forward_scope = &self.chrom_scope[chrom2] + # Add to local scope cdef cpp_pair[int, LocalVal] pp pp.first = current_pos @@ -600,6 +597,7 @@ cdef class NodeToName: if self.stored_nodes[query_node].hash_val == self.stored_nodes[target_node].hash_val: return True + cdef get_query_pos_from_cigarstring(cigar, pos): # Infer the position on the query sequence of the alignment using cigar string cdef int end = 0 @@ -650,17 +648,6 @@ cdef void parse_cigar(str cigar, int *start, int *end, int *ref_end): in_lead = 0 c_cigar += 1 -def get_query_pos_from_cigartuples(r, query_length): - # Infer the position on the query sequence of the alignment using cigar string - start = 0 - # query_length = r.infer_read_length() # Note, this also counts hard-clips - end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: - start += r.cigartuples[0][1] - if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: - end -= r.cigartuples[-1][1] - return start, end, query_length - AlnBlock = namedtuple("SA", ["query_start", "query_end", "ref_start", "ref_end", "chrom", "mq", "strand", "this"]) JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "query_pos", "query_end", "read_enum", "cigar_index"]) @@ -675,6 +662,7 @@ class AlignmentsSA: self.join_result = [] self.query_length = r.infer_read_length() # Note, this also counts hard-clips self._alignments_from_sa(r, gettid) + self.cigar_l = 0 def connect_alignments(self, r, max_dist=1000, mq_thresh=0, read_enum=0): if len(self.query_aligns) > 1 and self.index is not None: @@ -683,15 +671,35 @@ class AlignmentsSA: if self.index < len(self.query_aligns) - 1: self._connect_right(r, max_dist, mq_thresh, read_enum) - def _alignments_from_sa(self, r, gettid): - qstart, qend, query_length = get_query_pos_from_cigartuples(r, self.query_length) + def _alignments_from_sa(self, AlignedSegment r, gettid): + + cdef int ref_start, ref_end, query_start, query_end, qstart, qend + + cdef uint32_t cigar_value, cigar_index + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int event_pos, opp, length + + qstart = 0 + qend = self.query_length + cigar_l = r._delegate.core.n_cigar + self.cigar_l = cigar_l + cigar_p = bam_get_cigar(r._delegate) + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + qstart += cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + qend += cigar_value >> 4 + this_aln = AlnBlock(query_start=qstart, query_end=qend, ref_start=r.pos, ref_end=r.reference_end, chrom=r.rname, mq=r.mapq, strand="-" if r.flag & 16 else "+", this=True) self.aln_strand = this_aln.strand query_aligns = [this_aln] - cdef int ref_start, ref_end, query_start, query_end for sa_block in r.get_tag("SA").split(";"): if sa_block == "": break @@ -702,7 +710,7 @@ class AlignmentsSA: ref_end = ref_start parse_cigar(sa[3], &query_start, &query_end, &ref_end) if this_aln.strand != sa[2]: - start_temp = query_length - query_end + start_temp = self.query_length - query_end query_end = start_temp + query_end - query_start query_start = start_temp query_aligns.append(AlnBlock(query_start, query_end, ref_start, ref_end, gettid(sa[0]), int(sa[4]), sa[2], False)) @@ -751,7 +759,7 @@ class AlignmentsSA: query_pos = self.query_length - query_end query_end = self.query_length - qtemp chrom = a.chrom - cigar_index = len(r.cigartuples) - 1 + cigar_index = 
self.cigar_l - 1 if b.strand == self.aln_strand: pos2 = b.ref_start else: @@ -802,15 +810,28 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ genome_scanner.add_to_buffer(r, node_name, tell) # Add read to buffer both_overlap = p1_overlaps and p2_overlaps + + # Vectors for making edges. 'exact' nodes are nodes with highly similar signatures + cdef vector[int] *found_nodes_exact = &pe_scope.found_exact + cdef vector[int] *found2 = &pe_scope.found2 + if not paired_end or (paired_end and read_enum != BREAKEND and not mm_only and not chrom2 == -1): - other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) - for other_node in other_nodes: - if node_to_name.same_template(node_name, other_node): - continue - if not G.hasEdge(node_name, other_node): - G.addEdge(node_name, other_node, 2) # 'black' edge - # if r.cigartuples[cigar_index][1] == 288 and event_pos > 159764: - # echo("be", node_name, other_node) + pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, + trust_ins_len) + + # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) + if not dereference(found_nodes_exact).empty(): + for other_node in dereference(found_nodes_exact): + if node_to_name.same_template(node_name, other_node): + continue + if not G.hasEdge(node_name, other_node): + G.addEdge(node_name, other_node, 2) # 'black' edge + else: + for other_node in dereference(found2): + if node_to_name.same_template(node_name, other_node): + continue + if not G.hasEdge(node_name, other_node): + G.addEdge(node_name, other_node, 2) # 'black' edge elif chrom != chrom2 and clip_l != -1: # Note all paired-end reads have BREAKENDS where chrom != chrom2, but also includes translocations cluster_clipped(G, r, clip_scope, chrom, event_pos, node_name) @@ -846,22 +867,34 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length): if len(quals) == 0: return 1 cdef char* char_ptr_rseq = bam_get_seq(r._delegate) - cdef int i, length, w_sum - cdef int window_length = 10 + cdef uint32_t i, length, w_sum + cdef uint32_t window_length = 10 if clip_length < window_length: window_length = clip_length cdef float avg_qual cdef int poly = window_length - 1 cdef total_good = window_length - 1 - ct = r.cigartuples - if r.flag & 2304: # supplementary, usually hard-clipped, assume good clipping - if ct[0][0] == 5 or ct[-1][0] == 5: + + cdef uint32_t first_cigar_value, last_cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef uint32_t opp + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + first_cigar_value = cigar_p[0] + last_cigar_value = cigar_p[cigar_l -1] + + if r.flag & 2304: # supplementary, usually hard-clipped + if r.mapq < 20 and first_cigar_value & 15 == 5 and last_cigar_value & 15 == 5: # hard clipped both sides + return 0 + if first_cigar_value & 15 == 5 or last_cigar_value & 15 == 5: return 1 cdef int[17] basecounts # char value is index into array cdef bint homopolymer cdef int base_index - if ct[0][0] == 4: - length = r.cigartuples[0][1] + + if first_cigar_value & 15 == 4: # left soft-clip + length = first_cigar_value >> 4 if length >= window_length and length >= clip_length: for i in range(length - window_length, -1, -1): # average of current window by counting leftwards @@ -886,8 +919,8 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length): return 1 total_good = 
window_length - 1 - if ct[-1][0] == 4: - length = r.cigartuples[-1][1] + if last_cigar_value & 15 == 4: + length = last_cigar_value >> 4 if length >= window_length and length >= clip_length: for i in range(len(r.query_qualities) - length, len(r.query_qualities) - window_length): # average of current window by counting rightwards @@ -933,6 +966,14 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int cdef uint64_t v cdef bint success cdef bint good_clip + + cdef uint32_t first_cigar_value, last_cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef uint32_t opp + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if paired_end and read_enum == SPLIT and flag & 8: # clip event, or whole read, but mate is unmapped return @@ -979,7 +1020,8 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int return if read_enum == DELETION or read_enum == INSERTION: chrom2 = chrom - if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + # if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + if cigar_p[cigar_index] & 15 != 1: # not insertion, use length of cigar event pos2 = cigar_pos2 else: pos2 = event_pos @@ -1000,7 +1042,9 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int return if read_enum == DELETION or read_enum == INSERTION: chrom2 = chrom - if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + + # if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + if cigar_p[cigar_index] & 15 != 1: pos2 = cigar_pos2 else: pos2 = event_pos @@ -1033,7 +1077,8 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int if read_enum == BREAKEND: return chrom2 = r.rname - if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion + # if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion + if cigar_index != -1 and cigar_p[cigar_index] & 15 != 1: # If not insertion pos2 = cigar_pos2 else: pos2 = event_pos @@ -1193,7 +1238,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph - cdef int event_pos, cigar_index, opp, length + node_to_name = NodeToName() # Map of nodes -> read ids cdef ClipScoper_t clip_scope = ClipScoper(minimizer_dist, k=k, m=m, clip_length=clip_l, # Keeps track of local reads minimizer_support_thresh=minimizer_support_thresh, @@ -1220,8 +1265,11 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p + cdef int event_pos, cigar_index, opp, length cdef long n_aligned_bases = 0 cdef int elide_threshold = 150 + cdef int left_clip_size, right_clip_size + for chunk in genome_scanner.iter_genome(): for r, tell in chunk: if r.mapq < mapq_thresh: @@ -1286,7 +1334,9 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering # Whole alignment will be used, try infer position from soft-clip cigar_index = -1 pos2 = -1 - left_clip_size, right_clip_size = clip_sizes_hard(r) # soft and hard-clips + left_clip_size = 0 + right_clip_size = 0 + clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips if r.flag & 8 and clipped: # 
paired by inference # skip if both ends are clipped, usually means its a chunk of badly mapped sequence # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index 84960a2..4c37bf6 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -1,10 +1,9 @@ -#cython: language_level=2, boundscheck=True, wraparound=True -#distutils: language=c++ +#cython: language_level=3 import numpy as np cimport numpy as np import logging -from map_set_utils import merge_intervals, echo +from dysgu.map_set_utils import merge_intervals, Py_BasicIntervalTree, echo from collections import defaultdict from importlib.metadata import version import sortedcontainers @@ -12,37 +11,33 @@ import pandas as pd import os import sys import gzip -from dysgu.map_set_utils import Py_BasicIntervalTree import random from libc.stdlib cimport malloc -cdef char *basemap = [ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', 'T', '\0', 'G', '\0', '\0', '\0', 'C', '\0', '\0', '\0', '\0', '\0', '\0', 'N', '\0', - '\0', '\0', '\0', '\0', 'A', 'A', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', 't', '\0', 'g', '\0', '\0', '\0', 'c', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', 'a', 'a' ] +cdef char *basemap = [ b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'T', b'\0', b'G', b'\0', b'\0', b'\0', b'C', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'N', b'\0', + b'\0', b'\0', b'\0', b'\0', b'A', b'A', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b't', b'\0', b'g', b'\0', b'\0', b'\0', b'c', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'a', b'a' ] np.random.seed(0) random.seed(0) cpdef str reverse_complement(str seq, int seq_len): - """https://bioinformatics.stackexchange.com/questions/3583/\ - what-is-the-fastest-way-to-get-the-reverse-complement-of-a-dna-sequence-in-pytho/3595#3595""" - cdef char *seq_dest = malloc(seq_len + 1) - seq_dest[seq_len] = '\0' + seq_dest[seq_len] = b'\0' - cdef bytes py_bytes = seq.encode('UTF-8') + cdef bytes py_bytes = seq.encode('ascii') cdef char *seq_src = py_bytes cdef int i = 0 for i in range(seq_len): seq_dest[seq_len - i - 1] = basemap[seq_src[i]] - return seq_dest[:seq_len].decode('UTF-8') + return seq_dest[:seq_len].decode('ascii') def bed_iter(path): diff --git a/dysgu/main.py b/dysgu/main.py index 1497dfa..eb25979 100644 --- a/dysgu/main.py +++ b/dysgu/main.py @@ -1,3 +1,5 @@ +# cython: language_level=3 + from __future__ import absolute_import import click import os @@ -126,10 +128,11 @@ def apply_preset(kwargs): kwargs["mode"] = "nanopore-r10" if kwargs["mode"] != "pe": kwargs["paired"] = "False" - p = 
presets[kwargs["mode"]] options = new_options_set options = {k: v for k, v in options.items() if v is not None} - kwargs.update(defaults) + for k, v in defaults.items(): + if kwargs.get(k) is None: + kwargs[k] = v kwargs.update(presets[kwargs["mode"]].items()) kwargs.update(options) @@ -250,7 +253,7 @@ def cli(): default="True", type=click.Choice(["True", "False"]), show_default=True) @click.option("--drop-gaps", help="Drop SVs near gaps +/- 250 bp of Ns in reference", default="True", type=click.Choice(["True", "False"]), show_default=True) -@click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold. Default for paired-end data is (insert-median + 5*insert_std) for paired reads, or 700 bp for single-end reads", +@click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold. Default for paired-end data is (insert-median + 5*insert_std) for paired reads, or 1000 bp for single-end reads", default=None, type=int, show_default=False) @click.option("--paired", help="Paired-end reads or single", default="True", show_default=True, type=click.Choice(["True", "False"])) @@ -412,7 +415,7 @@ def get_reads(ctx, **kwargs): @click.option("--drop-gaps", help="Drop SVs near gaps +/- 250 bp of Ns in reference", default="True", type=click.Choice(["True", "False"]), show_default=True) @click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold, default is (insert-median + 5*insert_std) for paired" - "reads, or 700 bp for single-end reads", + "reads, or 1000 bp for single-end reads", default=None, type=int, show_default=False) @click.option("--paired", help="Paired-end reads or single", default="True", show_default=True, type=click.Choice(["True", "False"])) @@ -594,3 +597,7 @@ def test_command(ctx, **kwargs): click.echo("PASS: " + c + "\n", err=True) logging.info("Run test complete") + + +if __name__ == "__main__": + cli() diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 587e54b..c33d7ff 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -9,6 +9,9 @@ cimport numpy as np from libc.stdint cimport uint64_t, int32_t, int8_t +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_cigar + ctypedef cpp_vector[int] int_vec_t ctypedef cpp_pair[int, int] get_val_result @@ -291,9 +294,9 @@ cdef extern from "" namespace "std" nogil: cdef int cigar_exists(r) -cdef tuple clip_sizes(r) +cdef void clip_sizes(AlignedSegment r, int& left, int& right) -cdef tuple clip_sizes_hard(r) +cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right) cdef int cigar_clip(r, int clip_length) @@ -322,4 +325,4 @@ cdef class EventResult: cdef public bint preciseA, preciseB, linked, modified, remapped cdef public int8_t svlen_precise cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \ - partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info + partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index e97101b..e883dd9 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -1,7 +1,5 @@ #cython: language_level=3 import click -import numpy as np -cimport numpy as np import cython import time import logging @@ -12,6 +10,9 @@ from libc.stdlib cimport abs as c_abs from libc.math cimport fabs as c_fabs from libc.stdint cimport 
uint32_t, uint16_t, int16_t, int32_t +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_cigar + ctypedef cpp_pair[int, int] cpp_item ctypedef cpp_pair[long, int] cpp_long_item @@ -245,35 +246,70 @@ cdef int cigar_exists(r): return 0 -cdef tuple clip_sizes(r): - c = r.cigartuples - if not c: - return 0, 0 - - cdef int left = 0 - cdef int right = 0 - - if c[0][0] == 4: - left = c[0][1] - if c[-1][0] == 4: - right = c[-1][1] - return left, right - - -cdef tuple clip_sizes_hard(r): - c = r.cigartuples - if not c: - return 0, 0 - - cdef int left = 0 - cdef int right = 0 - c1 = c[0][0] - if c1 == 4 or c1 == 5: - left = c[0][1] - c1 = c[-1][0] - if c1 == 4 or c1 == 5: - right = c[-1][1] - return left, right +cdef void clip_sizes(AlignedSegment r, int& left, int& right): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4: + left = cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4: + right = cigar_value >> 4 + + # c = r.cigartuples + # if not c: + # return 0, 0 + # + # cdef int left = 0 + # cdef int right = 0 + # + # if c[0][0] == 4: + # left = c[0][1] + # if c[-1][0] == 4: + # right = c[-1][1] + # return left, right + + +cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + left = cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + right = cigar_value >> 4 + + # c = r.cigartuples + # if not c: + # return 0, 0 + # + # cdef int left = 0 + # cdef int right = 0 + # c1 = c[0][0] + # if c1 == 4 or c1 == 5: + # left = c[0][1] + # c1 = c[-1][0] + # if c1 == 4 or c1 == 5: + # right = c[-1][1] + # return left, right cdef int cigar_clip(r, int clip_length): @@ -447,6 +483,7 @@ cdef class EventResult: self.n_sa = 0 self.n_gaps = 0 self.compress = 0 + self.qnames = set([]) def __repr__(self): return str(to_dict(self)) diff --git a/dysgu/merge_svs.pyx b/dysgu/merge_svs.pyx index b2c6cff..a3b8f1a 100644 --- a/dysgu/merge_svs.pyx +++ b/dysgu/merge_svs.pyx @@ -1,15 +1,19 @@ # cython: language_level=3 + from __future__ import absolute_import + +from multiprocessing import Pool + import numpy as np import random -from collections import defaultdict +from collections import defaultdict, deque import networkx as nx from dysgu import consensus, io_funcs from dysgu.map_set_utils cimport is_reciprocal_overlapping, EventResult from dysgu.map_set_utils import echo from dysgu.io_funcs import intersecter -import itertools from cython.operator import dereference +from functools import cmp_to_key ctypedef EventResult EventResult_t @@ -26,15 +30,20 @@ def get_chrom_key(ei): def compare_subset(potential, int max_dist, int max_comparisons): tmp_list = defaultdict(list) cdef int idx, jdx, dist1, ci_a + cdef int half_d = (max_dist * 0.5) for idx in range(len(potential)): ei = potential[idx] - tmp_list[get_chrom_key(ei)].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + chrom_key = get_chrom_key(ei) + 
tmp_list[chrom_key].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + if ei.chrA == ei.chrB and ei.svlen > half_d: + tmp_list[chrom_key].append( + (ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) nc2 = {k: io_funcs.iitree(v, add_value=True) for k, v in tmp_list.items()} for idx in range(len(potential)): ei = potential[idx] ols = nc2[get_chrom_key(ei)].allOverlappingIntervals(ei.posA, ei.posA + 1) - ols.remove(idx) + ols = [i for i in set(ols) if i != idx] if len(ols) > max_comparisons: random.shuffle(ols) ols = ols[:max_comparisons] @@ -43,11 +52,6 @@ def compare_subset(potential, int max_dist, int max_comparisons): yield ei, ej, idx, jdx -def compare_all(potential): - for idx, jdx in itertools.product(range(len(potential)), range(len(potential))): - yield potential[idx], potential[jdx], idx, jdx - - cdef span_position_distance(ei, ej): if ei.svtype != "INS": span1 = abs(ei.posB - ei.posA) @@ -129,7 +133,6 @@ def consistent_alignment_and_cigars(ei, ej, l_ratio): if ei.contig_ref_end > 0 and ej.contig_ref_end > 0: ref_overlap = min(ei.contig_ref_end, ej.contig_ref_end) - max(ei.contig_ref_start, ej.contig_ref_start) if ref_overlap < 0: - #echo("no ref overlap", (ei.contig_ref_start, ej.contig_ref_start), (ei.contig_ref_end, ej.contig_ref_end)) return False else: return True @@ -147,16 +150,20 @@ def consistent_alignment_and_cigars(ei, ej, l_ratio): if ej.svtype == "DEL": diff_i = sum(l for op, l in ei.contig_cigar if op == 2 and l > 10) diff_j = sum(l for op, l in ej.contig_cigar if op == 2 and l > 10) - #echo(ei.contig_cigar) - #echo(ej.contig_cigar) tot_non_ref = diff_i + diff_j tot_sv_len = ei.svlen + ej.svlen ratio = tot_non_ref / tot_sv_len - #echo("RATIO", ratio, tot_non_ref, tot_sv_len) if ratio > 1.8: # or ratio < 0.25: return False return True + +def jaccard_similarity(set1, set2): + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union if union != 0 else 0 + + def get_consensus_seqs(ei, ej): ci = ei.contig ci2 = ei.contig2 @@ -178,8 +185,6 @@ def get_consensus_seqs(ei, ej): def contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): if ci and cj: yield ci, cj - if ci_alt and cj_alt: - yield ci_alt, cj_alt if ci2 and cj2: yield ci2, cj2 if ci2 and cj: @@ -194,263 +199,408 @@ def contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): yield ci, cj_alt if ci2 and cj_alt: yield ci2, cj_alt + if ci_alt and cj_alt: + yield ci_alt, cj_alt -cdef int matches_with_max_gap(str cigar, int max_gap): +cdef int matches_with_max_gap(char *c_cigar, int max_gap): # get matching bases, and filter large alignment gaps cdef: - # int idx = 0 int matches = 0 + int gaps = 0 int num = 0 # To accumulate the number as we parse through the string char op - cdef bytes t = cigar.encode('utf8') - cdef char *c_cigar = t # Convert Python string to char* - # cigartuples = [] + while dereference(c_cigar): if b'0' <= dereference(c_cigar) <= b'9': # Convert digit char to int and accumulate num = num * 10 + (dereference(c_cigar) - 48 ) else: op = dereference(c_cigar) - if (op == b'D' or op == b'I') and num > max_gap: - return 0 + if op == b'D' or op == b'I': + if num > max_gap: + return 0 + gaps += num if op == b'M': matches += num - # cigartuples.append((cigar[idx], num)) num = 0 c_cigar += 1 - # idx += 1 + if matches and gaps / matches > 0.05: #todo + return 0 return matches -cdef bint bad_alignment(alignment, ei, ej, v): - if not alignment: - return True - cdef int matches = 
matches_with_max_gap(alignment.cigar, 30) - if matches == 0: +cdef bint bad_insertion_coverage(char *c_cigar, seq, bint is_query): + # Count the matching bases over the lowercase 'insertion' seq only + cdef: + int seq_index = 0 + int matches = 0 + int gaps = 0 + int num = 0 + int i + char op + while dereference(c_cigar): + if b'0' <= dereference(c_cigar) <= b'9': + num = num * 10 + (dereference(c_cigar) - 48) + else: + op = dereference(c_cigar) + if op == b'D' or op == b'I': + if seq[seq_index].islower(): + gaps += num + if op == b'D' and not is_query: + seq_index += num + if op == b'M': + for i in range(seq_index, seq_index + num - 1): + if seq[i].islower(): + matches += 1 + seq_index += num + num = 0 + c_cigar += 1 + if not matches or gaps / matches > 0.05: #todo + return True + return False + + +cdef bint bad_alignment(alignment, ei, ej, v, paired_end): + if not alignment: return True qs = alignment.query_begin qe = alignment.query_end + qlen = len(v[0]) + tlen = len(v[1]) ts = alignment.target_begin te = alignment.target_end_optimal - total_sv_length = ei.svlen + ej.svlen - unaligned_bases = min(qs, ts) + min(len(v[0]) - qe, (len(v[1]) - te)) - if unaligned_bases > total_sv_length: + + q_right_clip = qlen - qe + t_right_clip = tlen - te + + if not paired_end: + soft_clip_tolerance = max(10, min(qlen, tlen) * 0.04) + matches_threshold = min(qlen, tlen) * 0.6 + max_gap = 15 + ins_cov = 0.8 + else: + soft_clip_tolerance = 10 + matches_threshold = min(qlen, tlen) * 0.5 + max_gap = 5 + ins_cov = 0.9 + + cdef bytes t = alignment.cigar.encode('utf8') + cdef char *c_cigar = t + + # Skip alignments with larger gaps or not many matching bases + cdef int matches = matches_with_max_gap(c_cigar, max_gap) + if matches == 0 or matches < matches_threshold: return True + # Keep if aligned inserted bases ~ svlen + if (ei.svtype == "INS" and ei.spanning > 0) or (ej.svtype == "INS" and ej.spanning > 0): + a_into_b = True + b_into_a = True + if ei.svtype == "INS" and ei.spanning > 0: + ei_ins_count = len([i for i in v[0][qs:qe] if i.islower()]) + if ei_ins_count / ei.svlen < ins_cov: + a_into_b = False + elif bad_insertion_coverage(c_cigar, v[0][qs:qe], True): + a_into_b = False + + if ej.svtype == "INS" and ej.spanning > 0: + ej_ins_count = len([i for i in v[1][ts:te] if i.islower()]) + if bad_insertion_coverage(c_cigar, v[1][ts:te], False): + b_into_a = False + elif ej_ins_count / ej.svlen < ins_cov: + b_into_a = False + if not a_into_b and not b_into_a: + return True + + # Keep, if one sequence is more or less completely aligned to the other + if qs < soft_clip_tolerance and q_right_clip < soft_clip_tolerance: + return False + if ts < soft_clip_tolerance and t_right_clip < soft_clip_tolerance: + return False + + if qs > soft_clip_tolerance: + if q_right_clip > soft_clip_tolerance: + return True + if ts > soft_clip_tolerance: + return True + + if ts > soft_clip_tolerance: + if t_right_clip > soft_clip_tolerance: + return True + if qs > soft_clip_tolerance: + return True + + if q_right_clip > soft_clip_tolerance: + if qs > soft_clip_tolerance: + return True + if t_right_clip > soft_clip_tolerance: + return True + + if t_right_clip > soft_clip_tolerance: + if ts > soft_clip_tolerance: + return True + if q_right_clip > soft_clip_tolerance: + return True + + return False # ok + + +def process_contig_aignments(ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample): + for v in contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): + if not v[0] or not v[1]: + continue + if same_sample: + 
res = consensus.check_contig_match(v[0], v[1], return_int=False) + if bad_alignment(res, ei, ej, v, paired_end): + break + return idx, jdx + # echo("---->MERGED3", ei.svlen, ej.svlen, ei.svtype, (idx, jdx)) + + elif consensus.check_contig_match(v[0], v[1], return_int=True): + return idx, jdx + def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, rel_diffs=False, diffs=15, - same_sample=True, aggressive_ins_merge=False, debug=False, max_comparisons=20): - event_iter = compare_all(potential) if len(potential) < 50 else compare_subset(potential, max_dist, max_comparisons) + same_sample=True, aggressive_ins_merge=False, debug=False, max_comparisons=20, procs=1): + event_iter = compare_subset(potential, max_dist, max_comparisons) - seen, disjoint_nodes = set(), set() + seen, disjoint_edges = set(), set() out_edges = defaultdict(int) - #echo("LEN potential", len(potential), paired_end) + + avg_su_thresh = max(np.median([i.su for i in potential]) * 0.4, 10) + + job = [] + pool = None + if procs > 1: + pool = Pool(procs) for ei, ej, idx, jdx in event_iter: + if len(job) > 1000: + if procs == 1: + for item in job: + edge = process_contig_aignments(*item) + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) + else: + for edge in pool.starmap(process_contig_aignments, job): + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) + job = [] + i_id, j_id = ei.event_id, ej.event_id - ins_dup = (ei.svtype == "INS" and ej.svtype == "DUP") or (ei.svtype == "DUP" and ej.svtype == "INS") + id_key = (min(idx, jdx), max(idx, jdx)) + + # if id_key not in seen: + # echo("merge candidate", ei.svlen, ej.svlen, "positions", ei.posA, ej.posA, ei.svtype, ej.svtype, 'SU=', ei.su, ej.su) + + if ei.su > avg_su_thresh and ej.su > avg_su_thresh and ei.spanning > 0 and ej.spanning > 0: + disjoint_edges.add(id_key) - if (i_id == j_id or (i_id, j_id) in seen or (j_id, i_id) in seen or - out_edges[idx] > 10 or out_edges[jdx] > 10 or - (not same_sample and ei.sample == ej.sample) or - (ej.svtype == "DEL" and ei.svtype != "DEL") or - (ei.svtype == "DEL" and ej.svtype != "DEL")): + if (same_sample and i_id == j_id) or \ + (not same_sample and ei.sample == ej.sample) or \ + id_key in seen or \ + out_edges[idx] > 10 or out_edges[jdx] > 10: + # or + # (ej.svtype == "DEL" and ei.svtype != "DEL") or + # (ei.svtype == "DEL" and ej.svtype != "DEL")): # (ei.svtype in {"INS", "DUP"} and ej.svtype not in {"INS", "DUP"}) or # ei.svtype != ej.svtype): + continue - continue + seen.add(id_key) - seen.add((i_id, j_id)) intra = ei.chrA == ej.chrA and ei.chrB == ej.chrB - loci_similar = True - if paired_end: - loci_similar = similar_locations(intra, ei, ej) - if not loci_similar: - #echo("loci not similar") - continue + # loci_similar = True + # if paired_end: + # loci_similar = similar_locations(intra, ei, ej) + # if not loci_similar: + # continue if not intra: out_edges[idx] += 1 out_edges[jdx] += 1 - G.add_edge(i_id, j_id) - #echo("not intra added") + G.add_edge(idx, jdx) continue - #echo("merge candidate", ei.svlen, ej.svlen, "positions", ei.posA, ej.posA, ei.svtype, ej.svtype) - - one_is_imprecise = (not ei.preciseA or not ei.preciseB or ei.svlen_precise or - not ej.preciseA or not ej.preciseB or ej.svlen_precise) + if ei.spanning > 0 and ej.spanning > 0 and jaccard_similarity(ei.qnames, ej.qnames) > 0.1: + disjoint_edges.add(id_key) + continue any_contigs_to_check, ci, ci2, ci_alt, cj, cj2, cj_alt = get_consensus_seqs(ei, ej) if paired_end: overlap = max(0, 
min(ei.posB, ej.posB) - max(ei.posA, ej.posA))
             if ei.spanning > 0 and ej.spanning > 0 and overlap == 0 and ei.svtype != "INS":
-                disjoint_nodes.add(i_id)
-                disjoint_nodes.add(j_id)
-                #echo("filtered disjoint")
+                disjoint_edges.add(id_key)
                 continue
         if (same_sample and ei.svtype == "DEL" and ei.su < 3 and ej.su < 3 and not any_contigs_to_check and
                 ei.spanning == 0 and ej.spanning == 0 and ei.sc == 0 and ej.sc == 0):
-            #echo("other1")
             continue
         ml = max(int(ei.svlen), int(ej.svlen))
         if ml == 0 and ei.svtype != 'TRA':
-            #echo("other2")
             continue
         l_ratio = min(int(ei.svlen), int(ej.svlen)) / ml if ml else 1
-        merge_conditions_met = False
-        if ins_dup or ei.svtype == "TRA" or ej.svtype == "TRA":
-            merge_conditions_met = True
-        elif ei.svtype == "INS":
-            if aggressive_ins_merge or paired_end: #(paired_end and isinstance(ei.variant_seq, str) and isinstance(ej.variant_seq, str) and l_ratio > 0.7):
-                merge_conditions_met = True
-            elif ml > 0 and l_ratio > 0.7:
-                merge_conditions_met = True
-        else:
-            spd = span_position_distance(ei, ej)
+        if paired_end:  # Merge insertion-like sequences aggressively
+            ei_ins_like = ei.svtype in "INSDUPTRA"
+            ej_ins_like = ej.svtype in "INSDUPTRA"
             recpi_overlap = is_reciprocal_overlapping(ei.posA, ei.posB, ej.posA, ej.posB)
-            both_in_include = intersecter(tree, ei.chrA, ei.posA, ei.posA + 1) and intersecter(tree, ei.chrB, ei.posB, ei.posB + 1)
-
-            merge_conditions_met = (
-                    (paired_end and ei.spanning > 0 and ej.spanning > 0 and (recpi_overlap or spd > 0.3)) or
-                    ((recpi_overlap or spd > 0.3 or (loci_similar and any_contigs_to_check)) and not both_in_include)
-            )
-
-
-        if not merge_conditions_met:
-            #echo("merge conditions not met", i_id, j_id, ei.svlen, ej.svlen)
-            continue
+            if ei_ins_like and ej_ins_like and abs(ei.posA - ej.posA) < 50 and (ei.spanning == 0 and ej.spanning == 0 and ei.remap_score == 0 and ej.remap_score == 0):
+                G.add_edge(idx, jdx, loci_same=False)
+                continue
+            elif recpi_overlap and ei.svtype == "DEL" and ej.svtype == "DEL" and ei.remap_score > 0 and ej.remap_score > 0:
+                G.add_edge(idx, jdx, loci_same=False)
+                continue
 
-        if same_sample and not paired_end and not consistent_alignment_and_cigars(ei, ej, l_ratio):
-            #echo("inconsistent cigars")
-            continue
 
         # Loci are similar, check contig match or reciprocal overlap
         if not any_contigs_to_check:
+            one_is_imprecise = (not ei.preciseA or not 
ei.preciseB or ei.svlen_precise or + not ej.preciseA or not ej.preciseB or ej.svlen_precise) if ml > 0 and (l_ratio > 0.5 or (one_is_imprecise and l_ratio > 0.3)): out_edges[idx] += 1 out_edges[jdx] += 1 - G.add_edge(i_id, j_id, loci_same=False) - #echo("MERGED2", ei.svlen, ej.svlen, ei.svtype) + G.add_edge(idx, jdx, loci_same=False) + # echo("MERGED2", ei.svlen, ej.svlen, ei.svtype) continue - #echo("no contigs to check fail") + elif procs > 1: + job.append( + (ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample) + ) else: - #echo("processing contig pairs", ci, ci2, ci_alt, cj, cj2, cj_alt) - for v in contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): - # echo(v) - if same_sample: - res = consensus.check_contig_match(v[0], v[1], return_int=False) - if bad_alignment(res, ei, ej, v): - continue - - G.add_edge(i_id, j_id, loci_same=True) - #echo("MERGED3", ei.svlen, ej.svlen, ei.svtype) - - elif consensus.check_contig_match(v[0], v[1], return_int=True): - G.add_edge(i_id, j_id, loci_same=True) - else: - continue - #echo("no align", v[0], v[1]) - break - - - - + edge = process_contig_aignments(ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample) + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) - #v = (ci_alt, cj_alt) if ci_alt and cj_alt else (ci, cj) if ci and cj else None - - # v = ((ci, cj) if (ci and cj) else (ci_alt, cj_alt)) if ci_alt and cj_alt else None - # if v is None: - # echo("skipped due to no contigs") - # continue - # - # if v and same_sample: - # - # - # res = consensus.check_contig_match(v[0], v[1], return_int=False) - # echo("RES", res, v[0], v[1], ei.svlen, ej.svlen) - # - # if res: - # qs, qe, ts, te, cig, _, _ = res - # if large_cigar_gap(cig, 30): - # echo("GAP too big", cig) - # continue - # total_sv_length = ei.svlen + ej.svlen - # aligned_bases = max(qe - qs, te - ts) - # unaligned_bases = qs + ts + (len(v[0]) - qe) + (len(v[1]) - te) - # echo(unaligned_bases, total_sv_length, aligned_bases, cig) - # echo(ei.contig_cigar, ej.contig_cigar) - # #if unaligned_bases < total_sv_length: - # # if aligned_bases > ref_overlap or unaligned_bases < total_sv_length: - # if unaligned_bases < total_sv_length: - # G.add_edge(i_id, j_id, loci_same=True) - # echo("MERGED3", ei.svlen, ej.svlen, ei.svtype) - # continue - # - # elif v and not same_sample and consensus.check_contig_match(v[0], v[1], return_int=True): - # G.add_edge(i_id, j_id, loci_same=True) - # continue - # - # # else: - # # Handle contig matching for alternate sequences - # for alt_combination in [(ci_alt, cj), (ci_alt, cj2), (cj_alt, ci), (cj_alt, ci2)]: - # echo("testing alt combo") - # if alt_combination[0] and alt_combination[1] and consensus.check_contig_match(alt_combination[0], alt_combination[1], return_int=True): - # out_edges[idx] += 1 - # out_edges[jdx] += 1 - # G.add_edge(i_id, j_id, loci_same=True) - # # echo(f"MERGED4") - # echo("MERGED4", ei.svlen, ej.svlen, ei.svtype) - # break - - return G, disjoint_nodes - - -def cut_components(G, disjoint_nodes): - components = nx.algorithms.components.connected_components(G) - if len(disjoint_nodes) > 0: - # try split this component into disjoint sets. This method works for small cluster sizes (most of the time) - # but can fail when there are many disjoint nodes. 
Label propagation might be needed for these
-        components2 = []
-        for c in components:
-            n_disjoin = set([])
-            for node in c:
-                if node in disjoint_nodes:
-                    n_disjoin.add(node)
-            if len(n_disjoin) <= 1:
-                components2.append(c)
+    if job:
+        for edge in pool.starmap(process_contig_aignments, job):
+            if not edge:
                 continue
-            out_e = defaultdict(list)
-            for node in n_disjoin:
-                for neigh in G.neighbors(node):
-                    out_e[neigh].append(node)
-            G3 = nx.Graph()
-            for k, v in out_e.items():
-                G3.add_edge(k, random.choice(v))
-            components2 += list(nx.algorithms.components.connected_components(G3))
-        return components2
-    return components
+            else:
+                G.add_edge(edge[0], edge[1], loci_same=True)
+
+    return G, disjoint_edges
+
+
+def split_graph_by_forbidden_edges(G, forbidden_pairs):
+
+    def bfs_find_and_cut(start, target, work_graph):
+        # Walk breadth-first from `start` and remove the first edge that
+        # would reach `target`, so the pair ends up in separate components
+        queue = deque([start])
+        visited = {start}
+        while queue:
+            current = queue.popleft()
+            for neighbor in work_graph.neighbors(current):
+                if neighbor == target:
+                    work_graph.remove_edge(current, neighbor)
+                    return True
+                if neighbor not in visited:
+                    visited.add(neighbor)
+                    queue.append(neighbor)
+        return False
 
-cpdef srt_func(c):
-    if c.type != "pe" and c.type != "":
-        return 100 + c.su
-    return c.su + (3 * c.spanning)
+    work_graph = G.copy()
 
+    node_component_size = {}
+    components = nx.connected_components(work_graph)
+    for component in components:
+        component_size = len(component)
+        max_degree = max(work_graph.degree(u) for u in component)
+        # If one node connects all others, don't cut
+        if max_degree == len(component) - 1:
+            continue
+        for node in component:
+            node_component_size[node] = component_size
+
+    for node1, node2 in forbidden_pairs:
+        if node1 not in work_graph or node2 not in work_graph:
+            continue
+        # For n=3, assume the nodes really should be merged. This can arise when
+        # one large INDEL matches two smaller INDELs that are adjacent on the same read
+        # if node_component_size.get(node1, 0) <= 3 or node_component_size.get(node2, 0) <= 3:
+        #     continue
+        bfs_find_and_cut(node1, node2, work_graph)
+
+    final_components = [work_graph.subgraph(c).copy() for c in nx.connected_components(work_graph)]
+    return final_components
+
+
+def srt_func(item1, item2):
+    # Comparator for choosing a merge-group representative:
+    # highest support first, then the longest SV
+    support1 = item1.su + (3 * item1.spanning)
+    support2 = item2.su + (3 * item2.spanning)
+    if support1 > support2:
+        return -1
+    elif support2 > support1:
+        return 1
+    # If supports are equal, compare svlen
+    if item1.svlen > item2.svlen:
+        return -1
+    elif item2.svlen > item1.svlen:
+        return 1
+    return 0  # If everything is equal
 
 
 def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pick_best=False, add_partners=False,
                  rel_diffs=False, diffs=15, same_sample=True, debug=False, min_size=0, aggressive_ins_merge=False,
-                 skip_imprecise=False, max_comparisons=100):
+                 skip_imprecise=False, max_comparisons=100, procs=1):
     """Try and merge similar events, use overlap of both breaks points
     """
     max_dist = max_dist / 2
@@ -458,30 +608,30 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic
         return potential
     # Cluster events on graph
     G = nx.Graph()
-    G, disjoint_nodes = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
+    G, forbidden_edges = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
                                          aggressive_ins_merge=aggressive_ins_merge,
-                                         debug=debug, max_comparisons=max_comparisons)
+                                         debug=debug, max_comparisons=max_comparisons, procs=procs)
     found = []
-    for item in potential:  # Add singletons, non-merged
-        if not G.has_node(item.event_id):
+    for idx, item in enumerate(potential):  # Add singletons, non-merged
+        if not G.has_node(idx):
             found.append(item)
     # Try and merge SVs with identical breaks, then merge ones with less accurate breaks - this helps prevent
     # over merging SVs that are close together
-    components = cut_components(G, disjoint_nodes)
-    node_to_event = {i.event_id: i for i in potential}
+    components = split_graph_by_forbidden_edges(G, forbidden_edges)
     cdef int k
     for grp in components:
-        best = [node_to_event[n] for n in grp]
+        best = [potential[n] for n in grp]
 
-        best.sort(key=srt_func, reverse=True)
+        best.sort(key=cmp_to_key(srt_func), reverse=False)
         w0 = best[0]
-        #echo(w0.contig, w0.contig2)
-        #echo([b.svtype for b in best])
-        #echo([b.svlen for b in best], w0.svlen, w0.posA, w0.svtype, w0.rep)
+        # echo('merge groups', [i for i in best])
+        # echo([(b.svlen, 
b.su, b.svtype) for b in best], w0.svtype, w0.svlen) if not pick_best: weight = w0.pe + w0.supp + w0.spanning spanned = bool(w0.spanning) @@ -498,6 +648,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic best_var_seq = w0.variant_seq for k in range(1, len(best)): item = best[k] + w0.qnames |= item.qnames w0.pe += item.pe w0.supp += item.supp w0.sc += item.sc @@ -514,7 +665,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic if not spanned: if item.spanning: w0.svlen = item.svlen - elif min_size > w0.svlen < item.svlen: + elif w0.remap_score == 0 and min_size > w0.svlen < item.svlen: w0.svlen = item.svlen elif item.svtype == "INS" and svt in {"INS","DUP","TRA","INV"}: if not spanned: @@ -522,9 +673,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic w0.svlen = item.svlen w0.variant_seq = item.variant_seq # elif item.svlen * 0.6 < w0.svlen < item.svlen or min_size > w0.svlen < item.svlen: - elif min_size > w0.svlen < item.svlen: - #echo("best var seq", best_var_seq) - # if best_var_seq == -1: + elif w0.remap_score == 0 and min_size > w0.svlen < item.svlen: w0.svlen = item.svlen w0.svtype = item.svtype if best_var_seq: @@ -562,4 +711,5 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic if add_partners: w0.partners = [i.event_id for i in best[1:]] found.append(w0) + return found diff --git a/dysgu/post_call.py b/dysgu/post_call.py index eabb885..6ae925d 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -13,6 +13,9 @@ import warnings pd.options.mode.chained_assignment = None from dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman +from dysgu.io_funcs import iitree +from dysgu.consensus import compute_rep +from collections import defaultdict import os import warnings from sklearn.exceptions import InconsistentVersionWarning @@ -352,7 +355,7 @@ def strand_binom_t(events): def get_ref_base(events, ref_genome, symbolic_sv_size): for e in events: - if not e.ref_seq: + if not e.ref_seq and not e.svtype == "BND": if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA": if e.posA == 0: e.posA = 1 @@ -657,6 +660,43 @@ def get_gt_metric2(events, mode, add_gt=True): return events +def filter_microsatellite_non_diploid(potential): + tmp_list = defaultdict(list) + max_dist = 50 + half_d = (max_dist * 0.5) + candidates = [] + for idx in range(len(potential)): + ei = potential[idx] + if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq: + continue + rep = compute_rep(ei.variant_seq) + if rep < 0.4: + continue + candidates.append(idx) + tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + if ei.chrA == ei.chrB and ei.svlen > half_d: + tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) + + nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()} + seen = set([]) + to_drop = set([]) + for idx in candidates: + if idx in seen: + continue + ei = potential[idx] + ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1)) + ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1)) + ols |= ols2 + ols.add(idx) + if len(ols) <= 2: + continue + bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:] + for jdx, p in bad: + to_drop.add(jdx) + seen |= ols + return [p for i, p in enumerate(potential) if i not in to_drop] + + def compressability(events): for e in events: c1 = [] diff 
--git a/dysgu/re_map.py b/dysgu/re_map.py
index c881c13..f6115df 100644
--- a/dysgu/re_map.py
+++ b/dysgu/re_map.py
@@ -94,8 +94,19 @@ def merge_align_regions(locations):
         if abs(s - last[0]) < merge_dist and abs(e - last[1]) < merge_dist:
             new_l[-1][1] = e
         else:
-            break
+            new_l.append([s, e])
             # return None
+    # Keep only the merged block whose length is closest to ~500 bp,
+    # the expected span around the break point
+    best_i = 0
+    dist = 100_000
+    for i, (s, e) in enumerate(new_l):
+        d = abs((e - s) - 500)
+        if d < dist:
+            dist = d
+            best_i = i
+    new_l = [new_l[best_i]]
+
     return new_l


diff --git a/dysgu/sv2bam.pyx b/dysgu/sv2bam.pyx
index f7c14c2..e84fe8a 100644
--- a/dysgu/sv2bam.pyx
+++ b/dysgu/sv2bam.pyx
@@ -1,4 +1,4 @@
-#cython: language_level=3
+# cython: language_level=3
 from __future__ import absolute_import
 import pysam

diff --git a/pyproject.toml b/pyproject.toml
index 66a8190..938cc7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ authors = [
     { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" }
 ]
 license = { text = "MIT" }
-requires-python = ">=3.10"
+requires-python = ">=3.9"
 dependencies = [
     "setuptools >= 61.0",
     "cython",

diff --git a/setup.py b/setup.py
index d255a8e..012713a 100644
--- a/setup.py
+++ b/setup.py
@@ -115,7 +115,9 @@ def get_extension_modules():
                                   include_dirs=include_dirs,
                                   runtime_library_dirs=runtime_dirs,
                                   extra_compile_args=extras,
-                                  define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
+                                  define_macros=[
+                                      ("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION"),
+                                  ],
                                   language="c++"))
     return cythonize(ext_modules)
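
The notes below restate the main techniques introduced by this patch as minimal, self-contained Python sketches. Identifiers that do not appear in the diff above are illustrative only, not part of the dysgu API.

Hot paths in graph.pyx and map_set_utils.pyx now read htslib's packed CIGAR array directly through bam_get_cigar() instead of r.cigartuples, which allocates a Python list of tuples on every access. Each packed 32-bit value stores the operation code in the low 4 bits (0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7='=', 8=X) and the length in the high 28 bits. A sketch of the decoding using hand-packed values rather than a real AlignedSegment; the rewritten Cython functions return results through C++ int& out-parameters to avoid tuple packing, whereas this sketch returns a tuple for clarity:

def pack(op, length):
    # Pack one CIGAR operation the way htslib stores it
    return (length << 4) | op

def clip_sizes(cigar):
    # Pure-Python equivalent of the new clip_sizes(): soft-clip lengths at each end
    if not cigar:
        return 0, 0
    left = cigar[0] >> 4 if cigar[0] & 15 == 4 else 0
    right = cigar[-1] >> 4 if cigar[-1] & 15 == 4 else 0
    return left, right

# 20S75M5S -> clips of (20, 5)
assert clip_sizes([pack(4, 20), pack(0, 75), pack(4, 5)]) == (20, 5)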
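
In merge_svs.pyx, compare_subset() now indexes both breakpoints of long intra-chromosomal calls, so a merge candidate can be reached through either end of the event. A sketch of the idea, with a naive list scan standing in for the iitree interval index (Event fields mirror the attributes used in the diff; overlapping() is a stand-in, not the dysgu API):

from collections import defaultdict, namedtuple

Event = namedtuple("Event", "chrA posA chrB posB svlen")

def overlapping(intervals, pos):
    # Naive stand-in for iitree.allOverlappingIntervals()
    return {idx for s, e, idx in intervals if s <= pos < e}

def candidate_pairs(events, max_dist=500):
    index = defaultdict(list)
    for idx, ev in enumerate(events):
        index[ev.chrA].append((ev.posA - max_dist, ev.posA + max_dist, idx))
        # New in this patch: index posB too, so one end of a long DEL/DUP/INV
        # can pair with a nearby call even when the other ends are far apart
        if ev.chrA == ev.chrB and ev.svlen > max_dist * 0.5:
            index[ev.chrB].append((ev.posB - max_dist, ev.posB + max_dist, idx))
    for idx, ev in enumerate(events):
        for jdx in overlapping(index[ev.chrA], ev.posA) - {idx}:
            yield idx, jdx

# A long DEL whose end lies near a second call's start is now a candidate pair
evs = [Event("chr1", 1000, "chr1", 10000, 9000), Event("chr1", 10100, "chr1", 10400, 300)]
assert (1, 0) in set(candidate_pairs(evs))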
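
Contig-versus-contig alignments are vetted by matches_with_max_gap(), which walks the alignment cigar string at C speed and discards alignments containing an indel larger than max_gap, or more than 5% gap bases overall. The same logic in plain Python (a behavioural port of the cdef function, not a performance claim):

def matches_with_max_gap(cigar, max_gap):
    # Return matched bases, or 0 if any indel exceeds max_gap or the
    # total gap fraction is above 5%
    matches = gaps = num = 0
    for ch in cigar:
        if ch.isdigit():
            num = num * 10 + (ord(ch) - 48)
        else:
            if ch in "DI":
                if num > max_gap:
                    return 0
                gaps += num
            elif ch == "M":
                matches += num
            num = 0
    if matches and gaps / matches > 0.05:
        return 0
    return matches

assert matches_with_max_gap("60M2I38M", 15) == 98
assert matches_with_max_gap("60M40I38M", 15) == 0   # gap too large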
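
Each call now carries the set of supporting read names (the new qnames attribute on EventResult). When two spanning calls draw support from largely the same reads, they are treated as distinct events carried by the same molecules (for example, two nearby indels on one long read) and the pair is recorded as a forbidden edge instead of being merged. The threshold used in enumerate_events() is a Jaccard similarity above 0.1:

def jaccard_similarity(set1, set2):
    # Same function as added to merge_svs.pyx
    union = len(set1 | set2)
    return len(set1 & set2) / union if union else 0

a = {"read1", "read2", "read3"}
b = {"read3", "read4"}
# 1 shared read out of 4 -> 0.25 > 0.1, so this pair would become a forbidden edge
assert jaccard_similarity(a, b) == 0.25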
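
With procs > 1, enumerate_events() defers contig alignment checks into a job list and flushes it through multiprocessing.Pool.starmap in batches of roughly 1000, adding a graph edge for every worker that reports a match. The batching pattern in isolation, with a placeholder worker (align_pair is illustrative, not the dysgu function):

from multiprocessing import Pool

def align_pair(i, j):
    # Placeholder worker: the real code aligns two consensus contigs and
    # returns an edge (i, j) on success, or None
    return (i, j) if (i + j) % 2 == 0 else None

def run_batched(pairs, procs=2, batch_size=1000):
    edges = []
    with Pool(procs) as pool:
        job = []
        for pair in pairs:
            job.append(pair)
            if len(job) >= batch_size:
                edges += [e for e in pool.starmap(align_pair, job) if e]
                job = []
        if job:  # flush the remainder, as enumerate_events() does after its loop
            edges += [e for e in pool.starmap(align_pair, job) if e]
    return edges

if __name__ == "__main__":
    print(run_batched([(0, 2), (1, 2), (3, 5)]))   # [(0, 2), (3, 5)]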
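
Forbidden pairs are resolved after clustering by split_graph_by_forbidden_edges(): a breadth-first walk starts from one node of the pair and deletes the edge on which the walk would first reach the other node, splitting the merge group. A compact sketch of that cut against networkx:

import networkx as nx
from collections import deque

def bfs_cut(work_graph, start, target):
    # Mirror of bfs_find_and_cut(): remove the first edge that reaches target
    queue = deque([start])
    visited = {start}
    while queue:
        current = queue.popleft()
        for neighbor in work_graph.neighbors(current):
            if neighbor == target:
                work_graph.remove_edge(current, neighbor)
                return True
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    return False

G = nx.Graph([(0, 1), (1, 2), (2, 3)])
bfs_cut(G, 0, 3)                      # removes the edge (2, 3)
assert list(nx.connected_components(G)) == [{0, 1, 2}, {3}]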
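
Group representatives are now chosen by sorting with the two-argument comparator srt_func (support first, then svlen) through functools.cmp_to_key. For this particular ordering a tuple key is equivalent, as the check at the end shows; the comparator form simply keeps the tie-break rules explicit:

from functools import cmp_to_key
from collections import namedtuple

Event = namedtuple("Event", "su spanning svlen")

def srt_func(a, b):
    # Comparator used in merge_events(): higher support first, then longer SV
    sa, sb = a.su + 3 * a.spanning, b.su + 3 * b.spanning
    if sa != sb:
        return -1 if sa > sb else 1
    if a.svlen != b.svlen:
        return -1 if a.svlen > b.svlen else 1
    return 0

events = [Event(5, 0, 100), Event(5, 2, 80), Event(11, 0, 90)]
by_cmp = sorted(events, key=cmp_to_key(srt_func))
by_key = sorted(events, key=lambda e: (e.su + 3 * e.spanning, e.svlen), reverse=True)
assert by_cmp == by_key  # Event(11,0,90), Event(5,2,80), Event(5,0,100)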
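
post_call.py gains filter_microsatellite_non_diploid(): when more than two repetitive INS/DEL calls (compute_rep() of at least 0.4 on the variant sequence) stack up within about 50 bp, only the two best-supported survive, on the grounds that a diploid sample carries at most two alleles at one locus. A much-simplified sketch of the core selection, with an explicit call cluster standing in for the interval-tree lookup:

from collections import namedtuple

Call = namedtuple("Call", "pos su")

def keep_diploid(calls):
    # Keep at most the two best-supported calls at one repetitive locus
    if len(calls) <= 2:
        return calls
    return sorted(calls, key=lambda c: c.su, reverse=True)[:2]

cluster = [Call(1000, 4), Call(1010, 21), Call(1015, 18)]
assert keep_diploid(cluster) == [Call(1010, 21), Call(1015, 18)]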
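
Finally, merge_align_regions() in re_map.py no longer stops at the first non-mergeable block; it collects every merged block and keeps the single one whose length is closest to ~500 bp, the expected span around the break. The selection step in isolation, matching the fixed hunk above:

def pick_block(blocks, expected=500):
    # Keep the merged block whose length is closest to the expected span
    best_i, dist = 0, 100_000
    for i, (s, e) in enumerate(blocks):
        d = abs((e - s) - expected)
        if d < dist:
            dist, best_i = d, i
    return [blocks[best_i]]

assert pick_block([(0, 120), (300, 830), (900, 2400)]) == [(300, 830)]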