diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index ee482dc..225fe0b 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -772,19 +772,19 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l svlen_adjusted = int(np.median([b[0] for b in size_pos_bounded])) posA_adjusted = int(np.median([b[1] for b in size_pos_bounded])) posB_adjusted = int(np.median([b[2] for b in size_pos_bounded])) - if svtype_m == 2: # del - posA = posA_adjusted - posB = posB_adjusted - svlen = svlen_adjusted - er.preciseA = True - er.preciseB = True - else: + # if svtype_m == 2: # del + # posA = posA_adjusted + # posB = posB_adjusted + # svlen = svlen_adjusted + # er.preciseA = True + # er.preciseB = True + # else: # to_assemble = False - er.preciseA = True - er.preciseB = True - posA = posA_adjusted - posB = posB_adjusted - svlen = svlen_adjusted + er.preciseA = True + er.preciseB = True + posA = posA_adjusted + posB = posB_adjusted + svlen = svlen_adjusted else: svlen = int(np.median([sp[4] for sp in spanning_alignments])) diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 8c7f300..072216a 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -837,6 +837,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N temp_dir=tdir, find_n_aligned_bases=find_n_aligned_bases, position_distance_thresh=args['sd'], + max_search_depth=args['search_depth'] ) sites_index = None if sites_adder: diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index 5e5a2c6..b5c44c6 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -262,7 +262,7 @@ cpdef filter_poorly_aligned_ends(spanning_alignments, float divergence=0.02): return spanning, 1 - (len(spanning) / len(spanning_alignments)) -cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_input, int end_input, int length_extend=15, float divergence=0.02): +cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_input, int end_input, int length_extend=15, float divergence=0.02, float len_t=3): # expand indel using cigar, merges nearby gaps into the SV event cdef int pos, end, l, opp, extent_left, extent_right, candidate_type, candidate_len, len_input, i, dist, dist_thresh, middle, last_seen_size cdef uint32_t cigar_value @@ -276,7 +276,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp candidate_type = cigar_value & BAM_CIGAR_MASK candidate_len = cigar_value >> BAM_CIGAR_SHIFT len_input = candidate_len - dist_thresh = min(len_input * 3, ( log2_32(1 + candidate_len) / divergence)) + dist_thresh = min( (len_input * len_t), ( log2_32(1 + candidate_len) / divergence)) i = cigarindex + 1 dist = 0 last_seen_size = candidate_len @@ -308,7 +308,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp else: candidate_len += l extent_right = end - dist_thresh = min(l * 3, ( log2_32(1 + candidate_len) / divergence)) + dist_thresh = min( (l * len_t), ( log2_32(1 + candidate_len) / divergence)) dist = 0 i += 1 @@ -342,7 +342,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp else: candidate_len += l extent_left = pos - dist_thresh = min(l * 3, ( log2_32(1 + candidate_len) / divergence) ) + dist_thresh = min( (l * len_t), ( log2_32(1 + candidate_len) / divergence) ) dist = 0 i -= 1 diff --git a/dysgu/filter_normals.py b/dysgu/filter_normals.py index 06ba4b5..7857808 100644 --- a/dysgu/filter_normals.py +++ b/dysgu/filter_normals.py @@ -118,8 +118,10 @@ def load_samples(args, pths): def output_vcf(args, template_vcf): hdr = template_vcf.header input_command = ' '.join(argv) - hdr.add_line('##FILTER=') + hdr.add_line('##FILTER=') + hdr.add_line('##FILTER=') hdr.add_line('##FILTER=') + hdr.add_line('##FILTER=') hdr.add_line('##FILTER=') hdr.add_line(f'##command="{input_command}"') return pysam.VariantFile("-" if not args['svs_out'] else args['svs_out'], "w", header=hdr) @@ -187,7 +189,7 @@ def get_sv_type(r, chrom, chrom2): def make_interval_tree(args, infile, sample_name, normal_vcfs): if not normal_vcfs: return None - intervals = defaultdict(lambda: Py_BasicIntervalTree()) + intervals = defaultdict(lambda: defaultdict(lambda: Py_BasicIntervalTree())) # chrom : SVTYPE : interval ignored = defaultdict(int) added = 0 for normal_vcf in normal_vcfs: @@ -217,18 +219,20 @@ def make_interval_tree(args, infile, sample_name, normal_vcfs): continue start = r.start # note pysam converts to zero-based index like bam stop = r.stop + svlen = r.info["SVLEN"] if "SVLEN" in r.info else 1000000 if chrom == chrom2 and stop < start: start = r.stop - 1 stop = r.start - 1 if chrom == chrom2 and abs(stop - start) <= distance: - intervals[chrom].add(min(start, stop) - distance, max(start, stop) + distance, idx) + intervals[chrom][r.info["SVTYPE"]].add(min(start, stop) - distance, max(start, stop) + distance, svlen) else: posB = get_posB(r) - intervals[chrom].add(start - distance, start + distance, idx) - intervals[chrom2].add(posB - distance, posB + distance, idx) + intervals[chrom][r.info["SVTYPE"]].add(start - distance, start + distance, svlen) + intervals[chrom2][r.info["SVTYPE"]].add(posB - distance, posB + distance, svlen) added += 1 - for tree in intervals.values(): - tree.index() + for tree_v in intervals.values(): + for tree in tree_v.values(): + tree.index() if ignored: logging.warning(f'Ignored SV types: {ignored}') if added == 0: @@ -405,16 +409,16 @@ def within_read_end_position(event_pos, svtype, cigartuples, cigar_index): end = event_pos + cigartuples[cigar_index][1] return end -def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, span_threshold=0.1): +def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, span_threshold=0.1, paired_end=True, cipos95=0): ct = r.cigartuples pos = r.pos - pos_original = None min_length = svlen * 0.2 # 0.25 max_distance = svlen * 10 ins_like = svtype == "DUP" or svtype == "INV" debug = False # debug = True cigar_index = 0 + # echo(span_threshold) for k, l in ct: if k == CHARD_CLIP or k == CSOFT_CLIP: cigar_index += 1 @@ -457,20 +461,22 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, end = pos + l if l > min_length and abs(posA - pos) < max_distance: if is_insertion: - if span_position_distance(posA, posA + svlen, pos, end, pos_threshold, span_threshold): + if min(svlen, l) / max(svlen, l) > span_threshold: if debug: echo("ret2", f"{posA}-{posA + svlen}", f"{pos}-{end}") return True elif span_position_distance(posA, posB, pos, end, pos_threshold, span_threshold): if debug: echo("ret3") return True - if l > 100 and ((k == CINS and is_insertion) or (k == CDEL and svtype == "DEL")): + if not paired_end and l > 20 and ((k == CINS and is_insertion) or (k == CDEL and svtype == "DEL")): this_l, this_pos, this_end = gap_size_upper_bound(r, cigar_index, pos, pos + 1) if this_l > min_length and abs(posA - this_pos) < max_distance: if is_insertion: - if span_position_distance(posA, posA + svlen, this_pos, this_end, pos_threshold, span_threshold): + if min(svlen, this_l) / max(svlen, this_l) > span_threshold: if debug: echo("ret2b", f"{posA}-{posA + svlen}", f"{this_pos}-{this_end}") return True + elif cipos95 and min(svlen + cipos95, this_l) / max(svlen + cipos95, this_l) > span_threshold: + return True elif span_position_distance(posA, posB, this_pos, this_end, pos_threshold, span_threshold): if debug: echo("ret3b") return True @@ -623,13 +629,16 @@ def has_clip(r): return False -def clip_align_matches(seq1, seq2, clip_side, paired_end): +def clip_align_matches(seq1, seq2, clip_side, paired_end, is_alt_seq=False): if not seq1 or not seq2: return False - min_seq = min(len(seq1), len(seq2)) + if not is_alt_seq: + min_seq = min(len(seq1), len(seq2)) + else: + min_seq = len(seq2) if min_seq < 8: - if min_seq == len(seq2): - return False + # if min_seq == len(seq2): + # return False if clip_side == SeqType.RIGHT_CLIP: seq2_clipped = seq2[int(len(seq2) / 2) + (1 if len(seq2) % 2 == 0 else 0):] if seq2_clipped.startswith(seq1): @@ -644,13 +653,15 @@ def clip_align_matches(seq1, seq2, clip_side, paired_end): a = aln(seq2) score = a.optimal_alignment_score pct = score / (min_seq * 2) + # echo(pct, "1") if pct > 0.7: return True else: el = edlib.align(seq1, seq2, mode="HW", task="locations") if el and el['locations']: - pct = el['locations'][0][1] - el['locations'][0][0] / min_seq - if pct > 0.7: + pct = 1 - (el['editDistance'] / min_seq) + # echo(pct, "2", min(len(seq1), len(seq2)), el['editDistance']) + if pct > 0.7 and (el['locations'][0][1] - el['locations'][0][0]) / min_seq > 0.7: return True if paired_end: seq1_e = seq1.encode("ascii") @@ -667,36 +678,34 @@ def matching_soft_clips(r, reads_with_nearby_soft_clips, pe): if not break_seqs.any_seqs: return False for clip_side, align, target_side in reads_with_nearby_soft_clips: - checked = False ct = align.cigartuples + if break_seqs.alt_seq: + if clip_align_matches(get_left_clip(align), break_seqs.alt_seq, SeqType.LEFT_CLIP, pe, True): + return True + if clip_align_matches(get_right_clip(align), break_seqs.alt_seq, SeqType.RIGHT_CLIP, pe, True): + return True if target_side == "A": if clip_side == SeqType.LEFT_CLIP and break_seqs.left_clip_A: - checked = True if clip_align_matches(get_left_clip(align), break_seqs.left_clip_A, SeqType.LEFT_CLIP, pe) or (pe and ct[0][0] == CHARD_CLIP): return True elif clip_side == SeqType.RIGHT_CLIP and break_seqs.right_clip_A: - checked = True if clip_align_matches(get_right_clip(align), break_seqs.right_clip_A, SeqType.RIGHT_CLIP, pe) or (pe and ct[-1][0] == CHARD_CLIP): return True else: if clip_side == SeqType.LEFT_CLIP and break_seqs.left_clip_B: - checked = True if clip_align_matches(get_left_clip(align), break_seqs.left_clip_B, SeqType.LEFT_CLIP, pe) or (pe and ct[0][0] == CHARD_CLIP): return True elif clip_side == SeqType.RIGHT_CLIP and break_seqs.right_clip_B: - checked = True if clip_align_matches(get_right_clip(align), break_seqs.right_clip_B, SeqType.RIGHT_CLIP, pe) or (pe and ct[-1][0] == CHARD_CLIP): return True - if not checked and break_seqs.alt_seq: - if clip_align_matches(get_left_clip(align), break_seqs.alt_seq, SeqType.LEFT_CLIP, pe): - return True - if clip_align_matches(get_right_clip(align), break_seqs.alt_seq, SeqType.RIGHT_CLIP, pe): - return True + return False def has_low_support(r, sample, support_fraction): d = r.samples[sample] + if "SVLEN" in r.info and "RPOLY" in r.info and r.info['RPOLY'] > r.info["SVLEN"]: + support_fraction *= 2 m = max(d['ICN'], d['OCN']) if m == 0: cov = d['COV'] @@ -712,7 +721,6 @@ def has_low_support(r, sample, support_fraction): def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): sf = support_fraction / 2 - # cov = r.samples[sample]['COV'] d = r.samples[sample] m = max(d['ICN'], d['OCN']) if m == 0: @@ -729,6 +737,11 @@ def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): return False +def has_low_covering_support(covering_reads, n_mapq0): + if covering_reads - n_mapq0 <= 2: + return True + + def too_many_clipped_reads(r, clipped_reads, support_fraction): sf = support_fraction / 2 d = r.samples[r.samples.keys()[0]] @@ -742,7 +755,6 @@ def too_many_clipped_reads(r, clipped_reads, support_fraction): cov = d['COV'] * ((min(d['ICN'], d['OCN']) / m)) # max_nearby_clipped_reads = round(3 + sf * cov) max_nearby_clipped_reads = round(1 + sf * cov) - # echo("max", clipped_reads, max_nearby_clipped_reads, support_fraction) if clipped_reads >= max_nearby_clipped_reads: return True return False @@ -780,7 +792,6 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, debug = False # debug = True for is_paired_end, aln in iterate_bams_single_region(bams, chromA, posA, pad, bam_is_paired_end): - if is_paired_end: distance = 50 clip_length = 3 @@ -917,19 +928,20 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa if abs(a_posA - posA) > pad or abs(a_posB - posB) > pad: continue if not is_insertion and not aln.flag & 10: # proper pair, mate unmapped - if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.7): + if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.75): if debug: echo("2") return "normal" if not is_insertion and matching_supplementary(aln, infile, posA, posB): if debug: echo("3") return "normal" - if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen): + if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen, span_threshold=0.75): if debug: echo("4") return "normal" cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=50, clip_length=3) else: - if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.7): + cipos95 = r.info["CIPOS95"] if "CIPOS95" in r.info else 0 + if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.75, paired_end=False, cipos95=cipos95): if debug: echo("5", svlen, aln.qname) return "normal" if matching_supplementary(aln, infile, posA, posB): @@ -946,18 +958,13 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa nearby_soft_clips += 1 cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) - if not is_paired_end and not covered: + if not is_paired_end and not covered or has_low_covering_support(covered, n_mapq_0): if debug: echo("7") return "lowSupport" if not is_paired_end and has_low_WR_support(r, sample, support_fraction, n_overlpapping, n_mapq_0): if debug: echo("8") return "lowSupport" - # echo(svtype) - # if svtype == "INV": - # support_fraction = support_fraction / 20 - # echo(support_fraction) if (not is_paired_end or (svtype in "INVTRA" and not any_contigs)) and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): - # if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): if debug: echo("9") return "lowSupport" if cached and matching_soft_clips(r, cached, is_paired_end): @@ -983,6 +990,28 @@ def update_filter_value(r, sample_name, old_filter_values, pass_prob, new_value= r.filter.add(v) +def check_for_interval_overlap(intervals, r, chrom_tid, chrom2_tid, posB, filter_results): + if not intervals: + return False + + svtype = r.info["SVTYPE"] + if chrom_tid in intervals and svtype in intervals[chrom_tid]: + posA_overlaps = set(intervals[chrom_tid][svtype].allOverlappingIntervals(r.pos, r.pos + 1)) + if chrom2_tid in intervals and svtype in intervals[chrom2_tid]: + posB_overlaps = set(intervals[chrom2_tid][svtype].allOverlappingIntervals(posB, posB + 1)) + matching_svlens = posA_overlaps.intersection(posB_overlaps) + if matching_svlens: + for other_svlen in matching_svlens: + if "SVLEN" in r.info: + if r.info["SVLEN"] > 500 or r.info["SVLEN"] < 100: + t = 0.8 + else: + t = 0.5 + if min(other_svlen, r.info["SVLEN"]) / max(other_svlen, r.info["SVLEN"]) > t: + filter_results['normal'] += 1 + return True + + def run_filtering(args): t0 = time.time() pths = get_bam_paths(args) @@ -995,26 +1024,37 @@ def run_filtering(args): pad = args["interval_size"] keep_all = args["keep_all"] min_prob = args['min_prob'] + min_mapq = args['min_mapq'] pass_prob = args['pass_prob'] support_fraction = args['support_fraction'] max_divergence = args['max_divergence'] filter_results = defaultdict(int) written = 0 + samp = list(vcf.header.samples)[0] + for idx, r in enumerate(vcf): - # if r.id != "187669": + # if r.id != "207736": # continue # echo(dict(r.info)) # echo(dict(r.samples[sample_name])) old_filter_value = list(r.filter.keys()) r.filter.clear() + mq_pri = r.samples[samp]["MAPQP"] + mq_sup = r.samples[samp]["MAPQS"] if "MAPQS" in r.samples[samp] else 0 + if mq_pri < min_mapq and mq_sup < min_mapq: + filter_results['lowMapQ'] += 1 + if keep_all: + update_filter_value(r, old_filter_value, pass_prob, new_value="lowMapQ") + out_vcf.write(r) + continue if min_prob != 0 and 'PROB' in r.samples[sample_name] and r.samples[sample_name]['PROB'] < min_prob: - filter_results['dropped, lowProb'] += 1 + filter_results['lowProb'] += 1 if keep_all: update_filter_value(r, old_filter_value, pass_prob, new_value="lowProb") out_vcf.write(r) continue if has_low_support(r, sample_name, support_fraction): - filter_results['dropped, low support'] += 1 + filter_results['lowSupport'] += 1 if keep_all: update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="lowSupport") out_vcf.write(r) @@ -1023,17 +1063,13 @@ def run_filtering(args): posB = get_posB(r) if chrom_tid == -1: logging.exception(f'Chromosome name {r.chrom} not found in bam file header') - if intervals: - if chrom_tid in intervals: - posA_overlaps = set(intervals[chrom_tid].allOverlappingIntervals(r.pos, r.pos+1)) - if chrom2_tid in intervals: - posB_overlaps = set(intervals[chrom2_tid].allOverlappingIntervals(posB, posB + 1)) - if posA_overlaps.intersection(posB_overlaps): - filter_results['dropped, normal'] += 1 - if keep_all: - update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") - out_vcf.write(r) - continue + + if check_for_interval_overlap(intervals, r, chrom_tid, chrom2_tid, posB, filter_results): + if keep_all: + update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normalOverlap") + out_vcf.write(r) + continue + if not bams: update_filter_value(r, sample_name, old_filter_value, pass_prob) out_vcf.write(r) @@ -1051,9 +1087,9 @@ def run_filtering(args): else: if keep_all: r.filter.add(reason) - update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") + update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value=reason) out_vcf.write(r) - filter_results[f'dropped, {reason}'] += 1 + filter_results[f'{reason}'] += 1 out_vcf.close() logging.info(f'Filter results: {dict(sorted(filter_results.items()))}') logging.info("dysgu filter {} complete, n={}, h:m:s, {}".format(args['input_vcf'], diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index a4674af..932c89b 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -279,18 +279,17 @@ cdef LocalVal make_local_val(int chrom2, int pos2, int node_name, ReadEnum_t rea cdef class PairedEndScoper: - cdef int clst_dist - cdef int max_dist - cdef int local_chrom + cdef int clst_dist, max_dist, local_chrom, max_search_depth cdef cpp_map[int, LocalVal] loci # Track the local breaks and mapping locations cdef vector[cpp_map[int, LocalVal]] chrom_scope # Track the mate-pair breaks and locations cdef float norm cdef float thresh # spd cdef float position_distance_thresh cdef bint paired_end - def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh): + def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh, max_search_depth): self.clst_dist = clst_dist self.max_dist = max_dist + self.max_search_depth = max_search_depth self.local_chrom = -1 self.norm = norm self.thresh = thresh @@ -348,7 +347,7 @@ cdef class PairedEndScoper: local_it = forward_scope.lower_bound(pos2) steps = 0 if local_it != forward_scope.end(): - while local_it != forward_scope.end() and steps < 20: + while local_it != forward_scope.end() and steps < self.max_search_depth: vitem = dereference(local_it) preincrement(local_it) steps += 1 @@ -386,7 +385,7 @@ cdef class PairedEndScoper: if local_it != forward_scope.begin(): predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice steps = 0 - while local_it != forward_scope.begin() and steps < 20: + while local_it != forward_scope.begin() and steps < self.max_search_depth: vitem = dereference(local_it) predecrement(local_it) steps += 1 @@ -1164,7 +1163,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering int paired_end=1, int read_length=150, bint contigs=True, float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False, sites=None, bint trust_ins_len=True, low_mem=False, temp_dir=".", - find_n_aligned_bases=True, position_distance_thresh=0.8): + find_n_aligned_bases=True, position_distance_thresh=0.8, max_search_depth=20): logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph cdef int event_pos, cigar_index, opp, length @@ -1173,7 +1172,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering minimizer_support_thresh=minimizer_support_thresh, minimizer_breadth=minimizer_breadth, read_length=read_length) # Infers long-range connections, outside local scope using pe information - cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh) + cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth) bad_clip_counter = BadClipCounter(infile.header.nreferences, low_mem, temp_dir) cdef Py_SimpleGraph G = map_set_utils.Py_SimpleGraph() diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index d486bfa..9409f60 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -129,7 +129,7 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", + ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] @@ -264,7 +264,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" @@ -309,7 +309,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, small_output): if small_output: - v = [r["GT"], r["GQ"], r['MAPQpri'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], + v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"], r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob'] ] diff --git a/dysgu/main.py b/dysgu/main.py index 3e73d2e..954921b 100644 --- a/dysgu/main.py +++ b/dysgu/main.py @@ -190,6 +190,7 @@ def cli(): @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set) @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True) @click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True) +@click.option('--search-depth', help="Search through this many local reads for matching SVs. Increase this to identify low frequency events", default=20, type=float, show_default=True) @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set) @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True) @@ -350,6 +351,7 @@ def get_reads(ctx, **kwargs): @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set) @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True) @click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True) +@click.option('--search-depth', help="Search through this many local reads for matching SVs. Increase this to identify low frequency events", default=20, type=float, show_default=True) @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set) @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True) @@ -471,8 +473,9 @@ def view_data(ctx, **kwargs): @click.option("--target-sample", help="If input_vcf if multi-sample, use target-sample as input", required=False, type=str, default="") @click.option("--keep-all", help="All SVs classified as normal will be kept in the output, labelled as filter=normal", is_flag=True, flag_value=True, show_default=False, default=False) @click.option("--ignore-read-groups", help="Ignore ReadGroup RG tags when parsing sample names. Filenames will be used instead", is_flag=True, flag_value=True, show_default=False, default=False) -@click.option("--max-divergence", help="Remove SV if normal_bam displays divergence > max-divergence at same location", default=0.08, type=float, show_default=True) +@click.option("--max-divergence", help="Remove SV if normal_bam displays divergence > max-divergence at same location", default=0.1, type=float, show_default=True) @click.option("--min-prob", help="Remove SV with PROB value < min-prob", default=0.1, type=float, show_default=True) +@click.option("--min-mapq", help="Remove SV with mean mapqq < min-mapq", default=10, type=float, show_default=True) @click.option("--pass-prob", help="Re-label SV as PASS if PROB value >= pass-prob", default=1.0, type=float, show_default=True) @click.option("--interval-size", help="Interval size for searching normal-vcf/normal-bams", default=1000, type=int, show_default=True) @click.option("--random-bam-sample", help="Choose N random normal-bams to search. Use -1 to ignore", default=-1, type=int, show_default=True)