diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3b7e848..6b23803 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: - name: Build wheels uses: pypa/cibuildwheel@v2.16.5 env: - CIBW_PROJECT_REQUIRES_PYTHON: ">=3.8" # >=3.8 <3.12 + CIBW_PROJECT_REQUIRES_PYTHON: ">=3.10" # >=3.8 <3.12 CIBW_SKIP: "*-win32 *-manylinux_i686 pp* *musl* " # cp311-macosx* CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_ARCHS_LINUX: "auto" diff --git a/README.rst b/README.rst index a19c837..d992af3 100644 --- a/README.rst +++ b/README.rst @@ -42,15 +42,16 @@ for calling structural variants using paired-end or long read sequencing data. S ⚙️ Installation --------------- -Dysgu requires Python >=3.8 - 3.10 plus htslib and has been tested on linux and MacOS. +Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on linux and MacOS. The list of python packages needed can be found in requirements.txt. To install:: - conda install -c bioconda -c conda-forge dysgu + pip install dysgu -Or, fetch from PyPI:: +Or, from conda:: + + conda install -c bioconda -c conda-forge dysgu - pip install dysgu To build from source, run the install script: ``bash INSTALL.sh``. diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index a897f5e..ee482dc 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -36,7 +36,7 @@ ctypedef enum ReadEnum_t: BREAKEND = 4 -cdef n_aligned_bases(AlignedSegment aln): +cpdef n_aligned_bases(AlignedSegment aln): cdef int opp, l, aligned, large_gaps, n_small_gaps, i cdef uint32_t cigar_value cdef uint32_t cigar_l = aln._delegate.core.n_cigar @@ -123,7 +123,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, MAPQsupp += a.mapq if a.has_tag("NM"): if a_bases: - NMsupp += float(a.get_tag("NM")) / a_bases + nm = float(a.get_tag("NM")) + NMsupp += nm / a_bases + NMbase += (nm - large_gaps) / a_bases if a.has_tag("AS"): this_as = a.get_tag("AS") if this_as > maxASsupp: @@ -173,7 +175,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, MAPQsupp += a.mapq if a.has_tag("NM"): if a_bases: - NMsupp += float(a.get_tag("NM")) / a_bases + nm = float(a.get_tag("NM")) + NMsupp += nm / a_bases + NMbase += (nm - large_gaps) / a_bases if a.has_tag("AS"): this_as = a.get_tag("AS") if this_as > maxASsupp: @@ -202,7 +206,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, clipped_base_quals += cbq clipped_bases += cb - cdef int tot = er.supp + total_pri + cdef float tot = er.plus + er.minus # er.supp + total_pri + if tot == 0: + return er.NMpri = (NMpri / total_pri) * 100 if total_pri > 0 else 0 er.NMsupp = (NMsupp / er.supp) * 100 if er.supp > 0 else 0 er.NMbase = (NMbase / tot) * 100 if tot > 0 else 0 @@ -222,7 +228,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef int within_read_end_position(event_pos, svtype, cigartuples, cigar_index): - cdef int i, end, idx, opp, length, cigar_skip, n_aligned_bases, target_bases + cdef int i, end if svtype == 1: # insertion (or other e.g. 
duplication/inversion within read) return event_pos + 1 else: # deletion type @@ -1607,7 +1613,7 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site if sites_index and int_node in sites_index: continue n = n2n[int_node] - if int_node in buffered_reads: + if buffered_reads and int_node in buffered_reads: aligns.append((n, buffered_reads[int_node])) continue fpos.append((n, int_node)) @@ -1664,59 +1670,58 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, # Sometimes partitions are not linked, happens when there is not much support between partitions # Then need to decide whether to call from a single partition n2n = data.n2n - seen = set(range(len(data.parts))) + seen = set(range(len(data.parts))) if data.parts else {} out_counts = defaultdict(int) # The number of 'outward' links to other clusters cdef int buffered_reads = 0 cdef bint add_to_buffer = 1 cdef int int_node cdef unsigned long[:] partition events = [] - if info: - sites_info = list(info.values()) - else: - sites_info = [] - # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts - for (u, v), d in data.s_between: #.items(): - rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] - rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - total_reads = len(rd_u) + len(rd_v) - buffered_reads += total_reads - if add_to_buffer and buffered_reads > 50000: - add_to_buffer = 0 - out_counts[u] += len(rd_u) - out_counts[v] += len(rd_v) - if len(rd_u) == 0 or len(rd_v) == 0: - continue - if u in seen: - seen.remove(u) - if v in seen: - seen.remove(v) - - events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, - assemble_contigs, - sites_info, paired_end) - - # finds reads that should be a single partition - # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) - # if len(u_reads) > 0 and len(v_reads) > 0: - # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, - # sites_info, paired_end) - # if u_single: - # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res - # if v_single: - # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - # sites_info, paired_end, length_extend, divergence) - # if res: - # if isinstance(res, EventResult): - # events.append(res) - # else: - # events += res + sites_info = list(info.values()) if info else [] + + if data.s_between: + # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts + for (u, v), d in data.s_between: #.items(): + rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] 
+ rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) + total_reads = len(rd_u) + len(rd_v) + buffered_reads += total_reads + if add_to_buffer and buffered_reads > 50000: + add_to_buffer = 0 + out_counts[u] += len(rd_u) + out_counts[v] += len(rd_v) + if len(rd_u) == 0 or len(rd_v) == 0: + continue + if u in seen: + seen.remove(u) + if v in seen: + seen.remove(v) + + events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, + assemble_contigs, + sites_info, paired_end) + + # finds reads that should be a single partition + # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) + # if len(u_reads) > 0 and len(v_reads) > 0: + # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, + # sites_info, paired_end) + # if u_single: + # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res + # if v_single: + # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res # Process any singles / unconnected blocks if seen: @@ -1737,33 +1742,33 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - for k, d in data.s_within: #.items(): - o_count = out_counts[k] - i_counts = len(d) - if i_counts > max_single_size: - continue - if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: - rds = get_reads(bam, d, data.reads, data.n2n, 0, info) - if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): - continue - res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + if data.s_within: + for k, d in data.s_within: #.items(): + o_count = out_counts[k] + i_counts = len(d) + if i_counts > max_single_size: + continue + if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + continue + res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + sites_info, paired_end, length_extend, divergence) + if res: + if isinstance(res, EventResult): + events.append(res) + else: + events += res return events cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_stdev, insert_ppf, min_support, lower_bound_support, assemble_contigs, max_single_size, sites_index, bint paired_end, int length_extend, float divergence): - n_parts = len(data.parts) + n_parts = len(data.parts) if data.parts else 0 events = [] - if data.info: - info = data.info - else: - info = None + info = data.info + if data.reads is None: + data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge cdef EventResult_t e if n_parts >= 1: @@ -1773,10 +1778,7 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std if 
len(data.n2n) > max_single_size: return [] rds = get_reads(bam, data.n2n.keys(), data.reads, data.n2n, 0, info) - if info: - sites_info = list(info.values()) - else: - sites_info = [] + sites_info = list(info.values()) if info else [] if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): return [] ev = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, sites_info, paired_end, length_extend, divergence) diff --git a/dysgu/filter_normals.py b/dysgu/filter_normals.py index bafbf30..26f4ca6 100644 --- a/dysgu/filter_normals.py +++ b/dysgu/filter_normals.py @@ -9,6 +9,8 @@ import os from dysgu.map_set_utils import Py_BasicIntervalTree, is_overlapping from dysgu.graph import AlignmentsSA +from dysgu.extra_metrics import gap_size_upper_bound +from dysgu.call_component import n_aligned_bases import logging from collections import defaultdict from enum import Enum @@ -118,6 +120,7 @@ def output_vcf(args, template_vcf): input_command = ' '.join(argv) hdr.add_line('##FILTER=') hdr.add_line('##FILTER=') + hdr.add_line('##FILTER=') hdr.add_line(f'##command="{input_command}"') return pysam.VariantFile("-" if not args['svs_out'] else args['svs_out'], "w", header=hdr) @@ -395,23 +398,37 @@ def matching_supplementary(aln, infile, posA, posB): return False +def within_read_end_position(event_pos, svtype, cigartuples, cigar_index): + if svtype == 1: # insertion (or other e.g. duplication/inversion within read) + return event_pos + 1 + else: # deletion type + end = event_pos + cigartuples[cigar_index][1] + return end + def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, span_threshold=0.1): ct = r.cigartuples pos = r.pos + pos_original = None min_length = svlen * 0.2 # 0.25 max_distance = svlen * 10 ins_like = svtype == "DUP" or svtype == "INV" debug = False + # debug = True + cigar_index = 0 for k, l in ct: if k == CHARD_CLIP or k == CSOFT_CLIP: + cigar_index += 1 continue if k == CMATCH or k == CDIFF: pos += l + cigar_index += 1 continue if is_insertion and k == CDEL: pos += l + cigar_index += 1 continue elif not is_insertion and k == CINS: + span_similarity = min(l, svlen) / max(l, svlen) if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen): if debug: echo("ret0", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l)) @@ -419,7 +436,24 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)): if debug: echo("ret1") return True + + # if l > 100: + # pos_original = pos + # l, pos, _ = gap_size_upper_bound(r, cigar_index, pos, pos+1) + # span_similarity = min(l, svlen) / max(l, svlen) + # if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen): + # if debug: echo("ret0b", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l)) + # return True + # elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)): + # if debug: echo("ret1b") + # return True + # if pos_original is not None: + # pos = pos_original + # pos_original = None + + cigar_index += 1 continue + end = pos + l if l > min_length and abs(posA - pos) < max_distance: if is_insertion: @@ -429,10 +463,23 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, elif span_position_distance(posA, posB, pos, end, pos_threshold, span_threshold): if debug: echo("ret3") return True + + if l > 100 and ((k == 
CINS and is_insertion) or (k == CDEL and svtype == "DEL")): + this_l, this_pos, this_end = gap_size_upper_bound(r, cigar_index, pos, pos + 1) + if this_l > min_length and abs(posA - this_pos) < max_distance: + if is_insertion: + if span_position_distance(posA, posA + svlen, this_pos, this_end, pos_threshold, span_threshold): + if debug: echo("ret2b", f"{posA}-{posA + svlen}", f"{this_pos}-{this_end}") + return True + elif span_position_distance(posA, posB, this_pos, this_end, pos_threshold, span_threshold): + if debug: echo("ret3b") + return True + if k == CDEL: pos += l if pos > posB + svlen: break + cigar_index += 1 return False @@ -542,7 +589,7 @@ def cache_nearby_soft_clip(posA, posB, align, join_type, svtype, cached, distanc def any_nearby_soft_clip(posA, posB, align, join_type, svtype, distance=30, clip_length=3): ct = align.cigartuples - if svtype == "BND" or svtype == "INV" or svtype == "DUP": # respect join orientation + if svtype == "BND" or svtype == "DUP": # respect join orientation if join_type[0] == "3": if clip_position_matches(ct[-1], clip_length, posA, align.reference_end, distance): return True @@ -558,7 +605,7 @@ def any_nearby_soft_clip(posA, posB, align, join_type, svtype, distance=30, clip return True if clip_position_matches(ct[0], clip_length, posB, align.pos, distance): return True - elif svtype == "INS" or svtype == "TRA": # check all clips + elif svtype == "INS" or svtype == "TRA" or svtype == "INV": # check all clips if clip_position_matches(ct[-1], clip_length, posA, align.reference_end, distance): return True if clip_position_matches(ct[-1], clip_length, posB, align.reference_end, distance): @@ -680,15 +727,32 @@ def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): def too_many_clipped_reads(r, clipped_reads, support_fraction): sf = support_fraction / 2 - cov = r.samples[r.samples.keys()[0]]['COV'] - max_nearby_clipped_reads = round(3 + sf * cov) - if clipped_reads > max_nearby_clipped_reads: + d = r.samples[r.samples.keys()[0]] + # cov = r.samples[r.samples.keys()[0]]['COV'] + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + else: + cov = d['COV'] * (min(d['ICN'], d['OCN']) / m) + # max_nearby_clipped_reads = round(3 + sf * cov) + max_nearby_clipped_reads = round(1 + sf * cov) + # echo("max", clipped_reads, max_nearby_clipped_reads, support_fraction) + if clipped_reads >= max_nearby_clipped_reads: return True return False -def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, keep_all, sample, - support_fraction): +def poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, fmt, max_divergence=0.08): + # Normal-reads error + if sum_edit_distance/n_aligns > max_divergence or sum_n_small_gap/n_aligns > max_divergence / 2: + return True + # Variant error rate > 4 times the normal-reads error rate + NMbase = (NMbase / n_aligns) * 100 + if "NMB" in fmt and fmt["NMB"] > 4 * NMbase: + return True + +def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, sample, + support_fraction, max_divergence): ct = r.info["CT"] chromA = r.chrom posA = r.pos @@ -701,47 +765,54 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, has_contig = 'CONTIG' in r.info or 'CONTIG2' in r.info or r.alts[0][0] != "<" is_paired_end = False good_step = good_step_translocation(r, sample) - debug = False #True + sum_edit_distance = 0 + sum_n_small_gap = 0 + n_aligns = 0 + NMbase = 0 + # debug = True + + debug = False + # debug = True for is_paired_end, aln in 
iterate_bams_single_region(bams, chromA, posA, pad, bam_is_paired_end): + if is_paired_end: distance = 50 clip_length = 3 if aln.flag & 12 or (aln.flag & 2 and not has_clip(aln)): continue if chromB == infile.get_reference_name(aln.rnext) and is_overlapping(next_start, next_end, aln.pnext - pad, aln.pnext + pad): - if keep_all: - r.filter.add("normal") - return False + return "normal" if matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) else: + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases distance = 500 clip_length = 50 if aln.flag & 4 or not has_clip(aln): continue if matching_supplementary(aln, infile, posA, posB): if debug: echo("tra0") - if keep_all: - r.filter.add("normal") - return False + return "normal" if not good_step and matching_ins_translocation(posA, aln): if debug: echo("tra1") - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) if any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): if debug: echo("tra2") - if keep_all: - r.filter.add("lowSupport") - return False + return "lowSupport" for is_paired_end, aln in iterate_bams_single_region(bams, chromB, posB, pad, bam_is_paired_end): if is_paired_end: @@ -750,56 +821,55 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if aln.flag & 12 or (aln.flag & 2 and not has_clip(aln)): continue if r.chrom == infile.get_reference_name(aln.rnext) and is_overlapping(current_start, current_end, aln.pnext - pad, aln.pnext + pad): - if keep_all: - r.filter.add("normal") - return False + return "normal" if matching_supplementary(aln, infile, posB, posA): - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", cached, distance, clip_length) else: + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases distance = 500 clip_length = 50 if aln.flag & 4 or not has_clip(aln): continue if matching_supplementary(aln, infile, posB, posA): if debug: echo("tra3") - if keep_all: - r.filter.add("normal") - return False + return "normal" if not good_step and matching_ins_translocation(posB, aln): if debug: echo("tra4") - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) if any_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): if debug: echo("tra5") - if keep_all: - r.filter.add("lowSupport") - return False + return "lowSupport" if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): if debug: echo("tra6") - if keep_all: - 
r.filter.add("lowSupport") - return False + return "lowSupport" if cached and matching_soft_clips(r, cached, is_paired_end): if debug: echo("tra7") - if keep_all: - r.filter.add("normal") - return False - return True + return "normal" + + if not is_paired_end and poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, r.samples[sample], max_divergence): + return "highDivergence" + return None -def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad, sample, keep_all): +def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad, sample, max_divergence): svlen = r.info["SVLEN"] svtype = r.info["SVTYPE"] + any_contigs = ("CONTIGA" in r.info and r.info["CONTIGA"]) or ("CONTIGB" in r.info and r.info["CONTIGB"]) is_insertion = svtype == "INS" posA = r.pos ct = r.info["CT"] @@ -809,9 +879,24 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa is_paired_end = False n_mapq_0 = 0 n_overlpapping = 0 - # debug = False if r.rid != "229738" else True - debug = False #False + n_aligns = 0 + sum_edit_distance = 0 + sum_n_small_gap = 0 + NMbase = 0 + + debug = False + # debug = True for is_paired_end, aln in iterate_bams(bams, r.chrom, posA, r.chrom, posB, pad, bam_is_paired_end): + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases + if is_paired_end: a_posA = min(aln.pos, aln.pnext) a_posB = max(aln.pos, aln.pnext) @@ -820,9 +905,7 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa if aln.rname != aln.rnext: if matching_supplementary(aln, infile, posA, posB): if debug: echo("1") - if keep_all: - r.filter.add("normal") - return False + return "normal" cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached) continue if abs(a_posA - posA) > pad or abs(a_posB - posB) > pad: @@ -830,32 +913,22 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa if not is_insertion and not aln.flag & 10: # proper pair, mate unmapped if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.7): if debug: echo("2") - if keep_all: - r.filter.add("normal") - return False + return "normal" if not is_insertion and matching_supplementary(aln, infile, posA, posB): if debug: echo("3") - if keep_all: - r.filter.add("normal") - return False + return "normal" if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen): if debug: echo("4") - if keep_all: - r.filter.add("normal") - return False + return "normal" cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=50, clip_length=3) else: if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.7): if debug: echo("5", svlen, aln.qname) - if keep_all: - r.filter.add("normal") - return False + return "normal" if matching_supplementary(aln, infile, posA, posB): if debug: echo("6") - if keep_all: - r.filter.add("normal") - return False + return "normal" if is_overlapping(posA, posA + 1, aln.pos, aln.reference_end): n_overlpapping += 1 if aln.mapq == 0: @@ -865,29 +938,29 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa covered += 1 if any_nearby_soft_clip(posA, posB, aln, ct, svtype, 30, clip_length=50): nearby_soft_clips += 1 - # cache_nearby_soft_clip(posA, posB, 
aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) + cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) if not is_paired_end and not covered: if debug: echo("7") - if keep_all: - r.filter.add("lowSupport") - return False + return "lowSupport" if not is_paired_end and has_low_WR_support(r, sample, support_fraction, n_overlpapping, n_mapq_0): if debug: echo("8") - if keep_all: - r.filter.add("lowSupport") - return False - if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): + return "lowSupport" + # echo(svtype) + # if svtype == "INV": + # support_fraction = support_fraction / 20 + # echo(support_fraction) + if (not is_paired_end or (svtype in "INVTRA" and not any_contigs)) and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): + # if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): if debug: echo("9") - if keep_all: - r.filter.add("lowSupport") - return False + return "lowSupport" if cached and matching_soft_clips(r, cached, is_paired_end): if debug: echo("10") - if keep_all: - r.filter.add("normal") - return False - return True + return "normal" + if not is_paired_end and poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, r.samples[sample], max_divergence): + return "highDivergence" + + return None def update_filter_value(r, sample_name, old_filter_values, pass_prob, new_value=""): @@ -918,11 +991,14 @@ def run_filtering(args): min_prob = args['min_prob'] pass_prob = args['pass_prob'] support_fraction = args['support_fraction'] + max_divergence = args['max_divergence'] filter_results = defaultdict(int) written = 0 for idx, r in enumerate(vcf): - # if r.id != "165324": + # if r.id != "187669": # continue + # echo(dict(r.info)) + # echo(dict(r.samples[sample_name])) old_filter_value = list(r.filter.keys()) r.filter.clear() if min_prob != 0 and 'PROB' in r.samples[sample_name] and r.samples[sample_name]['PROB'] < min_prob: @@ -947,7 +1023,7 @@ def run_filtering(args): if chrom2_tid in intervals: posB_overlaps = set(intervals[chrom2_tid].allOverlappingIntervals(posB, posB + 1)) if posA_overlaps.intersection(posB_overlaps): - filter_results['dropped, normal SV overlap'] += 1 + filter_results['dropped, normal'] += 1 if keep_all: update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") out_vcf.write(r) @@ -959,18 +1035,19 @@ def run_filtering(args): continue sv_type = get_sv_type(r, chrom_tid, chrom2_tid) if sv_type == "TRA" or sv_type == "BND": - good = process_translocation(r, r.info["CHR2"], posB, bams, infile, bam_is_paired_end, pad=pad, keep_all=keep_all, sample=sample_name, support_fraction=support_fraction) + reason = process_translocation(r, r.info["CHR2"], posB, bams, infile, bam_is_paired_end, pad=pad, sample=sample_name, support_fraction=support_fraction, max_divergence=max_divergence) else: - good = process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad=pad, sample=sample_name, keep_all=keep_all) - if good: + reason = process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad=pad, sample=sample_name, max_divergence=max_divergence) + if reason is None: update_filter_value(r, sample_name, old_filter_value, pass_prob) out_vcf.write(r) written += 1 else: if keep_all: + r.filter.add(reason) update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") out_vcf.write(r) - filter_results['dropped, normal read 
support'] += 1 + filter_results[f'dropped, {reason}'] += 1 out_vcf.close() logging.info(f'Filter results: {dict(sorted(filter_results.items()))}') logging.info("dysgu filter {} complete, n={}, h:m:s, {}".format(args['input_vcf'], diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 74d3f5a..a4674af 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -1358,9 +1358,9 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): cdef tuple t cdef unsigned long[:] p if len(parts) == 0: - return {}, {} + return None, None elif len(parts) == 1: - return {}, {0: parts[0]} + return None, {0: parts[0]} cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap() for i, p in enumerate(parts): for node in p: @@ -1390,7 +1390,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): if t in seen_t: continue if t not in counts: - counts[t] = [set([]), set([])] + counts[t] = (set([]), set([])) if j < i: counts[t][0].add(child) counts[t][1].add(node) @@ -1406,7 +1406,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): self_counts[i].append(node) seen_t.update(current_t) # Only count edge once for t in current_t: # save memory by converting support_between to array - counts[t] = [np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]] + counts[t] = tuple(np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]) return counts, self_counts @@ -1519,18 +1519,19 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap # Explore component for locally interacting nodes; create partitions using these partitions = get_partitions(G, component) support_between, support_within = count_support_between(G, partitions) - if len(support_between) == 0 and len(support_within) == 0: + + if not support_between and not support_within: if not paired_end: if len(n2n) >= min_support or len(reads) >= min_support or info: - return GraphComponent((), (), {}, (), n2n, info) + return GraphComponent(None, None, None, None, n2n, info) else: return else: # single paired end template can have 3 nodes e.g. 
two reads plus supplementary if min_support == 1 and (len(n2n) >= min_support or len(reads) >= min_support): - return GraphComponent((), (), {}, (), n2n, info) + return GraphComponent(None, None, None, None, n2n, info) elif len(reads) >= min_support or info: - return GraphComponent((), (), {}, (), n2n, info) + return GraphComponent(None, None, None, None, n2n, info) else: return # Debug: @@ -1552,4 +1553,11 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap # echo("s_between", sb) # echo("s_within", support_within) - return GraphComponent(partitions, tuple(support_between.items()), reads, tuple(support_within.items()), n2n, info) + support_between = tuple(support_between.items()) if support_between else None + support_within = tuple(support_within.items()) if support_within else None + return GraphComponent(partitions, + support_between, + reads, + support_within, + n2n, + info) diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index baa3f7b..d486bfa 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -129,7 +129,7 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "su", "spanning", "pe", "supp", "sc", "bnd", + ["GT", "GQ", "MAPQpri", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] @@ -264,7 +264,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:SU:WR:PE:SR:SC:BND:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:MAPQP:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" @@ -309,7 +309,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, small_output): if small_output: - v = [r["GT"], r["GQ"], r['MAPQpri'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], + v = [r["GT"], r["GQ"], r['MAPQpri'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'], r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"], r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob'] ] diff --git a/dysgu/main.py b/dysgu/main.py index dd22cba..0a54d0c 100644 --- a/dysgu/main.py +++ b/dysgu/main.py @@ -430,6 +430,7 @@ def call_events(ctx, **kwargs): @click.option("-d", "--wd", help="Working directory to use/create when merging", type=click.Path(exists=False), required=False) @click.option("-c", "--clean", help="Remove working directory when finished", is_flag=True, flag_value=True, show_default=False, default=False) @click.option("--progress", help="Prints detailed progress information", is_flag=True, flag_value=True, show_default=False, default=False) +@click.option("--update-cohort", help="Updated this cohort file with new calls from input_files", required=False, type=click.Path(exists=True)) @click.option("--collapse-nearby", help="Merges more 
aggressively by collapsing nearby SV", default="True", type=click.Choice(["True", "False"]), show_default=True) @click.option("--merge-across", help="Merge records across input samples", default="True", @@ -470,8 +471,9 @@ def view_data(ctx, **kwargs): @click.option("--target-sample", help="If input_vcf if multi-sample, use target-sample as input", required=False, type=str, default="") @click.option("--keep-all", help="All SVs classified as normal will be kept in the output, labelled as filter=normal", is_flag=True, flag_value=True, show_default=False, default=False) @click.option("--ignore-read-groups", help="Ignore ReadGroup RG tags when parsing sample names. Filenames will be used instead", is_flag=True, flag_value=True, show_default=False, default=False) -@click.option("--min-prob", help="Remove SVs with PROB value < min-prob", default=0.1, type=float, show_default=True) -@click.option("--pass-prob", help="Re-label SVs as PASS if PROB value >= pass-prob", default=1.0, type=float, show_default=True) +@click.option("--max-divergence", help="Remove SV if normal_bam displays divergence > max-divergence at same location", default=0.08, type=float, show_default=True) +@click.option("--min-prob", help="Remove SV with PROB value < min-prob", default=0.1, type=float, show_default=True) +@click.option("--pass-prob", help="Re-label SV as PASS if PROB value >= pass-prob", default=1.0, type=float, show_default=True) @click.option("--interval-size", help="Interval size for searching normal-vcf/normal-bams", default=1000, type=int, show_default=True) @click.option("--random-bam-sample", help="Choose N random normal-bams to search. Use -1 to ignore", default=-1, type=int, show_default=True) @click.pass_context diff --git a/dysgu/sites_utils.py b/dysgu/sites_utils.py index 222e5c0..223c183 100644 --- a/dysgu/sites_utils.py +++ b/dysgu/sites_utils.py @@ -65,7 +65,7 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro continue if ignore_sample and sample_name in r.samples and "GT" in r.samples[sample_name]: this_gt = r.samples[sample_name]['GT'] - if not (this_gt == "0/0" or this_gt == "0" or this_gt == "." or this_gt == "./." or this_gt == "0|0"): + if not (this_gt == (0, 0) or this_gt == "0/0" or this_gt == "0" or this_gt == "." or this_gt == "./." or this_gt == "0|0"): continue if "CHROM2" in r.info: chrom2_info = r.info["CHROM2"] @@ -149,7 +149,6 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro raise ValueError("PROB or MeanPROB must be in a float in range 0-1, error at CHROM {}, POS {}".format(chrom, start)) recs[chrom].append(Site(chrom, start, chrom2, stop, svt, idx, r.id, svlen, pval)) - if not_parsed: logging.warning("Some records had incompatible SVTYPE: {}".format( str({k: v for k, v in Counter(not_parsed).items()}))) diff --git a/dysgu/view.py b/dysgu/view.py index b2db62b..1b17bc4 100644 --- a/dysgu/view.py +++ b/dysgu/view.py @@ -631,6 +631,54 @@ def shard_data(args, input_files, Global, show_progress): os.remove(item) +def update_cohort_only(args): + + cohort = pysam.VariantFile(args["update_cohort"]) + cohort_samples = set(cohort.header.samples) + # contigs = list(cohort.header.contigs) + samples = {} + # iters = {} + #current = {} + samps_counts = defaultdict(int) + for pth in args["input_files"]: + v = pysam.VariantFile(pth) + if v.index is None: + raise ValueError(f"Input files must be indexed. 
{pth} has not index") + samps = list(v.header.samples) + if len(samps) > 1: + raise ValueError(f"Only one sample supported per input file but {pth} has {list(samps)}") + samp = samps[0] + samps_counts[samp] += 1 + if samp in samples: + logging.warning(f"{samp} already in samples list, {samp} is assumed to be {samp}_{samps_counts[samp]} in cohort file") + samp = f"{samp}_{samps_counts[samp]}" + if samp not in cohort_samples: + raise ValueError(f"Input file has sample name {samp}, but this was not found in the cohort file {args['update_cohort']}") + samples[samp] = v + + for r in cohort.fetch(): + ch_svtype = r.info["SVTYPE"] + for k in samples: + ch_fmt = r.samples[k] + sample_items = [i for i in list(samples[k].fetch(r.chrom, r.pos, r.pos + 10)) if (ch_svtype == i.info["SVTYPE"] and abs(i.pos - r.pos) < 10)] + if not sample_items: + continue + index = 0 + if len(sample_items) > 1: + # choose row with any genotype in sample, or last row + for index, candidate in enumerate(sample_items): + candidate_gt = candidate.samples[k]["GT"] + if candidate_gt != (0, 0): + break + item = sample_items[index] + if item.samples[k]["GT"] == (0, 0): + continue + echo("samples items", len(sample_items)) + if ch_fmt["PROB"] != item.samples[k]["PROB"]: + echo(k, r.chrom, r.pos, item.chrom, item.pos, dict(ch_fmt)["PROB"], dict(item.samples[k])["PROB"]) + echo("---") + + def view_file(args): t0 = time.time() @@ -645,22 +693,29 @@ def view_file(args): if not all(os.path.splitext(i)[1] == ".csv" for i in args["input_files"]): raise ValueError("All input files must have .csv extension") - args["metrics"] = False # only option supported so far - args["contigs"] = False + if args["update_cohort"]: + for opt, v in (("out_format", "vcf"), ("merge_within", "False"), ("merge_across", "True"), ("add_kind", "False"), ("separate", "False")): + if args[opt] != v: + raise ValueError(f"{opt}={v} fot supported with --update-cohort") + update_cohort_only(args) - soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) - manager = multiprocessing.Manager() - Global = manager.Namespace() - Global.open_files = soft # we dont know how many file descriptors are already open by the user, assume all - Global.soft = soft - - if not args["wd"]: - seen_names, names_list = get_names_list(args["input_files"]) - process_file_list(args, args["input_files"], seen_names, names_list, log_messages=True) else: - shard_data(args, args["input_files"], Global=Global, show_progress=args['progress']) - if args['clean']: - os.rmdir(args['wd']) + args["metrics"] = False # only option supported so far + args["contigs"] = False + + soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) + manager = multiprocessing.Manager() + Global = manager.Namespace() + Global.open_files = soft # we dont know how many file descriptors are already open by the user, assume all + Global.soft = soft + + if not args["wd"]: + seen_names, names_list = get_names_list(args["input_files"]) + process_file_list(args, args["input_files"], seen_names, names_list, log_messages=True) + else: + shard_data(args, args["input_files"], Global=Global, show_progress=args['progress']) + if args['clean']: + os.rmdir(args['wd']) logging.info("dysgu merge complete h:m:s, {}".format(str(datetime.timedelta(seconds=int(time.time() - t0))), time.time() - t0)) diff --git a/setup.py b/setup.py index 22940f5..484e2a2 100644 --- a/setup.py +++ b/setup.py @@ -144,7 +144,7 @@ def get_extra_args(): # Dysgu modules for item in ["sv2bam", "io_funcs", "graph", "coverage", "assembler", "call_component", - 
"map_set_utils", "cluster", "sv_category", "extra_metrics"]: # "post_call_metrics", + "map_set_utils", "cluster", "sv_category", "extra_metrics"]: ext_modules.append(Extension(f"dysgu.{item}", [f"dysgu/{item}.pyx"], @@ -166,7 +166,7 @@ def get_extra_args(): description="Structural variant calling", license="MIT", version='1.6.3', - python_requires='>=3.8', + python_requires='>=3.10', install_requires=[ # runtime requires 'setuptools>=63.0', 'cython', @@ -179,7 +179,6 @@ def get_extra_args(): 'scikit-learn>=0.22', 'sortedcontainers', 'lightgbm', - #'edlib', ], setup_requires=[ 'setuptools>=63.0', @@ -193,7 +192,6 @@ def get_extra_args(): 'scikit-learn>=0.22', 'sortedcontainers', 'lightgbm', - #'edlib' ], packages=["dysgu", "dysgu.tests", "dysgu.scikitbio", "dysgu.edlib"], ext_modules=cythonize(ext_modules),