Skip to content

Commit

Permalink
Code improvements for call_component, graph. Better filtering of soma…
Browse files Browse the repository at this point in the history
…tic svs. Slightly improved runtime and memory.
  • Loading branch information
kcleal committed Mar 14, 2024
1 parent 16ca7f2 commit 300c743
Show file tree
Hide file tree
Showing 10 changed files with 584 additions and 290 deletions.
233 changes: 143 additions & 90 deletions dysgu/call_component.pyx

Large diffs are not rendered by default.

32 changes: 29 additions & 3 deletions dysgu/cluster.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,14 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
event_iter = compare_subset(potential, max_dist) # Use iitree, generate overlap tree and perform intersections
seen = set([])
pad = 100
# max_edges = 8
disjoint_nodes = set([]) # if a component has more than one disjoint nodes it needs to be broken apart
# edge_counts = defaultdict(int)
for ei, ej, idx, jdx in event_iter:
i_id = ei.event_id
j_id = ej.event_id
# if edge_counts[i_id] > max_edges and edge_counts[j_id] > max_edges:
# continue
if not same_sample:
if ei.sample == ej.sample:
continue
Expand Down Expand Up @@ -183,6 +187,8 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
# Force merging of translocations that have similar loci
if not intra:
G.add_edge(i_id, j_id, loci_same=loci_same) #, w=0)
# edge_counts[i_id] += 1
# edge_counts[j_id] += 1
continue

one_is_imprecise = False
Expand Down Expand Up @@ -265,6 +271,8 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
if ml > 0:
if l_ratio > 0.5 or (one_is_imprecise and l_ratio > 0.3):
G.add_edge(i_id, j_id, loci_same=loci_same)
# edge_counts[i_id] += 1
# edge_counts[j_id] += 1
else:
v = None
if ci_alt and cj_alt:
Expand All @@ -285,26 +293,42 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
if ei.remap_score == 0 or ej.remap_score == 0:
if (v[0][0].islower() and v[1][-1].islower()) or (v[0][-1].islower() and v[1][0].islower()):
G.add_edge(i_id, j_id, loci_same=loci_same)
# edge_counts[i_id] += 1; edge_counts[j_id] += 1
continue
if assembler.check_contig_match(v[0], v[1], return_int=True):
G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1; edge_counts[j_id] += 1
# see if alt sequence can be found in other contig
else:
# echo("checking alts")
# if ci_alt and cj_alt:
# echo("yes0", ci_alt, cj_alt)
# if assembler.check_contig_match(ci_alt, cj_alt, return_int=True):
# G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1; edge_counts[j_id] += 1
# continue
if ci_alt and cj:
if assembler.check_contig_match(ci_alt, cj, return_int=True):
G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1; edge_counts[j_id] += 1
continue
if ci_alt and cj2:
if assembler.check_contig_match(ci_alt, cj2, return_int=True):
G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1;
# edge_counts[j_id] += 1
continue
if cj_alt and ci:
if assembler.check_contig_match(cj_alt, ci, return_int=True):
G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1;
# edge_counts[j_id] += 1
continue
if cj_alt and ci2:
if assembler.check_contig_match(cj_alt, ci2, return_int=True):
G.add_edge(i_id, j_id, loci_same=True)
# edge_counts[i_id] += 1;
# edge_counts[j_id] += 1
continue
return G, disjoint_nodes

Expand Down Expand Up @@ -811,7 +835,9 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
trust_ins_len=args["trust_ins_len"] == "True",
low_mem=low_mem,
temp_dir=tdir,
find_n_aligned_bases=find_n_aligned_bases)
find_n_aligned_bases=find_n_aligned_bases,
position_distance_thresh=args['sd'],
)
sites_index = None
if sites_adder:
sites_index = sites_adder.sites_index
Expand Down Expand Up @@ -937,7 +963,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
pickle.dump(res, completed_file)
else:
j_submitted, w_idx = heapq.heappop(minhq)
heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx))
heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx))
msg_queues[w_idx][1].send(res)
else:
# most partitions processed here, dict returned, or None
Expand Down Expand Up @@ -969,7 +995,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
pickle.dump(res, completed_file)
else:
j_submitted, w_idx = heapq.heappop(minhq)
heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx))
heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx))
msg_queues[w_idx][1].send(res)

if completed_file is not None:
Expand Down
3 changes: 3 additions & 0 deletions dysgu/edlib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

from .edlib import (align)
__all__ = ['align']
53 changes: 47 additions & 6 deletions dysgu/filter_normals.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import random
from sys import stderr, argv

import numpy as np
import pysam
from pysam import CSOFT_CLIP, CHARD_CLIP, CDEL, CINS, CDIFF, CMATCH
import time
Expand All @@ -14,6 +16,7 @@
import zlib
from dysgu.edlib import edlib
import glob
from dysgu.map_set_utils import echo


random.seed(42)
Expand Down Expand Up @@ -398,6 +401,7 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100,
min_length = svlen * 0.2 # 0.25
max_distance = svlen * 10
ins_like = svtype == "DUP" or svtype == "INV"
debug = False
for k, l in ct:
if k == CHARD_CLIP or k == CSOFT_CLIP:
continue
Expand All @@ -408,17 +412,22 @@ def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100,
pos += l
continue
elif not is_insertion and k == CINS:
if l > 250 and ins_like:
span_similarity = min(l, svlen) / max(l, svlen)
if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen):
if debug: echo("ret0", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l))
return True
elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)):
if debug: echo("ret1")
return True
continue
end = pos + l
if l > min_length and abs(posA - pos) < max_distance:
if is_insertion:
if span_position_distance(posA, posA + svlen, pos, end, pos_threshold, span_threshold):
if debug: echo("ret2", f"{posA}-{posA + svlen}", f"{pos}-{end}")
return True
elif span_position_distance(posA, posB, pos, end, pos_threshold, span_threshold):
if debug: echo("ret3")
return True
if k == CDEL:
pos += l
Expand Down Expand Up @@ -640,7 +649,12 @@ def matching_soft_clips(r, reads_with_nearby_soft_clips, pe):


def has_low_support(r, sample, support_fraction):
cov = r.samples[sample]['COV']
d = r.samples[sample]
m = max(d['ICN'], d['OCN'])
if m == 0:
cov = d['COV']
else:
cov = d['COV'] * (min(d['ICN'], d['OCN']) / m)
min_support = round(1.5 + support_fraction * cov)
if r.info['SU'] < min_support:
return True
Expand All @@ -649,7 +663,13 @@ def has_low_support(r, sample, support_fraction):

def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0):
sf = support_fraction / 2
cov = r.samples[sample]['COV']
# cov = r.samples[sample]['COV']
d = r.samples[sample]
m = max(d['ICN'], d['OCN'])
if m == 0:
cov = d['COV']
else:
cov = d['COV'] * (min(d['ICN'], d['OCN']) / m)
if n_mapq0 / n_overlapping > 0.3:
cov = max(cov, n_overlapping)
min_support = min(4, round(1.5 + sf * cov))
Expand Down Expand Up @@ -681,6 +701,7 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad,
has_contig = 'CONTIG' in r.info or 'CONTIG2' in r.info or r.alts[0][0] != "<"
is_paired_end = False
good_step = good_step_translocation(r, sample)
debug = False #True
for is_paired_end, aln in iterate_bams_single_region(bams, chromA, posA, pad, bam_is_paired_end):
if is_paired_end:
distance = 50
Expand All @@ -703,10 +724,12 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad,
if aln.flag & 4 or not has_clip(aln):
continue
if matching_supplementary(aln, infile, posA, posB):
if debug: echo("tra0")
if keep_all:
r.filter.add("normal")
return False
if not good_step and matching_ins_translocation(posA, aln):
if debug: echo("tra1")
if keep_all:
r.filter.add("normal")
return False
Expand All @@ -715,6 +738,7 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad,
if any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=50):
nearby_soft_clips += 1
elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250):
if debug: echo("tra2")
if keep_all:
r.filter.add("lowSupport")
return False
Expand All @@ -741,10 +765,12 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad,
if aln.flag & 4 or not has_clip(aln):
continue
if matching_supplementary(aln, infile, posB, posA):
if debug: echo("tra3")
if keep_all:
r.filter.add("normal")
return False
if not good_step and matching_ins_translocation(posB, aln):
if debug: echo("tra4")
if keep_all:
r.filter.add("normal")
return False
Expand All @@ -753,15 +779,18 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad,
if any_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", 30, clip_length=50):
nearby_soft_clips += 1
elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250):
if debug: echo("tra5")
if keep_all:
r.filter.add("lowSupport")
return False

if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction):
if debug: echo("tra6")
if keep_all:
r.filter.add("lowSupport")
return False
if cached and matching_soft_clips(r, cached, is_paired_end):
if debug: echo("tra7")
if keep_all:
r.filter.add("normal")
return False
Expand All @@ -780,6 +809,8 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa
is_paired_end = False
n_mapq_0 = 0
n_overlpapping = 0
# debug = False if r.rid != "229738" else True
debug = False #False
for is_paired_end, aln in iterate_bams(bams, r.chrom, posA, r.chrom, posB, pad, bam_is_paired_end):
if is_paired_end:
a_posA = min(aln.pos, aln.pnext)
Expand All @@ -788,6 +819,7 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa
continue
if aln.rname != aln.rnext:
if matching_supplementary(aln, infile, posA, posB):
if debug: echo("1")
if keep_all:
r.filter.add("normal")
return False
Expand All @@ -796,26 +828,31 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa
if abs(a_posA - posA) > pad or abs(a_posB - posB) > pad:
continue
if not is_insertion and not aln.flag & 10: # proper pair, mate unmapped
if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.5):
if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.7):
if debug: echo("2")
if keep_all:
r.filter.add("normal")
return False
if not is_insertion and matching_supplementary(aln, infile, posA, posB):
if debug: echo("3")
if keep_all:
r.filter.add("normal")
return False
if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen):
if debug: echo("4")
if keep_all:
r.filter.add("normal")
return False
cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=50, clip_length=3)

else:
if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.7):
if debug: echo("5", svlen, aln.qname)
if keep_all:
r.filter.add("normal")
return False
if matching_supplementary(aln, infile, posA, posB):
if debug: echo("6")
if keep_all:
r.filter.add("normal")
return False
Expand All @@ -828,21 +865,25 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa
covered += 1
if any_nearby_soft_clip(posA, posB, aln, ct, svtype, 30, clip_length=50):
nearby_soft_clips += 1
cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50)
# cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50)

if not is_paired_end and not covered:
if debug: echo("7")
if keep_all:
r.filter.add("lowSupport")
return False
if not is_paired_end and has_low_WR_support(r, sample, support_fraction, n_overlpapping, n_mapq_0):
if debug: echo("8")
if keep_all:
r.filter.add("lowSupport")
return False
if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction):
if debug: echo("9")
if keep_all:
r.filter.add("lowSupport")
return False
if cached and matching_soft_clips(r, cached, is_paired_end):
if debug: echo("10")
if keep_all:
r.filter.add("normal")
return False
Expand Down Expand Up @@ -880,7 +921,7 @@ def run_filtering(args):
filter_results = defaultdict(int)
written = 0
for idx, r in enumerate(vcf):
# if r.id != "25979":
# if r.id != "165324":
# continue
old_filter_value = list(r.filter.keys())
r.filter.clear()
Expand Down
Loading

0 comments on commit 300c743

Please sign in to comment.