Skip to content

Commit

Permalink
Better read merging for long reads. New clustering methods for long r…
Browse files Browse the repository at this point in the history
…eads. Improved runtime.
  • Loading branch information
kcleal committed Nov 15, 2024
1 parent 8081665 commit ef651f4
Show file tree
Hide file tree
Showing 18 changed files with 1,075 additions and 551 deletions.
268 changes: 182 additions & 86 deletions dysgu/call_component.pyx

Large diffs are not rendered by default.

25 changes: 19 additions & 6 deletions dysgu/cluster.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# cython: language_level=3

from __future__ import absolute_import
import datetime
import time
Expand Down Expand Up @@ -184,15 +185,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
if args["mode"] == "pacbio-sequel2":
max_dist, max_clust_dist = 35, 500000
if args["merge_dist"] is None:
args["merge_dist"] = 700
args["merge_dist"] = 1000
elif args["mode"] == "pacbio-revio":
max_dist, max_clust_dist = 50, 500000
if args["merge_dist"] is None:
args["merge_dist"] = 700
args["merge_dist"] = 1000
elif args["mode"] == "nanopore-r9" or args["mode"] == "nanopore-r10":
max_dist, max_clust_dist = 100, 500000
if args["merge_dist"] is None:
args["merge_dist"] = 700
args["merge_dist"] = 1000

# set upper bound on single-partition size
max_single_size = min(max(args["max_cov"] * 50, 10000), 100000) # limited between 5000 - 50,000 reads
Expand Down Expand Up @@ -436,6 +437,10 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
os.remove(f"{tdir}/job_{p}.done.pkl")
if len(block_edge_events) == 0:
return [], None

# for item1 in block_edge_events:
# echo(item1.svtype, item1.su, item1.svlen)

logging.info("Number of components {}. N candidates {}".format(components_seen, len(block_edge_events)))
keeps = len([i for i in block_edge_events if i.site_info])
if keeps:
Expand All @@ -452,12 +457,17 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
# Merge across calls
if args["merge_within"] == "True":
merged = merge_svs.merge_events(block_edge_events, args["merge_dist"], regions, bool(paired_end), try_rev=False, pick_best=False,
debug=True, min_size=args["min_size"],
max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100)
debug=True, min_size=args["min_size"],
max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100,
procs=args['procs'])
else:
merged = block_edge_events
logging.info("Number of candidate SVs merged: {}".format(len(block_edge_events) - len(merged)))
logging.info("Number of candidate SVs after merge: {}".format(len(merged)))

# echo("no--->")
# for item1 in merged:
# echo(item1.svtype, item1.su, item1.svlen)
before = len(merged)

if auto_support:
Expand Down Expand Up @@ -490,12 +500,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True")
preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
preliminaries = post_call.strand_binom_t(preliminaries)
# preliminaries = consensus.contig_info(preliminaries) # GC info, repetitiveness

preliminaries = extra_metrics.find_repeat_expansions(preliminaries, insert_stdev)
preliminaries = post_call.compressability(preliminaries)
preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], True)

preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])

# preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)

preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)

Expand Down
180 changes: 147 additions & 33 deletions dysgu/consensus.pyx
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#distutils: language = c++
#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True

"""
A basic assembler/consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style.
A basic consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style.
"""

import warnings
import array
from re import match

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
Expand Down Expand Up @@ -140,7 +141,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T

cdef str seq
cdef tuple k
cdef bint done = 0

cdef cpp_vector[int] vv = [0, 0, 0, 0]

cdef int r_end = r.reference_end
Expand All @@ -164,8 +165,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
if opp == 4 and length > 250:
i = length - 250
length = 250
if done:
break

if opp == 4:
if start:
for o in range(length, 0, -1):
Expand Down Expand Up @@ -245,12 +245,11 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
if current_pos < approx_position and approx_position - current_pos > max_distance:
i += 1
continue
elif current_pos > approx_position and current_pos - approx_position > max_distance:
break
# elif current_pos > approx_position and current_pos - approx_position > max_distance:
# break
ref_bases += 1
if ref_bases > target_bases:
done = 1
break
return
qual = quals[i]
base = bam_seqi(char_ptr_rseq, i)
i += 1
Expand Down Expand Up @@ -556,6 +555,11 @@ cdef dict get_consensus(rd, int position, int max_distance):
else:
cigar = [tuple(ct) for ct in cigar]

# if len(seq) < len(rd[0].seq) and longest_left_sc > 10 and longest_right_sc > 10:
# return {}
# Not a good consensus, use first read instead
# return trim_sequence_from_cigar(rd[0], position, max_distance)

return {"contig": seq,
"left_clips": longest_left_sc,
"right_clips": longest_right_sc,
Expand All @@ -582,13 +586,17 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance):
cdef int pos = start_pos # current genome position
cdef int end_index = len(ct) - 1
cdef bint started = False
cdef int opp, length
cdef int opp, length, keep_start, keep_end

cdef int longest_left_sc = 0
cdef int longest_right_sc = 0
cdef int ref_bases = 0

parts = []
pos_index = -1
for opp, length in ct:
if ref_bases > 300 and pos > approx_pos + 150:
break

if opp == 4 and index == 0:
if abs(pos - approx_pos) < 50:
Expand Down Expand Up @@ -616,29 +624,30 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance):

elif opp == 0 or opp == 7 or opp == 8 or opp == 3:

if not started:
if abs(pos + length - approx_pos) < 300:
parts.append(r.seq[seq_index:seq_index + length]) # upper implied
op_end = pos + length

started = True
start_pos = pos
pos += length
start_index = index
seq_start = seq_index
# If we're completely before the region of interest
if op_end < approx_pos - 250:
pos += length
seq_index += length
continue

else:
pos += length
# If we're completely after the region of interest
if pos > approx_pos + 250:
break

seq_index += length
# Calculate which portion of this match we want to keep
keep_start = max(0, approx_pos - 250 - pos)
keep_end = min(length, approx_pos + 250 - pos)

else:
if abs(pos + length - approx_pos) > 300:
end_index = index + 1
break
else:
parts.append(r.seq[seq_index:seq_index + length])
pos += length
seq_index += length
# If this match overlaps our region of interest
if keep_end > keep_start:
started = True
ref_bases += keep_end - keep_start
parts.append(seq[seq_index + keep_start:seq_index + keep_end])

pos += length
seq_index += length

index += 1

Expand Down Expand Up @@ -693,6 +702,103 @@ cpdef dict base_assemble(rd, int position, int max_distance):
return get_consensus(rd, position, max_distance)


cpdef contig_from_read_cigar(r, int cigar_index):
ct = r.cigartuples
seq = r.seq
query_pos = 0 # Position in query sequence
ref_pos = r.pos # Position in reference
window_size = 500 # Size of sequence context to include

# First pass: find the target position in query coordinates
target_query_start = 0
for i in range(cigar_index):
op, length = ct[i]
if op in {0, 1, 4, 7, 8}: # Operations that consume query sequence
target_query_start += length

# Calculate window boundaries in query coordinates
target_length = ct[cigar_index][1]
window_start = max(0, target_query_start - window_size)
window_end = min(len(seq), target_query_start + target_length + window_size)

parts = []
query_pos = 0
ref_pos = r.pos
longest_left_sc = 0
longest_right_sc = 0
ref_bases = 0

cigar_blocks = []
started = False
# Second pass: build the sequence
for i, (op, length) in enumerate(ct):
if query_pos > window_end:
break

if op == 0 or op == 7 or op == 8: # Match/mismatch
if query_pos + length > window_start:
start_idx = max(0, window_start - query_pos)
end_idx = min(length, window_end - query_pos)
match_seq = seq[query_pos + start_idx:query_pos + end_idx]
parts.append(match_seq.upper())
started = True
ref_bases += len(match_seq)
if cigar_blocks and cigar_blocks[-1][0] == 0: # elide 7 and 8 ops
cigar_blocks[-1][1] += len(match_seq)
else:
cigar_blocks.append([0, len(match_seq)])
query_pos += length
ref_pos += length

elif op == 4: # Soft clip
clip_length = min(length, 250)
if query_pos < window_end and query_pos + length > window_start:
start_idx = max(0, window_start - query_pos)
end_idx = min(clip_length, window_end - query_pos)
clip_seq = seq[query_pos + start_idx:query_pos + end_idx].lower()
parts.append(clip_seq)
cigar_blocks.append([4, len(clip_seq)])
if query_pos < target_query_start: # Left clip
longest_left_sc = len(clip_seq)
else: # Right clip
longest_right_sc = len(clip_seq)
query_pos += length

elif op == 1: # Insertion
if query_pos + length > window_start and query_pos < window_end:
if i < cigar_index:
parts.append(seq[max(window_start, query_pos):query_pos + length].lower())
if i == cigar_index:
# Full insertion if it's the target
parts.append(seq[query_pos:query_pos + length].lower())
elif i > cigar_index:
parts.append(seq[query_pos:min(window_end, query_pos + length)].lower())
cigar_blocks.append([1, len(parts[-1])])
if query_pos + length >= window_end:
break
query_pos += length

elif op == 2: # Deletion
ref_pos += length
if started:
cigar_blocks.append([length, 2])

contig = "".join(parts)

return {
"contig": contig,
"cigar": cigar_blocks,
"left_clips": longest_left_sc,
"right_clips": longest_right_sc,
"ref_bases": ref_bases,
"ref_start": r.pos,
"ref_end": r.reference_end,
"bamrname": r.rname,
"left_weight": 0,
"right_weight": 0,
}


cpdef float compute_rep(seq):

cdef unordered_map[float, int] last_visited
Expand Down Expand Up @@ -731,6 +837,7 @@ cpdef float compute_rep(seq):

return tot_amount / total_seen


cdef tuple get_rep(contig_seq):

cdef int left_clip_end = 0
Expand Down Expand Up @@ -822,10 +929,17 @@ def contig_info(events):


def check_contig_match(a, b, rel_diffs=False, diffs=8, ol_length=21, supress_seq=True, return_int=False):
if not a or not b or len(a) > 5000 or len(b) > 5000:
if not a or not b:
return 0

query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq)
if len(a) > 10000:
a = a[:10000]
if len(b) > 10000:
b = b[:10000]

query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq,
match_score=2, mismatch_score=-3, gap_open_penalty=10,
gap_extend_penalty=1
)
alignment = query(str(b))
if not return_int:
return alignment
Expand Down
17 changes: 12 additions & 5 deletions dysgu/coverage.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from libc.stdint cimport uint32_t
from pysam.libcalignedsegment cimport AlignedSegment
from pysam.libchtslib cimport bam_get_cigar


def index_stats(f, rl=None):
if rl is None:
rl = []
Expand Down Expand Up @@ -147,7 +148,9 @@ cdef class GenomeScanner:
# when 'run' command is invoked, run this block. cov track already exists from find-reads
if self.cov_track_path is None and self.bam_iter is None:
for aln in self.input_bam:
if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: # not primary, duplicate or unmapped?
cigar_l = aln._delegate.core.n_cigar

if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped?
continue
self._add_to_bin_buffer(aln, tell)
tell = 0 if self.no_tell else self.input_bam.tell()
Expand Down Expand Up @@ -248,7 +251,9 @@ cdef class GenomeScanner:
name = aln.qname.__hash__(), aln.flag, aln.pos
if name in seen_reads:
continue
if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None:

cigar_l = aln._delegate.core.n_cigar
if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0:
continue
if aln.rname != self.current_tid:
if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
Expand Down Expand Up @@ -342,7 +347,8 @@ cdef class GenomeScanner:
prev_alignment = a

if ibam is None:
if a.flag & 1284 or a.mapq < self.mapq_threshold or a.cigartuples is None:
cigar_l = a._delegate.core.n_cigar
if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0:
continue
tell = 0 if self.no_tell else self.input_bam.tell()
if self.no_tell:
Expand Down Expand Up @@ -458,10 +464,11 @@ cdef class GenomeScanner:
elif self.no_tell:
raise BufferError("Read buffer has overflowed, increase --buffer-size")

def _add_to_bin_buffer(self, a, tell):
def _add_to_bin_buffer(self, AlignedSegment a, tell):
# Calculates coverage information on fly, drops high coverage regions, buffers reads
cdef int flag = a.flag
if flag & 1540 or a.cigartuples is None or a.seq is None:
cdef uint32_t cigar_l = a._delegate.core.n_cigar
if flag & 1540 or cigar_l == 0: # or a.seq is None:
return
cdef int rname = a.rname
cdef int apos = a.pos
Expand Down
2 changes: 1 addition & 1 deletion dysgu/extra_metrics.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8

from libc.stdint cimport uint32_t

Expand Down
Loading

0 comments on commit ef651f4

Please sign in to comment.