Better read merging for long reads. New clustering methods for long r…

…eads. Improved runtime.
kcleal · Nov 15, 2024 · ef651f4 · ef651f4
1 parent 8081665
commit ef651f4
Show file tree

Hide file tree

Showing 18 changed files with 1,075 additions and 551 deletions.
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
@@ -1,4 +1,5 @@
 # cython: language_level=3
+
 from __future__ import absolute_import
 import datetime
 import time
@@ -184,15 +185,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
         if args["mode"] == "pacbio-sequel2":
             max_dist, max_clust_dist = 35, 500000
             if args["merge_dist"] is None:
-                args["merge_dist"] = 700
+                args["merge_dist"] = 1000
         elif args["mode"] == "pacbio-revio":
             max_dist, max_clust_dist = 50, 500000
             if args["merge_dist"] is None:
-                args["merge_dist"] = 700
+                args["merge_dist"] = 1000
         elif args["mode"] == "nanopore-r9" or args["mode"] == "nanopore-r10":
             max_dist, max_clust_dist = 100, 500000
             if args["merge_dist"] is None:
-                args["merge_dist"] = 700
+                args["merge_dist"] = 1000
 
     # set upper bound on single-partition size
     max_single_size = min(max(args["max_cov"] * 50, 10000), 100000)  # limited between 5000 - 50,000 reads
@@ -436,6 +437,10 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
             os.remove(f"{tdir}/job_{p}.done.pkl")
     if len(block_edge_events) == 0:
         return [], None
+
+    # for item1 in block_edge_events:
+    #     echo(item1.svtype, item1.su, item1.svlen)
+
     logging.info("Number of components {}. N candidates {}".format(components_seen, len(block_edge_events)))
     keeps = len([i for i in block_edge_events if i.site_info])
     if keeps:
@@ -452,12 +457,17 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
     # Merge across calls
     if args["merge_within"] == "True":
         merged = merge_svs.merge_events(block_edge_events, args["merge_dist"], regions, bool(paired_end), try_rev=False, pick_best=False,
-                                         debug=True, min_size=args["min_size"],
-                              max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100)
+                                        debug=True, min_size=args["min_size"],
+                                        max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100,
+                                        procs=args['procs'])
     else:
         merged = block_edge_events
     logging.info("Number of candidate SVs merged: {}".format(len(block_edge_events) - len(merged)))
     logging.info("Number of candidate SVs after merge: {}".format(len(merged)))
+
+    # echo("no--->")
+    # for item1 in merged:
+    #     echo(item1.svtype, item1.su, item1.svlen)
     before = len(merged)
 
     if auto_support:
@@ -490,12 +500,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N
     preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True")
     preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
     preliminaries = post_call.strand_binom_t(preliminaries)
-    # preliminaries = consensus.contig_info(preliminaries)  # GC info, repetitiveness
+
     preliminaries = extra_metrics.find_repeat_expansions(preliminaries, insert_stdev)
     preliminaries = post_call.compressability(preliminaries)
     preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], True)
+
     preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])
 
+    # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries)
+
     preliminaries = extra_metrics.sample_level_density(preliminaries, regions)
     preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)
 

diff --git a/dysgu/consensus.pyx b/dysgu/consensus.pyx
@@ -1,11 +1,12 @@
-#distutils: language = c++
-#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
+# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
+
 """
-A basic assembler/consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style.
+A basic consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style.
 """
 
 import warnings
 import array
+from re import match
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.simplefilter(action='ignore', category=FutureWarning)
@@ -140,7 +141,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
 
     cdef str seq
     cdef tuple k
-    cdef bint done = 0
+
     cdef cpp_vector[int] vv = [0, 0, 0, 0]
 
     cdef int r_end = r.reference_end
@@ -164,8 +165,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
         if opp == 4 and length > 250:
             i = length - 250
             length = 250
-        if done:
-            break
+
         if opp == 4:
             if start:
                 for o in range(length, 0, -1):
@@ -245,12 +245,11 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
                 if current_pos < approx_position and approx_position - current_pos > max_distance:
                     i += 1
                     continue
-                elif current_pos > approx_position and current_pos - approx_position > max_distance:
-                    break
+                # elif current_pos > approx_position and current_pos - approx_position > max_distance:
+                #     break
                 ref_bases += 1
                 if ref_bases > target_bases:
-                    done = 1
-                    break
+                    return
                 qual = quals[i]
                 base = bam_seqi(char_ptr_rseq, i)
                 i += 1
@@ -556,6 +555,11 @@ cdef dict get_consensus(rd, int position, int max_distance):
     else:
         cigar = [tuple(ct) for ct in cigar]
 
+    # if len(seq) < len(rd[0].seq) and longest_left_sc > 10 and longest_right_sc > 10:
+    #     return {}
+        # Not a good consensus, use first read instead
+        # return trim_sequence_from_cigar(rd[0], position, max_distance)
+
     return {"contig": seq,
             "left_clips": longest_left_sc,
             "right_clips": longest_right_sc,
@@ -582,13 +586,17 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance):
     cdef int pos = start_pos  # current genome position
     cdef int end_index = len(ct) - 1
     cdef bint started = False
-    cdef int opp, length
+    cdef int opp, length, keep_start, keep_end
 
     cdef int longest_left_sc = 0
     cdef int longest_right_sc = 0
+    cdef int ref_bases = 0
 
     parts = []
+    pos_index = -1
     for opp, length in ct:
+        if ref_bases > 300 and pos > approx_pos + 150:
+            break
 
         if opp == 4 and index == 0:
             if abs(pos - approx_pos) < 50:
@@ -616,29 +624,30 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance):
 
         elif opp == 0 or opp == 7 or opp == 8 or opp == 3:
 
-            if not started:
-                if abs(pos + length - approx_pos) < 300:
-                    parts.append(r.seq[seq_index:seq_index + length])  # upper implied
+            op_end = pos + length
 
-                    started = True
-                    start_pos = pos
-                    pos += length
-                    start_index = index
-                    seq_start = seq_index
+            # If we're completely before the region of interest
+            if op_end < approx_pos - 250:
+                pos += length
+                seq_index += length
+                continue
 
-                else:
-                    pos += length
+            # If we're completely after the region of interest
+            if pos > approx_pos + 250:
+                break
 
-                seq_index += length
+            # Calculate which portion of this match we want to keep
+            keep_start = max(0, approx_pos - 250 - pos)
+            keep_end = min(length, approx_pos + 250 - pos)
 
-            else:
-                if abs(pos + length - approx_pos) > 300:
-                    end_index = index + 1
-                    break
-                else:
-                    parts.append(r.seq[seq_index:seq_index + length])
-                    pos += length
-                    seq_index += length
+            # If this match overlaps our region of interest
+            if keep_end > keep_start:
+                started = True
+                ref_bases += keep_end - keep_start
+                parts.append(seq[seq_index + keep_start:seq_index + keep_end])
+
+            pos += length
+            seq_index += length
 
         index += 1
 
@@ -693,6 +702,103 @@ cpdef dict base_assemble(rd, int position, int max_distance):
     return get_consensus(rd, position, max_distance)
 
 
+cpdef contig_from_read_cigar(r, int cigar_index):
+    ct = r.cigartuples
+    seq = r.seq
+    query_pos = 0  # Position in query sequence
+    ref_pos = r.pos  # Position in reference
+    window_size = 500  # Size of sequence context to include
+
+    # First pass: find the target position in query coordinates
+    target_query_start = 0
+    for i in range(cigar_index):
+        op, length = ct[i]
+        if op in {0, 1, 4, 7, 8}:  # Operations that consume query sequence
+            target_query_start += length
+
+    # Calculate window boundaries in query coordinates
+    target_length = ct[cigar_index][1]
+    window_start = max(0, target_query_start - window_size)
+    window_end = min(len(seq), target_query_start + target_length + window_size)
+
+    parts = []
+    query_pos = 0
+    ref_pos = r.pos
+    longest_left_sc = 0
+    longest_right_sc = 0
+    ref_bases = 0
+
+    cigar_blocks = []
+    started = False
+    # Second pass: build the sequence
+    for i, (op, length) in enumerate(ct):
+        if query_pos > window_end:
+            break
+
+        if op == 0 or op == 7 or op == 8:  # Match/mismatch
+            if query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(length, window_end - query_pos)
+                match_seq = seq[query_pos + start_idx:query_pos + end_idx]
+                parts.append(match_seq.upper())
+                started = True
+                ref_bases += len(match_seq)
+                if cigar_blocks and cigar_blocks[-1][0] == 0:  # elide 7 and 8 ops
+                    cigar_blocks[-1][1] += len(match_seq)
+                else:
+                    cigar_blocks.append([0, len(match_seq)])
+            query_pos += length
+            ref_pos += length
+
+        elif op == 4:  # Soft clip
+            clip_length = min(length, 250)
+            if query_pos < window_end and query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(clip_length, window_end - query_pos)
+                clip_seq = seq[query_pos + start_idx:query_pos + end_idx].lower()
+                parts.append(clip_seq)
+                cigar_blocks.append([4, len(clip_seq)])
+                if query_pos < target_query_start:  # Left clip
+                    longest_left_sc = len(clip_seq)
+                else:  # Right clip
+                    longest_right_sc = len(clip_seq)
+            query_pos += length
+
+        elif op == 1:  # Insertion
+            if query_pos + length > window_start and query_pos < window_end:
+                if i < cigar_index:
+                    parts.append(seq[max(window_start, query_pos):query_pos + length].lower())
+                if i == cigar_index:
+                    # Full insertion if it's the target
+                    parts.append(seq[query_pos:query_pos + length].lower())
+                elif i > cigar_index:
+                    parts.append(seq[query_pos:min(window_end, query_pos + length)].lower())
+                cigar_blocks.append([1, len(parts[-1])])
+                if query_pos + length >= window_end:
+                    break
+            query_pos += length
+
+        elif op == 2:  # Deletion
+            ref_pos += length
+            if started:
+                cigar_blocks.append([length, 2])
+
+    contig = "".join(parts)
+
+    return {
+        "contig": contig,
+        "cigar": cigar_blocks,
+        "left_clips": longest_left_sc,
+        "right_clips": longest_right_sc,
+        "ref_bases": ref_bases,
+        "ref_start": r.pos,
+        "ref_end": r.reference_end,
+        "bamrname": r.rname,
+        "left_weight": 0,
+        "right_weight": 0,
+    }
+
+
 cpdef float compute_rep(seq):
 
     cdef unordered_map[float, int] last_visited
@@ -731,6 +837,7 @@ cpdef float compute_rep(seq):
 
     return tot_amount / total_seen
 
+
 cdef tuple get_rep(contig_seq):
 
     cdef int left_clip_end = 0
@@ -822,10 +929,17 @@ def contig_info(events):
 
 
 def check_contig_match(a, b, rel_diffs=False, diffs=8, ol_length=21, supress_seq=True, return_int=False):
-    if not a or not b or len(a) > 5000 or len(b) > 5000:
+    if not a or not b:
         return 0
-
-    query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq)
+    if len(a) > 10000:
+        a = a[:10000]
+    if len(b) > 10000:
+        b = b[:10000]
+
+    query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq,
+                                 match_score=2, mismatch_score=-3, gap_open_penalty=10,
+                                 gap_extend_penalty=1
+                                 )
     alignment = query(str(b))
     if not return_int:
         return alignment

diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx
@@ -15,6 +15,7 @@ from libc.stdint cimport uint32_t
 from pysam.libcalignedsegment cimport AlignedSegment
 from pysam.libchtslib cimport bam_get_cigar
 
+
 def index_stats(f, rl=None):
     if rl is None:
         rl = []
@@ -147,7 +148,9 @@ cdef class GenomeScanner:
             # when 'run' command is invoked, run this block. cov track already exists from find-reads
             if self.cov_track_path is None and self.bam_iter is None:
                 for aln in self.input_bam:
-                    if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None:  # not primary, duplicate or unmapped?
+                    cigar_l = aln._delegate.core.n_cigar
+
+                    if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0:  # not primary, duplicate or unmapped?
                         continue
                     self._add_to_bin_buffer(aln, tell)
                     tell = 0 if self.no_tell else self.input_bam.tell()
@@ -248,7 +251,9 @@ cdef class GenomeScanner:
                     name = aln.qname.__hash__(), aln.flag, aln.pos
                     if name in seen_reads:
                         continue
-                    if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None:
+
+                    cigar_l = aln._delegate.core.n_cigar
+                    if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0:
                         continue
                     if aln.rname != self.current_tid:
                         if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences:
@@ -342,7 +347,8 @@ cdef class GenomeScanner:
             prev_alignment = a
 
             if ibam is None:
-                if a.flag & 1284 or a.mapq < self.mapq_threshold or a.cigartuples is None:
+                cigar_l = a._delegate.core.n_cigar
+                if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0:
                     continue
                 tell = 0 if self.no_tell else self.input_bam.tell()
                 if self.no_tell:
@@ -458,10 +464,11 @@ cdef class GenomeScanner:
         elif self.no_tell:
             raise BufferError("Read buffer has overflowed, increase --buffer-size")
 
-    def _add_to_bin_buffer(self, a, tell):
+    def _add_to_bin_buffer(self, AlignedSegment a, tell):
         # Calculates coverage information on fly, drops high coverage regions, buffers reads
         cdef int flag = a.flag
-        if flag & 1540 or a.cigartuples is None or a.seq is None:
+        cdef uint32_t cigar_l = a._delegate.core.n_cigar
+        if flag & 1540 or cigar_l == 0: # or a.seq is None:
             return
         cdef int rname = a.rname
         cdef int apos = a.pos

diff --git a/dysgu/extra_metrics.pxd b/dysgu/extra_metrics.pxd
@@ -1,4 +1,4 @@
-#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True
+#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8
 
 from libc.stdint cimport uint32_t