From ef651f4345698570cab1598376a59a6dde118ee4 Mon Sep 17 00:00:00 2001 From: kcleal Date: Fri, 15 Nov 2024 16:51:12 +0000 Subject: [PATCH] Better read merging for long reads. New clustering methods for long reads. Improved runtime. --- dysgu/call_component.pyx | 268 +++++++++++++------ dysgu/cluster.pyx | 25 +- dysgu/consensus.pyx | 180 ++++++++++--- dysgu/coverage.pyx | 17 +- dysgu/extra_metrics.pxd | 2 +- dysgu/extra_metrics.pyx | 2 +- dysgu/find_reads.hpp | 2 +- dysgu/graph.pyx | 360 ++++++++++++++----------- dysgu/io_funcs.pyx | 31 +-- dysgu/main.py | 15 +- dysgu/map_set_utils.pxd | 9 +- dysgu/map_set_utils.pyx | 99 ++++--- dysgu/merge_svs.pyx | 554 +++++++++++++++++++++++++-------------- dysgu/post_call.py | 42 ++- dysgu/re_map.py | 12 +- dysgu/sv2bam.pyx | 2 +- pyproject.toml | 2 +- setup.py | 4 +- 18 files changed, 1075 insertions(+), 551 deletions(-) diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index 1c13a34..280dd01 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -2,6 +2,7 @@ from __future__ import absolute_import from collections import Counter, defaultdict, namedtuple import logging +import math import numpy as np cimport numpy as np from numpy.random import normal @@ -24,6 +25,12 @@ import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.simplefilter(action='ignore', category=FutureWarning) + +from sklearn.cluster import KMeans, DBSCAN +from sklearn.exceptions import ConvergenceWarning +warnings.filterwarnings('ignore', category=ConvergenceWarning) + + np.random.seed(1) ctypedef EventResult EventResult_t @@ -60,11 +67,12 @@ cpdef n_aligned_bases(AlignedSegment aln): cdef base_quals_aligned_clipped(AlignedSegment a): - cdef int left_clip, right_clip cdef float aligned_base_quals = 0 cdef float aligned_bases = 0 cdef float clipped_base_quals = 0 - left_clip, right_clip = clip_sizes(a) + cdef int left_clip = 0 + cdef int right_clip = 0 + clip_sizes(a, left_clip, right_clip) clipped_bases = left_clip + right_clip cdef const unsigned char[:] quals = a.query_qualities cdef int i @@ -95,6 +103,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef float clipped_base_quals = 0 cdef float clipped_bases = 0 cdef int abq, ab, cbq, cb + cdef int left_clip, right_clip paired_end = set([]) seen = set([]) er.spanning = len(spanning) @@ -110,7 +119,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.n_small_tlen += 1 if paired_end_reads and paired_end and flag & 8: er.n_unmapped_mates += 1 - left_clip, right_clip = clip_sizes_hard(a) + left_clip = 0 + right_clip = 0 + clip_sizes_hard(a, left_clip, right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 has_sa = a.has_tag("SA") @@ -150,8 +161,11 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, er.minus += 1 else: er.plus += 1 - ct = a.cigartuples - if ct[0][0] == 4 or ct[-1][0] == 4: + + left_clip = 0 + right_clip = 0 + clip_sizes(a, left_clip, right_clip) + if left_clip or right_clip: er.sc += 1 if a.flag & 1: # paired read abq, ab, cbq, cb = base_quals_aligned_clipped(a) @@ -164,7 +178,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, flag = a.flag if flag & 2: er.NP += 1 - left_clip, right_clip = clip_sizes_hard(a) + left_clip = 0 + right_clip = 0 + clip_sizes_hard(a, left_clip, right_clip) if left_clip > 0 and right_clip > 0: er.double_clips += 1 if a.has_tag("SA"): @@ -403,11 +419,15 @@ cdef 
make_generic_insertion_item(aln, int insert_size, int insert_std): v_item.svtype = "INS" aln_span = aln.reference_end - aln.pos v_item.size_inferred = 1 + cdef int left_clip, right_clip if insert_std > 0: rand_insert_pos = abs(insert_size - aln_span + int(normal(0, insert_std))) else: # single read mode v_item.svtype = "BND" - clip_s = max(clip_sizes(aln)) + left_clip = 0 + right_clip = 0 + clip_sizes(aln, left_clip, right_clip) + clip_s = max(left_clip, right_clip) rand_insert_pos = 100 if not clip_s else clip_s v_item.inferred_sv_len = 0 if rand_insert_pos < 0 else rand_insert_pos return v_item @@ -417,20 +437,19 @@ def consensus_matches_gap(target_gap, target_svlen, cigar, threshold=0.9): if not cigar or target_svlen < 20: return True for op, l in cigar: - if l < 20: - continue if op == 1 and target_gap == "INS" and min(l, target_svlen) / max(l, target_svlen) > threshold: return True if op == 2 and target_gap == "DEL" and min(l, target_svlen) / max(l, target_svlen) > threshold: return True return False + cpdef int assign_contig_to_break(asmb, EventResult_t er, side, spanning): if not asmb: return 0 cdef int ref_bases = 0 if spanning: - if not consensus_matches_gap(er.svtype, er.svlen, asmb["cigar"]): + if "cigar" in asmb and not consensus_matches_gap(er.svtype, er.svlen, asmb["cigar"]): return 0 er.contig = asmb["contig"] er.contig_cigar = asmb["cigar"] @@ -543,6 +562,7 @@ cdef make_single_call(sub_informative, insert_size, insert_stdev, insert_ppf, mi as2 = None ref_bases = 0 if to_assemble or len(spanning_alignments) > 0: + # echo('MAKE SINGLE CALL') if er.preciseA: as1 = consensus.base_assemble(u_reads, er.posA, 500) if as1 and (er.svtype != "TRA" or (as1['contig'] and (as1['contig'][0].islower() or as1['contig'][-1].islower()))): @@ -638,6 +658,7 @@ cdef partition_single(informative, insert_size, insert_stdev, insert_ppf, spanni idx += 1 sub_cluster_calls = [] cdef EventResult_t er + if try_cluster: try: Z = linkage(coords, 'single') @@ -685,6 +706,7 @@ cdef partition_single(informative, insert_size, insert_stdev, insert_ppf, spanni er = make_single_call(informative, insert_size, insert_stdev, insert_ppf, min_support, to_assemble, spanning_alignments, 1, generic_insertions, st, paired_end) sub_cluster_calls.append(er) + return sub_cluster_calls @@ -738,32 +760,37 @@ cdef group_read_subsets(rds, insert_ppf, insert_size, insert_stdev): return spanning_alignments, informative, generic_insertions -def bicluster_spanning_lr(spanning, informative): - if len(spanning) <= 2: +def dbscan_spanning(spanning, informative): + if len(spanning) <= 3: return [(spanning, informative)] - srt = sorted(spanning, key=lambda x: x.len) - if srt[0].len / srt[-1].len > 0.5: + + X = np.array([s.len for s in spanning]).reshape(-1, 1) + X_max = X.max() + if X.min() == X_max: return [(spanning, informative)] - part_a = [srt[0]] - sum_a = srt[0].len - part_b = [srt[-1]] - sum_b = srt[-1].len - for i in range(1, len(srt) - 2): - avg_a = sum_a / len(part_a) - avg_b = sum_b / len(part_b) - t = srt[i] - if abs(t.len - avg_a) < abs(t.len - avg_b): - part_a.append(t) - sum_a += t.len - else: - part_b.append(t) - sum_b += t.len - return [(part_a, []), (part_b, [])] + eps = min(int(X_max * 0.03), int(math.pow(X_max, 0.45))) + eps = max(1, eps) + + cl = DBSCAN(eps=eps, min_samples=2) + labels = cl.fit_predict(X) + + m = int(max(labels)) + if len(labels) == 0 or m == -1: + return [(spanning, informative)] + + result = [[[], []] for i in range(m + 1)] + for idx, l in enumerate(labels): + if l == -1: + continue + 
result[l][0].append(spanning[idx]) + + return result def process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, generic_insertions, insert_ppf, to_assemble): + # echo("PROCESS SPANNING") cdef int min_found_support = 0 cdef str svtype, jointype cdef bint passed @@ -818,7 +845,7 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, # 1.6.8 posA_95 = abs(posA - posA_adjusted) posB_95 = abs(posB - posB_adjusted) - + # echo([sp.len for sp in spanning_alignments], "pos adjusted:", [b[0] for b in size_pos_bounded]) svlen = int(np.median([sp.len for sp in spanning_alignments])) posA = spanning_alignments[best_index].pos posB = spanning_alignments[best_index].end @@ -832,6 +859,10 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, er.preciseA = True er.preciseB = True + er.qnames = set([]) + for item in spanning_alignments: + er.qnames.add(hash(item.align.qname)) + ab = abs(posB - posA) if svlen > 0: jitter = ((posA_95 + posB_95) / 2) / svlen @@ -851,7 +882,7 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, er.query_overlap = 0 er.jitter = jitter u_reads = [i.align for i in spanning_alignments] - v_reads = [] + min_found_support = len(spanning_alignments) if len(generic_insertions) > 0: min_found_support += len(generic_insertions) @@ -874,19 +905,21 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, as1 = None as2 = None ref_bases = 0 - if to_assemble: - if er.preciseA: + if to_assemble and er.preciseA: + + if not paired_end and er.svtype == "INS" and er.svlen > 50: + # echo('Making contig from', best_align.qname, spanning_alignments[best_index].cigar_index) + as1 = consensus.contig_from_read_cigar(best_align, spanning_alignments[best_index].cigar_index) + # as1 = consensus.base_assemble([best_align.align], er.posA, 500) + else: as1 = consensus.base_assemble(u_reads, er.posA, 500) - if as1: - ref_bases += assign_contig_to_break(as1, er, "A", spanning_alignments) - if er.preciseB: - as2 = consensus.base_assemble(v_reads, er.posB, 500) - if as2: - ref_bases += assign_contig_to_break(as2, er, "B", 0) - if not as1 and len(generic_insertions) > 0: - as1 = consensus.base_assemble([item.read_a for item in generic_insertions], er.posA, 500) - if as1: - ref_bases += assign_contig_to_break(as1, er, "A", 0) + + if as1: + ref_bases += assign_contig_to_break(as1, er, "A", spanning_alignments) + # elif len(generic_insertions) > 0: + # as1 = consensus.base_assemble([item.read_a for item in generic_insertions], er.posA, 500) + # if as1: + # ref_bases += assign_contig_to_break(as1, er, "A", 0) er.linked = 0 er.block_edge = 0 er.ref_bases = ref_bases @@ -895,15 +928,12 @@ def process_spanning(paired_end, spanning_alignments, divergence, length_extend, start_ins = 0 ct = best_align.cigartuples target_len = svlen - # if er.preciseA: - # target_len = svlen - # else: - # target_len = ct[cigar_index][1] for i in range(spanning_alignments[best_index].cigar_index): if ct[i][0] in {0, 1, 4, 7, 8}: start_ins += ct[i][1] er.variant_seq = best_align.seq[start_ins:start_ins + target_len] er.ref_seq = best_align.seq[start_ins - 1] + # echo("contig is ", er.contig) return er @@ -923,17 +953,24 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l # Use spanning if available, otherwise informative, otherwise generic spanning_alignments, informative, generic_insertions = group_read_subsets(rds, insert_ppf, insert_size, insert_stdev) + 
n_spanning = len(spanning_alignments) + + if n_spanning and len(generic_insertions) > n_spanning * 10: + generic_insertions = [] + if n_spanning and len(informative) > n_spanning * 20: + informative = [] + if len(spanning_alignments) > 0: - if not paired_end: + # if not paired_end: candidates = [] - for spanning_alignments, informative in bicluster_spanning_lr(spanning_alignments, informative): + for spanning_alignments, informative in dbscan_spanning(spanning_alignments, informative): candidates.append(process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, generic_insertions, insert_ppf, to_assemble)) return candidates - else: + # else: # single event - return process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, - generic_insertions, insert_ppf, to_assemble) + # return process_spanning(paired_end, spanning_alignments, divergence, length_extend, informative, + # generic_insertions, insert_ppf, to_assemble) elif len(informative) > 0: @@ -1291,29 +1328,83 @@ cdef tuple mask_soft_clips(int aflag, int bflag, a_ct, b_ct): return left_clipA, right_clipA, left_clipB, right_clipB -cdef query_start_end_from_cigartuples(r): - cdef int end = 0 - cdef int start = 0 - cdef int query_length = r.infer_read_length() # Note, this also counts hard-clips - end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: - start += r.cigartuples[0][1] - if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: - end -= r.cigartuples[-1][1] - return start, end, query_length +# cdef query_start_end_from_cigartuples(r): +# cdef int end = 0 +# cdef int start = 0 +# cdef int query_length = r.infer_read_length() # Note, this also counts hard-clips +# end = query_length +# if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: +# start += r.cigartuples[0][1] +# if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: +# end -= r.cigartuples[-1][1] +# return start, end, query_length +# +# +# cdef start_end_query_pair(r1, r2): +# cdef int query_length, s1, e1, s2, e2, start_temp, r1l, r2l +# # r1 and r2 might be on same read, if this is case infer the query position on the read +# s1, e1, r1l = query_start_end_from_cigartuples(r1) +# s2, e2, r2l = query_start_end_from_cigartuples(r2) +# if r1.flag & 64 == r2.flag & 64: # same read +# if r2.flag & 16 != r1.flag & 16: # different strand, count from end +# query_length = r1l # Note, this also counts hard-clips +# start_temp = query_length - e2 +# e2 = start_temp + e2 - s2 +# s2 = start_temp +# return s1, e1, s2, e2, r1l, r2l + + +cdef int query_start_end_from_cigar(AlignedSegment r, int *start, int *end): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + cdef int query_length = 0 + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return 0 + + # Calculate query length and handle starting clip + start[0] = 0 + for i in range(cigar_l): + cigar_value = cigar_p[i] + opp = cigar_value & 15 # Get operation + length = cigar_value >> 4 # Get length + + if opp in (0, 1, 4, 7, 8): # M, I, S, =, X + query_length += length + elif opp == 5: # H + query_length += length + + if i == 0 and (opp == 4 or opp == 5): # S or H + start[0] = length + + # Set initial end position + end[0] = query_length + + # Handle ending clip + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + length = cigar_value >> 4 + if opp == 4 or opp == 5: # S or H + end[0] -= length + + return query_length + +cdef 
start_end_query_pair(AlignedSegment r1, AlignedSegment r2): + cdef int s1 = 0, e1 = 0, s2 = 0, e2 = 0 + cdef int r1l, r2l, start_temp + r1l = query_start_end_from_cigar(r1, &s1, &e1) + r2l = query_start_end_from_cigar(r2, &s2, &e2) -cdef start_end_query_pair(r1, r2): - cdef int query_length, s1, e1, s2, e2, start_temp, r1l, r2l - # r1 and r2 might be on same read, if this is case infer the query position on the read - s1, e1, r1l = query_start_end_from_cigartuples(r1) - s2, e2, r2l = query_start_end_from_cigartuples(r2) if r1.flag & 64 == r2.flag & 64: # same read - if r2.flag & 16 != r1.flag & 16: # different strand, count from end - query_length = r1l # Note, this also counts hard-clips - start_temp = query_length - e2 + if r2.flag & 16 != r1.flag & 16: # different strand + start_temp = r1l - e2 e2 = start_temp + e2 - s2 s2 = start_temp + return s1, e1, s2, e2, r1l, r2l @@ -1324,17 +1415,18 @@ def sort_by_length(x): cdef void assemble_partitioned_reads(EventResult_t er, u_reads, v_reads, int block_edge, int assemble): as1 = None as2 = None - if assemble: - if er.preciseA: - as1 = consensus.base_assemble(u_reads, er.posA, 500) - if as1: - if er.spanning == 0 and not (as1['left_clips'] or as1['right_clips']): - as1 = None - if (er.spanning == 0 or as1 is None) and er.preciseB: - as2 = consensus.base_assemble(v_reads, er.posB, 500) - if as2 : - if not (as2['left_clips'] or as2['right_clips']): - as2 = None + # todo + # if assemble: + # if er.preciseA: + # as1 = consensus.base_assemble(u_reads, er.posA, 500) + # if as1: + # if er.spanning == 0 and not (as1['left_clips'] or as1['right_clips']): + # as1 = None + # if (er.spanning == 0 or as1 is None) and er.preciseB: + # as2 = consensus.base_assemble(v_reads, er.posB, 500) + # if as2 : + # if not (as2['left_clips'] or as2['right_clips']): + # as2 = None er.linked = 0 er.block_edge = block_edge er.contig = None @@ -1720,7 +1812,8 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, for (u, v), d in data.s_between: #.items(): rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] 
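
# --- Illustrative sketch (not part of the patch): how the rewritten
# query_start_end_from_cigar above recovers query-space coordinates from
# htslib's packed CIGAR. Each operation is stored in a uint32 with the op code
# in the low 4 bits and the length in the high 28 bits, hence the repeated
# `opp = cigar_value & 15` and `length = cigar_value >> 4` in this patch.
# The helper names below (decode_cigar, query_start_end) are hypothetical,
# pure-Python stand-ins assuming that encoding.

def decode_cigar(packed):
    # packed uint32 values -> list of (op, length) tuples
    return [(v & 15, v >> 4) for v in packed]

def query_start_end(cigar, query_length):
    # Leading/trailing soft (4) or hard (5) clips shrink the aligned interval
    start, end = 0, query_length
    if cigar and cigar[0][0] in (4, 5):
        start += cigar[0][1]
    if cigar and cigar[-1][0] in (4, 5):
        end -= cigar[-1][1]
    return start, end

# 10S90M5S packed as (length << 4) | op
packed = [(10 << 4) | 4, (90 << 4) | 0, (5 << 4) | 4]
cigar = decode_cigar(packed)
assert cigar == [(4, 10), (0, 90), (4, 5)]
assert query_start_end(cigar, 105) == (10, 100)
# --- end sketch
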
rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) - + # echo("rd_u", [rr[1].qname for rr in rd_u]) + # echo("rd_v", [rr[1].qname for rr in rd_v]) total_reads = len(rd_u) + len(rd_v) buffered_reads += total_reads if add_to_buffer and buffered_reads > 50000: @@ -1804,6 +1897,9 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std n_parts = len(data.parts) if data.parts else 0 events = [] info = data.info + # echo(data.parts) + # echo(data.s_between) + # echo(data.s_within) if data.reads is None: data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge @@ -1825,8 +1921,8 @@ cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_std else: events.append(ev) events = [e for e in events if e and (e.svlen > 0 or e.svtype == "TRA")] - #for e in events: - # echo("call_component svlen", e.svlen, f" support={e.su}, {e.chrA}:{e.posA}-{e.posB}, {e.chrB}") - # if e.svlen_precise: - # set_ins_seq(e) + for e in events: + # echo("call_component svlen", e.svlen, f" support={e.su}, {e.chrA}:{e.posA}-{e.posB}, {e.chrB}") + if e.svlen_precise: + set_ins_seq(e) return events diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 5d77be2..97d6982 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -1,4 +1,5 @@ # cython: language_level=3 + from __future__ import absolute_import import datetime import time @@ -184,15 +185,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N if args["mode"] == "pacbio-sequel2": max_dist, max_clust_dist = 35, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 elif args["mode"] == "pacbio-revio": max_dist, max_clust_dist = 50, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 elif args["mode"] == "nanopore-r9" or args["mode"] == "nanopore-r10": max_dist, max_clust_dist = 100, 500000 if args["merge_dist"] is None: - args["merge_dist"] = 700 + args["merge_dist"] = 1000 # set upper bound on single-partition size max_single_size = min(max(args["max_cov"] * 50, 10000), 100000) # limited between 5000 - 50,000 reads @@ -436,6 +437,10 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N os.remove(f"{tdir}/job_{p}.done.pkl") if len(block_edge_events) == 0: return [], None + + # for item1 in block_edge_events: + # echo(item1.svtype, item1.su, item1.svlen) + logging.info("Number of components {}. 
N candidates {}".format(components_seen, len(block_edge_events))) keeps = len([i for i in block_edge_events if i.site_info]) if keeps: @@ -452,12 +457,17 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N # Merge across calls if args["merge_within"] == "True": merged = merge_svs.merge_events(block_edge_events, args["merge_dist"], regions, bool(paired_end), try_rev=False, pick_best=False, - debug=True, min_size=args["min_size"], - max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100) + debug=True, min_size=args["min_size"], + max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100, + procs=args['procs']) else: merged = block_edge_events logging.info("Number of candidate SVs merged: {}".format(len(block_edge_events) - len(merged))) logging.info("Number of candidate SVs after merge: {}".format(len(merged))) + + # echo("no--->") + # for item1 in merged: + # echo(item1.svtype, item1.su, item1.svlen) before = len(merged) if auto_support: @@ -490,12 +500,15 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True") preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome) preliminaries = post_call.strand_binom_t(preliminaries) - # preliminaries = consensus.contig_info(preliminaries) # GC info, repetitiveness + preliminaries = extra_metrics.find_repeat_expansions(preliminaries, insert_stdev) preliminaries = post_call.compressability(preliminaries) preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], True) + preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"]) + # preliminaries = post_call.filter_microsatellite_non_diploid(preliminaries) + preliminaries = extra_metrics.sample_level_density(preliminaries, regions) preliminaries = coverage_analyser.normalize_coverage_values(preliminaries) diff --git a/dysgu/consensus.pyx b/dysgu/consensus.pyx index 093e13d..7708f2b 100644 --- a/dysgu/consensus.pyx +++ b/dysgu/consensus.pyx @@ -1,11 +1,12 @@ -#distutils: language = c++ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True + """ -A basic assembler/consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style. +A basic consensus sequence generator. Takes an overlap graph and merges reads in-place in a POA style. 
""" import warnings import array +from re import match warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.simplefilter(action='ignore', category=FutureWarning) @@ -140,7 +141,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T cdef str seq cdef tuple k - cdef bint done = 0 + cdef cpp_vector[int] vv = [0, 0, 0, 0] cdef int r_end = r.reference_end @@ -164,8 +165,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T if opp == 4 and length > 250: i = length - 250 length = 250 - if done: - break + if opp == 4: if start: for o in range(length, 0, -1): @@ -245,12 +245,11 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T if current_pos < approx_position and approx_position - current_pos > max_distance: i += 1 continue - elif current_pos > approx_position and current_pos - approx_position > max_distance: - break + # elif current_pos > approx_position and current_pos - approx_position > max_distance: + # break ref_bases += 1 if ref_bases > target_bases: - done = 1 - break + return qual = quals[i] base = bam_seqi(char_ptr_rseq, i) i += 1 @@ -556,6 +555,11 @@ cdef dict get_consensus(rd, int position, int max_distance): else: cigar = [tuple(ct) for ct in cigar] + # if len(seq) < len(rd[0].seq) and longest_left_sc > 10 and longest_right_sc > 10: + # return {} + # Not a good consensus, use first read instead + # return trim_sequence_from_cigar(rd[0], position, max_distance) + return {"contig": seq, "left_clips": longest_left_sc, "right_clips": longest_right_sc, @@ -582,13 +586,17 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance): cdef int pos = start_pos # current genome position cdef int end_index = len(ct) - 1 cdef bint started = False - cdef int opp, length + cdef int opp, length, keep_start, keep_end cdef int longest_left_sc = 0 cdef int longest_right_sc = 0 + cdef int ref_bases = 0 parts = [] + pos_index = -1 for opp, length in ct: + if ref_bases > 300 and pos > approx_pos + 150: + break if opp == 4 and index == 0: if abs(pos - approx_pos) < 50: @@ -616,29 +624,30 @@ cdef trim_sequence_from_cigar(r, int approx_pos, int max_distance): elif opp == 0 or opp == 7 or opp == 8 or opp == 3: - if not started: - if abs(pos + length - approx_pos) < 300: - parts.append(r.seq[seq_index:seq_index + length]) # upper implied + op_end = pos + length - started = True - start_pos = pos - pos += length - start_index = index - seq_start = seq_index + # If we're completely before the region of interest + if op_end < approx_pos - 250: + pos += length + seq_index += length + continue - else: - pos += length + # If we're completely after the region of interest + if pos > approx_pos + 250: + break - seq_index += length + # Calculate which portion of this match we want to keep + keep_start = max(0, approx_pos - 250 - pos) + keep_end = min(length, approx_pos + 250 - pos) - else: - if abs(pos + length - approx_pos) > 300: - end_index = index + 1 - break - else: - parts.append(r.seq[seq_index:seq_index + length]) - pos += length - seq_index += length + # If this match overlaps our region of interest + if keep_end > keep_start: + started = True + ref_bases += keep_end - keep_start + parts.append(seq[seq_index + keep_start:seq_index + keep_end]) + + pos += length + seq_index += length index += 1 @@ -693,6 +702,103 @@ cpdef dict base_assemble(rd, int position, int max_distance): return get_consensus(rd, position, max_distance) +cpdef contig_from_read_cigar(r, int cigar_index): + ct = 
r.cigartuples
+    seq = r.seq
+    query_pos = 0  # Position in query sequence
+    ref_pos = r.pos  # Position in reference
+    window_size = 500  # Size of sequence context to include
+
+    # First pass: find the target position in query coordinates
+    target_query_start = 0
+    for i in range(cigar_index):
+        op, length = ct[i]
+        if op in {0, 1, 4, 7, 8}:  # Operations that consume query sequence
+            target_query_start += length
+
+    # Calculate window boundaries in query coordinates
+    target_length = ct[cigar_index][1]
+    window_start = max(0, target_query_start - window_size)
+    window_end = min(len(seq), target_query_start + target_length + window_size)
+
+    parts = []
+    query_pos = 0
+    ref_pos = r.pos
+    longest_left_sc = 0
+    longest_right_sc = 0
+    ref_bases = 0
+
+    cigar_blocks = []
+    started = False
+    # Second pass: build the sequence
+    for i, (op, length) in enumerate(ct):
+        if query_pos > window_end:
+            break
+
+        if op == 0 or op == 7 or op == 8:  # Match/mismatch
+            if query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(length, window_end - query_pos)
+                match_seq = seq[query_pos + start_idx:query_pos + end_idx]
+                parts.append(match_seq.upper())
+                started = True
+                ref_bases += len(match_seq)
+                if cigar_blocks and cigar_blocks[-1][0] == 0:  # elide 7 and 8 ops
+                    cigar_blocks[-1][1] += len(match_seq)
+                else:
+                    cigar_blocks.append([0, len(match_seq)])
+            query_pos += length
+            ref_pos += length
+
+        elif op == 4:  # Soft clip
+            clip_length = min(length, 250)
+            if query_pos < window_end and query_pos + length > window_start:
+                start_idx = max(0, window_start - query_pos)
+                end_idx = min(clip_length, window_end - query_pos)
+                clip_seq = seq[query_pos + start_idx:query_pos + end_idx].lower()
+                parts.append(clip_seq)
+                cigar_blocks.append([4, len(clip_seq)])
+                if query_pos < target_query_start:  # Left clip
+                    longest_left_sc = len(clip_seq)
+                else:  # Right clip
+                    longest_right_sc = len(clip_seq)
+            query_pos += length
+
+        elif op == 1:  # Insertion
+            if query_pos + length > window_start and query_pos < window_end:
+                if i < cigar_index:
+                    parts.append(seq[max(window_start, query_pos):query_pos + length].lower())
+                if i == cigar_index:
+                    # Full insertion if it's the target
+                    parts.append(seq[query_pos:query_pos + length].lower())
+                elif i > cigar_index:
+                    parts.append(seq[query_pos:min(window_end, query_pos + length)].lower())
+                cigar_blocks.append([1, len(parts[-1])])
+                if query_pos + length >= window_end:
+                    break
+            query_pos += length
+
+        elif op == 2:  # Deletion
+            ref_pos += length
+            if started:
+                cigar_blocks.append([2, length])
+
+    contig = "".join(parts)
+
+    return {
+        "contig": contig,
+        "cigar": cigar_blocks,
+        "left_clips": longest_left_sc,
+        "right_clips": longest_right_sc,
+        "ref_bases": ref_bases,
+        "ref_start": r.pos,
+        "ref_end": r.reference_end,
+        "bamrname": r.rname,
+        "left_weight": 0,
+        "right_weight": 0,
+    }
+
+
 cpdef float compute_rep(seq):
 
     cdef unordered_map[float, int] last_visited
@@ -731,6 +837,7 @@ cpdef float compute_rep(seq):
 
     return tot_amount / total_seen
 
+
 cdef tuple get_rep(contig_seq):
 
     cdef int left_clip_end = 0
@@ -822,10 +929,17 @@ def contig_info(events):
 
 
 def check_contig_match(a, b, rel_diffs=False, diffs=8, ol_length=21, supress_seq=True, return_int=False):
-    if not a or not b or len(a) > 5000 or len(b) > 5000:
+    if not a or not b:
         return 0
-
-    query = StripedSmithWaterman(str(a), suppress_sequences=supress_seq)
+    if len(a) > 10000:
+        a = a[:10000]
+    if len(b) > 10000:
+        b = b[:10000]
+
+    query = StripedSmithWaterman(str(a), 
suppress_sequences=supress_seq, + match_score=2, mismatch_score=-3, gap_open_penalty=10, + gap_extend_penalty=1 + ) alignment = query(str(b)) if not return_int: return alignment diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 7486b2e..5265096 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -15,6 +15,7 @@ from libc.stdint cimport uint32_t from pysam.libcalignedsegment cimport AlignedSegment from pysam.libchtslib cimport bam_get_cigar + def index_stats(f, rl=None): if rl is None: rl = [] @@ -147,7 +148,9 @@ cdef class GenomeScanner: # when 'run' command is invoked, run this block. cov track already exists from find-reads if self.cov_track_path is None and self.bam_iter is None: for aln in self.input_bam: - if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: # not primary, duplicate or unmapped? + cigar_l = aln._delegate.core.n_cigar + + if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: # not primary, duplicate or unmapped? continue self._add_to_bin_buffer(aln, tell) tell = 0 if self.no_tell else self.input_bam.tell() @@ -248,7 +251,9 @@ cdef class GenomeScanner: name = aln.qname.__hash__(), aln.flag, aln.pos if name in seen_reads: continue - if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: + + cigar_l = aln._delegate.core.n_cigar + if aln.flag & 1284 or aln.mapq < mq_thresh or cigar_l == 0: continue if aln.rname != self.current_tid: if self.current_tid != -1 and self.current_tid <= self.input_bam.nreferences: @@ -342,7 +347,8 @@ cdef class GenomeScanner: prev_alignment = a if ibam is None: - if a.flag & 1284 or a.mapq < self.mapq_threshold or a.cigartuples is None: + cigar_l = a._delegate.core.n_cigar + if a.flag & 1284 or a.mapq < self.mapq_threshold or cigar_l == 0: continue tell = 0 if self.no_tell else self.input_bam.tell() if self.no_tell: @@ -458,10 +464,11 @@ cdef class GenomeScanner: elif self.no_tell: raise BufferError("Read buffer has overflowed, increase --buffer-size") - def _add_to_bin_buffer(self, a, tell): + def _add_to_bin_buffer(self, AlignedSegment a, tell): # Calculates coverage information on fly, drops high coverage regions, buffers reads cdef int flag = a.flag - if flag & 1540 or a.cigartuples is None or a.seq is None: + cdef uint32_t cigar_l = a._delegate.core.n_cigar + if flag & 1540 or cigar_l == 0: # or a.seq is None: return cdef int rname = a.rname cdef int apos = a.pos diff --git a/dysgu/extra_metrics.pxd b/dysgu/extra_metrics.pxd index d4bae53..c481e9f 100644 --- a/dysgu/extra_metrics.pxd +++ b/dysgu/extra_metrics.pxd @@ -1,4 +1,4 @@ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8 from libc.stdint cimport uint32_t diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx index d4bef24..f6aa31f 100644 --- a/dysgu/extra_metrics.pyx +++ b/dysgu/extra_metrics.pyx @@ -1,4 +1,4 @@ -#cython: language_level=3, boundscheck=True, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +#cython: language_level=3, c_string_type=unicode, c_string_encoding=utf8 import numpy as np cimport numpy as np from dysgu.map_set_utils cimport unordered_map, EventResult diff --git a/dysgu/find_reads.hpp b/dysgu/find_reads.hpp index 7138af1..8a56410 100644 --- a/dysgu/find_reads.hpp +++ b/dysgu/find_reads.hpp @@ -38,7 +38,7 @@ class CoverageTrack ~CoverageTrack() {} std::vector cov_array; // Assume coverage never overflows int32 - int max_coverage; + 
int max_coverage = 250; int index = 0; void set_max_cov(int m) { diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 5d641ba..27a0779 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -1,4 +1,5 @@ -#cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True +# cython: language_level=3, boundscheck=False, c_string_type=unicode, c_string_encoding=utf8, infer_types=True + from __future__ import absolute_import from collections import defaultdict, deque, namedtuple import numpy as np @@ -12,7 +13,7 @@ from dysgu.map_set_utils cimport unordered_map as robin_map, Py_SimpleGraph from dysgu.map_set_utils cimport multimap as cpp_map from dysgu cimport map_set_utils from dysgu.io_funcs import intersecter -from dysgu.map_set_utils cimport unordered_set, cigar_clip, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance +from dysgu.map_set_utils cimport unordered_set, cigar_clip, clip_sizes, clip_sizes_hard, is_reciprocal_overlapping, span_position_distance from dysgu.map_set_utils cimport hash as xxhasher from dysgu.map_set_utils cimport MinimizerTable from dysgu.extra_metrics import BadClipCounter @@ -248,7 +249,9 @@ cdef class ClipScoper: cdef void update(self, r, int input_read, int chrom, int position, unordered_set[int]& clustered_nodes): - clip_left, clip_right = map_set_utils.clip_sizes(r) + cdef int clip_left = 0 + cdef int clip_right = 0 + clip_sizes(r, clip_left, clip_right) if chrom != self.current_chrom: self.scope_left.clear() self.scope_right.clear() @@ -282,10 +285,12 @@ cdef class PairedEndScoper: cdef int clst_dist, max_dist, local_chrom, max_search_depth cdef cpp_map[int, LocalVal] loci # Track the local breaks and mapping locations cdef vector[cpp_map[int, LocalVal]] chrom_scope # Track the mate-pair breaks and locations + cdef public vector[int] found_exact, found2 cdef float norm - cdef float thresh # spd + cdef float thresh # spd cdef float position_distance_thresh cdef bint paired_end + def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh, max_search_depth): self.clst_dist = clst_dist self.max_dist = max_dist @@ -298,135 +303,126 @@ cdef class PairedEndScoper: cdef cpp_map[int, LocalVal] scope for n in range(n_references + 1): # Add one for special 'insertion chromosome' self.chrom_scope.push_back(scope) + cdef void empty_scopes(self) nogil: for idx in range(self.chrom_scope.size()): if not self.chrom_scope[idx].empty(): self.chrom_scope[idx].clear() self.loci.clear() - cdef vector[int] find_other_nodes(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, - ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len) nogil: - # todo make this code less nested and more readable - cdef int idx, i, count_back, steps, node_name2 - cdef int sep = 0 - cdef int sep2 = 0 - cdef vector[int] found2 - cdef vector[int] found_exact - cdef cpp_map[int, LocalVal]* forward_scope = &self.chrom_scope[chrom2] + cdef void erase_items_out_of_range(self, int current_pos) nogil: + cdef cpp_map[int, LocalVal].iterator local_it, local_it2 + cdef cpp_pair[int, LocalVal] vitem + local_it = self.loci.lower_bound(current_pos - self.clst_dist) + local_it2 = self.loci.begin() + while local_it2 != local_it: + vitem = dereference(local_it2) + self.chrom_scope[vitem.second.chrom2].erase(vitem.second.pos2) + preincrement(local_it2) + if local_it != self.loci.begin(): + self.loci.erase(self.loci.begin(), local_it) + + cdef bint process_vitem(self, cpp_pair[int, 
LocalVal] vitem, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len): # nogil: + cdef int node_name2 = vitem.second.node_name + cdef int sep, sep2 + cdef float max_span, span_distance + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or \ + (read_enum == INSERTION and vitem.second.read_enum == DELETION): + return True + if node_name2 == node_name: + return True + + if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): + sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + return False # break the loop + sep2 = c_abs(vitem.second.pos2 - current_pos) + if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150: + if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: + max_span = max(length_from_cigar, vitem.second.length_from_cigar) + span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span + if span_distance < self.position_distance_thresh: + self.found_exact.push_back(node_name2) + else: + self.found2.push_back(node_name2) + # self.found_exact.push_back(node_name2) + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + self.found2.push_back(node_name2) + + return True # continue the loop + + cdef void search_forward(self, cpp_map[int, LocalVal]* forward_scope, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len):# nogil: + cdef cpp_map[int, LocalVal].iterator local_it + cdef cpp_pair[int, LocalVal] vitem + cdef int steps = 0 + local_it = forward_scope.lower_bound(pos2) + while local_it != forward_scope.end() and steps < self.max_search_depth: + vitem = dereference(local_it) + steps += 1 + if not self.process_vitem(vitem, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len): + break + preincrement(local_it) + + cdef void search_backward(self, cpp_map[int, LocalVal]* forward_scope, int node_name, int current_chrom, int current_pos, + int chrom2, int pos2, ReadEnum_t read_enum, int length_from_cigar, bint trust_ins_len):# nogil: + cdef cpp_map[int, LocalVal].iterator local_it + cdef cpp_pair[int, LocalVal] vitem + cdef int steps = 0 + local_it = forward_scope.lower_bound(pos2) + if local_it != forward_scope.begin(): + predecrement(local_it) + else: + return + + while True: + vitem = dereference(local_it) + steps += 1 + if not self.process_vitem(vitem, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len): + break + if local_it == forward_scope.begin() or steps >= self.max_search_depth: + break + predecrement(local_it) + + cdef void find_other_nodes(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, + ReadEnum_t read_enum, int length_from_cigar, bint 
trust_ins_len): # nogil: + if not self.found2.empty(): + self.found2.clear() + if not self.found_exact.empty(): + self.found_exact.clear() + + cdef cpp_map[int, LocalVal]* forward_scope if chrom2 == 10000000: forward_scope = &self.chrom_scope.back() else: forward_scope = &self.chrom_scope[chrom2] - cdef cpp_map[int, LocalVal].iterator local_it, local_it2 - cdef cpp_pair[int, LocalVal] vitem - cdef float max_span, span_distance - # Debug - # echo("current_chrom ", current_chrom, "chrom2", chrom2, "current_pos", current_pos, "pos2", pos2) - # echo("Forward scope len", forward_scope.size(), "Loci scope len", self.loci.size()) - # local_it = forward_scope.begin() - # while local_it != forward_scope.end(): - # vitem = dereference(local_it) - # echo(vitem.first, vitem.second) - # preincrement(local_it) - - # Re-initialize empty + + # Re-initialize if chromosome has changed if current_chrom != self.local_chrom: self.local_chrom = current_chrom self.empty_scopes() - if not self.loci.empty(): - # Erase items out of range in forward scope - local_it = self.loci.lower_bound(current_pos - self.clst_dist) - local_it2 = self.loci.begin() - while local_it2 != local_it: - vitem = dereference(local_it2) - self.chrom_scope[vitem.second.chrom2].erase(vitem.second.pos2) - preincrement(local_it2) - if local_it != self.loci.begin(): - self.loci.erase(self.loci.begin(), local_it) - - local_it = forward_scope.lower_bound(pos2) - steps = 0 - if local_it != forward_scope.end(): - while local_it != forward_scope.end() and steps < self.max_search_depth: - vitem = dereference(local_it) - preincrement(local_it) - steps += 1 - - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - continue - node_name2 = vitem.second.node_name - if node_name2 == node_name: # Can happen due to within-read events - continue - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - if sep >= self.max_dist: - break - sep2 = c_abs(vitem.second.pos2 - current_pos) - if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: - if sep < 150: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - if span_distance < self.position_distance_thresh: - found_exact.push_back(node_name2) - else: - found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - if not found_exact.empty(): - return found_exact - - local_it = forward_scope.lower_bound(pos2) - vitem = dereference(local_it) - if local_it != forward_scope.begin(): - predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice - steps = 0 - while local_it != 
forward_scope.begin() and steps < self.max_search_depth: - vitem = dereference(local_it) - predecrement(local_it) - steps += 1 - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - continue - node_name2 = vitem.second.node_name - if node_name2 == node_name: - continue - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - if sep >= self.max_dist: - break - sep2 = c_abs(vitem.second.pos2 - current_pos) - if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: - if sep < 150: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - if span_distance < self.position_distance_thresh: - found_exact.push_back(node_name2) - else: - found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) - - if not found_exact.empty(): - return found_exact - else: - return found2 + if not self.loci.empty(): + self.erase_items_out_of_range(current_pos) + self.search_forward(forward_scope, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len) + if not self.found_exact.empty(): + return + self.search_backward(forward_scope, node_name, current_chrom, current_pos, chrom2, pos2, + read_enum, length_from_cigar, trust_ins_len) cdef void add_item(self, int node_name, int current_chrom, int current_pos, int chrom2, int pos2, - ReadEnum_t read_enum, int length_from_cigar): # nogil: - - # Add to scopes, if event is within read, add two references to forward scope. Otherwise when the - # first break point drops from the local scope, the forward scope break will not connect with other - # events that are added later. This is a fix for long read deletions mainly + ReadEnum_t read_enum, int length_from_cigar): # nogil: if chrom2 == -1: return # No chrom2 was set, single-end? 
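
# --- Illustrative sketch (not part of the patch): the search pattern that
# search_forward/search_backward above implement over the C++ sorted map.
# Candidates are keyed by breakpoint position; we seek lower_bound(pos2), then
# scan outwards in both directions, capping each direction at max_search_depth
# steps and stopping a direction early once the separation reaches max_dist.
# SortedDict is a Python stand-in for cpp_map; find_nearby is a hypothetical name.
from sortedcontainers import SortedDict

def find_nearby(scope, pos2, max_dist, max_depth):
    hits = []
    keys = scope.keys()
    start = scope.bisect_left(pos2)  # ~ cpp_map.lower_bound
    i, steps = start, 0
    while i < len(keys) and steps < max_depth:  # forward scan
        if abs(keys[i] - pos2) >= max_dist:
            break
        hits.append(scope[keys[i]])
        i += 1
        steps += 1
    i, steps = start - 1, 0
    while i >= 0 and steps < max_depth:  # backward scan, one before lower bound
        if abs(keys[i] - pos2) >= max_dist:
            break
        hits.append(scope[keys[i]])
        i -= 1
        steps += 1
    return hits

scope = SortedDict({100: "a", 180: "b", 900: "c"})
assert find_nearby(scope, 150, max_dist=200, max_depth=10) == ["b", "a"]
# --- end sketch
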
cdef cpp_map[int, LocalVal]* forward_scope @@ -434,6 +430,7 @@ cdef class PairedEndScoper: forward_scope = &self.chrom_scope.back() else: forward_scope = &self.chrom_scope[chrom2] + # Add to local scope cdef cpp_pair[int, LocalVal] pp pp.first = current_pos @@ -600,6 +597,7 @@ cdef class NodeToName: if self.stored_nodes[query_node].hash_val == self.stored_nodes[target_node].hash_val: return True + cdef get_query_pos_from_cigarstring(cigar, pos): # Infer the position on the query sequence of the alignment using cigar string cdef int end = 0 @@ -650,17 +648,6 @@ cdef void parse_cigar(str cigar, int *start, int *end, int *ref_end): in_lead = 0 c_cigar += 1 -def get_query_pos_from_cigartuples(r, query_length): - # Infer the position on the query sequence of the alignment using cigar string - start = 0 - # query_length = r.infer_read_length() # Note, this also counts hard-clips - end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: - start += r.cigartuples[0][1] - if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: - end -= r.cigartuples[-1][1] - return start, end, query_length - AlnBlock = namedtuple("SA", ["query_start", "query_end", "ref_start", "ref_end", "chrom", "mq", "strand", "this"]) JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "query_pos", "query_end", "read_enum", "cigar_index"]) @@ -675,6 +662,7 @@ class AlignmentsSA: self.join_result = [] self.query_length = r.infer_read_length() # Note, this also counts hard-clips self._alignments_from_sa(r, gettid) + self.cigar_l = 0 def connect_alignments(self, r, max_dist=1000, mq_thresh=0, read_enum=0): if len(self.query_aligns) > 1 and self.index is not None: @@ -683,15 +671,35 @@ class AlignmentsSA: if self.index < len(self.query_aligns) - 1: self._connect_right(r, max_dist, mq_thresh, read_enum) - def _alignments_from_sa(self, r, gettid): - qstart, qend, query_length = get_query_pos_from_cigartuples(r, self.query_length) + def _alignments_from_sa(self, AlignedSegment r, gettid): + + cdef int ref_start, ref_end, query_start, query_end, qstart, qend + + cdef uint32_t cigar_value, cigar_index + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int event_pos, opp, length + + qstart = 0 + qend = self.query_length + cigar_l = r._delegate.core.n_cigar + self.cigar_l = cigar_l + cigar_p = bam_get_cigar(r._delegate) + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + qstart += cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + qend += cigar_value >> 4 + this_aln = AlnBlock(query_start=qstart, query_end=qend, ref_start=r.pos, ref_end=r.reference_end, chrom=r.rname, mq=r.mapq, strand="-" if r.flag & 16 else "+", this=True) self.aln_strand = this_aln.strand query_aligns = [this_aln] - cdef int ref_start, ref_end, query_start, query_end for sa_block in r.get_tag("SA").split(";"): if sa_block == "": break @@ -702,7 +710,7 @@ class AlignmentsSA: ref_end = ref_start parse_cigar(sa[3], &query_start, &query_end, &ref_end) if this_aln.strand != sa[2]: - start_temp = query_length - query_end + start_temp = self.query_length - query_end query_end = start_temp + query_end - query_start query_start = start_temp query_aligns.append(AlnBlock(query_start, query_end, ref_start, ref_end, gettid(sa[0]), int(sa[4]), sa[2], False)) @@ -751,7 +759,7 @@ class AlignmentsSA: query_pos = self.query_length - query_end query_end = self.query_length - qtemp chrom = a.chrom - cigar_index = len(r.cigartuples) - 1 + cigar_index = 
self.cigar_l - 1 if b.strand == self.aln_strand: pos2 = b.ref_start else: @@ -802,15 +810,28 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ genome_scanner.add_to_buffer(r, node_name, tell) # Add read to buffer both_overlap = p1_overlaps and p2_overlaps + + # Vectors for making edges. 'exact' nodes are nodes with highly similar signatures + cdef vector[int] *found_nodes_exact = &pe_scope.found_exact + cdef vector[int] *found2 = &pe_scope.found2 + if not paired_end or (paired_end and read_enum != BREAKEND and not mm_only and not chrom2 == -1): - other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) - for other_node in other_nodes: - if node_to_name.same_template(node_name, other_node): - continue - if not G.hasEdge(node_name, other_node): - G.addEdge(node_name, other_node, 2) # 'black' edge - # if r.cigartuples[cigar_index][1] == 288 and event_pos > 159764: - # echo("be", node_name, other_node) + pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, + trust_ins_len) + + # other_nodes = pe_scope.find_other_nodes(node_name, chrom, event_pos, chrom2, pos2, read_enum, length_from_cigar, trust_ins_len) + if not dereference(found_nodes_exact).empty(): + for other_node in dereference(found_nodes_exact): + if node_to_name.same_template(node_name, other_node): + continue + if not G.hasEdge(node_name, other_node): + G.addEdge(node_name, other_node, 2) # 'black' edge + else: + for other_node in dereference(found2): + if node_to_name.same_template(node_name, other_node): + continue + if not G.hasEdge(node_name, other_node): + G.addEdge(node_name, other_node, 2) # 'black' edge elif chrom != chrom2 and clip_l != -1: # Note all paired-end reads have BREAKENDS where chrom != chrom2, but also includes translocations cluster_clipped(G, r, clip_scope, chrom, event_pos, node_name) @@ -846,22 +867,34 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length): if len(quals) == 0: return 1 cdef char* char_ptr_rseq = bam_get_seq(r._delegate) - cdef int i, length, w_sum - cdef int window_length = 10 + cdef uint32_t i, length, w_sum + cdef uint32_t window_length = 10 if clip_length < window_length: window_length = clip_length cdef float avg_qual cdef int poly = window_length - 1 cdef total_good = window_length - 1 - ct = r.cigartuples - if r.flag & 2304: # supplementary, usually hard-clipped, assume good clipping - if ct[0][0] == 5 or ct[-1][0] == 5: + + cdef uint32_t first_cigar_value, last_cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef uint32_t opp + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + first_cigar_value = cigar_p[0] + last_cigar_value = cigar_p[cigar_l -1] + + if r.flag & 2304: # supplementary, usually hard-clipped + if r.mapq < 20 and first_cigar_value & 15 == 5 and last_cigar_value & 15 == 5: # hard clipped both sides + return 0 + if first_cigar_value & 15 == 5 or last_cigar_value & 15 == 5: return 1 cdef int[17] basecounts # char value is index into array cdef bint homopolymer cdef int base_index - if ct[0][0] == 4: - length = r.cigartuples[0][1] + + if first_cigar_value & 15 == 4: # left soft-clip + length = first_cigar_value >> 4 if length >= window_length and length >= clip_length: for i in range(length - window_length, -1, -1): # average of current window by counting leftwards @@ -886,8 +919,8 @@ cdef int good_quality_clip(AlignedSegment r, int clip_length): return 1 total_good = 
window_length - 1 - if ct[-1][0] == 4: - length = r.cigartuples[-1][1] + if last_cigar_value & 15 == 4: + length = last_cigar_value >> 4 if length >= window_length and length >= clip_length: for i in range(len(r.query_qualities) - length, len(r.query_qualities) - window_length): # average of current window by counting rightwards @@ -933,6 +966,14 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int cdef uint64_t v cdef bint success cdef bint good_clip + + cdef uint32_t first_cigar_value, last_cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef uint32_t opp + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if paired_end and read_enum == SPLIT and flag & 8: # clip event, or whole read, but mate is unmapped return @@ -979,7 +1020,8 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int return if read_enum == DELETION or read_enum == INSERTION: chrom2 = chrom - if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + # if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + if cigar_p[cigar_index] & 15 != 1: # not insertion, use length of cigar event pos2 = cigar_pos2 else: pos2 = event_pos @@ -1000,7 +1042,9 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int return if read_enum == DELETION or read_enum == INSERTION: chrom2 = chrom - if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + + # if r.cigartuples[cigar_index][0] != 1: # not insertion, use length of cigar event + if cigar_p[cigar_index] & 15 != 1: pos2 = cigar_pos2 else: pos2 = event_pos @@ -1033,7 +1077,8 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int if read_enum == BREAKEND: return chrom2 = r.rname - if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion + # if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion + if cigar_index != -1 and cigar_p[cigar_index] & 15 != 1: # If not insertion pos2 = cigar_pos2 else: pos2 = event_pos @@ -1193,7 +1238,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph - cdef int event_pos, cigar_index, opp, length + node_to_name = NodeToName() # Map of nodes -> read ids cdef ClipScoper_t clip_scope = ClipScoper(minimizer_dist, k=k, m=m, clip_length=clip_l, # Keeps track of local reads minimizer_support_thresh=minimizer_support_thresh, @@ -1220,8 +1265,11 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering cdef uint32_t cigar_value cdef uint32_t cigar_l cdef uint32_t *cigar_p + cdef int event_pos, cigar_index, opp, length cdef long n_aligned_bases = 0 cdef int elide_threshold = 150 + cdef int left_clip_size, right_clip_size + for chunk in genome_scanner.iter_genome(): for r, tell in chunk: if r.mapq < mapq_thresh: @@ -1286,7 +1334,9 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering # Whole alignment will be used, try infer position from soft-clip cigar_index = -1 pos2 = -1 - left_clip_size, right_clip_size = clip_sizes_hard(r) # soft and hard-clips + left_clip_size = 0 + right_clip_size = 0 + clip_sizes_hard(r, left_clip_size, right_clip_size) # soft and hard-clips if r.flag & 8 and clipped: # 
paired by inference # skip if both ends are clipped, usually means its a chunk of badly mapped sequence # if not (left_clip_size and right_clip_size) and ((paired_end and good_quality_clip(r, 20)) or (not paired_end and) ): diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index 84960a2..4c37bf6 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -1,10 +1,9 @@ -#cython: language_level=2, boundscheck=True, wraparound=True -#distutils: language=c++ +#cython: language_level=3 import numpy as np cimport numpy as np import logging -from map_set_utils import merge_intervals, echo +from dysgu.map_set_utils import merge_intervals, Py_BasicIntervalTree, echo from collections import defaultdict from importlib.metadata import version import sortedcontainers @@ -12,37 +11,33 @@ import pandas as pd import os import sys import gzip -from dysgu.map_set_utils import Py_BasicIntervalTree import random from libc.stdlib cimport malloc -cdef char *basemap = [ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', 'T', '\0', 'G', '\0', '\0', '\0', 'C', '\0', '\0', '\0', '\0', '\0', '\0', 'N', '\0', - '\0', '\0', '\0', '\0', 'A', 'A', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', 't', '\0', 'g', '\0', '\0', '\0', 'c', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', - '\0', '\0', '\0', '\0', 'a', 'a' ] +cdef char *basemap = [ b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'T', b'\0', b'G', b'\0', b'\0', b'\0', b'C', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'N', b'\0', + b'\0', b'\0', b'\0', b'\0', b'A', b'A', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b't', b'\0', b'g', b'\0', b'\0', b'\0', b'c', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', b'\0', + b'\0', b'\0', b'\0', b'\0', b'a', b'a' ] np.random.seed(0) random.seed(0) cpdef str reverse_complement(str seq, int seq_len): - """https://bioinformatics.stackexchange.com/questions/3583/\ - what-is-the-fastest-way-to-get-the-reverse-complement-of-a-dna-sequence-in-pytho/3595#3595""" - cdef char *seq_dest = malloc(seq_len + 1) - seq_dest[seq_len] = '\0' + seq_dest[seq_len] = b'\0' - cdef bytes py_bytes = seq.encode('UTF-8') + cdef bytes py_bytes = seq.encode('ascii') cdef char *seq_src = py_bytes cdef int i = 0 for i in range(seq_len): seq_dest[seq_len - i - 1] = basemap[seq_src[i]] - return seq_dest[:seq_len].decode('UTF-8') + return seq_dest[:seq_len].decode('ascii') def bed_iter(path): diff --git a/dysgu/main.py b/dysgu/main.py index 1497dfa..eb25979 100644 --- a/dysgu/main.py +++ b/dysgu/main.py @@ -1,3 +1,5 @@ +# cython: language_level=3 + from __future__ import absolute_import import click import os @@ -126,10 +128,11 @@ def apply_preset(kwargs): kwargs["mode"] = "nanopore-r10" if kwargs["mode"] != "pe": kwargs["paired"] = "False" - p = 
presets[kwargs["mode"]] options = new_options_set options = {k: v for k, v in options.items() if v is not None} - kwargs.update(defaults) + for k, v in defaults.items(): + if kwargs.get(k) is None: + kwargs[k] = v kwargs.update(presets[kwargs["mode"]].items()) kwargs.update(options) @@ -250,7 +253,7 @@ def cli(): default="True", type=click.Choice(["True", "False"]), show_default=True) @click.option("--drop-gaps", help="Drop SVs near gaps +/- 250 bp of Ns in reference", default="True", type=click.Choice(["True", "False"]), show_default=True) -@click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold. Default for paired-end data is (insert-median + 5*insert_std) for paired reads, or 700 bp for single-end reads", +@click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold. Default for paired-end data is (insert-median + 5*insert_std) for paired reads, or 1000 bp for single-end reads", default=None, type=int, show_default=False) @click.option("--paired", help="Paired-end reads or single", default="True", show_default=True, type=click.Choice(["True", "False"])) @@ -412,7 +415,7 @@ def get_reads(ctx, **kwargs): @click.option("--drop-gaps", help="Drop SVs near gaps +/- 250 bp of Ns in reference", default="True", type=click.Choice(["True", "False"]), show_default=True) @click.option("--merge-dist", help="Attempt merging of SVs below this distance threshold, default is (insert-median + 5*insert_std) for paired" - "reads, or 700 bp for single-end reads", + "reads, or 1000 bp for single-end reads", default=None, type=int, show_default=False) @click.option("--paired", help="Paired-end reads or single", default="True", show_default=True, type=click.Choice(["True", "False"])) @@ -594,3 +597,7 @@ def test_command(ctx, **kwargs): click.echo("PASS: " + c + "\n", err=True) logging.info("Run test complete") + + +if __name__ == "__main__": + cli() diff --git a/dysgu/map_set_utils.pxd b/dysgu/map_set_utils.pxd index 587e54b..c33d7ff 100644 --- a/dysgu/map_set_utils.pxd +++ b/dysgu/map_set_utils.pxd @@ -9,6 +9,9 @@ cimport numpy as np from libc.stdint cimport uint64_t, int32_t, int8_t +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_cigar + ctypedef cpp_vector[int] int_vec_t ctypedef cpp_pair[int, int] get_val_result @@ -291,9 +294,9 @@ cdef extern from "" namespace "std" nogil: cdef int cigar_exists(r) -cdef tuple clip_sizes(r) +cdef void clip_sizes(AlignedSegment r, int& left, int& right) -cdef tuple clip_sizes_hard(r) +cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right) cdef int cigar_clip(r, int clip_length) @@ -322,4 +325,4 @@ cdef class EventResult: cdef public bint preciseA, preciseB, linked, modified, remapped cdef public int8_t svlen_precise cdef public object contig, contig2, contig_cigar, contig2_cigar, svtype, join_type, chrA, chrB, exp_seq, sample, type, \ - partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info + partners, GQ, GT, kind, ref_seq, variant_seq, left_ins_seq, right_ins_seq, site_info, qnames diff --git a/dysgu/map_set_utils.pyx b/dysgu/map_set_utils.pyx index e97101b..e883dd9 100644 --- a/dysgu/map_set_utils.pyx +++ b/dysgu/map_set_utils.pyx @@ -1,7 +1,5 @@ #cython: language_level=3 import click -import numpy as np -cimport numpy as np import cython import time import logging @@ -12,6 +10,9 @@ from libc.stdlib cimport abs as c_abs from libc.math cimport fabs as c_fabs from libc.stdint cimport 
uint32_t, uint16_t, int16_t, int32_t +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport bam_get_qname, bam_seqi, bam_get_seq, bam_get_cigar + ctypedef cpp_pair[int, int] cpp_item ctypedef cpp_pair[long, int] cpp_long_item @@ -245,35 +246,70 @@ cdef int cigar_exists(r): return 0 -cdef tuple clip_sizes(r): - c = r.cigartuples - if not c: - return 0, 0 - - cdef int left = 0 - cdef int right = 0 - - if c[0][0] == 4: - left = c[0][1] - if c[-1][0] == 4: - right = c[-1][1] - return left, right - - -cdef tuple clip_sizes_hard(r): - c = r.cigartuples - if not c: - return 0, 0 - - cdef int left = 0 - cdef int right = 0 - c1 = c[0][0] - if c1 == 4 or c1 == 5: - left = c[0][1] - c1 = c[-1][0] - if c1 == 4 or c1 == 5: - right = c[-1][1] - return left, right +cdef void clip_sizes(AlignedSegment r, int& left, int& right): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4: + left = cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4: + right = cigar_value >> 4 + + # c = r.cigartuples + # if not c: + # return 0, 0 + # + # cdef int left = 0 + # cdef int right = 0 + # + # if c[0][0] == 4: + # left = c[0][1] + # if c[-1][0] == 4: + # right = c[-1][1] + # return left, right + + +cdef void clip_sizes_hard(AlignedSegment r, int& left, int& right): + cdef uint32_t cigar_value + cdef uint32_t cigar_l + cdef uint32_t *cigar_p + cdef int opp, length + + cigar_l = r._delegate.core.n_cigar + cigar_p = bam_get_cigar(r._delegate) + if cigar_l == 0: + return + cigar_value = cigar_p[0] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + left = cigar_value >> 4 + cigar_value = cigar_p[cigar_l - 1] + opp = cigar_value & 15 + if opp == 4 or opp == 5: + right = cigar_value >> 4 + + # c = r.cigartuples + # if not c: + # return 0, 0 + # + # cdef int left = 0 + # cdef int right = 0 + # c1 = c[0][0] + # if c1 == 4 or c1 == 5: + # left = c[0][1] + # c1 = c[-1][0] + # if c1 == 4 or c1 == 5: + # right = c[-1][1] + # return left, right cdef int cigar_clip(r, int clip_length): @@ -447,6 +483,7 @@ cdef class EventResult: self.n_sa = 0 self.n_gaps = 0 self.compress = 0 + self.qnames = set([]) def __repr__(self): return str(to_dict(self)) diff --git a/dysgu/merge_svs.pyx b/dysgu/merge_svs.pyx index b2c6cff..a3b8f1a 100644 --- a/dysgu/merge_svs.pyx +++ b/dysgu/merge_svs.pyx @@ -1,15 +1,19 @@ # cython: language_level=3 + from __future__ import absolute_import + +from multiprocessing import Pool + import numpy as np import random -from collections import defaultdict +from collections import defaultdict, deque import networkx as nx from dysgu import consensus, io_funcs from dysgu.map_set_utils cimport is_reciprocal_overlapping, EventResult from dysgu.map_set_utils import echo from dysgu.io_funcs import intersecter -import itertools from cython.operator import dereference +from functools import cmp_to_key ctypedef EventResult EventResult_t @@ -26,15 +30,20 @@ def get_chrom_key(ei): def compare_subset(potential, int max_dist, int max_comparisons): tmp_list = defaultdict(list) cdef int idx, jdx, dist1, ci_a + cdef int half_d = (max_dist * 0.5) for idx in range(len(potential)): ei = potential[idx] - tmp_list[get_chrom_key(ei)].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + chrom_key = get_chrom_key(ei) + 
tmp_list[chrom_key].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + if ei.chrA == ei.chrB and ei.svlen > half_d: + tmp_list[chrom_key].append( + (ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) nc2 = {k: io_funcs.iitree(v, add_value=True) for k, v in tmp_list.items()} for idx in range(len(potential)): ei = potential[idx] ols = nc2[get_chrom_key(ei)].allOverlappingIntervals(ei.posA, ei.posA + 1) - ols.remove(idx) + ols = [i for i in set(ols) if i != idx] if len(ols) > max_comparisons: random.shuffle(ols) ols = ols[:max_comparisons] @@ -43,11 +52,6 @@ def compare_subset(potential, int max_dist, int max_comparisons): yield ei, ej, idx, jdx -def compare_all(potential): - for idx, jdx in itertools.product(range(len(potential)), range(len(potential))): - yield potential[idx], potential[jdx], idx, jdx - - cdef span_position_distance(ei, ej): if ei.svtype != "INS": span1 = abs(ei.posB - ei.posA) @@ -129,7 +133,6 @@ def consistent_alignment_and_cigars(ei, ej, l_ratio): if ei.contig_ref_end > 0 and ej.contig_ref_end > 0: ref_overlap = min(ei.contig_ref_end, ej.contig_ref_end) - max(ei.contig_ref_start, ej.contig_ref_start) if ref_overlap < 0: - #echo("no ref overlap", (ei.contig_ref_start, ej.contig_ref_start), (ei.contig_ref_end, ej.contig_ref_end)) return False else: return True @@ -147,16 +150,20 @@ def consistent_alignment_and_cigars(ei, ej, l_ratio): if ej.svtype == "DEL": diff_i = sum(l for op, l in ei.contig_cigar if op == 2 and l > 10) diff_j = sum(l for op, l in ej.contig_cigar if op == 2 and l > 10) - #echo(ei.contig_cigar) - #echo(ej.contig_cigar) tot_non_ref = diff_i + diff_j tot_sv_len = ei.svlen + ej.svlen ratio = tot_non_ref / tot_sv_len - #echo("RATIO", ratio, tot_non_ref, tot_sv_len) if ratio > 1.8: # or ratio < 0.25: return False return True + +def jaccard_similarity(set1, set2): + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + return intersection / union if union != 0 else 0 + + def get_consensus_seqs(ei, ej): ci = ei.contig ci2 = ei.contig2 @@ -178,8 +185,6 @@ def get_consensus_seqs(ei, ej): def contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): if ci and cj: yield ci, cj - if ci_alt and cj_alt: - yield ci_alt, cj_alt if ci2 and cj2: yield ci2, cj2 if ci2 and cj: @@ -194,263 +199,408 @@ def contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): yield ci, cj_alt if ci2 and cj_alt: yield ci2, cj_alt + if ci_alt and cj_alt: + yield ci_alt, cj_alt -cdef int matches_with_max_gap(str cigar, int max_gap): +cdef int matches_with_max_gap(char *c_cigar, int max_gap): # get matching bases, and filter large alignment gaps cdef: - # int idx = 0 int matches = 0 + int gaps = 0 int num = 0 # To accumulate the number as we parse through the string char op - cdef bytes t = cigar.encode('utf8') - cdef char *c_cigar = t # Convert Python string to char* - # cigartuples = [] + while dereference(c_cigar): if b'0' <= dereference(c_cigar) <= b'9': # Convert digit char to int and accumulate num = num * 10 + (dereference(c_cigar) - 48 ) else: op = dereference(c_cigar) - if (op == b'D' or op == b'I') and num > max_gap: - return 0 + if op == b'D' or op == b'I': + if num > max_gap: + return 0 + gaps += num if op == b'M': matches += num - # cigartuples.append((cigar[idx], num)) num = 0 c_cigar += 1 - # idx += 1 + if matches and gaps / matches > 0.05: #todo + return 0 return matches -cdef bint bad_alignment(alignment, ei, ej, v): - if not alignment: - return True - cdef int matches = 
matches_with_max_gap(alignment.cigar, 30) - if matches == 0: +cdef bint bad_insertion_coverage(char *c_cigar, seq, bint is_query): + # Count the matching bases over the lowercase 'insertion' seq only + cdef: + int seq_index = 0 + int matches = 0 + int gaps = 0 + int num = 0 + int i + char op + while dereference(c_cigar): + if b'0' <= dereference(c_cigar) <= b'9': + num = num * 10 + (dereference(c_cigar) - 48) + else: + op = dereference(c_cigar) + if op == b'D' or op == b'I': + if seq[seq_index].islower(): + gaps += num + if op == b'D' and not is_query: + seq_index += num + if op == b'M': + for i in range(seq_index, seq_index + num - 1): + if seq[i].islower(): + matches += 1 + seq_index += num + num = 0 + c_cigar += 1 + if not matches or gaps / matches > 0.05: #todo + return True + return False + + +cdef bint bad_alignment(alignment, ei, ej, v, paired_end): + if not alignment: return True qs = alignment.query_begin qe = alignment.query_end + qlen = len(v[0]) + tlen = len(v[1]) ts = alignment.target_begin te = alignment.target_end_optimal - total_sv_length = ei.svlen + ej.svlen - unaligned_bases = min(qs, ts) + min(len(v[0]) - qe, (len(v[1]) - te)) - if unaligned_bases > total_sv_length: + + q_right_clip = qlen - qe + t_right_clip = tlen - te + + if not paired_end: + soft_clip_tolerance = max(10, min(qlen, tlen) * 0.04) + matches_threshold = min(qlen, tlen) * 0.6 + max_gap = 15 + ins_cov = 0.8 + else: + soft_clip_tolerance = 10 + matches_threshold = min(qlen, tlen) * 0.5 + max_gap = 5 + ins_cov = 0.9 + + cdef bytes t = alignment.cigar.encode('utf8') + cdef char *c_cigar = t + + # Skip alignments with larger gaps or not many matching bases + cdef int matches = matches_with_max_gap(c_cigar, max_gap) + if matches == 0 or matches < matches_threshold: return True + # Keep if aligned inserted bases ~ svlen + if (ei.svtype == "INS" and ei.spanning > 0) or (ej.svtype == "INS" and ej.spanning > 0): + a_into_b = True + b_into_a = True + if ei.svtype == "INS" and ei.spanning > 0: + ei_ins_count = len([i for i in v[0][qs:qe] if i.islower()]) + if ei_ins_count / ei.svlen < ins_cov: + a_into_b = False + elif bad_insertion_coverage(c_cigar, v[0][qs:qe], True): + a_into_b = False + + if ej.svtype == "INS" and ej.spanning > 0: + ej_ins_count = len([i for i in v[1][ts:te] if i.islower()]) + if bad_insertion_coverage(c_cigar, v[1][ts:te], False): + b_into_a = False + elif ej_ins_count / ej.svlen < ins_cov: + b_into_a = False + if not a_into_b and not b_into_a: + return True + + # Keep, if one sequence is more or less completely aligned to the other + if qs < soft_clip_tolerance and q_right_clip < soft_clip_tolerance: + return False + if ts < soft_clip_tolerance and t_right_clip < soft_clip_tolerance: + return False + + if qs > soft_clip_tolerance: + if q_right_clip > soft_clip_tolerance: + return True + if ts > soft_clip_tolerance: + return True + + if ts > soft_clip_tolerance: + if t_right_clip > soft_clip_tolerance: + return True + if qs > soft_clip_tolerance: + return True + + if q_right_clip > soft_clip_tolerance: + if qs > soft_clip_tolerance: + return True + if t_right_clip > soft_clip_tolerance: + return True + + if t_right_clip > soft_clip_tolerance: + if ts > soft_clip_tolerance: + return True + if q_right_clip > soft_clip_tolerance: + return True + + return False # ok + + +def process_contig_aignments(ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample): + for v in contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): + if not v[0] or not v[1]: + continue + if same_sample: + 
res = consensus.check_contig_match(v[0], v[1], return_int=False) + if bad_alignment(res, ei, ej, v, paired_end): + break + return idx, jdx + # echo("---->MERGED3", ei.svlen, ej.svlen, ei.svtype, (idx, jdx)) + + elif consensus.check_contig_match(v[0], v[1], return_int=True): + return idx, jdx + def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, rel_diffs=False, diffs=15, - same_sample=True, aggressive_ins_merge=False, debug=False, max_comparisons=20): - event_iter = compare_all(potential) if len(potential) < 50 else compare_subset(potential, max_dist, max_comparisons) + same_sample=True, aggressive_ins_merge=False, debug=False, max_comparisons=20, procs=1): + event_iter = compare_subset(potential, max_dist, max_comparisons) - seen, disjoint_nodes = set(), set() + seen, disjoint_edges = set(), set() out_edges = defaultdict(int) - #echo("LEN potential", len(potential), paired_end) + + avg_su_thresh = max(np.median([i.su for i in potential]) * 0.4, 10) + + job = [] + pool = None + if procs > 1: + pool = Pool(procs) for ei, ej, idx, jdx in event_iter: + if len(job) > 1000: + if procs == 1: + for item in job: + edge = process_contig_aignments(*item) + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) + else: + for edge in pool.starmap(process_contig_aignments, job): + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) + job = [] + i_id, j_id = ei.event_id, ej.event_id - ins_dup = (ei.svtype == "INS" and ej.svtype == "DUP") or (ei.svtype == "DUP" and ej.svtype == "INS") + id_key = (min(idx, jdx), max(idx, jdx)) + + # if id_key not in seen: + # echo("merge candidate", ei.svlen, ej.svlen, "positions", ei.posA, ej.posA, ei.svtype, ej.svtype, 'SU=', ei.su, ej.su) + + if ei.su > avg_su_thresh and ej.su > avg_su_thresh and ei.spanning > 0 and ej.spanning > 0: + disjoint_edges.add(id_key) - if (i_id == j_id or (i_id, j_id) in seen or (j_id, i_id) in seen or - out_edges[idx] > 10 or out_edges[jdx] > 10 or - (not same_sample and ei.sample == ej.sample) or - (ej.svtype == "DEL" and ei.svtype != "DEL") or - (ei.svtype == "DEL" and ej.svtype != "DEL")): + if (same_sample and i_id == j_id) or \ + (not same_sample and ei.sample == ej.sample) or \ + id_key in seen or \ + out_edges[idx] > 10 or out_edges[jdx] > 10: + # or + # (ej.svtype == "DEL" and ei.svtype != "DEL") or + # (ei.svtype == "DEL" and ej.svtype != "DEL")): # (ei.svtype in {"INS", "DUP"} and ej.svtype not in {"INS", "DUP"}) or # ei.svtype != ej.svtype): + continue - continue + seen.add(id_key) - seen.add((i_id, j_id)) intra = ei.chrA == ej.chrA and ei.chrB == ej.chrB - loci_similar = True - if paired_end: - loci_similar = similar_locations(intra, ei, ej) - if not loci_similar: - #echo("loci not similar") - continue + # loci_similar = True + # if paired_end: + # loci_similar = similar_locations(intra, ei, ej) + # if not loci_similar: + # continue if not intra: out_edges[idx] += 1 out_edges[jdx] += 1 - G.add_edge(i_id, j_id) - #echo("not intra added") + G.add_edge(idx, jdx) continue - #echo("merge candidate", ei.svlen, ej.svlen, "positions", ei.posA, ej.posA, ei.svtype, ej.svtype) - - one_is_imprecise = (not ei.preciseA or not ei.preciseB or ei.svlen_precise or - not ej.preciseA or not ej.preciseB or ej.svlen_precise) + if ei.spanning > 0 and ej.spanning > 0 and jaccard_similarity(ei.qnames, ej.qnames) > 0.1: + disjoint_edges.add(id_key) + continue any_contigs_to_check, ci, ci2, ci_alt, cj, cj2, cj_alt = get_consensus_seqs(ei, ej) if paired_end: overlap = max(0, 
min(ei.posB, ej.posB) - max(ei.posA, ej.posA))
             if ei.spanning > 0 and ej.spanning > 0 and overlap == 0 and ei.svtype != "INS":
-                disjoint_nodes.add(i_id)
-                disjoint_nodes.add(j_id)
-                #echo("filtered disjoint")
+                disjoint_edges.add(id_key)
                 continue
         if (same_sample and ei.svtype == "DEL" and ei.su < 3 and ej.su < 3 and not any_contigs_to_check and
                 ei.spanning == 0 and ej.spanning == 0 and ei.sc == 0 and ej.sc == 0):
-            #echo("other1")
             continue
         ml = max(int(ei.svlen), int(ej.svlen))
         if ml == 0 and ei.svtype != 'TRA':
-            #echo("other2")
             continue
         l_ratio = min(int(ei.svlen), int(ej.svlen)) / ml if ml else 1
-        merge_conditions_met = False
-        if ins_dup or ei.svtype == "TRA" or ej.svtype == "TRA":
-            merge_conditions_met = True
-        elif ei.svtype == "INS":
-            if aggressive_ins_merge or paired_end: #(paired_end and isinstance(ei.variant_seq, str) and isinstance(ej.variant_seq, str) and l_ratio > 0.7):
-                merge_conditions_met = True
-            elif ml > 0 and l_ratio > 0.7:
-                merge_conditions_met = True
-        else:
-            spd = span_position_distance(ei, ej)
+        if paired_end:  # Merge insertion-like sequences aggressively
+            ei_ins_like = ei.svtype in "INSDUPTRA"
+            ej_ins_like = ej.svtype in "INSDUPTRA"
             recpi_overlap = is_reciprocal_overlapping(ei.posA, ei.posB, ej.posA, ej.posB)
-            both_in_include = intersecter(tree, ei.chrA, ei.posA, ei.posA + 1) and intersecter(tree, ei.chrB, ei.posB, ei.posB + 1)
-
-            merge_conditions_met = (
-                    (paired_end and ei.spanning > 0 and ej.spanning > 0 and (recpi_overlap or spd > 0.3)) or
-                    ((recpi_overlap or spd > 0.3 or (loci_similar and any_contigs_to_check)) and not both_in_include)
-            )
-
-
-        if not merge_conditions_met:
-            #echo("merge conditions not met", i_id, j_id, ei.svlen, ej.svlen)
-            continue
+            if ei_ins_like and ej_ins_like and abs(ei.posA - ej.posA) < 50 and (ei.spanning == 0 and ej.spanning == 0 and ei.remap_score == 0 and ej.remap_score == 0):
+                G.add_edge(idx, jdx, loci_same=False)
+                continue
+            elif recpi_overlap and ei.svtype == "DEL" and ej.svtype == "DEL" and ei.remap_score > 0 and ej.remap_score > 0:
+                G.add_edge(idx, jdx, loci_same=False)
+                continue
 
-        if same_sample and not paired_end and not consistent_alignment_and_cigars(ei, ej, l_ratio):
-            #echo("inconsistent cigars")
-            continue
 
         # Loci are similar, check contig match or reciprocal overlap
         if not any_contigs_to_check:
+            one_is_imprecise = (not ei.preciseA or not 
ei.preciseB or ei.svlen_precise or + not ej.preciseA or not ej.preciseB or ej.svlen_precise) if ml > 0 and (l_ratio > 0.5 or (one_is_imprecise and l_ratio > 0.3)): out_edges[idx] += 1 out_edges[jdx] += 1 - G.add_edge(i_id, j_id, loci_same=False) - #echo("MERGED2", ei.svlen, ej.svlen, ei.svtype) + G.add_edge(idx, jdx, loci_same=False) + # echo("MERGED2", ei.svlen, ej.svlen, ei.svtype) continue - #echo("no contigs to check fail") + elif procs > 1: + job.append( + (ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample) + ) else: - #echo("processing contig pairs", ci, ci2, ci_alt, cj, cj2, cj_alt) - for v in contig_pairs_iter(ci, ci2, ci_alt, cj, cj2, cj_alt): - # echo(v) - if same_sample: - res = consensus.check_contig_match(v[0], v[1], return_int=False) - if bad_alignment(res, ei, ej, v): - continue - - G.add_edge(i_id, j_id, loci_same=True) - #echo("MERGED3", ei.svlen, ej.svlen, ei.svtype) - - elif consensus.check_contig_match(v[0], v[1], return_int=True): - G.add_edge(i_id, j_id, loci_same=True) - else: - continue - #echo("no align", v[0], v[1]) - break - - - - + edge = process_contig_aignments(ci, ci2, ci_alt, cj, cj2, cj_alt, ei, ej, paired_end, idx, jdx, same_sample) + if not edge: + continue + else: + G.add_edge(edge[0], edge[1], loci_same=True) - #v = (ci_alt, cj_alt) if ci_alt and cj_alt else (ci, cj) if ci and cj else None - - # v = ((ci, cj) if (ci and cj) else (ci_alt, cj_alt)) if ci_alt and cj_alt else None - # if v is None: - # echo("skipped due to no contigs") - # continue - # - # if v and same_sample: - # - # - # res = consensus.check_contig_match(v[0], v[1], return_int=False) - # echo("RES", res, v[0], v[1], ei.svlen, ej.svlen) - # - # if res: - # qs, qe, ts, te, cig, _, _ = res - # if large_cigar_gap(cig, 30): - # echo("GAP too big", cig) - # continue - # total_sv_length = ei.svlen + ej.svlen - # aligned_bases = max(qe - qs, te - ts) - # unaligned_bases = qs + ts + (len(v[0]) - qe) + (len(v[1]) - te) - # echo(unaligned_bases, total_sv_length, aligned_bases, cig) - # echo(ei.contig_cigar, ej.contig_cigar) - # #if unaligned_bases < total_sv_length: - # # if aligned_bases > ref_overlap or unaligned_bases < total_sv_length: - # if unaligned_bases < total_sv_length: - # G.add_edge(i_id, j_id, loci_same=True) - # echo("MERGED3", ei.svlen, ej.svlen, ei.svtype) - # continue - # - # elif v and not same_sample and consensus.check_contig_match(v[0], v[1], return_int=True): - # G.add_edge(i_id, j_id, loci_same=True) - # continue - # - # # else: - # # Handle contig matching for alternate sequences - # for alt_combination in [(ci_alt, cj), (ci_alt, cj2), (cj_alt, ci), (cj_alt, ci2)]: - # echo("testing alt combo") - # if alt_combination[0] and alt_combination[1] and consensus.check_contig_match(alt_combination[0], alt_combination[1], return_int=True): - # out_edges[idx] += 1 - # out_edges[jdx] += 1 - # G.add_edge(i_id, j_id, loci_same=True) - # # echo(f"MERGED4") - # echo("MERGED4", ei.svlen, ej.svlen, ei.svtype) - # break - - return G, disjoint_nodes - - -def cut_components(G, disjoint_nodes): - components = nx.algorithms.components.connected_components(G) - if len(disjoint_nodes) > 0: - # try split this component into disjoint sets. This method works for small cluster sizes (most of the time) - # but can fail when there are many disjoint nodes. 
Label propagation might be needed for these
-        components2 = []
-        for c in components:
-            n_disjoin = set([])
-            for node in c:
-                if node in disjoint_nodes:
-                    n_disjoin.add(node)
-            if len(n_disjoin) <= 1:
-                components2.append(c)
+    if job:
+        for edge in pool.starmap(process_contig_aignments, job):
+            if not edge:
                 continue
-            out_e = defaultdict(list)
-            for node in n_disjoin:
-                for neigh in G.neighbors(node):
-                    out_e[neigh].append(node)
-            G3 = nx.Graph()
-            for k, v in out_e.items():
-                G3.add_edge(k, random.choice(v))
-            components2 += list(nx.algorithms.components.connected_components(G3))
-        return components2
-    return components
+            else:
+                G.add_edge(edge[0], edge[1], loci_same=True)
+
+    return G, disjoint_edges
+
+
+def split_graph_by_forbidden_edges(G, forbidden_pairs):
+
+    def bfs_find_and_cut(start, target, work_graph):
+        # Walk breadth-first from `start` and remove the first edge that
+        # would reach `target`, so the pair ends up in separate components
+        queue = deque([start])
+        visited = {start}
+        while queue:
+            current = queue.popleft()
+            for neighbor in work_graph.neighbors(current):
+                if neighbor == target:
+                    work_graph.remove_edge(current, neighbor)
+                    return True
+                if neighbor not in visited:
+                    visited.add(neighbor)
+                    queue.append(neighbor)
+        return False
 
-cpdef srt_func(c):
-    if c.type != "pe" and c.type != "":
-        return 100 + c.su
-    return c.su + (3 * c.spanning)
+    work_graph = G.copy()
 
+    node_component_size = {}
+    components = nx.connected_components(work_graph)
+    for component in components:
+        component_size = len(component)
+        max_degree = max(work_graph.degree(u) for u in component)
+        # If one node connects all others, don't cut
+        if max_degree == len(component) - 1:
+            continue
+        for node in component:
+            node_component_size[node] = component_size
+
+    for node1, node2 in forbidden_pairs:
+        if node1 not in work_graph or node2 not in work_graph:
+            continue
+        # For n=3, assume the nodes really should be merged. This can arise when
+        # one large INDEL matches two smaller INDELs that are adjacent on the same read
+        # if node_component_size.get(node1, 0) <= 3 or node_component_size.get(node2, 0) <= 3:
+        #     continue
+        bfs_find_and_cut(node1, node2, work_graph)
+
+    final_components = [work_graph.subgraph(c).copy() for c in nx.connected_components(work_graph)]
+    return final_components
+
+
+def srt_func(item1, item2):
+    # Comparator for choosing a merge-group representative:
+    # highest support first, then the longest SV
+    support1 = item1.su + (3 * item1.spanning)
+    support2 = item2.su + (3 * item2.spanning)
+    if support1 > support2:
+        return -1
+    elif support2 > support1:
+        return 1
+    # If supports are equal, compare svlen
+    if item1.svlen > item2.svlen:
+        return -1
+    elif item2.svlen > item1.svlen:
+        return 1
+    return 0  # If everything is equal
 
 
 def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pick_best=False, add_partners=False,
                  rel_diffs=False, diffs=15, same_sample=True, debug=False, min_size=0, aggressive_ins_merge=False,
-                 skip_imprecise=False, max_comparisons=100):
+                 skip_imprecise=False, max_comparisons=100, procs=1):
     """Try and merge similar events, use overlap of both breaks points
     """
     max_dist = max_dist / 2
@@ -458,30 +608,30 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic
         return potential
     # Cluster events on graph
     G = nx.Graph()
-    G, disjoint_nodes = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
+    G, forbidden_edges = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
                                          aggressive_ins_merge=aggressive_ins_merge,
-                                         debug=debug, max_comparisons=max_comparisons)
+                                         debug=debug, max_comparisons=max_comparisons, procs=procs)
     found = []
-    for item in potential:  # Add singletons, non-merged
-        if not G.has_node(item.event_id):
+    for idx, item in enumerate(potential):  # Add singletons, non-merged
+        if not G.has_node(idx):
             found.append(item)
     # Try and merge SVs with identical breaks, then merge ones with less accurate breaks - this helps prevent
     # over merging SVs that are close together
-    components = cut_components(G, disjoint_nodes)
-    node_to_event = {i.event_id: i for i in potential}
+    components = split_graph_by_forbidden_edges(G, forbidden_edges)
     cdef int k
     for grp in components:
-        best = [node_to_event[n] for n in grp]
+        best = [potential[n] for n in grp]
 
-        best.sort(key=srt_func, reverse=True)
+        best.sort(key=cmp_to_key(srt_func), reverse=False)
         w0 = best[0]
-        #echo(w0.contig, w0.contig2)
-        #echo([b.svtype for b in best])
-        #echo([b.svlen for b in best], w0.svlen, w0.posA, w0.svtype, w0.rep)
+        # echo('merge groups', [i for i in best])
+        # echo([(b.svlen, 
b.su, b.svtype) for b in best], w0.svtype, w0.svlen) if not pick_best: weight = w0.pe + w0.supp + w0.spanning spanned = bool(w0.spanning) @@ -498,6 +648,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic best_var_seq = w0.variant_seq for k in range(1, len(best)): item = best[k] + w0.qnames |= item.qnames w0.pe += item.pe w0.supp += item.supp w0.sc += item.sc @@ -514,7 +665,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic if not spanned: if item.spanning: w0.svlen = item.svlen - elif min_size > w0.svlen < item.svlen: + elif w0.remap_score == 0 and min_size > w0.svlen < item.svlen: w0.svlen = item.svlen elif item.svtype == "INS" and svt in {"INS","DUP","TRA","INV"}: if not spanned: @@ -522,9 +673,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic w0.svlen = item.svlen w0.variant_seq = item.variant_seq # elif item.svlen * 0.6 < w0.svlen < item.svlen or min_size > w0.svlen < item.svlen: - elif min_size > w0.svlen < item.svlen: - #echo("best var seq", best_var_seq) - # if best_var_seq == -1: + elif w0.remap_score == 0 and min_size > w0.svlen < item.svlen: w0.svlen = item.svlen w0.svtype = item.svtype if best_var_seq: @@ -562,4 +711,5 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic if add_partners: w0.partners = [i.event_id for i in best[1:]] found.append(w0) + return found diff --git a/dysgu/post_call.py b/dysgu/post_call.py index eabb885..6ae925d 100644 --- a/dysgu/post_call.py +++ b/dysgu/post_call.py @@ -13,6 +13,9 @@ import warnings pd.options.mode.chained_assignment = None from dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman +from dysgu.io_funcs import iitree +from dysgu.consensus import compute_rep +from collections import defaultdict import os import warnings from sklearn.exceptions import InconsistentVersionWarning @@ -352,7 +355,7 @@ def strand_binom_t(events): def get_ref_base(events, ref_genome, symbolic_sv_size): for e in events: - if not e.ref_seq: + if not e.ref_seq and not e.svtype == "BND": if symbolic_sv_size == -1 or e.svtype == "INS" or e.svtype == "TRA": if e.posA == 0: e.posA = 1 @@ -657,6 +660,43 @@ def get_gt_metric2(events, mode, add_gt=True): return events +def filter_microsatellite_non_diploid(potential): + tmp_list = defaultdict(list) + max_dist = 50 + half_d = (max_dist * 0.5) + candidates = [] + for idx in range(len(potential)): + ei = potential[idx] + if (ei.svtype != "DEL" and ei.svtype != "INS") or not ei.variant_seq: + continue + rep = compute_rep(ei.variant_seq) + if rep < 0.4: + continue + candidates.append(idx) + tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + if ei.chrA == ei.chrB and ei.svlen > half_d: + tmp_list[ei.chrA].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) + + nc2 = {k: iitree(v, add_value=True) for k, v in tmp_list.items()} + seen = set([]) + to_drop = set([]) + for idx in candidates: + if idx in seen: + continue + ei = potential[idx] + ols = set(nc2[ei.chrA].allOverlappingIntervals(ei.posA, ei.posA + 1)) + ols2 = set(nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1)) + ols |= ols2 + ols.add(idx) + if len(ols) <= 2: + continue + bad = sorted([(i, potential[i]) for i in ols], key=lambda x: x[1].su, reverse=True)[2:] + for jdx, p in bad: + to_drop.add(jdx) + seen |= ols + return [p for i, p in enumerate(potential) if i not in to_drop] + + def compressability(events): for e in events: c1 = [] diff 
--git a/dysgu/re_map.py b/dysgu/re_map.py
index c881c13..f6115df 100644
--- a/dysgu/re_map.py
+++ b/dysgu/re_map.py
@@ -94,8 +94,19 @@ def merge_align_regions(locations):
         if abs(s - last[0]) < merge_dist and abs(e - last[1]) < merge_dist:
             new_l[-1][1] = e
         else:
-            break
+            new_l.append([s, e])
             # return None
+    # Keep only the merged block whose length is closest to ~500 bp,
+    # the expected span around the break point
+    best_i = 0
+    dist = 100_000
+    for i, (s, e) in enumerate(new_l):
+        d = abs((e - s) - 500)
+        if d < dist:
+            dist = d
+            best_i = i
+    new_l = [new_l[best_i]]
+
     return new_l


diff --git a/dysgu/sv2bam.pyx b/dysgu/sv2bam.pyx
index f7c14c2..e84fe8a 100644
--- a/dysgu/sv2bam.pyx
+++ b/dysgu/sv2bam.pyx
@@ -1,4 +1,4 @@
-#cython: language_level=3
+# cython: language_level=3
 from __future__ import absolute_import
 import pysam

diff --git a/pyproject.toml b/pyproject.toml
index 66a8190..938cc7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ authors = [
     { name = "Kez Cleal", email = "clealk@cardiff.ac.uk" }
 ]
 license = { text = "MIT" }
-requires-python = ">=3.10"
+requires-python = ">=3.9"
 dependencies = [
     "setuptools >= 61.0",
     "cython",

diff --git a/setup.py b/setup.py
index d255a8e..012713a 100644
--- a/setup.py
+++ b/setup.py
@@ -115,7 +115,9 @@ def get_extension_modules():
                                   include_dirs=include_dirs,
                                   runtime_library_dirs=runtime_dirs,
                                   extra_compile_args=extras,
-                                  define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
+                                  define_macros=[
+                                      ("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION"),
+                                  ],
                                   language="c++"))
     return cythonize(ext_modules)
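
The notes below restate the main techniques introduced by this patch as minimal, self-contained Python sketches. Identifiers that do not appear in the diff above are illustrative only, not part of the dysgu API.

Hot paths in graph.pyx and map_set_utils.pyx now read htslib's packed CIGAR array directly through bam_get_cigar() instead of r.cigartuples, which allocates a Python list of tuples on every access. Each packed 32-bit value stores the operation code in the low 4 bits (0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7='=', 8=X) and the length in the high 28 bits. A sketch of the decoding using hand-packed values rather than a real AlignedSegment; the rewritten Cython functions return results through C++ int& out-parameters to avoid tuple packing, whereas this sketch returns a tuple for clarity:

def pack(op, length):
    # Pack one CIGAR operation the way htslib stores it
    return (length << 4) | op

def clip_sizes(cigar):
    # Pure-Python equivalent of the new clip_sizes(): soft-clip lengths at each end
    if not cigar:
        return 0, 0
    left = cigar[0] >> 4 if cigar[0] & 15 == 4 else 0
    right = cigar[-1] >> 4 if cigar[-1] & 15 == 4 else 0
    return left, right

# 20S75M5S -> clips of (20, 5)
assert clip_sizes([pack(4, 20), pack(0, 75), pack(4, 5)]) == (20, 5)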
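
In merge_svs.pyx, compare_subset() now indexes both breakpoints of long intra-chromosomal calls, so a merge candidate can be reached through either end of the event. A sketch of the idea, with a naive list scan standing in for the iitree interval index (Event fields mirror the attributes used in the diff; overlapping() is a stand-in, not the dysgu API):

from collections import defaultdict, namedtuple

Event = namedtuple("Event", "chrA posA chrB posB svlen")

def overlapping(intervals, pos):
    # Naive stand-in for iitree.allOverlappingIntervals()
    return {idx for s, e, idx in intervals if s <= pos < e}

def candidate_pairs(events, max_dist=500):
    index = defaultdict(list)
    for idx, ev in enumerate(events):
        index[ev.chrA].append((ev.posA - max_dist, ev.posA + max_dist, idx))
        # New in this patch: index posB too, so one end of a long DEL/DUP/INV
        # can pair with a nearby call even when the other ends are far apart
        if ev.chrA == ev.chrB and ev.svlen > max_dist * 0.5:
            index[ev.chrB].append((ev.posB - max_dist, ev.posB + max_dist, idx))
    for idx, ev in enumerate(events):
        for jdx in overlapping(index[ev.chrA], ev.posA) - {idx}:
            yield idx, jdx

# A long DEL whose end lies near a second call's start is now a candidate pair
evs = [Event("chr1", 1000, "chr1", 10000, 9000), Event("chr1", 10100, "chr1", 10400, 300)]
assert (1, 0) in set(candidate_pairs(evs))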
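
Contig-versus-contig alignments are vetted by matches_with_max_gap(), which walks the alignment cigar string at C speed and discards alignments containing an indel larger than max_gap, or more than 5% gap bases overall. The same logic in plain Python (a behavioural port of the cdef function, not a performance claim):

def matches_with_max_gap(cigar, max_gap):
    # Return matched bases, or 0 if any indel exceeds max_gap or the
    # total gap fraction is above 5%
    matches = gaps = num = 0
    for ch in cigar:
        if ch.isdigit():
            num = num * 10 + (ord(ch) - 48)
        else:
            if ch in "DI":
                if num > max_gap:
                    return 0
                gaps += num
            elif ch == "M":
                matches += num
            num = 0
    if matches and gaps / matches > 0.05:
        return 0
    return matches

assert matches_with_max_gap("60M2I38M", 15) == 98
assert matches_with_max_gap("60M40I38M", 15) == 0   # gap too large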
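
Each call now carries the set of supporting read names (the new qnames attribute on EventResult). When two spanning calls draw support from largely the same reads, they are treated as distinct events carried by the same molecules (for example, two nearby indels on one long read) and the pair is recorded as a forbidden edge instead of being merged. The threshold used in enumerate_events() is a Jaccard similarity above 0.1:

def jaccard_similarity(set1, set2):
    # Same function as added to merge_svs.pyx
    union = len(set1 | set2)
    return len(set1 & set2) / union if union else 0

a = {"read1", "read2", "read3"}
b = {"read3", "read4"}
# 1 shared read out of 4 -> 0.25 > 0.1, so this pair would become a forbidden edge
assert jaccard_similarity(a, b) == 0.25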
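
With procs > 1, enumerate_events() defers contig alignment checks into a job list and flushes it through multiprocessing.Pool.starmap in batches of roughly 1000, adding a graph edge for every worker that reports a match. The batching pattern in isolation, with a placeholder worker (align_pair is illustrative, not the dysgu function):

from multiprocessing import Pool

def align_pair(i, j):
    # Placeholder worker: the real code aligns two consensus contigs and
    # returns an edge (i, j) on success, or None
    return (i, j) if (i + j) % 2 == 0 else None

def run_batched(pairs, procs=2, batch_size=1000):
    edges = []
    with Pool(procs) as pool:
        job = []
        for pair in pairs:
            job.append(pair)
            if len(job) >= batch_size:
                edges += [e for e in pool.starmap(align_pair, job) if e]
                job = []
        if job:  # flush the remainder, as enumerate_events() does after its loop
            edges += [e for e in pool.starmap(align_pair, job) if e]
    return edges

if __name__ == "__main__":
    print(run_batched([(0, 2), (1, 2), (3, 5)]))   # [(0, 2), (3, 5)]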
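
Forbidden pairs are resolved after clustering by split_graph_by_forbidden_edges(): a breadth-first walk starts from one node of the pair and deletes the edge on which the walk would first reach the other node, splitting the merge group. A compact sketch of that cut against networkx:

import networkx as nx
from collections import deque

def bfs_cut(work_graph, start, target):
    # Mirror of bfs_find_and_cut(): remove the first edge that reaches target
    queue = deque([start])
    visited = {start}
    while queue:
        current = queue.popleft()
        for neighbor in work_graph.neighbors(current):
            if neighbor == target:
                work_graph.remove_edge(current, neighbor)
                return True
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    return False

G = nx.Graph([(0, 1), (1, 2), (2, 3)])
bfs_cut(G, 0, 3)                      # removes the edge (2, 3)
assert list(nx.connected_components(G)) == [{0, 1, 2}, {3}]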
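
Group representatives are now chosen by sorting with the two-argument comparator srt_func (support first, then svlen) through functools.cmp_to_key. For this particular ordering a tuple key is equivalent, as the check at the end shows; the comparator form simply keeps the tie-break rules explicit:

from functools import cmp_to_key
from collections import namedtuple

Event = namedtuple("Event", "su spanning svlen")

def srt_func(a, b):
    # Comparator used in merge_events(): higher support first, then longer SV
    sa, sb = a.su + 3 * a.spanning, b.su + 3 * b.spanning
    if sa != sb:
        return -1 if sa > sb else 1
    if a.svlen != b.svlen:
        return -1 if a.svlen > b.svlen else 1
    return 0

events = [Event(5, 0, 100), Event(5, 2, 80), Event(11, 0, 90)]
by_cmp = sorted(events, key=cmp_to_key(srt_func))
by_key = sorted(events, key=lambda e: (e.su + 3 * e.spanning, e.svlen), reverse=True)
assert by_cmp == by_key  # Event(11,0,90), Event(5,2,80), Event(5,0,100)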
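
post_call.py gains filter_microsatellite_non_diploid(): when more than two repetitive INS/DEL calls (compute_rep() of at least 0.4 on the variant sequence) stack up within about 50 bp, only the two best-supported survive, on the grounds that a diploid sample carries at most two alleles at one locus. A much-simplified sketch of the core selection, with an explicit call cluster standing in for the interval-tree lookup:

from collections import namedtuple

Call = namedtuple("Call", "pos su")

def keep_diploid(calls):
    # Keep at most the two best-supported calls at one repetitive locus
    if len(calls) <= 2:
        return calls
    return sorted(calls, key=lambda c: c.su, reverse=True)[:2]

cluster = [Call(1000, 4), Call(1010, 21), Call(1015, 18)]
assert keep_diploid(cluster) == [Call(1010, 21), Call(1015, 18)]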
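
Finally, merge_align_regions() in re_map.py no longer stops at the first non-mergeable block; it collects every merged block and keeps the single one whose length is closest to ~500 bp, the expected span around the break. The selection step in isolation, matching the fixed hunk above:

def pick_block(blocks, expected=500):
    # Keep the merged block whose length is closest to the expected span
    best_i, dist = 0, 100_000
    for i, (s, e) in enumerate(blocks):
        d = abs((e - s) - expected)
        if d < dist:
            dist, best_i = d, i
    return [blocks[best_i]]

assert pick_block([(0, 120), (300, 830), (900, 2400)]) == [(300, 830)]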