diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a4621b2..6b23803 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -14,24 +14,23 @@ jobs:
     steps:
       - uses: actions/checkout@v4
-
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.0
+        uses: pypa/cibuildwheel@v2.16.5
         env:
-          CIBW_PROJECT_REQUIRES_PYTHON: ">=3.8, <3.12"
-          CIBW_SKIP: "*-win32 *-manylinux_i686 pp* cp311-macosx* *musl* "
+          CIBW_PROJECT_REQUIRES_PYTHON: ">=3.10"  # >=3.8 <3.12
+          CIBW_SKIP: "*-win32 *-manylinux_i686 pp* *musl* "  # cp311-macosx*
           CIBW_ARCHS_MACOS: "x86_64 arm64"
           CIBW_ARCHS_LINUX: "auto"
           CIBW_BEFORE_ALL_MACOS: bash ci/osx-deps
           CIBW_BEFORE_ALL_LINUX: bash ci/manylinux-deps
-          CIBW_BEFORE_BUILD: pip install -r requirements.txt
-          CIBW_TEST_COMMAND: dysgu test
-
-      # ...
-      #   with:
-      #   package-dir: .
-      #   output-dir: wheelhouse
-      #   config-file: "{package}/pyproject.toml"
+          CIBW_BEFORE_BUILD_MACOS: |
+            ln -s /Library/Frameworks/Python.framework/Versions/3.11/include/python3.11/cpython/longintrepr.h /Library/Frameworks/Python.framework/Versions/3.11/include/python3.11
+            pip install -r requirements.txt
+          CIBW_BEFORE_BUILD_LINUX: pip install -r requirements.txt
+          CIBW_REPAIR_WHEEL_COMMAND_MACOS: delocate-wheel --require-archs x86_64 -w {dest_dir} -v {wheel}
+          CIBW_TEST_SKIP: "*-macosx_arm64"
+          CIBW_TEST_REQUIRES: cython click>=8.0 numpy scipy pandas pysam>=0.22.0 networkx>=2.4 scikit-learn>=0.22 sortedcontainers lightgbm
+          CIBW_TEST_COMMAND: dysgu test --verbose

       - uses: actions/upload-artifact@v3
         with:
diff --git a/.gitignore b/.gitignore
index 4046d24..0a8f8e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,4 @@ dysgu/.DS_Store
 docs/build/
 docs/source/_templates/
 dev
+/dysgu/edlib/edlib.cpp
diff --git a/README.rst b/README.rst
index a19c837..d992af3 100644
--- a/README.rst
+++ b/README.rst
@@ -42,15 +42,16 @@ for calling structural variants using paired-end or long read sequencing data. S

 ⚙️ Installation
 ---------------

-Dysgu requires Python >=3.8 - 3.10 plus htslib and has been tested on linux and MacOS.
+Dysgu can be installed via pip using Python 3.10 - 3.12 and has been tested on Linux and macOS.
 The list of python packages needed can be found in requirements.txt.
 To install::

-    conda install -c bioconda -c conda-forge dysgu
+    pip install dysgu

-Or, fetch from PyPI::
+Or, from conda::
+
+    conda install -c bioconda -c conda-forge dysgu

-    pip install dysgu

 To build from source, run the install script: ``bash INSTALL.sh``.
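Note: the ``CIBW_SKIP`` value above is a space-separated list of glob patterns matched against cibuildwheel's build identifiers. A minimal sketch of that selection logic (illustrative only; the identifiers below are examples, not the full build matrix)::

    from fnmatch import fnmatch

    skip = "*-win32 *-manylinux_i686 pp* *musl*".split()
    identifiers = [
        "cp310-manylinux_x86_64",   # built
        "cp311-musllinux_x86_64",   # skipped by *musl*
        "cp312-macosx_arm64",       # built
        "pp310-manylinux_x86_64",   # skipped by pp*
        "cp39-win32",               # skipped by *-win32
    ]
    for ident in identifiers:
        skipped = any(fnmatch(ident, pat) for pat in skip)
        print(ident, "skip" if skipped else "build")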
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx index e466ed2..ec9b4e0 100644 --- a/dysgu/call_component.pyx +++ b/dysgu/call_component.pyx @@ -1,6 +1,6 @@ #cython: language_level=3, boundscheck=True, c_string_type=unicode, c_string_encoding=utf8, infer_types=True from __future__ import absolute_import -from collections import Counter, defaultdict +from collections import Counter, defaultdict, namedtuple import logging import numpy as np cimport numpy as np @@ -36,7 +36,7 @@ ctypedef enum ReadEnum_t: BREAKEND = 4 -cdef n_aligned_bases(AlignedSegment aln): +cpdef n_aligned_bases(AlignedSegment aln): cdef int opp, l, aligned, large_gaps, n_small_gaps, i cdef uint32_t cigar_value cdef uint32_t cigar_l = aln._delegate.core.n_cigar @@ -123,7 +123,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, MAPQsupp += a.mapq if a.has_tag("NM"): if a_bases: - NMsupp += float(a.get_tag("NM")) / a_bases + nm = float(a.get_tag("NM")) + NMsupp += nm / a_bases + NMbase += (nm - large_gaps) / a_bases if a.has_tag("AS"): this_as = a.get_tag("AS") if this_as > maxASsupp: @@ -173,7 +175,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, MAPQsupp += a.mapq if a.has_tag("NM"): if a_bases: - NMsupp += float(a.get_tag("NM")) / a_bases + nm = float(a.get_tag("NM")) + NMsupp += nm / a_bases + NMbase += (nm - large_gaps) / a_bases if a.has_tag("AS"): this_as = a.get_tag("AS") if this_as > maxASsupp: @@ -202,7 +206,9 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, clipped_base_quals += cbq clipped_bases += cb - cdef int tot = er.supp + total_pri + cdef float tot = er.plus + er.minus # er.supp + total_pri + if tot == 0: + return er.NMpri = (NMpri / total_pri) * 100 if total_pri > 0 else 0 er.NMsupp = (NMsupp / er.supp) * 100 if er.supp > 0 else 0 er.NMbase = (NMbase / tot) * 100 if tot > 0 else 0 @@ -222,7 +228,7 @@ cdef count_attributes2(reads1, reads2, spanning, float insert_ppf, generic_ins, cdef int within_read_end_position(event_pos, svtype, cigartuples, cigar_index): - cdef int i, end, idx, opp, length, cigar_skip, n_aligned_bases, target_bases + cdef int i, end if svtype == 1: # insertion (or other e.g. 
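Note on the new ``NMbase`` accumulator added above: the SAM ``NM`` tag counts substitutions plus inserted/deleted bases, so subtracting the bases held in large gaps before normalising leaves an estimate of substitution-level divergence per aligned base. A minimal sketch of the idea, with plain ints standing in for pysam's ``get_tag("NM")`` and the cigar-derived counts::

    def nm_base_rate(nm, large_gaps, aligned_bases):
        # NM includes indel bases; removing large-gap bases leaves
        # an approximate per-base mismatch (divergence) rate
        if aligned_bases == 0:
            return 0.0
        return (nm - large_gaps) / aligned_bases

    print(nm_base_rate(nm=25, large_gaps=20, aligned_bases=1000))  # 0.005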
duplication/inversion within read) return event_pos + 1 else: # deletion type @@ -684,6 +690,9 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l tmp = defaultdict(list) # group by template name small_tlen_outliers = 0 # for paired reads note any smaller than expected TLENs for cigar_info, align in rds: + # echo(cigar_info, cigar_info.read_enum) + # if not paired_end and cigar_info.read_enum < 2: # split read + # continue tmp[align.qname].append((cigar_info, align)) if align.flag & 1 and abs(align.tlen) < insert_ppf: small_tlen_outliers += 1 @@ -694,6 +703,7 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l l_align = list(alignments) if len(l_align) > 1: pair = guess_informative_pair(l_align) + # pair = informative_pair(alignments, alignments, paired_end=paired_end) if pair is not None: if len(pair) == 2: a, b = pair @@ -762,19 +772,19 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l svlen_adjusted = int(np.median([b[0] for b in size_pos_bounded])) posA_adjusted = int(np.median([b[1] for b in size_pos_bounded])) posB_adjusted = int(np.median([b[2] for b in size_pos_bounded])) - if svtype_m == 2: # del - posA = posA_adjusted - posB = posB_adjusted - svlen = svlen_adjusted - er.preciseA = True - er.preciseB = True - else: + # if svtype_m == 2: # del + # posA = posA_adjusted + # posB = posB_adjusted + # svlen = svlen_adjusted + # er.preciseA = True + # er.preciseB = True + # else: # to_assemble = False - er.preciseA = True - er.preciseB = True - posA = posA_adjusted - posB = posB_adjusted - svlen = svlen_adjusted + er.preciseA = True + er.preciseB = True + posA = posA_adjusted + posB = posB_adjusted + svlen = svlen_adjusted else: svlen = int(np.median([sp[4] for sp in spanning_alignments])) @@ -872,18 +882,35 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l cdef tuple informative_pair(u, v): pri_u = None pri_v = None - for i in u: + for i_info, i in u: ri_flag = i.flag & 64 if not i.flag & 2304: # Not pri, supplementary --> is primary - pri_u = i - for j in v: + pri_u = i_info, i + for j_info, j in v: if j.flag & 64 == ri_flag: # Same read # Same read, primary + supp, or supp + supp - return i, j + return (i_info, i), (j_info, j) if not j.flag & 2304: # Is primary - pri_v = j + pri_v = j_info, j if pri_u is not None and pri_v is not None: return pri_u, pri_v + return None + +# cdef tuple informative_pair(u, v, bint paired_end): +# # fetch either a split read or pair1 and pair2 +# for i in u: +# i_info = i[0] +# for j in v: +# j_info = j[0] +# if j_info.hash_name == i_info.hash_name: +# continue +# if not paired_end: +# if i_info.read_enum == 1 and j_info.read_enum == 1: +# return i, j +# elif i_info.read_enum == j_info.read_enum: +# return i, j +# return None + cdef tuple break_ops(positions, precise, int limit, float median_pos): @@ -903,7 +930,7 @@ cdef tuple break_ops(positions, precise, int limit, float median_pos): ops.append((i + v + 1, -1, 1)) else: ops.append((i + v, -1, 1)) - ops = sorted(ops, reverse=limit < 0) + ops.sort(reverse=limit < 0) cdef int cum_sum = 0 cdef int max_sum = 0 cdef int max_sum_i = 0 @@ -1286,63 +1313,66 @@ cdef void assemble_partitioned_reads(EventResult_t er, u_reads, v_reads, int blo er.ref_bases = rbases -cdef call_from_reads(u_reads, v_reads, int insert_size, int insert_stdev, float insert_ppf, int min_support, +cdef call_from_reads(u_reads_info, v_reads_info, int insert_size, int insert_stdev, float insert_ppf, int 
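Note: ``informative_pair`` above leans on two SAM flag masks: 2304 = 256 (secondary) + 2048 (supplementary), so ``not flag & 2304`` selects primary alignments, and bit 64 marks the first read of a template. A small sketch of those tests, with flag values taken from the SAM spec::

    def is_primary(flag):
        # neither secondary (256) nor supplementary (2048)
        return not flag & 2304

    def is_first_in_pair(flag):
        return bool(flag & 64)

    print(is_primary(99), is_primary(2147))              # True False
    print(is_first_in_pair(99), is_first_in_pair(147))   # True False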
min_support, int block_edge, int assemble, info, bint paired_end): grp_u = defaultdict(list) grp_v = defaultdict(list) - for r in u_reads: - grp_u[r.qname].append(r) - for r in v_reads: - grp_v[r.qname].append(r) + for uinfo, r in u_reads_info: + grp_u[r.qname].append((uinfo, r)) + for vinfo, r in v_reads_info: + grp_v[r.qname].append((vinfo, r)) informative = [] cdef AlignmentItem v_item, i cdef int left_clip_a, right_clip_a, left_clip_b, right_clip_b for qname in [k for k in grp_u if k in grp_v]: # Qname found on both sides u = grp_u[qname] v = grp_v[qname] - if len(u) == 1 and len(v) == 1: - pair = (u[0], v[0]) - else: - pair = informative_pair(u, v) - if pair: - a, b = pair - a_ct = a.cigartuples - b_ct = b.cigartuples - a_qstart, a_qend, b_qstart, b_qend, a_len, b_len = start_end_query_pair(a, b) - # Soft-clips for the chosen pair, plus template start of alignment - left_clip_a, right_clip_a, left_clip_b, right_clip_b = mask_soft_clips(a.flag, b.flag, a_ct, b_ct) - a_chrom, b_chrom = a.rname, b.rname - a_start, a_end = a.pos, a.reference_end - b_start, b_end = b.pos, b.reference_end - read_overlaps_mate = same_read_overlaps_mate(a_chrom, b_chrom, a_start, a_end, b_start, b_end, a, b) - v_item = AlignmentItem(a.rname, - b.rname, - int(not a.flag & 2304), # Is primary - int(not b.flag & 2304), - 1 if a.flag & 64 else 2, - 1 if b.flag & 64 else 2, - a.pos, a.reference_end, - b.pos, b.reference_end, - 3 if a.flag & 16 == 0 else 5, - 3 if b.flag & 16 == 0 else 5, - left_clip_a, right_clip_a, - left_clip_b, right_clip_b, - a_qstart, a_qend, b_qstart, b_qend, a_len, b_len, - read_overlaps_mate, - a, b - ) - if v_item.left_clipA and v_item.right_clipA: - if a_ct[0][1] >= a_ct[-1][1]: - v_item.right_clipA = 0 - else: - v_item.left_clipA = 0 - if v_item.left_clipB and v_item.right_clipB: - if b_ct[0][1] >= b_ct[-1][1]: - v_item.right_clipB = 0 - else: - v_item.left_clipB = 0 - classify_d(v_item) - informative.append(v_item) + pair = informative_pair(u, v) #, paired_end) + if not pair: + continue + # if len(u) == 1 and len(v) == 1: + # pair = (u[0], v[0]) + # else: + # pair = informative_pair(u, v, paired_end) + a_node_info, a, b_node_info, b, = pair[0][0], pair[0][1], pair[1][0], pair[1][1] + a_ct = a.cigartuples + b_ct = b.cigartuples + a_qstart, a_qend, b_qstart, b_qend, a_len, b_len = start_end_query_pair(a, b) + # Soft-clips for the chosen pair, plus template start of alignment + left_clip_a, right_clip_a, left_clip_b, right_clip_b = mask_soft_clips(a.flag, b.flag, a_ct, b_ct) + a_chrom, b_chrom = a.rname, b.rname + a_start, a_end = a.pos, a.reference_end + b_start, b_end = b.pos, b.reference_end + read_overlaps_mate = same_read_overlaps_mate(a_chrom, b_chrom, a_start, a_end, b_start, b_end, a, b) + v_item = AlignmentItem(a.rname, + b.rname, + int(not a.flag & 2304), # Is primary + int(not b.flag & 2304), + 1 if a.flag & 64 else 2, + 1 if b.flag & 64 else 2, + a.pos, a.reference_end, + b.pos, b.reference_end, + 3 if a.flag & 16 == 0 else 5, + 3 if b.flag & 16 == 0 else 5, + left_clip_a, right_clip_a, + left_clip_b, right_clip_b, + a_qstart, a_qend, b_qstart, b_qend, a_len, b_len, + read_overlaps_mate, + a, b, + a_node_info, b_node_info + ) + if v_item.left_clipA and v_item.right_clipA: + if a_ct[0][1] >= a_ct[-1][1]: + v_item.right_clipA = 0 + else: + v_item.left_clipA = 0 + if v_item.left_clipB and v_item.right_clipB: + if b_ct[0][1] >= b_ct[-1][1]: + v_item.right_clipB = 0 + else: + v_item.left_clipB = 0 + classify_d(v_item) + informative.append(v_item) if not informative: return [] @@ 
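Note on the clip disambiguation in ``call_from_reads`` above: when a read is soft-clipped at both ends, only the longer clip is kept as the informative side (``a_ct[0][1]`` vs ``a_ct[-1][1]`` are the terminal cigar-op lengths). The same rule in isolation::

    def dominant_clip(cigartuples, left_clip, right_clip):
        # cigar op 4 = soft clip; keep only the longer terminal clip
        if left_clip and right_clip:
            if cigartuples[0][1] >= cigartuples[-1][1]:
                right_clip = 0
            else:
                left_clip = 0
        return left_clip, right_clip

    # 20 bp left clip vs 5 bp right clip -> keep the left
    print(dominant_clip([(4, 20), (0, 80), (4, 5)], 1, 1))  # (1, 0)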
-1356,10 +1386,10 @@ cdef call_from_reads(u_reads, v_reads, int insert_size, int insert_stdev, float svtypes_counts[2].append(i) elif i.svtype == "TRA": svtypes_counts[3].append(i) - svtypes_res = sorted(svtypes_counts, key=sort_by_length, reverse=True) + svtypes_counts.sort(key=sort_by_length, reverse=True) results = [] cdef EventResult_t er - for sub_informative in svtypes_res: + for sub_informative in svtypes_counts: if len(sub_informative) >= min_support: svtype = sub_informative[0].svtype if svtype == "INV" or svtype == "TRA": @@ -1436,7 +1466,7 @@ cdef filter_single_partitions(u_reads, v_reads): cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int insert_stdev, float insert_ppf, - int min_support, int block_edge, int assemble, info, bint paired_end): + int min_support, int block_edge, int assemble, info, bint paired_end): spanning_alignments = [] u_reads = [] v_reads = [] @@ -1555,7 +1585,7 @@ cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int return [er] else: - results = call_from_reads(u_reads, v_reads, insert_size, insert_stdev, insert_ppf, min_support, block_edge, assemble, info, paired_end) + results = call_from_reads(u_reads_info, v_reads_info, insert_size, insert_stdev, insert_ppf, min_support, block_edge, assemble, info, paired_end) return results @@ -1571,15 +1601,24 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site aligns = [] fpos = [] site_info = [] + + # Sometimes seeking into the bam file doesnt find the read. This can be addressed by instead + # using the index and looping through the file, but this adds significant overhead + # todo see if missing reads are an issue for long reads + # regions = [] + # hash_names = {} + # has_index = infile.has_index() + for int_node in nodes_info: if sites_index and int_node in sites_index: continue n = n2n[int_node] - if int_node in buffered_reads: + if buffered_reads and int_node in buffered_reads: aligns.append((n, buffered_reads[int_node])) continue fpos.append((n, int_node)) - for node, int_node in sorted(fpos, key=fpos_srt): + fpos.sort(key=fpos_srt) + for node, int_node in fpos: infile.seek(node.tell) try: a = next(infile) @@ -1605,6 +1644,23 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site if add_to_buffer: buffered_reads[int_node] = a break + # else: + # if has_index: + # nn = n2n[int_node] + # hash_names[(nn.hash_name, nn.flag, nn.pos, nn.chrom)] = nn + # regions.append((infile.get_reference_name(nn.chrom), nn.pos, nn.pos + 200)) + # + # if regions: + # regions = merge_intervals(regions) + # for rgn in regions: + # for a in infile.fetch(*rgn): + # v = xxhasher(bam_get_qname(a._delegate), len(a.qname), 42) + # nm = (v, a.flag, a.pos, a.rname) + # if nm in hash_names: + # aligns.append((hash_names[nm], a)) + # + # if len(aligns) < len(nodes_info): + # echo(len(aligns), len(nodes_info)) return aligns @@ -1614,65 +1670,68 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, # Sometimes partitions are not linked, happens when there is not much support between partitions # Then need to decide whether to call from a single partition - n2n = data["n2n"] - seen = set(range(len(data['parts']))) + n2n = data.n2n + seen = set(range(len(data.parts))) if data.parts else {} out_counts = defaultdict(int) # The number of 'outward' links to other clusters cdef int buffered_reads = 0 cdef bint add_to_buffer = 1 cdef int int_node cdef unsigned long[:] partition events = [] - if info: - 
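Note: ``get_reads`` below retrieves reads by virtual file offset rather than re-fetching by region; the offset stored with each node is passed to ``seek()`` and the read recovered with a single ``next()``. A minimal sketch of the pattern ("example.bam" is a placeholder path)::

    import pysam

    bam = pysam.AlignmentFile("example.bam", "rb")
    offsets = []
    while True:
        tell = bam.tell()            # virtual offset of the read about to be returned
        try:
            aln = next(bam)
        except StopIteration:
            break
        if aln.has_tag("SA"):        # e.g. remember split reads for later
            offsets.append(tell)

    for tell in offsets:             # later: jump straight back to each read
        bam.seek(tell)
        print(next(bam).query_name)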
sites_info = list(info.values()) - else: - sites_info = [] - # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts - for (u, v), d in data["s_between"].items(): - rd_u = get_reads(bam, d[0], data["reads"], data["n2n"], add_to_buffer, info) # [(Nodeinfo, alignment)..] - rd_v = get_reads(bam, d[1], data["reads"], data["n2n"], add_to_buffer, info) - total_reads = len(rd_u) + len(rd_v) - buffered_reads += total_reads - if add_to_buffer and buffered_reads > 50000: - add_to_buffer = 0 - out_counts[u] += len(rd_u) - out_counts[v] += len(rd_v) - if len(rd_u) == 0 or len(rd_v) == 0: - continue - if u in seen: - seen.remove(u) - if v in seen: - seen.remove(v) - - # finds reads that should be a single partition - u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) - if len(u_reads) > 0 and len(v_reads) > 0: - events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, + sites_info = list(info.values()) if info else [] + + if data.s_between: + # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts + for (u, v), d in data.s_between: #.items(): + rd_u = get_reads(bam, d[0], data.reads, n2n, add_to_buffer, info) # [(Nodeinfo, alignment)..] + rd_v = get_reads(bam, d[1], data.reads, n2n, add_to_buffer, info) + total_reads = len(rd_u) + len(rd_v) + buffered_reads += total_reads + if add_to_buffer and buffered_reads > 50000: + add_to_buffer = 0 + out_counts[u] += len(rd_u) + out_counts[v] += len(rd_v) + if len(rd_u) == 0 or len(rd_v) == 0: + continue + if u in seen: + seen.remove(u) + if v in seen: + seen.remove(v) + + events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, + assemble_contigs, sites_info, paired_end) - if u_single: - res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res - if v_single: - res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + + # finds reads that should be a single partition + # u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v) + # if len(u_reads) > 0 and len(v_reads) > 0: + # events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs, + # sites_info, paired_end) + # if u_single: + # res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res + # if v_single: + # res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + # sites_info, paired_end, length_extend, divergence) + # if res: + # if isinstance(res, EventResult): + # events.append(res) + # else: + # events += res # Process any singles / unconnected blocks if seen: for part_key in seen: - d = data["parts"][part_key] + d = data.parts[part_key] lb = lower_bound_support if len(sites_info) == 0 else 1 if max_single_size > len(d) >= lb: # Call single block, only collect local reads to the block - rds = get_reads(bam, d, data["reads"], 
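Note: throughout this hunk, ``data["n2n"]``-style dict lookups become attribute access (``data.n2n``, ``data.parts``, ``data.s_between``). A hypothetical stand-in showing the shape the code now assumes (the real object is presumably a cdef class; field names are taken from the diff)::

    from dataclasses import dataclass, field

    @dataclass
    class BlockModelData:
        n2n: dict
        parts: list
        s_between: list = field(default_factory=list)  # [((u, v), (nodes_u, nodes_v)), ...]
        s_within: list = field(default_factory=list)
        reads: dict | None = None
        info: dict | None = None

    data = BlockModelData(n2n={}, parts=[])
    print(len(data.parts) if data.parts else 0)   # mirrors the n_parts computation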
data["n2n"], 0, info) + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): continue res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, @@ -1684,46 +1743,43 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf, events += res # Check for events within clustered nodes - for k, d in data["s_within"].items(): - o_count = out_counts[k] - i_counts = len(d) - if i_counts > max_single_size: - continue - if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: - rds = get_reads(bam, d, data["reads"], data["n2n"], 0, info) - if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): - continue - res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, - sites_info, paired_end, length_extend, divergence) - if res: - if isinstance(res, EventResult): - events.append(res) - else: - events += res + if data.s_within: + for k, d in data.s_within: #.items(): + o_count = out_counts[k] + i_counts = len(d) + if i_counts > max_single_size: + continue + if o_count > 0 and i_counts > (2*min_support) and i_counts > o_count: + rds = get_reads(bam, d, data.reads, data.n2n, 0, info) + if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): + continue + res = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, + sites_info, paired_end, length_extend, divergence) + if res: + if isinstance(res, EventResult): + events.append(res) + else: + events += res return events cpdef list call_from_block_model(bam, data, clip_length, insert_size, insert_stdev, insert_ppf, min_support, lower_bound_support, assemble_contigs, max_single_size, sites_index, bint paired_end, int length_extend, float divergence): - n_parts = len(data["parts"]) + n_parts = len(data.parts) if data.parts else 0 events = [] - if "info" in data: - info = data["info"] - else: - info = None + info = data.info + if data.reads is None: + data.reads = {} # next deal with info - need to filter these into the partitions, then deal with them in single / one_edge cdef EventResult_t e if n_parts >= 1: events += multi(data, bam, insert_size, insert_stdev, insert_ppf, clip_length, min_support, lower_bound_support, assemble_contigs, max_single_size, info, paired_end, length_extend, divergence) elif n_parts == 0: - if len(data["n2n"]) > max_single_size: + if len(data.n2n) > max_single_size: return [] - rds = get_reads(bam, data["n2n"].keys(), data["reads"], data["n2n"], 0, info) - if info: - sites_info = list(info.values()) - else: - sites_info = [] + rds = get_reads(bam, data.n2n.keys(), data.reads, data.n2n, 0, info) + sites_info = list(info.values()) if info else [] if len(rds) < lower_bound_support or (len(sites_info) != 0 and len(rds) == 0): return [] ev = single(rds, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs, sites_info, paired_end, length_extend, divergence) diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx index 33bc24a..55e8a80 100644 --- a/dysgu/cluster.pyx +++ b/dysgu/cluster.pyx @@ -52,24 +52,27 @@ def filter_potential(input_events, tree, regions_only): return potential -def compare_subset(potential, int max_dist): +def get_chrom_key(ei): + if ei.chrA != ei.chrB: + return min(ei.chrA, ei.chrB), max(ei.chrA, ei.chrB) + return ei.chrA + + +def compare_subset(potential, int max_dist, int max_comparisons): tmp_list = 
defaultdict(list) cdef int idx, jdx, dist1, ci_a for idx in range(len(potential)): ei = potential[idx] - tmp_list[ei.chrA].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) - if ei.chrA != ei.chrB or abs(ei.posB - ei.posA) > 5: - if ei.chrA != ei.chrB: - tmp_list[ei.chrB].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) - else: - dist1 = abs(ei.posB - ei.posA) - ci_a = max(ei.cipos95A, ei.cipos95A) - if dist1 + ci_a > max_dist: - tmp_list[ei.chrB].append((ei.posB - ei.cipos95B - max_dist, ei.posB + ei.cipos95B + max_dist, idx)) + tmp_list[get_chrom_key(ei)].append((ei.posA - ei.cipos95A - max_dist, ei.posA + ei.cipos95A + max_dist, idx)) + nc2 = {k: io_funcs.iitree(v, add_value=True) for k, v in tmp_list.items()} for idx in range(len(potential)): ei = potential[idx] - ols = nc2[ei.chrB].allOverlappingIntervals(ei.posB, ei.posB + 1) + ols = nc2[get_chrom_key(ei)].allOverlappingIntervals(ei.posA, ei.posA + 1) + ols.remove(idx) + if len(ols) > max_comparisons: + random.shuffle(ols) + ols = ols[:max_comparisons] for jdx in ols: ej = potential[jdx] yield ei, ej, idx, jdx @@ -143,14 +146,16 @@ cdef break_distances(int i_a, int i_b, int j_a, j_b, bint i_a_precise, bint i_b_ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, rel_diffs=False, diffs=15, - same_sample=True, aggressive_ins_merge=False, debug=False): + same_sample=True, aggressive_ins_merge=False, debug=False, max_comparisons=20): if len(potential) < 50: event_iter = compare_all(potential) # N^2 compare all events to all others else: - event_iter = compare_subset(potential, max_dist) # Use iitree, generate overlap tree and perform intersections + event_iter = compare_subset(potential, max_dist, max_comparisons) # Use iitree, generate overlap tree and perform intersections + seen = set([]) pad = 100 disjoint_nodes = set([]) # if a component has more than one disjoint nodes it needs to be broken apart + node_counts = defaultdict(int) for ei, ej, idx, jdx in event_iter: i_id = ei.event_id j_id = ej.event_id @@ -279,7 +284,6 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re v = (ci2, cj) else: continue - # echo(ei.remap_score == 0 or ej.remap_score == 0, ei.svlen, ej.svlen, v, assembler.check_contig_match(v[0], v[1], return_int=True)) # if not remapped and nearby insertions with opposing soft-clips --> merge # also long-reads will normally have remap_score == 0 if ei.remap_score == 0 or ej.remap_score == 0: @@ -290,6 +294,10 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re G.add_edge(i_id, j_id, loci_same=True) # see if alt sequence can be found in other contig else: + # if ci_alt and cj_alt: + # if assembler.check_contig_match(ci_alt, cj_alt, return_int=True): + # G.add_edge(i_id, j_id, loci_same=True) + # continue if ci_alt and cj: if assembler.check_contig_match(ci_alt, cj, return_int=True): G.add_edge(i_id, j_id, loci_same=True) @@ -350,7 +358,7 @@ cpdef srt_func(c): def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pick_best=False, add_partners=False, rel_diffs=False, diffs=15, same_sample=True, debug=False, min_size=0, aggressive_ins_merge=False, - skip_imprecise=False): + skip_imprecise=False, max_comparisons=100): """Try and merge similar events, use overlap of both breaks points """ max_dist = max_dist / 2 @@ -360,7 +368,7 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic G = nx.Graph() G, disjoint_nodes = 
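Note on the new ``max_comparisons`` guard above: it bounds the quadratic blow-up for events with many overlaps by removing the self index, then shuffling and truncating the partner list. The same capping in isolation::

    import random

    def capped(overlap_idxs, self_idx, max_comparisons):
        ols = [i for i in overlap_idxs if i != self_idx]
        if len(ols) > max_comparisons:
            random.shuffle(ols)
            ols = ols[:max_comparisons]
        return ols

    print(len(capped(list(range(1000)), 0, 20)))  # 20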
enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample, aggressive_ins_merge=aggressive_ins_merge, - debug=debug) + debug=debug, max_comparisons=max_comparisons) found = [] for item in potential: # Add singletons, non-merged if not G.has_node(item.event_id): @@ -371,8 +379,8 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic node_to_event = {i.event_id: i for i in potential} cdef int k for grp in components: - c = [node_to_event[n] for n in grp] - best = sorted(c, key=srt_func, reverse=True) + best = [node_to_event[n] for n in grp] + best.sort(key=srt_func, reverse=True) w0 = best[0] if not pick_best: weight = w0.pe + w0.supp + w0.spanning @@ -811,7 +819,10 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N trust_ins_len=args["trust_ins_len"] == "True", low_mem=low_mem, temp_dir=tdir, - find_n_aligned_bases=find_n_aligned_bases) + find_n_aligned_bases=find_n_aligned_bases, + position_distance_thresh=args['sd'], + max_search_depth=args['search_depth'] + ) sites_index = None if sites_adder: sites_index = sites_adder.sites_index @@ -937,7 +948,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N pickle.dump(res, completed_file) else: j_submitted, w_idx = heapq.heappop(minhq) - heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx)) + heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx)) msg_queues[w_idx][1].send(res) else: # most partitions processed here, dict returned, or None @@ -969,7 +980,7 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N pickle.dump(res, completed_file) else: j_submitted, w_idx = heapq.heappop(minhq) - heapq.heappush(minhq, (j_submitted + len(res["n2n"]), w_idx)) + heapq.heappush(minhq, (j_submitted + len(res.n2n), w_idx)) msg_queues[w_idx][1].send(res) if completed_file is not None: @@ -1022,7 +1033,8 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=N # Merge across calls if args["merge_within"] == "True": merged = merge_events(block_edge_events, args["merge_dist"], regions, bool(paired_end), try_rev=False, pick_best=False, - debug=True, min_size=args["min_size"]) + debug=True, min_size=args["min_size"], + max_comparisons=args["max_comparisons"] if "max_comparisons" in args else 100) else: merged = block_edge_events logging.info("Number of candidate SVs merged: {}".format(len(block_edge_events) - len(merged))) diff --git a/dysgu/coverage.pyx b/dysgu/coverage.pyx index 6da4246..8e526fd 100644 --- a/dysgu/coverage.pyx +++ b/dysgu/coverage.pyx @@ -161,10 +161,10 @@ cdef class GenomeScanner: else: f_iter = self.input_bam.fetch(until_eof=True) for aln in f_iter: - # if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None: - # continue + cigar_l = aln._delegate.core.n_cigar if cigar_l == 0 or aln.flag & 1284: + tell = 0 if self.no_tell else self.input_bam.tell() continue if aln.rname != self.current_tid: @@ -182,7 +182,6 @@ cdef class GenomeScanner: good_read = True elif not aln.flag & 2: good_read = True - # cigar_l = aln._delegate.core.n_cigar cigar_p = bam_get_cigar(aln._delegate) for i in range(cigar_l): cigar_value = cigar_p[i] @@ -199,12 +198,11 @@ cdef class GenomeScanner: elif opp == 0 or opp == 7 or opp == 8: self.cpp_cov_track.add(pos + index_start, pos + index_start + length) index_start += length - if aln.mapq < mq_thresh: - continue - if not good_read: - continue - if not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, 
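Note: ``merge_events`` above collects candidate merges as edges of an undirected graph; each connected component then becomes one merged call, led by its best member under ``srt_func``. A toy sketch with made-up event ids and weights::

    import networkx as nx

    G = nx.Graph()
    G.add_edge("ev1", "ev2")   # ev1/ev2 judged to be the same SV
    G.add_edge("ev2", "ev3")
    weights = {"ev1": 5, "ev2": 9, "ev3": 2}

    for comp in nx.connected_components(G):
        best = sorted(comp, key=weights.get, reverse=True)
        print(best[0], "absorbs", best[1:])   # ev2 absorbs ['ev1', 'ev3']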
pos): + + if aln.mapq < mq_thresh or not good_read or not self.cpp_cov_track.cov_val_good(self.current_tid, aln.rname, pos): + tell = 0 if self.no_tell else self.input_bam.tell() continue + self._add_to_bin_buffer(aln, tell) tell = 0 if self.no_tell else self.input_bam.tell() while len(self.staged_reads) > 0: @@ -460,9 +458,9 @@ cdef class GenomeScanner: cdef float current_coverage if self.current_chrom != rname: self.current_chrom = rname - in_roi = False - if self.overlap_regions: - in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1) + # in_roi = False + # if self.overlap_regions: + # in_roi = intersecter(self.overlap_regions, a.rname, apos, apos+1) if rname == self.current_chrom and bin_pos == self.current_pos: self.current_bin.append((a, tell)) else: diff --git a/dysgu/edlib/__init__.py b/dysgu/edlib/__init__.py new file mode 100644 index 0000000..5dd9e64 --- /dev/null +++ b/dysgu/edlib/__init__.py @@ -0,0 +1,3 @@ + +from .edlib import (align) +__all__ = ['align'] \ No newline at end of file diff --git a/dysgu/edlib/edlib.pxd b/dysgu/edlib/edlib.pxd new file mode 100644 index 0000000..4eddbb8 --- /dev/null +++ b/dysgu/edlib/edlib.pxd @@ -0,0 +1,39 @@ +cdef extern from "src/edlib.h" nogil: + + ctypedef enum EdlibAlignMode: EDLIB_MODE_NW, EDLIB_MODE_SHW, EDLIB_MODE_HW + ctypedef enum EdlibAlignTask: EDLIB_TASK_DISTANCE, EDLIB_TASK_LOC, EDLIB_TASK_PATH + ctypedef enum EdlibCigarFormat: EDLIB_CIGAR_STANDARD, EDLIB_CIGAR_EXTENDED + + ctypedef struct EdlibEqualityPair: + char first + char second + + ctypedef struct EdlibAlignConfig: + int k + EdlibAlignMode mode + EdlibAlignTask task + const EdlibEqualityPair* additionalEqualities + int additionalEqualitiesLength + + EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task, + const EdlibEqualityPair* additionalEqualities, + int additionalEqualitiesLength) + EdlibAlignConfig edlibDefaultAlignConfig() + + ctypedef struct EdlibAlignResult: + int status + int editDistance + int* endLocations + int* startLocations + int numLocations + unsigned char* alignment + int alignmentLength + int alphabetLength + + void edlibFreeAlignResult(EdlibAlignResult result) + + EdlibAlignResult edlibAlign(const char* query, int queryLength, + const char* target, int targetLength, + const EdlibAlignConfig config) + + char* edlibAlignmentToCigar(const unsigned char* alignment, int alignmentLength, EdlibCigarFormat cigarFormat) \ No newline at end of file diff --git a/dysgu/edlib/edlib.pyx b/dysgu/edlib/edlib.pyx new file mode 100644 index 0000000..ea12c0a --- /dev/null +++ b/dysgu/edlib/edlib.pyx @@ -0,0 +1,240 @@ +# cython: language_level=2 +cimport cython +from cpython.mem cimport PyMem_Malloc, PyMem_Free +import re +# +# cimport dysgu.cedlib + +class NeedsAlphabetMapping(Exception): + pass + + +def _map_ascii_string(s): + """ Helper func to handle the bytes mapping in the case of ASCII strings.""" + if isinstance(s, bytes): + return s + elif isinstance(s, str): + s_bytes = s.encode('utf-8') + if len(s_bytes) == len(s): + return s_bytes + raise NeedsAlphabetMapping() + + +def _map_to_bytes(query, target, additional_equalities): + """ Map hashable input values to single byte values. 
+ + Example: + In: query={12, 'ä'}, target='ööö', additional_equalities={'ä': 'ö'} + Out: b'\x00\x01', b'\x02\x02\x02', additional_equalities={b'\x01': b'\x02'} + """ + cdef bytes query_bytes + cdef bytes target_bytes + try: + query_bytes = _map_ascii_string(query) + target_bytes = _map_ascii_string(target) + except NeedsAlphabetMapping: + # Map elements of alphabet to chars from 0 up to 255, so that Edlib can work with them, + # since C++ Edlib needs chars. + alphabet = set(query).union(set(target)) + if len(alphabet) > 256: + raise ValueError( + "query and target combined have more than 256 unique values, " + "this is not supported.") + alphabet_to_byte_mapping = { + c: idx.to_bytes(1, byteorder='big') for idx, c in enumerate(alphabet) + } + map_seq = lambda seq: b''.join(alphabet_to_byte_mapping[c] for c in seq) + query_bytes = map_seq(query) + target_bytes = map_seq(target) + if additional_equalities is not None: + additional_equalities = [ + (alphabet_to_byte_mapping[a].decode('utf-8'), alphabet_to_byte_mapping[b].decode('utf-8')) + for a, b in additional_equalities + if a in alphabet_to_byte_mapping and b in alphabet_to_byte_mapping] + return query_bytes, target_bytes, additional_equalities + + +def align(query, target, mode="NW", task="distance", k=-1, additionalEqualities=None): + """ Align query with target using edit distance. + @param {str or bytes or iterable of hashable objects} query, combined with target must have no more + than 256 unique values + @param {str or bytes or iterable of hashable objects} target, combined with query must have no more + than 256 unique values + @param {string} mode Optional. Alignment method do be used. Possible values are: + - 'NW' for global (default) + - 'HW' for infix + - 'SHW' for prefix. + @param {string} task Optional. Tells edlib what to calculate. The less there is to calculate, + the faster it is. Possible value are (from fastest to slowest): + - 'distance' - find edit distance and end locations in target. Default. + - 'locations' - find edit distance, end locations and start locations. + - 'path' - find edit distance, start and end locations and alignment path. + @param {int} k Optional. Max edit distance to search for - the lower this value, + the faster is calculation. Set to -1 (default) to have no limit on edit distance. + @param {list} additionalEqualities Optional. + List of pairs of characters or hashable objects, where each pair defines two values as equal. + This way you can extend edlib's definition of equality (which is that each character is equal only + to itself). + This can be useful e.g. when you want edlib to be case insensitive, or if you want certain + characters to act as a wildcards. + Set to None (default) if you do not want to extend edlib's default equality definition. + @return Dictionary with following fields: + {int} editDistance Integer, -1 if it is larger than k. + {int} alphabetLength Integer, length of unique characters in 'query' and 'target' + {[(int, int)]} locations List of locations, in format [(start, end)]. + {string} cigar Cigar is a standard format for alignment path. + Here we are using extended cigar format, which uses following symbols: + Match: '=', Insertion to target: 'I', Deletion from target: 'D', Mismatch: 'X'. + e.g. cigar of "5=1X1=1I" means "5 matches, 1 mismatch, 1 match, 1 insertion (to target)". + """ + # Transform python sequences of hashables into c strings. 
+ cdef bytes query_bytes + cdef bytes target_bytes + query_bytes, target_bytes, additionalEqualities = _map_to_bytes( + query, target, additionalEqualities) + cdef char* cquery = query_bytes; + cdef char* ctarget = target_bytes; + + # Build an edlib config object based on given parameters. + cconfig = edlibDefaultAlignConfig() + + if k is not None: cconfig.k = k + + if mode == 'NW': cconfig.mode = EDLIB_MODE_NW + if mode == 'HW': cconfig.mode = EDLIB_MODE_HW + if mode == 'SHW': cconfig.mode = EDLIB_MODE_SHW + + if task == 'distance': cconfig.task = EDLIB_TASK_DISTANCE + if task == 'locations': cconfig.task = EDLIB_TASK_LOC + if task == 'path': cconfig.task = EDLIB_TASK_PATH + + cdef bytes tmp_bytes; + cdef char* tmp_cstring; + cdef EdlibEqualityPair* c_additionalEqualities = NULL + if additionalEqualities is None: + cconfig.additionalEqualities = NULL + cconfig.additionalEqualitiesLength = 0 + else: + c_additionalEqualities = PyMem_Malloc(len(additionalEqualities) + * cython.sizeof(EdlibEqualityPair)) + for i in range(len(additionalEqualities)): + c_additionalEqualities[i].first = bytearray(additionalEqualities[i][0].encode('utf-8'))[0] + c_additionalEqualities[i].second = bytearray(additionalEqualities[i][1].encode('utf-8'))[0] + cconfig.additionalEqualities = c_additionalEqualities + cconfig.additionalEqualitiesLength = len(additionalEqualities) + + # Run alignment -- need to get len before disabling the GIL + query_len = len(query) + target_len = len(target) + with nogil: + cresult = edlibAlign(cquery, query_len, ctarget, target_len, cconfig) + if c_additionalEqualities != NULL: PyMem_Free(c_additionalEqualities) + + if cresult.status == 1: + raise Exception("There was an error.") + + # Build python dictionary with results from result object that edlib returned. + locations = [] + if cresult.numLocations >= 0: + for i in range(cresult.numLocations): + locations.append((cresult.startLocations[i] if cresult.startLocations else None, + cresult.endLocations[i] if cresult.endLocations else None)) + cigar = None + if cresult.alignment: + ccigar = edlibAlignmentToCigar(cresult.alignment, cresult.alignmentLength, + EDLIB_CIGAR_EXTENDED) + cigar = ccigar + cigar = cigar.decode('UTF-8') + result = { + 'editDistance': cresult.editDistance, + 'alphabetLength': cresult.alphabetLength, + 'locations': locations, + 'cigar': cigar + } + edlibFreeAlignResult(cresult) + + return result + + +def getNiceAlignment(alignResult, query, target, gapSymbol="-"): + """ Output alignments from align() in NICE format + @param {dictionary} alignResult, output of the method align() + NOTE: The method align() requires the argument task="path" + @param {string} query, the exact query used for alignResult + @param {string} target, the exact target used for alignResult + @param {string} gapSymbol, default "-" + String used to represent gaps in the alignment between query and target + @return Alignment in NICE format, which is human-readable visual representation of how the query and target align to each other. + e.g., for "telephone" and "elephant", it would look like: + telephone + |||||.|. + -elephant + It is represented as dictionary with following fields: + - {string} query_aligned + - {string} matched_aligned ('|' for match, '.' for mismatch, ' ' for insertion/deletion) + - {string} target_aligned + Normally you will want to print these three in order above joined with newline character. + """ + if type(alignResult) is not dict: + raise Exception("The object alignResult is expected to be a python dictionary. 
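A usage sketch for the ``align()`` wrapper defined above (import path per the new ``dysgu/edlib`` package; the edit distance for this pair is 3)::

    from dysgu.edlib import align

    res = align("telephone", "elephant", mode="NW", task="path")
    print(res["editDistance"])   # 3
    print(res["cigar"])          # extended cigar over '=', 'X', 'I', 'D'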
Please check the input alignResult.") + + if 'locations' not in alignResult.keys(): + raise Exception("The object alignResult is expected to contain a field 'locations'. Please check the input alignResult.") + + target_pos = alignResult["locations"][0][0] + if target_pos == None: + target_pos = 0 + query_pos = 0 ## 0-indexed + target_aln = match_aln = query_aln = "" + + if 'cigar' not in alignResult.keys(): + raise Exception("The object alignResult is expected to contain a CIGAR string. Please check the input alignResult.") + + cigar = alignResult["cigar"] + + if alignResult["cigar"] == '' or alignResult["cigar"] == None: + raise Exception("The object alignResult contains an empty CIGAR string. Users must run align() with task='path'. Please check the input alignResult.") + + ### + ## cigar parsing, motivated by yech1990: https://github.com/Martinsos/edlib/issues/127 + ## + ## 'num_occurences' == Number of occurrences of the alignment operation + ## 'alignment_operation' == Cigar symbol/code that represent an alignment operation + ## + for num_occurrences, alignment_operation in re.findall("(\d+)(\D)", cigar): + num_occurrences = int(num_occurrences) + if alignment_operation == "=": + target_aln += target[target_pos : target_pos + num_occurrences] + target_pos += num_occurrences + query_aln += query[query_pos : query_pos + num_occurrences] + query_pos += num_occurrences + match_aln += "|" * num_occurrences + elif alignment_operation == "X": + target_aln += target[target_pos : target_pos + num_occurrences] + target_pos += num_occurrences + query_aln += query[query_pos : query_pos + num_occurrences] + query_pos += num_occurrences + match_aln += "." * num_occurrences + elif alignment_operation == "D": + target_aln += target[target_pos : target_pos + num_occurrences] + target_pos += num_occurrences + query_aln += gapSymbol * num_occurrences + query_pos += 0 + match_aln +=gapSymbol * num_occurrences + elif alignment_operation == "I": + target_aln += gapSymbol * num_occurrences + target_pos += 0 + query_aln += query[query_pos : query_pos + num_occurrences] + query_pos += num_occurrences + match_aln += gapSymbol * num_occurrences + else: + raise Exception("The CIGAR string from alignResult contains a symbol not '=', 'X', 'D', 'I'. Please check the validity of alignResult and alignResult.cigar") + + alignments = { + 'query_aligned': query_aln, + 'matched_aligned': match_aln, + 'target_aligned': target_aln + } + + return alignments + diff --git a/dysgu/edlib/src/edlib.cpp b/dysgu/edlib/src/edlib.cpp new file mode 100644 index 0000000..46cbd17 --- /dev/null +++ b/dysgu/edlib/src/edlib.cpp @@ -0,0 +1,1495 @@ +#include "edlib.h" + +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef uint64_t Word; +static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits +static const Word WORD_1 = static_cast(1); +static const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1); // 100..00 +static const int MAX_UCHAR = 255; + +// Data needed to find alignment. +struct AlignmentData { + Word* Ps; + Word* Ms; + int* scores; + int* firstBlocks; + int* lastBlocks; + + AlignmentData(int maxNumBlocks, int targetLength) { + // We build a complete table and mark first and last block for each column + // (because algorithm is banded so only part of each columns is used). + // TODO: do not build a whole table, but just enough blocks for each column. 
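And a companion sketch for ``getNiceAlignment()`` above, which requires ``align()`` to be run with ``task="path"`` (imported from the module directly, since the package ``__init__`` only re-exports ``align``)::

    from dysgu.edlib.edlib import align, getNiceAlignment

    res = align("telephone", "elephant", task="path")
    nice = getNiceAlignment(res, "telephone", "elephant")
    print(nice["query_aligned"])
    print(nice["matched_aligned"])   # '|' match, '.' mismatch, '-' gap
    print(nice["target_aligned"])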
+ Ps = new Word[maxNumBlocks * targetLength]; + Ms = new Word[maxNumBlocks * targetLength]; + scores = new int[maxNumBlocks * targetLength]; + firstBlocks = new int[targetLength]; + lastBlocks = new int[targetLength]; + } + + ~AlignmentData() { + delete[] Ps; + delete[] Ms; + delete[] scores; + delete[] firstBlocks; + delete[] lastBlocks; + } +}; + +struct Block { + Word P; // Pvin + Word M; // Mvin + int score; // score of last cell in block; + + Block() {} + Block(Word p, Word m, int s) :P(p), M(m), score(s) {} +}; + + +/** + * Defines equality relation on alphabet characters. + * By default each character is always equal only to itself, but you can also provide additional equalities. + */ +class EqualityDefinition { +private: + bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1]; +public: + EqualityDefinition(const string& alphabet, + const EdlibEqualityPair* additionalEqualities = NULL, + const int additionalEqualitiesLength = 0) { + for (int i = 0; i < static_cast(alphabet.size()); i++) { + for (int j = 0; j < static_cast(alphabet.size()); j++) { + matrix[i][j] = (i == j); + } + } + if (additionalEqualities != NULL) { + for (int i = 0; i < additionalEqualitiesLength; i++) { + size_t firstTransformed = alphabet.find(additionalEqualities[i].first); + size_t secondTransformed = alphabet.find(additionalEqualities[i].second); + if (firstTransformed != string::npos && secondTransformed != string::npos) { + matrix[firstTransformed][secondTransformed] = matrix[secondTransformed][firstTransformed] = true; + } + } + } + } + + /** + * @param a Element from transformed sequence. + * @param b Element from transformed sequence. + * @return True if a and b are defined as equal, false otherwise. + */ + bool areEqual(unsigned char a, unsigned char b) const { + return matrix[a][b]; + } +}; + +static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks, + int queryLength, + const unsigned char* target, int targetLength, + int k, EdlibAlignMode mode, + int* bestScore_, int** positions_, int* numPositions_); + +static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks, + int queryLength, + const unsigned char* target, int targetLength, + int k, int* bestScore_, + int* position_, bool findAlignment, + AlignmentData** alignData, int targetStopPosition); + + +static int obtainAlignment( + const unsigned char* query, const unsigned char* rQuery, int queryLength, + const unsigned char* target, const unsigned char* rTarget, int targetLength, + const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, + unsigned char** alignment, int* alignmentLength); + +static int obtainAlignmentHirschberg( + const unsigned char* query, const unsigned char* rQuery, int queryLength, + const unsigned char* target, const unsigned char* rTarget, int targetLength, + const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, + unsigned char** alignment, int* alignmentLength); + +static int obtainAlignmentTraceback(int queryLength, int targetLength, + int bestScore, const AlignmentData* alignData, + unsigned char** alignment, int* alignmentLength); + +static string transformSequences(const char* queryOriginal, int queryLength, + const char* targetOriginal, int targetLength, + unsigned char** queryTransformed, + unsigned char** targetTransformed); + +static inline int ceilDiv(int x, int y); + +static inline unsigned char* createReverseCopy(const unsigned char* seq, int length); + +static inline Word* buildPeq(const int alphabetLength, + const unsigned char* 
query, + const int queryLength, + const EqualityDefinition& equalityDefinition); + + +/** + * Main edlib method. + */ +extern "C" EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength, + const char* const targetOriginal, const int targetLength, + const EdlibAlignConfig config) { + EdlibAlignResult result; + result.status = EDLIB_STATUS_OK; + result.editDistance = -1; + result.endLocations = result.startLocations = NULL; + result.numLocations = 0; + result.alignment = NULL; + result.alignmentLength = 0; + result.alphabetLength = 0; + + /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/ + unsigned char* query, * target; + string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength, + &query, &target); + result.alphabetLength = static_cast(alphabet.size()); + /*-------------------------------------------------------*/ + + // Handle special situation when at least one of the sequences has length 0. + if (queryLength == 0 || targetLength == 0) { + if (config.mode == EDLIB_MODE_NW) { + result.editDistance = std::max(queryLength, targetLength); + result.endLocations = static_cast(malloc(sizeof(int) * 1)); + result.endLocations[0] = targetLength - 1; + result.numLocations = 1; + } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) { + result.editDistance = queryLength; + result.endLocations = static_cast(malloc(sizeof(int) * 1)); + result.endLocations[0] = -1; + result.numLocations = 1; + } else { + result.status = EDLIB_STATUS_ERROR; + } + + free(query); + free(target); + return result; + } + + /*--------------------- INITIALIZATION ------------------*/ + int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers + int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks + EqualityDefinition equalityDefinition(alphabet, config.additionalEqualities, config.additionalEqualitiesLength); + Word* Peq = buildPeq(static_cast(alphabet.size()), query, queryLength, equalityDefinition); + /*-------------------------------------------------------*/ + + /*------------------ MAIN CALCULATION -------------------*/ + // TODO: Store alignment data only after k is determined? That could make things faster. + int positionNW; // Used only when mode is NW. + AlignmentData* alignData = NULL; + bool dynamicK = false; + int k = config.k; + if (k < 0) { // If valid k is not given, auto-adjust k until solution is found. + dynamicK = true; + k = WORD_SIZE; // Gives better results than smaller k. + } + + do { + if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) { + myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks, + queryLength, target, targetLength, + k, config.mode, &(result.editDistance), + &(result.endLocations), &(result.numLocations)); + } else { // mode == EDLIB_MODE_NW + myersCalcEditDistanceNW(Peq, W, maxNumBlocks, + queryLength, target, targetLength, + k, &(result.editDistance), &positionNW, + false, &alignData, -1); + } + k *= 2; + } while(dynamicK && result.editDistance == -1); + + if (result.editDistance >= 0) { // If there is solution. + // If NW mode, set end location explicitly. + if (config.mode == EDLIB_MODE_NW) { + result.endLocations = static_cast(malloc(sizeof(int) * 1)); + result.endLocations[0] = targetLength - 1; + result.numLocations = 1; + } + + // Find starting locations. 
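Note on the ``k *= 2`` loop above: this is edlib's auto-k strategy, run a distance computation bounded by k and double k until a solution falls inside the band. A pure-Python stand-in, with a simple banded DP in place of the bit-vector core::

    def bounded_distance(q, t, k):
        # banded Levenshtein DP; returns -1 if the distance exceeds k
        if abs(len(q) - len(t)) > k:
            return -1
        prev = list(range(len(t) + 1))
        for i, qc in enumerate(q, 1):
            cur = [i] + [k + 1] * len(t)
            for j in range(max(1, i - k), min(len(t), i + k) + 1):
                cur[j] = min(prev[j - 1] + (qc != t[j - 1]), prev[j] + 1, cur[j - 1] + 1)
            prev = cur
        return prev[-1] if prev[-1] <= k else -1

    def edit_distance(q, t, k=64):
        d = bounded_distance(q, t, k)
        while d == -1:            # mirrors: k *= 2; retry while editDistance == -1
            k *= 2
            d = bounded_distance(q, t, k)
        return d

    print(edit_distance("telephone", "elephant"))  # 3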
+ if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) { + result.startLocations = static_cast(malloc(result.numLocations * sizeof(int))); + if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations. + const unsigned char* rTarget = createReverseCopy(target, targetLength); + const unsigned char* rQuery = createReverseCopy(query, queryLength); + // Peq for reversed query. + Word* rPeq = buildPeq(static_cast(alphabet.size()), rQuery, queryLength, equalityDefinition); + for (int i = 0; i < result.numLocations; i++) { + int endLocation = result.endLocations[i]; + if (endLocation == -1) { + // NOTE: Sometimes one of optimal solutions is that query starts before target, like this: + // AAGG <- target + // CCTT <- query + // It will never be only optimal solution and it does not happen often, however it is + // possible and in that case end location will be -1. What should we do with that? + // Should we just skip reporting such end location, although it is a solution? + // If we do report it, what is the start location? -4? -1? Nothing? + // TODO: Figure this out. This has to do in general with how we think about start + // and end locations. + // Also, we have alignment later relying on this locations to limit the space of it's + // search -> how can it do it right if these locations are negative or incorrect? + result.startLocations[i] = 0; // I put 0 for now, but it does not make much sense. + } else { + int bestScoreSHW, numPositionsSHW; + int* positionsSHW; + myersCalcEditDistanceSemiGlobal( + rPeq, W, maxNumBlocks, + queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1, + result.editDistance, EDLIB_MODE_SHW, + &bestScoreSHW, &positionsSHW, &numPositionsSHW); + // Taking last location as start ensures that alignment will not start with insertions + // if it can start with mismatches instead. + result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1]; + free(positionsSHW); + } + } + delete[] rTarget; + delete[] rQuery; + delete[] rPeq; + } else { // If mode is SHW or NW + for (int i = 0; i < result.numLocations; i++) { + result.startLocations[i] = 0; + } + } + } + + // Find alignment -> all comes down to finding alignment for NW. + // Currently we return alignment only for first pair of locations. + if (config.task == EDLIB_TASK_PATH) { + int alnStartLocation = result.startLocations[0]; + int alnEndLocation = result.endLocations[0]; + const unsigned char* alnTarget = target + alnStartLocation; + const int alnTargetLength = alnEndLocation - alnStartLocation + 1; + const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength); + const unsigned char* rQuery = createReverseCopy(query, queryLength); + obtainAlignment(query, rQuery, queryLength, + alnTarget, rAlnTarget, alnTargetLength, + equalityDefinition, static_cast(alphabet.size()), result.editDistance, + &(result.alignment), &(result.alignmentLength)); + delete[] rAlnTarget; + delete[] rQuery; + } + } + /*-------------------------------------------------------*/ + + //--- Free memory ---// + delete[] Peq; + free(query); + free(target); + if (alignData) delete alignData; + //-------------------// + + return result; +} + +extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength, + const EdlibCigarFormat cigarFormat) { + if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) { + return 0; + } + + // Maps move code from alignment to char in cigar. 
+ // 0 1 2 3 + char moveCodeToChar[] = {'=', 'I', 'D', 'X'}; + if (cigarFormat == EDLIB_CIGAR_STANDARD) { + moveCodeToChar[0] = moveCodeToChar[3] = 'M'; + } + + vector* cigar = new vector(); + char lastMove = 0; // Char of last move. 0 if there was no previous move. + int numOfSameMoves = 0; + for (int i = 0; i <= alignmentLength; i++) { + // if new sequence of same moves started + if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) { + // Write number of moves to cigar string. + int numDigits = 0; + for (; numOfSameMoves; numOfSameMoves /= 10) { + cigar->push_back('0' + numOfSameMoves % 10); + numDigits++; + } + reverse(cigar->end() - numDigits, cigar->end()); + // Write code of move to cigar string. + cigar->push_back(lastMove); + // If not at the end, start new sequence of moves. + if (i < alignmentLength) { + // Check if alignment has valid values. + if (alignment[i] > 3) { + delete cigar; + return 0; + } + numOfSameMoves = 0; + } + } + if (i < alignmentLength) { + lastMove = moveCodeToChar[alignment[i]]; + numOfSameMoves++; + } + } + cigar->push_back(0); // Null character termination. + char* cigar_ = static_cast(malloc(cigar->size() * sizeof(char))); + memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char)); + delete cigar; + + return cigar_; +} + +/** + * Build Peq table for given query and alphabet. + * Peq is table of dimensions alphabetLength+1 x maxNumBlocks. + * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0. + * NOTICE: free returned array with delete[]! + */ +static inline Word* buildPeq(const int alphabetLength, + const unsigned char* const query, + const int queryLength, + const EqualityDefinition& equalityDefinition) { + int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); + // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard. + Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks]; + + // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s + for (int symbol = 0; symbol <= alphabetLength; symbol++) { + for (int b = 0; b < maxNumBlocks; b++) { + if (symbol < alphabetLength) { + Peq[symbol * maxNumBlocks + b] = 0; + for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { + Peq[symbol * maxNumBlocks + b] <<= 1; + // NOTE: We pretend like query is padded at the end with W wildcard symbols + if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol)) + Peq[symbol * maxNumBlocks + b] += 1; + } + } else { // Last symbol is wildcard, so it is all 1s + Peq[symbol * maxNumBlocks + b] = static_cast(-1); + } + } + } + + return Peq; +} + + +/** + * Returns new sequence that is reverse of given sequence. + * Free returned array with delete[]. + */ +static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) { + unsigned char* rSeq = new unsigned char[length]; + for (int i = 0; i < length; i++) { + rSeq[i] = seq[length - i - 1]; + } + return rSeq; +} + +/** + * Corresponds to Advance_Block function from Myers. + * Calculates one word(block), which is part of a column. + * Highest bit of word (one most to the left) is most bottom cell of block from column. + * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1]. + * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0. + * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0. + * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch. 
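Note: ``edlibAlignmentToCigar`` above run-length encodes an array of move codes (0 '=', 1 'I', 2 'D', 3 'X'; '=' and 'X' collapse to 'M' in standard format). The same encoding in Python::

    from itertools import groupby

    def moves_to_cigar(moves, extended=True):
        chars = ('=', 'I', 'D', 'X') if extended else ('M', 'I', 'D', 'M')
        runs = ((ch, sum(1 for _ in grp)) for ch, grp in groupby(chars[m] for m in moves))
        return "".join(f"{n}{ch}" for ch, n in runs)

    print(moves_to_cigar([0, 0, 0, 3, 0, 1]))         # 3=1X1=1I
    print(moves_to_cigar([0, 0, 0, 3, 0, 1], False))  # 5M1I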
+ * @param [in] Phin  Bitset, 00..01 when hin == +1.
+ * @param [in] Mhin  Bitset, 00..01 when hin == -1.
+ * @param [out] PvOut  Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
+ * @param [out] MvOut  Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
+ * @param [out] Phout  Bitset, 00..01 when hout == +1.
+ * @param [out] Mhout  Bitset, 00..01 when hout == -1.
+ */
+static inline pair<Word, Word> calculateBlock(Word Pv, Word Mv, Word Eq, const Word Phin, const Word Mhin,
+                                              Word &PvOut, Word &MvOut) {
+    // hin can be 1, -1 or 0.
+    //  1 -> 00...01
+    //  0 -> 00...00
+    // -1 -> 11...11 (2-complement)
+
+    Word Xv = Eq | Mv;
+    // This is the instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
+    Eq |= Mhin;
+    Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
+
+    Word Ph = Mv | ~(Xh | Pv);
+    Word Mh = Pv & Xh;
+
+    // This is the instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
+    Word Phout = Ph >> (WORD_SIZE - 1);
+    // This is the instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
+    Word Mhout = Mh >> (WORD_SIZE - 1);
+
+    Ph <<= 1;
+    Mh <<= 1;
+
+    // This is the instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
+    Mh |= Mhin;
+    // This is the instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
+    Ph |= Phin;
+
+    PvOut = Mh | ~(Xv | Ph);
+    MvOut = Ph & Xv;
+
+    return {Phout, Mhout};
+}
+
+/**
+ * Does ceiling division x / y.
+ * Note: x and y must be non-negative and x + y must not overflow.
+ */
+static inline int ceilDiv(const int x, const int y) {
+    return x % y ? x / y + 1 : x / y;
+}
+
+static inline int min(const int x, const int y) {
+    return x < y ? x : y;
+}
+
+static inline int max(const int x, const int y) {
+    return x > y ? x : y;
+}
+
+
+/**
+ * @param [in] block
+ * @return Values of cells in block, starting with the bottom cell in the block.
+ */
+static inline vector<int> getBlockCellValues(const Block block) {
+    vector<int> scores(WORD_SIZE);
+    int score = block.score;
+    Word mask = HIGH_BIT_MASK;
+    for (int i = 0; i < WORD_SIZE - 1; i++) {
+        scores[i] = score;
+        if (block.P & mask) score--;
+        if (block.M & mask) score++;
+        mask >>= 1;
+    }
+    scores[WORD_SIZE - 1] = score;
+    return scores;
+}
+
+/**
+ * Writes values of cells in block into the given array, starting with the first/top cell.
+ * @param [in] block
+ * @param [out] dest  Array into which cell values are written. Must have size of at least WORD_SIZE.
+ */
+static inline void readBlock(const Block block, int* const dest) {
+    int score = block.score;
+    Word mask = HIGH_BIT_MASK;
+    for (int i = 0; i < WORD_SIZE - 1; i++) {
+        dest[WORD_SIZE - 1 - i] = score;
+        if (block.P & mask) score--;
+        if (block.M & mask) score++;
+        mask >>= 1;
+    }
+    dest[0] = score;
+}
+
+/**
+ * Writes values of cells in block into the given array, starting with the last/bottom cell.
+ * @param [in] block
+ * @param [out] dest  Array into which cell values are written. Must have size of at least WORD_SIZE.
+ */
+static inline void readBlockReverse(const Block block, int* const dest) {
+    int score = block.score;
+    Word mask = HIGH_BIT_MASK;
+    for (int i = 0; i < WORD_SIZE - 1; i++) {
+        dest[i] = score;
+        if (block.P & mask) score--;
+        if (block.M & mask) score++;
+        mask >>= 1;
+    }
+    dest[WORD_SIZE - 1] = score;
+}
+
+/**
+ * @param [in] block
+ * @param [in] k
+ * @return True if all cells in block have value larger than k, otherwise false.
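+ * Used by the Ukkonen band-reduction passes: a block whose cells all exceed k cannot be part
+ * of any solution with score <= k, so it can safely be dropped from the band.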
+ */
+static inline bool allBlockCellsLarger(const Block block, const int k) {
+    vector<int> scores = getBlockCellValues(block);
+    for (int i = 0; i < WORD_SIZE; i++) {
+        if (scores[i] <= k) return false;
+    }
+    return true;
+}
+
+
+/**
+ * Uses Myers' bit-vector algorithm to find the edit distance for one of the semi-global alignment methods.
+ * @param [in] Peq  Query profile.
+ * @param [in] W  Size of padding in last block.
+ *                TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
+ *                           TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] k
+ * @param [in] mode  EDLIB_MODE_HW or EDLIB_MODE_SHW
+ * @param [out] bestScore_  Edit distance.
+ * @param [out] positions_  Array of 0-indexed positions in target at which the best score was found.
+ *                          Make sure to free this array with free().
+ * @param [out] numPositions_  Number of positions in the positions_ array.
+ * @return Status.
+ */
+static int myersCalcEditDistanceSemiGlobal(
+        const Word* const Peq, const int W, const int maxNumBlocks,
+        const int queryLength,
+        const unsigned char* const target, const int targetLength,
+        int k, const EdlibAlignMode mode,
+        int* const bestScore_, int** const positions_, int* const numPositions_) {
+    *positions_ = NULL;
+    *numPositions_ = 0;
+
+    // firstBlock is the 0-based index of the first block in the Ukkonen band.
+    // lastBlock is the 0-based index of the last block in the Ukkonen band.
+    int firstBlock = 0;
+    int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1;  // y in Myers
+    Block *bl;  // Current block
+
+    Block* blocks = new Block[maxNumBlocks];
+
+    // For HW, the solution will never be larger than queryLength.
+    if (mode == EDLIB_MODE_HW) {
+        k = min(queryLength, k);
+    }
+
+    // Every STRONG_REDUCE_NUM columns, the band is reduced in a more expensive way.
+    // This gives a speed-up of about 2 times for small k.
+    const int STRONG_REDUCE_NUM = 2048;
+
+    // Initialize P, M and score
+    bl = blocks;
+    for (int b = 0; b <= lastBlock; b++) {
+        bl->score = (b + 1) * WORD_SIZE;
+        bl->P = static_cast<Word>(-1);  // All 1s
+        bl->M = static_cast<Word>(0);
+        bl++;
+    }
+
+    int bestScore = -1;
+    vector<int> positions;  // TODO: Maybe put this on the heap?
+    const Word PstartHout = mode == EDLIB_MODE_HW ? 0 : 1;  // If 0 then the gap before the query is not penalized;
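+    // (For HW the horizontal input at the top boundary is 0 in every column, so the boundary
+    //  row stays 0 and gaps before the query are free; for SHW/NW it is +1, so the boundary
+    //  row grows by one per column and such gaps are penalized.)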
+    const unsigned char* targetChar = target;
+    for (int c = 0; c < targetLength; c++) {  // for each column
+        const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks;
+
+        //----------------------- Calculate column -------------------------//
+        Word Phout = PstartHout;
+        Word Mhout = 0;
+        int hout = 0;
+        bl = blocks + firstBlock;
+        Peq_c += firstBlock;
+        for (int b = firstBlock; b <= lastBlock; b++) {
+            pair<Word, Word> PMhout = calculateBlock(bl->P, bl->M, *Peq_c, Phout, Mhout, bl->P, bl->M);
+            Phout = PMhout.first;
+            Mhout = PMhout.second;
+            hout = Phout - Mhout;
+            bl->score += hout;
+            bl++; Peq_c++;
+        }
+        bl--; Peq_c--;
+        //------------------------------------------------------------------//
+
+        //---------- Adjust number of blocks according to Ukkonen ----------//
+        if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k)  // bl is pointing to last block
+            && ((*(Peq_c + 1) & WORD_1) || hout < 0)) {  // Peq_c is pointing to last block
+            // If the score of the left block is not too big, calculate one more block
+            lastBlock++; bl++; Peq_c++;
+            bl->P = static_cast<Word>(-1);  // All 1s
+            bl->M = static_cast<Word>(0);
+            pair<Word, Word> PMhout = calculateBlock(bl->P, bl->M, *Peq_c, Phout, Mhout, bl->P, bl->M);
+            bl->score = (bl - 1)->score - hout + WORD_SIZE + PMhout.first - PMhout.second;
+        } else {
+            while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+
+        // Every STRONG_REDUCE_NUM columns, do a more expensive but also more effective block reduction.
+        // This is important!
+        //
+        // Reduce the band by decreasing the last block if possible.
+        if (c % STRONG_REDUCE_NUM == 0) {
+            while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) {
+                lastBlock--; bl--; Peq_c--;
+            }
+        }
+        // For HW, even if all cells are > k, there may still be a solution in the next
+        // column because the starting conditions at the upper boundary are 0.
+        // That means that the first block is always a candidate for a solution,
+        // and we can never end the calculation before the last column.
+        if (mode == EDLIB_MODE_HW && lastBlock == -1) {
+            lastBlock++; bl++; Peq_c++;
+        }
+
+        // Reduce the band by increasing the first block if possible. Not applicable to HW.
+        if (mode != EDLIB_MODE_HW) {
+            while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) {
+                firstBlock++;
+            }
+            if (c % STRONG_REDUCE_NUM == 0) {  // Do strong reduction every STRONG_REDUCE_NUM columns
+                while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) {
+                    firstBlock++;
+                }
+            }
+        }
+
+        // If the band stops existing, finish.
+        if (lastBlock < firstBlock) {
+            *bestScore_ = bestScore;
+            if (bestScore != -1) {
+                *positions_ = static_cast<int*>(malloc(sizeof(int) * static_cast<size_t>(positions.size())));
+                *numPositions_ = static_cast<int>(positions.size());
+                copy(positions.begin(), positions.end(), *positions_);
+            }
+            delete[] blocks;
+            return EDLIB_STATUS_OK;
+        }
+        //------------------------------------------------------------------//
+
+        //------------------------- Update best score ----------------------//
+        if (lastBlock == maxNumBlocks - 1) {
+            int colScore = bl->score;
+            if (colScore <= k) {  // Scores > k don't have correct values (so we cannot use them), but are certainly > k.
+                // NOTE: The score that I find in column c is actually the score from column c - W.
+                if (bestScore == -1 || colScore <= bestScore) {
+                    if (colScore != bestScore) {
+                        positions.clear();
+                        bestScore = colScore;
+                        // Change k so we will look only for equal or better
+                        // scores than the best found so far.
+                        k = bestScore;
+                    }
+                    positions.push_back(c - W);
+                }
+            }
+        }
+        //------------------------------------------------------------------//
+
+        targetChar++;
+    }
+
+
+    // Obtain results for the last W columns from the last column.
+    if (lastBlock == maxNumBlocks - 1) {
+        vector<int> blockScores = getBlockCellValues(*bl);
+        for (int i = 0; i < W; i++) {
+            int colScore = blockScores[i + 1];
+            if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) {
+                if (colScore != bestScore) {
+                    positions.clear();
+                    k = bestScore = colScore;
+                }
+                positions.push_back(targetLength - W + i);
+            }
+        }
+    }
+
+    *bestScore_ = bestScore;
+    if (bestScore != -1) {
+        *positions_ = static_cast<int*>(malloc(sizeof(int) * static_cast<size_t>(positions.size())));
+        *numPositions_ = static_cast<int>(positions.size());
+        copy(positions.begin(), positions.end(), *positions_);
+    }
+
+    delete[] blocks;
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Uses Myers' bit-vector algorithm to find the edit distance for the global (NW) alignment method.
+ * @param [in] Peq  Query profile.
+ * @param [in] W  Size of padding in last block.
+ *                TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] maxNumBlocks  Number of blocks needed to cover the whole query.
+ *                           TODO: Calculate this directly from query, instead of passing it.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] targetLength
+ * @param [in] k
+ * @param [out] bestScore_  Edit distance.
+ * @param [out] position_  0-indexed position in target at which the best score was found.
+ * @param [in] findAlignment  If true, the whole matrix is remembered and alignment data is returned.
+ *                            A quadratic amount of memory is consumed.
+ * @param [out] alignData  Data needed for alignment traceback (for reconstruction of alignment).
+ *                         Set only if findAlignment is set to true, otherwise it is NULL.
+ *                         Make sure to free this array using delete[].
+ * @param [out] targetStopPosition  If set to -1, the whole calculation is performed normally, as expected.
+ *                                  If set to p, the calculation is performed up to position p in target (inclusive)
+ *                                  and column p is returned as the only column in alignData.
+ * @return Status.
+ */
+static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks,
+                                   const int queryLength,
+                                   const unsigned char* const target, const int targetLength,
+                                   int k, int* const bestScore_,
+                                   int* const position_, const bool findAlignment,
+                                   AlignmentData** const alignData, const int targetStopPosition) {
+    if (targetStopPosition > -1 && findAlignment) {
+        // They cannot both be set at the same time!
+        return EDLIB_STATUS_ERROR;
+    }
+
+    // Every STRONG_REDUCE_NUM columns, the band is reduced in a more expensive way.
+    const int STRONG_REDUCE_NUM = 2048;  // TODO: Choose this number dynamically (based on query and target lengths?), so it does not affect the speed of computation
+
+    if (k < abs(targetLength - queryLength)) {
+        *bestScore_ = *position_ = -1;
+        return EDLIB_STATUS_OK;
+    }
+
+    k = min(k, max(queryLength, targetLength));  // Upper bound for k
+
+    // firstBlock is the 0-based index of the first block in the Ukkonen band.
+    // lastBlock is the 0-based index of the last block in the Ukkonen band.
+    int firstBlock = 0;
+    // This is optimal now, by my formula.
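+    // (A sketch of why: reaching row i of column 0 costs at least i, and completing the
+    //  alignment from there costs at least i + targetLength - queryLength more, so a useful
+    //  cell must satisfy 2*i + targetLength - queryLength <= k, i.e.
+    //  i <= (k + queryLength - targetLength) / 2 -- hence the formula below.)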
+    int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
+    Block* bl;  // Current block
+
+    Block* blocks = new Block[maxNumBlocks];
+
+    // Initialize P, M and score
+    bl = blocks;
+    for (int b = 0; b <= lastBlock; b++) {
+        bl->score = (b + 1) * WORD_SIZE;
+        bl->P = static_cast<Word>(-1);  // All 1s
+        bl->M = static_cast<Word>(0);
+        bl++;
+    }
+
+    // If we want to find the alignment, we have to store the needed data.
+    if (findAlignment)
+        *alignData = new AlignmentData(maxNumBlocks, targetLength);
+    else if (targetStopPosition > -1)
+        *alignData = new AlignmentData(maxNumBlocks, 1);
+    else
+        *alignData = NULL;
+
+    const unsigned char* targetChar = target;
+    for (int c = 0; c < targetLength; c++) {  // for each column
+        const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
+
+        //----------------------- Calculate column -------------------------//
+        int hout = 1;
+        Word Phout = 1;
+        Word Mhout = 0;
+        bl = blocks + firstBlock;
+        for (int b = firstBlock; b <= lastBlock; b++) {
+            pair<Word, Word> PMhout = calculateBlock(bl->P, bl->M, Peq_c[b], Phout, Mhout, bl->P, bl->M);
+            Phout = PMhout.first;
+            Mhout = PMhout.second;
+            hout = Phout - Mhout;
+            bl->score += hout;
+            bl++;
+        }
+        bl--;
+        //------------------------------------------------------------------//
+        // bl now points to the last block
+
+        // Update k. I do it only at the end of the column because it would slow the calculation too much otherwise.
+        // NOTICE: I add W when in the last block because it is actually the result from W cells to the left and W cells up.
+        k = min(k, bl->score
+                + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
+                + (lastBlock == maxNumBlocks - 1 ? W : 0));
+
+        //---------- Adjust number of blocks according to Ukkonen ----------//
+        //--- Adjust last block ---//
+        // If the block is not beneath the band, calculate the next block. Only the next, because others are certainly beneath the band.
+        if (lastBlock + 1 < maxNumBlocks
+            && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied by the block above as well!
+                 ((lastBlock + 1) * WORD_SIZE - 1
+                  > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
+            lastBlock++; bl++;
+            bl->P = static_cast<Word>(-1);  // All 1s
+            bl->M = static_cast<Word>(0);
+            pair<Word, Word> PMhout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], Phout, Mhout, bl->P, bl->M);
+            Phout = PMhout.first;
+            Mhout = PMhout.second;
+            int newHout = Phout - Mhout;
+            bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout;
+            hout = newHout;
+        }
+
+        // While the block is out of the band, move one block up.
+        // NOTE: The condition used here is looser than the one from the article, since I simplified the max() part of it.
+        // I could consider adding that max part, for optimal performance.
+        while (lastBlock >= firstBlock
+               && (bl->score >= k + WORD_SIZE
+                   || ((lastBlock + 1) * WORD_SIZE - 1 >
+                       // TODO: Does not work if I do not put +1! Why???
+                       k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
+            lastBlock--; bl--;
+        }
+        //-------------------------//
+
+        //--- Adjust first block ---//
+        // While outside of the band, advance the block
+        while (firstBlock <= lastBlock
+               && (blocks[firstBlock].score >= k + WORD_SIZE
+                   || ((firstBlock + 1) * WORD_SIZE - 1 <
+                       blocks[firstBlock].score - k - targetLength + queryLength + c))) {
+            firstBlock++;
+        }
+        //--------------------------/
+
+
+        // TODO: consider if this part is useful, it does not seem to help much
+        if (c % STRONG_REDUCE_NUM == 0) {  // Every STRONG_REDUCE_NUM columns do a more expensive but more effective reduction
+            while (lastBlock >= firstBlock) {
+                // If all cells are outside of the band, remove the block
+                vector<int> scores = getBlockCellValues(*bl);
+                int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
+                int r = lastBlock * WORD_SIZE + numCells - 1;
+                bool reduce = true;
+                for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
+                    // TODO: Does not work if I do not put +1! Why???
+                    if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) {
+                        reduce = false;
+                        break;
+                    }
+                    r--;
+                }
+                if (!reduce) break;
+                lastBlock--; bl--;
+            }
+
+            while (firstBlock <= lastBlock) {
+                // If all cells are outside of the band, remove the block
+                vector<int> scores = getBlockCellValues(blocks[firstBlock]);
+                int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
+                int r = firstBlock * WORD_SIZE + numCells - 1;
+                bool reduce = true;
+                for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
+                    if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
+                        reduce = false;
+                        break;
+                    }
+                    r--;
+                }
+                if (!reduce) break;
+                firstBlock++;
+            }
+        }
+
+
+        // If the band stops existing, finish.
+        if (lastBlock < firstBlock) {
+            *bestScore_ = *position_ = -1;
+            delete[] blocks;
+            return EDLIB_STATUS_OK;
+        }
+        //------------------------------------------------------------------//
+
+
+        //---- Save column so it can be used for reconstruction ----//
+        if (findAlignment && c < targetLength) {
+            bl = blocks + firstBlock;
+            for (int b = firstBlock; b <= lastBlock; b++) {
+                (*alignData)->Ps[maxNumBlocks * c + b] = bl->P;
+                (*alignData)->Ms[maxNumBlocks * c + b] = bl->M;
+                (*alignData)->scores[maxNumBlocks * c + b] = bl->score;
+                (*alignData)->firstBlocks[c] = firstBlock;
+                (*alignData)->lastBlocks[c] = lastBlock;
+                bl++;
+            }
+        }
+        //----------------------------------------------------------//
+        //---- If this is the stop column, save it and finish ----//
+        if (c == targetStopPosition) {
+            for (int b = firstBlock; b <= lastBlock; b++) {
+                (*alignData)->Ps[b] = (blocks + b)->P;
+                (*alignData)->Ms[b] = (blocks + b)->M;
+                (*alignData)->scores[b] = (blocks + b)->score;
+                (*alignData)->firstBlocks[0] = firstBlock;
+                (*alignData)->lastBlocks[0] = lastBlock;
+            }
+            *bestScore_ = -1;
+            *position_ = targetStopPosition;
+            delete[] blocks;
+            return EDLIB_STATUS_OK;
+        }
+        //----------------------------------------------------//
+
+        targetChar++;
+    }
+
+    if (lastBlock == maxNumBlocks - 1) {  // If the last block of the last column was calculated
+        // Obtain the best score from the block -> it is complicated because the query is padded with W cells
+        int bestScore = getBlockCellValues(blocks[lastBlock])[W];
+        if (bestScore <= k) {
+            *bestScore_ = bestScore;
+            *position_ = targetLength - 1;
+            delete[] blocks;
+            return EDLIB_STATUS_OK;
+        }
+    }
+
+    *bestScore_ = *position_ = -1;
+    delete[] blocks;
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Finds one possible alignment that gives the optimal score by moving back through
+ * the dynamic programming matrix that is stored in alignData.
+ * Consumes a large amount of memory: O(queryLength * targetLength).
+ * @param [in] queryLength  Normal length, without W.
+ * @param [in] targetLength  Normal length, without W.
+ * @param [in] bestScore  Best score.
+ * @param [in] alignData  Data obtained while finding the best score that is useful for finding the alignment.
+ * @param [out] alignment  Alignment.
+ * @param [out] alignmentLength  Length of alignment.
+ * @return Status code.
+ */
+static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
+                                    const int bestScore, const AlignmentData* const alignData,
+                                    unsigned char** const alignment, int* const alignmentLength) {
+    const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    const int W = maxNumBlocks * WORD_SIZE - queryLength;
+
+    *alignment = static_cast<unsigned char*>(malloc((queryLength + targetLength - 1) * sizeof(unsigned char)));
+    *alignmentLength = 0;
+    int c = targetLength - 1;  // index of column
+    int b = maxNumBlocks - 1;  // index of block in column
+    int currScore = bestScore;  // Score of current cell
+    int lScore = -1;  // Score of left cell
+    int uScore = -1;  // Score of upper cell
+    int ulScore = -1;  // Score of upper left cell
+    Word currP = alignData->Ps[c * maxNumBlocks + b];  // P of current block
+    Word currM = alignData->Ms[c * maxNumBlocks + b];  // M of current block
+    // True if the block to the left exists and is in the band
+    bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
+    // We set the initial values of lP and lM to 0 only to avoid compiler warnings; they should not affect the
+    // calculation as both lP and lM should be initialized at some later moment (but the compiler cannot
+    // detect it since this initialization is guaranteed by "business" logic).
+    Word lP = 0, lM = 0;
+    if (thereIsLeftBlock) {
+        lP = alignData->Ps[(c - 1) * maxNumBlocks + b];  // P of block to the left
+        lM = alignData->Ms[(c - 1) * maxNumBlocks + b];  // M of block to the left
+    }
+    currP <<= W;
+    currM <<= W;
+    int blockPos = WORD_SIZE - W - 1;  // 0-based index of current cell in block
+
+    // TODO(martin): refactor this whole piece of code. There are too many if-else statements,
+    // it is too easy for a bug to hide and too hard to effectively cover all the edge cases.
+    // We need better separation of logic and responsibilities.
+    while (true) {
+        if (c == 0) {
+            thereIsLeftBlock = true;
+            lScore = b * WORD_SIZE + blockPos + 1;
+            ulScore = lScore - 1;
+        }
+
+        // TODO: improvement: calculate only those cells that are needed,
+        // for example if I calculate the upper cell and can move up,
+        // there is no need to calculate the left and upper left cell
+        //---------- Calculate scores ---------//
+        if (lScore == -1 && thereIsLeftBlock) {
+            lScore = alignData->scores[(c - 1) * maxNumBlocks + b];  // score of block to the left
+            for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) {
+                if (lP & HIGH_BIT_MASK) lScore--;
+                if (lM & HIGH_BIT_MASK) lScore++;
+                lP <<= 1;
+                lM <<= 1;
+            }
+        }
+        if (ulScore == -1) {
+            if (lScore != -1) {
+                ulScore = lScore;
+                if (lP & HIGH_BIT_MASK) ulScore--;
+                if (lM & HIGH_BIT_MASK) ulScore++;
+            }
+            else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) {
+                // This is the case when the upper left cell is the last cell in the block,
+                // and the block to the left is not in the band, so lScore is -1.
+                ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1];
+            }
+        }
+        if (uScore == -1) {
+            uScore = currScore;
+            if (currP & HIGH_BIT_MASK) uScore--;
+            if (currM & HIGH_BIT_MASK) uScore++;
+            currP <<= 1;
+            currM <<= 1;
+        }
+        //-------------------------------------//
+
+        // TODO: should I check if there is an upper block?
+
+        //-------------- Move --------------//
+        // Move up - insertion to target - deletion from query
+        if (uScore != -1 && uScore + 1 == currScore) {
+            currScore = uScore;
+            lScore = ulScore;
+            uScore = ulScore = -1;
+            if (blockPos == 0) {  // If entering new (upper) block
+                if (b == 0) {  // If there are no cells above (only boundary cells)
+                    (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;  // Move up
+                    for (int i = 0; i < c + 1; i++)  // Move left until the end
+                        (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+                    break;
+                } else {
+                    blockPos = WORD_SIZE - 1;
+                    b--;
+                    currP = alignData->Ps[c * maxNumBlocks + b];
+                    currM = alignData->Ms[c * maxNumBlocks + b];
+                    if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+                        thereIsLeftBlock = true;
+                        lP = alignData->Ps[(c - 1) * maxNumBlocks + b];  // TODO: improve this, too many operations
+                        lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+                    } else {
+                        thereIsLeftBlock = false;
+                        // TODO(martin): There may not be a left block, but there can be a left boundary - do we
+                        // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this.
+                    }
+                }
+            } else {
+                blockPos--;
+                lP <<= 1;
+                lM <<= 1;
+            }
+            // Mark move
+            (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+        }
+        // Move left - deletion from target - insertion to query
+        else if (lScore != -1 && lScore + 1 == currScore) {
+            currScore = lScore;
+            uScore = ulScore;
+            lScore = ulScore = -1;
+            c--;
+            if (c == -1) {  // If there are no cells to the left (only boundary cells)
+                (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;  // Move left
+                int numUp = b * WORD_SIZE + blockPos + 1;
+                for (int i = 0; i < numUp; i++)  // Move up until the end
+                    (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+                break;
+            }
+            currP = lP;
+            currM = lM;
+            if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+                thereIsLeftBlock = true;
+                lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
+                lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+            } else {
+                if (c == 0) {  // If there are no cells to the left (only boundary cells)
+                    thereIsLeftBlock = true;
+                    lScore = b * WORD_SIZE + blockPos + 1;
+                    ulScore = lScore - 1;
+                } else {
+                    thereIsLeftBlock = false;
+                }
+            }
+            // Mark move
+            (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+        }
+        // Move up left - (mis)match
+        else if (ulScore != -1) {
+            unsigned char moveCode = ulScore == currScore ? EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH;
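+            // (A diagonal move costs 0 for a match and 1 for a mismatch, so an unchanged
+            //  score relative to the upper-left cell identifies a match.)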
+            currScore = ulScore;
+            uScore = lScore = ulScore = -1;
+            c--;
+            if (c == -1) {  // If there are no cells to the left (only boundary cells)
+                (*alignment)[(*alignmentLength)++] = moveCode;  // Move up left
+                int numUp = b * WORD_SIZE + blockPos;
+                for (int i = 0; i < numUp; i++)  // Move up until the end
+                    (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
+                break;
+            }
+            if (blockPos == 0) {  // If entering upper left block
+                if (b == 0) {  // If there are no more cells above (only boundary cells)
+                    (*alignment)[(*alignmentLength)++] = moveCode;  // Move up left
+                    for (int i = 0; i < c + 1; i++)  // Move left until the end
+                        (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
+                    break;
+                }
+                blockPos = WORD_SIZE - 1;
+                b--;
+                currP = alignData->Ps[c * maxNumBlocks + b];
+                currM = alignData->Ms[c * maxNumBlocks + b];
+            } else {  // If entering left block
+                blockPos--;
+                currP = lP;
+                currM = lM;
+                currP <<= 1;
+                currM <<= 1;
+            }
+            // Set new left block
+            if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
+                thereIsLeftBlock = true;
+                lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
+                lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
+            } else {
+                if (c == 0) {  // If there are no cells to the left (only boundary cells)
+                    thereIsLeftBlock = true;
+                    lScore = b * WORD_SIZE + blockPos + 1;
+                    ulScore = lScore - 1;
+                } else {
+                    thereIsLeftBlock = false;
+                }
+            }
+            // Mark move
+            (*alignment)[(*alignmentLength)++] = moveCode;
+        } else {
+            // Reached the end - finished!
+            break;
+        }
+        //----------------------------------//
+    }
+
+    *alignment = static_cast<unsigned char*>(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)));
+    reverse(*alignment, *alignment + (*alignmentLength));
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Finds one possible alignment that gives the optimal score (bestScore).
+ * It splits the problem into smaller problems using Hirschberg's algorithm and, when they are small enough,
+ * solves them using the traceback algorithm.
+ * @param [in] query
+ * @param [in] rQuery  Reversed query.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] rTarget  Reversed target.
+ * @param [in] targetLength
+ * @param [in] equalityDefinition
+ * @param [in] alphabetLength
+ * @param [in] bestScore  Best (optimal) score.
+ * @param [out] alignment  Sequence of edit operations that make target equal to query.
+ * @param [out] alignmentLength  Length of alignment.
+ * @return Status code.
+ */
+static int obtainAlignment(
+        const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
+        const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
+        const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
+        unsigned char** const alignment, int* const alignmentLength) {
+
+    // Handle the special case when one of the sequences has length 0.
+    if (queryLength == 0 || targetLength == 0) {
+        *alignmentLength = targetLength + queryLength;
+        *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
+        for (int i = 0; i < *alignmentLength; i++) {
+            (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT;
+        }
+        return EDLIB_STATUS_OK;
+    }
+
+    const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    const int W = maxNumBlocks * WORD_SIZE - queryLength;
+    int statusCode;
+
+    // TODO: think about reducing the number of memory allocations in the alignment functions, probably
+    // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg,
+    // and it could also be done for alignments - we could have one big array for the alignment that would be
+    // sparsely populated by each of the steps in the recursion, and at the end we would just consolidate those results.
+
+    // If the estimated memory consumption for the traceback algorithm is smaller than 1MB, use it;
+    // otherwise use Hirschberg's algorithm. By running a few tests I chose a boundary of 1MB as optimal.
+    long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength
+                                  + 2ll * sizeof(int) * targetLength;
+    if (alignmentDataSize < 1024 * 1024) {
+        int score_, endLocation_;  // Used only to call the function.
+        AlignmentData* alignData = NULL;
+        Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
+        myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
+                                queryLength,
+                                target, targetLength,
+                                bestScore,
+                                &score_, &endLocation_, true, &alignData, -1);
+        //assert(score_ == bestScore);
+        //assert(endLocation_ == targetLength - 1);
+
+        statusCode = obtainAlignmentTraceback(queryLength, targetLength,
+                                              bestScore, alignData, alignment, alignmentLength);
+        delete alignData;
+        delete[] Peq;
+    } else {
+        statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength,
+                                               target, rTarget, targetLength,
+                                               equalityDefinition, alphabetLength, bestScore,
+                                               alignment, alignmentLength);
+    }
+    return statusCode;
+}
+
+
+/**
+ * Finds one possible alignment that gives the optimal score (bestScore).
+ * Uses Hirschberg's algorithm to split the problem into two sub-problems, solve them, and combine them together.
+ * @param [in] query
+ * @param [in] rQuery  Reversed query.
+ * @param [in] queryLength
+ * @param [in] target
+ * @param [in] rTarget  Reversed target.
+ * @param [in] targetLength
+ * @param [in] alphabetLength
+ * @param [in] bestScore  Best (optimal) score.
+ * @param [out] alignment  Sequence of edit operations that make target equal to query.
+ * @param [out] alignmentLength  Length of alignment.
+ * @return Status code.
+ */
+static int obtainAlignmentHirschberg(
+        const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
+        const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
+        const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
+        unsigned char** const alignment, int* const alignmentLength) {
+
+    const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
+    const int W = maxNumBlocks * WORD_SIZE - queryLength;
+
+    Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
+    Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);
+
+    // Used only to call functions.
+    int score_, endLocation_;
+
+    // Divide the dynamic matrix into two halves, left and right.
+    const int leftHalfWidth = targetLength / 2;
+    const int rightHalfWidth = targetLength - leftHalfWidth;
+
+    // Calculate left half.
+    AlignmentData* alignDataLeftHalf = NULL;
+    int leftHalfCalcStatus = myersCalcEditDistanceNW(
+        Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore,
+        &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1);
+
+    // Calculate right half.
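+    // (The right half is computed on the reversed query and target, so its single saved column
+    //  holds, for each query row, the cost of aligning the remaining query suffix against the
+    //  remaining target suffix; readBlockReverse() restores forward row order when unwrapping.)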
+    AlignmentData* alignDataRightHalf = NULL;
+    int rightHalfCalcStatus = myersCalcEditDistanceNW(
+        rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore,
+        &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1);
+
+    delete[] Peq;
+    delete[] rPeq;
+
+    if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) {
+        if (alignDataLeftHalf) delete alignDataLeftHalf;
+        if (alignDataRightHalf) delete alignDataRightHalf;
+        return EDLIB_STATUS_ERROR;
+    }
+
+    // Unwrap the left half.
+    int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
+    int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
+    // TODO: avoid this allocation by using some shared array?
+    // scoresLeft contains scores from the left column, starting with the scoresLeftStartIdx row (query index)
+    // and ending with the scoresLeftEndIdx row (0-indexed).
+    int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE;
+    int* scoresLeft = new int[scoresLeftLength];
+    for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) {
+        Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx],
+                    alignDataLeftHalf->scores[blockIdx]);
+        readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE);
+    }
+    int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE;
+    // If the last block contains padding, shorten the length of scores by the length of the padding.
+    if (lastBlockIdxLeft == maxNumBlocks - 1) {
+        scoresLeftLength -= W;
+    }
+
+    // Unwrap the right half (it is also reversed while unwrapping).
+    int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0];
+    int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0];
+    int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE;
+    int* scoresRight = new int[scoresRightLength];
+    int* scoresRightOriginalStart = scoresRight;
+    for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) {
+        Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx],
+                    alignDataRightHalf->scores[blockIdx]);
+        readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE);
+    }
+    int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE;
+    // If there is padding at the beginning of scoresRight (which can happen because of the reversing that we do),
+    // move the pointer forward to remove the padding (that is why we remember originalStart).
+    if (scoresRightStartIdx < 0) {
+        //assert(scoresRightStartIdx == -1 * W);
+        scoresRight += W;
+        scoresRightStartIdx += W;
+        scoresRightLength -= W;
+    }
+
+    delete alignDataLeftHalf;
+    delete alignDataRightHalf;
+
+    //--------------------- Find the best move ----------------//
+    // Find the query/row index of the cell in the left column which, together with its lower right neighbour
+    // from the right column, gives the best score (when summed). We also have to consider boundary cells
+    // (those cells at -1 indexes).
+    //  x|
+    //  -+-
+    //   |x
+    int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1);
+    int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1,
+                              scoresRightStartIdx + scoresRightLength - 2);
+    int leftScore = -1, rightScore = -1;
+    int queryIdxLeftAlignment = -1;  // Query/row index of cell in left column where alignment is passing through.
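+    // (Any optimal path must cross from the left half into the right half at some row: the row
+    //  where the prefix cost from scoresLeft plus the suffix cost from scoresRight, offset by one
+    //  row for the crossing step, sums to bestScore. The loop below scans the overlap of the two
+    //  score ranges for such a row.)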
+    bool queryIdxLeftAlignmentFound = false;
+    for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) {
+        leftScore = scoresLeft[queryIdx - scoresLeftStartIdx];
+        rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx];
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = queryIdx;
+            queryIdxLeftAlignmentFound = true;
+            break;
+        }
+    }
+    // Check boundary cells.
+    if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) {
+        leftScore = leftHalfWidth;
+        rightScore = scoresRight[0];
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = -1;
+            queryIdxLeftAlignmentFound = true;
+        }
+    }
+    if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength
+        && scoresRightStartIdx + scoresRightLength == queryLength) {
+        leftScore = scoresLeft[scoresLeftLength - 1];
+        rightScore = rightHalfWidth;
+        if (leftScore + rightScore == bestScore) {
+            queryIdxLeftAlignment = queryLength - 1;
+            queryIdxLeftAlignmentFound = true;
+        }
+    }
+
+    delete[] scoresLeft;
+    delete[] scoresRightOriginalStart;
+
+    if (queryIdxLeftAlignmentFound == false) {
+        // If there was no move that is part of an optimal alignment, then there is no such alignment,
+        // or the given bestScore is not correct!
+        return EDLIB_STATUS_ERROR;
+    }
+    //----------------------------------------------------------//
+
+    // Calculate alignments for the upper half of the left half (upper left - ul)
+    // and the lower half of the right half (lower right - lr).
+    const int ulHeight = queryIdxLeftAlignment + 1;
+    const int lrHeight = queryLength - ulHeight;
+    const int ulWidth = leftHalfWidth;
+    const int lrWidth = rightHalfWidth;
+    unsigned char* ulAlignment = NULL; int ulAlignmentLength;
+    int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
+                                       target, rTarget + lrWidth, ulWidth,
+                                       equalityDefinition, alphabetLength, leftScore,
+                                       &ulAlignment, &ulAlignmentLength);
+    unsigned char* lrAlignment = NULL; int lrAlignmentLength;
+    int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
+                                       target + ulWidth, rTarget, lrWidth,
+                                       equalityDefinition, alphabetLength, rightScore,
+                                       &lrAlignment, &lrAlignmentLength);
+    if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
+        if (ulAlignment) free(ulAlignment);
+        if (lrAlignment) free(lrAlignment);
+        return EDLIB_STATUS_ERROR;
+    }
+
+    // Build the alignment by concatenating the upper left alignment with the lower right alignment.
+    *alignmentLength = ulAlignmentLength + lrAlignmentLength;
+    *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
+    memcpy(*alignment, ulAlignment, ulAlignmentLength);
+    memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength);
+
+    free(ulAlignment);
+    free(lrAlignment);
+    return EDLIB_STATUS_OK;
+}
+
+
+/**
+ * Takes a char query and char target, recognizes the alphabet, and transforms them into unsigned char sequences
+ * in which the elements are no longer letters of the alphabet but their indices in the alphabet.
+ * Most of the internal edlib functions expect such transformed sequences.
+ * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done.
+ * Example:
+ *   Original sequences: "ACT" and "CGT".
+ *   Alphabet would be recognized as "ACTG". Alphabet length = 4.
+ *   Transformed sequences: [0, 1, 2] and [1, 3, 2].
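+ * (The alphabet is built in order of first appearance, scanning the query first and then the
+ * target, which is why G receives index 3 in this example.)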
+ * @param [in] queryOriginal
+ * @param [in] queryLength
+ * @param [in] targetOriginal
+ * @param [in] targetLength
+ * @param [out] queryTransformed  It will contain values in range [0, alphabet length - 1].
+ * @param [out] targetTransformed  It will contain values in range [0, alphabet length - 1].
+ * @return Alphabet as a string of unique characters, where the index of each character is its value in the
+ *         transformed sequences.
+ */
+static string transformSequences(const char* const queryOriginal, const int queryLength,
+                                 const char* const targetOriginal, const int targetLength,
+                                 unsigned char** const queryTransformed,
+                                 unsigned char** const targetTransformed) {
+    // The alphabet is constructed from the letters that are present in the sequences.
+    // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
+    // and a new query and target are created in which letters are replaced with their ordinal numbers.
+    // This query and target are used in all the calculations later.
+    *queryTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * queryLength));
+    *targetTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * targetLength));
+
+    string alphabet = "";
+
+    // Alphabet information; it is constructed on the fly while transforming the sequences.
+    // letterIdx[c] is the index of letter c in the alphabet.
+    unsigned char letterIdx[MAX_UCHAR + 1];
+    bool inAlphabet[MAX_UCHAR + 1];  // inAlphabet[c] is true if c is in the alphabet
+    for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false;
+
+    for (int i = 0; i < queryLength; i++) {
+        unsigned char c = static_cast<unsigned char>(queryOriginal[i]);
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = static_cast<unsigned char>(alphabet.size());
+            alphabet += queryOriginal[i];
+        }
+        (*queryTransformed)[i] = letterIdx[c];
+    }
+    for (int i = 0; i < targetLength; i++) {
+        unsigned char c = static_cast<unsigned char>(targetOriginal[i]);
+        if (!inAlphabet[c]) {
+            inAlphabet[c] = true;
+            letterIdx[c] = static_cast<unsigned char>(alphabet.size());
+            alphabet += targetOriginal[i];
+        }
+        (*targetTransformed)[i] = letterIdx[c];
+    }
+
+    return alphabet;
+}
+
+
+extern "C" EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
+                                                const EdlibEqualityPair* additionalEqualities,
+                                                int additionalEqualitiesLength) {
+    EdlibAlignConfig config;
+    config.k = k;
+    config.mode = mode;
+    config.task = task;
+    config.additionalEqualities = additionalEqualities;
+    config.additionalEqualitiesLength = additionalEqualitiesLength;
+    return config;
+}
+
+extern "C" EdlibAlignConfig edlibDefaultAlignConfig(void) {
+    return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0);
+}
+
+extern "C" void edlibFreeAlignResult(EdlibAlignResult result) {
+    if (result.endLocations) free(result.endLocations);
+    if (result.startLocations) free(result.startLocations);
+    if (result.alignment) free(result.alignment);
+}
\ No newline at end of file
diff --git a/dysgu/edlib/src/edlib.h b/dysgu/edlib/src/edlib.h
new file mode 100644
index 0000000..284331b
--- /dev/null
+++ b/dysgu/edlib/src/edlib.h
@@ -0,0 +1,277 @@
+#ifndef EDLIB_H
+#define EDLIB_H
+
+/**
+ * @file
+ * @author  Martin Sosic
+ * @brief Main header file, containing all public functions and structures.
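+ *
+ * Typical usage is a three-step sequence (a minimal sketch; the sequences below are illustrative):
+ *
+ *     EdlibAlignResult result = edlibAlign("ACT", 3, "CGACTGAC", 8,
+ *                                          edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH, NULL, 0));
+ *     if (result.status == EDLIB_STATUS_OK) {
+ *         char* cigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_EXTENDED);
+ *         // ... use result.editDistance, result.startLocations, result.endLocations, cigar ...
+ *         free(cigar);
+ *     }
+ *     edlibFreeAlignResult(result);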
+ */
+
+// Define EDLIB_API macro to properly export symbols
+#ifdef EDLIB_SHARED
+#    ifdef _WIN32
+#        ifdef EDLIB_BUILD
+#            define EDLIB_API __declspec(dllexport)
+#        else
+#            define EDLIB_API __declspec(dllimport)
+#        endif
+#    else
+#        define EDLIB_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define EDLIB_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Status codes
+#define EDLIB_STATUS_OK 0
+#define EDLIB_STATUS_ERROR 1
+
+    /**
+     * Alignment methods - how should Edlib treat gaps before and after the query?
+     */
+    typedef enum {
+        /**
+         * Global method. This is the standard method.
+         * Useful when you want to find out how similar the first sequence is to the second sequence.
+         */
+        EDLIB_MODE_NW,
+        /**
+         * Prefix method. Similar to the global method, but with a small twist - the gap at the query end is not
+         * penalized. What that means is that deleting elements from the end of the second sequence is "free"!
+         * For example, if we had "AACT" and "AACTGGC", the edit distance would be 0, because removing "GGC" from
+         * the end of the second sequence is "free" and does not count towards the total edit distance. This method
+         * is appropriate when you want to find out how well the first sequence fits at the beginning of the second
+         * sequence.
+         */
+        EDLIB_MODE_SHW,
+        /**
+         * Infix method. Similar to the prefix method, but with one more twist - gaps at both the query end and
+         * start are not penalized. What that means is that deleting elements from the start and end of the second
+         * sequence is "free"!
+         * For example, if we had ACT and CGACTGAC, the edit distance would be 0, because removing CG from the start
+         * and GAC from the end of the second sequence is "free" and does not count towards the total edit distance.
+         * This method is appropriate when you want to find out how well the first sequence fits at any part of
+         * the second sequence.
+         * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
+         * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
+         * that text. In bioinformatics, this method is appropriate for aligning a read to a sequence.
+         */
+        EDLIB_MODE_HW
+    } EdlibAlignMode;
+
+    /**
+     * Alignment tasks - what do you want Edlib to do?
+     */
+    typedef enum {
+        EDLIB_TASK_DISTANCE,  //!< Find edit distance and end locations.
+        EDLIB_TASK_LOC,       //!< Find edit distance, end locations and start locations.
+        EDLIB_TASK_PATH       //!< Find edit distance, end locations and start locations and alignment path.
+    } EdlibAlignTask;
+
+    /**
+     * Describes cigar format.
+     * @see http://samtools.github.io/hts-specs/SAMv1.pdf
+     * @see http://drive5.com/usearch/manual/cigar.html
+     */
+    typedef enum {
+        EDLIB_CIGAR_STANDARD,  //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
+        EDLIB_CIGAR_EXTENDED   //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
+    } EdlibCigarFormat;
+
+// Edit operations.
+#define EDLIB_EDOP_MATCH 0     //!< Match.
+#define EDLIB_EDOP_INSERT 1    //!< Insertion to target = deletion from query.
+#define EDLIB_EDOP_DELETE 2    //!< Deletion from target = insertion to query.
+#define EDLIB_EDOP_MISMATCH 3  //!< Mismatch.
+
+    /**
+     * @brief Defines two given characters as equal.
+     */
+    typedef struct {
+        char first;
+        char second;
+    } EdlibEqualityPair;
+
+    /**
+     * @brief Configuration object for the edlibAlign() function.
+     */
+    typedef struct {
+        /**
+         * Set k to a non-negative value to tell edlib that the edit distance is not larger than k.
+         * A smaller k can significantly improve the speed of computation.
+         * If the edit distance is larger than k, edlib will set the edit distance to -1.
+         * Set k to a negative value and edlib will internally auto-adjust k until the score is found.
+         */
+        int k;
+
+        /**
+         * Alignment method.
+         * EDLIB_MODE_NW: global (Needleman-Wunsch)
+         * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
+         * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
+         */
+        EdlibAlignMode mode;
+
+        /**
+         * Alignment task - tells Edlib what to calculate. The less there is to calculate, the faster it is.
+         * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
+         * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
+         * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
+         */
+        EdlibAlignTask task;
+
+        /**
+         * List of pairs of characters, where each pair defines two characters as equal.
+         * This way you can extend edlib's definition of equality (which is that each character is equal only
+         * to itself).
+         * This can be useful if you have some wildcard characters that should match multiple other characters,
+         * or e.g. if you want edlib to be case insensitive.
+         * Can be set to NULL if there are none.
+         */
+        const EdlibEqualityPair* additionalEqualities;
+
+        /**
+         * Number of additional equalities, which is a non-negative number.
+         * 0 if there are none.
+         */
+        int additionalEqualitiesLength;
+    } EdlibAlignConfig;
+
+    /**
+     * Helper method for easy construction of the configuration object.
+     * @return Configuration object filled with the given parameters.
+     */
+    EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
+        int k, EdlibAlignMode mode, EdlibAlignTask task,
+        const EdlibEqualityPair* additionalEqualities,
+        int additionalEqualitiesLength
+    );
+
+    /**
+     * @return Default configuration object, with the following defaults:
+     *         k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
+     */
+    EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
+
+
+    /**
+     * Container for results of alignment done by the edlibAlign() function.
+     */
+    typedef struct {
+        /**
+         * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
+         */
+        int status;
+
+        /**
+         * -1 if k is non-negative and the edit distance is larger than k.
+         */
+        int editDistance;
+
+        /**
+         * Array of zero-based positions in target where optimal alignment paths end.
+         * If the gap after the query is penalized, the gap counts as part of the query (NW), otherwise not.
+         * Set to NULL if the edit distance is larger than k.
+         * If you do not free the whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        int* endLocations;
+
+        /**
+         * Array of zero-based positions in target where optimal alignment paths start;
+         * they correspond to endLocations.
+         * If the gap before the query is penalized, the gap counts as part of the query (NW), otherwise not.
+         * Set to NULL if not calculated or if the edit distance is larger than k.
+         * If you do not free the whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        int* startLocations;
+
+        /**
+         * Number of end (and start) locations.
+         */
+        int numLocations;
+
+        /**
+         * Alignment is found for the first pair of start and end locations.
+         * Set to NULL if not calculated.
+         * Alignment is a sequence of numbers: 0, 1, 2, 3.
+         * 0 stands for match.
+         * 1 stands for insertion to target.
+         * 2 stands for insertion to query.
+         * 3 stands for mismatch.
+         * Alignment aligns the query to the target from the beginning of the query till the end of the query.
+         * If gaps are not penalized, they are not in the alignment.
+         * If you do not free the whole result object using edlibFreeAlignResult(), do not forget to use free().
+         */
+        unsigned char* alignment;
+
+        /**
+         * Length of alignment.
+         */
+        int alignmentLength;
+
+        /**
+         * Number of different characters in query and target together.
+         */
+        int alphabetLength;
+    } EdlibAlignResult;
+
+    /**
+     * Frees memory in EdlibAlignResult that was allocated by edlib.
+     * If you do not use it, make sure to free the needed members manually using free().
+     */
+    EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
+
+
+    /**
+     * Aligns two sequences (query and target) using edit distance (Levenshtein distance).
+     * Through the config parameter, this function supports different alignment methods (global, prefix, infix),
+     * as well as different modes of search (tasks).
+     * It always returns the edit distance and end locations of the optimal alignment in target.
+     * It optionally returns the start locations of the optimal alignment in target and the alignment path,
+     * if you choose the appropriate tasks.
+     * @param [in] query  First sequence.
+     * @param [in] queryLength  Number of characters in first sequence.
+     * @param [in] target  Second sequence.
+     * @param [in] targetLength  Number of characters in second sequence.
+     * @param [in] config  Additional alignment parameters, like alignment method and wanted results.
+     * @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
+     *         Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
+     */
+    EDLIB_API EdlibAlignResult edlibAlign(
+        const char* query, int queryLength,
+        const char* target, int targetLength,
+        const EdlibAlignConfig config
+    );
+
+
+    /**
+     * Builds a cigar string from the given alignment sequence.
+     * @param [in] alignment  Alignment sequence.
+     *     0 stands for match.
+     *     1 stands for insertion to target.
+     *     2 stands for insertion to query.
+     *     3 stands for mismatch.
+     * @param [in] alignmentLength
+     * @param [in] cigarFormat  Cigar will be returned in the specified format.
+     * @return Cigar string.
+     *     I stands for insertion.
+     *     D stands for deletion.
+     *     X stands for mismatch. (used only in extended format)
+     *     = stands for match. (used only in extended format)
+     *     M stands for (mis)match. (used only in standard format)
+     *     The string is null terminated.
+     *     The needed memory is allocated and the given pointer is set to it.
+     *     Do not forget to free it later using free()!
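+     *     For example, the alignment {0, 0, 1, 3} yields "2=1I1X" in extended format
+     *     and "2M1I1M" in standard format.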
+     */
+    EDLIB_API char* edlibAlignmentToCigar(
+        const unsigned char* alignment, int alignmentLength,
+        EdlibCigarFormat cigarFormat
+    );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EDLIB_H
\ No newline at end of file
diff --git a/dysgu/extra_metrics.pyx b/dysgu/extra_metrics.pyx
index 5e5a2c6..b5c44c6 100644
--- a/dysgu/extra_metrics.pyx
+++ b/dysgu/extra_metrics.pyx
@@ -262,7 +262,7 @@ cpdef filter_poorly_aligned_ends(spanning_alignments, float divergence=0.02):
     return spanning, 1 - (len(spanning) / len(spanning_alignments))
 
 
-cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_input, int end_input, int length_extend=15, float divergence=0.02):
+cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_input, int end_input, int length_extend=15, float divergence=0.02, float len_t=3):
     # expand indel using cigar, merges nearby gaps into the SV event
     cdef int pos, end, l, opp, extent_left, extent_right, candidate_type, candidate_len, len_input, i, dist, dist_thresh, middle, last_seen_size
     cdef uint32_t cigar_value
@@ -276,7 +276,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp
     candidate_type = cigar_value & BAM_CIGAR_MASK
     candidate_len = cigar_value >> BAM_CIGAR_SHIFT
     len_input = candidate_len
-    dist_thresh = min(len_input * 3, ( log2_32(1 + candidate_len) / divergence))
+    dist_thresh = min( (len_input * len_t), ( log2_32(1 + candidate_len) / divergence))
     i = cigarindex + 1
     dist = 0
     last_seen_size = candidate_len
@@ -308,7 +308,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp
             else:
                 candidate_len += l
                 extent_right = end
-            dist_thresh = min(l * 3, ( log2_32(1 + candidate_len) / divergence))
+            dist_thresh = min( (l * len_t), ( log2_32(1 + candidate_len) / divergence))
             dist = 0
         i += 1
 
@@ -342,7 +342,7 @@ cpdef gap_size_upper_bound(AlignedSegment alignment, int cigarindex, int pos_inp
             else:
                 candidate_len += l
                 extent_left = pos
-            dist_thresh = min(l * 3, ( log2_32(1 + candidate_len) / divergence) )
+            dist_thresh = min( (l * len_t), ( log2_32(1 + candidate_len) / divergence) )
             dist = 0
         i -= 1
 
diff --git a/dysgu/filter_normals.py b/dysgu/filter_normals.py
index 3bc9afe..7857808 100644
--- a/dysgu/filter_normals.py
+++ b/dysgu/filter_normals.py
@@ -1,5 +1,7 @@
 import random
 from sys import stderr, argv
+
+import numpy as np
 import pysam
 from pysam import CSOFT_CLIP, CHARD_CLIP, CDEL, CINS, CDIFF, CMATCH
 import time
@@ -7,13 +9,16 @@ import os
 from dysgu.map_set_utils import Py_BasicIntervalTree, is_overlapping
 from dysgu.graph import AlignmentsSA
+from dysgu.extra_metrics import gap_size_upper_bound
+from dysgu.call_component import n_aligned_bases
 import logging
 from collections import defaultdict
 from enum import Enum
 from dysgu.scikitbio._ssw_wrapper import StripedSmithWaterman
 import zlib
-import edlib
+from dysgu.edlib import edlib
 import glob
+from dysgu.map_set_utils import echo
 
 random.seed(42)
 
@@ -113,8 +118,11 @@ def load_samples(args, pths):
 def output_vcf(args, template_vcf):
     hdr = template_vcf.header
     input_command = ' '.join(argv)
-    hdr.add_line('##FILTER=')
+    hdr.add_line('##FILTER=')
+    hdr.add_line('##FILTER=')
     hdr.add_line('##FILTER=')
+    hdr.add_line('##FILTER=')
+    hdr.add_line('##FILTER=')
     hdr.add_line(f'##command="{input_command}"')
     return pysam.VariantFile("-" if not args['svs_out'] else args['svs_out'], "w", header=hdr)
 
@@ -181,7 +189,7 @@ def get_sv_type(r, chrom, chrom2):
 def make_interval_tree(args, infile, sample_name, normal_vcfs):
     if not normal_vcfs:
         return None
-    intervals = defaultdict(lambda: Py_BasicIntervalTree())
+    intervals = defaultdict(lambda: defaultdict(lambda: Py_BasicIntervalTree()))  # chrom : SVTYPE : interval
     ignored = defaultdict(int)
     added = 0
     for normal_vcf in normal_vcfs:
@@ -211,18 +219,20 @@ def make_interval_tree(args, infile, sample_name, normal_vcfs):
             continue
         start = r.start  # note pysam converts to zero-based index like bam
         stop = r.stop
+        svlen = r.info["SVLEN"] if "SVLEN" in r.info else 1000000
         if chrom == chrom2 and stop < start:
             start = r.stop - 1
             stop = r.start - 1
         if chrom == chrom2 and abs(stop - start) <= distance:
-            intervals[chrom].add(min(start, stop) - distance, max(start, stop) + distance, idx)
+            intervals[chrom][r.info["SVTYPE"]].add(min(start, stop) - distance, max(start, stop) + distance, svlen)
         else:
             posB = get_posB(r)
-            intervals[chrom].add(start - distance, start + distance, idx)
-            intervals[chrom2].add(posB - distance, posB + distance, idx)
+            intervals[chrom][r.info["SVTYPE"]].add(start - distance, start + distance, svlen)
+            intervals[chrom2][r.info["SVTYPE"]].add(posB - distance, posB + distance, svlen)
         added += 1
-    for tree in intervals.values():
-        tree.index()
+    for tree_v in intervals.values():
+        for tree in tree_v.values():
+            tree.index()
     if ignored:
         logging.warning(f'Ignored SV types: {ignored}')
     if added == 0:
@@ -392,38 +402,90 @@ def matching_supplementary(aln, infile, posA, posB):
     return False
 
 
-def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, span_threshold=0.1):
+def within_read_end_position(event_pos, svtype, cigartuples, cigar_index):
+    if svtype == 1:  # insertion (or other e.g. duplication/inversion within read)
+        return event_pos + 1
+    else:  # deletion type
+        end = event_pos + cigartuples[cigar_index][1]
+        return end
+
+
+def matching_gap(posA, posB, r, svtype, is_insertion, svlen, pos_threshold=100, span_threshold=0.1, paired_end=True, cipos95=0):
     ct = r.cigartuples
     pos = r.pos
     min_length = svlen * 0.2  # 0.25
     max_distance = svlen * 10
    ins_like = svtype == "DUP" or svtype == "INV"
+    debug = False
+    # debug = True
+    cigar_index = 0
+    # echo(span_threshold)
     for k, l in ct:
         if k == CHARD_CLIP or k == CSOFT_CLIP:
+            cigar_index += 1
             continue
         if k == CMATCH or k == CDIFF:
             pos += l
+            cigar_index += 1
             continue
         if is_insertion and k == CDEL:
             pos += l
+            cigar_index += 1
             continue
         elif not is_insertion and k == CINS:
-            if l > 250 and ins_like:
+            span_similarity = min(l, svlen) / max(l, svlen)
+            if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen):
+                if debug: echo("ret0", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l))
                 return True
             elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)):
+                if debug: echo("ret1")
                 return True
+
+            # if l > 100:
+            #     pos_original = pos
+            #     l, pos, _ = gap_size_upper_bound(r, cigar_index, pos, pos+1)
+            #     span_similarity = min(l, svlen) / max(l, svlen)
+            #     if l > 250 and ins_like and span_similarity > 0.85 and abs(posA - pos) < max(l, svlen):
+            #         if debug: echo("ret0b", svlen, f"{posA}-{posA + svlen}", f"{pos}-{pos + l}", (k, l))
+            #         return True
+            #     elif svlen * 2 < l and (abs(posA - pos) < max_distance or (abs(posB - pos + l) < max_distance)):
+            #         if debug: echo("ret1b")
+            #         return True
+            #     if pos_original is not None:
+            #         pos = pos_original
+            #         pos_original = None
+
+            cigar_index += 1
             continue
+
         end = pos + l
         if l > min_length and abs(posA - pos) < max_distance:
             if is_insertion:
-                if span_position_distance(posA, posA + svlen, pos, end, pos_threshold, span_threshold):
span_threshold): + if min(svlen, l) / max(svlen, l) > span_threshold: + if debug: echo("ret2", f"{posA}-{posA + svlen}", f"{pos}-{end}") return True elif span_position_distance(posA, posB, pos, end, pos_threshold, span_threshold): + if debug: echo("ret3") return True + + if not paired_end and l > 20 and ((k == CINS and is_insertion) or (k == CDEL and svtype == "DEL")): + this_l, this_pos, this_end = gap_size_upper_bound(r, cigar_index, pos, pos + 1) + if this_l > min_length and abs(posA - this_pos) < max_distance: + if is_insertion: + if min(svlen, this_l) / max(svlen, this_l) > span_threshold: + if debug: echo("ret2b", f"{posA}-{posA + svlen}", f"{this_pos}-{this_end}") + return True + elif cipos95 and min(svlen + cipos95, this_l) / max(svlen + cipos95, this_l) > span_threshold: + return True + elif span_position_distance(posA, posB, this_pos, this_end, pos_threshold, span_threshold): + if debug: echo("ret3b") + return True + if k == CDEL: pos += l if pos > posB + svlen: break + cigar_index += 1 return False @@ -533,7 +595,7 @@ def cache_nearby_soft_clip(posA, posB, align, join_type, svtype, cached, distanc def any_nearby_soft_clip(posA, posB, align, join_type, svtype, distance=30, clip_length=3): ct = align.cigartuples - if svtype == "BND" or svtype == "INV" or svtype == "DUP": # respect join orientation + if svtype == "BND" or svtype == "DUP": # respect join orientation if join_type[0] == "3": if clip_position_matches(ct[-1], clip_length, posA, align.reference_end, distance): return True @@ -549,7 +611,7 @@ def any_nearby_soft_clip(posA, posB, align, join_type, svtype, distance=30, clip return True if clip_position_matches(ct[0], clip_length, posB, align.pos, distance): return True - elif svtype == "INS" or svtype == "TRA": # check all clips + elif svtype == "INS" or svtype == "TRA" or svtype == "INV": # check all clips if clip_position_matches(ct[-1], clip_length, posA, align.reference_end, distance): return True if clip_position_matches(ct[-1], clip_length, posB, align.reference_end, distance): @@ -567,13 +629,16 @@ def has_clip(r): return False -def clip_align_matches(seq1, seq2, clip_side, paired_end): +def clip_align_matches(seq1, seq2, clip_side, paired_end, is_alt_seq=False): if not seq1 or not seq2: return False - min_seq = min(len(seq1), len(seq2)) + if not is_alt_seq: + min_seq = min(len(seq1), len(seq2)) + else: + min_seq = len(seq2) if min_seq < 8: - if min_seq == len(seq2): - return False + # if min_seq == len(seq2): + # return False if clip_side == SeqType.RIGHT_CLIP: seq2_clipped = seq2[int(len(seq2) / 2) + (1 if len(seq2) % 2 == 0 else 0):] if seq2_clipped.startswith(seq1): @@ -588,13 +653,15 @@ def clip_align_matches(seq1, seq2, clip_side, paired_end): a = aln(seq2) score = a.optimal_alignment_score pct = score / (min_seq * 2) + # echo(pct, "1") if pct > 0.7: return True else: el = edlib.align(seq1, seq2, mode="HW", task="locations") if el and el['locations']: - pct = el['locations'][0][1] - el['locations'][0][0] / min_seq - if pct > 0.7: + pct = 1 - (el['editDistance'] / min_seq) + # echo(pct, "2", min(len(seq1), len(seq2)), el['editDistance']) + if pct > 0.7 and (el['locations'][0][1] - el['locations'][0][0]) / min_seq > 0.7: return True if paired_end: seq1_e = seq1.encode("ascii") @@ -611,36 +678,41 @@ def matching_soft_clips(r, reads_with_nearby_soft_clips, pe): if not break_seqs.any_seqs: return False for clip_side, align, target_side in reads_with_nearby_soft_clips: - checked = False ct = align.cigartuples + if break_seqs.alt_seq: + if 
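The rewritten identity test above derives percent identity from the edit distance and additionally requires the located alignment span to cover most of the shorter sequence. A sketch that exercises the same edlib call (clip_matches is a hypothetical helper; the 0.7 cutoffs mirror the patch, and min_seq is simplified to the shorter length):

    import edlib  # pip install edlib

    def clip_matches(seq1, seq2, min_identity=0.7):
        # Identity from edit distance, plus a requirement that the aligned
        # span covers most of the shorter sequence (sketch of the new check)
        min_seq = min(len(seq1), len(seq2))
        el = edlib.align(seq1, seq2, mode="HW", task="locations")  # infix alignment
        if not el or not el["locations"]:
            return False
        pct = 1 - (el["editDistance"] / min_seq)
        loc_start, loc_end = el["locations"][0]
        return pct > min_identity and (loc_end - loc_start) / min_seq > min_identity

    print(clip_matches("ACGTACGTAC", "TTACGTACGTACTT"))  # True: exact infix match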
clip_align_matches(get_left_clip(align), break_seqs.alt_seq, SeqType.LEFT_CLIP, pe, True): + return True + if clip_align_matches(get_right_clip(align), break_seqs.alt_seq, SeqType.RIGHT_CLIP, pe, True): + return True if target_side == "A": if clip_side == SeqType.LEFT_CLIP and break_seqs.left_clip_A: - checked = True if clip_align_matches(get_left_clip(align), break_seqs.left_clip_A, SeqType.LEFT_CLIP, pe) or (pe and ct[0][0] == CHARD_CLIP): return True elif clip_side == SeqType.RIGHT_CLIP and break_seqs.right_clip_A: - checked = True if clip_align_matches(get_right_clip(align), break_seqs.right_clip_A, SeqType.RIGHT_CLIP, pe) or (pe and ct[-1][0] == CHARD_CLIP): return True else: if clip_side == SeqType.LEFT_CLIP and break_seqs.left_clip_B: - checked = True if clip_align_matches(get_left_clip(align), break_seqs.left_clip_B, SeqType.LEFT_CLIP, pe) or (pe and ct[0][0] == CHARD_CLIP): return True elif clip_side == SeqType.RIGHT_CLIP and break_seqs.right_clip_B: - checked = True if clip_align_matches(get_right_clip(align), break_seqs.right_clip_B, SeqType.RIGHT_CLIP, pe) or (pe and ct[-1][0] == CHARD_CLIP): return True - if not checked and break_seqs.alt_seq: - if clip_align_matches(get_left_clip(align), break_seqs.alt_seq, SeqType.LEFT_CLIP, pe): - return True - if clip_align_matches(get_right_clip(align), break_seqs.alt_seq, SeqType.RIGHT_CLIP, pe): - return True + return False def has_low_support(r, sample, support_fraction): - cov = r.samples[sample]['COV'] + d = r.samples[sample] + if "SVLEN" in r.info and "RPOLY" in r.info and r.info['RPOLY'] > r.info["SVLEN"]: + support_fraction *= 2 + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + if cov == 0: + return False + else: + cov = d['COV'] * ((min(d['ICN'], d['OCN']) / m)) min_support = round(1.5 + support_fraction * cov) if r.info['SU'] < min_support: return True @@ -649,7 +721,14 @@ def has_low_support(r, sample, support_fraction): def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): sf = support_fraction / 2 - cov = r.samples[sample]['COV'] + d = r.samples[sample] + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + if cov == 0: + return False + else: + cov = d['COV'] * ((min(d['ICN'], d['OCN']) / m)) if n_mapq0 / n_overlapping > 0.3: cov = max(cov, n_overlapping) min_support = min(4, round(1.5 + sf * cov)) @@ -658,17 +737,40 @@ def has_low_WR_support(r, sample, support_fraction, n_overlapping, n_mapq0): return False +def has_low_covering_support(covering_reads, n_mapq0): + if covering_reads - n_mapq0 <= 2: + return True + + def too_many_clipped_reads(r, clipped_reads, support_fraction): sf = support_fraction / 2 - cov = r.samples[r.samples.keys()[0]]['COV'] - max_nearby_clipped_reads = round(3 + sf * cov) - if clipped_reads > max_nearby_clipped_reads: + d = r.samples[r.samples.keys()[0]] + # cov = r.samples[r.samples.keys()[0]]['COV'] + m = max(d['ICN'], d['OCN']) + if m == 0: + cov = d['COV'] + if cov == 0: + return False + else: + cov = d['COV'] * ((min(d['ICN'], d['OCN']) / m)) + # max_nearby_clipped_reads = round(3 + sf * cov) + max_nearby_clipped_reads = round(1 + sf * cov) + if clipped_reads >= max_nearby_clipped_reads: return True return False -def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, keep_all, sample, - support_fraction): +def poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, fmt, max_divergence=0.08): + # Normal-reads error + if sum_edit_distance/n_aligns > max_divergence or sum_n_small_gap/n_aligns > max_divergence / 
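has_low_support now scales coverage by the inner/outer copy-number ratio before deriving the minimum support count, so a confident copy-number change does not inflate the expected read depth. Roughly, assuming the COV/ICN/OCN FORMAT fields shown above (the support_fraction value is illustrative):

    def min_support_threshold(cov, icn, ocn, support_fraction=0.05):
        # Sketch of the patched logic: scale coverage by the smaller/larger
        # copy-number ratio, then derive the support cutoff from it
        m = max(icn, ocn)
        if m > 0:
            cov = cov * (min(icn, ocn) / m)
        return round(1.5 + support_fraction * cov)

    # An SU value below this threshold marks the call as lowSupport
    print(min_support_threshold(cov=30, icn=2.0, ocn=4.0))  # 2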
2: + return True + # Variant error rate > 4 times the normal-reads error rate + NMbase = (NMbase / n_aligns) * 100 + if "NMB" in fmt and fmt["NMB"] > 4 * NMbase: + return True + +def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, sample, + support_fraction, max_divergence): ct = r.info["CT"] chromA = r.chrom posA = r.pos @@ -681,6 +783,14 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, has_contig = 'CONTIG' in r.info or 'CONTIG2' in r.info or r.alts[0][0] != "<" is_paired_end = False good_step = good_step_translocation(r, sample) + sum_edit_distance = 0 + sum_n_small_gap = 0 + n_aligns = 0 + NMbase = 0 + # debug = True + + debug = False + # debug = True for is_paired_end, aln in iterate_bams_single_region(bams, chromA, posA, pad, bam_is_paired_end): if is_paired_end: distance = 50 @@ -688,36 +798,38 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if aln.flag & 12 or (aln.flag & 2 and not has_clip(aln)): continue if chromB == infile.get_reference_name(aln.rnext) and is_overlapping(next_start, next_end, aln.pnext - pad, aln.pnext + pad): - if keep_all: - r.filter.add("normal") - return False + return "normal" if matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) else: + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases distance = 500 clip_length = 50 if aln.flag & 4 or not has_clip(aln): continue if matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("tra0") + return "normal" if not good_step and matching_ins_translocation(posA, aln): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("tra1") + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) if any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): - if keep_all: - r.filter.add("lowSupport") - return False + if debug: echo("tra2") + return "lowSupport" for is_paired_end, aln in iterate_bams_single_region(bams, chromB, posB, pad, bam_is_paired_end): if is_paired_end: @@ -726,51 +838,55 @@ def process_translocation(r, chromB, posB, bams, infile, bam_is_paired_end, pad, if aln.flag & 12 or (aln.flag & 2 and not has_clip(aln)): continue if r.chrom == infile.get_reference_name(aln.rnext) and is_overlapping(current_start, current_end, aln.pnext - pad, aln.pnext + pad): - if keep_all: - r.filter.add("normal") - return False + return "normal" if matching_supplementary(aln, infile, posB, posA): - if keep_all: - r.filter.add("normal") - return False + return "normal" if has_contig: cache_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", cached, distance, clip_length) else: + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases distance = 500 clip_length = 50 if aln.flag & 4 or 
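poorly_aligned_region applies two cutoffs: the mean per-base divergence of the normal reads, and the variant's NMB rate measured against four times that background. A standalone sketch using the patch's field names:

    def poorly_aligned(sum_edit_distance, sum_n_small_gap, nm_base, n_aligns,
                       fmt, max_divergence=0.08):
        # Sketch of the highDivergence test: flag regions where normal reads
        # are noisy, or where variant NMB exceeds 4x the normal background
        if n_aligns == 0:
            return False
        if sum_edit_distance / n_aligns > max_divergence:
            return True
        if sum_n_small_gap / n_aligns > max_divergence / 2:
            return True
        background = (nm_base / n_aligns) * 100
        return "NMB" in fmt and fmt["NMB"] > 4 * background

    print(poorly_aligned(0.5, 0.1, 0.02, 10, {"NMB": 1.2}))  # True: NMB >> background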
not has_clip(aln): continue if matching_supplementary(aln, infile, posB, posA): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("tra3") + return "normal" if not good_step and matching_ins_translocation(posB, aln): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("tra4") + return "normal" if has_contig: cache_nearby_soft_clip(posA, posB, aln, ct, "TRA", cached, distance, clip_length) if any_nearby_soft_clip(r.pos, posB, aln, ct, "TRA", 30, clip_length=50): nearby_soft_clips += 1 elif any_nearby_soft_clip(posA, posB, aln, ct, "TRA", 30, clip_length=250): - if keep_all: - r.filter.add("lowSupport") - return False + if debug: echo("tra5") + return "lowSupport" if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): - if keep_all: - r.filter.add("lowSupport") - return False + if debug: echo("tra6") + return "lowSupport" if cached and matching_soft_clips(r, cached, is_paired_end): - if keep_all: - r.filter.add("normal") - return False - return True + if debug: echo("tra7") + return "normal" + + if not is_paired_end and poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, r.samples[sample], max_divergence): + return "highDivergence" + return None -def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad, sample, keep_all): +def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad, sample, max_divergence): svlen = r.info["SVLEN"] svtype = r.info["SVTYPE"] + any_contigs = ("CONTIGA" in r.info and r.info["CONTIGA"]) or ("CONTIGB" in r.info and r.info["CONTIGB"]) is_insertion = svtype == "INS" posA = r.pos ct = r.info["CT"] @@ -780,7 +896,24 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa is_paired_end = False n_mapq_0 = 0 n_overlpapping = 0 + n_aligns = 0 + sum_edit_distance = 0 + sum_n_small_gap = 0 + NMbase = 0 + + debug = False + # debug = True for is_paired_end, aln in iterate_bams(bams, r.chrom, posA, r.chrom, posB, pad, bam_is_paired_end): + n_aligns += 1 + a_bases, large_gaps, n_small_gaps = n_aligned_bases(aln) + if a_bases > 0: + sum_n_small_gap += n_small_gaps / a_bases + if aln.has_tag("NM"): + if a_bases: + nm = float(aln.get_tag("NM")) + sum_edit_distance += nm / a_bases + NMbase += (nm - large_gaps) / a_bases + if is_paired_end: a_posA = min(aln.pos, aln.pnext) a_posB = max(aln.pos, aln.pnext) @@ -788,37 +921,32 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa continue if aln.rname != aln.rnext: if matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("1") + return "normal" cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached) continue if abs(a_posA - posA) > pad or abs(a_posB - posB) > pad: continue if not is_insertion and not aln.flag & 10: # proper pair, mate unmapped - if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.5): - if keep_all: - r.filter.add("normal") - return False + if span_position_distance(posA, posB, a_posA, a_posB, pos_threshold=20, span_threshold=0.75): + if debug: echo("2") + return "normal" if not is_insertion and matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False - if svlen < 80 and matching_gap(posA, posB, aln, svtype, is_insertion, svlen): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("3") + return "normal" + if svlen < 80 and matching_gap(posA, posB, 
aln, svtype, is_insertion, svlen, span_threshold=0.75): + if debug: echo("4") + return "normal" cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=50, clip_length=3) else: - if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.7): - if keep_all: - r.filter.add("normal") - return False + cipos95 = r.info["CIPOS95"] if "CIPOS95" in r.info else 0 + if matching_gap(posA, posB, aln, svtype, is_insertion, svlen, pos_threshold=30, span_threshold=0.75, paired_end=False, cipos95=cipos95): + if debug: echo("5", svlen, aln.qname) + return "normal" if matching_supplementary(aln, infile, posA, posB): - if keep_all: - r.filter.add("normal") - return False + if debug: echo("6") + return "normal" if is_overlapping(posA, posA + 1, aln.pos, aln.reference_end): n_overlpapping += 1 if aln.mapq == 0: @@ -830,23 +958,22 @@ def process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pa nearby_soft_clips += 1 cache_nearby_soft_clip(posA, posB, aln, ct, svtype, cached, distance=min(500, svlen * 0.5), clip_length=50) - if not is_paired_end and not covered: - if keep_all: - r.filter.add("lowSupport") - return False + if not is_paired_end and not covered or has_low_covering_support(covered, n_mapq_0): + if debug: echo("7") + return "lowSupport" if not is_paired_end and has_low_WR_support(r, sample, support_fraction, n_overlpapping, n_mapq_0): - if keep_all: - r.filter.add("lowSupport") - return False - if not is_paired_end and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): - if keep_all: - r.filter.add("lowSupport") - return False + if debug: echo("8") + return "lowSupport" + if (not is_paired_end or (svtype in "INVTRA" and not any_contigs)) and too_many_clipped_reads(r, nearby_soft_clips, support_fraction): + if debug: echo("9") + return "lowSupport" if cached and matching_soft_clips(r, cached, is_paired_end): - if keep_all: - r.filter.add("normal") - return False - return True + if debug: echo("10") + return "normal" + if not is_paired_end and poorly_aligned_region(sum_edit_distance, sum_n_small_gap, NMbase, n_aligns, r.samples[sample], max_divergence): + return "highDivergence" + + return None def update_filter_value(r, sample_name, old_filter_values, pass_prob, new_value=""): @@ -863,6 +990,28 @@ def update_filter_value(r, sample_name, old_filter_values, pass_prob, new_value= r.filter.add(v) +def check_for_interval_overlap(intervals, r, chrom_tid, chrom2_tid, posB, filter_results): + if not intervals: + return False + + svtype = r.info["SVTYPE"] + if chrom_tid in intervals and svtype in intervals[chrom_tid]: + posA_overlaps = set(intervals[chrom_tid][svtype].allOverlappingIntervals(r.pos, r.pos + 1)) + if chrom2_tid in intervals and svtype in intervals[chrom2_tid]: + posB_overlaps = set(intervals[chrom2_tid][svtype].allOverlappingIntervals(posB, posB + 1)) + matching_svlens = posA_overlaps.intersection(posB_overlaps) + if matching_svlens: + for other_svlen in matching_svlens: + if "SVLEN" in r.info: + if r.info["SVLEN"] > 500 or r.info["SVLEN"] < 100: + t = 0.8 + else: + t = 0.5 + if min(other_svlen, r.info["SVLEN"]) / max(other_svlen, r.info["SVLEN"]) > t: + filter_results['normal'] += 1 + return True + + def run_filtering(args): t0 = time.time() pths = get_bam_paths(args) @@ -875,23 +1024,37 @@ def run_filtering(args): pad = args["interval_size"] keep_all = args["keep_all"] min_prob = args['min_prob'] + min_mapq = args['min_mapq'] pass_prob = args['pass_prob'] support_fraction = args['support_fraction'] + 
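check_for_interval_overlap, defined above, only accepts a panel-of-normals hit when both breakpoints overlap and the SVLENs agree, with a looser 0.5 ratio for mid-sized events. The size test in isolation (svlen_matches is a hypothetical name):

    def svlen_matches(svlen, other_svlen):
        # Sketch of the reciprocal size test against a panel SV: mid-range
        # events (100-500 bp) use a looser 0.5 cutoff, all others 0.8
        t = 0.8 if (svlen > 500 or svlen < 100) else 0.5
        return min(svlen, other_svlen) / max(svlen, other_svlen) > t

    print(svlen_matches(300, 200))   # True: 200/300 = 0.67 > 0.5
    print(svlen_matches(1000, 700))  # False: 0.7 < 0.8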
max_divergence = args['max_divergence'] filter_results = defaultdict(int) written = 0 + samp = list(vcf.header.samples)[0] + for idx, r in enumerate(vcf): - # if r.id != "25979": + # if r.id != "207736": # continue + # echo(dict(r.info)) + # echo(dict(r.samples[sample_name])) old_filter_value = list(r.filter.keys()) r.filter.clear() + mq_pri = r.samples[samp]["MAPQP"] + mq_sup = r.samples[samp]["MAPQS"] if "MAPQS" in r.samples[samp] else 0 + if mq_pri < min_mapq and mq_sup < min_mapq: + filter_results['lowMapQ'] += 1 + if keep_all: + update_filter_value(r, old_filter_value, pass_prob, new_value="lowMapQ") + out_vcf.write(r) + continue if min_prob != 0 and 'PROB' in r.samples[sample_name] and r.samples[sample_name]['PROB'] < min_prob: - filter_results['dropped, lowProb'] += 1 + filter_results['lowProb'] += 1 if keep_all: update_filter_value(r, old_filter_value, pass_prob, new_value="lowProb") out_vcf.write(r) continue if has_low_support(r, sample_name, support_fraction): - filter_results['dropped, low support'] += 1 + filter_results['lowSupport'] += 1 if keep_all: update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="lowSupport") out_vcf.write(r) @@ -900,17 +1063,13 @@ def run_filtering(args): posB = get_posB(r) if chrom_tid == -1: logging.exception(f'Chromosome name {r.chrom} not found in bam file header') - if intervals: - if chrom_tid in intervals: - posA_overlaps = set(intervals[chrom_tid].allOverlappingIntervals(r.pos, r.pos+1)) - if chrom2_tid in intervals: - posB_overlaps = set(intervals[chrom2_tid].allOverlappingIntervals(posB, posB + 1)) - if posA_overlaps.intersection(posB_overlaps): - filter_results['dropped, normal SV overlap'] += 1 - if keep_all: - update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") - out_vcf.write(r) - continue + + if check_for_interval_overlap(intervals, r, chrom_tid, chrom2_tid, posB, filter_results): + if keep_all: + update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normalOverlap") + out_vcf.write(r) + continue + if not bams: update_filter_value(r, sample_name, old_filter_value, pass_prob) out_vcf.write(r) @@ -918,18 +1077,19 @@ def run_filtering(args): continue sv_type = get_sv_type(r, chrom_tid, chrom2_tid) if sv_type == "TRA" or sv_type == "BND": - good = process_translocation(r, r.info["CHR2"], posB, bams, infile, bam_is_paired_end, pad=pad, keep_all=keep_all, sample=sample_name, support_fraction=support_fraction) + reason = process_translocation(r, r.info["CHR2"], posB, bams, infile, bam_is_paired_end, pad=pad, sample=sample_name, support_fraction=support_fraction, max_divergence=max_divergence) else: - good = process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad=pad, sample=sample_name, keep_all=keep_all) - if good: + reason = process_intra(r, posB, bams, infile, bam_is_paired_end, support_fraction, pad=pad, sample=sample_name, max_divergence=max_divergence) + if reason is None: update_filter_value(r, sample_name, old_filter_value, pass_prob) out_vcf.write(r) written += 1 else: if keep_all: - update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value="normal") + r.filter.add(reason) + update_filter_value(r, sample_name, old_filter_value, pass_prob, new_value=reason) out_vcf.write(r) - filter_results['dropped, normal read support'] += 1 + filter_results[f'{reason}'] += 1 out_vcf.close() logging.info(f'Filter results: {dict(sorted(filter_results.items()))}') logging.info("dysgu filter {} complete, n={}, h:m:s, 
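The new lowMapQ gate drops a call only when both the primary and supplementary mean mapping qualities fall below --min-mapq. In isolation (low_mapq is an illustrative helper over the MAPQP/MAPQS FORMAT fields):

    def low_mapq(fmt, min_mapq=10):
        # Sketch of the lowMapQ gate: a decent supplementary MAPQ can
        # rescue a call whose primary MAPQ is poor, and vice versa
        mq_pri = fmt["MAPQP"]
        mq_sup = fmt.get("MAPQS", 0)
        return mq_pri < min_mapq and mq_sup < min_mapq

    print(low_mapq({"MAPQP": 5, "MAPQS": 40}))  # False: supplementary MAPQ rescues it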
{}".format(args['input_vcf'], diff --git a/dysgu/find_reads.hpp b/dysgu/find_reads.hpp index 19e4aa7..7138af1 100644 --- a/dysgu/find_reads.hpp +++ b/dysgu/find_reads.hpp @@ -14,6 +14,22 @@ #include +void remove_extraneous_tags(bam1_t* src, int remove_extra_tags) { + // needed NM, AS, SA, XS, but remove some of the large offenders to save a bit of disk space + int i=0; + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(src, t); + if (data != nullptr) { + bam_aux_del(src, data); + } + i += 1; + if (i >= remove_extra_tags) { + break; + } + } +} + + class CoverageTrack { public: @@ -107,7 +123,7 @@ int process_alignment(int& current_tid, std::deque> robin_hood::unordered_set& read_names, CoverageTrack& cov_track, uint64_t& total, const int n_chromosomes, const int mapq_thresh, const int check_clips, const int min_within_size, sam_hdr_t **samHdr, htsFile **f_out, - std::string& temp_folder, const bool write_all, long *n_aligned_bases) { + std::string& temp_folder, const bool write_all, long *n_aligned_bases, int remove_extra_tags) { int result; bam1_t* aln; @@ -128,6 +144,9 @@ int process_alignment(int& current_tid, std::deque> // Check if write queue is full if (write_queue.size() > max_write_queue) { for (const auto& val: write_queue) { + if (remove_extra_tags) { + remove_extraneous_tags(val, remove_extra_tags); + } result = sam_write1(*f_out, *samHdr, val); if (result < 0) { return -1; } total += 1; @@ -317,7 +336,7 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, std::string region_string = region; long n_aligned_bases = 0; - + int remove_extra_tags = 0; // iterate through regions if (region_string != ".,") { @@ -342,12 +361,21 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, // Read alignment into the back of scope queue while ( sam_itr_next(fp_in, iter, scope.back().second) >= 0 ) { - + if (total == 0) { + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(scope.back().second, t); + if (data != nullptr) { + remove_extra_tags += 1; + } else { + break; + } + } + } int success = process_alignment(current_tid, scope, write_queue, max_scope, max_write_queue, clip_length, read_names, cov_track, total, n_chromosomes, mapq_thresh, check_clips, min_within_size, - &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases); + &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases, remove_extra_tags); if (success < 0) { std::cerr << "Failed to process input alignment. Stopping" << region << std::endl; break; @@ -357,12 +385,21 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, } else { // iterate whole alignment file (no index needed) while (sam_read1(fp_in, samHdr, scope.back().second) >= 0) { - + if (total == 0) { + for (auto &t : {"ML", "MM", "MD"}) { + uint8_t* data = bam_aux_get(scope.back().second, t); + if (data != nullptr) { + remove_extra_tags += 1; + } else { + break; + } + } + } int success = process_alignment(current_tid, scope, write_queue, max_scope, max_write_queue, clip_length, read_names, cov_track, total, n_chromosomes, mapq_thresh, check_clips, min_within_size, - &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases); + &samHdr, &f_out, temp_folder, write_all, &n_aligned_bases, remove_extra_tags); if (success < 0) { std::cerr << "Failed to process input alignment. 
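remove_extraneous_tags strips the bulky ML/MM/MD aux tags before reads are written, keeping NM/AS/SA/XS for downstream calling. A rough pysam equivalent for reference (the patch does this natively with bam_aux_del for speed; in pysam, set_tag with value None deletes a tag):

    import pysam  # illustrative equivalent of remove_extraneous_tags()

    def strip_large_tags(aln: pysam.AlignedSegment, tags=("ML", "MM", "MD")):
        # Remove bulky per-base tags to save disk space; NM/AS/SA/XS are
        # kept because downstream calling needs them
        for t in tags:
            if aln.has_tag(t):
                aln.set_tag(t, None)  # value None deletes the tag
        return aln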
Stopping" << region << std::endl; break; @@ -384,6 +421,9 @@ int search_hts_alignments(char* infile, char* outfile, uint32_t min_within_size, } for (const auto& val: write_queue) { + if (remove_extra_tags) { + remove_extraneous_tags(val, remove_extra_tags); + } result = sam_write1(f_out, samHdr, val); if (result < 0) { return -1; } total += 1; diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx index 9a0f720..26d4e1b 100644 --- a/dysgu/graph.pyx +++ b/dysgu/graph.pyx @@ -7,7 +7,6 @@ import sortedcontainers import cython from cpython cimport array import array -import re import logging from dysgu.map_set_utils cimport unordered_map as robin_map, Py_SimpleGraph from dysgu.map_set_utils cimport multimap as cpp_map @@ -23,7 +22,7 @@ from libcpp.deque cimport deque as cpp_deque from libcpp.vector cimport vector from libcpp.pair cimport pair as cpp_pair from libcpp.unordered_map cimport unordered_map -from libc.stdint cimport uint16_t, uint32_t, int32_t, uint64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t, uint64_t from libc.stdlib cimport abs as c_abs from cython.operator import dereference, postincrement, postdecrement, preincrement, predecrement from pysam.libcalignedsegment cimport AlignedSegment @@ -280,20 +279,21 @@ cdef LocalVal make_local_val(int chrom2, int pos2, int node_name, ReadEnum_t rea cdef class PairedEndScoper: - cdef int clst_dist - cdef int max_dist - cdef int local_chrom + cdef int clst_dist, max_dist, local_chrom, max_search_depth cdef cpp_map[int, LocalVal] loci # Track the local breaks and mapping locations cdef vector[cpp_map[int, LocalVal]] chrom_scope # Track the mate-pair breaks and locations cdef float norm - cdef float thresh + cdef float thresh # spd + cdef float position_distance_thresh cdef bint paired_end - def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end): + def __init__(self, max_dist, clst_dist, n_references, norm, thresh, paired_end, position_distance_thresh, max_search_depth): self.clst_dist = clst_dist self.max_dist = max_dist + self.max_search_depth = max_search_depth self.local_chrom = -1 self.norm = norm self.thresh = thresh + self.position_distance_thresh = position_distance_thresh self.paired_end = paired_end cdef cpp_map[int, LocalVal] scope for n in range(n_references + 1): # Add one for special 'insertion chromosome' @@ -347,28 +347,26 @@ cdef class PairedEndScoper: local_it = forward_scope.lower_bound(pos2) steps = 0 if local_it != forward_scope.end(): - while steps < 20: #6: + while local_it != forward_scope.end() and steps < self.max_search_depth: vitem = dereference(local_it) + preincrement(local_it) + steps += 1 + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - steps += 1 continue node_name2 = vitem.second.node_name if node_name2 != node_name: # Can happen due to within-read events - # if node_name == 77: - # echo("-->", node_name, node_name2, current_chrom == chrom2, current_pos, pos2, pos2 - current_pos, vitem.second.pos2 - vitem.first, - # is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2), - # ) - # echo(read_enum, vitem.second.read_enum, read_enum == INSERTION, vitem.second.read_enum == DELETION) if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + break sep2 = c_abs(vitem.second.pos2 - current_pos) - if sep < self.max_dist and sep2 < self.max_dist: - if sep < 35: + 
if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150:#35: if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: max_span = max(length_from_cigar, vitem.second.length_from_cigar) span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - # echo("hi", node_name2, (length_from_cigar, vitem.second.length_from_cigar), span_distance, self.thresh) - if span_distance < 0.8: + if span_distance < self.position_distance_thresh: found_exact.push_back(node_name2) else: found_exact.push_back(node_name2) @@ -379,54 +377,43 @@ cdef class PairedEndScoper: elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) - if sep >= self.max_dist: - break - preincrement(local_it) - steps += 1 - if local_it == forward_scope.end(): - break + if not found_exact.empty(): + return found_exact - if found_exact.empty(): - local_it = forward_scope.lower_bound(pos2) - vitem = dereference(local_it) - if local_it != forward_scope.begin(): - predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice - steps = 0 - while steps < 20: # 6: - vitem = dereference(local_it) - if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): - steps += 1 - continue - node_name2 = vitem.second.node_name - if node_name2 != node_name: - # if node_name == 1: - # echo("2", node_name, node_name2, read_enum) #, is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2), span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar)) - # if node_name == 77: - # echo('r', current_pos, pos2, vitem.first, vitem.second.pos2, span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len)) - if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, vitem.second.pos2): - sep = c_abs(vitem.first - pos2) - sep2 = c_abs(vitem.second.pos2 - current_pos) - if sep < self.max_dist and vitem.second.chrom2 == chrom2 and \ - sep2 < self.max_dist: - if sep < 35: - if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: - max_span = max(length_from_cigar, vitem.second.length_from_cigar) - span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span - # echo("hi2", node_name2, (length_from_cigar, vitem.second.length_from_cigar), span_distance, self.thresh) - if span_distance < 0.8: - found_exact.push_back(node_name2) - else: + local_it = forward_scope.lower_bound(pos2) + vitem = dereference(local_it) + if local_it != forward_scope.begin(): + predecrement(local_it) # Move back one before staring search, otherwise same value is processed twice + steps = 0 + while local_it != forward_scope.begin() and steps < self.max_search_depth: + vitem = dereference(local_it) + predecrement(local_it) + steps += 1 + if (read_enum == DELETION and vitem.second.read_enum == INSERTION) or (read_enum == INSERTION and vitem.second.read_enum == DELETION): + continue + node_name2 = vitem.second.node_name + if node_name2 != node_name: + if current_chrom != chrom2 or is_reciprocal_overlapping(current_pos, pos2, vitem.first, 
vitem.second.pos2): + sep = c_abs(vitem.first - pos2) + if sep >= self.max_dist: + break + sep2 = c_abs(vitem.second.pos2 - current_pos) + if vitem.second.chrom2 == chrom2 and sep2 < self.max_dist: + if sep < 150:#35: + if length_from_cigar > 0 and vitem.second.length_from_cigar > 0: + max_span = max(length_from_cigar, vitem.second.length_from_cigar) + span_distance = c_abs(length_from_cigar - vitem.second.length_from_cigar) / max_span + if span_distance < self.position_distance_thresh: found_exact.push_back(node_name2) - elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): - found2.push_back(node_name2) + else: + found_exact.push_back(node_name2) elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): found2.push_back(node_name2) - if local_it == forward_scope.begin() or sep >= self.max_dist: - break - predecrement(local_it) - steps += 1 + elif span_position_distance(current_pos, pos2, vitem.first, vitem.second.pos2, self.norm, self.thresh, read_enum, self.paired_end, length_from_cigar, vitem.second.length_from_cigar, trust_ins_len): + found2.push_back(node_name2) + if not found_exact.empty(): return found_exact else: @@ -458,72 +445,94 @@ cdef class PairedEndScoper: forward_scope.insert(pp) +cdef struct TemplateNode: + int query_start + int node + int flag + + cdef class TemplateEdges: - cdef public unordered_map[string, vector[int]] templates_s + cdef public unordered_map[string, vector[TemplateNode]] templates_s # query_start, node, flag def __init__(self): pass cdef void add(self, str template_name, int flag, int node, int query_start): - cdef vector[int] val cdef bytes key = bytes(template_name, encoding="utf8") - val.push_back(query_start) - val.push_back(node) - val.push_back(flag) - self.templates_s[key].insert(self.templates_s[key].end(), val.begin(), val.end()) + self.templates_s[key].resize(self.templates_s[key].size() + 1) + cdef TemplateNode *t = &self.templates_s[key].back() + t.query_start = query_start + t.flag = t.flag + t.node = node cdef void add_template_edges(Py_SimpleGraph G, TemplateEdges template_edges): # this function joins up template reads (read 1, read 2, plus any supplementary) - cdef int ii, u_start, v_start, u, v, uflag, vflag - cdef unordered_map[string, vector[int]].iterator it = template_edges.templates_s.begin() - cdef vector[int] arr + cdef int i, j, u_start, v_start, u, v, uflag, vflag, primary1, primary2, n1, n2 + cdef unordered_map[string, vector[TemplateNode]].iterator it = template_edges.templates_s.begin() + cdef vector[TemplateNode] arr + cdef TemplateNode *t while it != template_edges.templates_s.end(): - arr = dereference(it).second # Array values are query start, node-name, flag + arr = dereference(it).second postincrement(it) read1_aligns = [] read2_aligns = [] - for ii in range(0, arr.size(), 3): - if arr[ii + 2] & 64: # first in pair - read1_aligns.append(arr[ii:ii + 3]) + for i in range(0, arr.size()): + t = &arr[i] + if t.flag & 64: # first in pair + read1_aligns.append((t.query_start, t.node, t.flag)) else: - 
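For candidate pairs separated by fewer than 150 bp, the scoper now compares the CIGAR-derived SV lengths directly against the new --sd threshold (position_distance_thresh), which is what lets multi-allelic sites stay separate. That test in isolation (lengths_match is a hypothetical name):

    def lengths_match(len_a, len_b, position_distance_thresh=0.8):
        # Sketch of the exact-match test for nearby events: the relative
        # difference in CIGAR-derived SV lengths must be under --sd
        if len_a > 0 and len_b > 0:
            max_span = max(len_a, len_b)
            span_distance = abs(len_a - len_b) / max_span
            return span_distance < position_distance_thresh
        return True  # accept when a length is unknown, as the patch does

    print(lengths_match(100, 120))  # True: 20/120 = 0.17 < 0.8
    print(lengths_match(100, 1000, position_distance_thresh=0.5))  # False: 0.9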
read2_aligns.append(arr[ii:ii + 3]) - primary1 = None - primary2 = None - if len(read1_aligns) > 0: - if len(read1_aligns) == 1: + read2_aligns.append((t.query_start, t.node, t.flag)) + primary1 = -1 + primary2 = -1 + n1 = len(read1_aligns) + n2 = len(read2_aligns) + if n1 > 0: + if n1 == 1: if not read1_aligns[0][2] & 2304: # Is primary primary1 = read1_aligns[0][1] else: - if len(read1_aligns) > 2: - read1_aligns = sorted(read1_aligns) + if n1 > 2: + read1_aligns.sort() # Add edge between alignments that are neighbors on the query sequence, or between primary alignments - for ii in range(len(read1_aligns) - 1): - u_start, u, uflag = read1_aligns[ii] + for i in range(n1 - 1): + u_start, u, uflag = read1_aligns[i] if not uflag & 2304: primary1 = u - v_start, v, vflag = read1_aligns[ii + 1] + v_start, v, vflag = read1_aligns[i + 1] if not G.hasEdge(u, v): G.addEdge(u, v, w=1) - if primary1 is None: + if primary1 == -1: if not read1_aligns[-1][2] & 2304: primary1 = read1_aligns[-1][1] - if len(read2_aligns) > 0: - if len(read2_aligns) == 1: + if n2 > 0: + if n2 == 1: if not read2_aligns[0][2] & 2304: primary2 = read2_aligns[0][1] else: - if len(read2_aligns) > 2: - read2_aligns = sorted(read2_aligns) - for ii in range(len(read2_aligns) - 1): - u_start, u, uflag = read2_aligns[ii] + if n2 > 2: + read2_aligns.sort() + for i in range(n2 - 1): + u_start, u, uflag = read2_aligns[i] if not uflag & 2304: primary2 = u - v_start, v, vflag = read2_aligns[ii + 1] + v_start, v, vflag = read2_aligns[i + 1] if not G.hasEdge(u, v): G.addEdge(u, v, w=1) - if primary2 is None: + # Tried this, only edges between ajacent query aligns, not duplicate ones. but hurt performance! + # j = i + 1 + # while j < n2: + # v_start, v, vflag = read2_aligns[j] + # if v_start != u_start: + # if not G.hasEdge(u, v): + # G.addEdge(u, v, w=1) + # if j < n2 - 1 and read2_aligns[j + 1][0] == v_start: + # j += 1 + # continue + # break + # j += 1 + if primary2 == -1: if not read2_aligns[-1][2] & 2304: primary2 = read2_aligns[-1][1] - if primary1 is not None and primary2 is not None: + if primary1 >= 0 and primary2 >= 0: if not G.hasEdge(primary1, primary2): G.addEdge(primary1, primary2, w=1) @@ -545,35 +554,48 @@ cdef class NodeName: self.tell = t self.cigar_index = cigar_index self.event_pos = event_pos - def as_tuple(self): - return self.hash_name, self.flag, self.pos, self.chrom, self.tell, self.cigar_index, self.event_pos + def as_dict(self): + return {"hash_name": self.hash_name, + "flag": self.flag, + "chrom": self.chrom, + "pos": self.pos, + "tell": self.tell, + "cigar_index": self.cigar_index, + "event_pos": self.event_pos, + "read_enum": self.read_enum, + } + + +cdef struct NodeNameItem: + uint64_t hash_val, tell + uint32_t pos, event_pos + int32_t cigar_index + uint16_t flag, chrom + cdef class NodeToName: # Index these vectors to get the unique 'template_name' # node names have the form (hash qname, flag, pos, chrom, tell, cigar index, event pos) - cdef vector[uint64_t] h - cdef vector[uint16_t] f - cdef vector[uint32_t] p - cdef vector[uint16_t] c - cdef vector[uint64_t] t - cdef vector[int32_t] cigar_index - cdef vector[uint32_t] event_pos + cdef vector[NodeNameItem] stored_nodes def __cinit__(self): pass - cdef void append(self, long a, int b, int c, int d, long e, int f, int g) nogil: - self.h.push_back(a) - self.f.push_back(b) - self.p.push_back(c) - self.c.push_back(d) - if e == -1: # means to look in read buffer instead - e = 0 - self.t.push_back(e) - self.cigar_index.push_back(f) - self.event_pos.push_back(g) + 
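add_template_edges sorts each read's alignments by query start, links neighbours on the query sequence, and finally joins the two primary alignments. The per-read logic sketched in plain Python (link_template is illustrative; 2304 is the secondary/supplementary flag mask 0x900):

    def link_template(aligns, add_edge):
        # aligns: list of (query_start, node, flag) for one read of a template.
        # Sort by query position, join neighbouring alignments, and return
        # the primary node (flag without 0x900 set), or -1 if none found.
        if len(aligns) == 1:
            return aligns[0][1] if not aligns[0][2] & 2304 else -1
        primary = -1
        aligns.sort()
        for (u_start, u, uflag), (v_start, v, vflag) in zip(aligns, aligns[1:]):
            if not uflag & 2304:
                primary = u
            add_edge(u, v)
        if primary == -1 and not aligns[-1][2] & 2304:
            primary = aligns[-1][1]
        return primary

    edges = set()
    p1 = link_template([(0, 1, 0), (60, 2, 2048)], lambda u, v: edges.add((u, v)))
    print(p1, edges)  # 1 {(1, 2)}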
cdef void append(self, long hash_val, int flag, int pos, int chrom, long tell, int cigar_index, int event_pos): + self.stored_nodes.resize(self.stored_nodes.size() + 1); + cdef NodeNameItem *nd = &self.stored_nodes.back(); + nd.hash_val = hash_val + nd.flag = flag + nd.pos = pos + nd.chrom = chrom + if tell == -1: # means to look in read buffer instead + tell = 0 + nd.tell = tell + nd.cigar_index = cigar_index + nd.event_pos = event_pos def __getitem__(self, idx): - return NodeName(self.h[idx], self.f[idx], self.p[idx], self.c[idx], self.t[idx], self.cigar_index[idx], self.event_pos[idx]) + cdef NodeNameItem *nd = &self.stored_nodes[idx] + return NodeName(nd.hash_val, nd.flag, nd.pos, nd.chrom, nd.tell, nd.cigar_index, nd.event_pos) cdef bint same_template(self, int query_node, int target_node): - if self.h[query_node] == self.h[target_node]: + if self.stored_nodes[query_node].hash_val == self.stored_nodes[target_node].hash_val: return True cdef get_query_pos_from_cigarstring(cigar, pos): @@ -599,12 +621,38 @@ cdef get_query_pos_from_cigarstring(cigar, pos): return start, end, pos, ref_end -def get_query_pos_from_cigartuples(r): +cdef void parse_cigar(str cigar, int &start, int &end, int &ref_end): + cdef: + int in_lead = 1 + int num = 0 # To accumulate the number as we parse through the string + char op + cdef bytes t = cigar.encode('utf8') + cdef char *c_cigar = t # Convert Python string to char* + while dereference(c_cigar): + if b'0' <= dereference(c_cigar) <= b'9': + # Convert digit char to int and accumulate + num = num * 10 + (dereference(c_cigar) - 48 ) # ord(b'0') = 48 + else: + op = dereference(c_cigar) + if in_lead and (op == b'S' or op == b'H'): + start += num + end += num + elif op == b'D': + ref_end += num + elif op == b'I': + end += num + elif op == b'M' or op == b'=' or op == b'X': + end += num + ref_end += num + num = 0 + in_lead = 0 + c_cigar += 1 + +def get_query_pos_from_cigartuples(r, query_length): # Infer the position on the query sequence of the alignment using cigar string start = 0 - query_length = r.infer_read_length() # Note, this also counts hard-clips + # query_length = r.infer_read_length() # Note, this also counts hard-clips end = query_length - if r.cigartuples[0][0] == 4 or r.cigartuples[0][0] == 5: start += r.cigartuples[0][1] if r.cigartuples[-1][0] == 4 or r.cigartuples[-1][0] == 5: @@ -613,7 +661,7 @@ def get_query_pos_from_cigartuples(r): AlnBlock = namedtuple("SA", ["query_start", "query_end", "ref_start", "ref_end", "chrom", "mq", "strand", "this"]) -JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "read_enum", "cigar_index"]) +JoinEvent = namedtuple("JE", ["chrom", "event_pos", "chrom2", "pos2", "query_pos", "query_end", "read_enum", "cigar_index"]) class AlignmentsSA: @@ -623,6 +671,7 @@ class AlignmentsSA: self.aln_strand = None self.query_aligns = [] self.join_result = [] + self.query_length = r.infer_read_length() # Note, this also counts hard-clips self._alignments_from_sa(r, gettid) def connect_alignments(self, r, max_dist=1000, mq_thresh=0, read_enum=0): @@ -633,24 +682,29 @@ class AlignmentsSA: self._connect_right(r, max_dist, mq_thresh, read_enum) def _alignments_from_sa(self, r, gettid): - qstart, qend, query_length = get_query_pos_from_cigartuples(r) + qstart, qend, query_length = get_query_pos_from_cigartuples(r, self.query_length) this_aln = AlnBlock(query_start=qstart, query_end=qend, ref_start=r.pos, ref_end=r.reference_end, chrom=r.rname, mq=r.mapq, strand="-" if r.flag & 16 else "+", this=True) self.aln_strand 
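parse_cigar replaces the old regex with a single pass over the SA tag's CIGAR string. The same state machine in Python for reference (semantics as in the patch: leading soft/hard clips shift both query coordinates, and trailing clips are ignored):

    def parse_cigar(cigar, start=0, end=0, ref_end=0):
        # One pass over a CIGAR string, accumulating query start/end and the
        # reference end; leading S/H clips offset the query coordinates
        num, in_lead = 0, True
        for ch in cigar:
            if ch.isdigit():
                num = num * 10 + (ord(ch) - 48)
                continue
            if in_lead and ch in "SH":
                start += num
                end += num
            elif ch == "D":
                ref_end += num
            elif ch == "I":
                end += num
            elif ch in "M=X":
                end += num
                ref_end += num
            num, in_lead = 0, False
        return start, end, ref_end

    print(parse_cigar("10S40M5I30M", ref_end=1000))  # (10, 85, 1070)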
= this_aln.strand query_aligns = [this_aln] + + cdef int ref_start, ref_end, query_start, query_end for sa_block in r.get_tag("SA").split(";"): if sa_block == "": break + query_start = 0 + query_end = 0 sa = sa_block.split(",", 5) - matches = [(int(slen), opp) for slen, opp in re.findall(r'(\d+)([A-Z]{1})', sa[3])] - query_start, query_end, ref_start, ref_end = get_query_pos_from_cigarstring(matches, int(sa[1])) + ref_start = int(sa[1]) + ref_end = ref_start + parse_cigar(sa[3], query_start, query_end, ref_end) if this_aln.strand != sa[2]: start_temp = query_length - query_end query_end = start_temp + query_end - query_start query_start = start_temp query_aligns.append(AlnBlock(query_start, query_end, ref_start, ref_end, gettid(sa[0]), int(sa[4]), sa[2], False)) - query_aligns = sorted(query_aligns) + query_aligns.sort() cdef int index = 0 for i, item in enumerate(query_aligns): if item.this: @@ -658,13 +712,16 @@ class AlignmentsSA: break self.query_aligns = query_aligns - def _connect_left(self, r, max_dist, mq_thresh, read_enum): #, max_gap_size=100): + def _connect_left(self, r, max_dist, mq_thresh, read_enum): a = self.query_aligns[self.index] b = self.query_aligns[self.index - 1] - # gap = abs(a.query_start - b.query_end) - # if gap > max_gap_size: - # return event_pos = a.ref_start + query_pos = a.query_start + query_end = a.query_end + if r.is_reverse: + qtemp = query_pos + query_pos = self.query_length - query_end + query_end = self.query_length - qtemp chrom = a.chrom cigar_index = 0 if b.strand == self.aln_strand: @@ -673,21 +730,24 @@ class AlignmentsSA: pos2 = b.ref_start chrom2 = b.chrom if b.mq < mq_thresh: - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return elif self.paired: if not r.flag & 2304 and r.flag & 2 and r.pnext <= event_pos and (chrom != chrom2 or abs(pos2 - event_pos) > max_dist): - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return - self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, read_enum, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index)) - def _connect_right(self, r, max_dist, mq_thresh, read_enum): #, max_gap_size=100): + def _connect_right(self, r, max_dist, mq_thresh, read_enum): a = self.query_aligns[self.index] b = self.query_aligns[self.index + 1] - # gap = abs(b.query_start - a.query_end) - # if gap > max_gap_size: - # return event_pos = a.ref_end + query_pos = a.query_start + query_end = a.query_end + if r.is_reverse: + qtemp = query_pos + query_pos = self.query_length - query_end + query_end = self.query_length - qtemp chrom = a.chrom cigar_index = len(r.cigartuples) - 1 if b.strand == self.aln_strand: @@ -696,17 +756,17 @@ class AlignmentsSA: pos2 = b.ref_end chrom2 = b.chrom if b.mq < mq_thresh: - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return elif self.paired: # If paired, and SA block fits between two normal primary alignments, and block is not local, then # ignore block and try and 
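When the read aligns to the reverse strand, the query interval is mirrored so that query_pos/query_end always refer to the sequenced read's orientation, matching the flip applied in _connect_left and _connect_right. The transform in isolation (orient_query_interval is a hypothetical name):

    def orient_query_interval(query_pos, query_end, query_length, is_reverse):
        # Mirror a query-space interval for reverse-strand alignments so all
        # coordinates refer to the original read orientation
        if is_reverse:
            return query_length - query_end, query_length - query_pos
        return query_pos, query_end

    print(orient_query_interval(10, 60, 100, True))   # (40, 90)
    print(orient_query_interval(10, 60, 100, False))  # (10, 60)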
call insertion if not r.flag & 2304 and r.flag & 2 and r.pnext >= event_pos and (chrom != chrom2 or abs(pos2 - event_pos) > max_dist): # is primary, is proper pair. Use primary mate info, not SA - self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, ReadEnum_t.BREAKEND, cigar_index)) + self.join_result.append(JoinEvent(chrom, event_pos, chrom, event_pos, query_pos, query_end, ReadEnum_t.BREAKEND, cigar_index)) return - self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, read_enum, cigar_index)) - + self.join_result.append(JoinEvent(chrom, event_pos, chrom2, pos2, query_pos, query_end, read_enum, cigar_index)) +# cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, pos, node_name): cdef int other_node cdef int count = 0 @@ -722,7 +782,7 @@ cdef int cluster_clipped(Py_SimpleGraph G, r, ClipScoper_t clip_scope, chrom, po cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_scope, TemplateEdges_t template_edges, NodeToName node_to_name, genome_scanner, - int flag, int chrom, tell, int cigar_index, int event_pos, + int flag, int chrom, tell, int cigar_index, int event_pos, int query_pos, int chrom2, int pos2, ClipScoper_t clip_scope, ReadEnum_t read_enum, bint p1_overlaps, bint p2_overlaps, bint mm_only, int clip_l, site_adder, int length_from_cigar, bint trust_ins_len, bint paired_end): @@ -733,12 +793,8 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ cdef uint64_t v = xxhasher(bam_get_qname(r._delegate), len(r.qname), 42) # Hash qname to save mem cdef int bnd_site_node, bnd_site_node2 cdef int q_start - # echo("\nADDING", node_name, r.qname, (chrom, event_pos), (chrom2, pos2), flag, read_enum) node_to_name.append(v, flag, r.pos, chrom, tell, cigar_index, event_pos) # Index this list to get the template_name genome_scanner.add_to_buffer(r, node_name, tell) # Add read to buffer - if read_enum < 2: # Prevents joining up within-read svs with between-read svs - q_start = r.query_alignment_start if not r.flag & 16 else r.infer_query_length() - r.query_alignment_end - template_edges.add(r.qname, flag, node_name, q_start) both_overlap = p1_overlaps and p2_overlaps if not paired_end or (paired_end and read_enum != BREAKEND and not mm_only and not chrom2 == -1): @@ -761,14 +817,19 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_ bnd_site_node2 = site_adder.find_nearest_site(chrom2, pos2) if bnd_site_node != bnd_site_node2 and bnd_site_node2 >= 0 and not G.hasEdge(node_name, bnd_site_node2): G.addEdge(node_name, bnd_site_node2, 0) + + if read_enum < 2: # Prevents joining up within-read svs with between-read svs + template_edges.add(r.qname, flag, node_name, query_pos) + # Debug: - # if r.qname == "M03762:232:000000000-L65J4:1:2111:17729:15161": + # if r.qname == "cfa54189-601a-4a98-aeca-60ea41d0b931": # echo("---", r.qname, read_enum, node_name, (event_pos, pos2), length_from_cigar, list(other_nodes)) - # look = {'a4d38568-fd80-4785-8fa5-84ed132b445c', '2313a985-385c-4c84-b02c-dddfc627940b', '0031840a-bd2d-475d-9a04-528f71c7b512'} + # look = {129,128 } + # if node_name == 25: # if r.qname in look: # # if r.qname == "D00360:18:H8VC6ADXX:1:1210:7039:44052": - # echo(r.qname, r.flag, node_name, (chrom, event_pos), (chrom2, pos2), list(other_nodes), - # cigar_index, length_from_cigar, "enum=", read_enum) + # echo(r.qname, length_from_cigar, "nodename:", node_name, list(other_nodes), (chrom, event_pos), (chrom2, pos2), + # f"enum={read_enum}") # 
echo(list(other_nodes)) @@ -867,9 +928,9 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int cdef bint good_clip if paired_end and read_enum == SPLIT and flag & 8: # clip event, or whole read, but mate is unmapped return + if site_adder: - if site_adder: - site_adder.add_any_sites(r.rname, event_pos, G, pe_scope, node_to_name, clustering_dist) + site_adder.add_any_sites(r.rname, event_pos, G, pe_scope, node_to_name, clustering_dist) if paired_end or flag & 1: pnext = r.pnext rnext = r.rnext @@ -898,6 +959,7 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int event_pos = j.event_pos chrom2 = j.chrom2 pos2 = j.pos2 + query_pos = j.query_pos read_enum = j.read_enum cigar_index = j.cigar_index if read_enum == BREAKEND: @@ -916,7 +978,7 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int pos2 = event_pos add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, + tell, cigar_index, event_pos, query_pos, chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, mm_only, clip_l, site_adder, 0, trust_ins_len, paired_end) return @@ -935,9 +997,13 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int pos2 = cigar_pos2 else: pos2 = event_pos + add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, - mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) + tell, cigar_index, event_pos, + 1, + chrom2, pos2, clip_scope, read_enum, + current_overlaps_roi, next_overlaps_roi, + mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) ### else: # Single end current_overlaps_roi, next_overlaps_roi = False, False # not supported @@ -947,21 +1013,28 @@ cdef void process_alignment(Py_SimpleGraph G, AlignedSegment r, int clip_l, int all_aligns.connect_alignments(r, loci_dist, mapq_thresh, read_enum) if not all_aligns.join_result: return - # for chrom, event_pos, chrom2, pos2, read_e, cigar_index in all_aligns.join_result: for j in all_aligns.join_result: add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, j.chrom, - tell, j.cigar_index, j.event_pos, j.chrom2, j.pos2, clip_scope, j.read_enum, + tell, j.cigar_index, j.event_pos, + j.query_pos, + j.chrom2, j.pos2, clip_scope, j.read_enum, current_overlaps_roi, next_overlaps_roi, - mm_only, clip_l, site_adder, 0, trust_ins_len, paired_end) - + mm_only, clip_l, site_adder, + 0, + trust_ins_len, paired_end) elif read_enum >= 2: # Sv within read or breakend + if read_enum == BREAKEND: + return chrom2 = r.rname if cigar_index != -1 and r.cigartuples[cigar_index][0] != 1: # If not insertion pos2 = cigar_pos2 else: pos2 = event_pos + add_to_graph(G, r, pe_scope, template_edges, node_to_name, genome_scanner, flag, chrom, - tell, cigar_index, event_pos, chrom2, pos2, clip_scope, read_enum, + tell, cigar_index, event_pos, + 1, + chrom2, pos2, clip_scope, read_enum, current_overlaps_roi, next_overlaps_roi, mm_only, clip_l, site_adder, length_from_cigar, trust_ins_len, paired_end) @@ -1090,7 +1163,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering int paired_end=1, int read_length=150, bint contigs=True, float norm_thresh=100, float spd_thresh=0.3, bint mm_only=False, sites=None, bint 
trust_ins_len=True, low_mem=False, temp_dir=".", - find_n_aligned_bases=True): + find_n_aligned_bases=True, position_distance_thresh=0.8, max_search_depth=20): logging.info("Building graph with clustering {} bp".format(clustering_dist)) cdef TemplateEdges_t template_edges = TemplateEdges() # Edges are added between alignments from same template, after building main graph cdef int event_pos, cigar_index, opp, length @@ -1099,7 +1172,8 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering minimizer_support_thresh=minimizer_support_thresh, minimizer_breadth=minimizer_breadth, read_length=read_length) # Infers long-range connections, outside local scope using pe information - cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end) + cdef PairedEndScoper_t pe_scope = PairedEndScoper(max_dist, clustering_dist, infile.header.nreferences, norm_thresh, spd_thresh, paired_end, position_distance_thresh, max_search_depth) + bad_clip_counter = BadClipCounter(infile.header.nreferences, low_mem, temp_dir) cdef Py_SimpleGraph G = map_set_utils.Py_SimpleGraph() site_adder = None @@ -1157,7 +1231,7 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.INSERTION)) added = True elif opp == 2: - if length >= min_sv_size: # elif! + if length >= min_sv_size: pos2 = event_pos + length events_to_add.push_back(make_cigar_event(opp, cigar_index, event_pos, pos2, length, ReadEnum_t.DELETION)) added = True @@ -1283,9 +1357,9 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): cdef tuple t cdef unsigned long[:] p if len(parts) == 0: - return {}, {} + return None, None elif len(parts) == 1: - return {}, {0: parts[0]} + return None, {0: parts[0]} cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap() for i, p in enumerate(parts): for node in p: @@ -1315,7 +1389,7 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): if t in seen_t: continue if t not in counts: - counts[t] = [set([]), set([])] + counts[t] = (set([]), set([])) if j < i: counts[t][0].add(child) counts[t][1].add(node) @@ -1331,7 +1405,8 @@ cdef tuple count_support_between(Py_SimpleGraph G, parts): self_counts[i].append(node) seen_t.update(current_t) # Only count edge once for t in current_t: # save memory by converting support_between to array - counts[t] = [np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]] + counts[t] = tuple(np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]) + return counts, self_counts @@ -1393,6 +1468,24 @@ cpdef break_large_component(Py_SimpleGraph G, component, int min_support): return jobs +@cython.auto_pickle(True) +cdef class GraphComponent: + cdef public object parts, s_between, reads, s_within, n2n, info + def __init__(self, parts, s_between, reads, s_within, n2n, info): + self.parts = parts + self.s_between = s_between + self.reads = reads + self.s_within = s_within + self.n2n = n2n + self.info = info + def as_dict(self): + return {"parts": self.parts, + "s_between": self.s_between, + "reads": self.reads, + "s_within": self.s_within, + "n2n": self.n2n, + "info": self.info} + cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGraph G, int min_support, int procs, int paired_end, sites_index): n2n = {} @@ -1425,39 +1518,31 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap # Explore component 
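proc_component now returns a compact extension class rather than a per-component dict, trimming allocation and pickling overhead when components are shipped between worker processes. A plain-Python stand-in with the same fields (dataclass used here for brevity only):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class GraphComponent:
        # Stand-in for the new cdef class: fixed fields, picklable, and
        # cheaper than building a dict per graph component
        parts: Any
        s_between: Any
        reads: Any
        s_within: Any
        n2n: Any
        info: Any

        def as_dict(self):
            return self.__dict__.copy()

    gc = GraphComponent(None, None, None, None, {7: "node"}, None)
    print(gc.as_dict()["n2n"])  # {7: 'node'}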
for locally interacting nodes; create partitions using these partitions = get_partitions(G, component) support_between, support_within = count_support_between(G, partitions) - if len(support_between) == 0 and len(support_within) == 0: + + if not support_between and not support_within: if not paired_end: if len(n2n) >= min_support or len(reads) >= min_support or info: - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent(None, None, None, None, n2n, info) else: return else: # single paired end template can have 3 nodes e.g. two reads plus supplementary if min_support == 1 and (len(n2n) >= min_support or len(reads) >= min_support): - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent(None, None, None, None, n2n, info) elif len(reads) >= min_support or info: - d = {"parts": [], "s_between": {}, "reads": reads, "s_within": {}, "n2n": n2n} - if info: - d["info"] = info - return d + return GraphComponent(None, None, None, None, n2n, info) else: return # Debug: - # if 157835 in n2n: + # if 14 in n2n: # echo("parts", partitions) # echo("s_between", support_between) # echo("s_within", support_within) - + # if len(partitions) > 1: # echo("parts", partitions) # echo("s_between", support_between) # echo("s_within", support_within) - + # # echo("n2n", n2n.keys()) # node_look = set(range(653526, 653532)) # node_look = set(range(8)) @@ -1466,7 +1551,12 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap # echo("parts", partitions) # echo("s_between", sb) # echo("s_within", support_within) - d = {"parts": partitions, "s_between": support_between, "reads": reads, "s_within": support_within, "n2n": n2n} - if info: - d["info"] = info - return d + + support_between = tuple(support_between.items()) if support_between else None + support_within = tuple(support_within.items()) if support_within else None + return GraphComponent(partitions, + support_between, + reads, + support_within, + n2n, + info) diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx index baa3f7b..9409f60 100644 --- a/dysgu/io_funcs.pyx +++ b/dysgu/io_funcs.pyx @@ -129,7 +129,7 @@ cpdef list col_names(small_output): if small_output: return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type", "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc", - ["GT", "GQ", "MAPQpri", "su", "spanning", "pe", "supp", "sc", "bnd", + ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd", "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed", "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"] ] @@ -264,7 +264,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"] if small_output: - fmt_keys = "GT:GQ:MAPQP:SU:WR:PE:SR:SC:BND:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" + fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB" else: fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB" @@ -309,7 +309,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small def get_fmt(r, 
diff --git a/dysgu/io_funcs.pyx b/dysgu/io_funcs.pyx
index baa3f7b..9409f60 100644
--- a/dysgu/io_funcs.pyx
+++ b/dysgu/io_funcs.pyx
@@ -129,7 +129,7 @@ cpdef list col_names(small_output):
     if small_output:
         return ["chrA", "posA", "chrB", "posB", "sample", "event_id", "grp_id", "n_in_grp", "kind", "type",
                 "svtype", "join_type", "cipos95A", "cipos95B", 'contigA', 'contigB', "svlen", "svlen_precise", "rep", "gc",
-                ["GT", "GQ", "MAPQpri", "su", "spanning", "pe", "supp", "sc", "bnd",
+                ["GT", "GQ", "MAPQpri", "MAPQsupp", "su", "spanning", "pe", "supp", "sc", "bnd",
                  "NMbase", "raw_reads_10kb", "neigh10kb", "plus", "minus", "remap_score", "remap_ed",
                  "bad_clip_count", "fcc", "inner_cn", "outer_cn", "prob"]
                 ]
@@ -264,7 +264,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small
         info_extras += [f"MeanPROB={round(mean_prob, 3)}", f"MaxPROB={round(max_prob, 3)}"]

     if small_output:
-        fmt_keys = "GT:GQ:MAPQP:SU:WR:PE:SR:SC:BND:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
+        fmt_keys = "GT:GQ:MAPQP:MAPQS:SU:WR:PE:SR:SC:BND:NMB:COV:NEIGH10:PS:MS:RMS:RED:BCC:FCC:ICN:OCN:PROB"
     else:
         fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"
@@ -309,7 +309,7 @@ def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small

 def get_fmt(r, small_output):
     if small_output:
-        v = [r["GT"], r["GQ"], r['MAPQpri'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'],
+        v = [r["GT"], r["GQ"], r['MAPQpri'], r['MAPQsupp'], r['su'], r['spanning'], r['pe'], r['supp'], r['sc'], r['bnd'],
              r['NMbase'], r['raw_reads_10kb'], r['neigh10kb'], r["plus"], r["minus"], r["remap_score"], r["remap_ed"],
              r["bad_clip_count"], round(r["fcc"], 3), round(r["inner_cn"], 3), round(r["outer_cn"], 3), r['prob']
              ]
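
The small-output FORMAT string now carries MAPQS (mean MAPQ of supplementary alignments) and NMB (base-level mismatch rate). Once written, these fields can be read back with pysam in the usual way; the file name below is hypothetical:

    import pysam

    vcf = pysam.VariantFile("sample.dysgu.vcf")
    sample = vcf.header.samples[0]  # dysgu call output is single-sample
    for rec in vcf:
        fmt = rec.samples[sample]
        mapq_supp = fmt["MAPQS"] if "MAPQS" in fmt else None
        nm_base = fmt["NMB"] if "NMB" in fmt else None
        print(rec.chrom, rec.pos, rec.info["SVTYPE"], mapq_supp, nm_base)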
diff --git a/dysgu/main.py b/dysgu/main.py
index bf328ab..df75ea9 100644
--- a/dysgu/main.py
+++ b/dysgu/main.py
@@ -5,7 +5,7 @@ import shutil
 import time
 from multiprocessing import cpu_count
-from subprocess import run
+from subprocess import run, Popen, PIPE
 from importlib.metadata import version
 import warnings
 from dysgu import cluster, view, sv2bam, filter_normals
@@ -189,6 +189,8 @@ def cli():
               type=int, callback=add_option_set)
 @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set)
 @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True)
+@click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True)
+@click.option('--search-depth', help="Search through this many local reads for matching SVs. Increase this to identify low frequency events", default=20, type=float, show_default=True)
 @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set)
 @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True)
@@ -348,6 +350,8 @@ def get_reads(ctx, **kwargs):
               type=int, callback=add_option_set)
 @click.option('--dist-norm', help=f"Distance normalizer [default: {defaults['dist_norm']}]", type=float, callback=add_option_set)
 @click.option('--spd', help="Span position distance", default=0.3, type=float, show_default=True)
+@click.option('--sd', help="Span distance, only SV span is considered, lower values separate multi-allelic sites", default=0.8, type=float, show_default=True)
+@click.option('--search-depth', help="Search through this many local reads for matching SVs. Increase this to identify low frequency events", default=20, type=float, show_default=True)
 @click.option('--trust-ins-len', help=f"Trust insertion length from cigar, for high error rate reads use False [default: {defaults['trust_ins_len']}]", type=str, callback=add_option_set)
 @click.option('--length-extend', help=f"Extend SV length if any nearby gaps found with length >= length-extend. Ignored for paired-end reads", type=int, default=15, show_default=True)
@@ -427,6 +431,8 @@ def call_events(ctx, **kwargs):
 @click.option("-p", "--procs", help="Number of processors to use when merging, requires --wd option to be supplied", type=cpu_range, default=1, show_default=True)
 @click.option("-d", "--wd", help="Working directory to use/create when merging", type=click.Path(exists=False), required=False)
 @click.option("-c", "--clean", help="Remove working directory when finished", is_flag=True, flag_value=True, show_default=False, default=False)
+@click.option("--progress", help="Print detailed progress information", is_flag=True, flag_value=True, show_default=False, default=False)
+@click.option("--cohort-update", help="Update this cohort file with new calls from input_files", required=False, type=click.Path(exists=True))
 @click.option("--collapse-nearby", help="Merges more aggressively by collapsing nearby SV", default="True",
               type=click.Choice(["True", "False"]), show_default=True)
 @click.option("--merge-across", help="Merge records across input samples", default="True",
@@ -435,6 +441,8 @@ def call_events(ctx, **kwargs):
               default="False", type=click.Choice(["True", "False"]), show_default=True)
 @click.option("--merge-dist", help="Distance threshold for merging", default=500, type=int,
               show_default=True)
+@click.option("--max-comparisons", help="Compare each event with up to --max-comparisons local SVs",
+              default=20, type=int, show_default=True)
 @click.option("--separate", help="Keep merged tables separate, adds --post-fix to file names, csv format only", default="False",
               type=click.Choice(["True", "False"]), show_default=True)
 @click.option("--post-fix", help="Adds --post-fix to file names, only if --separate is True",
@@ -467,8 +475,10 @@ def view_data(ctx, **kwargs):
 @click.option("--target-sample", help="If input_vcf if multi-sample, use target-sample as input", required=False, type=str, default="")
 @click.option("--keep-all", help="All SVs classified as normal will be kept in the output, labelled as filter=normal", is_flag=True, flag_value=True, show_default=False, default=False)
 @click.option("--ignore-read-groups", help="Ignore ReadGroup RG tags when parsing sample names. Filenames will be used instead", is_flag=True, flag_value=True, show_default=False, default=False)
-@click.option("--min-prob", help="Remove SVs with PROB value < min-prob", default=0.1, type=float, show_default=True)
-@click.option("--pass-prob", help="Re-label SVs as PASS if PROB value >= pass-prob", default=1.0, type=float, show_default=True)
+@click.option("--max-divergence", help="Remove SV if normal_bam displays divergence > max-divergence at same location", default=0.1, type=float, show_default=True)
+@click.option("--min-prob", help="Remove SV with PROB value < min-prob", default=0.1, type=float, show_default=True)
+@click.option("--min-mapq", help="Remove SV with mean mapq < min-mapq", default=10, type=float, show_default=True)
+@click.option("--pass-prob", help="Re-label SV as PASS if PROB value >= pass-prob", default=1.0, type=float, show_default=True)
 @click.option("--interval-size", help="Interval size for searching normal-vcf/normal-bams", default=1000, type=int, show_default=True)
 @click.option("--random-bam-sample", help="Choose N random normal-bams to search. Use -1 to ignore", default=-1, type=int, show_default=True)
 @click.pass_context
@@ -482,6 +492,8 @@ def filter_normal(ctx, **kwargs):


 @cli.command("test")
+@click.option("--verbose", help="Show output of test commands",
+              is_flag=True, flag_value=True, show_default=False, default=False)
 @click.pass_context
 def test_command(ctx, **kwargs):
     """Run dysgu tests"""
@@ -531,9 +543,14 @@ def test_command(ctx, **kwargs):
                   "-o " + pwd + '/test.merge.dysgu{}.vcf'.format(dysgu_version)])
     for t in tests:
         c = " ".join(t)
-        v = run(shlex.split(c), shell=False, capture_output=True, check=True)
-        if v.returncode != 0:
-            raise RuntimeError(t, "finished with non zero: {}".format(v))
+        process = Popen(shlex.split(c), stdout=PIPE, stderr=PIPE, text=True)
+        if kwargs["verbose"]:
+            for line in process.stderr:
+                click.echo(line.strip(), err=True)
+        process.wait()
+        if process.returncode != 0:
+            logging.warning(f"Command failed with return code {process.returncode}")
         else:
-            click.echo("PASS: " + c, err=True)
+            click.echo("PASS: " + c + "\n", err=True)
+    logging.info("Run test complete")
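
The test runner now streams the child's stderr when --verbose is set. One caveat worth noting: with both pipes captured, a bare wait() can block if the non-verbose child fills a pipe buffer, so draining via communicate() is the safer default. A sketch of that pattern (the helper name is illustrative, not part of the patch):

    import shlex
    from subprocess import Popen, PIPE

    def run_streaming(cmd, verbose=False):
        proc = Popen(shlex.split(cmd), stdout=PIPE, stderr=PIPE, text=True)
        if verbose:
            for line in proc.stderr:      # echo stderr as it is produced
                print(line.rstrip())
            proc.wait()
        else:
            proc.communicate()            # drain both pipes, then reap the child
        return proc.returncode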
diff --git a/dysgu/post_call.py b/dysgu/post_call.py
index b9a0044..d856b25 100644
--- a/dysgu/post_call.py
+++ b/dysgu/post_call.py
@@ -286,9 +286,11 @@ def ref_repetitiveness(events, ref_genome):
         if e.svlen < 150 and e.svtype == "DEL":
             try:
                 ref_seq = ref_genome.fetch(e.chrA, e.posA, e.posB).upper()
-            except ValueError:  # todo out or range, needs fixing
-                continue
-            e.ref_rep = compute_rep(ref_seq)
+                e.ref_rep = compute_rep(ref_seq)
+            except (ValueError, KeyError, IndexError) as errors:
+                # Might be different reference genome version, compared to bam genome
+                logging.warning("Error fetching reference chromosome: {}: {}".format(e.chrA, errors))
+                e.ref_rep = 0
     return events
diff --git a/dysgu/re_map.py b/dysgu/re_map.py
index 3418a7d..08be124 100644
--- a/dysgu/re_map.py
+++ b/dysgu/re_map.py
@@ -4,7 +4,7 @@ from dysgu.coverage import merge_intervals
 from dysgu.assembler import compute_rep
 import math
-import edlib
+import dysgu.edlib as edlib
 import logging
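
edlib is now vendored as dysgu.edlib rather than pulled from PyPI. Assuming the vendored module keeps the upstream edlib API, which the drop-in import swap suggests, usage is unchanged:

    import dysgu.edlib as edlib

    # Infix ("HW") alignment: locate the query anywhere inside the target
    res = edlib.align("ACGTGGA", "TTACGTGGAAA", mode="HW", task="locations")
    print(res["editDistance"], res["locations"])  # -> 0 [(2, 8)]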
diff --git a/dysgu/sites_utils.py b/dysgu/sites_utils.py
index 8d0f3eb..223c183 100644
--- a/dysgu/sites_utils.py
+++ b/dysgu/sites_utils.py
@@ -65,7 +65,7 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro
             continue
         if ignore_sample and sample_name in r.samples and "GT" in r.samples[sample_name]:
             this_gt = r.samples[sample_name]['GT']
-            if not (this_gt == "0/0" or this_gt == "0" or this_gt == "." or this_gt == "./." or this_gt == "0|0"):
+            if not (this_gt == (0, 0) or this_gt == "0/0" or this_gt == "0" or this_gt == "." or this_gt == "./." or this_gt == "0|0"):
                 continue
         if "CHROM2" in r.info:
             chrom2_info = r.info["CHROM2"]
@@ -103,6 +103,11 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro
         if chrom == -1 or chrom2 == -1:
             logging.warning(f"Chromosome from record in --sites not found in input file header CHROM={r.chrom}, POS={r.start}, CHROM2={chrom2_info}, END={r.stop}")

+        if isinstance(chrom, str):
+            raise ValueError(f"Could not find {chrom} in bam file header")
+        if isinstance(chrom2, str):
+            chrom2 = chrom
+
         start = r.start  # note pysam converts to zero-based index like bam
         stop = r.stop
         if chrom == chrom2 and stop < start:
@@ -119,7 +124,7 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro
             else:
                 svlen = int(r.info["SVLEN"])
         else:
-            svlen = None
+            svlen = -1

         # is_dysgu = False
         # if "SVMETHOD" in r.info and "DYSGU" in r.info["SVMETHOD"]:
@@ -144,7 +149,6 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro
             raise ValueError("PROB or MeanPROB must be a float in range 0-1, error at CHROM {}, POS {}".format(chrom, start))
         recs[chrom].append(Site(chrom, start, chrom2, stop, svt, idx, r.id, svlen, pval))
-
     if not_parsed:
         logging.warning("Some records had incompatible SVTYPE: {}".format(
             str({k: v for k, v in Counter(not_parsed).items()})))
@@ -152,7 +156,8 @@ def vcf_reader(pth, infile, parse_probs, sample_name, ignore_sample, default_pro


 def append_uncalled(df, site_adder, infile, parse_probs):
-
+    if site_adder is None:
+        raise ValueError("Sites was None type")
     # add 0/0 genotype to uncalled variants in --sites
     found_sites = set([i for i in df["site_info"] if i])
     uncalled = []
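
The genotype fix above matters because pysam returns GT as a tuple of allele indices, not a string, so the old "0/0" comparisons silently never matched. A small normaliser makes the hom-ref test explicit (a sketch, not part of the patch):

    def is_hom_ref(gt):
        if isinstance(gt, tuple):  # pysam: (0, 0), (0, None), (None, None) ...
            return all(a in (0, None) for a in gt)
        return str(gt) in ("0/0", "0|0", "0", ".", "./.")

    assert is_hom_ref((0, 0)) and is_hom_ref("0|0") and not is_hom_ref((0, 1))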
= 0
@@ -53,6 +48,56 @@ cdef class AlignmentItem:
         self.inferred_sv_len = -1
         self.size_inferred = 0  # set to 1 if insertion size was inferred

+# cdef void from_node_info(AlignmentItem v):
+#     a_info = v.a_node_info
+#     b_info = v.b_node_info
+#
+#     v.breakA = v.a_node_info.event_pos
+#     v.breakA_precise = 1 if v.a_node_info.read_enum == 1 else 0
+#     v.breakB = v.b_node_info.event_pos
+#     v.breakB_precise = 1 if v.b_node_info.read_enum == 1 else 0
+#
+#     v.join_type = f"{v.strandA}to{v.strandB}"
+#
+#     cdef int query_gap = 0
+#     cdef int query_overlap = 0
+#     if v.rA == v.rB:  # same read
+#         if v.b_qstart < v.a_qstart:  # B is first on query
+#             query_gap = v.a_qstart - v.b_qend
+#         else:
+#             query_gap = v.b_qstart - v.a_qend
+#         if query_gap < 0:
+#             query_overlap = abs(query_gap)
+#             query_gap = 0
+#
+#     v.query_gap = query_gap
+#     v.query_overlap = query_overlap
+#
+#     if a_info.chrom != b_info.chrom:
+#         v.svtype = "TRA"
+#     else:
+#         if a_info.event_pos < b_info.event_pos:
+#             if a_info.connect_right:
+#                 if not b_info.connect_right:
+#                     v.svtype = "DEL"
+#                 else:
+#                     v.svtype = "INV"
+#             else:
+#                 if not b_info.connect_right:
+#                     v.svtype = "INV"
+#                 else:
+#                     v.svtype = "DUP"
+#         else:
+#             if a_info.connect_right:
+#                 if not b_info.connect_right:
+#                     v.svtype = "DUP"
+#                 else:
+#                     v.svtype = "INV"
+#             else:
+#                 if not b_info.connect_right:
+#                     v.svtype = "INV"
+#                 else:
+#                     v.svtype = "DEL"


 cdef void two_primary(AlignmentItem v):
@@ -272,7 +317,6 @@ cdef void same_read(AlignmentItem v):
                 v.breakB = v.endB
                 v.svtype = "DUP"

-                # v.svtype = "INS"
                 # Check if gap on query is bigger than gap on reference
                 ref_gap = abs(v.breakB - v.breakA)
                 if v.a_qstart > v.b_qstart:  # B is first on query seq
@@ -785,7 +829,6 @@ cdef void translocation(AlignmentItem v):
     v.query_gap = query_gap
     v.query_overlap = query_overlap

-
 cdef void classify_d(AlignmentItem v):  #, debug=False):
     v.breakA_precise = 0
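
The commented-out from_node_info sketch encodes a compact orientation table: on the same chromosome, the SV class falls out of which side each break "connects" to. The same table in plain Python, with names following the node_info fields used in the commented block:

    def classify_join(chrom_a, pos_a, right_a, chrom_b, pos_b, right_b):
        if chrom_a != chrom_b:
            return "TRA"
        if pos_a > pos_b:  # normalise so A is the leftmost break
            pos_a, right_a, pos_b, right_b = pos_b, right_b, pos_a, right_a
        if right_a and not right_b:
            return "DEL"   # leftmost break joins rightwards, rightmost joins leftwards
        if not right_a and right_b:
            return "DUP"
        return "INV"       # both breaks join in the same direction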
lens_before))
         count = io_funcs.to_vcf(df, args, seen_names, outfile, header=header, small_output_f=True,
                                 contig_names=contig_names, show_names=log_messages)
         if log_messages:
-            logging.info("Sample rows before merge {}, rows after {}".format(list(map(len, dfs)), count))
+            logging.info("Sample rows before merge {}, rows after {}".format(lens_before, count))
+        elif show_progress and job_id:
+            logging.info("{} rows after {}".format(job_id, count))
     else:
         to_csv(df, args, outfile, small_output=False)
@@ -444,7 +455,7 @@ def write(self, r):
         self.vcf.write(str(r))


-def shard_job(wd, item_path, name, Global):
+def shard_job(wd, item_path, name, Global, show_progress):
     shards = {}
     vcf = pysam.VariantFile(item_path, 'r')
     contigs = len(vcf.header.contigs)
@@ -473,6 +484,8 @@ def shard_job(wd, item_path, name, Global):
     for v in shards.values():
         v.close()
     vcf.close()
+    if show_progress:
+        logging.info(f"Finished splitting {item_path}")
     return list(shards.keys()), rows, name
@@ -545,7 +558,7 @@ def sort_into_single_file(out_f, vcf_header, file_paths_to_combine, sample_list)
     return written


-def shard_data(args, input_files, Global):
+def shard_data(args, input_files, Global, show_progress):
     logging.info(f"Merge distance: {args['merge_dist']} bp")
     out_f = args['svs_out'] if args['svs_out'] else '-'
     logging.info("SVs output to {}".format(out_f if out_f != '-' else 'stdout'))
@@ -554,9 +567,11 @@ def shard_data(args, input_files, Global):
     seen_names, names_list = get_names_list(input_files, ignore_csv=False)

     # Split files into shards
+    if show_progress:
+        logging.info("Splitting files into shards")
     job_args = []
     for name, item in zip(names_list, input_files):
-        job_args.append((args['wd'], item, name, Global))
+        job_args.append((args['wd'], item, name, Global, show_progress))
     pool = multiprocessing.Pool(args['procs'])
     results = pool.starmap(shard_job, job_args)
     input_rows = {}
@@ -572,6 +587,8 @@ def shard_data(args, input_files, Global):
         resource.setrlimit(resource.RLIMIT_NOFILE, (min(hard, max(Global.soft, soft) * len(input_files)), hard))

     # Process shards
+    if show_progress:
+        logging.info("Processing shards")
     job_args2 = []
     needed_args = {k: args[k] for k in ["wd", "metrics", "merge_within", "merge_dist", "collapse_nearby", "merge_across", "out_format", "separate", "verbosity", "add_kind"]}
     merged_outputs = []
@@ -584,7 +601,7 @@ def shard_data(args, input_files, Global):
         fout = os.path.join(args['wd'], block_id + '_merged.vcf')
         merged_outputs.append(fout)
         target_args["svs_out"] = fout
-        job_args2.append((target_args, file_targets, seen_names, names, False))
+        job_args2.append((target_args, file_targets, seen_names, names, False, show_progress, block_id))

     if args['procs'] > 1:
         pool.starmap(process_file_list, job_args2)
@@ -616,6 +633,191 @@ def shard_data(args, input_files, Global):
             os.remove(item)


+def get_cosine_similarity(r_format_array, candidates, samp, target_keys):
+    res = []
+    for index, c in candidates:
+        fmt = c.samples[samp]
+        vals2 = np.array([fmt[k] if k in fmt and fmt[k] is not None else 0 for k in target_keys]) + 1e-4
+        cs = 1 - abs(cosine(r_format_array, vals2))
+        res.append((index, c, cs))
+    return res
+
+
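
get_cosine_similarity scores each candidate by the angle between its FORMAT vector and the cohort record's; scipy's cosine() returns a distance, so similarity is one minus it. A worked example with made-up values:

    import numpy as np
    from scipy.spatial.distance import cosine

    r_vals = np.array([60.0, 12, 5, 3]) + 1e-4  # cohort record FORMAT values
    c_vals = np.array([58.0, 11, 6, 3]) + 1e-4  # one candidate's FORMAT values
    similarity = 1 - abs(cosine(r_vals, c_vals))
    print(round(similarity, 3))                 # close to 1.0 for near-identical records

The 1e-4 offset guards against all-zero vectors, for which cosine distance is undefined.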
+def find_similar_candidates(current_cohort_file, variant_table, samp):
+    tot = 0
+    G = nx.Graph()
+    cached = {}
+    cohort_index = 0
+    for r in current_cohort_file.fetch():
+        tot += 1
+        if r.chrom not in variant_table:
+            cohort_index += 1
+            continue
+        if not r.samples[samp]:
+            cohort_index += 1
+            continue
+        t_idx = variant_table[r.chrom].bisect_left(r)
+        if t_idx >= len(variant_table[r.chrom]):
+            t_idx = 0 if t_idx == 0 else t_idx - 1
+        while t_idx > 0 and variant_table[r.chrom][t_idx].pos > r.pos - 150:
+            t_idx -= 1
+        candidates = []
+        for i in range(t_idx, len(variant_table[r.chrom])):
+            vr = variant_table[r.chrom][i]
+            if abs(vr.pos - r.pos) > 250 and vr.pos > r.pos:
+                break
+            if vr.info["SVTYPE"] != r.info["SVTYPE"]:
+                continue
+            elif vr.info["SVTYPE"] == "TRA":
+                if vr.info["CHR2"] != r.info["CHR2"]:
+                    continue
+                if abs(vr.info["CHR2_POS"] - r.info["CHR2_POS"]) > 250:
+                    continue
+
+            candidates.append((i, vr))
+
+        if len(candidates):
+            # NMB is skipped for older versions of dysgu
+            numeric = {k: v if v is not None else 0 for k, v in r.samples[samp].items() if k != "GT" and k != "NMB"}
+            target_keys = numeric.keys()
+            r_format_array = np.array(list(numeric.values())) + 1e-4
+            candidates = get_cosine_similarity(r_format_array, candidates, samp, target_keys)
+            for index, c, cs_similarity in candidates:
+                cached[(index, r.chrom)] = c
+                G.add_edge(cohort_index, (index, r.chrom), weight=cs_similarity)
+
+        cohort_index += 1
+
+    # Find single best u out-edge and single best v out-edge
+    matching_edges_u = {}
+    matching_edges_v = {}
+    for u in G.nodes():
+        if isinstance(u, tuple):
+            continue
+        best = None
+        for v in G.neighbors(u):
+            if not isinstance(v, tuple):
+                continue
+            w = G[u][v]["weight"]
+            if not best or w > best[1]:
+                best = v, w
+        if best:
+            if best[0] not in matching_edges_v:
+                matching_edges_v[best[0]] = (u, best[1])
+            elif best[1] >= matching_edges_v[best[0]][1]:
+                del matching_edges_u[ matching_edges_v[best[0]][0] ]
+                matching_edges_v[best[0]] = (u, best[1])
+            else:
+                continue
+            if u not in matching_edges_u:
+                matching_edges_u[u] = best
+            elif best[1] >= matching_edges_u[u][1]:  # update u edge
+                del matching_edges_v[ matching_edges_u[u][0] ]
+                matching_edges_u[u] = best
+
+    matches = {}
+    for u, (v, w) in matching_edges_u.items():
+        if w != 1:
+            matches[u] = cached[v]
+
+    return tot, matches
+
+
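
The mutual-best-edge bookkeeping above is a greedy one-to-one matching: each cohort record keeps its best-scoring candidate unless a stronger claim displaces it. For comparison, networkx can compute an optimal weighted matching on the same bipartite graph (a sketch with toy nodes, not the patch's code):

    import networkx as nx

    G = nx.Graph()
    G.add_edge("cohort_0", ("cand", 3), weight=0.99)
    G.add_edge("cohort_1", ("cand", 3), weight=0.95)  # competes for the same candidate
    print(nx.max_weight_matching(G))  # e.g. {("cohort_0", ("cand", 3))}

The greedy version trades optimality for a single pass over the nodes, which is cheap even for large cohorts.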
"w", header=header) + if sample_index > 0: + try: + os.remove(f"{args['wd']}/merge_sample_idx_{sample_index - 1}.vcf") + except OSError: + pass + for r in samp_file.fetch(): + variant_table[r.chrom].add(r) + + n_updated = 0 + # First pass, find matching SVs + tot, matchings = find_similar_candidates(current_cohort_file, variant_table, samp) + + # Second pass update matching SVs + cohort_index = 0 + for r in current_cohort_file.fetch(): + if cohort_index not in matchings: + current_out_file.write(r) + cohort_index += 1 + continue + vr = matchings[cohort_index] + cohort_index += 1 + updated = False + ch_fmt = dict(r.samples[samp].items()) + for k, v in vr.samples[samp].items(): + if k not in ch_fmt or ch_fmt[k] is None: + continue + elif isinstance(v, float): + if int(v * 100) != int(ch_fmt[k] * 100): + ch_fmt[k] = v + updated = True + elif v != r.samples[samp][k]: + ch_fmt[k] = v + updated = True + if updated: + for k, v in ch_fmt.items(): + r.samples[samp][k] = v + probs = [] + for s in cohort_samples: + probs.append(r.samples[s]["PROB"]) + mean = sum(probs) / len(probs) + max_prob = max(probs) + r.info["MeanPROB"] = mean + r.info["MaxPROB"] = max_prob + n_updated += 1 + + current_out_file.write(r) + + logging.info(f"{sample_index + 1}/{len(samples)} - {samp}, updated: {n_updated}/{tot}") + current_out_file.close() + if sample_index != len(samples) - 1: + current_cohort_file = pysam.VariantFile(f"{args['wd']}/merge_sample_idx_{sample_index}.vcf") + sample_index += 1 + + return + + def view_file(args): t0 = time.time() @@ -630,22 +832,31 @@ def view_file(args): if not all(os.path.splitext(i)[1] == ".csv" for i in args["input_files"]): raise ValueError("All input files must have .csv extension") - args["metrics"] = False # only option supported so far - args["contigs"] = False + if args["cohort_update"]: + for opt, v in (("out_format", "vcf"), ("merge_within", "False"), ("merge_across", "True"), ("add_kind", "False"), ("separate", "False")): + if args[opt] != v: + raise ValueError(f"{opt}={v} fot supported with --cohort-update") + if args["wd"] is None: + raise ValueError("Need a working directory --wd") + update_cohort_only(args) - soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) - manager = multiprocessing.Manager() - Global = manager.Namespace() - Global.open_files = soft # we dont know how many file descriptors are already open by the user, assume all - Global.soft = soft - - if not args["wd"]: - seen_names, names_list = get_names_list(args["input_files"]) - process_file_list(args, args["input_files"], seen_names, names_list, log_messages=True) else: - shard_data(args, args["input_files"], Global=Global) - if args['clean']: - os.rmdir(args['wd']) + args["metrics"] = False # only option supported so far + args["contigs"] = False + + soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) + manager = multiprocessing.Manager() + Global = manager.Namespace() + Global.open_files = soft # we dont know how many file descriptors are already open by the user, assume all + Global.soft = soft + + if not args["wd"]: + seen_names, names_list = get_names_list(args["input_files"]) + process_file_list(args, args["input_files"], seen_names, names_list, log_messages=True) + else: + shard_data(args, args["input_files"], Global=Global, show_progress=args['progress']) + if args['clean']: + os.rmdir(args['wd']) logging.info("dysgu merge complete h:m:s, {}".format(str(datetime.timedelta(seconds=int(time.time() - t0))), time.time() - t0)) diff --git a/requirements.txt b/requirements.txt index 
diff --git a/requirements.txt b/requirements.txt
index fe93871..aaa1d69 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,9 +4,8 @@ click>=8.0
 numpy
 scipy
 pandas
-pysam==0.21.0
+pysam==0.22.0
 networkx>=2.4
 scikit-learn>=0.22
-edlib
 sortedcontainers
 lightgbm
diff --git a/setup.py b/setup.py
index 1e87c30..484e2a2 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@ def get_extra_args():
 extras = get_extra_args() + ["-Wno-sign-compare", "-Wno-unused-function",
                              "-Wno-unused-result", '-Wno-ignored-qualifiers',
-                             "-Wno-deprecated-declarations",
+                             "-Wno-deprecated-declarations", "-fpermissive"
                              ]

 ext_modules = list()
@@ -128,22 +128,23 @@ def get_extra_args():
 # Scikit-bio module
 ssw_extra_compile_args = ["-Wno-deprecated-declarations", '-std=c99', '-I.']
-# if icc or sysconfig.get_config_vars()['CC'] == 'icc':
-#     ssw_extra_compile_args.extend(['-qopenmp-simd', '-DSIMDE_ENABLE_OPENMP'])
-# elif not (clang or sysconfig.get_config_vars()['CC'] == 'clang'):
-#     ssw_extra_compile_args.extend(['-fopenmp-simd', '-DSIMDE_ENABLE_OPENMP'])
+
 ext_modules.append(Extension(f"dysgu.scikitbio._ssw_wrapper",
                              [f"dysgu/scikitbio/_ssw_wrapper.pyx", f"dysgu/scikitbio/ssw.c"],
                              include_dirs=[f"{root}/dysgu/scikitbio", numpy.get_include()],
                              extra_compile_args=ssw_extra_compile_args,
-                             # define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
                              language="c"))

+ext_modules.append(Extension(f"dysgu.edlib.edlib",
+                             [f"dysgu/edlib/edlib.pyx", f"dysgu/edlib/src/edlib.cpp"],
+                             include_dirs=[f"{root}/dysgu/edlib", numpy.get_include()],
+                             extra_compile_args=["-O3", "-std=c++11"],
+                             language="c++"))

 # Dysgu modules
 for item in ["sv2bam", "io_funcs", "graph", "coverage", "assembler", "call_component",
-             "map_set_utils", "cluster", "sv_category", "extra_metrics"]:  # "post_call_metrics",
+             "map_set_utils", "cluster", "sv_category", "extra_metrics"]:

     ext_modules.append(Extension(f"dysgu.{item}",
                                  [f"dysgu/{item}.pyx"],
@@ -164,8 +165,8 @@ def get_extra_args():
     url="https://github.com/kcleal/dysgu",
     description="Structural variant calling",
     license="MIT",
-    version='1.6.2',
-    python_requires='>=3.7',
+    version='1.6.3',
+    python_requires='>=3.10',
     install_requires=[  # runtime requires
         'setuptools>=63.0',
         'cython',
@@ -173,12 +174,11 @@ def get_extra_args():
         'numpy>=1.18',
         'scipy',
         'pandas',
-        'pysam==0.21.0',
+        'pysam==0.22.0',
         'networkx>=2.4',
         'scikit-learn>=0.22',
         'sortedcontainers',
         'lightgbm',
-        'edlib',
     ],
     setup_requires=[
         'setuptools>=63.0',
@@ -187,14 +187,13 @@ def get_extra_args():
         'numpy>=1.18',
         'scipy',
         'pandas',
-        'pysam==0.21.0',
+        'pysam==0.22.0',
         'networkx>=2.4',
         'scikit-learn>=0.22',
         'sortedcontainers',
         'lightgbm',
-        'edlib'
     ],
-    packages=["dysgu", "dysgu.tests", "dysgu.scikitbio"],
+    packages=["dysgu", "dysgu.tests", "dysgu.scikitbio", "dysgu.edlib"],
     ext_modules=cythonize(ext_modules),
     include_package_data=True,
     zip_safe=False,