From 8033043e0dda00b8df82f23ca5d291289a1330e1 Mon Sep 17 00:00:00 2001 From: "Marcus R. Breese" Date: Fri, 13 Dec 2024 12:01:12 -0500 Subject: [PATCH] bed-combine => bed-merge, added option for not splitting. This will be much faster than bed-reduce for sorted BED files, and can operate in a similar manner (except that score isn't used, but this could be added easily to the internal class MultinameBedRecord). --- src/java/io/compgen/ngsutils/NGSUtils.java | 4 +- .../bed/{BedCombine.java => BedMerge.java} | 39 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) rename src/java/io/compgen/ngsutils/cli/bed/{BedCombine.java => BedMerge.java} (93%) diff --git a/src/java/io/compgen/ngsutils/NGSUtils.java b/src/java/io/compgen/ngsutils/NGSUtils.java index 8ee01ce..4adb99c 100644 --- a/src/java/io/compgen/ngsutils/NGSUtils.java +++ b/src/java/io/compgen/ngsutils/NGSUtils.java @@ -44,7 +44,7 @@ import io.compgen.ngsutils.cli.bam.BinCount; import io.compgen.ngsutils.cli.bam.PileupCli; import io.compgen.ngsutils.cli.bed.BedCleanScore; -import io.compgen.ngsutils.cli.bed.BedCombine; +import io.compgen.ngsutils.cli.bed.BedMerge; import io.compgen.ngsutils.cli.bed.BedCount; import io.compgen.ngsutils.cli.bed.BedNearest; import io.compgen.ngsutils.cli.bed.BedPEToBed; @@ -232,7 +232,7 @@ public static void main(String[] args) { .addCommand(BamToBedPE.class) .addCommand(BedPEToBed.class) .addCommand(BamClean.class) - .addCommand(BedCombine.class) + .addCommand(BedMerge.class) .addCommand(VCFPeptide.class) .addCommand(FastaGrep.class) .addCommand(BamToFasta.class) diff --git a/src/java/io/compgen/ngsutils/cli/bed/BedCombine.java b/src/java/io/compgen/ngsutils/cli/bed/BedMerge.java similarity index 93% rename from src/java/io/compgen/ngsutils/cli/bed/BedCombine.java rename to src/java/io/compgen/ngsutils/cli/bed/BedMerge.java index 5303a46..ff07e54 100644 --- a/src/java/io/compgen/ngsutils/cli/bed/BedCombine.java +++ 
b/src/java/io/compgen/ngsutils/cli/bed/BedMerge.java @@ -27,8 +27,8 @@ import io.compgen.ngsutils.support.BufferedIteratorImpl; import io.compgen.ngsutils.support.ListSortedBufferedIterator; -@Command(name="bed-combine", desc="Given two or more (sorted) BED(3/6) files, combine the BED annotations into one output BED file. This will produce non-overlapping regions (unlike bed-reduce). All other columns (and score) are ignored.", category="bed", experimental=true) -public class BedCombine extends AbstractOutputCommand { +@Command(name="bed-merge", desc="Given two or more (sorted) BED(3/6) files, combine the BED annotations into one output BED file. This can produce non-overlapping regions (--split) or unions (like bed-reduce). All other columns (and score) are ignored. Because this requires sorted inputs, it is more effcient than bed-reduce.", category="bed", experimental=true) +public class BedMerge extends AbstractOutputCommand { public class MultinameIterator implements Iterator { private Iterator it; @@ -46,7 +46,6 @@ public MultinameBedRecord next() { BedRecord rec = it.next(); return new MultinameBedRecord(rec); } - } public class MultinameBedRecord implements Comparable{ @@ -114,6 +113,7 @@ public MultinameBedRecord clone(GenomeSpan newcoord) { private boolean ignoreStrand = false; private boolean single = false; + private boolean split = false; @Option(name="single", desc="Use only one annotation for each position (prioritzed based on arg order)") public void setSingle(boolean val) { @@ -156,6 +156,11 @@ public void setIgnoreStrand(boolean val) { this.ignoreStrand = val; } + @Option(name="split", desc="Split overlapping regions") + public void setSplit(boolean val) { + this.split = val; + } + @Exec public void exec() throws IOException, CommandArgumentException { if (bedFilenames == null) { @@ -207,12 +212,12 @@ public void exec() throws IOException, CommandArgumentException { } } - } - + } protected void processBedFiles(Strand strand, FAIFile fai) throws 
IOException { processBedFiles(strand, fai, null, null); } + protected void processBedFiles(Strand strand, FAIFile fai, Iterator firstIterator, BedRecord firstRecord) throws IOException { List> srcBeds = new ArrayList>(); ListSortedBufferedIterator bufList = new ListSortedBufferedIterator(); @@ -353,16 +358,36 @@ protected void processBedFiles(Strand strand, FAIFile fai, Iterator f for (String name: second.getNames()) { if (!first.getNames().contains(name)) { namematch = false; + break; } } - if (namematch) { + for (String name: first.getNames()) { + if (!second.getNames().contains(name)) { + namematch = false; + break; + } + } + } + + if (namematch || !split) { // // A |-------| // A |-----| // + // *or* + // + // A |-------| + // B |-----| + // B |-----| + // + // and not splitting... + // MultinameBedRecord merged = first.clone(new GenomeSpan(curRef, Math.min(start1, start2), Math.max(end1, end2), strand)); + if (!split && !namematch) { + merged.addName(second.getNames()); + } first = merged; } else if (start2 > start1 && start2 < end1 && end2 > end1) { @@ -423,7 +448,7 @@ protected void processBedFiles(Strand strand, FAIFile fai, Iterator f write(left); first = merged; - } else if (start2 == start1 && start2 < end1 && end2 > end1) { + } else if (start2 == start1 && end2 > end1) { // // A |-------| // B |-----------|