Skip to content

Commit

Permalink
Add a new interval list scatter mode (#1786)
Browse files Browse the repository at this point in the history
* Add a new interval list scatter mode to avoid issue of giant final list
in large joint genotyping scatters
  • Loading branch information
ldgauthier committed Apr 7, 2022
1 parent b1e01c2 commit 9eafe4e
Show file tree
Hide file tree
Showing 153 changed files with 1,349 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ public enum IntervalListScatterMode implements CommandLineParser.ClpEnum {
* rather than the base level, for example CNV calling.
*/
INTERVAL_COUNT(IntervalListScattererByIntervalCount::new, "Scatter the interval list into similarly sized interval lists " +
"(by interval count, not by base count). " +
"Resulting interval lists will contain the same number of intervals except for the last, which contains the remainder."),

/**
* A scatter by interval **count** which attempts to fill each resulting interval list with approximately equal numbers
* of intervals, disregarding the base count. This approach distributes the remainder intervals across the initial interval lists.
* This is the preferred mode for whole genome joint calling and other scenarios where the interval list to be split
* contains a large number of intervals that should be distributed equally by count.
*/
INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER(IntervalListScattererByIntervalCountWithDistributedRemainder::new, "Scatter the interval list into similarly sized interval lists " +
"(by interval count, not by base count). " +
"Resulting interval lists will contain similar number of intervals.");

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package picard.util.IntervalList;

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;

import java.util.Arrays;
import java.util.List;

/**
* Scatters an {@link IntervalList} by interval count into shards so that the resulting {@link IntervalList}s have
* approximately the same number of intervals in them. The "remainder" intervals are distributed over the initial lists.
*/
public class IntervalListScattererByIntervalCountWithDistributedRemainder extends IntervalListScattererByIntervalCount {

    /**
     * Places {@code interval}, whole and unsplit, into one of the two slots of the returned pair.
     *
     * @param interval               the interval to place
     * @param idealSplitWeight       not consulted by this implementation
     * @param currentSize            size of the list currently being filled, as supplied by the caller
     * @param projectSizeOfRemaining projected size of the remaining lists, as supplied by the caller
     * @return a fixed-size, two-element list: {@code [interval, null]} when {@code projectSizeOfRemaining}
     *         is strictly greater than {@code currentSize}, otherwise {@code [null, interval]}
     */
    // NOTE(review): presumably slot 0 is the part kept in the current list and slot 1 the part
    // deferred to the next list -- confirm against the scatterer that consumes this pair.
    @Override
    public List<Interval> takeSome(final Interval interval, final long idealSplitWeight, final long currentSize, final double projectSizeOfRemaining) {
        final boolean useFirstSlot = projectSizeOfRemaining > currentSize;
        return useFirstSlot
                ? Arrays.asList(interval, null)
                : Arrays.asList(null, interval);
    }
}
2 changes: 1 addition & 1 deletion src/main/java/picard/util/IntervalListTools.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ protected int doWork() {

if (SCATTER_CONTENT != null) {
final long listSize = SUBDIVISION_MODE.make().listWeight(output);
SCATTER_COUNT = (int) listSize / SCATTER_CONTENT;
SCATTER_COUNT = (int)Math.round((double) listSize / SCATTER_CONTENT);
LOG.info(String.format("Using SCATTER_CONTENT = %d and an interval of size %d, attempting to scatter into %s intervals.", SCATTER_CONTENT, listSize, SCATTER_COUNT));
}

Expand Down
46 changes: 42 additions & 4 deletions src/test/java/picard/util/IntervalListScattererTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

import java.io.File;
import java.util.*;
import java.util.stream.Collectors;

/**
* Basic test for scatter functionality in IntervalListTools
Expand All @@ -43,6 +44,12 @@ public class IntervalListScattererTest {
private static final File INTERVAL_FILE = new File(TEST_DATA_DIR, "scatterable.interval_list");
private static final IntervalList LIST_TO_SCATTER = IntervalList.fromFile(INTERVAL_FILE);

// Input interval list shared by the larger scatter test cases.
static final File LARGER_INTERVAL_FILE = new File(TEST_DATA_DIR, "test.hg38.200.interval_list");

// Entries of the "largeScatters" test-data directory (presumably one sub-directory per shard -- confirm against testdata).
private static final List<File> LARGER_INTERVAL_EXPECTEDS = Arrays.asList(new File(TEST_DATA_DIR, "largeScatters").listFiles());
// Every file found under the sorted entries above, parsed as an interval list.
static final List<IntervalList> LARGER_EXPECTED_LISTS = LARGER_INTERVAL_EXPECTEDS.stream()
        .sorted()
        .flatMap(dir -> Arrays.stream(dir.listFiles()))
        .map(IntervalList::fromFile)
        .collect(Collectors.toList());

// Same layout as above, read from the "largeScattersNoRemainder" test-data directory.
private static final List<File> LARGER_INTERVAL_NO_REMAINDER_EXPECTEDS = Arrays.asList(new File(TEST_DATA_DIR, "largeScattersNoRemainder").listFiles());
static final List<IntervalList> LARGER_NO_REMAINDER_EXPECTED_LISTS = LARGER_INTERVAL_NO_REMAINDER_EXPECTEDS.stream()
        .sorted()
        .flatMap(dir -> Arrays.stream(dir.listFiles()))
        .map(IntervalList::fromFile)
        .collect(Collectors.toList());

private static final File INTERVAL_WITH_OVERFLOW_FILE = new File(TEST_DATA_DIR, "scatterable_with_overflow.interval_list");
private static final IntervalList LIST_TO_SCATTER_WITH_OVERFLOW = IntervalList.fromFile(INTERVAL_WITH_OVERFLOW_FILE);

Expand All @@ -64,7 +71,7 @@ public String toString() {
'}';
}

private Testcase(final File file, final int scatterWidth, final IntervalListScatterMode mode, final List<IntervalList> expectedScatter) {
Testcase(final File file, final int scatterWidth, final IntervalListScatterMode mode, final List<IntervalList> expectedScatter) {
this.source = IntervalList.fromFile(file);
this.file = file;
this.expectedScatter = expectedScatter;
Expand All @@ -73,8 +80,32 @@ private Testcase(final File file, final int scatterWidth, final IntervalListScat
}
}

@DataProvider
public static Iterator<Object[]> testScatterTestcases() {
/**
* These are split out separately and private because the treatment of remainders causes different behavior whether
* scatter count is specified (as simulated here) versus whether scatter *content* is specified, as in the IntervalListTools
* integration tests
* @return test cases with expected behavior for distributed remainder mode given a specified number of output lists
*/
private static List<Testcase> getRemainderTestcases() {
    final IntervalListScatterMode distributedRemainder = IntervalListScatterMode.INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER;

    // Same source list scattered two ways: into 60 lists and into 20 lists.
    // Wrapped in an ArrayList so the returned list stays mutable, as before.
    return new ArrayList<>(Arrays.asList(
            new Testcase(LARGER_INTERVAL_FILE, 60, distributedRemainder, LARGER_EXPECTED_LISTS),
            new Testcase(LARGER_INTERVAL_FILE, 20, distributedRemainder, LARGER_NO_REMAINDER_EXPECTED_LISTS)));
}

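/**
* These test cases, for modes other than INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER, have the same behavior
* whether scatter count or scatter content is specified.
*
* @return the test cases for modes other than INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER
*/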
public static List<Testcase> getScatterTestcases() {
final List<Testcase> testCases = new ArrayList<>();
Assert.assertEquals(LIST_TO_SCATTER.getUniqueBaseCount(), 200, "Wrong unique base count");
Assert.assertEquals(LIST_TO_SCATTER_MANY.getUniqueBaseCount(), 32 * 2, "Wrong unique base count");
Expand Down Expand Up @@ -393,7 +424,14 @@ public static Iterator<Object[]> testScatterTestcases() {
IntervalList.overlaps(LIST_TO_SCATTER_MANY, secondThird))
))))));

return testCases.stream().map(tc -> new Object[]{tc}).iterator();
return testCases;
}

/**
 * Data provider combining the test cases from {@link #getScatterTestcases()} with those from
 * {@code getRemainderTestcases()}, in that order.
 *
 * @return an iterator yielding one single-element {@code Object[]} per test case
 */
@DataProvider
public static Iterator<Object[]> testScatterTestcases() {
    final List<Testcase> combined = new ArrayList<>();
    combined.addAll(getScatterTestcases());
    combined.addAll(getRemainderTestcases());
    // Each test case becomes its own one-argument parameter row.
    return combined.stream()
            .map(testcase -> new Object[]{testcase})
            .iterator();
}

@Test(dataProvider = "testScatterTestcases")
Expand Down
33 changes: 30 additions & 3 deletions src/test/java/picard/util/IntervalListToolsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.util.IntervalList.IntervalListScatterMode;
import picard.util.IntervalList.IntervalListScatterer;

import java.io.File;
Expand All @@ -40,21 +41,25 @@
import java.util.List;
import java.util.Objects;
import java.util.Scanner;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* Created by farjoun on 10/22/17.
*/

public class IntervalListToolsTest extends CommandLineProgramTest {
private final File TEST_DATA_DIR = new File("testdata/picard/util/");
private static final File TEST_DATA_DIR = new File("testdata/picard/util/");
private final File scatterable = new File(TEST_DATA_DIR, "scatterable.interval_list");
private final File scatterableStdin = new File(TEST_DATA_DIR, "scatterable_stdin");
private final File secondInput = new File(TEST_DATA_DIR, "secondInput.interval_list");
private final File largeScatterable = new File(TEST_DATA_DIR, "large_scatterable.interval_list");
private final File abutting = new File(TEST_DATA_DIR, "abutting.interval_list");
private final File abutting_combined = new File(TEST_DATA_DIR, "abutting_combined.interval_list");
private final File abutting_notcombined = new File(TEST_DATA_DIR, "abutting_notcombined.interval_list");
// Entries of the "largeScattersWithRemainder" test-data directory (presumably one sub-directory per shard -- confirm against testdata).
private static final List<File> LARGER_EXPECTED_WITH_REMAINDER_FILES = Arrays.asList(new File(TEST_DATA_DIR, "largeScattersWithRemainder").listFiles());
// Every file found under the sorted entries above, parsed as an interval list.
private static final List<IntervalList> LARGER_EXPECTED_WITH_REMAINDER_LISTS = LARGER_EXPECTED_WITH_REMAINDER_FILES.stream()
        .sorted()
        .flatMap(dir -> Arrays.stream(dir.listFiles()))
        .map(IntervalList::fromFile)
        .collect(Collectors.toList());


@Test
public void testSecondInputValidation() {
Expand Down Expand Up @@ -293,10 +298,32 @@ private long testerCountOutput(IntervalListTools.Action action, IntervalListTool
return reader.nextLong();
}

/**
* These are split out separately and private because the treatment of remainders causes different behavior whether
* scatter count is specified (as simulated here) versus whether scatter *content* is specified, as in the IntervalListTools
* integration tests
* @return test cases with expected behavior for distributed remainder mode given a specified number of output lists
*/
private static List<IntervalListScattererTest.Testcase> getRemainderTestcases() {
    final IntervalListScatterMode distributedRemainder = IntervalListScatterMode.INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER;
    final File sourceList = IntervalListScattererTest.LARGER_INTERVAL_FILE;

    // Same source list with scatter widths 67 and 20.
    // Wrapped in an ArrayList so the returned list stays mutable, as before.
    return new ArrayList<>(Arrays.asList(
            new IntervalListScattererTest.Testcase(sourceList, 67, distributedRemainder, LARGER_EXPECTED_WITH_REMAINDER_LISTS),
            new IntervalListScattererTest.Testcase(sourceList, 20, distributedRemainder, IntervalListScattererTest.LARGER_NO_REMAINDER_EXPECTED_LISTS)));
}

// test scatter with different kinds of balancing.
@DataProvider
public static Iterator<Object[]> testScatterTestcases() {
return IntervalListScattererTest.testScatterTestcases();
final List<IntervalListScattererTest.Testcase> testcases = new ArrayList<>(IntervalListScattererTest.getScatterTestcases());
testcases.addAll(getRemainderTestcases());
return testcases.stream().map(tc -> new Object[]{tc}).iterator();
}

private final List<File> dirsToDelete = new ArrayList<>();
Expand Down Expand Up @@ -381,7 +408,7 @@ public void testScatterByContent(final IntervalListScattererTest.Testcase tc) th
args.add("OUTPUT=" + ilOutDir);
}

args.add("SCATTER_CONTENT=" + tc.mode.make().listWeight(tc.source) / tc.expectedScatter.size());
args.add("SCATTER_CONTENT=" + (int)Math.round((double)tc.mode.make().listWeight(tc.source) / tc.expectedScatter.size()));

Assert.assertEquals(runPicardCommandLine(args), 0);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 1 195878 + target_1
chr1 195879 391754 + target_2
chr1 391755 606302 + target_3
chr1 606303 820848 + target_4
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 820849 910849 + target_5
chr1 910850 1000848 + target_6
chr1 1000849 1112599 + target_7
chr1 1112600 1224349 + target_8
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 1224350 1348642 + target_9
chr1 1348643 1472934 + target_10
chr1 1472935 1584197 + target_11
chr1 1584198 1695459 + target_12
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 1695460 1815724 + target_13
chr1 1815725 1935987 + target_14
chr1 1935988 2030188 + target_15
chr1 2030189 2124387 + target_16
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 2124388 2261379 + target_17
chr1 2261380 2398369 + target_18
chr1 2398370 2662724 + target_19
chr1 2662725 2927077 + target_20
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 2927078 3044965 + target_21
chr1 3044966 3162852 + target_22
chr1 3162853 3261512 + target_23
chr1 3261513 3360170 + target_24
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 3360171 3481821 + target_25
chr1 3481822 3603470 + target_26
chr1 3603471 3720976 + target_27
chr1 3720977 3838480 + target_28
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr1 LN:248956422 M5:6aef897c3d6ff0c78aff06ac189178dd AS:38 UR:/seq/references/Homo_sapiens_assembly38/v0/Homo_sapiens_assembly38.fasta SP:Homo sapiens
@PG ID:1 CL:IntervalListTools --INPUT testdata/picard/util/test.hg38.200.interval_list --OUTPUT testdata/picard/util/largeScatters --SCATTER_COUNT 60 --SUBDIVISION_MODE INTERVAL_COUNT_WITH_DISTRIBUTED_REMAINDER --PADDING 0 --UNIQUE false --DONT_MERGE_ABUTTING false --SORT true --ACTION CONCAT --INCLUDE_FILTERED false --BREAK_BANDS_AT_MULTIPLES_OF 0 --INVERT false --OUTPUT_VALUE NONE --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --GA4GH_CLIENT_SECRETS client_secrets.json --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false PN:IntervalListTools
chr1 3838481 3931034 + target_29
chr1 3931035 4023586 + target_30
chr1 4023587 4132844 + target_31
chr1 4132845 4242101 + target_32
Loading

0 comments on commit 9eafe4e

Please sign in to comment.