Skip to content

Commit

Permalink
There was an issue when the --replace-reference-sequences option was …
Browse files Browse the repository at this point in the history
…used with the elprep sfm command, which under certain circumstances may have resulted in improperly sorted results. This has been fixed in elprep version 4.1.1. Thanks a lot to Geert Vandeweyer for reporting this issue.

Additional fixes:
- The --replace-reference-sequences option incorrectly removed unmapped reads. This has been fixed in version 4.1.1 and does not happen anymore.
- The --contig-group-size option has been deprecated for the elprep merge command. The elprep sfm and elprep split command still support it. The elprep split command now works correctly without knowing the --contig-group-size that was passed to the corresponding elprep split command.
- The BGZFWriter panicked when a Write happens after a Close because of sending data to a closed Go channel. This has been changed into a proper error return value.
  • Loading branch information
Pascal Costanza committed Jan 28, 2019
1 parent a88f504 commit e020756
Show file tree
Hide file tree
Showing 8 changed files with 308 additions and 182 deletions.
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ Use this command option to indicate the sfm command is processing single-end dat

### --contig-group-size number

This command option is passed to both the split and merge tools.
This command option is passed to the split tool.

The elprep split command groups the sequence dictionary entries for deciding how to split up the input data. The goal is to end up with groups of sequence dictionary entries (contigs) for which the total length (sum of LN tags) is roughly the same among all groups. By default, the elprep split command identifies the sequence dictionary entry with the largest length (LN) and chooses this as a target size for the groups.

Expand Down Expand Up @@ -490,8 +490,6 @@ Sets the path for writing a log file.

The elprep split command groups the sequence dictionary entries for deciding how to split up the input data. The --contig-group-size options allows configuring a specific group size. See the description of --contig-group-size for the elprep sfm command for more details.

The --contig-group-size parameter passed to the elprep merge command must be exactly the same as the one passed to the elprep split commend. The elprep sfm command ensures that this is the case.

## Name

### elprep merge - a commandline tool for merging .sam/.bam files created by elprep split
Expand Down Expand Up @@ -522,9 +520,7 @@ Sets the path for writing a log file.

### --contig-group-size number

The elprep split command groups the sequence dictionary entries for deciding how to split up the input data. The --contig-group-size options allows configuring a specific group size. See the description of --contig-group-size for the elprep sfm command for more details.

The --contig-group-size parameter passed to the elprep merge command must be exactly the same as the one passed to the elprep split commend. The elprep sfm command ensures that this is the case.
The --contig-group-size parameter for the elprep merge command is deprecated since version 4.1.1. The elprep merge command now correctly processes the split files without that information.

# Extending elPrep

Expand Down
17 changes: 10 additions & 7 deletions cmd/merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ const MergeHelp = "\nmerge parameters:\n" +
"[--single-end]\n" +
"[--nr-of-threads n]\n" +
"[--timed]\n" +
"[--log-path path]\n" +
"[--contig-group-size nr]\n"
"[--log-path path]\n"

// Merge implements the elprep merge command.
func Merge() error {
Expand All @@ -52,7 +51,7 @@ func Merge() error {

var flags flag.FlagSet

flags.IntVar(&contigGroupSize, "contig-group-size", 0, "maximum sum of reference sequence lengths for creating groups of reference sequences")
flags.IntVar(&contigGroupSize, "contig-group-size", -1, "maximum sum of reference sequence lengths for creating groups of reference sequences (deprecated)")
flags.BoolVar(&singleEnd, "single-end", false, "when splitting single-end data")
flags.IntVar(&nrOfThreads, "nr-of-threads", 0, "number of worker threads")
flags.BoolVar(&timed, "timed", false, "measure the runtime")
Expand Down Expand Up @@ -104,6 +103,10 @@ func Merge() error {
os.Exit(1)
}

if contigGroupSize != -1 {
fmt.Fprintln(os.Stderr, "Use of the --contig-group-size option in the elprep merge command is deprecated.")
}

firstFile, err := sam.Open(filepath.Join(fullInputPath, filesToMerge[0]))
if err != nil {
return err
Expand Down Expand Up @@ -164,12 +167,12 @@ func Merge() error {
case sam.Coordinate:
if singleEnd {
err := timedRun(timed, profile, "Merging single-end split files sorted by coordinate.", 1, func() (err error) {
return sam.MergeSingleEndFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, contigGroupSize)
return sam.MergeSingleEndFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, -1)
})
return err
}
err := timedRun(timed, profile, "Merging paired-end split files sorted by coordinate.", 1, func() (err error) {
return sam.MergeSortedFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, contigGroupSize)
return sam.MergeSortedFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, -1)
})
return err
case sam.Queryname:
Expand All @@ -178,12 +181,12 @@ func Merge() error {
default:
if singleEnd {
err := timedRun(timed, profile, "Merging unsorted single-end split files.", 1, func() (err error) {
return sam.MergeSingleEndFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, contigGroupSize)
return sam.MergeSingleEndFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, -1)
})
return err
}
err := timedRun(timed, profile, "Merging unsorted paired-end split files.", 1, func() (err error) {
return sam.MergeUnsortedFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, contigGroupSize)
return sam.MergeUnsortedFilesSplitPerChromosome(fullInputPath, output, inputPrefix, inputExtension, header, -1)
})
return err
}
Expand Down
1 change: 0 additions & 1 deletion cmd/sfm.go
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,6 @@ func Sfm() error {
if contigGroupSize > 0 {
contigGroupSizeString := strconv.FormatInt(int64(contigGroupSize), 10)
splitArgs = append(splitArgs, "--contig-group-size", contigGroupSizeString)
mergeArgs = append(mergeArgs, "--contig-group-size", contigGroupSizeString)
}

if singleEnd {
Expand Down
2 changes: 1 addition & 1 deletion cmd/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ const (
// ProgramName is "elprep"
ProgramName = "elprep"
// ProgramVersion is the version of the elprep binary
ProgramVersion = "4.1.0"
ProgramVersion = "4.1.1"
// ProgramURL is the repository for the elprep source code
ProgramURL = "http://github.com/exascience/elprep"
)
Expand Down
2 changes: 1 addition & 1 deletion demo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ We recommend using NA12878.exome.hg19.chr22.bam for quick testing and checking t

Copy the elPrep binary on your machine add it to your path. For example, fill in your username and execute:

export PATH=$PATH:/home/username/tools/elprep-v4.1.0:
export PATH=$PATH:/home/username/tools/elprep-v4.1.1:


## Demo 1: a workload for testing installation (NA12878.exome.hg19.chr22.bam)
Expand Down
1 change: 1 addition & 0 deletions filters/simple-filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func ReplaceReferenceSequenceDictionary(dict []utils.StringMap) sam.Filter {
}
}
dictTable := make(map[string]bool)
dictTable["*"] = true
for _, entry := range dict {
dictTable[entry["SN"]] = true
}
Expand Down
19 changes: 17 additions & 2 deletions sam/bgzf-files.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"context"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"sync"
Expand Down Expand Up @@ -356,10 +357,22 @@ func NewBGZFWriter(w io.Writer) *BGZFWriter {
return bgzf
}

func (bgzf *BGZFWriter) sendBlock() (err error) {
defer func() {
if x := recover(); x != nil {
err = errors.New(fmt.Sprint(x))
}
}()
bgzf.channel <- bgzf.block
return nil
}

// Close closes this BGZFWriter.
func (bgzf *BGZFWriter) Close() error {
if bgzf.block != nil && len(bgzf.block.bytes) > 0 {
bgzf.channel <- bgzf.block
if err := bgzf.sendBlock(); err != nil {
return err
}
}
close(bgzf.channel)
bgzf.wait.Wait()
Expand All @@ -380,7 +393,9 @@ func (bgzf *BGZFWriter) Write(p []byte) (n int, err error) {
bgzf.block.bytes = bgzf.block.bytes[:maxBgzfBlockSize]
k := copy(bgzf.block.bytes[blockIndex:], p)
p = p[k:]
bgzf.channel <- bgzf.block
if err := bgzf.sendBlock(); err != nil {
return n - len(p), err
}
bgzf.block = bytesPool.Get().(*bytesBlock)
} else {
bgzf.block.bytes = bgzf.block.bytes[:newBlockLength]
Expand Down
Loading

0 comments on commit e020756

Please sign in to comment.