Skip to content

Commit

Permalink
elPrep 4.0.0 release.
Browse files Browse the repository at this point in the history
- added base quality score recalibration (BQSR)
- added optical duplicate marking
- added metrics (MultiQC compatible)
- support for SAM File Format version 1.6
- support for FASTA and VCF files
- support for BAM/BGZF files
- support for elPrep-specific elsites and elfasta formats
- split/filter/merge (sfm) mode now implemented in Go instead of Python
- added --log-path option to all tools
- various API and performance improvements
- changed license to the GNU Affero General Public License version 3 as published by the Free Software Foundation, with Additional Terms
- updated demos
  • Loading branch information
caherzee committed Oct 18, 2018
1 parent 69b7d56 commit 5937c9b
Show file tree
Hide file tree
Showing 67 changed files with 11,186 additions and 2,449 deletions.
731 changes: 705 additions & 26 deletions LICENSE.txt

Large diffs are not rendered by default.

430 changes: 212 additions & 218 deletions README.md

Large diffs are not rendered by default.

20 changes: 19 additions & 1 deletion bed/bed-files.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
// elPrep: a high-performance tool for preparing SAM/BAM files.
// Copyright (c) 2017, 2018 imec vzw.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version, and Additional Terms
// (see below).

// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public
// License and Additional Terms along with this program. If not, see
// <https://github.com/ExaScience/elprep/blob/master/LICENSE.txt>.

package bed

import (
Expand All @@ -7,7 +25,7 @@ import (
"strconv"
"strings"

"github.com/exascience/elprep/utils"
"github.com/exascience/elprep/v4/utils"
)

// Helper function for parsing a track line field.
Expand Down
20 changes: 19 additions & 1 deletion bed/bed-types.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,29 @@
// elPrep: a high-performance tool for preparing SAM/BAM files.
// Copyright (c) 2017, 2018 imec vzw.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version, and Additional Terms
// (see below).

// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public
// License and Additional Terms along with this program. If not, see
// <https://github.com/ExaScience/elprep/blob/master/LICENSE.txt>.

package bed

import (
"fmt"
"sort"
"strconv"

"github.com/exascience/elprep/utils"
"github.com/exascience/elprep/v4/utils"
)

// Bed is a struct for representing the contents of a BED file. See
Expand Down
140 changes: 140 additions & 0 deletions cmd/convert.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// elPrep: a high-performance tool for preparing SAM/BAM files.
// Copyright (c) 2017, 2018 imec vzw.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version, and Additional Terms
// (see below).

// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public
// License and Additional Terms along with this program. If not, see
// <https://github.com/ExaScience/elprep/blob/master/LICENSE.txt>.

package cmd

import (
"flag"
"fmt"
"os"

"github.com/exascience/elprep/v4/fasta"
"github.com/exascience/elprep/v4/intervals"
)

// VcfToElsitesHelp is the help string for this command. It starts
// with an empty line, matching BedToElsitesHelp, so that it stays
// visually separated from whatever is printed before it.
const VcfToElsitesHelp = "\nvcf-to-elsites parameters:\n" +
	"elprep vcf-to-elsites vcf-file elsites-file\n" +
	"[--log-path path]\n"

// VcfToElsites implements the elprep vcf-to-elsites command.
//
// The VCF input file is taken from os.Args[2] and the elsites output
// file from os.Args[3]; any further arguments are parsed as optional
// flags (currently only --log-path). If fewer than four arguments are
// given, or the optional flags cannot be parsed, VcfToElsitesHelp is
// printed to stderr and the process exits. Errors from reading the
// VCF file or from writing the elsites file are returned.
func VcfToElsites() error {
	var logPath string

	var flags flag.FlagSet
	flags.StringVar(&logPath, "log-path", "", "write log files to the specified directory")

	if len(os.Args) < 4 {
		fmt.Fprintln(os.Stderr, "Incorrect number of parameters.")
		fmt.Fprint(os.Stderr, VcfToElsitesHelp)
		os.Exit(1)
	}

	// The flag set used to be declared but never parsed, which meant
	// --log-path was silently ignored. Parse everything that follows
	// the two positional file names.
	if err := flags.Parse(os.Args[4:]); err != nil {
		fmt.Fprint(os.Stderr, VcfToElsitesHelp)
		if err == flag.ErrHelp {
			// Asking for help is not a failure.
			os.Exit(0)
		}
		os.Exit(1)
	}

	// getFilename is a package helper defined elsewhere.
	// NOTE(review): presumably it validates the argument and reports
	// problems using the given help text - confirm.
	input := getFilename(os.Args[2], VcfToElsitesHelp)
	output := getFilename(os.Args[3], VcfToElsitesHelp)

	setLogOutput(logPath)

	inter, err := intervals.FromVcfFile(input)
	if err != nil {
		return err
	}

	// Sort each entry's intervals by start and replace them with the
	// flattened result before writing them out.
	for chrom, ivals := range inter {
		intervals.ParallelSortByStart(ivals)
		inter[chrom] = intervals.ParallelFlatten(ivals)
	}

	return intervals.ToElsitesFile(inter, output)
}

// BedToElsitesHelp is the usage text shown for the bed-to-elsites
// command. It begins with an empty line.
const BedToElsitesHelp = `
bed-to-elsites parameters:
elprep bed-to-elsites bed-file elsites-file
[--log-path path]
`

// BedToElsites implements the elprep bed-to-elsites command.
//
// The BED input file is read from os.Args[2] and the elsites output
// file from os.Args[3]; --log-path is registered as an optional flag
// and handed to parseFlags together with the expected argument count.
// Errors from reading the BED file or from writing the elsites file
// are returned.
func BedToElsites() error {
	var (
		logPath string
		flags   flag.FlagSet
	)
	flags.StringVar(&logPath, "log-path", "", "write log files to the specified directory")
	// parseFlags is a package helper defined elsewhere.
	// NOTE(review): presumably it checks the argument count and parses
	// the optional flags - confirm.
	parseFlags(flags, 4, BedToElsitesHelp)

	bedFile := getFilename(os.Args[2], BedToElsitesHelp)
	elsitesFile := getFilename(os.Args[3], BedToElsitesHelp)

	setLogOutput(logPath)

	sites, err := intervals.FromBedFile(bedFile)
	if err != nil {
		return err
	}

	// Sort every entry's intervals by start, then store the flattened
	// result back under the same key.
	for chrom := range sites {
		perChrom := sites[chrom]
		intervals.ParallelSortByStart(perChrom)
		sites[chrom] = intervals.ParallelFlatten(perChrom)
	}

	return intervals.ToElsitesFile(sites, elsitesFile)
}

// FastaToElfastaHelp is the help string for this command. It starts
// with an empty line, matching BedToElsitesHelp, so that it stays
// visually separated from whatever is printed before it.
const FastaToElfastaHelp = "\nfasta-to-elfasta parameters:\n" +
	"elprep fasta-to-elfasta fasta-file elfasta-file\n" +
	"[--log-path path]\n"

// FastaToElfasta implements the elprep fasta-to-elfasta command.
//
// The FASTA input file is taken from os.Args[2] and the elfasta
// output file from os.Args[3]; any further arguments are parsed as
// optional flags (currently only --log-path). If fewer than four
// arguments are given, or the optional flags cannot be parsed,
// FastaToElfastaHelp is printed to stderr and the process exits.
// Errors from parsing the FASTA file or from writing the elfasta file
// are returned.
func FastaToElfasta() error {
	var logPath string

	var flags flag.FlagSet
	flags.StringVar(&logPath, "log-path", "", "write log files to the specified directory")

	if len(os.Args) < 4 {
		fmt.Fprintln(os.Stderr, "Incorrect number of parameters.")
		// The help text already ends in a newline, so use Fprint
		// rather than Fprintln to avoid a stray blank line.
		fmt.Fprint(os.Stderr, FastaToElfastaHelp)
		os.Exit(1)
	}

	// The flag set used to be declared but never parsed, which meant
	// --log-path was silently ignored. Parse everything that follows
	// the two positional file names.
	if err := flags.Parse(os.Args[4:]); err != nil {
		fmt.Fprint(os.Stderr, FastaToElfastaHelp)
		if err == flag.ErrHelp {
			// Asking for help is not a failure.
			os.Exit(0)
		}
		os.Exit(1)
	}

	// getFilename is a package helper defined elsewhere.
	// NOTE(review): presumably it validates the argument and reports
	// problems using the given help text - confirm.
	input := getFilename(os.Args[2], FastaToElfastaHelp)
	output := getFilename(os.Args[3], FastaToElfastaHelp)

	setLogOutput(logPath)

	// NOTE(review): the meaning of the nil and the two false arguments
	// is defined by fasta.ParseFasta and is not visible here - confirm
	// against package fasta.
	fst, err := fasta.ParseFasta(input, nil, false, false)
	if err != nil {
		return err
	}

	return fasta.ToElfasta(fst, output)
}
77 changes: 30 additions & 47 deletions cmd/deprecated-filter.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
// elPrep: a high-performance tool for preparing SAM/BAM files.
// Copyright (c) 2017, 2018 imec vzw.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version, and Additional Terms
// (see below).

// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public
// License and Additional Terms along with this program. If not, see
// <https://github.com/ExaScience/elprep/blob/master/LICENSE.txt>.

package cmd

import (
"bytes"
"fmt"
"log"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"

"github.com/exascience/elprep/filters"
"github.com/exascience/elprep/sam"
"github.com/exascience/elprep/utils"
"github.com/exascience/elprep/v4/filters"
"github.com/exascience/elprep/v4/sam"
"github.com/exascience/elprep/v4/utils"
)

type commandLine []string
Expand Down Expand Up @@ -43,7 +60,7 @@ func appendIf(slice []sam.Filter, filters ...sam.Filter) []sam.Filter {
}

// DeprecatedFilterHelp is the help string for this command.
const DeprecatedFilterHelp = "Filter parameters: (deprecated, please use the filter command instead)\n" +
const DeprecatedFilterHelp = "\nFilter parameters: (deprecated, please use the filter command instead)\n" +
"elprep sam-file sam-output-file\n" +
"[--replace-reference-sequences sam-file]\n" +
"[--filter-unmapped-reads [strict]]\n" +
Expand All @@ -52,29 +69,27 @@ const DeprecatedFilterHelp = "Filter parameters: (deprecated, please use the fil
"[--sorting-order [keep | unknown | unsorted | queryname | coordinate]]\n" +
"[--clean-sam]\n" +
"[--nr-of-threads nr]\n" +
"[--timed]\n" +
"[--reference-t fai-file]\n" +
"[--reference-T fasta-file]\n"
"[--timed]\n"

// DeprecatedFilter parses the command line for elprep filter in the
// style of previous elprep versions (1.x and 2.x), for backwards
// compatibility. This command is deprecated and will be removed at a
// later stage.
func DeprecatedFilter() error {
setLogOutput()
setLogOutput("")
log.Println("Warning: Calling elprep without a command to invoke the filter functionality is depecratead. Please use the filter command instead.")
cmdLine := commandLine(os.Args[1:])
sortingOrder := sam.Keep
nrOfThreads := 0
var markDuplicates, markDuplicatesDeterministic, timed bool
var markDuplicates, timed bool
var replaceRefSeqDictFilter,
removeUnmappedReadsFilter,
replaceReadGroupFilter,
markDuplicatesFilter,
removeDuplicatesFilter,
cleanSamFilter,
renameChromosomesFilter sam.Filter
var refSeqDict, filterUnmappedArg, readGroupString, referenceFai, referenceFasta, profile string
var refSeqDict, filterUnmappedArg, readGroupString, profile string
var filenames []string
for entry, found := cmdLine.pop(); found; entry, found = cmdLine.pop() {
switch entry {
Expand Down Expand Up @@ -110,12 +125,11 @@ func DeprecatedFilter() error {
removeDuplicatesFilter = filters.RemoveDuplicateReads
} else if next == "deterministic" {
cmdLine.pop()
markDuplicatesDeterministic = true
} else {
break
}
}
markDuplicatesFilter = filters.MarkDuplicates(markDuplicatesDeterministic)
markDuplicatesFilter, _, _ = filters.MarkDuplicates(false)
case "--sorting-order":
if so := cmdLine.peek(); (so == "") || strings.Contains(so, "--") {
sortingOrder = sam.Keep
Expand Down Expand Up @@ -161,22 +175,6 @@ func DeprecatedFilter() error {
timed = true
case "--profile":
profile, _ = cmdLine.pop()
case "--reference-t":
if ref := cmdLine.peek(); (ref == "") || strings.Contains(ref, "--") {
log.Println("Please provide reference file with --reference-t.")
fmt.Fprint(os.Stderr, DeprecatedFilterHelp)
os.Exit(1)
} else {
referenceFai, _ = cmdLine.pop()
}
case "--reference-T":
if ref := cmdLine.peek(); (ref == "") || strings.Contains(ref, "--") {
log.Println("Please provide reference file with --reference-T.")
fmt.Fprint(os.Stderr, DeprecatedFilterHelp)
os.Exit(1)
} else {
referenceFasta, _ = cmdLine.pop()
}
case "--rename-chromosomes":
renameChromosomesFilter = filters.RenameChromosomes
case "--split-file":
Expand All @@ -193,19 +191,8 @@ func DeprecatedFilter() error {
if (replaceRefSeqDictFilter != nil) && (sortingOrder == sam.Keep) {
log.Print("Warning: Requesting to keep the order of the input file while replacing the reference sequence dictionary may force an additional sorting phase to ensure the original sorting order is respected.")
}
if (filepath.Ext(filenames[1]) == sam.CramExt) && (referenceFai == "") && (referenceFasta == "") {
log.Println("Error: Attempting to output to cram without specifying a reference file. Please add --reference-t or --reference-T to your call.")
fmt.Fprint(os.Stderr, DeprecatedFilterHelp)
os.Exit(1)
}
var s bytes.Buffer
fmt.Fprint(&s, os.Args[0], " filter ", filenames[0], " ", filenames[1])
if referenceFai != "" {
fmt.Fprint(&s, " --reference-t ", referenceFai)
}
if referenceFasta != "" {
fmt.Fprint(&s, " --reference-T ", referenceFasta)
}
if removeUnmappedReadsFilter != nil {
switch filterUnmappedArg {
case "strict":
Expand All @@ -229,11 +216,7 @@ func DeprecatedFilter() error {
fmt.Fprint(&s, " --replace-read-group ", readGroupString)
}
if markDuplicatesFilter != nil {
if markDuplicatesDeterministic {
fmt.Fprint(&s, " --mark-duplicates-deterministic")
} else {
fmt.Fprint(&s, " --mark-duplicates")
}
fmt.Fprint(&s, " --mark-duplicates")
if removeDuplicatesFilter != nil {
fmt.Fprint(&s, " --remove-duplicates")
}
Expand Down Expand Up @@ -269,7 +252,7 @@ func DeprecatedFilter() error {
log.Println("Executing command:\n", cmdString)
if markDuplicates || (sortingOrder == sam.Coordinate) || (sortingOrder == sam.Queryname) ||
((replaceRefSeqDictFilter != nil) && (sortingOrder == sam.Keep)) {
return runBestPracticesPipelineIntermediateSam(filenames[0], filenames[1], referenceFai, referenceFasta, sortingOrder, filters1, filters2, timed, profile)
return runBestPracticesPipelineIntermediateSam(filenames[0], filenames[1], sortingOrder, filters1, filters2, nil, false, timed, profile)
}
return runBestPracticesPipeline(filenames[0], filenames[1], referenceFai, referenceFasta, sortingOrder, filters1, timed, profile)
return runBestPracticesPipeline(filenames[0], filenames[1], sortingOrder, filters1, timed, profile)
}
Loading

0 comments on commit 5937c9b

Please sign in to comment.