Skip to content

Commit

Permalink
Merge branch 'region-based-filter'
Browse files Browse the repository at this point in the history
Conflicts:
	README.md
	sam/simple-filters.go
  • Loading branch information
caherzee committed Nov 21, 2017
2 parents 2caa0f0 + a3813ea commit 81eace9
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 28 deletions.
15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,12 @@ The order in which command options are passed is ignored. For optimal performanc

1. filter-unmapped-reads or filter-unmapped-reads-strict
2. filter-non-exact-mapping-reads or filter-non-exact-mapping-reads-strict
3. clean-sam
4. replace-reference-sequences
5. replace-read-group
6. mark-duplicates
7. remove-duplicates
3. filter-non-overlapping-reads
4. clean-sam
5. replace-reference-sequences
6. replace-read-group
7. mark-duplicates
8. remove-duplicates
9. remove-optional-fields
10. keep-optional-fields

Expand Down Expand Up @@ -246,6 +247,10 @@ Removes all alignments where the mapping is not an exact match with the referenc

Removes all alignments where the mapping is not an exact match with reference or not a unique match. This filters checks for each read that the following optional fields are present with the following values: X0=1 (unique mapping), X1=0 (no suboptimal hit), XM=0 (no mismatch), XO=0 (no gap opening), XG=0 (no gap extension).

### --filter-non-overlapping-reads bed-file

Removes all reads where the mapping positions do not overlap with any region specified in the bed file. Specifically, either the start or end of the read's mapping position must be contained in an interval, or the read is removed from the output.

### --replace-read-group read-group-string

This filter replaces or adds read groups to the alignments in the input file. This command option takes a single argument, a string of the form "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" where the names following ID:, PL:, PU:, etc. can be any user-chosen name conforming to the SAM specification. See SAM Format Specification Section 1.3 for details: The string passed here can be any string conforming to a header line for tag @RG, omitting the tag @RG itself, and using whitespace as separators for the line instead of TABs.
Expand Down
101 changes: 101 additions & 0 deletions bed/bed-files.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package bed

import (
"bufio"
"fmt"
"os"
"strconv"
"strings"

"github.com/exascience/elprep/utils"
)

/*
Helper function for parsing a track line field.
*/
func splitTrackField(field string) (string, string) {
split := strings.Split(field, "=")
return split[0], split[1]
}

/*
Parses a BED file. See
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
func ParseBed(filename string) (b *Bed, err error) {

bed := NewBed()

// open file
file, err := os.Open(filename)
if err != nil {
return nil, err
}
defer func() {
if nerr := file.Close(); err == nil {
err = nerr
}
}()

scanner := bufio.NewScanner(file)

var track *BedTrack // for storing the current track

for scanner.Scan() {
line := scanner.Text()
data := strings.Split(line, "\t")
// check if the line is a new track
if data[0] == "track" {
// create new track, store the old one
if track != nil {
bed.Tracks = append(bed.Tracks, track)
}
// all track entries are optional
// parse and collect those that are used
fields := make(map[string]string)
for _, field := range data[1:] {
key, val := splitTrackField(field)
fields[key] = val
}
track = NewBedTrack(fields)
} else {
// parse a region entry
chrom := utils.Intern(data[0])
var err error
start, err := strconv.Atoi(data[1])
if err != nil {
return nil, fmt.Errorf("Invalid bed region start: %v ", err)
}
end, err := strconv.Atoi(data[2])
if err != nil {
return nil, fmt.Errorf("Invalid bed region end: %v ", err)
}
region, err := NewBedRegion(chrom, int32(start), int32(end), data[3:])
if err != nil {
return nil, fmt.Errorf("Invalid bed region: %v ", err)
}
AddBedRegion(bed, region)
if track != nil {
track.Regions = append(track.Regions, region)
}
}

}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("Error while reading bed file: %v ", err)
}
// Make sure bed regions are sorted.
sortBedRegions(bed)
return bed, nil
}

func printParsedBed(bed *Bed) {
fmt.Println("Bed{")
for k, r := range bed.RegionMap {
fmt.Println("Chrom ", *k, " :")
for _, v := range r {
fmt.Println("BedRegion{", *v.Chrom, v.Start, v.End, " }")
}
}
fmt.Println("}")
}
181 changes: 181 additions & 0 deletions bed/bed-types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package bed

import (
"fmt"
"sort"
"strconv"

"github.com/exascience/elprep/utils"
)

/*
A struct for representing the contents of a BED file. See
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
type Bed struct {
// Bed tracks defined in the file.
Tracks []*BedTrack
// Maps chromosome name onto bed regions.
RegionMap map[utils.Symbol][]*BedRegion
}

/*
A struct for representing BED tracks. See
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
type BedTrack struct {
// All track fields are optional.
Fields map[string]string
// The bed regions this track groups together.
Regions []*BedRegion
}

/*
An interval as defined in a BED file. See
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
type BedRegion struct {
Chrom utils.Symbol
Start int32
End int32
OptionalFields []interface{}
}

// Symbols for optional strand field of a BedRegion.
var (
// Strand forward.
SF = utils.Intern("+")
// Strand reverse.
SR = utils.Intern("-")
)

/*
Allocates and initializes a new BedRegion. Optional fields are given
in order. If a "later" field is entered, then the "earlier" field was
entered as well. See https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
func NewBedRegion(chrom utils.Symbol, start int32, end int32, fields []string) (b *BedRegion, err error) {
bedRegionFields, err := initializeBedRegionFields(fields)
if err != nil {
return nil, err
}
return &BedRegion{
Chrom: chrom,
Start: start,
End: end,
OptionalFields: bedRegionFields,
}, nil
}

// Valid bed region optional fields. See spec.
const (
brName = iota
brScore
brStrand
brThickStart
brThickEnd
brItemRgb
brBlockCount
brBlockSizes
brBlockStarts
)

/*
Allocates a fresh SmallMap to initialize a BedRegion's optional fields.
*/
func initializeBedRegionFields(fields []string) ([]interface{}, error) {
brFields := make([]interface{}, len(fields))
for i, val := range fields {
switch i {
case brName:
brFields[brName] = val
case brScore:
score, err := strconv.Atoi(val)
if err != nil || score < 0 || score > 1000 {
return nil, fmt.Errorf("Invalid Score field : %v", err)
}
brFields[brScore] = score
case brStrand:
if val != "+" && val != "-" {
return nil, fmt.Errorf("Invalid Strand field: %v", val)
}
brFields[brStrand] = utils.Intern(val)
case brThickStart:
start, err := strconv.Atoi(val)
if err != nil {
return nil, fmt.Errorf("Invalid ThickStart field: %v", err)
}
brFields[brThickStart] = start
case brThickEnd:
end, err := strconv.Atoi(val)
if err != nil {
return nil, fmt.Errorf("Invalid ThickEnd field: %v", err)
}
brFields[brThickEnd] = end
case brItemRgb:
if val == "on" {
brFields[brItemRgb] = true
} else {
brFields[brItemRgb] = false
}
case brBlockCount:
count, err := strconv.Atoi(val)
if err != nil {
return nil, fmt.Errorf("Invalid BlockCount field: %v", err)
}
brFields[brBlockCount] = count
case brBlockSizes:
sizes, err := strconv.Atoi(val)
if err != nil {
return nil, fmt.Errorf("Invalid BlockSizes field: %v", err)
}
brFields[brBlockSizes] = sizes
case brBlockStarts:
start, err := strconv.Atoi(val)
if err != nil {
return nil, fmt.Errorf("Invalid BlockStarts field: %v", err)
}
brFields[brBlockStarts] = start
default:
return nil, fmt.Errorf("Invalid optional field: %v out of 0-8.", val)
}
}
return brFields, nil
}

/*
Allocates and initializes a new BedTrack.
*/
func NewBedTrack(fields map[string]string) *BedTrack {
return &BedTrack{
Fields: fields,
}
}

/*
Allocates and initializes an empty bed.
*/
func NewBed() *Bed {
return &Bed{
RegionMap: make(map[utils.Symbol][]*BedRegion),
}
}

/*
Add region to the bed region map.
*/
func AddBedRegion(bed *Bed, region *BedRegion) {
// append the region entry
bed.RegionMap[region.Chrom] = append(bed.RegionMap[region.Chrom], region)
}

/*
A function for sorting the bed regions.
*/
func sortBedRegions(bed *Bed) {
for _, regions := range bed.RegionMap {
sort.SliceStable(regions, func(i, j int) bool {
return regions[i].Start < regions[j].Start
})
}
}
5 changes: 5 additions & 0 deletions bed/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/*
Package bed is a library for parsing and representing BED files. See
https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
package bed
14 changes: 14 additions & 0 deletions cmd/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"strings"
"time"

"github.com/exascience/elprep/bed"
"github.com/exascience/elprep/internal"
"github.com/exascience/elprep/sam"
"github.com/exascience/elprep/utils"
Expand Down Expand Up @@ -137,6 +138,7 @@ const FilterHelp = "Filter parameters:\n" +
"[--filter-unmapped-reads-strict]\n" +
"[--filter-non-exact-mapping-reads]\n" +
"[--filter-non-exact-mapping-reads-strict]\n" +
"[--filter-non-overlapping-reads bed-file]\n" +
"[--replace-read-group read-group-string]\n" +
"[--mark-duplicates]\n" +
"[--remove-duplicates]\n" +
Expand All @@ -158,6 +160,7 @@ func Filter() error {
filterUnmappedReads, filterUnmappedReadsStrict bool
filterNonExactMappingReads bool
filterNonExactMappingReadsStrict bool
filterNonOverlappingReads string
replaceReadGroup string
markDuplicates, markDuplicatesDeterministic, removeDuplicates bool
removeOptionalFields string
Expand All @@ -178,6 +181,7 @@ func Filter() error {
flags.BoolVar(&filterUnmappedReadsStrict, "filter-unmapped-reads-strict", false, "remove all unmapped alignments, taking also POS and RNAME into account")
flags.BoolVar(&filterNonExactMappingReads, "filter-non-exact-mapping-reads", false, "output only exact mapping reads (soft-clipping allowed) based on cigar string (only M,S allowed)")
flags.BoolVar(&filterNonExactMappingReadsStrict, "filter-non-exact-mapping-reads-strict", false, "output only exact mapping reads (soft-clipping allowed) based on optional fields X0=1, X1=0, XM=0, XO=0, XG=0")
flags.StringVar(&filterNonOverlappingReads, "filter-non-overlapping-reads", "", "output only reads that overlap with the given regions (bed format)")
flags.StringVar(&replaceReadGroup, "replace-read-group", "", "add or replace alignment read groups")
flags.BoolVar(&markDuplicates, "mark-duplicates", false, "mark duplicates")
flags.BoolVar(&markDuplicatesDeterministic, "mark-duplicates-deterministic", false, "mark duplicates deterministically")
Expand Down Expand Up @@ -279,6 +283,16 @@ func Filter() error {
fmt.Fprint(&command, " --filter-non-exact-mapping-reads-strict")
}

if filterNonOverlappingReads != "" {
parsedBed, err := bed.ParseBed(filterNonOverlappingReads)
if err != nil {
return err
}
filterNonOverlappingReadsFilter := sam.FilterNonOverlappingReads(parsedBed)
filters = append(filters, filterNonOverlappingReadsFilter)
fmt.Fprint(&command, " --filter-non-overlapping-reads ", filterNonOverlappingReads)
}

if renameChromosomes {
filters = append(filters, sam.RenameChromosomes)
fmt.Fprint(&command, " --rename-chromosomes")
Expand Down
2 changes: 1 addition & 1 deletion cmd/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

const (
ProgramName = "elprep"
ProgramVersion = "3.03"
ProgramVersion = "3.04"
ProgramURL = "http://github.com/exascience/elprep"
)

Expand Down
Loading

0 comments on commit 81eace9

Please sign in to comment.