Skip to content

Commit

Permalink
Merge pull request #28 from EBI-Metagenomics/fix_seqprep
Browse files Browse the repository at this point in the history
Fix seqprep
  • Loading branch information
KateSakharova committed Oct 8, 2020
2 parents ebf8b99 + 2107195 commit 722b9df
Show file tree
Hide file tree
Showing 11 changed files with 210 additions and 77 deletions.
25 changes: 25 additions & 0 deletions tools/Raw_reads/filter_paired_reads/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
FROM alpine:3.7

LABEL maintainer="Ekaterina Sakharova <kates@ebi.ac.uk>"

# ------------------------------------------------------------
# Dockerfile Version:  19.03.1
# Software:            seqtk + bash wrapper
# Software Version:    1.3 (r106)
# Description:         filter reads < LEN bp from fastq
#                      paired-end files
# ------------------------------------------------------------

# bash runs the wrapper script; wget fetches the seqtk sources;
# build-base and zlib-dev are installed for the `make` step below.
RUN apk add --no-cache \
        bash \
        wget \
        gzip \
        build-base \
        zlib-dev

# Download the tagged seqtk source archive and build it in place under /.
ENV VERSION=1.3
RUN wget "https://github.com/lh3/seqtk/archive/v${VERSION}.zip" \
    && unzip "v${VERSION}.zip" \
    && make -C "seqtk-${VERSION}"

# Install the wrapper script and make it executable.
COPY filter_paired_reads.sh /tools/
RUN chmod a+x /tools/*

# Expose both the freshly built seqtk binary and the wrapper on PATH.
ENV PATH="/seqtk-${VERSION}:/tools:${PATH}"

CMD ["filter_paired_reads.sh"]
55 changes: 55 additions & 0 deletions tools/Raw_reads/filter_paired_reads/filter_paired_reads.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env cwl-runner
# CWL wrapper around filter_paired_reads.sh: takes a forward and a reverse
# FASTQ file plus a length threshold, and collects the two filtered FASTQ
# files that the script writes into the working directory.
class: CommandLineTool
cwlVersion: v1.0

label: "remove reads from both files that are less than LEN"

baseCommand: filter_paired_reads.sh

hints:
  DockerRequirement:
    dockerPull: microbiomeinformatics/pipeline-v5.filter-paired

requirements:
  ResourceRequirement:
    coresMax: 1
    ramMin: 200

inputs:
  # Forward (mate 1) reads, passed to the script as `-f <path>`.
  forward:
    type: File
    format: edam:format_1930
    inputBinding:
      prefix: '-f'
  # Reverse (mate 2) reads, passed to the script as `-r <path>`.
  reverse:
    type: File
    format: edam:format_1930
    inputBinding:
      prefix: '-r'
  # Length threshold, passed to the script as `-l <int>`.
  len:
    type: int
    inputBinding:
      prefix: '-l'

outputs:
  # The output file names are fixed by the wrapper script.
  forward_filtered:
    type: File
    format: edam:format_1930
    outputBinding:
      glob: forward_filt.fastq
  reverse_filtered:
    type: File
    format: edam:format_1930
    outputBinding:
      glob: reverse_filt.fastq

$namespaces:
  edam: http://edamontology.org/
  s: http://schema.org/
$schemas:
  - http://edamontology.org/EDAM_1.16.owl
  - https://schema.org/version/latest/schemaorg-current-http.rdf

s:license: "https://www.apache.org/licenses/LICENSE-2.0"
s:copyrightHolder: "EMBL - European Bioinformatics Institute"
22 changes: 22 additions & 0 deletions tools/Raw_reads/filter_paired_reads/filter_paired_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#
# Keep only the read pairs in which BOTH mates are at least LEN bp long.
#
# Usage:
#   filter_paired_reads.sh -f <forward.fastq.gz> -r <reverse.fastq.gz> -l <LEN>
#
# Output (written to the current directory):
#   forward_filt.fastq, reverse_filt.fastq   filtered, uncompressed reads
# The intermediate files forward.fastq, reverse.fastq, selected_1,
# selected_2 and common are left in the current directory as well.

set -e
# Make a pipeline fail when any of its stages fails, not only the last one;
# otherwise a seqtk error would be masked by a successful `cut`/`sort`.
set -o pipefail

while getopts :f:r:l: option; do
    case "${option}" in
        f) FORWARD=${OPTARG};;
        r) REVERSE=${OPTARG};;
        l) LEN=${OPTARG};;
    esac
done

# All three options are required; refuse to run with any of them missing
# instead of invoking gunzip/awk with empty arguments.
if [ -z "${FORWARD}" ] || [ -z "${REVERSE}" ] || [ -z "${LEN}" ]; then
    echo "Usage: $(basename "$0") -f FORWARD -r REVERSE -l LEN" >&2
    exit 1
fi

gunzip -c "${FORWARD}" > forward.fastq
gunzip -c "${REVERSE}" > reverse.fastq

# Print, sorted in the C locale, the first column (read name) of every
# `seqtk comp` row of FASTQ file $1 whose second column is >= LEN.
select_long_reads() {
    seqtk comp "$1" \
        | awk -v l="${LEN}" '{ if ($2 >= l) { print } }' \
        | cut -f1 \
        | LC_ALL=C sort
}

select_long_reads forward.fastq > selected_1
select_long_reads reverse.fastq > selected_2

# comm only reports correct intersections when both inputs are sorted in the
# collation order it compares with, hence the sort above and the matching
# LC_ALL=C here. Names in plain file order (e.g. "x.9" before "x.10") would
# cause shared names to be dropped silently.
LC_ALL=C comm -12 selected_1 selected_2 > common

# NOTE(review): this relies on `seqtk subseq` selecting by name regardless of
# the order of the names in the list file - confirm against the seqtk docs.
seqtk subseq forward.fastq common > forward_filt.fastq
seqtk subseq reverse.fastq common > reverse_filt.fastq
13 changes: 13 additions & 0 deletions tools/Raw_reads/filter_paired_reads/filter_paired_reads.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Example job file for filter_paired_reads.cwl.
$namespaces:
  edam: http://edamontology.org/

# Forward reads of the example paired-end sample.
forward:
  class: File
  format: edam:format_1930
  path: ../../../input_examples/amplicon-paired-ERR2237853_1.fastq.gz

# Reverse reads of the same sample.
reverse:
  class: File
  format: edam:format_1930
  path: ../../../input_examples/amplicon-paired-ERR2237853_2.fastq.gz

# Length threshold handed to the tool's `len` input.
len: 100
3 changes: 2 additions & 1 deletion tools/SeqPrep/seqprep.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ inputs:
label: second read input fastq
inputBinding:
prefix: -r
name: string

baseCommand: SeqPrep

Expand All @@ -38,7 +39,7 @@ arguments:
- "-2"
- reverse_unmerged.fastq.gz
- valueFrom: |
${ return inputs.forward_reads.nameroot.split('_')[0] + '_MERGED.fastq.gz' }
${ return inputs.name + '_MERGED.fastq.gz' }
prefix: "-s"
# - "-3"
# - forward_discarded.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions utils/multiple-gunzip.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
cwlVersion: v1.0

class: CommandLineTool
label: "merges output of seqprep and unzips for paired end reads, or unzips file for single end"
label: "unzip files"
requirements:
ResourceRequirement:
coresMin: 1
ramMin: 200 # just a default, could be lowered
ramMin: 2000 # just a default, could be lowered
InlineJavascriptRequirement: {}

hints:
Expand Down
24 changes: 6 additions & 18 deletions workflows/conditionals/amplicon/amplicon-1.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -70,41 +70,29 @@ steps:
out: [ hashsum ]


# << SeqPrep only for paired reads >>
# << SeqPrep (only for paired reads) + gunzip for paired and single>>
overlap_reads:
label: Paired-end overlapping reads are merged
run: ../../../tools/SeqPrep/seqprep.cwl
when: $(inputs.single == undefined)
run: ../../subworkflows/seqprep-subwf.cwl
in:
single: single_reads
forward_reads: forward_reads
reverse_reads: reverse_reads
out: [ merged_reads, forward_unmerged_reads, reverse_unmerged_reads ]

# << unzipping only >>
unzip_reads:
run: ../../../utils/multiple-gunzip.cwl
in:
target_reads:
source:
- overlap_reads/merged_reads
- single_reads
pickValue: first_non_null
reads: { default: true }
out: [ unzipped_merged_reads ]
paired_reads_length_filter: { default: 100 }
out: [ unzipped_single_reads ]

count_submitted_reads:
run: ../../../utils/count_lines/count_lines.cwl
in:
sequences: unzip_reads/unzipped_merged_reads
sequences: overlap_reads/unzipped_single_reads
number: { default: 4 }
out: [ count ]

# << Trim and Reformat >>
trimming:
run: ../../subworkflows/trim_and_reformat_reads.cwl
in:
reads: unzip_reads/unzipped_merged_reads
reads: overlap_reads/unzipped_single_reads
count: count_submitted_reads/count
out: [ trimmed_and_reformatted_reads ]

Expand Down
24 changes: 6 additions & 18 deletions workflows/conditionals/raw-reads/raw-reads-1.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -72,33 +72,21 @@ steps:
out: [ hashsum ]


# << SeqPrep only for paired reads >>
# << SeqPrep (only for paired reads) + gunzip for paired and single>>
overlap_reads:
label: Paired-end overlapping reads are merged
run: ../../../tools/SeqPrep/seqprep.cwl
when: $(inputs.single == undefined)
run: ../../subworkflows/seqprep-subwf.cwl
in:
single: single_reads
forward_reads: forward_reads
reverse_reads: reverse_reads
out: [ merged_reads, forward_unmerged_reads, reverse_unmerged_reads ]

# << unzipping only >>
unzip_reads:
run: ../../../utils/multiple-gunzip.cwl
in:
target_reads:
source:
- overlap_reads/merged_reads
- single_reads
pickValue: first_non_null
reads: { default: true }
out: [ unzipped_merged_reads ]
paired_reads_length_filter: { default: 100 }
out: [ unzipped_single_reads ]

count_submitted_reads:
run: ../../../utils/count_lines/count_lines.cwl
in:
sequences: unzip_reads/unzipped_merged_reads
sequences: overlap_reads/unzipped_single_reads
number: { default: 4 }
out: [ count ]

Expand All @@ -109,7 +97,7 @@ steps:
less than 15 over a 4 nucleotide wide window are removed)
run: ../../../tools/Trimmomatic/Trimmomatic-v0.36-SE.cwl
in:
reads1: unzip_reads/unzipped_merged_reads
reads1: overlap_reads/unzipped_single_reads
phred: { default: '33' }
leading: { default: 3 }
trailing: { default: 3 }
Expand Down
11 changes: 0 additions & 11 deletions workflows/subworkflows/chunking-subwf-hmmer.yml

This file was deleted.

27 changes: 0 additions & 27 deletions workflows/subworkflows/classify-otu-visualise.yml

This file was deleted.

79 changes: 79 additions & 0 deletions workflows/subworkflows/seqprep-subwf.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env cwl-runner
# Read pre-processing sub-workflow.
#
#   paired-end run (single_reads not supplied):
#     filter_paired -> overlap_reads (SeqPrep) -> unzip_merged_reads
#   single-end run (single_reads supplied):
#     unzip_single_reads
#
# The workflow output is the first non-null result of the two unzip steps.
class: Workflow
cwlVersion: v1.2.0-dev2

requirements:
  SubworkflowFeatureRequirement: {}
  MultipleInputFeatureRequirement: {}
  InlineJavascriptRequirement: {}
  StepInputExpressionRequirement: {}
  ScatterFeatureRequirement: {}

inputs:
  forward_reads: File?
  reverse_reads: File?
  single_reads: File?
  paired_reads_length_filter: int

outputs:
  unzipped_single_reads:
    type: File
    outputSource:
      - unzip_merged_reads/unzipped_merged_reads
      - unzip_single_reads/unzipped_merged_reads
    pickValue: first_non_null

steps:

  # filter paired-end reads (for single do nothing)
  filter_paired:
    run: ../../tools/Raw_reads/filter_paired_reads/filter_paired_reads.cwl
    when: $(inputs.single == undefined)
    in:
      # `single` is wired in only so that the `when` expression can see it.
      single: single_reads
      forward: forward_reads
      reverse: reverse_reads
      len: paired_reads_length_filter
    out: [ forward_filtered, reverse_filtered ]

  # << SeqPrep only for paired reads >>
  overlap_reads:
    label: Paired-end overlapping reads are merged
    run: ../../tools/SeqPrep/seqprep.cwl
    when: $(inputs.single == undefined)
    in:
      single: single_reads
      forward_reads: filter_paired/forward_filtered
      reverse_reads: filter_paired/reverse_filtered
      name:
        source: forward_reads
        # forward_reads is null on a single-end run, and step-input
        # expressions may be evaluated before `when` decides to skip the
        # step, so guard the dereference instead of calling .nameroot on null.
        valueFrom: "$(self ? self.nameroot.split('_')[0] : '')"
    out: [ merged_reads, forward_unmerged_reads, reverse_unmerged_reads ]

  # << unzip merged reads >>
  unzip_merged_reads:
    # Test the step's own input: this step has no `single` input, so
    # `inputs.single` would always be undefined here and the step would run
    # even when overlap_reads was skipped and merged_reads is null.
    when: $(inputs.target_reads != null)
    run: ../../utils/multiple-gunzip.cwl
    in:
      target_reads: overlap_reads/merged_reads
      reads: { default: true }
    out: [ unzipped_merged_reads ]

  # << unzipping single reads >>
  unzip_single_reads:
    run: ../../utils/multiple-gunzip.cwl
    # Same reasoning as above: `inputs.single` is not a declared step input,
    # so a condition on it could never be true and single-end reads would
    # never be unzipped.
    when: $(inputs.target_reads != null)
    in:
      target_reads: single_reads
      reads: { default: true }
    out: [ unzipped_merged_reads ]

$namespaces:
  edam: http://edamontology.org/
  s: http://schema.org/
$schemas:
  - http://edamontology.org/EDAM_1.16.owl
  - https://schema.org/version/latest/schemaorg-current-http.rdf

s:license: "https://www.apache.org/licenses/LICENSE-2.0"
s:copyrightHolder: "EMBL - European Bioinformatics Institute"

0 comments on commit 722b9df

Please sign in to comment.