-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
112 lines (101 loc) · 3.78 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
sample_links = {"ERR458493": "https://osf.io/5daup/download",
"ERR458494":"https://osf.io/8rvh5/download",
"ERR458495":"https://osf.io/2wvn3/download",
"ERR458500":"https://osf.io/xju4a/download",
"ERR458501": "https://osf.io/nmqe6/download",
"ERR458502": "https://osf.io/qfsze/download"}
# the sample names are dictionary keys in sample_links. extract them to a list we can use below
SAMPLES=sample_links.keys()
rule all:
input:
# create a new filename for every entry in SAMPLES,
# replacing {name} with each entry.
expand("rnaseq/quant/{name}_quant/quant.sf", name=SAMPLES),
"rnaseq/fastqc/multiqc_report.html",
# download yeast rna-seq data from Schurch et al, 2016 study
rule download_reads:
output: "rnaseq/raw_data/{sample}.fq.gz"
params:
download_link = lambda wildcards: sample_links[wildcards.sample]
shell:
"""
curl -L {params.download_link} -o {output}
"""
rule fastqc_raw:
input: "rnaseq/raw_data/{sample}.fq.gz"
output: "rnaseq/raw_data/fastqc/{sample}_fastqc.html"
params:
outdir="rnaseq/raw_data/fastqc"
conda: "rnaseq-env.yml"
shell:
"""
fastqc {input} --outdir {params.outdir}
"""
## quality trim reads and assess with fastqc/multiqc
rule download_trimmomatic_adapter_file:
output: "TruSeq2-SE.fa"
shell:
"""
curl -L https://raw.githubusercontent.com/timflutre/trimmomatic/master/adapters/TruSeq2-SE.fa -o {output}
"""
rule quality_trim:
input:
reads="rnaseq/raw_data/{sample}.fq.gz",
adapters="TruSeq2-SE.fa",
output: "rnaseq/quality/{sample}.qc.fq.gz"
conda: "rnaseq-env.yml"
shell:
"""
trimmomatic SE {input.reads} {output} ILLUMINACLIP:{input.adapters}:2:0:15 LEADING:2 TRAILING:2 SLIDINGWINDOW:4:2 MINLEN:25
"""
rule fastqc_trimmed:
input: "rnaseq/quality/{sample}.qc.fq.gz"
output: "rnaseq/quality/fastqc/{sample}.qc_fastqc.html"
params:
outdir="rnaseq/quality/fastqc"
conda: "rnaseq-env.yml"
shell:
"""
fastqc {input} --outdir {params.outdir}
"""
rule multiqc:
input:
raw=expand("rnaseq/raw_data/fastqc/{sample}_fastqc.html", sample=SAMPLES),
trimmed=expand("rnaseq/quality/fastqc/{sample}.qc_fastqc.html", sample=SAMPLES)
output: "rnaseq/fastqc/multiqc_report.html"
params:
raw_dir="rnaseq/raw_data/fastqc",
trimmed_dir="rnaseq/raw_data/fastqc",
conda: "rnaseq-env.yml"
shell:
"""
multiqc {params.raw_dir} {params.trimmed_dir} --filename {output}
"""
### download and index the yeast transcriptome ###
rule download_yeast_transcriptome:
output: "rnaseq/reference/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz"
shell:
"""
curl -L ftp://ftp.ensembl.org/pub/release-99/fasta/saccharomyces_cerevisiae/cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz -o {output}
"""
rule salmon_index:
input: "rnaseq/reference/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz"
output: directory("rnaseq/quant/sc_ensembl_index")
conda: "rnaseq-env.yml"
shell:
"""
salmon index --index {output} --transcripts {input} # --type quasi
"""
### quantify reads with salmon
rule salmon_quantify:
input:
reads="rnaseq/quality/{sample}.qc.fq.gz",
index_dir="rnaseq/quant/sc_ensembl_index"
output: "rnaseq/quant/{sample}_quant/quant.sf"
params:
outdir= lambda wildcards: "rnaseq/quant/" + wildcards.sample + "_quant"
conda: "rnaseq-env.yml"
shell:
"""
salmon quant -i {input.index_dir} --libType A -r {input.reads} -o {params.outdir} --seqBias --gcBias --validateMappings
"""