-
Notifications
You must be signed in to change notification settings - Fork 1
/
config.yaml
202 lines (170 loc) · 5.9 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
###################################################################
###################################################################
####                _______    _                      _____    ####
####       /\      |__   __|  | |           /\       / ____|   ####
####      /  \        | |     | |          /  \     | (___     ####
####     / /\ \       | |     | |         / /\ \     \___ \    ####
####    / ____ \      | |     | |____    / ____ \    ____) |   ####
####   /_/    \_\     |_|     |______|  /_/    \_\  |_____/    ####
####                                                           ####
###################################################################
# For more details about the config values see:
# https://metagenome-atlas.rtfd.io
###################################################################
########################
# Execution parameters
########################

# Upper bound on the number of cores a single process may use.
threads: 12

# Memory for the bulk of the jobs; the BBtools steps in particular are memory demanding.
# NOTE(review): the unit is presumably GB, as stated for assembly_memory — confirm.
mem: 32

# Memory and threads for jobs that need a large amount of memory, e.g. GTDB-tk.
large_mem: 250
large_threads: 16

# May be a subset of `threads`; adjust if the run_spades or run_megahit rules
# are defined differently in your cluster configuration.
assembly_threads: 8
# Assembly memory, in GB.
assembly_memory: 250

# Local directory for temporary files; useful for cluster execution without a
# shared file system.
tmpdir: '/tmp'

# Directory into which 'atlas download' places the databases.
database_dir: '/home/kiesers/scratch/Atlas/databases'
########################
# Quality control
########################

# Either metagenome or metatranscriptome.
data_type: metagenome
interleaved_fastqs: false

# Remove (PCR-)duplicated reads using clumpify.
deduplicate: true
duplicates_only_optical: false
duplicates_allow_substitutions: 2

# Adapter file used to trim adapters from reads and read ends.
# NOTE(review): this still looks like a template placeholder, whereas the other
# database paths in this file live under database_dir — confirm the file exists.
preprocess_adapters: /path/to/databases/adapters.fa
preprocess_minimum_base_quality: 10
preprocess_minimum_passing_read_length: 51
# 0.05 requires at least 5 percent of each nucleotide per sequence.
preprocess_minimum_base_frequency: 0.05
preprocess_adapter_min_k: 8
preprocess_allowable_kmer_mismatches: 1
preprocess_reference_kmer_match_length: 27

# Error correction where paired-end reads overlap.
error_correction_overlapping_pairs: true

# Contamination references, one entry per reference in the form
#   name: /path/to/fasta
# The entries must be indented under contaminant_references; at column 0 they
# would be parsed as unrelated top-level keys and leave this mapping empty.
contaminant_references:
  rRNA: /home/kiesers/scratch/Atlas/databases/silva_rfam_all_rRNAs.fa
  PhiX: /home/kiesers/scratch/Atlas/databases/phiX174_virus.fa
  mouse: /home/kiesers/scratch/Atlas/databases/host_genome/Mus_musculus.GRCm38.dna.toplevel.fa.gz

# Large indels are not allowed when matching against the references.
contaminant_max_indel: 20
contaminant_min_ratio: 0.65
contaminant_kmer_length: 13
contaminant_minimum_hits: 1
contaminant_ambiguous: best
########################
# Pre-assembly-processing
########################

# Keys are written without a space before the colon, like the rest of the file.
error_correction_before_assembly: true

# Join R1 and R2 at their overlap; reads that cannot be joined are still used.
merge_pairs_before_assembly: true
merging_k: 62
# Extend reads while merging to this many nucleotides.
merging_extend2: 40
# Iterations are performed until extend2 x iterations.
merging_flags: "ecct iterations=5"
########################
# Assembly
########################

# Assembler to run: megahit OR spades.
assembler: spades

# Megahit
#-----------
# Use 2 for metagenomes, 3 for genomes with 30x coverage.
megahit_min_count: 2
megahit_k_min: 21
megahit_k_max: 121
megahit_k_step: 20
megahit_merge_level: "20,0.98"
megahit_prune_level: 2
megahit_low_local_ratio: 0.2
# One of: 'default', 'meta-large', 'meta-sensitive'.
megahit_preset: default

# Spades
#------------
spades_skip_BayesHammer: true
# false: use contigs.
spades_use_scaffolds: false
# Comma-separated list of k-mer sizes to be used (all values must be odd,
# less than 128 and listed in ascending order).
spades_k: "21,33,55,77,99,127"
# One of: meta, normal, rna. Single-end libraries do not work with metaspades.
spades_preset: meta
spades_extra: ""
# One of: none, "pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs".
# Preprocessed long reads can be defined in the sample table with 'longreads';
# for more information see the spades manual.
longread_type: none

# Filtering
#------------
prefilter_minimum_contig_length: 200
# Filter out assembled noise; this matters more for assemblies from megahit.
filter_contigs: true
# Trim contig tips.
contig_trim_bp: 0
# Require contigs to have read support.
minimum_average_coverage: 1
minimum_percent_covered_bases: 20
minimum_mapped_reads: 0
# Applied after filtering.
minimum_contig_length: 300
########################
# Quantification
########################
# Mapping reads to contigs
#--------------------------
# Settings for mapping reads back to the assembled contigs. Their exact
# semantics are not spelled out in this file; see https://metagenome-atlas.rtfd.io.
# NOTE(review): presumably the minimum alignment identity as a fraction — confirm.
contig_min_id: 0.9
# NOTE(review): presumably restricts mapping to properly paired reads — confirm.
contig_map_paired_only: true
# NOTE(review): presumably a distance in base pairs — confirm.
contig_max_distance_between_pairs: 1000
# NOTE(review): presumably caps how many mapping sites are counted per read — confirm.
maximum_counted_map_sites: 10
########################
# Binning
########################
# Nested keys below must stay indented under their parent. Several names
# (min_contig_length, length, completeness, contamination) occur under more
# than one parent and would collide as duplicate top-level keys at column 0.

# Either DASTool or one of the binners, e.g. maxbin.
final_binner: DASTool

# If DASTool is used as final_binner, it uses the predictions of these binners.
binner:
  - metabat
  - maxbin

metabat:
  sensitivity: sensitive
  min_contig_length: 1500  # metabat needs >1500

maxbin:
  max_iteration: 50
  prob_threshold: 0.9
  min_contig_length: 1000

DASTool:
  search_engine: 'diamond'
  # Score threshold until selection algorithm will keep selecting bins [0..1].
  score_threshold: 0.5

genome_dereplication:
  ANI: 0.99
  overlap: 0.6
  opt_parameters: ""
  # NOTE(review): filter and score are placed under genome_dereplication to
  # match the ATLAS default config layout — confirm against your ATLAS version.
  filter:
    noFilter: false
    length: 5000
    completeness: 50
    contamination: 10
  score:
    completeness: 1
    contamination: 5
    N50: 0.5
    length: 0

# Annotations to produce; uncomment an entry to enable it.
annotations:
  - checkm_taxonomy
  # - checkm_tree
  # - gtdb_tree
  # - gtdb_taxonomy
########################
# Gene catalog
########################
# All settings below belong to the genecatalog mapping and must stay indented
# under it; at column 0 they would become unrelated top-level keys.
genecatalog:
  # Which predicted proteins should be used for the gene catalog.
  source: genomes
  # One of cd-hit-est, cluster or linclust; see mmseqs for more details.
  clustermethod: linclust
  minlength_nt: 100
  minid: 0.5
  coverage: 0.5
  extra: ""
  SubsetSize: 500000