# config.yaml

###################################################################
###################################################################
####                 _______   _                    _____      ####
####         /\     |__   __| | |          /\      / ____|     ####
####        /  \       | |    | |         /  \    | (___       ####
####       / /\ \      | |    | |        / /\ \    \___ \      ####
####      / ____ \     | |    | |____   / ____ \   ____) |     ####
####     /_/    \_\    |_|    |______| /_/    \_\ |_____/      ####
####                                                           ####
###################################################################
#  For more details about the config values see:
#  https://metagenome-atlas.rtfd.io
###################################################################

########################
# Execution parameters
########################

# Upper limit on the number of cores a single process may use.
threads: 12

# Memory for most jobs. Jobs from BBtools in particular are memory
# demanding. (Presumably in GB, like assembly_memory below - confirm.)
mem: 32

# Memory and threads for jobs that need a large amount of memory,
# e.g. GTDB-tk.
large_mem: 250
large_threads: 16

# Threads for the assembly. Can be a subset of `threads`, or altered if the
# rules run_spades or run_megahit are defined differently in your cluster
# configuration.
assembly_threads: 8

# Memory for the assembly, in GB.
assembly_memory: 250


# Local directory for temporary files; useful for cluster execution without
# a shared file system.
tmpdir: /tmp

# Directory into which 'atlas download' places the databases.
database_dir: /home/kiesers/scratch/Atlas/databases


########################
# Quality control
########################

# Kind of input data: metagenome or metatranscriptome.
data_type: metagenome
interleaved_fastqs: false

# Remove (PCR-)duplicated reads using clumpify.
deduplicate: true
duplicates_only_optical: false
duplicates_allow_substitutions: 2

# Adapter sequences used to trim adapters from reads and read ends.
# Located in database_dir, next to the contaminant references below
# (assumed to be fetched by 'atlas download' - confirm the file exists).
preprocess_adapters: /home/kiesers/scratch/Atlas/databases/adapters.fa
preprocess_minimum_base_quality: 10
preprocess_minimum_passing_read_length: 51
# 0.05 requires at least 5 percent of each nucleotide per sequence.
preprocess_minimum_base_frequency: 0.05
preprocess_adapter_min_k: 8
preprocess_allowable_kmer_mismatches: 1
preprocess_reference_kmer_match_length: 27

# Error correction where paired-end reads overlap.
error_correction_overlapping_pairs: true

# Contamination references. Further references can be added as
# `name: /path/to/fasta` entries.
contaminant_references:
  rRNA: /home/kiesers/scratch/Atlas/databases/silva_rfam_all_rRNAs.fa
  PhiX: /home/kiesers/scratch/Atlas/databases/phiX174_virus.fa
  mouse: /home/kiesers/scratch/Atlas/databases/host_genome/Mus_musculus.GRCm38.dna.toplevel.fa.gz
# Large indels are not allowed.
contaminant_max_indel: 20
contaminant_min_ratio: 0.65
contaminant_kmer_length: 13
contaminant_minimum_hits: 1
contaminant_ambiguous: best


########################
# Pre-assembly-processing
########################

error_correction_before_assembly: true

# Join R1 and R2 where they overlap; reads that could not be joined are
# still utilized.
merge_pairs_before_assembly: true
merging_k: 62
# Extend reads while merging to this many nucleotides.
merging_extend2: 40
# Extra flags for the merging step. Iterations are performed until
# extend2 x iterations.
merging_flags: "ecct iterations=5"

########################
# Assembly
########################

# Assembler to use: megahit OR spades.
assembler: spades

# Megahit
#-----------

# 2 is for metagenomes, 3 for genomes with 30x coverage.
megahit_min_count: 2
megahit_k_min: 21
megahit_k_max: 121
megahit_k_step: 20
# Comma-separated pair, kept as a single string.
megahit_merge_level: '20,0.98'
megahit_prune_level: 2
megahit_low_local_ratio: 0.2
# One of: default, meta-large, meta-sensitive.
megahit_preset: default

# Spades
#------------
spades_skip_BayesHammer: true
# false: use contigs rather than scaffolds.
spades_use_scaffolds: false
# Comma-separated list of k-mer sizes to be used. All values must be odd,
# less than 128 and listed in ascending order.
spades_k: '21,33,55,77,99,127'
# One of: meta, normal, rna. Single-end libraries do not work for metaspades.
spades_preset: meta
spades_extra: ""
# One of: none, pacbio, nanopore, sanger, trusted-contigs, untrusted-contigs.
# Preprocessed long reads can be defined in the sample table with
# 'longreads'; for more info see the spades manual.
longread_type: none

# Filtering
#------------
prefilter_minimum_contig_length: 200

# Filter out assembled noise. This is more important for assemblies from
# megahit.
filter_contigs: true

# Trim contig tips (value in bp).
contig_trim_bp: 0

# Require contigs to have read support.
minimum_average_coverage: 1
minimum_percent_covered_bases: 20
minimum_mapped_reads: 0

# Minimum contig length applied after filtering.
minimum_contig_length: 300


########################
# Quantification
########################

# Mapping reads to contigs
#--------------------------

# presumably the minimum alignment identity (as a fraction) - confirm
contig_min_id: 0.9
contig_map_paired_only: true
contig_max_distance_between_pairs: 1000
maximum_counted_map_sites: 10

########################
# Binning
########################

# Either DASTool or one of the binners, e.g. maxbin.
final_binner: DASTool

# If DASTool is used as final_binner, the predictions of these binners
# are used.
binner:
  - metabat
  - maxbin

# Settings for the metabat binner.
metabat:
  sensitivity: sensitive
  # metabat does not accept a minimum contig length below 1500.
  min_contig_length: 1500


# Settings for the maxbin binner.
maxbin:
  max_iteration: 50
  prob_threshold: 0.9
  min_contig_length: 1000

# Settings for DASTool.
DASTool:
  search_engine: diamond
  # Score threshold until which the selection algorithm keeps selecting
  # bins [0..1].
  score_threshold: 0.5

# Settings for genome dereplication. Nested mappings use the same
# two-space indentation as the rest of the file.
genome_dereplication:
  ANI: 0.99
  overlap: 0.6
  opt_parameters: ""
  # presumably thresholds a genome has to meet; noFilter: false keeps the
  # filter active - confirm against the atlas documentation
  filter:
    noFilter: false
    length: 5000
    completeness: 50
    contamination: 10
  # presumably per-criterion weights used to score genomes - confirm
  score:
    completeness: 1
    contamination: 5
    N50: 0.5
    length: 0

# Enabled annotations. Entries that are commented out are switched off;
# uncomment a line to enable it.
annotations:
  - checkm_taxonomy
  # - checkm_tree
  # - gtdb_tree
  # - gtdb_taxonomy


########################
# Gene catalog
########################
genecatalog:
  # Which predicted proteins should be used for the gene catalog.
  source: genomes
  # One of: cd-hit-est, cluster, linclust. See mmseqs for more details.
  clustermethod: linclust
  minlength_nt: 100
  minid: 0.5
  coverage: 0.5
  extra: ""
  SubsetSize: 500000