CLOMP.ini


#[general]
# Controls if you have paired end reads and want to keep them paired
#---Possible Values: True False
PAIRED_END = False
# this is the string that differentiates your R1 and R2 reads if you're using paired end
# if you've got single end reads R1 can be blank or any string.
# Note: This string has to come before your input suffix, e.g. .fastq.1 will NOT work
# Also this string should not appear more than once in a given filename 
#---Possible Values: Any text string that doesn't have system control characters in it 
R1 = _R1_
R2 = _R2_ 
# Controls if we keep the **intermediate files or the input files**, this is included because
# on systems with low disk space you may have to delete files as you go - unless you're 
# literally writing your hard drive full during execution don't set this to false
#---Possible Values: True False 
SAVE_MIDDLE_AND_INPUT_FILES = True
# Number of threads to call for parallelized applications 
#---Possible Values: Any integer value (you shouldn't set this higher than the number of threads you have)
THREADS = 42
# Each sample needs a unique name and everything BEFORE the FIRST instance of this string
# will be the sample base name (this needs to be unique)
# So for example sample1_R1.fastq, sample2_R2.fastq, '_' would work as a base delimiter
# but for sample_1.fastq, sample_2.fastq '_' would produce the same base name ('sample') for both files
# This should work out of the box for things that are named like Illumina does 
#---Possible Values: any substring in your input file names for which the string BEFORE this substring is unique for each sample
BASE_DELIMITER = _
# This controls if you want to perform host filtering on your samples BEFORE trimming 
# this can drastically speed up execution if you have files with a huge amount of host background
#---Possible Values: True False 
HOST_FILTER_FIRST = False 
# This controls your base input, ls * + INPUT_SUFFIX should reveal a list of everything that you would like to analyze
#---Possible Values: any substring which is contained at the END of all the filenames that you would like to analyze 
INPUT_SUFFIX = .fastq
# Should CLOMP overwrite previous outputs (if detected)? If no previous outputs are present this is pretty much ignored.
#---Possible Values: True False 
OVERWRITE = False


#[trimming]
# Path to your Trimmomatic Jar file 
#---Possible Values: An absolute file path to your trimmomatic jar 
TRIMMOMATIC_JAR_PATH = /tools/Trimmomatic-0.38/trimmomatic-0.38.jar
# Location of the adapters you want to get rid of. I've included one in the main CLOMP repository but it's just a fasta
# file so you can tweak this to your heart's delight with no extra problems
#---Possible Values: An absolute file path to a fasta formatted file containing the adapters you want trimmed
TRIMMOMATIC_ADAPTER_PATH = /tools/Trimmomatic-0.38/adapters/adapters.fa
# This is just the first part of the trimmomatic argument, I'm not even sure if you can or ever need to change this
#---Possible Values: ILLUMINACLIP: 
SEQUENCER = ILLUMINACLIP:
# This is a string containing all the options you want to input to trimmomatic
# For additional help: http://www.usadellab.org/cms/?page=trimmomatic
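# Briefly:
#   The leading :2:30:10 presumably completes the ILLUMINACLIP argument
#     (seed mismatches:palindrome clip threshold:simple clip threshold) - confirm against how CLOMP builds the command
#   HEADCROP: remove this many bases from the start of the read
#   SLIDINGWINDOW: windowSize:requiredQuality - cut the read once the average quality within the window drops below the threshold
#   CROP: cut all reads down to at most this length
#   MINLEN: minimum read length allowed through; reads shorter than this are dropped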
TRIMMOMATIC_OPTIONS = :2:30:10 HEADCROP:10 SLIDINGWINDOW:4:20 CROP:65 MINLEN:65
# last part of the output file name, this will be added to whatever is in front of BASE_DELIMITER in your file name
# so if your input file is sample1_blah.fastq and your BASE_DELIMITER = _ The output for this step will be a file
# named sample1_trimmed.fastq 
# NOTE: this needs to contain the base_delimiter (at least right now) 
#---Possible Values: Any valid text string 
TRIM_SUFFIX = trimmed.fastq


#[host_filtering]
# Location of your PREBUILT bowtie2 index, this is passed as the value of -x in bowtie2 options
# so this needs to point to the basename of your index files (the shared prefix of the *.bt2 files).
#---Possible Values: Any valid path to a valid bowtie2 index 
BWT_DB_LOCATION=/db/hg38/hg38_spiro
# Options passed to bowtie2 for host alignment, you can play with these and get different specificities
# For additional help: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#main-arguments
  # Briefly (and in my own words) 
  #-D seed extensions (max number that can fail in a row) 
  #-R maximum reseed attempts (max number of reattempts)
  #-N number of mismatches per seed
  #-L seed length
  #-i interval between seeds
  # other options:
  # -x location of database 
  # -q fastq files are input 
  # -U use unpaired reads
  # -S output as a sam file 
# I've pulled threads out into the THREADS variable 
#---Possible Values: Any correct formulation of bowtie2 alignment arguments 
BWT_OPTIONS = -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
# Same drill for output as above, this will be the end of your output files 
#---Possible Values: Any valid filepath string
BWT_SUFFIX = hf.trimmed.fastq
# should we delete all the intermediate host filtering files like _mappedBam, _mappedSam, etc.
#---Possible Values: True False 
BWT_CLEAN = True
# Should we do a more sensitive second pass of host filtering? 
SECOND_PASS = True
# Bowtie options for the second pass of the data 
BWT_SECOND_PASS_OPTIONS = -D 35 -R 5 -N 1 -L 19 -i S,1,0.50
# Output suffix for the second pass
BWT_SECOND_SUFFIX = hf2.trimmed.fastq 

#[snap]
# SNAP website available at http://snap.cs.berkeley.edu/
# python formatted list containing strings for each of your SNAP databases, these must be built and valid
# But can live on different drives
#---Possible Values: Needs to be [] encased, comma separated and values need '' what's in the quotes has to 
# 		be a valid SNAP database location
DB_LIST = ['/hd2/nt.00','/hd2/nt.01','/hd2/nt.02','/hd2/nt.03','/hd2/nt.04','/hd2/nt.05','/hd2/nt.06','/hd2/nt.07','/hd2/nt.08','/hd2/nt.09','/hd2/nt.10','/hd2/nt.11','/hd2/nt.12','/hd2/nt.13']
#
# Where is the actual snap executable file, Use the one I've forked on my github repository https://github.com/rcs333/snap
# The one I have is hard coded to never write SAM headers - this is necessary because when you align to all of NT the sam header 
# becomes HUGE and you end up spending a lot of time just writing the header to your disk.
#---Possible Values: Path to a snap executable 
SNAP_ALIGNER_LOCTION = /tools/snap2/snap/snap-aligner
#
# Options passed to the SNAP aligner 
	# -mrl minimum read length to attempt to align  
	# -d maximum edit distance that we allow for an alignment 
	# -h maximum number of other locations that a read can align to
	# -omax additional number of alignments to report
	# -om extra edit distance that the extra alignments specified by -omax are allowed to be from the 'best' alignment
	# Commentary:
	# -mrl I recommend that this be at least 50
	# -d should be about 10% of your read length 
	# -h explanations: if read aligns to more entries than this number, SNAP will not align it.  We have arbitrarily set this at 30,000, but this may need to be increased as NT gets larger or larger databases are used. 
	# -omax Currently holds 20 entries per database for 14 x 20 = 280 total potential entries for each read for tie-breaking.  Will need to be modified if you are using more than 14 snap databases
	# -om needs to be at least 1 to accept reverse complement alignments 
#---Possible Values: Any valid string with valid SNAP command line arguments 
SNAP_OPTIONS = -mrl 65 -d 9 -h 30000 -om 1 -omax 20

#[tiebreaking]
# We have been using the kraken-report feature to link the output of tie-breaking to the Pavian display.
# This is where your kraken-unique scripts live 
#---Possible Values: Path to a kraken-report perl script 
KRAKEN_PATH = /tools/krakenuniq/krakenuniq-report
#
# Kraken needs to have a taxonomy database: See the README for more information
# We have two blank database files (database.idx, database.kdb) in the following directory (KRAKEN_DB_PATH) that are required for kraken to run.
# SeqID2taxid.map and taxDB are created when building a kraken taxonomy database.  We made these with krakenuniq and NCBI nt database.
KRAKEN_DB_PATH = /hd2/kraktest/
#
# SNAP can only parse alignments on a per database basis.  When we combine the databases for tie-breaking, we need logic as to how to compare edit distances between different databases
# The edit distance offset is the maximum difference in edit distance we will accept for alignments
# this is like the SNAP option -om except across all the databases 
# higher values are more permissive, set to 0 if you only want alignments with edit distances
# equal to the best alignment
# We consider this to be a very permissive parameter which essentially keeps all reads.
#---Possible Values: Positive integer
EDIT_DISTANCE_OFFSET = 6
#
# Any read assignment that contains these taxids in the full lineage will be discarded
# 12908 = other sequences
# 28384 = artificial sequences
# 48479 = environmental samples 
# This needs to be a valid python list containing the taxids that you want to filter out 
FILTER_LIST = [12908,28384,48479]
#
# Should any read that ever gets a valid alignment to host/human be called host/human?
# Given the high prior probability of reads being host in metagenomics, we allow removal of reads that have any SNAP alignment to host that survives the edit distance offset. 
# The host is set by the taxid.  Default is human (taxid: 9606).
# If you are host filtering against a different host, you need to change the taxid.
# If H_STRICT is FALSE, we do not perform this step.
#---Possible Values: True False 
H_STRICT = False 
H_TAXID = 9606
#
# This controls the different types of tie-breaking logic that are baked in to CLOMP 
# strict: The most specific taxid shared between ALL remaining alignments is returned.  Essentially a strict tree intersection.
# 90: The most specific taxid shared between ~90% of the remaining alignments is returned
# oneoff: If there is not more than one other taxid assigned, call read most common taxid, otherwise
# 			intersect and pick most specific shared taxid.  Essentially, one weird/bad assignment cannot define/determine the tree intersection.
# For more information check out the readme.
#---Possible Values: strict 90 oneoff 
LOGIC = strict
# Should program write a new folder named after the sample and write all unique reads that mapped to each taxid
# into this folder? VERY useful for debugging tie-breaking 
#---Possible Values: True False 
WRITE_UNIQUES = True
# Should we follow up and blast all non-human eukaryotic reads? This is time-intensive but double checks important taxonomical assignments.
# If this is true all reads classified as eukaryotic get blasted and any read that hits human with an
# evalue below BLAST_EVAL (set further down) gets recalled human
BLAST_CHECK = False 
# You can set the criteria for blast checking - as well as the database here 
# Inclusion TAXID will pull all reads that get classified below this level but not downstream of the exclusion taxid
INCLUSION_TAXID = 2759
EXCLUSION_TAXID = 9604
# Set the location of a blast database here you should set this blast database to only include the taxid that you are checking
# for the human example this is a blast database built off the same hg38 that we use for host filtering
# this means that if we get ANY hit to this database at an evalue below the one specified later we'll reclassify the read
BLAST_CHECK_DB = /db/hg38_blast
# this is the maximum evalue a hit can have and still be reported, ALL reported hits will be reclassified
BLAST_EVAL = 0.001
# what do we want to reclassify reads that hit the blast database as? 9606 = H. sapiens but you could set this to anything 
DB_TAXID = 9606


#[post]
# Should we build SAM files aligning reads to species level assignments?
# This is very helpful for assigning number of loci covered across the genome and creating coverage maps.
BUILD_SAMS = False
# The minimum number of reads assigned to a species level taxid that are required to build the sam file
MIN_READ_CUTOFF = 10
# how far back up the tree to pull reads to align to the reference for the taxid. -2 means that we'll
# align all reads that are in the same genus as the species level taxid - set higher to grab more reads and
# set to -1 to just align the reads that were assigned to that species
ASSEMBLY_NODE_OFFSET = -2 
# Email address required for interfacing with NCBI servers. Please change this to your own email address.
ENTREZ_EMAIL = uwvirongs@gmail.com
# taxids that contain these taxids in their lineage will not be built
#    2759 excludes building any eukaryotic sams
#    77133 excludes 'uncultured bacteria' 
# This must be a python formatted list but can be expanded to any number of taxids 
SAM_NO_BUILD_LIST = [2759,77133]
# Should we add the number of reads called to the host to the final Pavian output?
# If set to true we will parse the bowtie logs for the number of original reads and the
# number of reads classified as human. These will then get added back to the pavian output.
# This will create two final Pavian outputs per sample. If set to true you will get one file
# with host filtered reads respiked and another without.
# Please note that in human derived samples this will drastically reduce the % unclassified 
#---Possible Values: True False
ADD_HOST_FILTERED_TO_REPORT = True
# The taxid for which we should classify host filtered reads as when adding them to the Pavian report
HOST_FILTER_TAXID = 9606