Merge pull request #5 from esteinig/0.1.0
0.1.0
esteinig authored Feb 17, 2023
2 parents 17c7916 + 79f9ef2 commit 9b56022
Showing 4 changed files with 823 additions and 110 deletions.
95 changes: 71 additions & 24 deletions src/cli.rs
@@ -1,6 +1,8 @@
use std::ffi::OsStr;
use std::{ffi::OsString, path::PathBuf};
use std::collections::HashSet;
use std::ffi::{OsString, OsStr};
use std::path::PathBuf;
use structopt::StructOpt;
use structopt::clap::AppSettings;
use thiserror::Error;

/// A collection of custom errors relating to the command line interface for this package.
@@ -14,10 +16,10 @@ pub enum CliError {
CompressionLevel(String),
/// Indicates a bad combination of input and output files was passed.
#[error("Bad combination of input and output files: {0}")]
BadInputOutputCombination(String),
BadInputOutputCombination(String)
}

/// MGP-DEPLETE command-line interface
/// Scrubby command-line application
#[derive(Debug, StructOpt)]
pub struct Cli {
#[structopt(subcommand)]
@@ -26,17 +28,17 @@ pub struct Cli {

#[derive(Debug, StructOpt)]
pub enum Commands {
#[structopt(global_settings = &[AppSettings::ColoredHelp, AppSettings::ArgRequiredElseHelp])]
/// Clean sequence data by removing background taxa (k-mer) or host reads (alignment)
Scrub {
ScrubReads {
/// Input filepath(s) (fa, fq, gz, bz).
///
/// For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two
/// files consecutively `-i r1.fq r2.fq`. NOTE: Read identifiers for paired-end Illumina reads
/// files consecutively `-i r1.fq r2.fq`. Read identifiers for paired-end Illumina reads
/// are assumed to be the same in forward and reverse read files (modern format) without trailing
/// read orientations e.g. `/1` and `/2`. If you are using legacy identifiers, reads in the depleted
/// output may be unpaired.
/// read orientations `/1` or `/2`.
#[structopt(
short,
short = "i",
long,
parse(try_from_os_str = check_file_exists),
multiple = true,
@@ -48,24 +50,71 @@ pub enum Commands {
/// For paired Illumina you may either pass this flag twice `-o r1.fq -o r2.fq` or give two
/// files consecutively `-o r1.fq r2.fq`. NOTE: The order of the pairs is assumed to be the
/// same as that given for --input.
#[structopt(short, long, parse(from_os_str), multiple = true, required = true)]
#[structopt(
short = "o",
long,
parse(from_os_str),
multiple = true,
required = true
)]
output: Vec<PathBuf>,
/// Kraken2 database path.
/// Extract reads instead of removing them (--output)
///
/// Specify the path to the Kraken2 database directory.
#[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = false, required = true)]
kraken_db: PathBuf,
/// This flag reverses the depletion and makes the command an extraction process
/// of reads that would otherwise be removed during depletion.
#[structopt(short = "e", long)]
extract: bool,
/// Kraken2 database directory path(s).
///
/// Specify the path to the Kraken2 database directory. This only needs to be specified if you would like to
/// run the Kraken2 analysis; otherwise `--kraken-report` and `--kraken-read` can be used. Note that multiple
/// databases can be specified with `--kraken-db` which will be run in the order in which they are provided.
/// You may either pass this flag twice `-k db1/ -k db2/` or give two paths consecutively `-k db1/ db2/`.
/// If multiple databases are provided, their names must be unique.
///
#[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = true, required = true)]
kraken_db: Vec<PathBuf>,
/// Threads to use for Kraken2
///
/// Specify the number of threads to pass to Kraken2
#[structopt(short = "k", long, default_value = "4")]
/// Specify the number of threads with which to run Kraken2.
#[structopt(short = "j", long, default_value = "4")]
kraken_threads: u32,
/// Taxa to deplete from reads classified with Kraken2.
///
/// You may specify multiple taxon names or taxonomic identifiers by passing this flag
/// multiple times `-t Archaea -t 9606` or by giving taxa consecutively `-t Archaea 9606`.
/// Kraken reports are parsed and every taxon nested below a provided taxon is also
/// marked, so that reads classified at any of those levels are depleted. For example,
/// when providing `Archaea` (Domain), all taxa below the Domain level are collected
/// until the next entry of the same rank or higher is encountered in the report
/// (a sketch of this report walk is shown after this enum). This means that ranks
/// above Domain should generally be specified with `--kraken-taxa-direct` instead.
/// For example, if `cellular organisms` (R2) were given with `--kraken-taxa`, it would
/// also remove `Viruses` (D) if that entry sits below `cellular organisms` in the report.
#[structopt(short = "t", long, multiple = true, required = false)]
kraken_taxa: Vec<String>,
/// Taxa to deplete directly from reads classified with Kraken2.
///
/// Additional taxon names or taxonomic identifiers, for example at ranks above the Domain
/// level used with `--kraken-taxa`, can be specified with this argument. These are added
/// directly to the list of taxa to deplete and are not parsed from the report. For example,
/// to retain Viruses one can specify the domains `-t Archaea -t Bacteria -t Eukaryota` with
/// `--kraken-taxa` and add `-d 'other sequences' -d 'cellular organisms' -d root` with
/// `--kraken-taxa-direct`.
#[structopt(short = "d", long, multiple = true, required = false)]
kraken_taxa_direct: Vec<String>,
/// Working directory containing intermediary files.
///
/// Path to a working directory which contains the alignment and intermediary output files
/// from the programs called during scrubbing.
#[structopt(short, long, parse(from_os_str))]
/// from the programs called during scrubbing. By default, the working directory
/// is named with a timestamp, e.g. `Scrubby_{YYYYMMDDTHHMMSS}`.
#[structopt(short = "w", long, parse(from_os_str))]
workdir: Option<PathBuf>,
/// Keep the working directory and intermediate files
///
/// This flag specifies that the working directory and all intermediate files should
/// be kept; otherwise the working directory is deleted when scrubbing completes.
#[structopt(short = "K", long)]
keep: bool,
/// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma
///
/// Default is to attempt to infer the output compression format automatically from the filename
@@ -80,9 +129,9 @@ pub enum Commands {
hide_possible_values = true
)]
output_format: Option<niffler::compression::Format>,
/// Compression level to use if compressing output
/// Compression level to use if compressing output.
#[structopt(
short = "l",
short = "L",
long,
parse(try_from_str = parse_level),
default_value="6",
@@ -92,7 +141,6 @@
}
}
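For context on the `--kraken-taxa` behaviour documented above, the report walk can be pictured with the following minimal sketch. This is illustration only and not part of this diff: the helper name `collect_taxa_below` is hypothetical, it keys off the two-space indentation of the standard six-column Kraken2 report rather than rank codes, and the actual parsing lives in the scrub module.

// Illustrative sketch: collect every taxon nested under a requested taxon in a
// Kraken2-style report, stopping at the next entry at the same or a shallower depth.
// Assumes tab-separated lines with the taxid in column 5 and the indented name in column 6.
fn collect_taxa_below(report: &str, target: &str) -> Vec<String> {
    let mut collected = Vec::new();
    let mut target_indent: Option<usize> = None;
    for line in report.lines() {
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 6 {
            continue;
        }
        let taxid = fields[4].trim();
        let name_field = fields[5];
        let indent = name_field.len() - name_field.trim_start().len();
        let name = name_field.trim();
        // An entry at the same or a shallower indentation closes the current clade window.
        if let Some(t) = target_indent {
            if indent <= t {
                target_indent = None;
            }
        }
        if target_indent.is_some() || name == target || taxid == target {
            if target_indent.is_none() {
                target_indent = Some(indent);
            }
            collected.push(name.to_string());
        }
    }
    collected
}

Taxa gathered this way end up on the same depletion list to which `--kraken-taxa-direct` entries are appended verbatim, without any report parsing.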

// Functions may be heavily adapted from Rasusa, due to the excellent error annotation style
impl Cli {
/// Checks there is a valid and equal number of `--input` and `--output` arguments given.
///
@@ -102,7 +150,7 @@ impl Cli {
/// - An unequal number of `--input` and `--output` are passed
pub fn validate_input_output_combination(&self) -> Result<(), CliError> {
match &self.commands {
Commands::Scrub { input, output, .. } => {
Commands::ScrubReads { input, output, .. } => {
let out_len = output.len();
let in_len = input.len();
if in_len > 2 {
@@ -129,7 +177,7 @@ fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> {
let path = PathBuf::from(file);
let path_msg = format!("{:?} does not exist", path);
if path.exists() {
let abs_path = std::fs::canonicalize(path).map_err(|_| OsString::from(path_msg))?;
let abs_path = path.canonicalize().map_err(|_| OsString::from(path_msg))?;
Ok(abs_path)
} else {
Err(OsString::from(path_msg))
@@ -164,4 +212,3 @@ fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
};
Ok(lvl)
}
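The body of `parse_level` is collapsed in this hunk. For orientation, one plausible shape of the mapping (a sketch assuming niffler's `Level::One` through `Level::Nine` variants, not necessarily the committed code) is:

// Sketch only: map "1".."9" onto a niffler compression level, falling back to
// the crate's CliError::CompressionLevel variant for anything else.
fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
    let lvl = match s.parse::<u8>() {
        Ok(1) => niffler::Level::One,
        Ok(2) => niffler::Level::Two,
        Ok(3) => niffler::Level::Three,
        Ok(4) => niffler::Level::Four,
        Ok(5) => niffler::Level::Five,
        Ok(6) => niffler::Level::Six,
        Ok(7) => niffler::Level::Seven,
        Ok(8) => niffler::Level::Eight,
        Ok(9) => niffler::Level::Nine,
        _ => return Err(CliError::CompressionLevel(s.to_string())),
    };
    Ok(lvl)
}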

97 changes: 81 additions & 16 deletions src/main.rs
@@ -1,54 +1,119 @@
use anyhow::Result;
use thiserror::Error;
use structopt::StructOpt;
use std::path::PathBuf;
use chrono::Local;
use env_logger::Builder;
use log::LevelFilter;
use std::io::Write;
use log::{LevelFilter, Level};
use env_logger::fmt::Color;
use std::{io::Write, path::PathBuf};

mod scrub;
mod cli;
mod utils;



#[derive(Error, Debug)]
pub enum ScrubbyError {
/// Represents a failure to convert the `OsStr` of a `PathBuf` into a `String`
#[error("incorrect format of the input database path, are there non-UTF8 characters?")]
InvalidDatabasePath,
/// Indicates a failure to obtain an absolute path
#[error("database name could not be obtained from {0}")]
DatabaseNameExtraction(String),
}

fn main() -> Result<()> {
let cli = cli::Cli::from_args();

// Command specific checks - scrubbing

// Command line application specific checks
cli.validate_input_output_combination()?;

Builder::new()
.format(|buf, record| {
writeln!(buf,
let timestamp = buf.timestamp();

let mut red_style = buf.style();
red_style.set_color(Color::Red).set_bold(true);
let mut green_style = buf.style();
green_style.set_color(Color::Green).set_bold(true);
let mut white_style = buf.style();
white_style.set_color(Color::White).set_bold(false);
let mut orange_style = buf.style();
orange_style.set_color(Color::Rgb(255, 102, 0)).set_bold(true);
let mut apricot_style = buf.style();
apricot_style.set_color(Color::Rgb(255, 195, 0)).set_bold(true);

let msg = match record.level(){
Level::Warn => (orange_style.value(record.level()), orange_style.value(record.args())),
Level::Info => (green_style.value(record.level()), white_style.value(record.args())),
Level::Debug => (apricot_style.value(record.level()), apricot_style.value(record.args())),
Level::Error => (red_style.value(record.level()), red_style.value(record.args())),
_ => (white_style.value(record.level()), white_style.value(record.args()))
};

writeln!(
buf,
"{} [{}] - {}",
Local::now().format("%Y-%m-%dT%H:%M:%S"),
record.level(),
record.args()
white_style.value(timestamp),
msg.0,
msg.1
)
})
.filter(None, LevelFilter::Info)
.init();
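// For reference, with this format an INFO record renders roughly as follows
// (colours aside; the exact timestamp style is whatever env_logger's default produces):
//   2023-02-17T09:30:00Z [INFO] - Welcome to Scrubby, your trusty read scrubber!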


match cli.commands {
cli::Commands::Scrub {
cli::Commands::ScrubReads {
input,
output,
workdir,
extract,
keep,
kraken_db,
kraken_threads,
kraken_taxa,
kraken_taxa_direct,
output_format,
compression_level,
} => {

let scrubber = scrub::Scrubber::new(workdir)?;
let scrubber = scrub::Scrubber::new(workdir, output_format, compression_level)?;

log::info!("=============================================");
log::info!("Welcome to Scrubby! You name it, we clean it.");
log::info!("=============================================");

log::info!("Welcome to Scrubby, your trusty read scrubber!");
scrubber.run_kraken(&input, kraken_db, kraken_threads);
let mut read_files = input;
for (db_index, db_path) in kraken_db.into_iter().enumerate() {
// Extract the database name from path and run Kraken2
let db_name = get_db_name(&db_path)?;
let kraken_files = scrubber.run_kraken(&read_files, &db_path, &db_name, &db_index, &kraken_threads)?;
// These are either depleted or extracted reads - for depleted reads, we use the depleted reads as input for the next iteration
// but for extracted reads, we do not want to re-extract reads with another database [https://github.com/esteinig/scrubby/issues/2]
read_files = scrubber.deplete_kraken(&read_files, &db_name, &db_index, &false, &kraken_files, &kraken_taxa, &kraken_taxa_direct)?;
}
// Iterating again over the depleted record files to produce the user-specified outputs
// because we want to ensure they are properly compressed (intermediate files are not)
scrubber.write_outputs(read_files, output)?;
// If we do not want to keep the intermediary files in `workdir`, delete the directory
scrubber.clean_up(keep)?;



}
}

log::info!("Scrub scrub, scrubbity-scrub! Your sequence data, only cleaner!");
log::info!("==============================================================");
log::info!("Thank you for using Scrubby! Your sequence data, only cleaner.");
log::info!("==============================================================");

Ok(())
}

// Utility function to extract the database name as valid UTF-8
fn get_db_name(db_path: &PathBuf) -> Result<String, ScrubbyError> {
match db_path.file_name(){
Some(name) => Ok(name.to_os_string().into_string().map_err(|_| ScrubbyError::InvalidDatabasePath)?),
None => return Err(ScrubbyError::DatabaseNameExtraction(format!("{:?}", db_path)))
}
}
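As a quick illustration of the helper above (hypothetical paths, assuming the calls sit in the same module):

// Sketch of expected get_db_name behaviour with made-up paths.
fn _db_name_examples() -> Result<(), ScrubbyError> {
    use std::path::PathBuf;
    // The final path component becomes the database name...
    assert_eq!(get_db_name(&PathBuf::from("/data/kraken2/viral"))?, "viral");
    // ...while a path without a file-name component (e.g. the filesystem root) is an error.
    assert!(get_db_name(&PathBuf::from("/")).is_err());
    Ok(())
}

The unique-name requirement noted for `--kraken-db` presumably exists because these names, together with the database index, label the per-database intermediate files in the working directory.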
