Merge pull request #5 from esteinig/0.1.0
0.1.0
esteinig authored Feb 17, 2023
2 parents 17c7916 + 79f9ef2 commit 9b56022
Showing 4 changed files with 823 additions and 110 deletions.
95 changes: 71 additions & 24 deletions src/cli.rs
@@ -1,6 +1,8 @@
use std::ffi::OsStr;
use std::{ffi::OsString, path::PathBuf};
use std::collections::HashSet;
use std::ffi::{OsString, OsStr};
use std::path::PathBuf;
use structopt::StructOpt;
use structopt::clap::AppSettings;
use thiserror::Error;

/// A collection of custom errors relating to the command line interface for this package.
@@ -14,10 +16,10 @@ pub enum CliError {
CompressionLevel(String),
/// Indicates a bad combination of input and output files was passed.
#[error("Bad combination of input and output files: {0}")]
BadInputOutputCombination(String),
BadInputOutputCombination(String)
}

/// MGP-DEPLETE command-line interface
/// Scrubby command-line application
#[derive(Debug, StructOpt)]
pub struct Cli {
#[structopt(subcommand)]
@@ -26,17 +28,17 @@ pub struct Cli {

#[derive(Debug, StructOpt)]
pub enum Commands {
#[structopt(global_settings = &[AppSettings::ColoredHelp, AppSettings::ArgRequiredElseHelp])]
/// Clean sequence data by removing background taxa (k-mer) or host reads (alignment)
Scrub {
ScrubReads {
/// Input filepath(s) (fa, fq, gz, bz).
///
/// For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two
/// files consecutively `-i r1.fq r2.fq`. NOTE: Read identifiers for paired-end Illumina reads
/// files consecutively `-i r1.fq r2.fq`. Read identifiers for paired-end Illumina reads
/// are assumed to be the same in forward and reverse read files (modern format) without trailing
/// read orientations e.g. `/1` and `/2`. If you are using legacy identifiers, reads in the depleted
/// output may be unpaired.
/// read orientations `/1` or `/2`.
#[structopt(
short,
short = "i",
long,
parse(try_from_os_str = check_file_exists),
multiple = true,
@@ -48,24 +50,71 @@ pub enum Commands {
/// For paired Illumina you may either pass this flag twice `-o r1.fq -o r2.fq` or give two
/// files consecutively `-o r1.fq r2.fq`. NOTE: The order of the pairs is assumed to be the
/// same as that given for --input.
#[structopt(short, long, parse(from_os_str), multiple = true, required = true)]
#[structopt(
short = "o",
long,
parse(from_os_str),
multiple = true,
required = true
)]
output: Vec<PathBuf>,
/// Kraken2 database path.
/// Extract reads instead of removing them (--output)
///
/// Specify the path to the Kraken2 database directory.
#[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = false, required = true)]
kraken_db: PathBuf,
/// This flag reverses the depletion and makes the command an extraction process
/// of reads that would otherwise be removed during depletion.
#[structopt(short = "e", long)]
extract: bool,
/// Kraken2 database directory path(s).
///
/// Specify the path to the Kraken2 database directory. This only needs to be specified if you would like to
/// run the Kraken2 analysis; otherwise `--kraken-report` and `--kraken-read` can be used. Note that multiple
/// databases can be specified with `--kraken-db` which will be run in the order in which they are provided.
/// You may either pass this flag twice `-k db1/ -k db2/` or give two paths consecutively `-k db1/ db2/`.
/// If multiple databases are provided, their names must be unique.
///
#[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = true, required = true)]
kraken_db: Vec<PathBuf>,
/// Threads to use for Kraken2
///
/// Specify the number of threads to pass to Kraken2
#[structopt(short = "k", long, default_value = "4")]
/// Specify the number of threads with which to run Kraken2.
#[structopt(short = "j", long, default_value = "4")]
kraken_threads: u32,
/// Taxa to deplete from reads classified with Kraken2.
///
/// You may specify multiple taxon names or taxonomic identifiers by passing this flag
/// multiple times `-t Archaea -t 9606` or by giving taxa consecutively `-t Archaea 9606`.
/// Kraken reports are parsed and every taxon nested below a provided taxon is also
/// marked, so that reads classified at any of those levels are depleted. For example,
/// when providing `Archaea` (Domain), all taxa below the Domain level are collected
/// until the next entry of the same rank or higher is encountered in the report
/// (a sketch of this report walk is shown after this enum). This means that ranks
/// above Domain should generally be specified with `--kraken-taxa-direct` instead.
/// For example, if `cellular organisms` (R2) were given with `--kraken-taxa`, it would
/// also remove `Viruses` (D) if that entry sits below `cellular organisms` in the report.
#[structopt(short = "t", long, multiple = true, required = false)]
kraken_taxa: Vec<String>,
/// Taxa to deplete directly from reads classified with Kraken2.
///
/// Additional taxon names or taxonomic identifiers, for example at ranks above the Domain
/// level used with `--kraken-taxa`, can be specified with this argument. These are added
/// directly to the list of taxa to deplete and are not parsed from the report. For example,
/// to retain Viruses one can specify the domains `-t Archaea -t Bacteria -t Eukaryota` with
/// `--kraken-taxa` and add `-d 'other sequences' -d 'cellular organisms' -d root` with
/// `--kraken-taxa-direct`.
#[structopt(short = "d", long, multiple = true, required = false)]
kraken_taxa_direct: Vec<String>,
/// Working directory containing intermediary files.
///
/// Path to a working directory which contains the alignment and intermediary output files
/// from the programs called during scrubbing.
#[structopt(short, long, parse(from_os_str))]
/// from the programs called during scrubbing. By default, the working directory
/// is named with a timestamp, e.g. `Scrubby_{YYYYMMDDTHHMMSS}`.
#[structopt(short = "w", long, parse(from_os_str))]
workdir: Option<PathBuf>,
/// Keep the working directory and intermediate files
///
/// This flag specifies that the working directory and all intermediate files should
/// be kept; otherwise the working directory is deleted when scrubbing completes.
#[structopt(short = "K", long)]
keep: bool,
/// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma
///
/// Default is to attempt to infer the output compression format automatically from the filename
@@ -80,9 +129,9 @@ pub enum Commands {
hide_possible_values = true
)]
output_format: Option<niffler::compression::Format>,
/// Compression level to use if compressing output
/// Compression level to use if compressing output.
#[structopt(
short = "l",
short = "L",
long,
parse(try_from_str = parse_level),
default_value="6",
@@ -92,7 +141,6 @@
}
}
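For context on the `--kraken-taxa` behaviour documented above, the report walk can be pictured with the following minimal sketch. This is illustration only and not part of this diff: the helper name `collect_taxa_below` is hypothetical, it keys off the two-space indentation of the standard six-column Kraken2 report rather than rank codes, and the actual parsing lives in the scrub module.

// Illustrative sketch: collect every taxon nested under a requested taxon in a
// Kraken2-style report, stopping at the next entry at the same or a shallower depth.
// Assumes tab-separated lines with the taxid in column 5 and the indented name in column 6.
fn collect_taxa_below(report: &str, target: &str) -> Vec<String> {
    let mut collected = Vec::new();
    let mut target_indent: Option<usize> = None;
    for line in report.lines() {
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 6 {
            continue;
        }
        let taxid = fields[4].trim();
        let name_field = fields[5];
        let indent = name_field.len() - name_field.trim_start().len();
        let name = name_field.trim();
        // An entry at the same or a shallower indentation closes the current clade window.
        if let Some(t) = target_indent {
            if indent <= t {
                target_indent = None;
            }
        }
        if target_indent.is_some() || name == target || taxid == target {
            if target_indent.is_none() {
                target_indent = Some(indent);
            }
            collected.push(name.to_string());
        }
    }
    collected
}

Taxa gathered this way end up on the same depletion list to which `--kraken-taxa-direct` entries are appended verbatim, without any report parsing.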

// Functions may be heavily adapted from Rasusa, due to the excellent error annotation style
impl Cli {
/// Checks there is a valid and equal number of `--input` and `--output` arguments given.
///
@@ -102,7 +150,7 @@ impl Cli {
/// - An unequal number of `--input` and `--output` are passed
pub fn validate_input_output_combination(&self) -> Result<(), CliError> {
match &self.commands {
Commands::Scrub { input, output, .. } => {
Commands::ScrubReads { input, output, .. } => {
let out_len = output.len();
let in_len = input.len();
if in_len > 2 {
@@ -129,7 +177,7 @@ fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> {
let path = PathBuf::from(file);
let path_msg = format!("{:?} does not exist", path);
if path.exists() {
let abs_path = std::fs::canonicalize(path).map_err(|_| OsString::from(path_msg))?;
let abs_path = path.canonicalize().map_err(|_| OsString::from(path_msg))?;
Ok(abs_path)
} else {
Err(OsString::from(path_msg))
@@ -164,4 +212,3 @@ fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
};
Ok(lvl)
}
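The body of `parse_level` is collapsed in this hunk. For orientation, one plausible shape of the mapping (a sketch assuming niffler's `Level::One` through `Level::Nine` variants, not necessarily the committed code) is:

// Sketch only: map "1".."9" onto a niffler compression level, falling back to
// the crate's CliError::CompressionLevel variant for anything else.
fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
    let lvl = match s.parse::<u8>() {
        Ok(1) => niffler::Level::One,
        Ok(2) => niffler::Level::Two,
        Ok(3) => niffler::Level::Three,
        Ok(4) => niffler::Level::Four,
        Ok(5) => niffler::Level::Five,
        Ok(6) => niffler::Level::Six,
        Ok(7) => niffler::Level::Seven,
        Ok(8) => niffler::Level::Eight,
        Ok(9) => niffler::Level::Nine,
        _ => return Err(CliError::CompressionLevel(s.to_string())),
    };
    Ok(lvl)
}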

97 changes: 81 additions & 16 deletions src/main.rs
@@ -1,54 +1,119 @@
use anyhow::Result;
use thiserror::Error;
use structopt::StructOpt;
use std::path::PathBuf;
use chrono::Local;
use env_logger::Builder;
use log::LevelFilter;
use std::io::Write;
use log::{LevelFilter, Level};
use env_logger::fmt::Color;
use std::{io::Write, path::PathBuf};

mod scrub;
mod cli;
mod utils;



#[derive(Error, Debug)]
pub enum ScrubbyError {
/// Represents a failure to convert the `OsStr` of a `PathBuf` into a `String`
#[error("incorrect format of the input database path, are there non-UTF8 characters?")]
InvalidDatabasePath,
/// Indicates a failure to obtain an absolute path
#[error("database name could not be obtained from {0}")]
DatabaseNameExtraction(String),
}

fn main() -> Result<()> {
let cli = cli::Cli::from_args();

// Command specific checks - scrubbing

// Command line application specific checks
cli.validate_input_output_combination()?;

Builder::new()
.format(|buf, record| {
writeln!(buf,
let timestamp = buf.timestamp();

let mut red_style = buf.style();
red_style.set_color(Color::Red).set_bold(true);
let mut green_style = buf.style();
green_style.set_color(Color::Green).set_bold(true);
let mut white_style = buf.style();
white_style.set_color(Color::White).set_bold(false);
let mut orange_style = buf.style();
orange_style.set_color(Color::Rgb(255, 102, 0)).set_bold(true);
let mut apricot_style = buf.style();
apricot_style.set_color(Color::Rgb(255, 195, 0)).set_bold(true);

let msg = match record.level(){
Level::Warn => (orange_style.value(record.level()), orange_style.value(record.args())),
Level::Info => (green_style.value(record.level()), white_style.value(record.args())),
Level::Debug => (apricot_style.value(record.level()), apricot_style.value(record.args())),
Level::Error => (red_style.value(record.level()), red_style.value(record.args())),
_ => (white_style.value(record.level()), white_style.value(record.args()))
};

writeln!(
buf,
"{} [{}] - {}",
Local::now().format("%Y-%m-%dT%H:%M:%S"),
record.level(),
record.args()
white_style.value(timestamp),
msg.0,
msg.1
)
})
.filter(None, LevelFilter::Info)
.init();
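// For reference, with this format an INFO record renders roughly as follows
// (colours aside; the exact timestamp style is whatever env_logger's default produces):
//   2023-02-17T09:30:00Z [INFO] - Welcome to Scrubby, your trusty read scrubber!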


match cli.commands {
cli::Commands::Scrub {
cli::Commands::ScrubReads {
input,
output,
workdir,
extract,
keep,
kraken_db,
kraken_threads,
kraken_taxa,
kraken_taxa_direct,
output_format,
compression_level,
} => {

let scrubber = scrub::Scrubber::new(workdir)?;
let scrubber = scrub::Scrubber::new(workdir, output_format, compression_level)?;

log::info!("=============================================");
log::info!("Welcome to Scrubby! You name it, we clean it.");
log::info!("=============================================");

log::info!("Welcome to Scrubby, your trusty read scrubber!");
scrubber.run_kraken(&input, kraken_db, kraken_threads);
let mut read_files = input;
for (db_index, db_path) in kraken_db.into_iter().enumerate() {
// Extract the database name from path and run Kraken2
let db_name = get_db_name(&db_path)?;
let kraken_files = scrubber.run_kraken(&read_files, &db_path, &db_name, &db_index, &kraken_threads)?;
// These are either depleted or extracted reads - for depleted reads, we use the depleted reads as input for the next iteration
// but for extracted reads, we do not want to re-extract reads with another database [https://github.com/esteinig/scrubby/issues/2]
read_files = scrubber.deplete_kraken(&read_files, &db_name, &db_index, &false, &kraken_files, &kraken_taxa, &kraken_taxa_direct)?;
}
// Iterating again over the depleted record files to produce the user-specified outputs
// because we want to ensure they are properly compressed (intermediate files are not)
scrubber.write_outputs(read_files, output)?;
// If we do not want to keep the intermediary files in `workdir`, delete the directory
scrubber.clean_up(keep)?;



}
}

log::info!("Scrub scrub, scrubbity-scrub! Your sequence data, only cleaner!");
log::info!("==============================================================");
log::info!("Thank you for using Scrubby! Your sequence data, only cleaner.");
log::info!("==============================================================");

Ok(())
}

// Utility function to extract the database name as valid UTF-8
fn get_db_name(db_path: &PathBuf) -> Result<String, ScrubbyError> {
match db_path.file_name(){
Some(name) => Ok(name.to_os_string().into_string().map_err(|_| ScrubbyError::InvalidDatabasePath)?),
None => return Err(ScrubbyError::DatabaseNameExtraction(format!("{:?}", db_path)))
}
}
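As a quick illustration of the helper above (hypothetical paths, assuming the calls sit in the same module):

// Sketch of expected get_db_name behaviour with made-up paths.
fn _db_name_examples() -> Result<(), ScrubbyError> {
    use std::path::PathBuf;
    // The final path component becomes the database name...
    assert_eq!(get_db_name(&PathBuf::from("/data/kraken2/viral"))?, "viral");
    // ...while a path without a file-name component (e.g. the filesystem root) is an error.
    assert!(get_db_name(&PathBuf::from("/")).is_err());
    Ok(())
}

The unique-name requirement noted for `--kraken-db` presumably exists because these names, together with the database index, label the per-database intermediate files in the working directory.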
