diff --git a/.gitignore b/.gitignore index 088ba6b..8fb4d6a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Generated by Cargo # will have compiled files and executables /target/ - +/eval/ # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..69fb479 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "scrubby" +version = "0.1.0" +authors = ["esteinig "] +description = "Scrubby: remove background taxa and host sequences from reads" +documentation = "https://github.com/esteinig/scrubby" +homepage = "https://github.com/esteinig/scrubby" +repository = "https://github.com/esteinig/scrubby" +readme = "README.md" +keywords = ["meta-gp", "host", "background", "depletion", "metagenomics", "diagnostics"] +categories = ["science"] +license = "MIT" +edition = "2018" +include = [ + "**/*.rs", + "src/data/*", + "Cargo.toml" +] + +[dependencies] +anyhow = "1.0" +structopt = "0.3" +clap = "2.33.0" +thiserror = "1.0" +crossterm = "0.23.0" +itertools = "0.10.3" +tabled = "0.5.0" +indicatif = "0.16.2" +env_logger = "0.9.0" +rust-htslib = "0.38" +needletail = "0.4.1" +niffler = "2.3" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +log = "0.4" +chrono = "0.4" +rand = "0.8.5" + +[dev-dependencies] +assert_cmd = "2.0.1" +predicates = "1" +float_eq = "0.6.1" + +[[bin]] +name = "scrubby" +path = "src/main.rs" \ No newline at end of file diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..3899ea8 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,167 @@ +use std::ffi::OsStr; +use std::{ffi::OsString, path::PathBuf}; +use structopt::StructOpt; +use thiserror::Error; + +/// A collection of custom errors relating to the command line interface for this package. 
+#[derive(Error, Debug, PartialEq)]
+pub enum CliError {
+    /// Indicates that a string cannot be parsed into a [`CompressionFormat`](#compressionformat).
+    #[error("{0} is not a valid output format")]
+    CompressionFormat(String),
+    /// Indicates that a string cannot be parsed into a [`CompressionLevel`](#compressionlevel).
+    #[error("{0} is not a valid compression level (1-9)")]
+    CompressionLevel(String),
+    /// Indicates a bad combination of input and output files was passed.
+    #[error("Bad combination of input and output files: {0}")]
+    BadInputOutputCombination(String),
+}
+
+/// MGP-DEPLETE command-line interface
+#[derive(Debug, StructOpt)]
+pub struct Cli {
+    #[structopt(subcommand)]
+    pub commands: Commands,
+}
+
+#[derive(Debug, StructOpt)]
+pub enum Commands {
+    /// Clean sequence data by removing background taxa (k-mer) or host reads (alignment)
+    Scrub {
+        /// Input filepath(s) (fa, fq, gz, bz).
+        ///
+        /// For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two
+        /// files consecutively `-i r1.fq r2.fq`. NOTE: Read identifiers for paired-end Illumina reads
+        /// are assumed to be the same in forward and reverse read files (modern format) without trailing
+        /// read orientations e.g. `/1` and `/2`. If you are using legacy identifiers, reads in the depleted
+        /// output may be unpaired.
+        #[structopt(
+            short,
+            long,
+            parse(try_from_os_str = check_file_exists),
+            multiple = true,
+            required = true
+        )]
+        input: Vec<PathBuf>,
+        /// Output filepath(s) with contaminated reads removed.
+        ///
+        /// For paired Illumina you may either pass this flag twice `-o r1.fq -o r2.fq` or give two
+        /// files consecutively `-o r1.fq r2.fq`. NOTE: The order of the pairs is assumed to be the
+        /// same as that given for --input.
+        #[structopt(short, long, parse(from_os_str), multiple = true, required = true)]
+        output: Vec<PathBuf>,
+        /// Kraken2 database path.
+        ///
+        /// Specify the path to the Kraken2 database directory.
+        #[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = false, required = true)]
+        kraken_db: PathBuf,
+        /// Threads to use for Kraken2
+        ///
+        /// Specify the number of threads to pass to Kraken2
+        // NOTE(review): this previously declared `short = "k"`, colliding with
+        // --kraken-db above; clap panics at parser construction on duplicate
+        // short flags, so the binary could never start. Changed to `-t`.
+        #[structopt(short = "t", long, default_value = "4")]
+        kraken_threads: u32,
+        /// Working directory containing intermediary files.
+        ///
+        /// Path to a working directory which contains the alignment and intermediary output files
+        /// from the programs called during scrubbing.
+        #[structopt(short, long, parse(from_os_str))]
+        workdir: Option<PathBuf>,
+        /// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma
+        ///
+        /// Default is to attempt to infer the output compression format automatically from the filename
+        /// extension (gz|bz|bz2|lzma). This option is used to override that.
+        #[structopt(
+            short = "O",
+            long,
+            value_name = "u|b|g|l",
+            parse(try_from_str = parse_compression_format),
+            possible_values = &["u", "b", "g", "l"],
+            case_insensitive=true,
+            hide_possible_values = true
+        )]
+        output_format: Option<niffler::Format>,
+        /// Compression level to use if compressing output
+        #[structopt(
+            short = "l",
+            long,
+            parse(try_from_str = parse_level),
+            default_value="6",
+            value_name = "1-9"
+        )]
+        compression_level: niffler::Level,
+    }
+}
+
+// Functions may be heavily adapted from Rasusa, due to the excellent error annotation style
+impl Cli {
+    /// Checks there is a valid and equal number of `--input` and `--output` arguments given.
+    ///
+    /// # Errors
+    /// A [`CliError::BadInputOutputCombination`](#clierror) is returned for the following:
+    /// - Either `--input` or `--output` are passed more than twice
+    /// - An unequal number of `--input` and `--output` are passed
+    pub fn validate_input_output_combination(&self) -> Result<(), CliError> {
+        match &self.commands {
+            Commands::Scrub { input, output, ..
+            } => {
+                let out_len = output.len();
+                let in_len = input.len();
+                if in_len > 2 {
+                    let msg = String::from("Got more than 2 files for input.");
+                    return Err(CliError::BadInputOutputCombination(msg));
+                }
+                if out_len > 2 {
+                    let msg = String::from("Got more than 2 files for output.");
+                    return Err(CliError::BadInputOutputCombination(msg));
+                }
+                if in_len != out_len {
+                    let msg = format!("Got {} --input but {} --output", in_len, out_len);
+                    return Err(CliError::BadInputOutputCombination(msg));
+                }
+            }
+        };
+        Ok(())
+    }
+}
+
+
+/// A utility function to validate that an input file exists.
+///
+/// Returns the canonical (absolute) path on success; the error is the
+/// message string structopt displays to the user.
+fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> {
+    let path = PathBuf::from(file);
+    let path_msg = format!("{:?} does not exist", path);
+    if path.exists() {
+        let abs_path = std::fs::canonicalize(path).map_err(|_| OsString::from(path_msg))?;
+        Ok(abs_path)
+    } else {
+        Err(OsString::from(path_msg))
+    }
+}
+
+/// A utility function to validate compression format is in allowed values
+fn parse_compression_format(s: &str) -> Result<niffler::Format, CliError> {
+    match s {
+        "b" | "B" => Ok(niffler::Format::Bzip),
+        "g" | "G" => Ok(niffler::Format::Gzip),
+        "l" | "L" => Ok(niffler::Format::Lzma),
+        "u" | "U" => Ok(niffler::Format::No),
+        _ => Err(CliError::CompressionFormat(s.to_string())),
+    }
+}
+
+/// A utility function to validate compression level is in allowed range
+#[allow(clippy::redundant_clone)]
+fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
+    let lvl = match s.parse::<u8>() {
+        Ok(1) => niffler::Level::One,
+        Ok(2) => niffler::Level::Two,
+        Ok(3) => niffler::Level::Three,
+        Ok(4) => niffler::Level::Four,
+        Ok(5) => niffler::Level::Five,
+        Ok(6) => niffler::Level::Six,
+        Ok(7) => niffler::Level::Seven,
+        Ok(8) => niffler::Level::Eight,
+        Ok(9) => niffler::Level::Nine,
+        _ => return Err(CliError::CompressionLevel(s.to_string())),
+    };
+    Ok(lvl)
+}
+
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f48f6ca
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,54 @@
+use anyhow::Result;
+use
thiserror::Error;
+use structopt::StructOpt;
+use std::path::PathBuf;
+use chrono::Local;
+use env_logger::Builder;
+use log::LevelFilter;
+use std::io::Write;
+mod scrub;
+mod cli;
+
+fn main() -> Result<()> {
+    let cli = cli::Cli::from_args();
+
+    // Command specific checks - scrubbing
+
+    cli.validate_input_output_combination()?;
+
+    Builder::new()
+        .format(|buf, record| {
+            writeln!(buf,
+                "{} [{}] - {}",
+                Local::now().format("%Y-%m-%dT%H:%M:%S"),
+                record.level(),
+                record.args()
+            )
+        })
+        .filter(None, LevelFilter::Info)
+        .init();
+
+
+    match cli.commands {
+        cli::Commands::Scrub {
+            input,
+            output,
+            workdir,
+            kraken_db,
+            kraken_threads,
+            output_format,
+            compression_level,
+        } => {
+
+            let scrubber = scrub::Scrubber::new(workdir)?;
+
+            log::info!("Welcome to Scrubby, your trusty read scrubber!");
+            // Propagate scrubbing failures: previously the Result was silently
+            // dropped, so a failed Kraken2 run still exited with status 0.
+            scrubber.run_kraken(&input, kraken_db, kraken_threads)?;
+
+        }
+    }
+
+    log::info!("Scrub scrub, scrubbity-scrub! Your sequence data, only cleaner!");
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/src/scrub.rs b/src/scrub.rs
new file mode 100644
index 0000000..689db37
--- /dev/null
+++ b/src/scrub.rs
@@ -0,0 +1,128 @@
+use anyhow::{Result, Context};
+use std::path::PathBuf;
+use needletail::{parse_fastx_file, Sequence};
+use std::str::from_utf8;
+use std::process::Command;
+use thiserror::Error;
+use chrono::Local;
+use std::fs;
+use std::fs::{create_dir_all, canonicalize};
+
+// See: https://nick.groenen.me/posts/rust-error-handling/
+
+/// ScrubberError enumerates all possible errors returned by this library.
+#[derive(Error, Debug)] +pub enum ScrubberError { + /// Represents a failure to execute Kraken2 + #[error("execution error - is Kraken2 installed?")] + KrakenError, + /// Represents a failure in the correct length of the input file vector + #[error("input file error - incorrect number of input files")] + FileNumberError, + /// Represents a failure to convert a PathBuf converted OsStr into a String + /// This is because into_string() returns Result) -> Result { + let _workdir = check_or_create_workdir(workdir)?; + Ok(Self { workdir: _workdir }) + } + pub fn run_kraken( + &self, + input: &Vec, + kraken_db: PathBuf, + kraken_threads: u32, + ) -> Result<(), ScrubberError>{ + + // Safely build the arguments for Kraken2 + + let kraken_db_path = kraken_db.into_os_string().into_string().map_err(|_| ScrubberError::InvalidFilePathConversion)?; + + let file_args = match input.len() { + 2 => { + let file1 = input[0].clone().into_os_string().into_string().map_err(|_| ScrubberError::InvalidFilePathConversion)?; + let file2 = input[1].clone().into_os_string().into_string().map_err(|_| ScrubberError::InvalidFilePathConversion)?; + [Some(file1), Some(file2)] + }, + 1 => [Some(input[0].clone().into_os_string().into_string().map_err(|_| ScrubberError::InvalidFilePathConversion)?), None], + _ => return Err(ScrubberError::FileNumberError), + }; + + let paired_arg = match input.len() { + 2 => Some("--paired"), + 1 => None, + _ => return Err(ScrubberError::FileNumberError), + }; + + let kraken_threads_arg = kraken_threads.to_string(); + let mut kraken_args = Vec::from(["--threads".to_string(), kraken_threads_arg, "--db".to_string(), kraken_db_path]); + + match paired_arg { + Some(value) => kraken_args.push(value.to_string()), + None => {} + }; + + for file in file_args { + match file { + Some(value) => { + kraken_args.push(value) + } + None => {} + } + }; + + // Run the Kraken command + + let output = Command::new("kraken2") + .args(kraken_args) + .current_dir(&self.workdir) + .output() 
+            .map_err(|_| ScrubberError::KrakenError)?;
+
+        println!("status: {}", output.status);
+        println!("stdout: {}", String::from_utf8_lossy(&output.stdout));
+        println!("stderr: {}", String::from_utf8_lossy(&output.stderr));
+
+        Ok(())
+    }
+
+}
+
+/// Checks if work directory exists and otherwise creates it - return the actual path for the application
+///
+/// # Errors
+/// A [`ScrubberError::WorkdirExists`](#clierror) is returned if the directory already exists
+/// A [`ScrubberError::WorkdirCreateFailure`](#clierror) is returned if the directory cannot be created
+/// A [`ScrubberError::AbsolutePathFailure`](#clierror) is returned if the created directory cannot be canonicalized
+pub fn check_or_create_workdir(workdir: Option<PathBuf>) -> Result<PathBuf, ScrubberError> {
+    let _workdir = match workdir {
+        Some(path) => path,
+        None => PathBuf::from(format!("scrubby_{:}", Local::now().format("%Y%m%dT%H%M%S")))
+    };
+    // Refuse to reuse an existing directory so previous outputs are never clobbered
+    if _workdir.exists() {
+        return Err(ScrubberError::WorkdirExists(format!("{:?}", _workdir)));
+    }
+    // Create BEFORE canonicalizing: std::fs::canonicalize errors on paths that
+    // do not exist yet, so the previous order (canonicalize first) could never
+    // succeed for a fresh workdir and always returned AbsolutePathFailure.
+    create_dir_all(&_workdir).map_err(|_| ScrubberError::WorkdirCreateFailure(format!("{:?}", _workdir)))?;
+    let abs_workdir = canonicalize(&_workdir).map_err(|_| ScrubberError::AbsolutePathFailure(format!("{:?}", _workdir)))?;
+    Ok(abs_workdir)
+}
\ No newline at end of file