Skip to content

Commit

Permalink
add kraken execution and command line client
Browse files Browse the repository at this point in the history
  • Loading branch information
esteinig committed Feb 11, 2023
1 parent f2a45b9 commit 17c7916
Show file tree
Hide file tree
Showing 5 changed files with 396 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by Cargo
# will have compiled files and executables
/target/

/eval/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
Expand Down
46 changes: 46 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Crate metadata for the `scrubby` command-line tool.
[package]
name = "scrubby"
version = "0.1.0"
authors = ["esteinig <eike.steinig@unimelb.edu.au>"]
description = "Scrubby: remove background taxa and host sequences from reads"
documentation = "https://github.com/esteinig/scrubby"
homepage = "https://github.com/esteinig/scrubby"
repository = "https://github.com/esteinig/scrubby"
readme = "README.md"
# NOTE(review): crates.io accepts at most five keywords per crate and six are listed
# here; this would need trimming before publishing — confirm against the manifest reference.
keywords = ["meta-gp", "host", "background", "depletion", "metagenomics", "diagnostics"]
categories = ["science"]
license = "MIT"
edition = "2018"
# Files packaged with the crate: all Rust sources, bundled data and this manifest.
include = [
"**/*.rs",
"src/data/*",
"Cargo.toml"
]

# Runtime dependencies.
[dependencies]
anyhow = "1.0"
structopt = "0.3"
clap = "2.33.0"
thiserror = "1.0"
crossterm = "0.23.0"
itertools = "0.10.3"
tabled = "0.5.0"
indicatif = "0.16.2"
env_logger = "0.9.0"
rust-htslib = "0.38"
needletail = "0.4.1"
niffler = "2.3"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
log = "0.4"
chrono = "0.4"
rand = "0.8.5"

# Dependencies used only by tests.
[dev-dependencies]
assert_cmd = "2.0.1"
predicates = "1"
float_eq = "0.6.1"

# The single executable target built from this crate.
[[bin]]
name = "scrubby"
path = "src/main.rs"
167 changes: 167 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use std::ffi::OsStr;
use std::{ffi::OsString, path::PathBuf};
use structopt::StructOpt;
use thiserror::Error;

/// Errors raised while parsing or validating command-line arguments.
///
/// Each variant carries a `String` that is interpolated into the message
/// declared by its `#[error(...)]` attribute.
#[derive(Error, Debug, PartialEq)]
pub enum CliError {
    /// The supplied string is not one of the accepted output compression format codes.
    #[error("{0} is not a valid output format")]
    CompressionFormat(String),
    /// The supplied string is not a compression level in the range 1-9.
    #[error("{0} is not a valid compression level (1-9)")]
    CompressionLevel(String),
    /// The `--input` and `--output` file lists do not form a valid combination;
    /// the payload describes what was wrong.
    #[error("Bad combination of input and output files: {0}")]
    BadInputOutputCombination(String),
}

// Top-level argument parser for the `scrubby` binary.
//
// NOTE: `structopt` turns the `///` doc comment on this struct into the text shown
// by `--help`, so it must name this tool; maintainer notes use plain `//` comments.
/// Scrubby command-line interface
#[derive(Debug, StructOpt)]
pub struct Cli {
    // The subcommand chosen by the user, together with its parsed arguments.
    #[structopt(subcommand)]
    pub commands: Commands,
}

// Subcommands exposed by the command-line client.
//
// NOTE: `structopt` turns every `///` doc comment inside this enum into user-facing
// help text, so maintainer notes in this block are written as plain `//` comments.
#[derive(Debug, StructOpt)]
pub enum Commands {
    /// Clean sequence data by removing background taxa (k-mer) or host reads (alignment)
    Scrub {
        /// Input filepath(s) (fa, fq, gz, bz).
        ///
        /// For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two
        /// files consecutively `-i r1.fq r2.fq`. NOTE: Read identifiers for paired-end Illumina reads
        /// are assumed to be the same in forward and reverse read files (modern format) without trailing
        /// read orientations e.g. `/1` and `/2`. If you are using legacy identifiers, reads in the depleted
        /// output may be unpaired.
        #[structopt(
            short,
            long,
            parse(try_from_os_str = check_file_exists),
            multiple = true,
            required = true
        )]
        input: Vec<PathBuf>,
        /// Output filepath(s) with contaminated reads removed.
        ///
        /// For paired Illumina you may either pass this flag twice `-o r1.fq -o r2.fq` or give two
        /// files consecutively `-o r1.fq r2.fq`. NOTE: The order of the pairs is assumed to be the
        /// same as that given for --input.
        #[structopt(short, long, parse(from_os_str), multiple = true, required = true)]
        output: Vec<PathBuf>,
        /// Kraken2 database path.
        ///
        /// Specify the path to the Kraken2 database directory.
        #[structopt(short = "k", long, parse(try_from_os_str = check_file_exists), multiple = false, required = true)]
        kraken_db: PathBuf,
        /// Threads to use for Kraken2
        ///
        /// Specify the number of threads to pass to Kraken2
        // `-k` already belongs to `--kraken-db`; short flags must be unique within a
        // command, so the thread count uses `-j`. The long flag is unchanged.
        #[structopt(short = "j", long, default_value = "4")]
        kraken_threads: u32,
        /// Working directory containing intermediary files.
        ///
        /// Path to a working directory which contains the alignment and intermediary output files
        /// from the programs called during scrubbing.
        #[structopt(short, long, parse(from_os_str))]
        workdir: Option<PathBuf>,
        /// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma
        ///
        /// Default is to attempt to infer the output compression format automatically from the filename
        /// extension (gz|bz|bz2|lzma). This option is used to override that.
        #[structopt(
            short = "O",
            long,
            value_name = "u|b|g|l",
            parse(try_from_str = parse_compression_format),
            possible_values = &["u", "b", "g", "l"],
            case_insensitive=true,
            hide_possible_values = true
        )]
        output_format: Option<niffler::compression::Format>,
        /// Compression level to use if compressing output
        #[structopt(
            short = "l",
            long,
            parse(try_from_str = parse_level),
            default_value="6",
            value_name = "1-9"
        )]
        compression_level: niffler::Level,
    }
}

// Functions may be heavily adapted from Rasusa, due to the excellent error annotation style
impl Cli {
    /// Verifies that the `--input` and `--output` file lists are compatible.
    ///
    /// The checks run in this order and the first failure is reported:
    /// 1. more than two `--input` files,
    /// 2. more than two `--output` files,
    /// 3. a different number of `--input` and `--output` files.
    ///
    /// # Errors
    /// Returns [`CliError::BadInputOutputCombination`] carrying a description of the
    /// first failed check.
    pub fn validate_input_output_combination(&self) -> Result<(), CliError> {
        // Work out the complaint (if any) first, then convert it to a result once.
        let complaint = match &self.commands {
            Commands::Scrub { input, output, .. } => {
                let (n_inputs, n_outputs) = (input.len(), output.len());
                if n_inputs > 2 {
                    Some(String::from("Got more than 2 files for input."))
                } else if n_outputs > 2 {
                    Some(String::from("Got more than 2 files for output."))
                } else if n_inputs != n_outputs {
                    Some(format!("Got {} --input but {} --output", n_inputs, n_outputs))
                } else {
                    None
                }
            }
        };

        match complaint {
            Some(msg) => Err(CliError::BadInputOutputCombination(msg)),
            None => Ok(()),
        }
    }
}


/// Argument parser that accepts a path only if it exists on disk.
///
/// On success the path is returned in canonical (absolute) form. If the path does
/// not exist, or canonicalisation fails, the error is the message
/// `"<path>" does not exist`, with the path rendered through its `Debug` format.
fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> {
    let candidate = PathBuf::from(file);
    let not_found = OsString::from(format!("{:?} does not exist", candidate));

    if !candidate.exists() {
        return Err(not_found);
    }

    // A canonicalisation failure is reported with the same message as a missing path.
    match std::fs::canonicalize(candidate) {
        Ok(resolved) => Ok(resolved),
        Err(_) => Err(not_found),
    }
}

/// Maps a one-letter compression code to the corresponding `niffler` format.
///
/// Accepted codes, in either ASCII case: `b` (Bzip), `g` (Gzip), `l` (Lzma) and
/// `u` (no compression).
///
/// # Errors
/// Returns [`CliError::CompressionFormat`] holding the original input for any other string.
fn parse_compression_format(s: &str) -> Result<niffler::compression::Format, CliError> {
    // Fold case once instead of listing both spellings of every code.
    let code = s.to_ascii_lowercase();
    let format = match code.as_str() {
        "b" => niffler::Format::Bzip,
        "g" => niffler::Format::Gzip,
        "l" => niffler::Format::Lzma,
        "u" => niffler::Format::No,
        _ => return Err(CliError::CompressionFormat(s.to_string())),
    };
    Ok(format)
}

/// Converts a string holding an integer from 1 to 9 into a `niffler::Level`.
///
/// # Errors
/// Returns [`CliError::CompressionLevel`] holding the original input when the string
/// does not parse as a `u8` or the number falls outside 1-9.
// NOTE(review): no clone appears in this function, so this allow looks stale —
// confirm with clippy before removing it.
#[allow(clippy::redundant_clone)]
fn parse_level(s: &str) -> Result<niffler::Level, CliError> {
    let invalid = || CliError::CompressionLevel(s.to_string());

    let number = s.parse::<u8>().map_err(|_| invalid())?;
    match number {
        1 => Ok(niffler::Level::One),
        2 => Ok(niffler::Level::Two),
        3 => Ok(niffler::Level::Three),
        4 => Ok(niffler::Level::Four),
        5 => Ok(niffler::Level::Five),
        6 => Ok(niffler::Level::Six),
        7 => Ok(niffler::Level::Seven),
        8 => Ok(niffler::Level::Eight),
        9 => Ok(niffler::Level::Nine),
        _ => Err(invalid()),
    }
}

54 changes: 54 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use anyhow::Result;
use thiserror::Error;
use structopt::StructOpt;
use std::path::PathBuf;
use chrono::Local;
use env_logger::Builder;
use log::LevelFilter;
use std::io::Write;
mod scrub;
mod cli;

/// Configures and installs the logger used by the binary.
///
/// Records at `Info` level and above are written as
/// `<local timestamp> [<level>] - <message>`.
fn init_logging() {
    let mut builder = Builder::new();
    builder.format(|buf, record| {
        let timestamp = Local::now().format("%Y-%m-%dT%H:%M:%S");
        writeln!(buf, "{} [{}] - {}", timestamp, record.level(), record.args())
    });
    builder.filter(None, LevelFilter::Info);
    builder.init();
}

/// Entry point: parses the command line, validates it, sets up logging and runs
/// the selected subcommand.
///
/// # Errors
/// Propagates a failed input/output validation and a failed `Scrubber` construction.
fn main() -> Result<()> {
    let args = cli::Cli::from_args();

    // Command specific checks - scrubbing
    args.validate_input_output_combination()?;

    init_logging();

    match args.commands {
        // `output`, `output_format` and `compression_level` are bound but not yet
        // consumed by this arm.
        cli::Commands::Scrub {
            input,
            output,
            workdir,
            kraken_db,
            kraken_threads,
            output_format,
            compression_level,
        } => {
            let scrubber = scrub::Scrubber::new(workdir)?;

            log::info!("Welcome to Scrubby, your trusty read scrubber!");
            // NOTE(review): the value returned by `run_kraken` is discarded; if it is a
            // `Result`, failures are silently ignored here — confirm against `scrub.rs`.
            scrubber.run_kraken(&input, kraken_db, kraken_threads);
        }
    }

    log::info!("Scrub scrub, scrubbity-scrub! Your sequence data, only cleaner!");

    Ok(())
}
Loading

0 comments on commit 17c7916

Please sign in to comment.