Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added concat and tests for concat #24

Merged
merged 4 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,49 @@ pub enum Commands {
#[arg(required = true, short)]
output: String,
},
/// Append new genomes to be sketched to an existing sketch database
Append {
/// Sketching database basename (so without .skm or .skd)
#[arg(required = true)]
db: String,

/// List of input FASTA files
#[arg(long, group = "input", num_args = 1.., value_delimiter = ',')]
seq_files: Option<Vec<String>>,

/// File listing input files (tab separated name, sequences)
#[arg(short, group = "input")]
file_list: Option<String>,

/// Output filename for the merged sketch
#[arg(required = true, short)]
output: String,

/// Ignore reverse complement (all contigs are oriented along same strand)
#[arg(long, default_value_t = DEFAULT_STRAND)]
single_strand: bool,

/// Minimum k-mer count (with reads)
#[arg(long, default_value_t = DEFAULT_MINCOUNT)]
min_count: u16,

/// Minimum k-mer quality (with reads)
#[arg(long, default_value_t = DEFAULT_MINQUAL)]
min_qual: u8,

/// Treat every sequence in an input file as a new sample (aa only)
// TODO: for now, could be extended to dna, but probably no need
johnlees marked this conversation as resolved.
Show resolved Hide resolved
#[arg(long, default_value_t = false)]
concat_fasta: bool,

/// Number of CPU threads
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,

/// aaHash 'level'
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
level: AaLevel,
},
/// Print information about a .skm file
Info {
/// Sketch metadata file (.skm) to describe
Expand Down
89 changes: 89 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
pub mod hashing;

pub mod utils;
use std::fs::{File, OpenOptions};
use std::io::copy;

/// Default k-mer size for (genome) sketching
pub const DEFAULT_KMER: usize = 17;
Expand Down Expand Up @@ -395,6 +397,93 @@
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
}

Commands::Append {
db,
seq_files,
file_list,
output,
single_strand,
min_count,
min_qual,
concat_fasta,
threads,
level,
} => {
// An extra thread is needed for the writer. This doesn't 'overuse' CPU
check_threads(*threads + 1);
//get input files
log::info!("Getting input files");
let input_files: Vec<(String, String, Option<String>)> =
get_input_list(file_list, seq_files);
log::info!("Parsed {} samples in input list", input_files.len());

//check if any of the new files are already existant in the db
let db_metadata: MultiSketch = MultiSketch::load(db)
.expect(&format!("Could not read sketch metadata from .skm: {}", db));

Check warning on line 422 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

use of `expect` followed by a function call

warning: use of `expect` followed by a function call --> src/lib.rs:422:18 | 422 | .expect(&format!("Could not read sketch metadata from .skm: {}", db)); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: try: `unwrap_or_else(|_| panic!("Could not read sketch metadata from .skm: {}", db))` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#expect_fun_call = note: `#[warn(clippy::expect_fun_call)]` on by default

if !db_metadata.append_compatibility(&input_files) {
panic!("Databases are not compatible for merging.")
}
log::info!("Passed concat check");

// read out sketching information needed to sketch the new files
let kmers = db_metadata.kmer_lengths();
let rc = !*single_strand;
let sketch_size = db_metadata.sketch_size;
let seq_type = db_metadata.get_hash_type();
if *concat_fasta && matches!(*seq_type, HashType::DNA | HashType::PDB) {
panic!("--concat-fasta currently only supported with --seq-type aa");
}
johnlees marked this conversation as resolved.
Show resolved Hide resolved

log::info!(
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
kmers,
sketch_size * u64::BITS as u64,
seq_type,
threads,
);

let seq_type = if let HashType::AA(_) = seq_type {
HashType::AA(level.clone())
} else {
seq_type.clone()
};
johnlees marked this conversation as resolved.
Show resolved Hide resolved
// sketch genomes and save them to concat output file
let mut db2_sketches = sketch_files(
johannahelene marked this conversation as resolved.
Show resolved Hide resolved
output,
&input_files,
*concat_fasta,
kmers,
sketch_size,
&seq_type,
rc,
*min_count,
*min_qual,
);
let mut db2_metadata =
MultiSketch::new(&mut db2_sketches, sketch_size, kmers, seq_type);

// save skd data from db1 and from freshly sketched input files
log::info!("Merging and saving sketch data to {}.skd", output);
// let mut output_file = File::create(format!("{}.skd", output))?;
let mut output_file = OpenOptions::new()
.create(true)
.append(true)
.open(format!("{}.skd", output))?;
// stream sketch data directly to concat output file
let mut db_sketch = File::open(format!("{}.skd", db))?;
println!("{:?}", db_sketch);
println!("{:?}", output_file);
johannahelene marked this conversation as resolved.
Show resolved Hide resolved
copy(&mut db_sketch, &mut output_file)?;

// merge and update skm from db1 and the new just sketched sketch
let concat_metadata = db2_metadata.merge_sketches(&db_metadata);
concat_metadata
.save_metadata(output)
.expect(&format!("Could not save metadata to {}", output));

Check warning on line 483 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

use of `expect` followed by a function call

warning: use of `expect` followed by a function call --> src/lib.rs:483:18 | 483 | .expect(&format!("Could not save metadata to {}", output)); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: try: `unwrap_or_else(|_| panic!("Could not save metadata to {}", output))` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#expect_fun_call
Ok(())
}

Commands::Info {
skm_file,
sample_info,
Expand Down
17 changes: 17 additions & 0 deletions src/multisketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@
s1_slice
}

pub fn remove_sketches(&self, ids: &[String]) {

Check warning on line 150 in src/multisketch.rs

View workflow job for this annotation

GitHub Actions / clippy

unused variable: `ids`

warning: unused variable: `ids` --> src/multisketch.rs:150:35 | 150 | pub fn remove_sketches(&self, ids: &[String]) { | ^^^ help: if this is intentional, prefix it with an underscore: `_ids` | = note: `#[warn(unused_variables)]` on by default
// TODO: remove sketch bins which belong to the duplicate ids
todo!();
}
Expand All @@ -158,6 +158,23 @@
&& self.get_hash_type() == sketch2.get_hash_type()
}

pub fn append_compatibility(&self, name_vec: &[(String, String, Option<String>)]) -> bool {
let mut compatibility = true;
let mut duplicate_list = Vec::new();
for (id, _, _) in name_vec.iter() {
if self.name_map.contains_key(id) {
duplicate_list.push(id);
compatibility = false;
}
}

if !duplicate_list.is_empty() {
println!("Duplicates found: {:?}", duplicate_list);
}

compatibility
}

pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {
// First metadata
let offset = self.sketch_metadata.len();
Expand Down
112 changes: 112 additions & 0 deletions tests/concat.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use predicates::prelude::*;
use snapbox::cmd::{cargo_bin, Command};
use std::path::Path;

pub mod common;
use crate::common::*;
use sketchlib::io::*;

use sketchlib::multisketch::MultiSketch;

#[cfg(test)]

mod tests {
use sketchlib::io;
use super::*;


#[test]
fn test_concat_sketches() {
let sandbox = TestSetup::setup();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "part1"])
.assert()
.success();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "part2"])
.assert()
.success();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_ref"])
.assert()
.success();

// Overlapping labels fails
Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("concat")
.arg("part1")
.arg("--seq-files")
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_test"])
.assert()
.failure();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("append")
.arg("part1")
.arg("--seq-files")
// .arg(sandbox.file_string("fasta_part2.txt", TestDir::Input))
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_test"])
.assert()
.success();

// Check .skm the same
let concat_sketch: MultiSketch =
MultiSketch::load(&sandbox.file_string("concat_test", TestDir::Output))
.expect("Failed to load output merged sketch");
let expected_sketch =
MultiSketch::load(&sandbox.file_string("concat_ref", TestDir::Output))
.expect("Failed to load expected merged sketch");
println!("{}", concat_sketch);
println!("{}", expected_sketch);
assert_eq!(
concat_sketch, expected_sketch,
"Concat sketch metadata does not match"
);

// Check .skd the same
let predicate_file = predicate::path::eq_file(Path::new(
&sandbox.file_string("concat_test.skd", TestDir::Output),
));
assert_eq!(
true,
predicate_file.eval(Path::new(
&sandbox.file_string("concat_ref.skd", TestDir::Output)
)),
"Concat sketch data does not match"
);
}
}
4 changes: 2 additions & 2 deletions tests/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ mod tests {

// Check .skm the same
let merged_sketch: MultiSketch =
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
.expect("Failed to load output merged sketch");
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
.expect("Failed to load output merged sketch");
let expected_sketch =
MultiSketch::load(&sandbox.file_string("merged_ref", TestDir::Output))
.expect("Failed to load expected merged sketch");
Expand Down
4 changes: 0 additions & 4 deletions tests/test_files_in/fasta.txt

This file was deleted.

Loading