diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml index bd97585..2aa27b8 100644 --- a/kr2r/Cargo.toml +++ b/kr2r/Cargo.toml @@ -17,14 +17,6 @@ path = "src/bin/build_db.rs" name = "classify" path = "src/bin/classify.rs" -[[bin]] -name = "cht" -path = "src/bin/cht.rs" - -[[bin]] -name = "taxo" -path = "src/bin/taxo.rs" - [features] default = ["dna"] diff --git a/kr2r/src/bin/cht.rs b/kr2r/src/bin/cht.rs deleted file mode 100644 index 9d643bd..0000000 --- a/kr2r/src/bin/cht.rs +++ /dev/null @@ -1,81 +0,0 @@ -use kr2r::compact_hash::{Cell, CellIndex, Compact, CompactHash, CompactHashTable}; -use memmap2::MmapOptions; -use rayon::prelude::*; -use std::fs::OpenOptions; - -fn main() -> std::io::Result<()> { - let capacity = 2usize; - let file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .open("make_mut.md")?; - file.set_len(4 * capacity as u64)?; - - // 分配足够的空间来存储两个 CompactHashCell - let mut mut_mmap = unsafe { MmapOptions::new().len(4 * capacity).map_mut(&file)? }; - - // 使用 unsafe 获取 CompactHashCell 的切片引用 - let cells: &mut [u32] = - unsafe { std::slice::from_raw_parts_mut(mut_mmap.as_mut_ptr() as *mut u32, capacity) }; - - // for i in 0..capacity { - // println!("cell {:?}", cells[i]); - // } - - // 直接在内存映射上操作 CompactHashCell 实例 - // cells[0] = CompactHashCell(4); - // cells[1] = CompactHashCell(5); - - // let chtm = CompactHashTable::from("test.k2d")?; - // println!("chtm {:?}", chtm); - // let cell = chtm.table[0]; - - // println!("cell {:?}", cell.compacted_key(10)); - // println!("cell taxid {:?}", cell.taxid(chtm.value_mask)); - - let chtm1 = CompactHashTable::from("lib/hash1.k2d")?; - println!("chtm1 {:?}", chtm1); - println!("value::: {:?}", chtm1.get(2361267427824489423)); - let key: u64 = 2361267427824489423; - - println!("compacted key {:?}", key.compacted(10)); - println!("index: {:?}", key.index(560818468)); - - let table = chtm1.table; - let cell = &table[24550723]; - println!("cell {:?}", cell.compacted_key(10)); - println!("cell taxid {:?}", cell.taxid(chtm1.value_mask)); - - let chtm1 = CompactHashTable::from("lib/hash.k2d")?; - println!("chtm1 {:?}", chtm1); - println!("value::: {:?}", chtm1.get(2361267427824489423)); - - // let chtm = CompactHashTable::new("test.k2d", 1, 10)?; - // chtm.set_cell(CellIndex::new(0, 1, 32)); - // let table = chtm.table; - // let cell = &table[24550723]; - // println!("cell {:?}", cell.compacted_key(10)); - // println!("cell taxid {:?}", cell.taxid(chtm.value_mask)); - - // let chtm1 = CompactHashTableMut::new("hash_file.k2d", 1731287771, 10)?; - - // let mut_table = chtm1.table; - - // let table = chtm.table; - // mut_table.copy_from_slice(table); - // let mut size = 0; - - // println!("chtm11 {:?}", chtm1); - - // chtm1.save_size(393344595)?; - // let hash_file = File::open("lib/hash.k2d")?; - // let hash_mmp = unsafe { Mmap::map(&file)? }; - - // mut_mmap.flush(); - // 直接修改第 2-5 字节 - // 注意:这里的索引范围是 2..6,因为范围是左闭右开的 - // mut_mmap[2..6].copy_from_slice(b"test"); - - Ok(()) -} diff --git a/kr2r/src/bin/classify.rs b/kr2r/src/bin/classify.rs index de79d6e..83313a4 100644 --- a/kr2r/src/bin/classify.rs +++ b/kr2r/src/bin/classify.rs @@ -46,10 +46,9 @@ struct Args { )] confidence_threshold: f64, - /// Enable quick mode for faster processing. - #[clap(short = 'q', long = "quick-mode", action)] - quick_mode: bool, - + // /// Enable quick mode for faster processing. + // #[clap(short = 'q', long = "quick-mode", action)] + // quick_mode: bool, /// The number of threads to use, default is 1. #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = 1)] num_threads: i32, @@ -71,38 +70,36 @@ struct Args { #[clap(short = 'S', long = "single-file-pairs", action)] single_file_pairs: bool, - /// Use mpa-style report format. - #[clap(short = 'm', long = "mpa-style-report", action)] - mpa_style_report: bool, - - /// Report k-mer data in the output. - #[clap(short = 'K', long = "report-kmer-data", action)] - report_kmer_data: bool, + // /// Use mpa-style report format. + // #[clap(short = 'm', long = "mpa-style-report", action)] + // mpa_style_report: bool, - /// File path for outputting the report. - #[clap(short = 'R', long = "report-filename", value_parser)] - report_filename: Option, + // /// Report k-mer data in the output. + // #[clap(short = 'K', long = "report-kmer-data", action)] + // report_kmer_data: bool, - /// Report taxa with zero count. - #[clap(short = 'z', long = "report-zero-counts", action)] - report_zero_counts: bool, + // /// File path for outputting the report. + // #[clap(short = 'R', long = "report-filename", value_parser)] + // report_filename: Option, - /// File path for outputting classified sequences. - #[clap(short = 'C', long = "classified-output-filename", value_parser)] - classified_output_filename: Option, + // /// Report taxa with zero count. + // #[clap(short = 'z', long = "report-zero-counts", action)] + // report_zero_counts: bool, - /// File path for outputting unclassified sequences. - #[clap(short = 'U', long = "unclassified-output-filename", value_parser)] - unclassified_output_filename: Option, + // /// File path for outputting classified sequences. + // #[clap(short = 'C', long = "classified-output-filename", value_parser)] + // classified_output_filename: Option, + // /// File path for outputting unclassified sequences. + // #[clap(short = 'U', long = "unclassified-output-filename", value_parser)] + // unclassified_output_filename: Option, /// File path for outputting normal Kraken output. #[clap(short = 'O', long = "kraken-output-filename", value_parser)] kraken_output_filename: Option, - /// Print scientific name instead of taxid in Kraken output. - #[clap(short = 'n', long = "print-scientific-name", action)] - print_scientific_name: bool, - + // /// Print scientific name instead of taxid in Kraken output. + // #[clap(short = 'n', long = "print-scientific-name", action)] + // print_scientific_name: bool, /// Minimum quality score for FASTQ data, default is 0. #[clap( short = 'Q', @@ -112,10 +109,6 @@ struct Args { )] minimum_quality_score: i32, - /// Use memory mapping to access hash and taxonomy data. - #[clap(short = 'M', long = "use-memory-mapping", action)] - use_memory_mapping: bool, - /// Input files for processing. /// /// A list of input file paths (FASTA/FASTQ) to be processed by the classify program. diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs index a9b7958..9fcaa30 100644 --- a/kr2r/src/bin/estimate_capacity.rs +++ b/kr2r/src/bin/estimate_capacity.rs @@ -47,9 +47,11 @@ struct Args { #[clap(short = 'T', long, value_parser = parse_binary)] toggle_mask: Option, - /// Read block size - #[clap(short = 'B', long, default_value = "31457280")] - block_size: usize, + /// Proportion of the hash table to be populated + /// (build task only; def: 0.7, must be + /// between 0 and 1). + #[clap(long, long, default_value_t = 0.7)] + load_factor: f64, /// Number of threads #[clap(short = 'p', long, default_value = "4")] @@ -145,6 +147,23 @@ fn process_sequence( hllp } +fn format_bytes(size: f64) -> String { + let suffixes = ["B", "kB", "MB", "GB", "TB", "PB", "EB"]; + let mut size = size; + let mut current_suffix = &suffixes[0]; + + for suffix in &suffixes[1..] { + if size >= 1024.0 { + current_suffix = suffix; + size /= 1024.0; + } else { + break; + } + } + + format!("{:.2}{}", size, current_suffix) +} + fn main() { let mut args = Args::parse(); if args.k_mer < args.l_mer as u64 { @@ -182,5 +201,11 @@ fn main() { let hllp_count = (hllp.count() * RANGE_SECTIONS as f64 / args.n as f64).round() as u64; // println!("Final count: {:?}", final_count); - println!("estimate count: {:?}", hllp_count); + let required_capacity = (hllp_count + 8192) as f64 / args.load_factor; + println!( + "estimate count: {:?}, required capacity: {:?}, Estimated hash table requirement: {:?}", + hllp_count, + required_capacity.ceil(), + format_bytes(required_capacity) + ); } diff --git a/kr2r/src/bin/taxo.rs b/kr2r/src/bin/taxo.rs deleted file mode 100644 index a05312e..0000000 --- a/kr2r/src/bin/taxo.rs +++ /dev/null @@ -1,25 +0,0 @@ -use kr2r::taxonomy::{NCBITaxonomy, Taxonomy}; -use kr2r::utils::read_id_to_taxon_map; -use std::io::Result; - -fn main() -> Result<()> { - // let t = Taxonomy::from_file("taxo.k2d")?; - // println!("t {:?}", t); - let mut ncbi = NCBITaxonomy::from_ncbi("lib/taxonomy/nodes.dmp", "lib/taxonomy/names.dmp")?; - let id_map = read_id_to_taxon_map("lib/seqid2taxid.map")?; - // let name_map = parse_names_file("lib/taxonomy/names.dmp")?; - // println!("names {:?}", name_map); - - for (_, id) in id_map.into_iter() { - ncbi.mark_node(id); - } - // println!("ncbi {:?}", ncbi.marked_nodes); - // println!("name map {:?}", ncbi.name_map); - let mut taxo = ncbi.convert_to_kraken_taxonomy(); - taxo.generate_external_to_internal_id_map(); - taxo.build_path_cache(); - println!("path_cache {:?}", taxo.path_cache.keys().len()); - println!("node_count {:?}", taxo.node_count()); - // taxo.write_to_disk("taxo.k2d")?; - Ok(()) -}