Skip to content

Commit

Permalink
chunk db
Browse files Browse the repository at this point in the history
  • Loading branch information
dagou committed Jul 12, 2024
1 parent 2d8d4d1 commit 147bc45
Show file tree
Hide file tree
Showing 10 changed files with 210 additions and 254 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Usage: kun_peng <COMMAND>
Commands:
estimate estimate capacity
build build `k2d` files
hashshard split hash file
hashshard Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis.
splitr Split fast(q/a) file into ranges
annotate annotate a set of sequences
resolve resolve taxonomy tree
Expand Down Expand Up @@ -236,6 +236,29 @@ Options:
Print version
```
### Convert Kraken2 database
This tool converts Kraken2 database files into Kun-peng database format for more efficient processing and analysis. By specifying the database directory and the hash file capacity, users can control the size of the resulting database index files.
```sh
./target/release/kun_peng hashshard -h
Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis.

Usage: kun_peng hashshard [OPTIONS] --db <DATABASE>

Options:
--db <DATABASE> The database directory for the Kraken 2 index. contains index files(hash.k2d opts.k2d taxo.k2d)
--hash-capacity <HASH_CAPACITY> Specifies the hash file capacity.
Acceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').
Note: The specified capacity affects the index size, with a factor of 4 applied.
For example, specifying '1G' results in an index size of '4G'.
Default: 1G (capacity 1G = file size 4G) [default: 1G]
-h, --help Print help
-V, --version Print version

```
### classify
The classification process is divided into three modes:
Expand Down
117 changes: 22 additions & 95 deletions kr2r/src/bin/build_k2_db.rs
Original file line number Diff line number Diff line change
@@ -1,112 +1,52 @@
// 使用时需要引用模块路径
use clap::Parser;
use kr2r::args::{parse_size, Build};
use kr2r::compact_hash::HashConfig;
use kr2r::db::{convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file};
use kr2r::utils::{
create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit,
read_id_to_taxon_map, set_fd_limit,
};
use kr2r::IndexOptions;
use kr2r::db::process_k2file;
use kr2r::taxonomy::Taxonomy;
use kr2r::utils::find_and_trans_files;
use std::fs::remove_file;
use std::path::PathBuf;
use std::time::Instant;

#[derive(Parser, Debug, Clone)]
#[clap(author, version, about="build database", long_about = None)]
pub struct Args {
// /// database hash chunk directory and other files
// #[clap(long)]
// pub k2d_dir: Option<PathBuf>,
#[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
pub hash_capacity: usize,

// chunk temp directory
// #[clap(long)]
// pub chunk_dir: PathBuf,
/// 包含原始配置
#[clap(flatten)]
pub build: Build,
// #[arg(short = 'm')]
// pub id_to_taxon_map_filename: Option<PathBuf>,
/// database hash chunk directory and other files
#[arg(long = "db", required = true)]
pub database: PathBuf,
}

pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
let file_num_limit = get_file_limit();
let meros = args.build.klmt.as_meros();

let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map");
let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;
let k2d_dir = &args.build.database;
pub fn run(database: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
let k2d_dir = database;
let taxonomy_filename = k2d_dir.join("taxo.k2d");
let taxonomy = Taxonomy::from_file(taxonomy_filename)?;
let hash_filename = k2d_dir.join("hash_config.k2d");

let ncbi_taxonomy_directory = &args.build.database.join("taxonomy");

let taxonomy = generate_taxonomy(
&ncbi_taxonomy_directory,
&taxonomy_filename,
&id_to_taxon_map,
)?;

let value_bits = get_bits_for_taxid(
args.build.requested_bits_for_taxid as usize,
taxonomy.node_count() as f64,
)
.expect("more bits required for storing taxid");

let capacity = required_capacity;
let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
let mut hash_config =
HashConfig::new(1, capacity, value_bits, 0, partition, args.hash_capacity);
let mut hash_config = HashConfig::from_hash_header(&hash_filename)?;

// 开始计时
let start = Instant::now();

let chunk_size = args.hash_capacity as usize;

if partition >= file_num_limit {
set_fd_limit(partition as u64 + 1).expect("Failed to set file descriptor limit");
// panic!("Exceeds File Number Limit");
}

let chunk_files = create_partition_files(partition, &k2d_dir, "chunk");
let mut writers = create_partition_writers(&chunk_files);

let fna_files = find_library_fna_files(&args.build.database);

for fna_file in fna_files {
println!("convert fna file {:?}", fna_file);
convert_fna_to_k2_format(
fna_file,
meros,
&taxonomy,
&id_to_taxon_map,
hash_config,
&mut writers,
chunk_size,
args.build.threads,
);
}
let chunk_files = find_and_trans_files(&k2d_dir, "chunk", ".k2", true)?;

let hash_filename = k2d_dir.join("hash_config.k2d");
let partition = chunk_files.len();
let mut size: usize = 0;

println!("start process k2 files...");
for i in 1..=partition {
for (i, chunk_file) in &chunk_files {
// 计算持续时间
let count = process_k2file(
hash_config,
&k2d_dir,
&chunk_files[i - 1],
&chunk_file,
&taxonomy,
chunk_size,
i,
hash_config.hash_capacity,
*i,
)?;
size += count;
let duration = start.elapsed();
println!(
"process chunk file {:?}/{:}: duration: {:?}",
i, partition, duration
i, hash_config.partition, duration
);
}

Expand All @@ -118,30 +58,17 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
// 打印运行时间
println!("build k2 db took: {:?}", duration);

let options_filename = k2d_dir.join("opts.k2d");
let idx_opts = IndexOptions::from_meros(meros);
idx_opts.write_to_file(options_filename)?;

for chunk_file in chunk_files {
for (_, chunk_file) in &chunk_files {
remove_file(chunk_file)?;
}
Ok(())
}

#[derive(Parser, Debug, Clone)]
#[clap(author, version, about, long_about = None)]
struct BuildArgs {
#[clap(flatten)]
db_args: Args,

#[arg(short = 'c', long, required = true)]
pub required_capacity: usize,
Ok(())
}

#[allow(dead_code)]
fn main() {
let args = BuildArgs::parse();
if let Err(e) = run(args.db_args, args.required_capacity) {
let args = Args::parse();
if let Err(e) = run(&args.database) {
eprintln!("Application error: {}", e);
}
}
106 changes: 106 additions & 0 deletions kr2r/src/bin/chunk_db.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// 使用时需要引用模块路径
use clap::Parser;
use kr2r::args::{parse_size, Build};
use kr2r::compact_hash::HashConfig;
use kr2r::db::{convert_fna_to_k2_format, get_bits_for_taxid};
use kr2r::taxonomy::Taxonomy;
use kr2r::utils::{
create_partition_files, create_partition_writers, find_files, get_file_limit,
read_id_to_taxon_map, set_fd_limit,
};
use kr2r::IndexOptions;
use std::time::Instant;

// CLI options for the `chunk_db` pre-build step. Flattened into `ChunkArgs`
// below, so these flags are shared with the outer binary interface.
#[derive(Parser, Debug, Clone)]
#[clap(author, version, about="prebuild database", long_about = None)]
pub struct Args {
    // Size of each hash-chunk file; `parse_size` accepts "K"/"M"/"G" suffixes.
    // NOTE: `//` (not `///`) comments are used here so clap's generated help
    // text is not altered.
    #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
    pub hash_capacity: usize,

    /// Holds the original (shared) build configuration.
    #[clap(flatten)]
    pub build: Build,
}

/// Pre-builds ("chunks") the database: converts every library `.fna` file
/// into k2-format records spread across per-partition chunk files, then
/// persists the hash configuration (`hash_config.k2d`) and index options
/// (`opts.k2d`) into the database directory.
///
/// # Arguments
/// * `args` - parsed CLI options: per-chunk hash capacity plus the shared
///   `Build` configuration (database path, k-mer params, thread count, ...).
/// * `required_capacity` - total hash-table capacity the final index needs;
///   together with `args.hash_capacity` it determines the partition count.
///
/// # Errors
/// Propagates any error from reading `seqid2taxid.map` / `taxo.k2d` or from
/// writing the chunk and config files.
pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
    let file_num_limit = get_file_limit();
    let meros = args.build.klmt.as_meros();
    let k2d_dir = &args.build.database;

    let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map");
    let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;

    let taxonomy_filename = k2d_dir.join("taxo.k2d");
    let taxonomy = Taxonomy::from_file(taxonomy_filename)?;

    // How many bits are needed to store any taxid of this taxonomy.
    let value_bits = get_bits_for_taxid(
        args.build.requested_bits_for_taxid as usize,
        taxonomy.node_count() as f64,
    )
    .expect("more bits required for storing taxid");

    let capacity = required_capacity;
    // Ceiling division: number of chunk files needed to cover the capacity.
    let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
    let hash_config = HashConfig::new(1, capacity, value_bits, 0, partition, args.hash_capacity);

    // Start timing the chunking phase.
    let start = Instant::now();

    // `hash_capacity` is already `usize`; the previous `as usize` cast was a no-op.
    let chunk_size = args.hash_capacity;

    // One writer stays open per partition; raise the fd limit when the
    // partition count would exceed the current soft limit.
    if partition >= file_num_limit {
        set_fd_limit(partition as u64 + 1).expect("Failed to set file descriptor limit");
    }

    let chunk_files = create_partition_files(partition, &k2d_dir, "chunk");
    let mut writers = create_partition_writers(&chunk_files);

    let library_dir = &args.build.database.join("library");
    let fna_files = find_files(&library_dir, "library", ".fna");

    for fna_file in fna_files {
        println!("convert fna file {:?}", fna_file);
        convert_fna_to_k2_format(
            fna_file,
            meros,
            &taxonomy,
            &id_to_taxon_map,
            hash_config,
            &mut writers,
            chunk_size,
            args.build.threads,
        );
    }

    let hash_filename = k2d_dir.join("hash_config.k2d");
    hash_config.write_to_file(&hash_filename)?;
    // Report how long the chunking phase took.
    let duration = start.elapsed();
    println!("chunk db took: {:?}", duration);

    let options_filename = k2d_dir.join("opts.k2d");
    let idx_opts = IndexOptions::from_meros(meros);
    idx_opts.write_to_file(options_filename)?;

    Ok(())
}

// Top-level CLI for the standalone `chunk_db` binary: the shared `Args`
// plus the total capacity figure this step needs up front.
// NOTE: `//` (not `///`) comments are used so clap's generated help text
// is not altered.
#[derive(Parser, Debug, Clone)]
#[clap(author, version, about, long_about = None)]
struct ChunkArgs {
    // Shared chunk-db options (hash chunk capacity + common build config).
    #[clap(flatten)]
    db_args: Args,

    // Total hash-table capacity required for the index (presumably the
    // value produced by the `estimate` subcommand — TODO confirm).
    #[arg(short = 'c', long, required = true)]
    pub required_capacity: usize,
}

/// Binary entry point: parse the CLI arguments and run the chunking
/// pipeline, exiting with a non-zero status on failure so that shell
/// scripts and pipelines can detect errors.
#[allow(dead_code)]
fn main() {
    let args = ChunkArgs::parse();
    if let Err(e) = run(args.db_args, args.required_capacity) {
        eprintln!("Application error: {}", e);
        // Previously the process exited 0 even after a failure; report
        // the failure through the exit status as well.
        std::process::exit(1);
    }
}
5 changes: 3 additions & 2 deletions kr2r/src/bin/estimate_capacity.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::{error::ErrorKind, Error, Parser};
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use kr2r::args::KLMTArgs;
use kr2r::utils::{find_library_fna_files, format_bytes, open_file};
use kr2r::utils::{find_files, format_bytes, open_file};
use kr2r::KBuildHasher;

use seqkmer::{read_parallel, BufferFastaReader};
Expand Down Expand Up @@ -145,7 +145,8 @@ pub fn run(args: Args) -> usize {
let fna_files = if source.is_file() {
vec![source.clone()]
} else {
find_library_fna_files(args.database)
let library_dir = &args.database.join("library");
find_files(library_dir, "library", ".fna")
};

if fna_files.is_empty() {
Expand Down
40 changes: 0 additions & 40 deletions kr2r/src/bin/inspect.rs

This file was deleted.

Loading

0 comments on commit 147bc45

Please sign in to comment.