Skip to content

Commit

Permalink
chunk db
Browse files Browse the repository at this point in the history
  • Loading branch information
dagou committed Jul 12, 2024
1 parent 2d8d4d1 commit 147bc45
Show file tree
Hide file tree
Showing 10 changed files with 210 additions and 254 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Usage: kun_peng <COMMAND>
Commands:
estimate estimate capacity
build build `k2d` files
hashshard split hash file
hashshard Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis.
splitr Split fast(q/a) file into ranges
annotate annotate a set of sequences
resolve resolve taxonomy tree
Expand Down Expand Up @@ -236,6 +236,29 @@ Options:
Print version
```
### Convert Kraken2 database
This tool converts Kraken2 database files into Kun-peng database format for more efficient processing and analysis. By specifying the database directory and the hash file capacity, users can control the size of the resulting database index files.
```sh
./target/release/kun_peng hashshard -h
Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis.

Usage: kun_peng hashshard [OPTIONS] --db <DATABASE>

Options:
--db <DATABASE> The database directory for the Kraken 2 index. contains index files(hash.k2d opts.k2d taxo.k2d)
--hash-capacity <HASH_CAPACITY> Specifies the hash file capacity.
Acceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').
Note: The specified capacity affects the index size, with a factor of 4 applied.
For example, specifying '1G' results in an index size of '4G'.
Default: 1G (capacity 1G = file size 4G) [default: 1G]
-h, --help Print help
-V, --version Print version

```
### classify
The classification process is divided into three modes:
Expand Down
117 changes: 22 additions & 95 deletions kr2r/src/bin/build_k2_db.rs
Original file line number Diff line number Diff line change
@@ -1,112 +1,52 @@
// 使用时需要引用模块路径
use clap::Parser;
use kr2r::args::{parse_size, Build};
use kr2r::compact_hash::HashConfig;
use kr2r::db::{convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file};
use kr2r::utils::{
create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit,
read_id_to_taxon_map, set_fd_limit,
};
use kr2r::IndexOptions;
use kr2r::db::process_k2file;
use kr2r::taxonomy::Taxonomy;
use kr2r::utils::find_and_trans_files;
use std::fs::remove_file;
use std::path::PathBuf;
use std::time::Instant;

#[derive(Parser, Debug, Clone)]
#[clap(author, version, about="build database", long_about = None)]
pub struct Args {
// /// database hash chunk directory and other files
// #[clap(long)]
// pub k2d_dir: Option<PathBuf>,
#[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
pub hash_capacity: usize,

// chunk temp directory
// #[clap(long)]
// pub chunk_dir: PathBuf,
/// 包含原始配置
#[clap(flatten)]
pub build: Build,
// #[arg(short = 'm')]
// pub id_to_taxon_map_filename: Option<PathBuf>,
/// database hash chunk directory and other files
#[arg(long = "db", required = true)]
pub database: PathBuf,
}

pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
let file_num_limit = get_file_limit();
let meros = args.build.klmt.as_meros();

let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map");
let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;
let k2d_dir = &args.build.database;
pub fn run(database: &PathBuf) -> Result<(), Box<dyn std::error::Error>> {
let k2d_dir = database;
let taxonomy_filename = k2d_dir.join("taxo.k2d");
let taxonomy = Taxonomy::from_file(taxonomy_filename)?;
let hash_filename = k2d_dir.join("hash_config.k2d");

let ncbi_taxonomy_directory = &args.build.database.join("taxonomy");

let taxonomy = generate_taxonomy(
&ncbi_taxonomy_directory,
&taxonomy_filename,
&id_to_taxon_map,
)?;

let value_bits = get_bits_for_taxid(
args.build.requested_bits_for_taxid as usize,
taxonomy.node_count() as f64,
)
.expect("more bits required for storing taxid");

let capacity = required_capacity;
let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
let mut hash_config =
HashConfig::new(1, capacity, value_bits, 0, partition, args.hash_capacity);
let mut hash_config = HashConfig::from_hash_header(&hash_filename)?;

// 开始计时
let start = Instant::now();

let chunk_size = args.hash_capacity as usize;

if partition >= file_num_limit {
set_fd_limit(partition as u64 + 1).expect("Failed to set file descriptor limit");
// panic!("Exceeds File Number Limit");
}

let chunk_files = create_partition_files(partition, &k2d_dir, "chunk");
let mut writers = create_partition_writers(&chunk_files);

let fna_files = find_library_fna_files(&args.build.database);

for fna_file in fna_files {
println!("convert fna file {:?}", fna_file);
convert_fna_to_k2_format(
fna_file,
meros,
&taxonomy,
&id_to_taxon_map,
hash_config,
&mut writers,
chunk_size,
args.build.threads,
);
}
let chunk_files = find_and_trans_files(&k2d_dir, "chunk", ".k2", true)?;

let hash_filename = k2d_dir.join("hash_config.k2d");
let partition = chunk_files.len();
let mut size: usize = 0;

println!("start process k2 files...");
for i in 1..=partition {
for (i, chunk_file) in &chunk_files {
// 计算持续时间
let count = process_k2file(
hash_config,
&k2d_dir,
&chunk_files[i - 1],
&chunk_file,
&taxonomy,
chunk_size,
i,
hash_config.hash_capacity,
*i,
)?;
size += count;
let duration = start.elapsed();
println!(
"process chunk file {:?}/{:}: duration: {:?}",
i, partition, duration
i, hash_config.partition, duration
);
}

Expand All @@ -118,30 +58,17 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
// 打印运行时间
println!("build k2 db took: {:?}", duration);

let options_filename = k2d_dir.join("opts.k2d");
let idx_opts = IndexOptions::from_meros(meros);
idx_opts.write_to_file(options_filename)?;

for chunk_file in chunk_files {
for (_, chunk_file) in &chunk_files {
remove_file(chunk_file)?;
}
Ok(())
}

#[derive(Parser, Debug, Clone)]
#[clap(author, version, about, long_about = None)]
struct BuildArgs {
#[clap(flatten)]
db_args: Args,

#[arg(short = 'c', long, required = true)]
pub required_capacity: usize,
Ok(())
}

#[allow(dead_code)]
fn main() {
let args = BuildArgs::parse();
if let Err(e) = run(args.db_args, args.required_capacity) {
let args = Args::parse();
if let Err(e) = run(&args.database) {
eprintln!("Application error: {}", e);
}
}
106 changes: 106 additions & 0 deletions kr2r/src/bin/chunk_db.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// 使用时需要引用模块路径
use clap::Parser;
use kr2r::args::{parse_size, Build};
use kr2r::compact_hash::HashConfig;
use kr2r::db::{convert_fna_to_k2_format, get_bits_for_taxid};
use kr2r::taxonomy::Taxonomy;
use kr2r::utils::{
create_partition_files, create_partition_writers, find_files, get_file_limit,
read_id_to_taxon_map, set_fd_limit,
};
use kr2r::IndexOptions;
use std::time::Instant;

// CLI options for the `chunk_db` pre-build step. Flattened into `ChunkArgs`
// below, so these flags are shared with the outer binary interface.
#[derive(Parser, Debug, Clone)]
#[clap(author, version, about="prebuild database", long_about = None)]
pub struct Args {
    // Size of each hash-chunk file; `parse_size` accepts "K"/"M"/"G" suffixes.
    // NOTE: `//` (not `///`) comments are used here so clap's generated help
    // text is not altered.
    #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
    pub hash_capacity: usize,

    /// Holds the original (shared) build configuration.
    #[clap(flatten)]
    pub build: Build,
}

/// Pre-builds ("chunks") the database: converts every library `.fna` file
/// into k2-format records spread across per-partition chunk files, then
/// persists the hash configuration (`hash_config.k2d`) and index options
/// (`opts.k2d`) into the database directory.
///
/// # Arguments
/// * `args` - parsed CLI options: per-chunk hash capacity plus the shared
///   `Build` configuration (database path, k-mer params, thread count, ...).
/// * `required_capacity` - total hash-table capacity the final index needs;
///   together with `args.hash_capacity` it determines the partition count.
///
/// # Errors
/// Propagates any error from reading `seqid2taxid.map` / `taxo.k2d` or from
/// writing the chunk and config files.
pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
    let file_num_limit = get_file_limit();
    let meros = args.build.klmt.as_meros();
    let k2d_dir = &args.build.database;

    let id_to_taxon_map_filename = args.build.database.join("seqid2taxid.map");
    let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;

    let taxonomy_filename = k2d_dir.join("taxo.k2d");
    let taxonomy = Taxonomy::from_file(taxonomy_filename)?;

    // How many bits are needed to store any taxid of this taxonomy.
    let value_bits = get_bits_for_taxid(
        args.build.requested_bits_for_taxid as usize,
        taxonomy.node_count() as f64,
    )
    .expect("more bits required for storing taxid");

    let capacity = required_capacity;
    // Ceiling division: number of chunk files needed to cover the capacity.
    let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
    let hash_config = HashConfig::new(1, capacity, value_bits, 0, partition, args.hash_capacity);

    // Start timing the chunking phase.
    let start = Instant::now();

    // `hash_capacity` is already `usize`; the previous `as usize` cast was a no-op.
    let chunk_size = args.hash_capacity;

    // One writer stays open per partition; raise the fd limit when the
    // partition count would exceed the current soft limit.
    if partition >= file_num_limit {
        set_fd_limit(partition as u64 + 1).expect("Failed to set file descriptor limit");
    }

    let chunk_files = create_partition_files(partition, &k2d_dir, "chunk");
    let mut writers = create_partition_writers(&chunk_files);

    let library_dir = &args.build.database.join("library");
    let fna_files = find_files(&library_dir, "library", ".fna");

    for fna_file in fna_files {
        println!("convert fna file {:?}", fna_file);
        convert_fna_to_k2_format(
            fna_file,
            meros,
            &taxonomy,
            &id_to_taxon_map,
            hash_config,
            &mut writers,
            chunk_size,
            args.build.threads,
        );
    }

    let hash_filename = k2d_dir.join("hash_config.k2d");
    hash_config.write_to_file(&hash_filename)?;
    // Report how long the chunking phase took.
    let duration = start.elapsed();
    println!("chunk db took: {:?}", duration);

    let options_filename = k2d_dir.join("opts.k2d");
    let idx_opts = IndexOptions::from_meros(meros);
    idx_opts.write_to_file(options_filename)?;

    Ok(())
}

// Top-level CLI for the standalone `chunk_db` binary: the shared `Args`
// plus the total capacity figure this step needs up front.
// NOTE: `//` (not `///`) comments are used so clap's generated help text
// is not altered.
#[derive(Parser, Debug, Clone)]
#[clap(author, version, about, long_about = None)]
struct ChunkArgs {
    // Shared chunk-db options (hash chunk capacity + common build config).
    #[clap(flatten)]
    db_args: Args,

    // Total hash-table capacity required for the index (presumably the
    // value produced by the `estimate` subcommand — TODO confirm).
    #[arg(short = 'c', long, required = true)]
    pub required_capacity: usize,
}

/// Binary entry point: parse the CLI arguments and run the chunking
/// pipeline, exiting with a non-zero status on failure so that shell
/// scripts and pipelines can detect errors.
#[allow(dead_code)]
fn main() {
    let args = ChunkArgs::parse();
    if let Err(e) = run(args.db_args, args.required_capacity) {
        eprintln!("Application error: {}", e);
        // Previously the process exited 0 even after a failure; report
        // the failure through the exit status as well.
        std::process::exit(1);
    }
}
5 changes: 3 additions & 2 deletions kr2r/src/bin/estimate_capacity.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use clap::{error::ErrorKind, Error, Parser};
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use kr2r::args::KLMTArgs;
use kr2r::utils::{find_library_fna_files, format_bytes, open_file};
use kr2r::utils::{find_files, format_bytes, open_file};
use kr2r::KBuildHasher;

use seqkmer::{read_parallel, BufferFastaReader};
Expand Down Expand Up @@ -145,7 +145,8 @@ pub fn run(args: Args) -> usize {
let fna_files = if source.is_file() {
vec![source.clone()]
} else {
find_library_fna_files(args.database)
let library_dir = &args.database.join("library");
find_files(library_dir, "library", ".fna")
};

if fna_files.is_empty() {
Expand Down
40 changes: 0 additions & 40 deletions kr2r/src/bin/inspect.rs

This file was deleted.

Loading

0 comments on commit 147bc45

Please sign in to comment.