Skip to content

Commit

Permalink
add plasmid and plastid
Browse files Browse the repository at this point in the history
  • Loading branch information
dagou committed Jul 1, 2024
1 parent 02f2b35 commit 4ee878d
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 1 deletion.
1 change: 1 addition & 0 deletions ncbi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ pub mod load;
pub mod md5sum;
pub mod meta;
// pub mod site;
pub mod plas;
pub mod task;
pub mod utils;
36 changes: 35 additions & 1 deletion ncbi/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use clap::{Parser, Subcommand, ValueEnum};
use lazy_static::lazy_static;
use ncbi::fna::write_to_fna;
use ncbi::meta::{init_meta, save_meta};
use ncbi::plas::download_plas_files;
use ncbi::task;
use ncbi::utils;
use std::collections::HashMap;
Expand All @@ -21,6 +22,7 @@ const NCBI_LIBRARY: &'static [&str] = &[
"vertebrate_mammalian",
"vertebrate_other",
"invertebrate",
"plasmid",
];

lazy_static! {
Expand Down Expand Up @@ -74,6 +76,25 @@ impl fmt::Display for Site {
}
}

// Selects which of the two collections the `plas` command operates on.
// The variant's `Display` text ("plasmid" / "plastid") is what the command
// handler uses to build both the local library directory and the remote path.
//
// NOTE: plain `//` comments are used on purpose. clap's derive macros turn
// `///` doc comments into CLI help text, so adding them here would change
// what the program prints.
#[derive(Subcommand, Debug, ValueEnum, Clone)]
enum Plas {
// Rendered as "plasmid".
Plasmid,
// Rendered as "plastid".
Plastid,
}

// Renders a `Plas` variant as its lowercase name. The text is written
// verbatim, so width/alignment flags of the surrounding formatter are not
// applied.
impl fmt::Display for Plas {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Plas::Plasmid => "plasmid",
            Plas::Plastid => "plastid",
        };
        f.write_str(label)
    }
}

#[derive(Subcommand, Debug)]
enum Mode {
/// Check the md5 of files only
Expand Down Expand Up @@ -115,6 +136,11 @@ struct Args {

#[derive(Subcommand, Debug)]
enum Commands {
/// plasmid or plastid
Plas {
#[command(subcommand)]
mode: Plas,
},
/// Download taxonomy files from NCBI (alias: tax)
#[command(alias = "tax")]
Taxonomy,
Expand All @@ -132,7 +158,7 @@ enum Commands {
asm_level: String,

/// Type of data to download from NCBI site, can be multiple comma-separated values
/// e.g., archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate
/// e.g., archaea, bacteria, viral, fungi, plant, human, protozoa, vertebrate_mammalian, vertebrate_other, invertebrate, plasmid
#[arg(short, long, value_parser = validate_group)]
group: String,

Expand All @@ -147,6 +173,14 @@ async fn async_run(args: Args) -> Result<()> {
init_meta(&db_path).await;

match args.command {
Commands::Plas { mode } => {
let data_dir: PathBuf = db_path
.join("library")
.join(mode.to_string())
.join("refseq");
utils::create_dir(&data_dir)?;
download_plas_files(data_dir, &mode.to_string()).await?
}
Commands::Taxonomy => {
let data_dir: PathBuf = db_path.join("taxonomy");
utils::create_dir(&data_dir)?;
Expand Down
37 changes: 37 additions & 0 deletions ncbi/src/plas.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use std::path::PathBuf;

use crate::client::retry_client;
use crate::load::{DownTuple, NcbiFile, NCBI_GEN_URL};
use crate::meta::get_local_etag;
use anyhow::Result;
use regex::Regex;

pub async fn download_plas_files(data_dir: PathBuf, plas_type: &str) -> Result<()> {
let plas_url = format!("{}refseq/{}/", NCBI_GEN_URL, plas_type);
let client = retry_client();
let response: reqwest::Response = client.get(&plas_url).send().await?;
if response.status().is_success() {
let contents = response.text().await?;
// println!("contents {:?}", contents);
// 正则表达式匹配所有 href 属性
let re = Regex::new(r#"href="([^"]+\.genomic\.fna\.gz)""#)?;

// 查找并打印所有匹配的 href
for cap in re.captures_iter(&contents) {
let filename = &cap[1];
let url = format!("{}{}", plas_url, filename);
println!("url {:?}", url);
let etag = get_local_etag(&url).await;
let output_path = data_dir.join(filename);
let ncbi_file = NcbiFile::Summary(DownTuple::new(
url.clone(),
output_path,
etag.unwrap_or("".into()),
));
ncbi_file.run().await?;
}
} else {
log::error!("Failed to fetch the webpage.");
}
Ok(())
}

0 comments on commit 4ee878d

Please sign in to comment.