From c71ca5266a9a04059eb154e1682003d6630e7749 Mon Sep 17 00:00:00 2001
From: yun
Date: Tue, 2 Jul 2024 17:03:15 +0800
Subject: [PATCH] fix `fda_drugs`: wrong url

---
 DESCRIPTION           |  2 +-
 R/athena.R            |  2 +-
 R/download.R          | 67 ++++++++++++++++++++++++++++------
 R/fda_drugs.R         | 49 +++++++++++++++++--------
 R/utils-file.R        | 84 +++++++++++++++++++++++++------------------
 man/faers_download.Rd |  2 +-
 man/fda_drugs.Rd      |  9 ++---
 7 files changed, 149 insertions(+), 66 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d7124fb..dfd7058 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: faers
 Title: R interface for FDA Adverse Event Reporting System
-Version: 1.1.1
+Version: 1.1.2
 Authors@R: c(
     person("Yun", "Peng", , "yunyunp96@163.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-2801-3332")),
diff --git a/R/athena.R b/R/athena.R
index e3e0eef..bb878eb 100644
--- a/R/athena.R
+++ b/R/athena.R
@@ -72,7 +72,7 @@ athena_read <- function(file) {
 athena_file <- function(
     url, force, dir = faers_cache_dir("athena"),
     arg = rlang::caller_arg(url)) {
-    cache_file(
+    cache_use_or_download(
         force = force,
         url = url,
         prefix = "athena_vocabularies",
diff --git a/R/download.R b/R/download.R
index 3afec5a..41a33e2 100644
--- a/R/download.R
+++ b/R/download.R
@@ -8,7 +8,7 @@
 #' @param dir The destination directory for any downloads. Defaults to
 #' current working dir.
 #' @param ... Extra handle options passed to each request
-#' [new_handle][curl::new_handle].
+#' [multi_download][curl::multi_download].
 #' @return An atomic character for the path of downloaded files.
 #' @examples
 #' # you must change `dir`, as the file included in the package is sampled
@@ -34,33 +34,39 @@ faers_download <- function(years, quarters, format = NULL, dir = getwd(), ...) {
     }
     urls <- build_faers_url(format, years, quarters)
     dest_files <- file.path(dir_create2(dir), basename(urls))
-    download_inform(urls, dest_files, handle_opts = list(...))
+    download_inform(urls, dest_files, ...)
 }
 
 #' Download utils function with good message.
 #' @return A character path if downloading successed, otherwise, stop with error
 #' message.
 #' @noRd
-download_inform <- function(urls, file_paths, handle_opts = list()) {
+download_inform <- function(urls, file_paths, ...) {
     out <- list(
         urls = urls, destfiles = file_paths,
         is_success = rep_len(TRUE, length(urls))
     )
-    is_existed <- file.exists(file_paths)
-    if (any(is_existed)) {
-        cli::cli_inform("Finding {.val {sum(is_existed)}} file{?s} already downloaded: {.file {basename(file_paths[is_existed])}}") # nolint
+    if (any(is_existed <- file.exists(file_paths))) {
+        cli::cli_inform(paste(
+            "Found {.val {sum(is_existed)}} file{?s} already",
+            "downloaded: {.file {basename(file_paths[is_existed])}}"
+        ))
         urls <- urls[!is_existed]
         file_paths <- file_paths[!is_existed]
     }
-    if (length(urls)) {
+    if (l <- length(urls)) {
         assert_internet()
-        cli::cli_inform("Downloading {.val {length(urls)}} file{?s}")
+        if (l == 1L) {
+            cli::cli_inform("Downloading 1 file from: {.url {urls}}")
+        } else {
+            cli::cli_inform("Downloading {.val {l}} files")
+        }
         arg_list <- c(
             list(
                 urls = urls, destfiles = file_paths,
                 resume = FALSE,
                 progress = interactive(), timeout = Inf
             ),
-            handle_opts
+            rlang::list2(...)
         )
         status <- do.call(curl::multi_download, arg_list)
         is_success <- is_download_success(status)
@@ -73,7 +79,10 @@ download_inform <- function(urls, file_paths, handle_opts = list()) {
         cli::cli_abort(c(
             "Cannot download {.val {n_failed_files}} file{?s}",
             "i" = "url{?s}: {.url {urls[!is_success]}}",
-            "!" = "status {cli::qty(n_failed_files)} code{?s}: {.val {status$status_code[!is_success]}}",
+            "!" = paste(
+                "status {cli::qty(n_failed_files)} code{?s}:",
+                "{.val {status$status_code[!is_success]}}"
+            ),
             x = "error {cli::qty(n_failed_files)} message{?s}: {.val {status$error[!is_success]}}"
         ))
     }
@@ -88,6 +97,44 @@ is_download_success <- function(status, successful_code = c(200L, 206L, 416L)) {
     status$status_code %in% successful_code
 }
 
+base_download_inform <- function(urls, file_paths, ...) {
+    out <- file_paths
+    if (any(is_existed <- file.exists(file_paths))) {
+        cli::cli_inform(paste(
+            "Found {.val {sum(is_existed)}} file{?s} already",
+            "downloaded: {.file {basename(file_paths[is_existed])}}"
+        )) # nolint
+        urls <- urls[!is_existed]
+        file_paths <- file_paths[!is_existed]
+    }
+    if (l <- length(urls)) {
+        assert_internet()
+        if (l == 1L) {
+            cli::cli_inform("Downloading 1 file from: {.url {urls}}")
+        } else {
+            cli::cli_inform("Downloading {.val {l}} files")
+        }
+        status <- utils::download.file(urls,
+            destfile = file_paths, ..., method = "libcurl"
+        )
+        is_success <- status == 0L
+        is_need_deleted <- !is_success & file.exists(file_paths)
+        if (any(is_need_deleted)) file.remove(file_paths[is_need_deleted])
+        if (!all(is_success)) {
+            n_failed_files <- sum(!is_success) # nolint
+            cli::cli_abort(c(
+                "Cannot download {.val {n_failed_files}} file{?s}",
+                "i" = "url{?s}: {.url {urls[!is_success]}}",
+                "!" = paste(
+                    "status {cli::qty(n_failed_files)} code{?s}:",
+                    "{.val {status[!is_success]}}"
+                )
+            ))
+        }
+    }
+    out
+}
+
 build_faers_url <- function(type, years, quarters) {
     laers_period <- is_from_laers(years, quarters)
     sprintf(
diff --git a/R/fda_drugs.R b/R/fda_drugs.R
index 9976dcf..dd8a99d 100644
--- a/R/fda_drugs.R
+++ b/R/fda_drugs.R
@@ -6,23 +6,27 @@
 #' dataset?
 #' @param force A boolean value. If set to `TRUE`, it indicates the retrieval of
 #' `Drugs@@FDA` data in the FDA directly, bypassing the cache.
+#' @param url A string of the url for the `Drugs@@FDA` file. You can get the
+#' link from:
+#' <https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files>.
 #' @return
 #' - if `list = TRUE`, an atomic character.
 #' - if `list = FALSE`, a [data.table][data.table::data.table].
-#' @seealso
-#' <https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files>
 #' @examples
 #' fda_drugs(list = TRUE)
 #' fda_drugs()
 #' @export
-fda_drugs <- function(pattern = "Products", list = FALSE, force = FALSE) {
+fda_drugs <- function(pattern = "Products", url = NULL,
+                      list = FALSE, force = FALSE) {
     assert_bool(list)
     assert_bool(force)
-    file <- fda_drugs_file(force)
+    assert_string(url, null_ok = TRUE)
+    file <- fda_drugs_file(url, force)
     fda_drugs_load(file, pattern = pattern, list = list)
 }
 
-fda_drugs_load <- function(file, pattern = "Products", list = FALSE, dir = faers_cache_dir("fdadrugs")) {
+fda_drugs_load <- function(file, pattern = "Products",
+                           list = FALSE, dir = faers_cache_dir("fdadrugs")) {
     path <- unzip2(file, dir)
     if (list) {
         list.files(path)
@@ -35,18 +39,26 @@ fda_drugs_load <- function(file, pattern = "Products", list = FALSE, dir = faers
     }
 }
 
-fda_drugs_file <- function(force, dir = faers_cache_dir("fdadrugs")) {
-    cache_file(
+fda_drugs_file <- function(url = NULL, force,
+                           dir = faers_cache_dir("fdadrugs"),
+                           arg = rlang::caller_arg(url),
+                           call = rlang::caller_env()) {
+    cache_use_or_download(
         force = force,
-        url = fda_drugs_url(),
+        url = fda_drugs_url(url, arg = arg, call = call),
         prefix = "fda_drugs_data",
         ext = "zip",
         name = "Drugs@FDA data",
-        dir = dir
+        dir = dir,
+        method = "base",
+        arg = arg, call = call
     )
 }
 
-fda_drugs_url <- function(call = rlang::caller_env()) {
+fda_drugs_url <- function(url = NULL,
+                          arg = rlang::caller_arg(url),
+                          call = rlang::caller_env()) {
+    if (!is.null(url)) return(url) # styler: off
     assert_internet(call = call)
     url <- sprintf(
         "%s/drugs/drug-approvals-and-databases/drugsfda-data-files",
@@ -54,9 +66,16 @@ fda_drugs_url <- function(call = rlang::caller_env()) {
     )
     cli::cli_inform(c(">" = "Reading html: {.url {url}}"))
     html <- xml2::read_html(url)
-    node <- rvest::html_element(
-        html, "[data-entity-substitution=media_download]"
-    )
-    cli::cat_line(rvest::html_text(node))
-    paste0(fda_host("www"), rvest::html_attr(node, "href"))
+    node <- rvest::html_element(html, "[data-entity-substitution]")
+    if (inherits(node, "xml_missing")) {
+        # The download node could not be located, which happens when the FDA
+        # page layout changes. Fall back to the last known media link; users
+        # can supply `url` manually to override this hard-coded path.
+        href <- "/media/89850/download?attachment"
+    } else {
+        cli::cat_line(rvest::html_text(node))
+        href <- rvest::html_attr(node, "href")
+    }
+    paste0(fda_host("www"), href)
 }
diff --git a/R/utils-file.R b/R/utils-file.R
index ff2f228..f6b38a6 100644
--- a/R/utils-file.R
+++ b/R/utils-file.R
@@ -40,11 +40,10 @@ delete_cache <- function(path, ..., name = NULL) {
     }
 }
 
-cache_file <- function(
-    force, url, prefix,
-    ext = NULL, name, dir,
-    arg = rlang::caller_arg(url),
-    call = rlang::caller_env()) {
+cache_use_or_download <- function(force, url, prefix,
+                                  ext = NULL, name, dir, ...,
+                                  arg = rlang::caller_arg(url),
+                                  call = rlang::caller_env()) {
     if (!force) {
         pattern <- sprintf("^%s_\\d+-\\d+-\\d+", prefix)
         if (!is.null(ext)) {
@@ -54,41 +53,56 @@
         }
         file <- tryCatch(
             locate_files(dir, pattern, ignore.case = FALSE),
-            no_file = function(cnd) {
-                force <<- TRUE
-            }
+            no_file = function(cnd) force <<- TRUE
         )
     }
     if (force) {
-        if (is.null(url)) {
-            cli::cli_abort("You must provide {.arg {arg}}", call = call)
-        }
-        file <- cache_download(url, prefix = prefix, ext = ext, dir = dir)
-    } else {
-        date <- as.Date(
-            str_extract(basename(file), "\\d+-\\d+-\\d+"),
-            format = "%Y-%m-%d"
+        cache_download(
+            url = url,
+            prefix = prefix, ext = ext, dir = dir,
+            ...,
+            arg = arg, call = call
         )
-        if (length(file) > 1L) {
-            i <- order(date, decreasing = TRUE)[1L]
-            file <- file[i]
-            date <- date[i]
-        }
-        cli::cli_inform(c(
-            ">" = "Using {name} from cached {.file {file}}",
-            " " = "Snapshot date: {date}"
-        ))
+    } else {
+        cache_use(file, name = name)
     }
+}
+
+cache_use <- function(file, name) {
+    date <- as.Date(
+        str_extract(basename(file), "\\d+-\\d+-\\d+"),
+        format = "%Y-%m-%d"
+    )
+    if (length(file) > 1L) {
+        i <- order(date, decreasing = TRUE)[1L]
+        file <- file[i]
+        date <- date[i]
+    }
+    cli::cli_inform(c(
+        ">" = "Using {name} from cached {.file {file}}",
+        " " = "Snapshot date: {date}"
+    ))
     file
 }
 
-cache_download <- function(url, prefix, ext, dir, call = rlang::caller_env()) {
+cache_download <- function(url, prefix, ext, dir, method = NULL,
+                           ...,
+                           arg = rlang::caller_arg(url),
+                           call = rlang::caller_env()) {
+    if (is.null(url)) {
+        cli::cli_abort("You must provide {.arg {arg}}", call = call)
+    }
     assert_internet(call = call)
+    method <- match.arg(method, c("curl", "base"))
     file <- file.path(dir, paste(prefix, Sys.Date(), sep = "_"))
     if (!is.null(ext)) {
         file <- paste(file, ext, sep = ".")
     }
-    download_inform(url, file)
+    if (identical(method, "base")) {
+        base_download_inform(url, file)
+    } else {
+        download_inform(url, file, ...)
+    }
 }
 
 # name used: metadata, fdadrugs, rxnorm, athena
@@ -111,7 +125,8 @@ faers_user_cache_dir <- function(create = TRUE) {
 }
 
 # file or path utils function --------------
-dir_or_unzip <- function(path, compress_dir, pattern, none_msg, ignore.case = TRUE) {
+dir_or_unzip <- function(path, compress_dir, pattern, none_msg,
+                         ignore.case = TRUE) {
     if (dir.exists(path)) {
         return(path)
     } else if (file.exists(path)) {
@@ -138,14 +153,15 @@ zip2 <- function(zipfile, files, ..., root = getwd()) {
 #' Will always add the basename into the compress_dir
 #' @noRd
 unzip2 <- function(path, compress_dir, ignore.case = TRUE) {
-    compress_dir <- file.path(dir_create2(compress_dir), str_remove(
-        basename(path), "\\.zip$",
-        ignore.case = ignore.case
-    ))
-    if (is.null(utils::unzip(path, exdir = dir_create2(compress_dir), overwrite = TRUE))) {
+    compress_dir <- file.path(
+        dir_create2(compress_dir),
+        str_remove(basename(path), "\\.zip$", ignore.case = ignore.case)
+    )
+    exdir <- dir_create2(compress_dir)
+    if (is.null(utils::unzip(path, exdir = exdir, overwrite = TRUE))) {
         cli::cli_abort("Cannot uncompress {.file {path}}")
     }
-    compress_dir
+    exdir
 }
 
 locate_dir <- function(path, pattern = NULL, ignore.case = TRUE) {
diff --git a/man/faers_download.Rd b/man/faers_download.Rd
index e695a44..65f1500 100644
--- a/man/faers_download.Rd
+++ b/man/faers_download.Rd
@@ -19,7 +19,7 @@ Default: "ascii".}
 current working dir.}
 
 \item{...}{Extra handle options passed to each request
-\link[curl:handle]{new_handle}.}
+\link[curl:multi_download]{multi_download}.}
 }
 \value{
 An atomic character for the path of downloaded files.
diff --git a/man/fda_drugs.Rd b/man/fda_drugs.Rd
index 0576b80..a56a7ca 100644
--- a/man/fda_drugs.Rd
+++ b/man/fda_drugs.Rd
@@ -4,12 +4,16 @@
 \alias{fda_drugs}
 \title{Read and Parse Drugs@FDA data}
 \usage{
-fda_drugs(pattern = "Products", list = FALSE, force = FALSE)
+fda_drugs(pattern = "Products", url = NULL, list = FALSE, force = FALSE)
 }
 \arguments{
 \item{pattern}{File pattern to use. Must define a file exactly, you can set
 \code{list = TRUE} to see what files can be used.}
 
+\item{url}{A string of the url for the \code{Drugs@FDA} file. You can get the
+link from:
+\url{https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files}.}
+
 \item{list}{A boolean value, should it only list files in the \code{Drugs@FDA}
 dataset?}
 
@@ -29,6 +33,3 @@ Read and Parse Drugs@FDA data
 fda_drugs(list = TRUE)
 fda_drugs()
 }
-\seealso{
-\url{https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files}
-}
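
A minimal usage sketch of the interface this patch introduces (not part of the patch itself). It assumes a build with this patch applied; the media link shown is the fallback hard-coded in `fda_drugs_url()` and may move on the FDA site, and `useragent` is just one example of a curl handle option forwarded through `...`:

```r
library(faers)

# list the files shipped inside the Drugs@FDA zip archive
fda_drugs(list = TRUE)

# default behaviour: scrape the download link from the Drugs@FDA page,
# reusing a cached snapshot when one exists
products <- fda_drugs()

# pin the download link manually when scraping fails (e.g. the FDA page
# layout changed); `force = TRUE` bypasses the cached snapshot
products <- fda_drugs(
    url = "https://www.fda.gov/media/89850/download?attachment",
    force = TRUE
)

# faers_download() now forwards `...` to curl::multi_download(), so curl
# handle options can be passed straight through
faers_download(
    years = 2004, quarters = "q1",
    dir = tempdir(), useragent = "faers (R package)"
)
```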