diff --git a/R/dates_and_times.R b/R/dates_and_times.R index 416f237..257fa54 100644 --- a/R/dates_and_times.R +++ b/R/dates_and_times.R @@ -1,5 +1,5 @@ #' Fix UTC offset strings -#' +#' #' UTC offsets can be formatted in multiple ways (e.g. -07, -07:00, -0700) and R often struggles to parse these offsets. This function takes date/time strings with valid UTC offsets, and formats them so that they are consistent and readable by R. #' #' @param datetime_strings Character vector of dates in ISO 8601 format @@ -10,10 +10,10 @@ #' @examples #' datetimes <- c("2023-11-16T03:32:49+07:00","2023-11-16T03:32:49-07","2023-11-16T03:32:49","2023-11-16T03:32:49Z") #' fix_utc_offset(datetimes) # returns c("2023-11-16T03:32:49+0700", "2023-11-16T03:32:49-0700", "2023-11-16T03:32:49", "2023-11-16T03:32:49+0000") and warns about missing offset (see third element) -#' +#' fix_utc_offset <- function(datetime_strings) { - datetime_strings <- stringr::str_replace_all(datetime_strings, "[−‐‑‒–—―﹘﹣-]", "-") # replace every possible type of dash with a regular minus sign - + datetime_strings <- stringr::str_replace_all(datetime_strings, "[\u2212\u2010\u2011\u2012\u2013\u2014\u2015\ufe58\ufe63\uff0d]", "-") # replace every possible type of dash with a regular minus sign + # get UTC offset and format it as 4 digits with no special characters (e.g. 0700) new_offsets <- datetime_strings %>% stringr::str_extract("[Zz]|((?<=[+-])[0-9]{1,2}:?[0-9]{0,2})$") %>% @@ -26,16 +26,16 @@ fix_utc_offset <- function(datetime_strings) { if (any(new_offsets == "")) { warning("Date strings contain missing or invalid UTC offsets") } - + # remove old UTC offsets from date strings datetime_strings <- datetime_strings %>% stringr::str_remove("(?<=[+-])[0-9]{1,2}:?[0-9]{0,2}$") %>% stringr::str_replace("[Zz](?=$)", "+") - + # add new UTC offsets datetime_strings <- paste0(datetime_strings, new_offsets) %>% stringr::str_remove("[+-](?=$)") # Remove trailing + or - where invalid offsets were removed - + return(datetime_strings) } @@ -44,7 +44,7 @@ fix_utc_offset <- function(datetime_strings) { #' @details `convert_datetime_format()` is not a sophisticated function. If the EML format string is not valid, it will happily and without complaint return an R format string that will break your code. You have been warned. Note that UTC offset formats using a colon or only two digits will be parsed by this function, but if parsing datetime values from strings, you will also need to use `fix_utc_offset` to change the UTC offsets to the +/-hhhh format that R can read. #' #' @param eml_format_string A character vector of EML date/time format strings. This function understands the following codes: YYYY = four digit year, YY = two digit year, MMM = three letter month abbrev., MM = two digit month, DD = two digit day, hh or HH = 24 hour time, mm = minutes, ss or SS = seconds, +/-hhhh or +/-HHHH = UTC offset. -#' @param convert_z Should a "Z" at the end of the format string (indicating UTC) be replaced by a "%z"? Only set to `TRUE` if you plan to use `fix_utc_offset` to change "Z" in datetime strings to "+0000". +#' @param convert_z Should a "Z" at the end of the format string (indicating UTC) be replaced by a "%z"? Only set to `TRUE` if you plan to use `fix_utc_offset` to change "Z" in datetime strings to "+0000". #' #' @return A character vector of date/time format strings that can be parsed by `readr` or `strptime`. #' @export @@ -67,10 +67,10 @@ convert_datetime_format <- function(eml_format_string, convert_z = FALSE) { stringr::str_replace_all("D", "%d") %>% stringr::str_replace_all("[+-][Hh]{1,2}:?[Hh]{0,2}(?=$)", "%z") # Replace UTC offset format string (e.g. -hh, -hhhh, -hh:hh) with %z. Note that R seems to only parse UTC offsets when in the format +/-hhhh. #stringr::str_replace_all("T", " ") - + if (convert_z) { r_format_string <- stringr::str_replace(r_format_string, "Z(?=$)", "%z") } - + return(r_format_string) } \ No newline at end of file