Skip to content
/ oco2 Public

Aquisição e processamento dos dados de XCO2 (JPL Virtual Science Data Environment)

License

Notifications You must be signed in to change notification settings

arpanosso/oco2

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

81 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

MODELOS DE APRENDIZADO DE MÁQUINA PARA A EMISSÃO DE CO2 DO SOLO EM ÁREAS AGRÍCOLAS NO BRASIL

Panosso, A. R.; Costa, L. M.; Lima, L. R. ; Crispim, V. S.

Financiamento: CNPq (311981/2020-8); CNPq (301606/2017-0); Fapesp (2016/03861-5)

Resumo do Trabalho

Aquisição dos dados de CO2 atmosférico (xCO2)

A aquisição de dados de Xco2 e SIF, e seus processamentos iniciais pode ser encontrados no link:

Para facilitar o acesso, os dodos foram adquiridos por meio do pacote {fco2}.

## Instalando pacotes (se necessário)
# install.packages("devtools")
# Sys.getenv("GITHUB_PAT")
# Sys.unsetenv("GITHUB_PAT")
# Sys.getenv("GITHUB_PAT")
# devtools::install_github("arpanosso/fco2r")
library(readxl)
library(tidyverse)
library(geobr)
library(fco2r)
library(skimr)
library(tidymodels)
library(ISLR)
library(modeldata)
library(vip)
library(ggpubr)
source("R/my_fun.R")

# Definindo o plano de multisession
future::plan("multisession")

Carregando os dados meteorológicos

dados_estacao <- read_excel("data-raw/xlsx/estacao_meteorologia_ilha_solteira.xlsx", na = "NA") 
glimpse(dados_estacao)
#> Rows: 1,826
#> Columns: 16
#> $ data    <dttm> 2015-01-01, 2015-01-02, 2015-01-03, 2015-01-04, 2015-01-05, 2~
#> $ Tmed    <dbl> 30.5, 30.0, 26.8, 27.1, 27.0, 27.6, 30.2, 28.2, 28.5, 29.9, 30~
#> $ Tmax    <dbl> 36.5, 36.7, 35.7, 34.3, 33.2, 36.4, 37.2, 32.4, 37.1, 38.1, 38~
#> $ Tmin    <dbl> 24.6, 24.5, 22.9, 22.7, 22.3, 22.8, 22.7, 24.0, 23.0, 23.3, 24~
#> $ Umed    <dbl> 66.6, 70.4, 82.7, 76.8, 81.6, 75.5, 65.8, 70.0, 72.9, 67.6, 66~
#> $ Umax    <dbl> 89.6, 93.6, 99.7, 95.0, 98.3, 96.1, 99.2, 83.4, 90.7, 97.4, 90~
#> $ Umin    <dbl> 42.0, 44.2, 52.9, 43.8, 57.1, 47.5, 34.1, 57.4, 42.7, 38.3, 37~
#> $ PkPa    <dbl> 97.2, 97.3, 97.4, 97.5, 97.4, 97.5, 97.4, 97.4, 97.4, 97.4, 97~
#> $ Rad     <dbl> 23.6, 24.6, 20.2, 21.4, 17.8, 19.2, 27.0, 15.2, 21.6, 24.3, 24~
#> $ PAR     <dbl> 496.6, 513.3, 430.5, 454.0, 378.2, 405.4, 565.7, 317.2, 467.5,~
#> $ Eto     <dbl> 5.7, 5.8, 4.9, 5.1, 4.1, 4.8, 6.2, 4.1, 5.5, 5.7, 5.9, 6.1, 6.~
#> $ Velmax  <dbl> 6.1, 4.8, 12.1, 6.2, 5.1, 4.5, 4.6, 5.7, 5.8, 5.2, 5.2, 4.7, 6~
#> $ Velmin  <dbl> 1.0, 1.0, 1.2, 1.0, 0.8, 0.9, 0.9, 1.5, 1.2, 0.8, 0.8, 1.2, 1.~
#> $ Dir_vel <dbl> 17.4, 261.9, 222.0, 25.0, 56.9, 74.9, 53.4, 89.0, 144.8, 303.9~
#> $ chuva   <dbl> 0.0, 0.0, 3.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.~
#> $ inso    <dbl> 7.9, 8.7, 5.2, 6.2, 3.4, 4.5, 10.5, 1.3, 6.3, 8.4, 8.6, 7.9, 1~

Conhecendo a base de dados de CO2 atmosférico

help(oco2_br)
glimpse(fco2r::oco2_br)
#> Rows: 37,387
#> Columns: 18
#> $ longitude                                                     <dbl> -70.5, -~
#> $ longitude_bnds                                                <chr> "-71.0:-~
#> $ latitude                                                      <dbl> -5.5, -4~
#> $ latitude_bnds                                                 <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss                                           <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss                                      <chr> "2014090~
#> $ altitude_km                                                   <dbl> 3307.8, ~
#> $ alt_bnds_km                                                   <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 2.537127~
#> $ xco2_moles_mole_1                                             <dbl> 0.000394~
#> $ aerosol_total_aod                                             <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp                        <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1              <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp                        <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2                                                          <dbl> 387.2781~

Inicialmente devemos transformar os dados de concentração de CO2, variável xco2_moles_mole_1 para ppm em seguida devemos criar as variáveis de data a partir da variável time_yyyymmddhhmmss.

oco2<-oco2_br  %>% 
         mutate(
           xco2 = xco2_moles_mole_1*1e06,
           data = ymd_hms(time_yyyymmddhhmmss),
           ano = year(data),
           mes = month(data),
           dia = day(data),
           dia_semana = wday(data))

Existe uma tendência de aumento monotônica mundial da concentração de CO2 na atmosfera, assim, ela deve ser retirada para podermos observar as tendências regionais.

oco2  %>%  
  ggplot(aes(x=data,y=xco2)) +
  geom_point(color="blue") +
  geom_line(color="red")

Agora devemos retirar a tendência ao longo do tempo, para isso, dentro do período específico, faremos a retirada por meio de um ajuste linear:

oco2  %>%  
  mutate(x= 1:nrow(oco2))  %>%  
  ggplot(aes(x=data,y=xco2)) +
  geom_point(shape=21,color="black",fill="gray") +
  geom_smooth(method = "lm") +
  stat_regline_equation(ggplot2::aes(
  label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))

Extrair os coeficientes $\alpha$ e $\beta$ da análise de regressão linear $(y=\alpha+\beta x)$.

modelo_linear_tendencia <- lm(xco2~data,
          data = oco2)
coefs <- modelo_linear_tendencia$coefficients

Criando a variável xco2_est a partir da retirada da tendência.

oco2 |> 
  mutate(
    xco2_est = coefs[1] + coefs[2] * as.numeric(data),
    delta = xco2_est - xco2,
    XCO2 = (coefs[1]-delta) - (mean(xco2) - coefs[1])
  ) 
#> # A tibble: 37,387 x 26
#>    longitude longitude~1 latit~2 latit~3 time_~4 time_~5 altit~6 alt_b~7 fluor~8
#>        <dbl> <chr>         <dbl> <chr>     <dbl> <chr>     <dbl> <chr>     <dbl>
#>  1     -70.5 -71.0:-70.0    -5.5 -6.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.27e17
#>  2     -70.5 -71.0:-70.0    -4.5 -5.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.56e17
#>  3     -69.5 -70.0:-69.0   -10.5 -11.0:~ 2.01e13 201409~   3308. 0.0:66~ 7.17e17
#>  4     -69.5 -70.0:-69.0    -9.5 -10.0:~ 2.01e13 201409~   3308. 0.0:66~ 7.34e17
#>  5     -69.5 -70.0:-69.0    -8.5 -9.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.68e17
#>  6     -69.5 -70.0:-69.0    -7.5 -8.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.46e17
#>  7     -69.5 -70.0:-69.0    -6.5 -7.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.40e17
#>  8     -69.5 -70.0:-69.0    -5.5 -6.0:-~ 2.01e13 201409~   3308. 0.0:66~ 6.91e17
#>  9     -68.5 -69.0:-68.0   -10.5 -11.0:~ 2.01e13 201409~   3308. 0.0:66~ 6.99e17
#> 10     -46.5 -47.0:-46.0    -1.5 -2.0:-~ 2.01e13 201409~   3308. 0.0:66~ 7.74e17
#> # ... with 37,377 more rows, 17 more variables:
#> #   fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> #   xco2_moles_mole_1 <dbl>, aerosol_total_aod <dbl>,
#> #   fluorescence_offset_relative_771nm_idp <dbl>,
#> #   fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> #   fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> #   fluorescence_offset_relative_757nm_idp <dbl>, ...
glimpse(oco2)
#> Rows: 37,387
#> Columns: 24
#> $ longitude                                                     <dbl> -70.5, -~
#> $ longitude_bnds                                                <chr> "-71.0:-~
#> $ latitude                                                      <dbl> -5.5, -4~
#> $ latitude_bnds                                                 <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss                                           <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss                                      <chr> "2014090~
#> $ altitude_km                                                   <dbl> 3307.8, ~
#> $ alt_bnds_km                                                   <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 2.537127~
#> $ xco2_moles_mole_1                                             <dbl> 0.000394~
#> $ aerosol_total_aod                                             <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp                        <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1              <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp                        <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2                                                          <dbl> 387.2781~
#> $ xco2                                                          <dbl> 394.3686~
#> $ data                                                          <dttm> 2014-09~
#> $ ano                                                           <dbl> 2014, 20~
#> $ mes                                                           <dbl> 9, 9, 9,~
#> $ dia                                                           <int> 6, 6, 6,~
#> $ dia_semana                                                    <dbl> 7, 7, 7,~
oco2  %>%  
  ggplot(aes(x=data,y=XCO2)) +
  geom_point(shape=21,color="black",fill="gray") +
  geom_smooth(method = "lm") +
  stat_regline_equation(ggplot2::aes(
  label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))

Alguns gráficos

# oco2 %>% 
#   sample_n(1000) %>% 
#   ggplot(aes(x = longitude, y = latitude)) + 
#   geom_point(color = "blue")

Carregando o contorno do território

# br <- geobr::read_country(showProgress = FALSE)

Construindo o mapa com os pontos

# br %>% 
#   ggplot() +
#   geom_sf(fill = "white") +
#     geom_point(data=oco2 %>% 
#                  sample_n(1000),
#              aes(x=longitude,y=latitude),
#              shape=3,
#              col="red",
#              alpha=0.2)

Observe que utilizamos dplyr::sample_n() para retirar apenas $1000$ amostras do total do banco de dados $37387$.

Estatísticas descritivas

# skim(oco2_br)

Conhecendo a base de dados de emissão de CO2 do solo

# help(data_fco2)
glimpse(data_fco2)
#> Rows: 15,397
#> Columns: 39
#> $ experimento       <chr> "Espacial", "Espacial", "Espacial", "Espacial", "Esp~
#> $ data              <date> 2001-07-10, 2001-07-10, 2001-07-10, 2001-07-10, 200~
#> $ manejo            <chr> "convencional", "convencional", "convencional", "con~
#> $ tratamento        <chr> "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN"~
#> $ revolvimento_solo <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL~
#> $ data_preparo      <date> 2001-07-01, 2001-07-01, 2001-07-01, 2001-07-01, 200~
#> $ conversao         <date> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 197~
#> $ cobertura         <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE~
#> $ cultura           <chr> "milho_soja", "milho_soja", "milho_soja", "milho_soj~
#> $ x                 <dbl> 0, 40, 80, 10, 25, 40, 55, 70, 20, 40, 60, 10, 70, 3~
#> $ y                 <dbl> 0, 0, 0, 10, 10, 10, 10, 10, 20, 20, 20, 25, 25, 30,~
#> $ longitude_muni    <dbl> 782062.7, 782062.7, 782062.7, 782062.7, 782062.7, 78~
#> $ latitude_muni     <dbl> 7647674, 7647674, 7647674, 7647674, 7647674, 7647674~
#> $ estado            <chr> "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP"~
#> $ municipio         <chr> "Jaboticabal", "Jaboticabal", "Jaboticabal", "Jaboti~
#> $ ID                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
#> $ prof              <chr> "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1"~
#> $ FCO2              <dbl> 1.080, 0.825, 1.950, 0.534, 0.893, 0.840, 1.110, 1.8~
#> $ Ts                <dbl> 18.73, 18.40, 19.20, 18.28, 18.35, 18.47, 19.10, 18.~
#> $ Us                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ pH                <dbl> 5.1, 5.1, 5.8, 5.3, 5.5, 5.7, 5.6, 6.4, 5.3, 5.8, 5.~
#> $ MO                <dbl> 20, 24, 25, 23, 23, 21, 26, 23, 25, 24, 26, 20, 25, ~
#> $ P                 <dbl> 46, 26, 46, 78, 60, 46, 55, 92, 55, 60, 48, 71, 125,~
#> $ K                 <dbl> 2.4, 2.2, 5.3, 3.6, 3.4, 2.9, 4.0, 2.3, 3.3, 3.6, 4.~
#> $ Ca                <dbl> 25, 30, 41, 27, 33, 38, 35, 94, 29, 36, 37, 29, 50, ~
#> $ Mg                <dbl> 11, 11, 25, 11, 15, 20, 16, 65, 11, 17, 15, 11, 30, ~
#> $ H_Al              <dbl> 31, 31, 22, 28, 27, 22, 22, 12, 31, 28, 28, 31, 18, ~
#> $ SB                <dbl> 38.4, 43.2, 71.3, 41.6, 50.6, 60.9, 55.0, 161.3, 43.~
#> $ CTC               <dbl> 69.4, 74.2, 93.3, 69.6, 77.9, 82.9, 77.0, 173.3, 74.~
#> $ V                 <dbl> 55, 58, 76, 60, 65, 73, 71, 93, 58, 67, 67, 58, 82, ~
#> $ Ds                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ Macro             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ Micro             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ VTP               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ PLA               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ AT                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ SILTE             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ ARG               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ HLIFS             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~

Visualização de dados

data_fco2 %>% 
  group_by(experimento, cultura, data) %>% 
  summarise(FCO2 = mean(FCO2, na.rm=TRUE)) %>% 
  ggplot(aes(y=FCO2, x=data)) +
  geom_line() +
   facet_wrap(~experimento + cultura, scale="free")

Construindo o mapa com os pontos

# br %>% 
#   ggplot() +
#   geom_sf(fill = "white") +
#     geom_point(data=oco2 %>% sample_n(1000),
#              aes(x=longitude,y=latitude),
#              shape=3,
#              col="red",
#              alpha=0.2)

Observe que utilizamos dplyr::sample_n() para retirar apenas $1000$ amostras do total do banco de dados $146,646$.

Estatísticas descritivas

# skim(data_fco2)
visdat::vis_miss(data_fco2 %>% 
                   sample_n(15000))

Estatísticas descritivas

# skim(dados_estacao)
dados_estacao <- dados_estacao %>% 
                   drop_na()
visdat::vis_miss(dados_estacao)

# Lista do xCO2
# 01 passar as datas que estão em ano-mes-dia-horas-min-segundos
# para uma outra coluna denominada 'data' como ano-mes-dia
# Fazer em pipeline, usar o mutate do pacote dplyr e provavelmente
# a funçoes do pacote lubridate
oco2 <- oco2  %>% 
  mutate (
    ano = time_yyyymmddhhmmss%/%1e10,
    mês = time_yyyymmddhhmmss%/%1e8 %%100,
    dia = time_yyyymmddhhmmss%/%1e6 %%100,
    data = as.Date(stringr::str_c(ano,mês,dia,sep="-"))
  ) %>% 
  glimpse()
#> Rows: 37,387
#> Columns: 25
#> $ longitude                                                     <dbl> -70.5, -~
#> $ longitude_bnds                                                <chr> "-71.0:-~
#> $ latitude                                                      <dbl> -5.5, -4~
#> $ latitude_bnds                                                 <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss                                           <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss                                      <chr> "2014090~
#> $ altitude_km                                                   <dbl> 3307.8, ~
#> $ alt_bnds_km                                                   <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 2.537127~
#> $ xco2_moles_mole_1                                             <dbl> 0.000394~
#> $ aerosol_total_aod                                             <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp                        <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1              <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1        <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp                        <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2                                                          <dbl> 387.2781~
#> $ xco2                                                          <dbl> 394.3686~
#> $ data                                                          <date> 2014-09~
#> $ ano                                                           <dbl> 2014, 20~
#> $ mes                                                           <dbl> 9, 9, 9,~
#> $ dia                                                           <dbl> 6, 6, 6,~
#> $ dia_semana                                                    <dbl> 7, 7, 7,~
#> $ mês                                                           <dbl> 9, 9, 9,~
dados_estacao <- dados_estacao %>% 
  mutate(
    ano = lubridate::year(data),
    mês = lubridate::month(data),
    dia = lubridate::day(data),
    data = as.Date(stringr::str_c(ano,mês,dia,sep="-"))
)

Manipulação dos bancos de dados Fco2 e de estação.

# atributos <- data_fco2
atributos <- left_join(data_fco2, dados_estacao, by = "data")

Listando as datas em ambos os bancos de dados

# Lista das datas de FCO2 
lista_data_fco2 <- unique(atributos$data)
lista_data_oco2 <- unique(oco2$data)
lista_data_estacao <- unique(dados_estacao$data)
datas_fco2 <- paste0(lubridate::year(lista_data_fco2),"-",lubridate::month(lista_data_fco2)) %>% unique()

datas_oco2 <- paste0(lubridate::year(lista_data_oco2),"-",lubridate::month(lista_data_oco2)) %>% unique()
datas <- datas_fco2[datas_fco2 %in% datas_oco2]

Criação as listas de datas, que é chave para a mesclagem dos arquivos.

fco2 <- atributos %>% 
  mutate(ano_mes = paste0(lubridate::year(data),"-",lubridate::month(data))) %>% 
  dplyr::filter(ano_mes %in% datas)

xco2 <- oco2 %>%   
  mutate(ano_mes=paste0(ano,"-",mês)) %>% 
  dplyr::filter(ano_mes %in% datas)

Coordenadas das cidades

unique(xco2$ano_mes)[unique(xco2$ano_mes) %>% order()] == 
unique(fco2$ano_mes)[unique(fco2$ano_mes) %>% order()]
#>  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#> [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Abordagem usando o join do {dplyr}

memory.limit(size=10001)
#> [1] 10001
data_set <- left_join(fco2 %>% 
            mutate(ano = lubridate::year(data),
                   mes = lubridate::month(data)
                   ) %>% 
            select(ID, data, cultura, ano, mes, x,y, FCO2, Ts,
                   Us, MO, Macro, VTP, ARG, ano_mes,Tmed,Tmax, Tmin, Umed,
                   Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
                   chuva, inso), 
          xco2 %>% 
            select(data,mês,dia,longitude,latitude,XCO2,fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1,fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1, ano_mes), by = "ano_mes") %>% 
  mutate(dist = sqrt((longitude-(-51.423519))^2+(latitude-(-20.362911))^2),
         SIF = (fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1*2.6250912*10^(-19)  + 1.5*fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1* 2.57743*10^(-19))/2)

data_set<-data_set %>%
  select(-fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1, -fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 )  %>% 
  filter(dist <= .16, FCO2 <= 20 ) 

visdat::vis_miss(data_set %>% 
                   sample_n(2000)
                 )

# head(data_set)
# fco2$ano_mes %>% unique()
# xco2$ano_mes %>% unique()
# data_set$ano_mes %>% unique()
tab_medias <- data_set %>% 
  # mutate(SIF = ifelse(SIF <=0, mean(data_set$SIF, na.rm=TRUE),SIF)) %>% 
  group_by(ano_mes, cultura) %>% 
  summarise(FCO2 = mean(FCO2, na.rm=TRUE),
            XCO2 = mean(XCO2, na.rm=TRUE),
            SIF = mean(SIF, na.rm=TRUE))

tab_medias %>% 
  ggplot(aes(x=XCO2, y=SIF)) +
  geom_point()+
  geom_smooth(method = "lm")+
  theme_bw()

tab_medias %>% 
  ggplot(aes(x=XCO2, y=FCO2)) +
  geom_point()+
  geom_smooth(method = "lm")+
  theme_bw()

tab_medias %>% 
  ggplot(aes(x=FCO2, y=SIF)) +
  geom_point()+
  geom_smooth(method = "lm") +
  theme_bw()

Estatística Descritiva

Completar posteriormente.

Abordagem de Parendizado de Máquina

Definindo a base de treino e a base de teste

Definindo a semente aleatória mais o conjunto de dados para teste e treino dos modelos

data_set_ml <- data_set #%>%
#   select(cultura, FCO2, Ts,
#                    Us, MO, Tmed,Tmax, Tmin, Umed,
#                    Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
#                    chuva, inso, SIF, xco2) %>% 
#   drop_na(FCO2, Ts,Us,Tmed:inso)
# visdat::vis_miss(data_set_ml)
# set.seed(1235)
fco2_initial_split <- initial_split(data_set_ml, prop = 0.75)
fco2_train <- training(fco2_initial_split)
# fco2_test <- testing(fco2_initial_split)
# visdat::vis_miss(fco2_test)
fco2_train  %>% 
  ggplot(aes(x=FCO2, y=..density..))+
  geom_histogram(bins = 30, color="black",  fill="lightgray")+
  geom_density(alpha=.05,fill="red")+
  theme_bw() +
  labs(x="FCO2", y = "Densidade")

fco2_train  %>% 
  ggplot(aes(x=SIF, y=..density..))+
  geom_histogram(bins = 11, color="black",  fill="lightgray")+
  geom_density(alpha=.05,fill="green")+
  theme_bw() +
  labs(x="SIF", y = "Densidade")

fco2_train  %>% 
  ggplot(aes(x=XCO2, y=..density..))+
  geom_histogram(bins = 15, color="black",  fill="lightgray")+
  geom_density(alpha=.05,fill="blue")+
  theme_bw() +
  labs(x="XCO2", y = "Densidade")

Correlação

glimpse(fco2_train)
#> Rows: 6,103
#> Columns: 37
#> $ ID        <int> 31, 35, 21, 3, 21, 76, 31, 35, 12, 45, 56, 49, 3, 21, 16, 19~
#> $ data.x    <date> 2018-07-04, 2018-05-25, 2019-07-07, 2017-03-17, 2017-03-17,~
#> $ cultura   <chr> "silvipastoril", "pasto", "pasto", "silvipastoril", "silvipa~
#> $ ano       <dbl> 2018, 2018, 2019, 2017, 2017, 2017, 2018, 2017, 2018, 2018, ~
#> $ mes       <dbl> 7, 5, 7, 3, 3, 6, 5, 6, 7, 6, 6, 10, 7, 7, 6, 2, 6, 9, 6, 2,~
#> $ x         <dbl> 7749399, 7747950, 7747952, 40, 100, 105, 7747956, 60, 774945~
#> $ y         <dbl> 457163.2, 456884.4, 456878.7, 0.0, 10.0, 40.0, 456862.8, 20.~
#> $ FCO2      <dbl> 1.06, 3.19, 1.23, 7.41, 5.36, 3.17, 1.16, 2.97, 1.13, 1.72, ~
#> $ Ts        <dbl> 23.20000, 24.20000, 12.10000, 23.17000, 23.17000, 20.70000, ~
#> $ Us        <dbl> 11.11000, 11.40673, 10.70000, 30.08400, 30.29000, 11.00000, ~
#> $ MO        <dbl> 4, 12, 20, 29, 32, 24, 13, 26, 28, 29, 8, 2, 13, 9, 37, 29, ~
#> $ Macro     <dbl> 11.45000000, 0.28000000, 0.02000000, NA, 10.94000000, NA, 0.~
#> $ VTP       <dbl> 57.11000, 0.44000, 0.37000, NA, 49.77000, NA, 0.35000, NA, 5~
#> $ ARG       <dbl> NA, 137.6400, 91.6200, NA, NA, NA, 112.1100, NA, 626.9800, 5~
#> $ ano_mes   <chr> "2018-7", "2018-5", "2019-7", "2017-3", "2017-3", "2017-6", ~
#> $ Tmed      <dbl> 23.9, 23.0, 13.1, 25.8, 25.8, 22.0, 23.0, 23.7, 15.8, 25.3, ~
#> $ Tmax      <dbl> 33.4, 31.3, 22.2, 33.6, 33.6, 30.0, 31.3, 31.5, 21.8, 33.9, ~
#> $ Tmin      <dbl> 16.0, 15.5, 5.1, 22.2, 22.2, 15.4, 15.5, 17.3, 11.9, 17.8, 1~
#> $ Umed      <dbl> 60.1, 67.3, 61.8, 86.5, 86.5, 74.7, 67.3, 67.7, 66.4, 57.1, ~
#> $ Umax      <dbl> 98.6, 90.8, 90.5, 99.9, 99.9, 90.8, 90.8, 94.8, 85.4, 93.3, ~
#> $ Umin      <dbl> 27.7, 43.8, 32.3, 50.3, 50.3, 54.7, 43.8, 38.8, 46.5, 27.2, ~
#> $ PkPa      <dbl> 97.6, 97.9, 98.4, 97.3, 97.3, 97.8, 97.9, 97.6, 98.3, 97.6, ~
#> $ Rad       <dbl> 12.5, 12.7, 11.8, 14.9, 14.9, 13.8, 12.7, 13.7, 13.2, 12.5, ~
#> $ Eto       <dbl> 2.8, 2.9, 2.3, 3.5, 3.5, 2.8, 2.9, 3.1, 2.8, 3.0, 3.2, 4.9, ~
#> $ Velmax    <dbl> 4.3, 5.5, 5.9, 5.2, 5.2, 4.6, 5.5, 5.2, 6.4, 4.7, 5.9, 7.1, ~
#> $ Velmin    <dbl> 0.8, 1.1, 1.1, 0.7, 0.7, 1.2, 1.1, 1.2, 2.1, 1.0, 1.6, 1.0, ~
#> $ Dir_vel   <dbl> 85.2, 107.2, 129.1, 264.9, 264.9, 93.0, 107.2, 72.5, 235.2, ~
#> $ chuva     <dbl> 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
#> $ inso      <dbl> 5.5, 5.3, 4.7, 3.6, 3.6, 8.2, 5.3, 8.0, 6.1, 6.8, 8.0, 6.0, ~
#> $ data.y    <date> 2018-07-09, 2018-05-06, 2019-07-28, 2017-03-16, 2017-03-16,~
#> $ mês       <dbl> 7, 5, 7, 3, 3, 6, 5, 6, 7, 6, 6, 10, 7, 7, 6, 2, 6, 9, 6, 2,~
#> $ dia       <dbl> 9, 6, 28, 16, 16, 29, 15, 29, 9, 16, 29, 9, 28, 28, 29, 26, ~
#> $ longitude <dbl> -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.~
#> $ latitude  <dbl> -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.~
#> $ XCO2      <dbl> 381.6135, 385.5825, 387.1356, 384.1294, 384.1294, 386.6562, ~
#> $ dist      <dbl> 0.1569801, 0.1569801, 0.1569801, 0.1569801, 0.1569801, 0.156~
#> $ SIF       <dbl> 0.31308794, 0.12510317, 0.30243534, 0.66028494, 0.66028494, ~
fco2_train   %>%    select(-c(ID,ano,mes,x,y,latitude,longitude,dist,mês,dia)) %>% 
  select(where(is.numeric)) %>%
  drop_na() %>% 
  cor()  %>%  
  corrplot::corrplot()

Data-prep

fco2_recipe <- recipe(FCO2 ~ ., data = fco2_train %>% 
                        select(-c(data.x,data.y,ID,ano,mes,x,y,latitude,longitude,dist,mês,dia,ano_mes))
) %>%  
  step_normalize(all_numeric_predictors())  %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>%
  #step_naomit(c(Ts, Us)) %>% 
  #step_impute_mean(c(Us,Ts)) %>% 
  #step_poly(c(Us,Ts), degree = 2)  %>%  
  step_dummy(all_nominal_predictors())
bake(prep(fco2_recipe), new_data = NULL)
#> # A tibble: 6,103 x 29
#>         Ts     Us     MO  Macro    VTP    ARG     Tmed    Tmax    Tmin   Umed
#>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>
#>  1  0.0874 -0.380 -1.80   1.15   1.04  NA     -0.00687  0.422  -0.430  -1.03 
#>  2  0.260  -0.340 -1.01  -0.718 -1.56  -0.956 -0.210   -0.0192 -0.541  -0.286
#>  3 -1.83   -0.437 -0.215 -0.762 -1.57  -1.20  -2.45    -1.93   -2.86   -0.856
#>  4  0.0822  2.23   0.678 NA     NA     NA      0.422    0.465   0.951   1.71 
#>  5  0.0822  2.26   0.976  1.06   0.707 NA      0.422    0.465   0.951   1.71 
#>  6 -0.344  -0.395  0.182 NA     NA     NA     -0.436   -0.293  -0.563   0.483
#>  7  0.0874 -0.178 -0.909 -0.732 -1.57  -1.09  -0.210   -0.0192 -0.541  -0.286
#>  8 -0.982   1.10   0.381 NA     NA     NA     -0.0521   0.0229 -0.140  -0.244
#>  9 -1.09    0.921  0.579 -0.141  0.776  1.59  -1.84    -2.02   -1.34   -0.379
#> 10 -0.395   0.274  0.678  0.579  0.862  1.41   0.309    0.528  -0.0288 -1.34 
#> # ... with 6,093 more rows, and 19 more variables: Umax <dbl>, Umin <dbl>,
#> #   PkPa <dbl>, Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>,
#> #   Dir_vel <dbl>, chuva <dbl>, inso <dbl>, XCO2 <dbl>, SIF <dbl>, FCO2 <dbl>,
#> #   cultura_eucalipto <dbl>, cultura_mata.ciliar <dbl>, cultura_pasto <dbl>,
#> #   cultura_pinus <dbl>, cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_recipe), new_data = NULL))

## TUNAGEM

fco2_resamples <- vfold_cv(fco2_train, v = 10)
grid <- grid_regular(
  penalty(range = c(-8, 0)),
  levels = 20
)

ÁRVORE DE DECISÃO (decision tree - dt)

fco2_dt_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
)  %>%  
  set_mode("regression")  %>%  
  set_engine("rpart")

Workflow

fco2_dt_wf <- workflow()   %>%  
  add_model(fco2_dt_model) %>% 
  add_recipe(fco2_recipe)

Criando a matriz (grid) com os valores de hiperparâmetros a serem testados

grid_dt <- grid_random(
  cost_complexity(c(-6, -4)),
  tree_depth(range = c(8, 18)),
  min_n(range = c(42, 52)),
  size = 2
)
glimpse(grid_dt)
#> Rows: 2
#> Columns: 3
#> $ cost_complexity <dbl> 1.921168e-06, 3.382433e-06
#> $ tree_depth      <int> 17, 15
#> $ min_n           <int> 52, 43
fco2_dt_tune_grid <- tune_grid(
  fco2_dt_wf,
  resamples = fco2_resamples,
  grid = grid_dt,
  metrics = metric_set(rmse)
)
autoplot(fco2_dt_tune_grid)

collect_metrics(fco2_dt_tune_grid)
#> # A tibble: 2 x 9
#>   cost_complexity tree_depth min_n .metric .estima~1  mean     n std_err .config
#>             <dbl>      <int> <int> <chr>   <chr>     <dbl> <int>   <dbl> <chr>  
#> 1      0.00000192         17    52 rmse    standard   1.25    10  0.0292 Prepro~
#> 2      0.00000338         15    43 rmse    standard   1.24    10  0.0314 Prepro~
#> # ... with abbreviated variable name 1: .estimator

Desempenho dos modelos finais

fco2_dt_best_params <- select_best(fco2_dt_tune_grid, "rmse")
fco2_dt_wf <- fco2_dt_wf %>% finalize_workflow(fco2_dt_best_params)
fco2_dt_last_fit <- last_fit(fco2_dt_wf, fco2_initial_split)

Criar os preditos

fco2_test_preds <- bind_rows(
  collect_predictions(fco2_dt_last_fit)  %>%   mutate(modelo = "dt")
)

fco2_test <- testing(fco2_initial_split)
visdat::vis_miss(fco2_test)

fco2_test_preds %>% 
  ggplot(aes(x=.pred, y=FCO2)) +
  geom_point()+
  theme_bw() +
  geom_smooth(method = "lm") +
  stat_regline_equation(ggplot2::aes(
  label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~"))) 

Variáveis importantes

fco2_dt_last_fit_model <-fco2_dt_last_fit$.workflow[[1]]$fit$fit
vip(fco2_dt_last_fit_model)

Métricas

da <- fco2_test_preds %>% 
  filter(FCO2 > 0, .pred>0 )

my_rmse <- Metrics::rmse(da$FCO2,
                         da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100

fco2_test_preds %>% 
  ggplot(aes(x=FCO2,y=.pred))+
  geom_point()+
  geom_smooth(method = "lm")+
  stat_regline_equation(ggplot2::aes(
    label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
  ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
                                                      ,round(my_mape,2),'%'),size=5)+
  theme_bw()

Random Forest (rf)

Corrigindo os NAs no teste.

visdat::vis_miss(fco2_test)

data_set_ml <- data_set_ml %>%
  select(cultura, FCO2, Ts, XCO2, SIF, 
                   Us, MO, Tmed,Tmax, Tmin, Umed,
                   Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
                   chuva, inso) %>%
  drop_na(FCO2, Ts,Us,Tmed:inso)
visdat::vis_miss(data_set_ml)

fco2_initial_split <- initial_split(data_set_ml, prop = 0.75)
fco2_test <- testing(fco2_initial_split)
visdat::vis_miss(fco2_test)

fco2_train <- training(fco2_initial_split)
visdat::vis_miss(fco2_train)

fco2_resamples_rf <- vfold_cv(fco2_train, v = 5)

Data prep

fco2_rf_recipe <- recipe(FCO2 ~ ., data = fco2_train) %>% 
  step_string2factor(all_nominal(), skip = TRUE) %>% 
  step_normalize(all_numeric_predictors())  %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>%
  # step_naomit(all_predictors()) #%>% 
  # step_impute_mean(c(Ts,Us)) %>% 
  # step_poly(c(Ts, Us), degree = 2)  %>%  
  step_dummy(all_nominal_predictors())
bake(prep(fco2_rf_recipe), new_data = NULL)
#> # A tibble: 5,459 x 26
#>          Ts   XCO2    SIF      Us     MO   Tmed    Tmax   Tmin    Umed    Umax
#>       <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
#>  1 -0.193    0.114  0.107 -0.181   0.673 -0.429 -0.279  -0.564  0.466  -0.297 
#>  2  1.10    -0.832  0.474  0.489   0.769  0.429  0.288   1.04   2.19    1.03  
#>  3  0.647   -0.743  1.20  -0.987   0.577  0.948  0.666   1.08  -0.0100  0.0922
#>  4 -0.00815  1.27  -1.26  -0.0801 -0.767 -0.181 -0.0479 -0.319 -0.941  -0.786 
#>  5 -0.479    1.27  -1.26   0.361   0.481 -0.542 -0.258  -0.742  0.621   1.03  
#>  6  0.949    0.734  1.53  -0.289  -0.575  0.632  0.898   0.840  1.41    1.03  
#>  7  0.513   -0.177 -0.533 -0.569  -0.671  0.158 -0.0899  0.350 -1.37   -2.72  
#>  8  0.513    1.27  -1.26   0.324  -1.73  -0.632 -0.720  -0.453  1.10    0.625 
#>  9  0.882   -0.177 -0.533 -0.915  -0.767  0.158 -0.0899  0.350 -1.37   -2.72  
#> 10 -0.0250  -1.97  -0.180  1.78   -1.44  -1.40  -2.15   -0.698  1.30    0.553 
#> # ... with 5,449 more rows, and 16 more variables: Umin <dbl>, PkPa <dbl>,
#> #   Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>, Dir_vel <dbl>,
#> #   chuva <dbl>, inso <dbl>, FCO2 <dbl>, cultura_eucalipto <dbl>,
#> #   cultura_mata.ciliar <dbl>, cultura_pasto <dbl>, cultura_pinus <dbl>,
#> #   cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_rf_recipe), new_data = NULL))

Modelo

fco2_rf_model <- rand_forest(
  min_n = tune(),
  mtry = tune(),
  trees = tune()
)   %>%  
  set_mode("regression")  %>% 
  set_engine("randomForest")

Workflow

fco2_rf_wf <- workflow()   %>%  
  add_model(fco2_rf_model) %>%  
  add_recipe(fco2_rf_recipe)

Tune

mtry trees min_n .config 10 769 21 Preprocessor1_Model39

grid_rf <- grid_random(
  min_n(range = c(20, 30)),
  mtry(range = c(10,20)),
  trees(range = c(769,1500) ),
  size = 20
)
fco2_rf_tune_grid <- tune_grid(
 fco2_rf_wf,
  resamples = fco2_resamples_rf,
  grid = grid_rf,
  metrics = metric_set(rmse)
) 
autoplot(fco2_rf_tune_grid)

collect_metrics(fco2_rf_tune_grid)
#> # A tibble: 20 x 9
#>     mtry trees min_n .metric .estimator  mean     n std_err .config             
#>    <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#>  1    12  1452    20 rmse    standard    1.14     5  0.0297 Preprocessor1_Model~
#>  2    20  1372    21 rmse    standard    1.13     5  0.0317 Preprocessor1_Model~
#>  3    15  1255    24 rmse    standard    1.14     5  0.0305 Preprocessor1_Model~
#>  4    18  1488    26 rmse    standard    1.14     5  0.0302 Preprocessor1_Model~
#>  5    18  1406    26 rmse    standard    1.14     5  0.0307 Preprocessor1_Model~
#>  6    11  1281    22 rmse    standard    1.15     5  0.0291 Preprocessor1_Model~
#>  7    14  1443    25 rmse    standard    1.14     5  0.0296 Preprocessor1_Model~
#>  8    17  1045    22 rmse    standard    1.14     5  0.0303 Preprocessor1_Model~
#>  9    12   929    25 rmse    standard    1.15     5  0.0291 Preprocessor1_Model~
#> 10    19  1110    20 rmse    standard    1.13     5  0.0319 Preprocessor1_Model~
#> 11    20   899    25 rmse    standard    1.14     5  0.0305 Preprocessor1_Model~
#> 12    18  1137    28 rmse    standard    1.14     5  0.0303 Preprocessor1_Model~
#> 13    17   797    22 rmse    standard    1.14     5  0.0312 Preprocessor1_Model~
#> 14    14   787    27 rmse    standard    1.14     5  0.0292 Preprocessor1_Model~
#> 15    15   876    25 rmse    standard    1.14     5  0.0302 Preprocessor1_Model~
#> 16    20  1077    26 rmse    standard    1.14     5  0.0306 Preprocessor1_Model~
#> 17    14  1220    27 rmse    standard    1.14     5  0.0297 Preprocessor1_Model~
#> 18    16  1010    20 rmse    standard    1.14     5  0.0315 Preprocessor1_Model~
#> 19    20  1128    22 rmse    standard    1.14     5  0.0315 Preprocessor1_Model~
#> 20    16  1214    30 rmse    standard    1.14     5  0.0298 Preprocessor1_Model~

Desempenho dos modelos finais

fco2_rf_best_params <- select_best(fco2_rf_tune_grid, "rmse")
fco2_rf_wf <- fco2_rf_wf %>% finalize_workflow(fco2_rf_best_params)
fco2_rf_last_fit <- last_fit(fco2_rf_wf, fco2_initial_split)

Criar os preditos

fco2_test_preds <- bind_rows(
  collect_predictions(fco2_rf_last_fit)  %>%   mutate(modelo = "rf")
)
fco2_test_preds %>% 
  ggplot(aes(x=.pred, y=FCO2)) +
  geom_point()+
  theme_bw() +
  geom_smooth(method = "lm") +
  stat_regline_equation(ggplot2::aes(
  label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~"))) 

Variáveis importantes

fco2_rf_last_fit_model <-fco2_rf_last_fit$.workflow[[1]]$fit$fit
vip(fco2_rf_last_fit_model)

Métricas

da <- fco2_test_preds %>% 
  filter(FCO2 > 0, .pred> 0 )

my_rmse <- Metrics::rmse(da$FCO2,
                         da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100

fco2_test_preds %>% 
  ggplot(aes(x=FCO2,y=.pred))+
  geom_point()+
  geom_smooth(method = "lm")+
  stat_regline_equation(ggplot2::aes(
    label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
  ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
                                                      ,round(my_mape,2),'%'),size=5)+
  theme_bw()

Boosting gradient tree (xgb)

Data prep

fco2_xgb_recipe <- recipe(FCO2 ~ ., data = fco2_train) %>% 
  step_string2factor(all_nominal(), skip = TRUE) %>% 
  step_normalize(all_numeric_predictors())  %>% 
  step_novel(all_nominal_predictors()) %>% 
  # step_zv(all_predictors()) %>%
  # step_naomit(all_predictors()) #%>% 
  #step_poly(c(Ts, Us), degree = 3)  %>%  
  step_dummy(all_nominal_predictors())
bake(prep(fco2_xgb_recipe), new_data = NULL)
#> # A tibble: 5,459 x 26
#>          Ts   XCO2    SIF      Us     MO   Tmed    Tmax   Tmin    Umed    Umax
#>       <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
#>  1 -0.193    0.114  0.107 -0.181   0.673 -0.429 -0.279  -0.564  0.466  -0.297 
#>  2  1.10    -0.832  0.474  0.489   0.769  0.429  0.288   1.04   2.19    1.03  
#>  3  0.647   -0.743  1.20  -0.987   0.577  0.948  0.666   1.08  -0.0100  0.0922
#>  4 -0.00815  1.27  -1.26  -0.0801 -0.767 -0.181 -0.0479 -0.319 -0.941  -0.786 
#>  5 -0.479    1.27  -1.26   0.361   0.481 -0.542 -0.258  -0.742  0.621   1.03  
#>  6  0.949    0.734  1.53  -0.289  -0.575  0.632  0.898   0.840  1.41    1.03  
#>  7  0.513   -0.177 -0.533 -0.569  -0.671  0.158 -0.0899  0.350 -1.37   -2.72  
#>  8  0.513    1.27  -1.26   0.324  -1.73  -0.632 -0.720  -0.453  1.10    0.625 
#>  9  0.882   -0.177 -0.533 -0.915  -0.767  0.158 -0.0899  0.350 -1.37   -2.72  
#> 10 -0.0250  -1.97  -0.180  1.78   -1.44  -1.40  -2.15   -0.698  1.30    0.553 
#> # ... with 5,449 more rows, and 16 more variables: Umin <dbl>, PkPa <dbl>,
#> #   Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>, Dir_vel <dbl>,
#> #   chuva <dbl>, inso <dbl>, FCO2 <dbl>, cultura_eucalipto <dbl>,
#> #   cultura_mata.ciliar <dbl>, cultura_pasto <dbl>, cultura_pinus <dbl>,
#> #   cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_xgb_recipe), new_data = NULL))

Estratégia de Tunagem de Hiperparâmetros

Passo 1:

Achar uma combinação learning_rate e trees que funciona relativamente bem.

  • learn_rate - 0.05, 0.1, 0.3

  • trees - 100, 500, 1000, 1500

Modelo

cores = 4
fco2_xgb_model <- boost_tree(
  mtry = 0.8, 
  trees = tune(), # <---------------
  min_n = 5, 
  tree_depth = 4,
  loss_reduction = 0, # lambda
  learn_rate = tune(), # epsilon
  sample_size = 0.8
)  %>%   
  set_mode("regression")  %>% 
  set_engine("xgboost", nthread = cores, counts = FALSE)

Workflow

fco2_xgb_wf <- workflow()  %>%  
  add_model(fco2_xgb_model) %>%  
  add_recipe(fco2_xgb_recipe)

Tune

grid_xgb <- expand.grid(
  learn_rate =  c(0.05, 0.3),
  trees = c(2, 250, 500)
)
fco2_xgb_tune_grid <- tune_grid(
 fco2_xgb_wf,
  resamples = fco2_resamples,
  grid = grid_xgb,
  metrics = metric_set(rmse)
)
autoplot(fco2_xgb_tune_grid)

fco2_xgb_tune_grid   %>%   show_best(metric = "rmse", n = 6)
#> # A tibble: 6 x 8
#>   trees learn_rate .metric .estimator  mean     n std_err .config             
#>   <dbl>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1   500       0.05 rmse    standard    1.16    10  0.0292 Preprocessor1_Model3
#> 2   250       0.05 rmse    standard    1.17    10  0.0285 Preprocessor1_Model2
#> 3   250       0.3  rmse    standard    1.18    10  0.0309 Preprocessor1_Model5
#> 4   500       0.3  rmse    standard    1.19    10  0.0312 Preprocessor1_Model6
#> 5     2       0.3  rmse    standard    2.03    10  0.0365 Preprocessor1_Model4
#> 6     2       0.05 rmse    standard    3.11    10  0.0400 Preprocessor1_Model1
fco2_xgb_select_best_passo1 <- fco2_xgb_tune_grid %>% 
  select_best(metric = "rmse")
fco2_xgb_select_best_passo1
#> # A tibble: 1 x 3
#>   trees learn_rate .config             
#>   <dbl>      <dbl> <chr>               
#> 1   500       0.05 Preprocessor1_Model3

Passo 2:

São bons valores inciais. Agora, podemos tunar os parâmetros relacionados à árvore.

  • tree_depth: vamos deixar ele variar entre 3 e 10.
  • min_n: vamos deixar variar entre 5 e 90.
fco2_xgb_model <- boost_tree(
  mtry = 0.8,
  trees = fco2_xgb_select_best_passo1$trees,
  min_n = tune(),
  tree_depth = tune(), 
  loss_reduction = 0, 
  learn_rate = fco2_xgb_select_best_passo1$learn_rate, 
  sample_size = 0.8
) %>% 
  set_mode("regression")  %>% 
  set_engine("xgboost", nthread = cores, counts = FALSE)

#### Workflow
fco2_xgb_wf <- workflow() %>%  
    add_model(fco2_xgb_model)   %>%   
    add_recipe(fco2_xgb_recipe)

#### Grid
fco2_xgb_grid <- expand.grid(
  tree_depth = c(1, 3, 4), 
  min_n = c(5, 30, 60)
)

fco2_xgb_tune_grid <- fco2_xgb_wf   %>%   
  tune_grid(
    resamples =fco2_resamples,
    grid = fco2_xgb_grid,
    control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
    metrics = metric_set(rmse)
  )

#### Melhores hiperparâmetros
autoplot(fco2_xgb_tune_grid)

fco2_xgb_tune_grid  %>%   show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 8
#>   min_n tree_depth .metric .estimator  mean     n std_err .config             
#>   <dbl>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1     5          4 rmse    standard    1.17    10  0.0285 Preprocessor1_Model3
#> 2    30          4 rmse    standard    1.18    10  0.0297 Preprocessor1_Model6
#> 3     5          3 rmse    standard    1.18    10  0.0290 Preprocessor1_Model2
#> 4    60          4 rmse    standard    1.19    10  0.0275 Preprocessor1_Model9
#> 5    30          3 rmse    standard    1.20    10  0.0288 Preprocessor1_Model5
fco2_xgb_select_best_passo2 <- fco2_xgb_tune_grid  %>%   select_best(metric = "rmse")
fco2_xgb_select_best_passo2
#> # A tibble: 1 x 3
#>   min_n tree_depth .config             
#>   <dbl>      <dbl> <chr>               
#> 1     5          4 Preprocessor1_Model3

Passo 3

Vamos tunar o loss_reduction

fco2_xgb_model <- boost_tree(
  mtry = 0.8,
  trees = fco2_xgb_select_best_passo1$trees,
  min_n = fco2_xgb_select_best_passo2$min_n,
  tree_depth = fco2_xgb_select_best_passo2$tree_depth, 
  loss_reduction =tune(), 
  learn_rate = fco2_xgb_select_best_passo1$learn_rate, 
  sample_size = 0.8
)  %>%  
  set_mode("regression")  %>%  
  set_engine("xgboost", nthread = cores, counts = FALSE)

#### Workflow
fco2_xgb_wf <- workflow()  %>%   
    add_model(fco2_xgb_model)  %>%   
    add_recipe(fco2_xgb_recipe)

#### Grid
fco2_xgb_grid <- expand.grid(
  loss_reduction = c(0.01, 0.05, 1, 2, 4, 8)
)

fco2_xgb_tune_grid <- fco2_xgb_wf   %>%   
  tune_grid(
    resamples = fco2_resamples,
    grid = fco2_xgb_grid,
    control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
    metrics = metric_set(rmse)
  )

#### Melhores hiperparâmetros
autoplot(fco2_xgb_tune_grid)

fco2_xgb_tune_grid   %>%   show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 7
#>   loss_reduction .metric .estimator  mean     n std_err .config             
#>            <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1           0.05 rmse    standard    1.16    10  0.0293 Preprocessor1_Model2
#> 2           0.01 rmse    standard    1.16    10  0.0304 Preprocessor1_Model1
#> 3           2    rmse    standard    1.17    10  0.0286 Preprocessor1_Model4
#> 4           1    rmse    standard    1.17    10  0.0286 Preprocessor1_Model3
#> 5           4    rmse    standard    1.17    10  0.0294 Preprocessor1_Model5
fco2_xgb_select_best_passo3 <- fco2_xgb_tune_grid %>% select_best(metric = "rmse")
fco2_xgb_select_best_passo3
#> # A tibble: 1 x 2
#>   loss_reduction .config             
#>            <dbl> <chr>               
#> 1           0.05 Preprocessor1_Model2

Passo 4:

Vamos então tunar o mtry e o sample_size:

fco2_xgb_model <- boost_tree(
  mtry = tune(),
  trees = fco2_xgb_select_best_passo1$trees,
  min_n = fco2_xgb_select_best_passo2$min_n,
  tree_depth = fco2_xgb_select_best_passo2$tree_depth, 
  loss_reduction = fco2_xgb_select_best_passo3$loss_reduction, 
  learn_rate = fco2_xgb_select_best_passo1$learn_rate, 
  sample_size = tune()
)%>%  
  set_mode("regression")  |> 
  set_engine("xgboost", nthread = cores, counts = FALSE)
#### Workflow
fco2_xgb_wf <- workflow()  %>%   
    add_model(fco2_xgb_model)  %>%   
    add_recipe(fco2_xgb_recipe)
#### Grid
fco2_xgb_grid <- expand.grid(
    sample_size = seq(0.5, 1.0, length.out = 2), ## <---
    mtry = seq(0.1, 1.0, length.out = 2) ## <---
)
fco2_xgb_tune_grid <- fco2_xgb_wf   %>%   
  tune_grid(
    resamples = fco2_resamples,
    grid = fco2_xgb_grid,
    control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
    metrics = metric_set(rmse)
  )
#### Melhores hiperparâmetros
autoplot(fco2_xgb_tune_grid)

fco2_xgb_tune_grid  |>  show_best(metric = "rmse", n = 5)
#> # A tibble: 4 x 8
#>    mtry sample_size .metric .estimator  mean     n std_err .config             
#>   <dbl>       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
#> 1   1           1   rmse    standard    1.16    10  0.0275 Preprocessor1_Model4
#> 2   1           0.5 rmse    standard    1.18    10  0.0297 Preprocessor1_Model3
#> 3   0.1         1   rmse    standard    1.19    10  0.0277 Preprocessor1_Model2
#> 4   0.1         0.5 rmse    standard    1.19    10  0.0279 Preprocessor1_Model1
fco2_xgb_select_best_passo4 <- fco2_xgb_tune_grid   %>%   select_best(metric = "rmse")
fco2_xgb_select_best_passo4
#> # A tibble: 1 x 3
#>    mtry sample_size .config             
#>   <dbl>       <dbl> <chr>               
#> 1     1           1 Preprocessor1_Model4

Passo 5

Agora vamos tunar o learn_rate e o trees de novo, mas deixando o learn_rate assumir valores menores.

fco2_xgb_model <- boost_tree(
  mtry = fco2_xgb_select_best_passo4$mtry,
  trees = tune(),
  min_n = fco2_xgb_select_best_passo2$min_n,
  tree_depth = fco2_xgb_select_best_passo2$tree_depth, 
  loss_reduction = fco2_xgb_select_best_passo3$loss_reduction, 
  learn_rate = tune(), 
  sample_size = fco2_xgb_select_best_passo4$sample_size
) |> 
  set_mode("regression")  %>%  
  set_engine("xgboost", nthread = cores, counts = FALSE)

#### Workflow
fco2_xgb_wf <- workflow() %>%   
    add_model(fco2_xgb_model)  %>%   
    add_recipe(fco2_xgb_recipe)

#### Grid
fco2_xgb_grid <- expand.grid(
    learn_rate = c(0.05, 0.10, 0.15, 0.25),
    trees = c(100, 250, 500)
)

fco2_xgb_tune_grid <- fco2_xgb_wf   %>%   
  tune_grid(
    resamples = fco2_resamples,
    grid = fco2_xgb_grid,
    control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
    metrics = metric_set(rmse)
  )

#### Melhores hiperparâmetros
autoplot(fco2_xgb_tune_grid)

fco2_xgb_tune_grid  %>%   show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 8
#>   trees learn_rate .metric .estimator  mean     n std_err .config              
#>   <dbl>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
#> 1   500       0.25 rmse    standard    1.13    10  0.0307 Preprocessor1_Model12
#> 2   500       0.15 rmse    standard    1.14    10  0.0292 Preprocessor1_Model09
#> 3   250       0.25 rmse    standard    1.14    10  0.0268 Preprocessor1_Model11
#> 4   250       0.15 rmse    standard    1.15    10  0.0271 Preprocessor1_Model08
#> 5   500       0.1  rmse    standard    1.15    10  0.0288 Preprocessor1_Model06
fco2_xgb_select_best_passo5 <- fco2_xgb_tune_grid   %>%   select_best(metric = "rmse")
fco2_xgb_select_best_passo5
#> # A tibble: 1 x 3
#>   trees learn_rate .config              
#>   <dbl>      <dbl> <chr>                
#> 1   500       0.25 Preprocessor1_Model12

Desempenho dos modelos finais

fco2_xgb_model <- boost_tree(
  mtry = fco2_xgb_select_best_passo4$mtry,
  trees = fco2_xgb_select_best_passo5$trees,
  min_n = fco2_xgb_select_best_passo2$min_n,
  tree_depth = fco2_xgb_select_best_passo2$tree_depth, 
  loss_reduction = fco2_xgb_select_best_passo3$loss_reduction, 
  learn_rate = fco2_xgb_select_best_passo5$learn_rate, 
  sample_size = fco2_xgb_select_best_passo4$sample_size
) %>%  
  set_mode("regression")  %>%  
  set_engine("xgboost", nthread = cores, counts = FALSE)

Desempenho dos modelos finais

# fco2_xgb_best_params <- select_best(fco2_xgb_tune_grid, "rmse")
df <- data.frame(
  mtry = fco2_xgb_select_best_passo4$mtry,
  trees = fco2_xgb_select_best_passo5$trees,
  min_n = fco2_xgb_select_best_passo2$min_n,
  tree_depth = fco2_xgb_select_best_passo2$tree_depth, 
  loss_reduction = fco2_xgb_select_best_passo3$loss_reduction, 
  learn_rate = fco2_xgb_select_best_passo5$learn_rate, 
  sample_size = fco2_xgb_select_best_passo4$sample_size
)
fco2_xgb_wf <- fco2_xgb_wf %>% finalize_workflow(df) # <------
fco2_xgb_last_fit <- last_fit(fco2_xgb_wf, fco2_initial_split) # <--------

Criar os preditos

fco2_test_preds <- bind_rows(
  collect_predictions(fco2_xgb_last_fit)  %>%   mutate(modelo = "xgb")
)
fco2_test_preds %>% 
  ggplot(aes(x=.pred, y=FCO2)) +
  geom_point()+
  theme_bw() +
  geom_smooth(method = "lm") +
  stat_regline_equation(ggplot2::aes(
  label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~"))) 

Variáveis importantes

fco2_xgb_last_fit_model <-fco2_xgb_last_fit$.workflow[[1]]$fit$fit
vip(fco2_xgb_last_fit_model)

Métricas

da <- fco2_test_preds %>% 
  filter(FCO2 > 0, .pred> 0 )

my_rmse <- Metrics::rmse(da$FCO2,
                         da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100

fco2_test_preds %>% 
  ggplot(aes(x=FCO2,y=.pred))+
  geom_point()+
  geom_smooth(method = "lm")+
  stat_regline_equation(ggplot2::aes(
    label =  paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
  ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
                                                      ,round(my_mape,2),'%'),size=5)+
  theme_bw()

About

Aquisição e processamento dos dados de XCO2 (JPL Virtual Science Data Environment)

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Contributors 4

  •  
  •  
  •  
  •  

Languages