Este script tem como objetivo realizar a etapa de faxina de dados, ou seja, aplicar transformações, padronizações e correções necessárias para garantir a qualidade dos dados utilizados nas análises. Aqui são tratados problemas como valores ausentes, formatação inconsistente, e possÃveis outliers.
# install.packages("devtools")
# devtools::install_github("arpanosso/fco2r",force = TRUE)
# Sys.getenv("GITHUB_PAT")
# Sys.unsetenv("GITHUB_PAT")
# Sys.getenv("GITHUB_PAT")
library(fco2r)
library(tidyverse)
library(geobr)
library(ggpubr)
library(sf)
source("../R/functions.R")
theme_set(theme_bw())
data_fco2 <- data_fco2 |>
janitor::clean_names() |>
mutate(
municipio = case_when(
municipio == "Selv?ra" ~ "SelvÃria",
municipio == "Selv?ria" ~ "SelvÃria",
municipio == "Prad?polis" ~ "Pradópolis",
municipio == "Aparecida do Tabuado" ~ "Aparecida Do Taboado",
TRUE ~ municipio
))
glimpse(data_fco2)
## Rows: 15,397
## Columns: 39
## $ experimento <chr> "Espacial", "Espacial", "Espacial", "Espacial", "Esp…
## $ data <date> 2001-07-10, 2001-07-10, 2001-07-10, 2001-07-10, 200…
## $ manejo <chr> "convencional", "convencional", "convencional", "con…
## $ tratamento <chr> "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN"…
## $ revolvimento_solo <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ data_preparo <date> 2001-07-01, 2001-07-01, 2001-07-01, 2001-07-01, 200…
## $ conversao <date> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 197…
## $ cobertura <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
## $ cultura <chr> "milho_soja", "milho_soja", "milho_soja", "milho_soj…
## $ x <dbl> 0, 40, 80, 10, 25, 40, 55, 70, 20, 40, 60, 10, 70, 3…
## $ y <dbl> 0, 0, 0, 10, 10, 10, 10, 10, 20, 20, 20, 25, 25, 30,…
## $ longitude_muni <dbl> 782062.7, 782062.7, 782062.7, 782062.7, 782062.7, 78…
## $ latitude_muni <dbl> 7647674, 7647674, 7647674, 7647674, 7647674, 7647674…
## $ estado <chr> "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP"…
## $ municipio <chr> "Jaboticabal", "Jaboticabal", "Jaboticabal", "Jaboti…
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ prof <chr> "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1"…
## $ fco2 <dbl> 1.080, 0.825, 1.950, 0.534, 0.893, 0.840, 1.110, 1.8…
## $ ts <dbl> 18.73, 18.40, 19.20, 18.28, 18.35, 18.47, 19.10, 18.…
## $ us <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ p_h <dbl> 5.1, 5.1, 5.8, 5.3, 5.5, 5.7, 5.6, 6.4, 5.3, 5.8, 5.…
## $ mo <dbl> 20, 24, 25, 23, 23, 21, 26, 23, 25, 24, 26, 20, 25, …
## $ p <dbl> 46, 26, 46, 78, 60, 46, 55, 92, 55, 60, 48, 71, 125,…
## $ k <dbl> 2.4, 2.2, 5.3, 3.6, 3.4, 2.9, 4.0, 2.3, 3.3, 3.6, 4.…
## $ ca <dbl> 25, 30, 41, 27, 33, 38, 35, 94, 29, 36, 37, 29, 50, …
## $ mg <dbl> 11, 11, 25, 11, 15, 20, 16, 65, 11, 17, 15, 11, 30, …
## $ h_al <dbl> 31, 31, 22, 28, 27, 22, 22, 12, 31, 28, 28, 31, 18, …
## $ sb <dbl> 38.4, 43.2, 71.3, 41.6, 50.6, 60.9, 55.0, 161.3, 43.…
## $ ctc <dbl> 69.4, 74.2, 93.3, 69.6, 77.9, 82.9, 77.0, 173.3, 74.…
## $ v <dbl> 55, 58, 76, 60, 65, 73, 71, 93, 58, 67, 67, 58, 82, …
## $ ds <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ macro <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ micro <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ vtp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ pla <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ at <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ silte <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ arg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ hlifs <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Cálculo da SIF
oco2_br <- oco2_br %>%
janitor::clean_names() |>
mutate(
xco2 = xco2_moles_mole_1*1e06,
data = ymd_hms(time_yyyymmddhhmmss),
year = year(data),
month = month(data),
day = day(data),
w_day = wday(data),
sif = (fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1*2.6250912*10^(-19) + 1.5*fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1* 2.57743*10^(-19))/2,
sif = ifelse(sif <= 0 | sif > 10, median(sif), sif)
)
oco2_br |>
ggplot(aes(x = sif,y=..density..)) +
geom_histogram(bins = 80,color="black",fill="gray") +
coord_cartesian(xlim = c(0,4))
brasil_geobr <- read_country(showProgress = FALSE)
brasil_geobr |>
ggplot() +
geom_sf(fill="white", color="black",
size=.15, show.legend = FALSE) +
geom_point(data=oco2_br %>%
sample_n(10000) ,
aes(x=longitude,y=latitude),
shape=1,
col="red",
alpha=01)+
labs(x="Longitude",y="Latitude")
Existe uma tendência de aumento monotônica mundial da concentração de CO2 na atmosfera, assim, ela deve ser retirada para podermos observar as tendências regionais. Observe que o sinal na variável XCO2 não apresenta a tendência descrita.
oco2_br |>
ggplot(aes(x=data,y=xco2)) +
geom_point(shape=21,color="black",fill="gray") +
geom_smooth(method = "lm") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
Análise de regressão linear simples para caracterização da
tendência.
mod_trend_xco2 <- lm(xco2 ~ data,
data = oco2_br |>
mutate( data = data - min(data))
)
summary.lm(mod_trend_xco2)
##
## Call:
## lm(formula = xco2 ~ data, data = mutate(oco2_br, data = data -
## min(data)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.534 -1.486 0.411 1.927 44.108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.930e+02 3.266e-02 12034.0 <2e-16 ***
## data 8.193e-08 3.299e-10 248.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.126 on 37385 degrees of freedom
## Multiple R-squared: 0.6226, Adjusted R-squared: 0.6226
## F-statistic: 6.168e+04 on 1 and 37385 DF, p-value: < 2.2e-16
a_co2 <- mod_trend_xco2$coefficients[[1]]
b_co2 <- mod_trend_xco2$coefficients[[2]]
oco2_br <- oco2_br |>
mutate(
data_modif = data -min(data),
xco2_est = a_co2+b_co2*data_modif,
delta = xco2_est-xco2,
xco2_detrend = as.numeric((a_co2-delta) - (mean(xco2) - a_co2))
)
Plot dos dados sem a tendência, variável
xco2_detrend
.
oco2_br |>
ggplot(aes(x=data,y=xco2_detrend)) +
geom_point(shape=21,color="black",fill="gray") +
geom_smooth(method = "lm") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
Para unir os bancos de dados, inicialmente será necessário
transformar as coordenadas do banco data_fco2
. Ideal é
buscar o ponto central do municÃpio nos dados do ibge, por meio, no
pacote {geobr}
.
# Objeto municipality previamente criado ao chamar o script source("..//R//functions.R")
my_municipaly <- data_fco2$ municipio |> unique()
municipaly_coord <- data.frame(municipio="",
long=1:length(my_municipaly),
lat=0)
for(i in 1:length(my_municipaly)){
muni_aux <- municipality |>
filter(name_muni == my_municipaly[i])
coord_aux <- muni_aux |> pluck(5) |> pluck(1) |> as.matrix()
municipaly_coord[i,1] <- muni_aux$name_muni
municipaly_coord[i,2] <- coord_aux[,1] |> mean()
municipaly_coord[i,3] <- coord_aux[,2] |> mean()
}
data_fco2 <- data_fco2 |> left_join(
municipaly_coord, by="municipio"
)
Agora precisamos selecionar no banco de dados oco2_br
quais pontos estão mais próximos daquele que observamos em campo. Para
isso vamos pegar a média dos \(3\)
pontos mais próximos de XCO2 e SIF.
# Criando colunas no banco de dados
data_fco2 <- data_fco2 |>
mutate(year = year(data),
month = month(data),
xco2_1 = NA,
xco2_detrend_1 = NA,
sif_1 = NA,
xco2_5 = NA,
xco2_detrend_5 = NA,
sif_5 = NA,
dist = NA
)
# Busca pelo ponto mais próximo
for(i in 1:nrow(data_fco2)){
x<- data_fco2[i,"long"]
y<- data_fco2[i,"lat"]
ano <- data_fco2[i,"year"]
mes <- data_fco2[i,"month"]
df_aux <- oco2_br |>
filter(year == ano, month == mes) |>
mutate(
dist = sqrt((longitude-(x))^2+(latitude-(y))^2)
) |>
arrange(dist) |>
slice(1:5)
if(nrow(df_aux)!=0){
data_fco2[i,"xco2_1"] <- df_aux$xco2[1]
data_fco2[i,"xco2_detrend_1"] <- df_aux$xco2_detrend[1]
data_fco2[i,"sif_1"] <- df_aux$sif[1]
data_fco2[i,"xco2_5"] <- df_aux$xco2 |> mean()
data_fco2[i,"xco2_detrend_5"] <- df_aux$xco2_detrend |> mean()
data_fco2[i,"sif_5"] <- df_aux$sif |> mean()
data_fco2[i,"dist"] <- df_aux$dist[1]
}
#print(paste(i,"/",nrow(data_fco2)))
}
Valor único do mais próximos.
tab_medias <- data_fco2 |>
# mutate(SIF = ifelse(SIF <=0, mean(data_set$SIF, na.rm=TRUE),SIF)) %>%
group_by(year, month, municipio) |>
summarise(fco2 = mean(fco2, na.rm=TRUE),
xco2_tend = mean(xco2_1,na.rm=TRUE),
xco2 = mean(xco2_detrend_1, na.rm=TRUE),
sif = mean(sif_1, na.rm=TRUE))
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=xco2)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=xco2_tend)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=sif)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=sif, x=xco2)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
Médias dos 5 mais próximos.
tab_medias <- data_fco2 |>
# mutate(SIF = ifelse(SIF <=0, mean(data_set$SIF, na.rm=TRUE),SIF)) %>%
group_by(year, month, municipio) |>
summarise(fco2 = mean(fco2, na.rm=TRUE),
xco2_tend = mean(xco2_5,na.rm=TRUE),
xco2 = mean(xco2_detrend_5, na.rm=TRUE),
sif = mean(sif_5, na.rm=TRUE))
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=xco2)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=xco2_tend)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=fco2, x=sif)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
tab_medias |>
drop_na() |>
filter(sif > 0) |>
ggplot(aes(y=sif, x=xco2)) +
geom_point(size=3, shape=21, fill="gray") +
geom_smooth(method = "lm", se=FALSE,
ldw=2,color="red") +
stat_regline_equation(aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5, label.x.npc = .4)
dados_estacao_isa <- readxl::read_excel("../data-raw/estacao_meteorologia_ilha_solteira.xlsx", na = "NA") |>
janitor::clean_names() |>
drop_na()
glimpse(dados_estacao_isa)
## Rows: 1,763
## Columns: 16
## $ data <dttm> 2015-01-01, 2015-01-02, 2015-01-03, 2015-01-04, 2015-01-05, 2…
## $ tmed <dbl> 30.5, 30.0, 26.8, 27.1, 27.0, 27.6, 30.2, 28.2, 28.5, 29.9, 30…
## $ tmax <dbl> 36.5, 36.7, 35.7, 34.3, 33.2, 36.4, 37.2, 32.4, 37.1, 38.1, 38…
## $ tmin <dbl> 24.6, 24.5, 22.9, 22.7, 22.3, 22.8, 22.7, 24.0, 23.0, 23.3, 24…
## $ umed <dbl> 66.6, 70.4, 82.7, 76.8, 81.6, 75.5, 65.8, 70.0, 72.9, 67.6, 66…
## $ umax <dbl> 89.6, 93.6, 99.7, 95.0, 98.3, 96.1, 99.2, 83.4, 90.7, 97.4, 90…
## $ umin <dbl> 42.0, 44.2, 52.9, 43.8, 57.1, 47.5, 34.1, 57.4, 42.7, 38.3, 37…
## $ pk_pa <dbl> 97.2, 97.3, 97.4, 97.5, 97.4, 97.5, 97.4, 97.4, 97.4, 97.4, 97…
## $ rad <dbl> 23.6, 24.6, 20.2, 21.4, 17.8, 19.2, 27.0, 15.2, 21.6, 24.3, 24…
## $ par <dbl> 496.6, 513.3, 430.5, 454.0, 378.2, 405.4, 565.7, 317.2, 467.5,…
## $ eto <dbl> 5.7, 5.8, 4.9, 5.1, 4.1, 4.8, 6.2, 4.1, 5.5, 5.7, 5.9, 6.1, 6.…
## $ velmax <dbl> 6.1, 4.8, 12.1, 6.2, 5.1, 4.5, 4.6, 5.7, 5.8, 5.2, 5.2, 4.7, 6…
## $ velmin <dbl> 1.0, 1.0, 1.2, 1.0, 0.8, 0.9, 0.9, 1.5, 1.2, 0.8, 0.8, 1.2, 1.…
## $ dir_vel <dbl> 17.4, 261.9, 222.0, 25.0, 56.9, 74.9, 53.4, 89.0, 144.8, 303.9…
## $ chuva <dbl> 0.0, 0.0, 3.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.…
## $ inso <dbl> 7.9, 8.7, 5.2, 6.2, 3.4, 4.5, 10.5, 1.3, 6.3, 8.4, 8.6, 7.9, 1…
dados_estacao_jbk <- readxl::read_excel("../data-raw/estacao_meteorologia_jaboticabal.xlsx") |>
janitor::clean_names()
glimpse(dados_estacao_jbk)
## Rows: 338
## Columns: 15
## $ data <dttm> 2001-07-01, 2001-07-02, 2001-07-03, 2001-07-04, 2001-07…
## $ tmed <dbl> 19.4, 19.9, 20.8, 20.0, 20.2, 19.5, 18.7, 19.3, 20.4, 19…
## $ umed <dbl> 64.5, 61.9, 60.2, 59.9, 54.2, 58.4, 64.9, 64.8, 58.0, 58…
## $ pk_pa <dbl> 94.65, 94.73, 94.79, 94.77, 94.91, 95.02, 95.11, 95.08, …
## $ rad <dbl> 15.55, 15.44, 15.48, 15.88, 17.26, 15.79, 15.55, 14.33, …
## $ saldo_w_m <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ fluxo_w_m <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ par <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ eto <dbl> 4.44, 3.61, 4.45, 4.54, 5.09, 1.81, 7.30, 4.65, 4.43, 6.…
## $ et_pm_mm_dia <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ et_tca_mm_dia <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ vel <dbl> 2.00, 0.44, 1.35, 1.03, 1.58, 1.23, 0.93, 1.18, 1.10, 1.…
## $ dir_vel <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ chuva <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1…
## $ inso <dbl> 9.2, 9.4, 9.5, 9.6, 8.6, 9.7, 9.5, 8.4, 8.1, 10.1, 8.2, …
dados_estacao <- dados_estacao_isa |>
add_row(dados_estacao_jbk |>
select(-c(saldo_w_m,fluxo_w_m,et_pm_mm_dia,et_tca_mm_dia,
vel)) |>
mutate(dir_vel=as.double(dir_vel))
) |> arrange(data)
visdat::vis_miss(dados_estacao)
visdat::vis_miss(data_fco2)
data_fco2 <- left_join(data_fco2, dados_estacao, by = "data")
visdat::vis_miss(data_fco2[18:length(data_fco2)])
write_rds(data_fco2,"../data/data-set-fco2.rds")