-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.R
316 lines (255 loc) · 15.9 KB
/
script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#===============================================================================================================
# title: Milch price forecasting model
# author: Damian Oswald
# date: 2023-11-08
#===============================================================================================================
#===============================================================================================================
# Setting up the work space
#===============================================================================================================
# user inputs
name_of_the_data <- "milk-price-data.xlsx"
# define target variable
target <- "CH_Milchpreis"
# how many months of the data set should be used for the test set?
test_set_size <- 18
# define feature names (all of these features are used to predict `target` using machine learning)
features <- c("TotMilch_Menge_inkl_LI", "CH_Milchpreis", "Molkerei_Milchpreis",
"Verkaest_Milchpreis", "Gewerblich_Milchpreis", "Bio_Milchpreis",
"EU_Milchpreis_CHF", "Kurs_CHF_Euro", "Total_Staatlich_Zahlung",
"staatlich_kg_Milch", "Kontingent_01", "Freihandel_01", "EU_Politik_01",
"Industriell_verkäst_Milchpreis", "Import_Menge_Käse_und_Quark_kg",
"Schlachtungen_Kühe", "Magermilchpulver_Prod_Menge",
"Vollmilchpulver_Prod_Menge", "Butter_Prod_Menge",
"Export_Menge_Käse_und_Quark_kg", "Milchpreis_NZ_Euro")
# conditional installation of packages (if allready installed, just add package to search path)
for (package in c("forecast", "ggplot2", "tidyr", "GGally", "tibble", "magrittr", "readxl", "visdat", "seastests", "ggcorrplot", "openxlsx", "glmnet", "lubridate")) {
if(!do.call(require,list(package))) install.packages(package)
do.call(library,list(package))
}
# print out an introductory message in the console
cat("\f\nFederal Office of Agriculture (2023)\n___ ____ _ _ ______ _ ______ _ _ \n| \\/ (_) | | | ___ \\ (_) | ___| | | (_) \n| . . |_| | | _ | |_/ / __ _ ___ ___ | |_ ___ _ __ ___ ___ __ _ ___| |_ _ _ __ __ _ \n| |\\/| | | | |/ / | __/ '__| |/ __/ _ \\ | _/ _ \\| '__/ _ \\/ __/ _` / __| __| | '_ \\ / _` |\n| | | | | | < | | | | | | (_| __/ | || (_) | | | __/ (_| (_| \\__ \\ |_| | | | | (_| |\n\\_| |_/_|_|_|\\_\\ |_| |_| |_|\\___\\___ |_| \\___/|_| \\___|\\___\\__,_|___/\\__|_|_| |_|\\__, |\n __/ |\n |___/\n")
# read the data from the excel file directly
data <- name_of_the_data |>
readxl::read_excel() |>
as.data.frame() |>
subset(select = features)
# export (or override) data as csv
write.csv(data, file = "milk-price-data.csv", row.names = FALSE)
# convert data frame to time series object
data_ts <- ts(data, start = 2000, frequency = 12)
# change the variable `TotMilch_Menge_inkl_LI` from monthly total to daily production
# (this yields an annual cycle more representative for the actual milk production independent of monthly length)
paste(floor(time(data_ts)),cycle(data_ts),"1",sep="-") |>
as.Date() |>
lubridate::days_in_month() -> days_in_month
data[,1] <- data[,1]/days_in_month
data_ts[,1] <- data_ts[,1]/days_in_month
#===============================================================================================================
# Exploratory data analysis
#===============================================================================================================
# function to convert a decomposed time series object to a data frame
ts2df <- function(x) {
d <- decompose(x)
data.frame(year = floor(time(x)), month = as.integer(cycle(x)),
observation = d$x,
trend = d$trend,
seasonal = seasonal(d),
remainder = d$random)
}
# conditional creation of the output folder
if(!file.exists(file.path(getwd(),"output"))) dir.create(file.path(getwd(),"output"))
# create empty pdf
pdf(file.path(getwd(),"output","time-series-decomposition.pdf"), width = 16, height = 9)
# title slide
plot.new()
text(0, 1, paste("This PDF contains results from an automated time-series analysis report."), pos = 4, xpd = NA)
text(0, 0.95, "Find more detailed information on https://github.com/blw-ofag-ufag/milk-price-forecasting", pos = 4, xpd = NA)
text(0, 0.9, Sys.time(), pos = 4, xpd = NA)
text(0, 0.85, version$version.string, pos = 4, xpd = NA)
# missing values
data_ts |> as.data.frame() |> visdat::vis_miss() |> print()
# correlation plot
data_ts |> cor(use = "complete.obs") |> ggcorrplot::ggcorrplot(method = "circle", type = "lower", outline.color = par()$fg, lab_size = 2, lab = TRUE) |> print()
openxlsx::write.xlsx(x = as.data.frame(cor(data_ts, use = "complete.obs")), file = file.path("output","correlations.xlsx"))
# seasonal decomposition of the time series data
for (i in 1:length(features)) {
cat("\rDecomposing time series [", i, "/", length(features),"]", sep = "")
if(!seastests:::isSeasonal(data_ts[,features[i]])) {
data_ts[,features[i]] |> forecast:::autoplot.ts(ylab = "") + ggtitle(paste(features[i], ": Time series plot"), subtitle = paste0("Ollech and Webel's combined seasonality test didn't detect seasonality in this data (p-value = ",paste(round(max(seastests::combined_test(data_ts[,features[i]])$Pval),3),collapse = ", "),")")) -> p
print(p)
} else {
data_ts[,features[i]] |> decompose() -> decomposition
decomposition |> forecast:::autoplot.decomposed.ts() + ggtitle(paste(features[i],": Decomposition of additive time series"), "Time series decomposition into seasonal, trend and irregular components using moving averages.") -> p
print(p + theme_minimal())
}
}
# close pdf
dev.off()
# write all the results into one excel sheet
list <- lapply(features, function(i) data_ts[,i] |> ts2df() |> signif(digits = 5))
names(list) <- features
write.xlsx(list, file = file.path("output","decomposition.xlsx"))
#===============================================================================================================
# Prepare data for forecasting
#===============================================================================================================
cat("\nPreparing data")
# Add dummy variables for each month (to model monthly effects)
Months <- model.matrix(~ as.factor(cycle(data_ts)) - 1)
colnames(Months) <- month.name
# sine and cosine transformer
sinCosTransform <- function(x, frequency = 1) {
cbind(sin = sin(x * 2 * pi/frequency), cos = cos(x * 2 * pi/frequency))
}
# function to add lag to a numeric vector x (can generate multiple lags at the same time)
lag <- function(x, k = 1, col.names = NA) {
Y <- sapply(k, function(k) {
n <- length(x)
c(x[(1+k):n], rep(NA, k))
})
if(!is.na(col.names)) colnames(Y) <- col.names
else colnames(Y) <- paste0("y", k)
return(Y)
}
# add transformations and labels (with a lag)
df <- cbind(lag(data_ts[,target], k = 1:3),
time = as.numeric(time(data_ts)),
sinCosTransform(as.numeric(time(data_ts))),
Months,
as.matrix(as.data.frame(data_ts)))
# replace all NA value by zero
df[,features][is.na(df[,features])] <- 0
# scale the data (IMPORTANT: apply the same normalization to y1, y2, and y3; otherwise re-scale is wrong!)
df <- cbind((df[,1:3] - mean(data[,target]))/sd(data[,target]), time = df[,4], scale(df[,-c(1:4)]))
# function to re-scale scaled values to the original size
rescale <- function(z, mu = mean(data[,target]), sigma = sd(data[,target])) sigma*z + mu
# test-train-split (keep the last year as a test dataset)
df_train <- head(df, -test_set_size)
df_test <- tail(df, test_set_size)
# In the train dataset, divide folds for cross validation as six-month periods
foldid <- (df_train[,"time"] %/% 0.5 - 3999)
# Save all features allowed to predict the model in one vector
features <- c("sin","cos",month.name,features)
label <- "y1"
#===============================================================================================================
# Forecasting the milk price with lasso and ridge regression
#===============================================================================================================
# we want to forecast 1, 2 and 3 months into the future
for (i in 3:1) {
# print message
cat("\rFitting regularized least squares [", (4-i), "/3]", sep = "")
# traine one L1 and one L2 regularized least squares model (lasso and ridge)
L1 <- glmnet::cv.glmnet(x = df_train[,features], y = df_train[,paste0("y",i)], lambda = exp(seq(-12,0,0.25)), alpha = 1, foldid = foldid)
L2 <- glmnet::cv.glmnet(x = df_train[,features], y = df_train[,paste0("y",i)], lambda = exp(seq(-12,0,0.25)), alpha = 0, foldid = foldid)
# visualize the coefficients as a coefficients plot
coefficients <- data.frame(Variable = rep(rownames(coefficients(L1)), 2), Coefficient = c(as.numeric(coef(L1)),as.numeric(coef(L2))), Norm = rep(c("L1", "L2"), each = length(coef(L2))))
coefficients$Variable <- factor(coefficients$Variable, levels = rownames(coefficients(L2))[order(abs(rowSums(cbind(coef(L1), coef(L2)))))])
highlight <- coefficients[coefficients$Variable %in% rownames(coef(L1))[which(abs(coef(L1))>0)],]
assign(paste0("plot_coefficients_",paste0("y",i)),
ggplot2::ggplot(coefficients, aes(Coefficient, Variable)) +
geom_line(aes(group = Variable), alpha = 0.3) + geom_point(aes(color = Norm), size = 2, alpha = 0.5) +
geom_line(data = highlight, aes(group = Variable)) + geom_point(data = highlight, aes(color = Norm), size = 3)
)
# save predictions
assign(paste0("predictions_",paste0("y",i)), {
data.frame(
Lasso = as.numeric(predict(L1, newx = as.matrix(df_test[,features]))),
Ridge = as.numeric(predict(L2, newx = as.matrix(df_test[,features])))
)
})
}
#===============================================================================================================
# Forecasting the milk price with (seasonal) ARIMA
#===============================================================================================================
# find ARIMA(1,1,1) model to fit the data
predictions_arima <- matrix(NA, test_set_size, 3)
cat("\n")
for (t in 1:test_set_size) {
cat("\rFitting ARIMA [", t, "/", test_set_size, "]", sep = "")
df[1:(nrow(df)-t),target] |>
arima(order = c(1,1,1)) |>
predict(n.ahead = 3, se.fit = FALSE) -> predictions_arima[test_set_size+1-t,]
}
# find ARIMA(1,1,1)(1,1,1)[12] model to fit the
predictions_sarima <- matrix(NA, test_set_size, 3)
cat("\n")
for (t in 1:test_set_size) {
cat("\rFitting SARIMA [", t, "/", test_set_size, "]", sep = "")
df[1:(nrow(df)-t),target] |>
arima(order = c(1,1,1), seasonal = list(order = c(1,1,1), period = 12)) |>
predict(n.ahead = 3, se.fit = FALSE) -> predictions_sarima[test_set_size+1-t,]
}
# Combine with RLS predictions
predictions_y1 <- cbind(predictions_y1, ARIMA = predictions_arima[,1], SARIMA = predictions_sarima[,1])
predictions_y2 <- cbind(predictions_y2, ARIMA = predictions_arima[,2], SARIMA = predictions_sarima[,2])
predictions_y3 <- cbind(predictions_y3, ARIMA = predictions_arima[,3], SARIMA = predictions_sarima[,3])
# Combine all forecastings (with unseen observations to compare)
t <- max(df[,"time"]) + c(1/12, 2/12, 3/12)
forecastings <- rescale(rbind(t1 = tail(predictions_y1, 1), t2 = tail(predictions_y2, 1), t3 = tail(predictions_y3, 1)))
write.xlsx(data.frame(Year = t%/%1, Month = t%%1*12+1, forecastings), file = file.path("output","predictions.xlsx"))
#===============================================================================================================
# Testing all models
#===============================================================================================================
# calculate root mean squared error (RMSE)
rmse <- function(predictions, observations) {
Errors <- (rescale(predictions) - rescale(observations))
sqrt(colMeans(Errors^2, na.rm = TRUE))
}
RMSE <- sapply(1:3, function(i) rmse(get(paste0("predictions_y",i)), df_test[,paste0("y",i)])) |> t()
RMSE <- data.frame(horizon = rep(as.character(1:3), times = 4), gather(as.data.frame(RMSE)))
RMSE[,"value"] <- RMSE[,"value"] * 0.01 # change from cents to CHF
# make plot of all model performances
overall_comparison <- ggplot(data = RMSE, aes(x = key, y = value, fill = horizon)) +
ylab("Root mean squared error [CHF]") +
geom_bar(stat = "identity", position=position_dodge()) +
geom_text(aes(label = round(value, 3)), color = "black", position = position_dodge(0.9), vjust = -1) +
scale_fill_brewer(palette = "Reds") +
ggtitle("Model performance comparison",paste("This chart shows the root mean squared error (RMSE), which represents the average error, of different models.\nThe models were all tested on the milk price data of the past",test_set_size,"months, but trained without that data."))
#===============================================================================================================
# Export all plots to one PDF
#===============================================================================================================
# print out message
cat("\nGenerate report [1/1]")
# Export all plots as one PDF
pdf(file.path("output","machine-learning-report.pdf"), width = 16, height = 9)
# title slide
plot.new()
text(0, 1, paste("This PDF contains results from an automated machine learning model to forecast `", target, "`.", sep = ""), pos = 4, xpd = NA)
text(0, 0.95, "Find more detailed information on https://github.com/blw-ofag-ufag/milk-price-forecasting", pos = 4, xpd = NA)
text(0, 0.9, Sys.time(), pos = 4, xpd = NA)
text(0, 0.85, version$version.string, pos = 4, xpd = NA)
# Forecast of unseen values
plot(x = df_test[,"time"], y = rescale(df_test[,target]), xlim = range(df_test[,"time"]) + c(0, 1/2), axes = FALSE, pch = 16, type = "o", las = 1, lwd = 3, ylab = "Swiss milk price [CHF/100]", xlab = "Time")
grid(lty = 1)
points(x = df_test[,"time"], y = rescale(df_test[,target]), pch = 16)
lines(x = df_test[,"time"], y = rescale(df_test[,target]), lwd = 2)
last <- max(df_test[,"time"])
colors = c(ARIMA = "#00BFC4", SARIMA = "#00BFC4", Lasso = "#F8766D", Ridge = "#F8766D")
for (model in colnames(forecastings)) {
lines(x = max(df_test[,"time"]) + c(1/12, 2/12, 3/12), y = forecastings[,model], lwd = 3, col = colors[model])
text(x = max(df_test[,"time"]) + 3/12, y = forecastings[3,model], labels = model, col = colors[model], pos = 4)
}
axis(1, lwd = NA, cex.axis = 0.8, at = 2000:2050, labels = 2000:2050)
axis(2, lwd = NA, cex.axis = 0.8, las = 1)
# Model comparison
print(overall_comparison + theme_minimal())
# Pairs panel
for(i in 1:3) {
cbind(Observation = df_test[,paste0("y",i)], get(paste0("predictions_",paste0("y",i)))) |>
na.omit() |>
rescale() |>
ggpairs(lower = list(continuous = "smooth"), progress = FALSE) +
theme_minimal() +
ggtitle(paste0("Comparing the forecast values (t + ",i,")"), subtitle = paste("These plots compare the forecast milk prices with the observed ones of the past", test_set_size, "months.")) -> plot
print(plot)
}
# Coefficients of lasso and ridge regression
for (i in 1:3) print(get(ls()[grep("plot_coefficients",ls())][i]) + ggtitle(paste0("Visualization of the regularized least squares coefficients (t + ", i,")"), subtitle = "This coefficients are retrieved from the best-fit models trained with the L1-norm (lasso regression) and L2-norm (ridge regression). They indicate relative variable importance.") + theme_minimal())
# Details of lasso and ridge
par(mfrow = c(2,2), mar = c(4,4,5,1)+0.1)
plot(L1, col = "black"); mtext("Lasso regression (L1)", line = 3, font = 2); abline(v = L1$lambda.min)
plot(L2); mtext("Ridge regression (L2)", line = 3, font = 2)
(L1$glmnet.fit) |> plot(col = "red", xvar = "lambda"); mtext("Lasso regression (L1)", line = 3, font = 2); abline(v = L1$lambda.min)
(L2$glmnet.fit) |> plot(col = "red", xvar = "lambda"); mtext("Ridge regression (L2)", line = 3, font = 2)
dev.off()
cat("\n\nDone!")
rm(list = ls())