Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Let
Aggregating models
Let \( z = \alpha\, e_1 + (1 - \alpha)\, e_2 \) be the prediction error of an ensemble combining the prediction errors \( e_1 \) and \( e_2 \) of two models, with allocation \( \alpha \in [0, 1] \) given to the first model.
And from the fact that \( \mathrm{Cov}(e_1, e_2) = \rho\, \sigma_1 \sigma_2 \), where \( \rho \) is the correlation between the two prediction errors and \( \sigma_1, \sigma_2 \) their standard deviations:
We get: \( \mathrm{Var}(z) = \alpha^2 \sigma_1^2 + (1 - \alpha)^2 \sigma_2^2 + 2\,\alpha (1 - \alpha)\, \rho\, \sigma_1 \sigma_2 \)
Now, let’s see how \( \mathrm{Var}(z) \) behaves as a function of the allocation \( \alpha \) and the correlation \( \rho \).
When \( \rho \) decreases, the cross term \( 2\,\alpha (1 - \alpha)\, \rho\, \sigma_1 \sigma_2 \) shrinks, and the variance of the ensemble prediction error can fall below the variance of either individual model’s error.
That’s the relative change in the variance of the ensemble prediction error, induced by introducing a new model into the ensemble.
For a simple illustrative example in R, I create a set of n = 100 simulated data observations:
# number of observations n <- 100 u <- 1:n # Simulated observed data intercept <- 5 slope <- 0.2 set.seed(2) ## data y <- intercept + slope*u + rnorm(n) plot(u, y, type = 'l', main = "Simulated data observations") points(u, y, pch = 19)
I fit a linear regression model to the data, as a benchmark model:
# Defining training and test sets train <- 1:(0.7*n) test <- -train u.train <- u[(train)] y.train <- y[(train)] u.test <- u[(test)] y.test <- y[(test)] # Fitting the benchmark model to the training set fit.lm <- lm(y.train ~ u.train) (slope.coeff <- fit.lm$coefficients[2]) (intercept.coeff <- fit.lm$coefficients[1]) ## u.train ## 0.1925 ## (Intercept) ## 5.292
Obtaining the predicted values from the benchmark model, and its prediction error:
# Predicted values from benchmark model on the test set y.pred <- intercept.coeff + slope.coeff*u.test # prediction error from linear regression pred.err <- y.pred - y.test (mean.pred.err <- mean(pred.err)) (var.pred.err <- var(pred.err)) ## [1] -0.1802 ## [1] 1.338
Now, I consider two other alternative models, model 1 and model 2, each with low bias, a higher error variance than the benchmark, and an oscillating component:
# Alternative model 1 (low bias, high variance, oscillating) m <- length(y.pred) eps1 <- rnorm(m, mean = 0, sd = 1.5) y.pred1 <- intercept.coeff + slope.coeff*u.test + sin(u.test) + eps1 # prediction error for model 1 pred.err1 <- y.pred1 - y.test
We can visualize the predictions of models 1 and 2, for different correlations between their prediction errors:
# Different prediction errors correlations for 1 and 2 rho.vec <- c(-1, -0.8, 0.6, 1) # Independent random gaussian numbers defining model 2 errors eps <- rnorm(m, mean = 0, sd = 2) # Plotting the predictions with different correlations par(mfrow=c(2, 2)) for (i in 1:4) { rho <- rho.vec[i] # Correlated gaussian numbers (Cholesky decomposition) eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps # Alternative model 2 (low bias, higher variance than 1, oscillating) y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2 # prediction error for model 2 pred.err2 <- y.pred2 - y.test # predictions from 1 & 2 correlation corr.pred12 <- round(cor(pred.err1, pred.err2), 2) # Plot plot(u.test, y.test, type = "p", xlab = "test set values", ylab = "predicted values", main = paste("models 1 & 2 pred. values n correlation :", corr.pred12)) points(u.test, y.test, pch = 19) lines(u.test, y.pred, lwd = 2) lines(u.test, y.pred1, col = "blue", lwd = 2) lines(u.test, y.pred2, col = "red", lwd = 2) }
Now, including allocations of models 1 and 2 in the ensemble, along with varying correlations between their prediction errors:
# Allocation for model 1 in the ensemble alpha.vec <- seq(from = 0, to = 1, by = 0.05) # Correlations between model 1 and model 2 rho.vec <- seq(from = -1, to = 1, by = 0.05) # Results matrices nb.row <- length(alpha.vec) nb.col <- length(rho.vec) ## Average prediction errors of the ensemble mean.pred.err.ens <- matrix(0, nrow = nb.row, ncol = nb.col) rownames(mean.pred.err.ens) <- paste0("pct. 1 : ", alpha.vec*100, "%") colnames(mean.pred.err.ens) <- paste0("corr(1, 2) : ", rho.vec) ## Variance of prediction error of the ensemble var.pred.err.ens <- matrix(0, nrow = nb.row, ncol = nb.col) rownames(var.pred.err.ens) <- paste0("pct. 1 : ", alpha.vec*100, "%") colnames(var.pred.err.ens) <- paste0("corr(1, 2) : ", rho.vec) # loop on correlations and allocations for (i in 1:nb.row) { for (j in 1:nb.col) { alpha <- alpha.vec[i] rho <- rho.vec[j] # Alternative model 2 (low bias, higher variance, oscillating) eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2 pred.err2 <- y.pred2 - y.test # Ensemble prediction error z <- alpha*pred.err1 + (1-alpha)*pred.err2 mean.pred.err.ens[i, j] <- mean(z) var.pred.err.ens[i, j] <- var(z) } } res.var <- var.pred.err.ens # Heat map for the variance of the ensemble filled.contour(alpha.vec, rho.vec, res.var, color = terrain.colors, main = "prediction error variance for the ensemble", xlab = "allocation in x1", ylab = "correlation between 1 and 2")
Hence, the lower the correlation between the prediction errors of models 1 and 2, the lower the variance of the ensemble prediction error.
Finding the final ensemble, with lower variance and lower bias :
# Final ensemble ## Allocation alpha.vec[which.min(as.vector(res.var))] ## [1] 0.45 ## Correlation res.bias <- abs(mean.pred.err.ens) which.min(res.bias[which.min(as.vector(res.var)), ]) ## corr(1, 2) : -0.7 ## 7
Creating the final model with these parameters :
rho <- -0.7 eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps # Alternative model 2 (low bias, higher variance, oscillating) y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2 # Final ensemble prediction y.pred.ens <- 0.45*y.pred1 + 0.55*y.pred2 # Plot plot(u.test, y.test, type = "p", xlab = "test set", ylab = "predicted values", main = "Final ensemble model (green)") points(u.test, y.test, pch = 19) # benchmark lines(u.test, y.pred, lwd = 2) # model 1 lines(u.test, y.pred1, col = "blue", lwd = 2) # model 2 lines(u.test, y.pred2, col = "red", lwd = 2) # ensemble model with 1 and 2 points(u.test, y.pred.ens, col = "green", pch = 19) lines(u.test, y.pred.ens, col = "green", lwd = 2)
Performance of the final model :
# Benchmark pred.err <- y.pred - y.test # Model 1 pred1.err <- y.pred1 - y.test # Model 2 pred2.err <- y.pred2 - y.test # Ensemble model pred.ens.err <- y.pred.ens - y.test # Bias comparison bias.ens <- mean(y.pred.ens - y.test) bias.ens_vsbench <- (bias.ens/mean(y.pred - y.test) - 1)*100 bias.ens_vs1 <- (bias.ens/mean(y.pred1 - y.test) - 1)*100 bias.ens_vs2 <- (bias.ens/mean(y.pred2 - y.test) - 1)*100 # Variance comparison var.ens <- var(y.pred.ens - y.test) var.ens_vsbench <- (var.ens/var(y.pred - y.test) - 1)*100 var.ens_vs1 <- (var.ens/var(y.pred1 - y.test) - 1)*100 var.ens_vs2 <- (var.ens/var(y.pred2 - y.test) - 1)*100 cbind(c(bias.ens_vsbench, bias.ens_vs1, bias.ens_vs2), c(var.ens_vsbench, var.ens_vs1, var.ens_vs2)) ## [,1] [,2] ## [1,] -95.31 46.27 ## [2,] -112.12 -62.93 ## [3,] -88.33 -45.53
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.