Let's see how aggregating the predictions of two models whose prediction errors are negatively correlated can reduce the variance of the ensemble's prediction error.
Aggregating models
Let $\epsilon_1$ and $\epsilon_2$ be the prediction errors of two models, with standard deviations $\sigma_1$ and $\sigma_2$, and let $\alpha \in [0, 1]$ be the allocation of model 1 in the ensemble. The ensemble prediction error is:

$$\epsilon = \alpha \epsilon_1 + (1 - \alpha) \epsilon_2$$

And from the fact that:

$$Cov(\epsilon_1, \epsilon_2) = \rho_{12} \sigma_1 \sigma_2$$

where $\rho_{12}$ is the correlation between the two prediction errors, we get:

$$Var(\epsilon) = \alpha^2 \sigma_1^2 + (1 - \alpha)^2 \sigma_2^2 + 2 \alpha (1 - \alpha) \rho_{12} \sigma_1 \sigma_2$$

Now, let's see how $Var(\epsilon)$ behaves as a function of $\alpha$ and $\rho_{12}$. When $\alpha = 1$, the ensemble contains only model 1 and $Var(\epsilon) = \sigma_1^2$. Consider:

$$\frac{Var(\epsilon) - \sigma_1^2}{\sigma_1^2}$$

That's the relative change in the variance of the ensemble prediction error, induced by introducing model 2 into the ensemble.
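As a quick sanity check of this formula (a minimal sketch, not part of the original derivation; the allocation and the two standard deviations below are arbitrary illustrative values), we can simulate two error series with a prescribed correlation and compare the empirical variance of the ensemble error to the closed-form expression:

# Numerical check of Var(epsilon) = alpha^2*sigma1^2 + (1-alpha)^2*sigma2^2
#                                   + 2*alpha*(1-alpha)*rho*sigma1*sigma2
set.seed(123)
n.sim <- 1e5
alpha <- 0.4; rho <- -0.5
sigma1 <- 1.5; sigma2 <- 2
e1 <- rnorm(n.sim, sd = sigma1)
# Build e2 with standard deviation sigma2 and correlation rho with e1
e2 <- sigma2*(rho*e1/sigma1 + sqrt(1 - rho^2)*rnorm(n.sim))
# Ensemble error and its empirical variance
eps <- alpha*e1 + (1 - alpha)*e2
var(eps)
# Closed-form value (approx. 1.08 with these illustrative parameters)
alpha^2*sigma1^2 + (1 - alpha)^2*sigma2^2 +
  2*alpha*(1 - alpha)*rho*sigma1*sigma2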
For a simple illustrative example in R, I create simulated data observations:
# number of observations
n <- 100
u <- 1:n
# Simulated observed data
intercept <- 5
slope <- 0.2
set.seed(2)
## data
y <- intercept + slope*u + rnorm(n)
plot(u, y, type = "l", main = "Simulated data observations")
points(u, y, pch = 19)
I fit a linear regression model to the data, as a benchmark model:
# Defining training and test sets
train <- 1:(0.7*n)
test <- -train
u.train <- u[train]
y.train <- y[train]
u.test <- u[test]
y.test <- y[test]
# Fitting the benchmark model to the training set
fit.lm <- lm(y.train ~ u.train)
(slope.coeff <- fit.lm$coefficients[2])
## u.train
##  0.1925
(intercept.coeff <- fit.lm$coefficients[1])
## (Intercept)
##       5.292
Obtaining the predicted values from the benchmark model, and its prediction error on the test set:
# Predicted values from the benchmark model on the test set
y.pred <- intercept.coeff + slope.coeff*u.test
# Prediction error from the linear regression
pred.err <- y.pred - y.test
(mean.pred.err <- mean(pred.err))
## [1] -0.1802
(var.pred.err <- var(pred.err))
## [1] 1.338
Now, I consider two other models, model 1 and model 2, with prediction errors $\epsilon_1$ and $\epsilon_2$: both have low bias and oscillate around the benchmark, with model 2 noisier than model 1.
# Alternative model 1 (low bias, high variance, oscillating)
m <- length(y.pred)
eps1 <- rnorm(m, mean = 0, sd = 1.5)
y.pred1 <- intercept.coeff + slope.coeff*u.test + sin(u.test) + eps1
# prediction error for model 1
pred.err1 <- y.pred1 - y.test
We can visualize the predictions of models 1 and 2 on the test set, for a few different correlations between their prediction errors:
# Different prediction errors correlations for 1 and 2
rho.vec <- c(-1, -0.8, 0.6, 1)
# Independent random gaussian numbers defining model 2 errors
eps <- rnorm(m, mean = 0, sd = 2)
# Plotting the predictions with different correlations
par(mfrow=c(2, 2))
for (i in 1:4)
{
rho <- rho.vec[i]
# Correlated gaussian numbers (Cholesky decomposition)
eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps
# Alternative model 2 (low bias, higher variance than 1, oscillating)
y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2
# prediction error for model 2
pred.err2 <- y.pred2 - y.test
# predictions from 1 & 2 correlation
corr.pred12 <- round(cor(pred.err1, pred.err2), 2)
# Plot
plot(u.test, y.test, type = "p",
xlab = "test set values", ylab = "predicted values",
main = paste("models 1 & 2 pred. values \n correlation:",
corr.pred12))
points(u.test, y.test, pch = 19)
lines(u.test, y.pred, lwd = 2)
lines(u.test, y.pred1,
col = "blue", lwd = 2)
lines(u.test, y.pred2,
col = "red", lwd = 2)
}
Now, including allocations of models 1 and 2 in the ensemble, over a grid of correlations between their prediction errors:
# Allocation for model 1 in the ensemble
alpha.vec <- seq(from = 0, to = 1, by = 0.05)
# Correlations between model 1 and model 2
rho.vec <- seq(from = -1, to = 1, by = 0.05)
# Results matrices
nb.row <- length(alpha.vec)
nb.col <- length(rho.vec)
## Average prediction errors of the ensemble
mean.pred.err.ens <- matrix(0, nrow = nb.row, ncol = nb.col)
rownames(mean.pred.err.ens) <- paste0("pct. 1 : ", alpha.vec*100, "%")
colnames(mean.pred.err.ens) <- paste0("corr(1, 2) : ", rho.vec)
## Variance of prediction error of the ensemble
var.pred.err.ens <- matrix(0, nrow = nb.row, ncol = nb.col)
rownames(var.pred.err.ens) <- paste0("pct. 1 : ", alpha.vec*100, "%")
colnames(var.pred.err.ens) <- paste0("corr(1, 2) : ", rho.vec)
# loop on correlations and allocations
for (i in 1:nb.row)
{
for (j in 1:nb.col)
{
alpha <- alpha.vec[i]
rho <- rho.vec[j]
# Alternative model 2 (low bias, higher variance, oscillating)
eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps
y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2
pred.err2 <- y.pred2 - y.test
# Ensemble prediction error
z <- alpha*pred.err1 + (1-alpha)*pred.err2
mean.pred.err.ens[i, j] <- mean(z)
var.pred.err.ens[i, j] <- var(z)
}
}
res.var <- var.pred.err.ens
# Heat map for the variance of the ensemble
filled.contour(alpha.vec, rho.vec, res.var, color = terrain.colors,
main = "prediction error variance for the ensemble",
xlab = "allocation in x1", ylab = "correlation between 1 and 2")
Hence, the lower the correlation between the prediction errors of models 1 and 2, the lower the variance of the ensemble's prediction error.
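This monotonicity can also be read directly from the closed-form variance derived above: at a fixed allocation, $Var(\epsilon)$ is linear and increasing in $\rho_{12}$. A small sketch of the theoretical curve (using $\alpha = 0.45$ and illustrative standard deviations close to those of the simulated errors):

# Theoretical ensemble variance as a function of the correlation,
# at a fixed allocation of model 1
ens.var <- function(alpha, rho, sigma1, sigma2)
{
  alpha^2*sigma1^2 + (1 - alpha)^2*sigma2^2 +
    2*alpha*(1 - alpha)*rho*sigma1*sigma2
}
rho.grid <- seq(from = -1, to = 1, by = 0.05)
plot(rho.grid, ens.var(alpha = 0.45, rho = rho.grid, sigma1 = 1.5, sigma2 = 2),
     type = "l", lwd = 2, xlab = "correlation between 1 and 2",
     ylab = "ensemble prediction error variance")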
Finding the final ensemble, with lower variance and lower bias:
# Final ensemble
## Allocation
alpha.vec[which.min(as.vector(res.var))]
## [1] 0.45
## Correlation
res.bias <- abs(mean.pred.err.ens)
which.min(res.bias[which.min(as.vector(res.var)), ])
## corr(1, 2) : -0.7
##                7
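Note that `which.min` applied to a matrix returns a single column-major linear index, so using it directly to index `alpha.vec` only works here because the minimum-variance cell happens to lie in the first column ($\rho = -1$). A more robust way to recover both coordinates, using base R's `arrayInd` (a sketch, not in the original code):

# Recover the (row, column) position of the minimum variance explicitly
idx <- arrayInd(which.min(res.var), dim(res.var))
alpha.vec[idx[1]] # allocation of model 1
rho.vec[idx[2]]   # corresponding correlation between models 1 and 2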
Creating the final model with these parameters:
rho <- -0.7
eps2 <- rho*eps1 + sqrt(1 - rho^2)*eps
# Alternative model 2 (low bias, higher variance, oscillating)
y.pred2 <- intercept.coeff + slope.coeff*u.test - 0.35*sin(u.test) + eps2
# Final ensemble prediction
y.pred.ens <- 0.45*y.pred1 + 0.55*y.pred2
# Plot
plot(u.test, y.test, type = "p",
xlab = "test set", ylab = "predicted values",
main = "Final ensemble model (green)")
points(u.test, y.test, pch = 19)
# benchmark
lines(u.test, y.pred, lwd = 2)
# model 1
lines(u.test, y.pred1, col = "blue", lwd = 2)
# model 2
lines(u.test, y.pred2, col = "red", lwd = 2)
# ensemble model with 1 and 2
points(u.test, y.pred.ens, col = "green", pch = 19)
lines(u.test, y.pred.ens, col = "green", lwd = 2)
Performance of the final model:
# Benchmark
pred.err <- y.pred - y.test
# Model 1
pred1.err <- y.pred1 - y.test
# Model 2
pred2.err <- y.pred2 - y.test
# Ensemble model
pred.ens.err <- y.pred.ens - y.test
# Bias comparison
bias.ens <- mean(y.pred.ens - y.test)
bias.ens_vsbench <- (bias.ens/mean(y.pred - y.test) - 1)*100
bias.ens_vs1 <- (bias.ens/mean(y.pred1 - y.test) - 1)*100
bias.ens_vs2 <- (bias.ens/mean(y.pred2 - y.test) - 1)*100
# Variance comparison
var.ens <- var(y.pred.ens - y.test)
var.ens_vsbench <- (var.ens/var(y.pred - y.test) - 1)*100
var.ens_vs1 <- (var.ens/var(y.pred1 - y.test) - 1)*100
var.ens_vs2 <- (var.ens/var(y.pred2 - y.test) - 1)*100
cbind(c(bias.ens_vsbench, bias.ens_vs1, bias.ens_vs2), c(var.ens_vsbench, var.ens_vs1, var.ens_vs2))
## [,1] [,2]
## [1,] -95.31 46.27
## [2,] -112.12 -62.93
## [3,] -88.33 -45.53
The rows compare the ensemble to the benchmark, model 1 and model 2, respectively; the first column is the relative change in bias (in %), the second the relative change in variance. The ensemble's bias is much lower in absolute value than that of all three models, and its variance is substantially lower than that of models 1 and 2, at the cost of being higher than the benchmark's.
