The goal is to compare a few algorithms for missing value imputation when used before k-means clustering. For the clustering we use the same algorithm as in ClustImpute, so that only the computation time of the imputation itself is compared. In a nutshell, we'll see that ClustImpute scales like a random imputation and is hence much faster than pre-processing with MICE or missRanger. This is not surprising, since ClustImpute essentially runs a fixed number of random imputations conditional on the current cluster assignment. The code that has been used follows below.
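To make that last point concrete, here is a minimal sketch of the alternating scheme. This is an illustrative simplification using base R's `kmeans()`, not the ClustImpute package's actual implementation:

```r
# Illustrative sketch only -- NOT the ClustImpute package's actual code.
# Alternate (a) random imputation conditional on the current clusters and
# (b) a k-means update on the completed data.
clust_impute_sketch <- function(X, k = 3, nr_iter = 10) {
  miss <- is.na(X)
  # start with an unconditional random imputation per variable
  for (j in seq_len(ncol(X))) {
    donors <- X[!miss[, j], j]
    X[miss[, j], j] <- donors[sample.int(length(donors), sum(miss[, j]), replace = TRUE)]
  }
  cl <- kmeans(X, centers = k)$cluster
  for (i in seq_len(nr_iter)) {
    # redraw each missing value from observed values within its current cluster
    for (j in seq_len(ncol(X))) {
      for (g in seq_len(k)) {
        idx    <- miss[, j] & cl == g
        donors <- X[!miss[, j] & cl == g, j]
        if (any(idx) && length(donors) > 0)
          X[idx, j] <- donors[sample.int(length(donors), sum(idx), replace = TRUE)]
      }
    }
    cl <- kmeans(X, centers = k)$cluster # update assignments on completed data
  }
  list(data = X, clusters = cl)
}
```

Each iteration costs roughly one random imputation plus a k-means run, so the total cost grows linearly in the number of observations, which is why ClustImpute behaves like a plain random imputation in the benchmarks below.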
First we'll load all relevant packages.
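Judging from the functions called below, the following set should suffice (note in particular that `impute()` and `Cs()` come from Hmisc):

```r
library(ClustImpute)     # ClustImpute(), miss_sim()
library(ClusterR)        # KMeans_arma(), predict_KMeans(), external_validation()
library(missRanger)      # missRanger()
library(mice)            # mice(), complete()
library(Amelia)          # amelia()
library(Hmisc)           # impute(), Cs()
library(microbenchmark)  # microbenchmark()
library(ggplot2)         # ggplot(), autoplot() method for benchmark results
library(cowplot)         # plot_grid(), theme_cowplot(), get_legend()
```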
Then we define a function that creates an artificial data set of variable size \(n\).
```r
create_random_data <- function(n, nr_other_vars = 4, seedvar = 739) {
  n <- round(n/3)*3 # ensure n is divisible by 3
  set.seed(seedvar)
  mat <- matrix(rnorm(nr_other_vars*n), n, nr_other_vars) # uninformative noise variables
  me <- 4 # mean
  x <- c(rnorm(n/3, me/2, 1), rnorm(2*n/3, -me/2, 1))
  y <- c(rnorm(n/3, 0, 1), rnorm(n/3, me, 1), rnorm(n/3, -me, 1))
  true_clust <- c(rep(1, n/3), rep(2, n/3), rep(3, n/3)) # true clusters
  dat <- cbind(mat, x, y)
  dat <- as.data.frame(scale(dat)) # scaling
  return(list(data = dat, true_clusters = true_clust))
}
```
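As a quick sanity check (not part of the original post), the function returns three balanced clusters:

```r
toy <- create_random_data(300)
dim(toy$data)             # 300 rows, 6 columns: 4 noise variables plus x and y
table(toy$true_clusters)  # 100 observations in each of the 3 true clusters
```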
The parametrization `nr_iter_other` of the clustering for the external imputation strategies is set so that we have a comparable number of steps after imputation (steps of ClustImpute with a weight < 1 are considered the burn-in phase, since only afterwards is the distribution considered credible). Due to its very slow performance we decided to drop missForest and only consider missRanger. The results below are for a missing rate of 20%, though this could be changed easily. For each data set, imputation and clustering are repeated 5 times with a different seed each time.
```r
### for various n and p
N <- 2^(0:4)*400  # data set sizes: 400, 800, 1600, 3200, 6400
p <- .2           # missing rate

### Parameters
# ClustImpute
nr_iter <- 14    # iterations of procedure
n_end <- 10      # step until convergence of weight function to 1
nr_cluster <- 3  # number of clusters
c_steps <- 50    # number of cluster steps per iteration
# Clustering based on imputation
nr_iter_other <- (nr_iter - n_end) * c_steps # comparable number of steps "after" imputation

param_times <- 5 # how often to compute the benchmark

results <- list()
count <- 0
for (n in N) {
  count <- count + 1

  # Create random data with missings
  random_data <- create_random_data(n)
  dat <- random_data$data
  dat_with_miss <- miss_sim(dat, p, seed_nr = 123) # simulate p = 20% missing values

  ### Benchmark time and calculate rand index
  rand_ClustImpute <- 0
  rand_missRanger <- 0
  rand_RandomImp <- 0
  rand_mice <- 0
  rand_amelia <- 0

  # Use a different seed each time (drawn in the setup block below)
  mbm <- microbenchmark(
    "ClustImpute" = {
      res <- ClustImpute(dat_with_miss, nr_cluster = nr_cluster, nr_iter = nr_iter,
                         c_steps = c_steps, n_end = n_end, seed_nr = random_seed)
      rand_ClustImpute <- rand_ClustImpute +
        external_validation(random_data$true_clusters, res$clusters)
    },
    "RandomImp" = {
      dat_random_imp <- dat_with_miss
      for (j in 1:dim(dat)[2]) {
        dat_random_imp[, j] <- impute(dat_random_imp[, j], fun = "random")
      }
      cl_RandomImp <- KMeans_arma(data = dat_random_imp, clusters = 3,
                                  n_iter = nr_iter_other, seed = random_seed)
      pred_RandomImp <- predict_KMeans(dat_random_imp, cl_RandomImp)
      class(pred_RandomImp) <- "numeric"
      rand_RandomImp <- rand_RandomImp +
        external_validation(random_data$true_clusters, pred_RandomImp)
    },
    "MissRanger" = {
      imp_missRanger <- missRanger(dat_with_miss, pmm.k = 3)
      cl_missRanger <- KMeans_arma(data = imp_missRanger, clusters = 3,
                                   n_iter = nr_iter_other, seed = random_seed)
      pred_missRanger <- predict_KMeans(imp_missRanger, cl_missRanger)
      class(pred_missRanger) <- "numeric"
      rand_missRanger <- rand_missRanger +
        external_validation(random_data$true_clusters, pred_missRanger)
    },
    "MICE" = {
      dat_mice <- mice(dat_with_miss, m = 1, maxit = 50, meth = 'pmm') # single data set
      dat_mice <- complete(dat_mice)
      cl_mice <- KMeans_arma(data = dat_mice, clusters = 3,
                             n_iter = nr_iter_other, seed = random_seed)
      pred_mice <- predict_KMeans(dat_mice, cl_mice)
      class(pred_mice) <- "numeric"
      rand_mice <- rand_mice +
        external_validation(random_data$true_clusters, pred_mice)
    },
    "AMELIA" = {
      dat_amelia <- amelia(dat_with_miss, m = 1) # single data set
      dat_amelia <- dat_amelia$imputations$imp1
      cl_amelia <- KMeans_arma(data = dat_amelia, clusters = 3,
                               n_iter = nr_iter_other, seed = random_seed)
      pred_amelia <- predict_KMeans(dat_amelia, cl_amelia)
      class(pred_amelia) <- "numeric"
      rand_amelia <- rand_amelia +
        external_validation(random_data$true_clusters, pred_amelia)
    },
    times = param_times,
    setup = {random_seed <- round(runif(1)*1e5)},
    unit = "s")

  # compute average rand index
  rand_ClustImpute <- rand_ClustImpute/param_times
  rand_missRanger <- rand_missRanger/param_times
  rand_RandomImp <- rand_RandomImp/param_times
  rand_mice <- rand_mice/param_times
  rand_amelia <- rand_amelia/param_times

  results$randIndex[[count]] <- c(ClustImpute = rand_ClustImpute, RandomImp = rand_RandomImp,
                                  # missForest = rand_missForest, # dropped (too slow)
                                  missRanger = rand_missRanger, MICE = rand_mice, AMELIA = rand_amelia)
  results$benchmark[[count]] <- mbm
  mbm_median <- print(mbm)$median
  results$benchmark_median[[count]] <- mbm_median
}
```
Below are the results for four of the five data sets (which only differ in size). Clearly, random imputation is fastest, and the AMELIA package is only slightly slower. ClustImpute comes third and is considerably faster than MICE and missRanger.
```r
plot_grid(autoplot(results$benchmark[[2]]), autoplot(results$benchmark[[3]]),
          autoplot(results$benchmark[[4]]), autoplot(results$benchmark[[5]]),
          ncol = 2, labels = sprintf("n = %s", N[2:5]), # label only the sizes shown
          label_size = 12, vjust = 1)
```
Let's focus on the scalability here: AMELIA and ClustImpute scale much better than MICE and missRanger. ClustImpute scales like a simple random imputation, similarly to AMELIA.
```r
data2plot <- data.frame(median = unlist(results$benchmark_median))
data2plot$method <- rep(Hmisc::Cs(ClustImpute, RandomImp, MissRanger, MICE, AMELIA), length(N))
data2plot$n <- rep(N, each = 5) # 5 methods per data set size

# with shared legend
ps1 <- ggplot(data2plot, aes(x = n, y = median, color = method)) + geom_point() +
  theme_cowplot() + geom_smooth() +
  xlab("Number of observations n") + ylab("Median running time in seconds")
legend <- get_legend(ps1)
ps2 <- ggplot(data2plot, aes(x = n, y = median, color = method)) + geom_point() +
  theme_cowplot() + geom_smooth() + scale_y_log10() + # log scale for the right panel
  xlab("Number of observations n") + ylab("Median running time in seconds") +
  theme(legend.position = "none")
plot_grid(ps1 + theme(legend.position = "none"), ps2, legend, nrow = 1, rel_widths = c(3, 3, 1))
```
Of course, the plots above only consider running time. Below is a table of the Rand indices comparing the resulting clusters with the true clusters. ClustImpute provides the highest values throughout, although the other imputation methods (except random imputation) might deliver better results if they were tuned.
```r
randtbl <- data.frame(matrix(unlist(results$randIndex),
                             nrow = length(results$randIndex), byrow = TRUE))
colnames(randtbl) <- names(results$randIndex[[1]])
randtbl$N <- N
knitr::kable(x = randtbl)
```
| ClustImpute | RandomImp | missRanger | MICE | AMELIA | N |
|---|---|---|---|---|---|
| 0.6784811 | 0.3162996 | 0.3031195 | 0.2920014 | 0.3529032 | 400 |
| 0.6518900 | 0.3149466 | 0.3107541 | 0.3186570 | 0.2839927 | 800 |
| 0.6895598 | 0.4122350 | 0.2648335 | 0.2978446 | 0.2051036 | 1600 |
| 0.6732008 | 0.3458818 | 0.3365066 | 0.4451816 | 0.4263087 | 3200 |
| 0.6672859 | 0.2768001 | 0.3134265 | 0.2249913 | 0.3039771 | 6400 |
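A note on the metric: per the ClusterR documentation, `external_validation()` computes the adjusted Rand index by default, so identical partitions score 1 while random label assignments score close to 0:

```r
# adjusted Rand index: 1 for identical partitions, ~0 for unrelated ones
external_validation(true_labels = c(1, 1, 2, 2, 3, 3),
                    clusters    = c(1, 1, 2, 2, 3, 3))  # returns 1
```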