[This article was first published on Rmazing, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
For me, one of the most annoying features of R is that by default, rbind, cbind and data.frame recycle the shorter vector to the length of the longer vector. I still don’t understand why the standard generics don’t have a parameter like cbind(1:10, 1:5, fill = TRUE) to fill up with ‘NA’s. There may be a deeper reason…
> cbind(1:10, 1:5) [,1] [,2] [1,] 1 1 [2,] 2 2 [3,] 3 3 [4,] 4 4 [5,] 5 5 [6,] 6 1 [7,] 7 2 [8,] 8 3 [9,] 9 4 [10,] 10 5 > rbind(1:10, 1:5) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [1,] 1 2 3 4 5 6 7 8 9 10 [2,] 1 2 3 4 5 1 2 3 4 5 > data.frame(x = 1:10, y = 1:5) x y 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 1 7 7 2 8 8 3 9 9 4 10 10 5
Even more, using unequal size matrices gives an error:
> mat1 <- matrix(rnorm(10), nrow = 5) > mat2 <- matrix(rnorm(10), nrow = 2) > cbind(mat1, mat2) Fehler in cbind(mat1, mat2) : Anzahl der Zeilen der Matrizen muss übereinstimmen (siehe Argument 2)
(You can learn some german here. Well, I suppose you know the error…)
Enter cbind.na, rbind.na, data.frame.na !
I modified the three generic functions so that all arguments are filled with ‘NA’s up to maximal occuring length (vector), rows (cbind) or columns (rbind). One must ‘source’ all three functions as cbind.na calls rbind.na etc:
cbind.na <- function (..., deparse.level = 1) { na <- nargs() - (!missing(deparse.level)) deparse.level <- as.integer(deparse.level) stopifnot(0 <= deparse.level, deparse.level <= 2) argl <- list(...) while (na > 0 && is.null(argl[[na]])) { argl <- argl[-na] na <- na - 1 } if (na == 0) return(NULL) if (na == 1) { if (isS4(..1)) return(cbind2(..1)) else return(matrix(...)) ##.Internal(cbind(deparse.level, ...))) } if (deparse.level) { symarg <- as.list(sys.call()[-1L])[1L:na] Nms <- function(i) { if (is.null(r <- names(symarg[i])) || r == "") { if (is.symbol(r <- symarg[[i]]) || deparse.level == 2) deparse(r) } else r } } ## deactivated, otherwise no fill in with two arguments if (na == 0) { r <- argl[[2]] fix.na <- FALSE } else { nrs <- unname(lapply(argl, nrow)) iV <- sapply(nrs, is.null) fix.na <- identical(nrs[(na - 1):na], list(NULL, NULL)) ## deactivated, otherwise data will be recycled #if (fix.na) { # nr <- max(if (all(iV)) sapply(argl, length) else unlist(nrs[!iV])) # argl[[na]] <- cbind(rep(argl[[na]], length.out = nr), # deparse.level = 0) #} if (deparse.level) { if (fix.na) fix.na <- !is.null(Nna <- Nms(na)) if (!is.null(nmi <- names(argl))) iV <- iV & (nmi == "") ii <- if (fix.na) 2:(na - 1) else 2:na if (any(iV[ii])) { for (i in ii[iV[ii]]) if (!is.null(nmi <- Nms(i))) names(argl)[i] <- nmi } } ## filling with NA's to maximum occuring nrows nRow <- as.numeric(sapply(argl, function(x) NROW(x))) maxRow <- max(nRow, na.rm = TRUE) argl <- lapply(argl, function(x) if (is.null(nrow(x))) c(x, rep(NA, maxRow - length(x))) else rbind.na(x, matrix(, maxRow - nrow(x), ncol(x)))) r <- do.call(cbind, c(argl[-1L], list(deparse.level = deparse.level))) } d2 <- dim(r) r <- cbind2(argl[[1]], r) if (deparse.level == 0) return(r) ism1 <- !is.null(d1 <- dim(..1)) && length(d1) == 2L ism2 <- !is.null(d2) && length(d2) == 2L && !fix.na if (ism1 && ism2) return(r) Ncol <- function(x) { d <- dim(x) if (length(d) == 2L) d[2L] else as.integer(length(x) > 0L) } nn1 <- !is.null(N1 <- if ((l1 <- Ncol(..1)) && !ism1) Nms(1)) nn2 <- !is.null(N2 <- if (na == 2 && Ncol(..2) && !ism2) Nms(2)) if (nn1 || nn2 || fix.na) { if (is.null(colnames(r))) colnames(r) <- rep.int("", ncol(r)) setN <- function(i, nams) colnames(r)[i] <<- if (is.null(nams)) "" else nams if (nn1) setN(1, N1) if (nn2) setN(1 + l1, N2) if (fix.na) setN(ncol(r), Nna) } r }
rbind.na <- function (..., deparse.level = 1) { na <- nargs() - (!missing(deparse.level)) deparse.level <- as.integer(deparse.level) stopifnot(0 <= deparse.level, deparse.level <= 2) argl <- list(...) while (na > 0 && is.null(argl[[na]])) { argl <- argl[-na] na <- na - 1 } if (na == 0) return(NULL) if (na == 1) { if (isS4(..1)) return(rbind2(..1)) else return(matrix(..., nrow = 1)) ##.Internal(rbind(deparse.level, ...))) } if (deparse.level) { symarg <- as.list(sys.call()[-1L])[1L:na] Nms <- function(i) { if (is.null(r <- names(symarg[i])) || r == "") { if (is.symbol(r <- symarg[[i]]) || deparse.level == 2) deparse(r) } else r } } ## deactivated, otherwise no fill in with two arguments if (na == 0) { r <- argl[[2]] fix.na <- FALSE } else { nrs <- unname(lapply(argl, ncol)) iV <- sapply(nrs, is.null) fix.na <- identical(nrs[(na - 1):na], list(NULL, NULL)) ## deactivated, otherwise data will be recycled #if (fix.na) { # nr <- max(if (all(iV)) sapply(argl, length) else unlist(nrs[!iV])) # argl[[na]] <- rbind(rep(argl[[na]], length.out = nr), # deparse.level = 0) #} if (deparse.level) { if (fix.na) fix.na <- !is.null(Nna <- Nms(na)) if (!is.null(nmi <- names(argl))) iV <- iV & (nmi == "") ii <- if (fix.na) 2:(na - 1) else 2:na if (any(iV[ii])) { for (i in ii[iV[ii]]) if (!is.null(nmi <- Nms(i))) names(argl)[i] <- nmi } } ## filling with NA's to maximum occuring ncols nCol <- as.numeric(sapply(argl, function(x) if (is.null(ncol(x))) length(x) else ncol(x))) maxCol <- max(nCol, na.rm = TRUE) argl <- lapply(argl, function(x) if (is.null(ncol(x))) c(x, rep(NA, maxCol - length(x))) else cbind(x, matrix(, nrow(x), maxCol - ncol(x)))) ## create a common name vector from the ## column names of all 'argl' items namesVEC <- rep(NA, maxCol) for (i in 1:length(argl)) { CN <- colnames(argl[[i]]) m <- !(CN %in% namesVEC) namesVEC[m] <- CN[m] } ## make all column names from common 'namesVEC' for (j in 1:length(argl)) { if (!is.null(ncol(argl[[j]]))) colnames(argl[[j]]) <- namesVEC } r <- do.call(rbind, c(argl[-1L], list(deparse.level = deparse.level))) } d2 <- dim(r) ## make all column names from common 'namesVEC' colnames(r) <- colnames(argl[[1]]) r <- rbind2(argl[[1]], r) if (deparse.level == 0) return(r) ism1 <- !is.null(d1 <- dim(..1)) && length(d1) == 2L ism2 <- !is.null(d2) && length(d2) == 2L && !fix.na if (ism1 && ism2) return(r) Nrow <- function(x) { d <- dim(x) if (length(d) == 2L) d[1L] else as.integer(length(x) > 0L) } nn1 <- !is.null(N1 <- if ((l1 <- Nrow(..1)) && !ism1) Nms(1)) nn2 <- !is.null(N2 <- if (na == 2 && Nrow(..2) && !ism2) Nms(2)) if (nn1 || nn2 || fix.na) { if (is.null(rownames(r))) rownames(r) <- rep.int("", nrow(r)) setN <- function(i, nams) rownames(r)[i] <<- if (is.null(nams)) "" else nams if (nn1) setN(1, N1) if (nn2) setN(1 + l1, N2) if (fix.na) setN(nrow(r), Nna) } r }
data.frame.na <- function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, stringsAsFactors = FALSE) { data.row.names <- if (check.rows && is.null(row.names)) function(current, new, i) { if (is.character(current)) new <- as.character(new) if (is.character(new)) current <- as.character(current) if (anyDuplicated(new)) return(current) if (is.null(current)) return(new) if (all(current == new) || all(current == "")) return(new) stop(gettextf("mismatch of row names in arguments of 'data.frame', item %d", i), domain = NA) } else function(current, new, i) { if (is.null(current)) { if (anyDuplicated(new)) { warning("some row.names duplicated: ", paste(which(duplicated(new)), collapse = ","), " --> row.names NOT used") current } else new } else current } object <- as.list(substitute(list(...)))[-1L] mrn <- is.null(row.names) x <- list(...) n <- length(x) if (n < 1L) { if (!mrn) { if (is.object(row.names) || !is.integer(row.names)) row.names <- as.character(row.names) if (any(is.na(row.names))) stop("row names contain missing values") if (anyDuplicated(row.names)) stop("duplicate row.names: ", paste(unique(row.names[duplicated(row.names)]), collapse = ", ")) } else row.names <- integer(0L) return(structure(list(), names = character(0L), row.names = row.names, class = "data.frame")) } vnames <- names(x) if (length(vnames) != n) vnames <- character(n) no.vn <- !nzchar(vnames) vlist <- vnames <- as.list(vnames) nrows <- ncols <- integer(n) for (i in seq_len(n)) { xi <- if (is.character(x[[i]]) || is.list(x[[i]])) as.data.frame(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors) else as.data.frame(x[[i]], optional = TRUE) nrows[i] <- .row_names_info(xi) ncols[i] <- length(xi) namesi <- names(xi) if (ncols[i] > 1L) { if (length(namesi) == 0L) namesi <- seq_len(ncols[i]) if (no.vn[i]) vnames[[i]] <- namesi else vnames[[i]] <- paste(vnames[[i]], namesi, sep = ".") } else { if (length(namesi)) vnames[[i]] <- namesi else if (no.vn[[i]]) { tmpname <- deparse(object[[i]])[1L] if (substr(tmpname, 1L, 2L) == "I(") { ntmpn <- nchar(tmpname, "c") if (substr(tmpname, ntmpn, ntmpn) == ")") tmpname <- substr(tmpname, 3L, ntmpn - 1L) } vnames[[i]] <- tmpname } } if (missing(row.names) && nrows[i] > 0L) { rowsi <- attr(xi, "row.names") nc <- nchar(rowsi, allowNA = FALSE) nc <- nc[!is.na(nc)] if (length(nc) && any(nc)) row.names <- data.row.names(row.names, rowsi, i) } nrows[i] <- abs(nrows[i]) vlist[[i]] <- xi } nr <- max(nrows) for (i in seq_len(n)[nrows < nr]) { xi <- vlist[[i]] if (nrows[i] > 0L) { xi <- unclass(xi) fixed <- TRUE for (j in seq_along(xi)) { ### added NA fill to max length/nrow xi1 <- xi[[j]] if (is.vector(xi1) || is.factor(xi1)) xi[[j]] <- c(xi1, rep(NA, nr - nrows[i])) else if (is.character(xi1) && class(xi1) == "AsIs") xi[[j]] <- structure(c(xi1, rep(NA, nr - nrows[i])), class = class(xi1)) else if (inherits(xi1, "Date") || inherits(xi1, "POSIXct")) xi[[j]] <- c(xi1, rep(NA, nr - nrows[i])) else { fixed <- FALSE break } } if (fixed) { vlist[[i]] <- xi next } } stop("arguments imply differing number of rows: ", paste(unique(nrows), collapse = ", ")) } value <- unlist(vlist, recursive = FALSE, use.names = FALSE) vnames <- unlist(vnames[ncols > 0L]) noname <- !nzchar(vnames) if (any(noname)) vnames[noname] <- paste("Var", seq_along(vnames), sep = ".")[noname] if (check.names) vnames <- make.names(vnames, unique = TRUE) names(value) <- vnames if (!mrn) { if (length(row.names) == 1L && nr != 1L) { if (is.character(row.names)) row.names <- match(row.names, vnames, 0L) if (length(row.names) != 1L || row.names < 1L || row.names > length(vnames)) stop("row.names should specify one of the variables") i <- row.names row.names <- value[[i]] value <- value[-i] } else if (!is.null(row.names) && length(row.names) != nr) stop("row names supplied are of the wrong length") } else if (!is.null(row.names) && length(row.names) != nr) { warning("row names were found from a short variable and have been discarded") row.names <- NULL } if (is.null(row.names)) row.names <- .set_row_names(nr) else { if (is.object(row.names) || !is.integer(row.names)) row.names <- as.character(row.names) if (any(is.na(row.names))) stop("row names contain missing values") if (anyDuplicated(row.names)) stop("duplicate row.names: ", paste(unique(row.names[duplicated(row.names)]), collapse = ", ")) } attr(value, "row.names") <- row.names attr(value, "class") <- "data.frame" value }
So, let’s start!
> cbind.na(1:8, 1:7, 1:5, 1:10) [,1] [,2] [,3] [,4] [1,] 1 1 1 1 [2,] 2 2 2 2 [3,] 3 3 3 3 [4,] 4 4 4 4 [5,] 5 5 5 5 [6,] 6 6 NA 6 [7,] 7 7 NA 7 [8,] 8 NA NA 8 [9,] NA NA NA 9 [10,] NA NA NA 10 > rbind.na(1:8, 1:7, 1:5, 1:10) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [1,] 1 2 3 4 5 6 7 8 NA NA [2,] 1 2 3 4 5 6 7 NA NA NA [3,] 1 2 3 4 5 NA NA NA NA NA [4,] 1 2 3 4 5 6 7 8 9 10 > data.frame.na(A = 1:7, B = 1:5, C = letters[1:3], + D = factor(c(1, 1, 2, 2))) A B C D 1 1 1 a 1 2 2 2 b 1 3 3 3 c 2 4 4 4 <NA> 2 5 5 5 <NA> NA 6 6 NA <NA> NA 7 7 NA <NA> NA System time overhead for NA-filling is not too much: > testList <- vector("list", length = 10000) > for (i in 1:10000) testList[[i]] <- sample(1:100) > system.time(res1 <- do.call(cbind, testList)) user system elapsed 0.01 0.00 0.02 > testList <- vector("list", length = 10000) > for (i in 1:10000) testList[[i]] <- sample(1:100, sample(1:100)) > system.time(res2 <- do.call(cbind.na, testList)) user system elapsed 0.23 0.00 0.24 Have fun!
Filed under: R Internals
To leave a comment for the author, please follow the link and comment on their blog: Rmazing.
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.