Don’t recycle me!

[This article was first published on Rmazing, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

For me, one of the most annoying features of R is that by default, rbindcbind  and data.frame recycle the shorter vector to the length of the longer vector. I still don’t understand why the standard generics don’t have a parameter like cbind(1:10, 1:5, fill = TRUE) to fill up with ‘NA’s. There may be a deeper reason…

> cbind(1:10, 1:5)
      [,1] [,2]
 [1,]    1    1
 [2,]    2    2
 [3,]    3    3
 [4,]    4    4
 [5,]    5    5
 [6,]    6    1
 [7,]    7    2
 [8,]    8    3
 [9,]    9    4
[10,]   10    5
> rbind(1:10, 1:5)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    2    3    4    5    6    7    8    9    10
[2,]    1    2    3    4    5    1    2    3    4     5

> data.frame(x = 1:10, y = 1:5)
    x y
1   1 1
2   2 2
3   3 3
4   4 4
5   5 5
6   6 1
7   7 2
8   8 3
9   9 4
10 10 5

Even more, using unequal size matrices gives an error:

> mat1 <- matrix(rnorm(10), nrow = 5)
> mat2 <- matrix(rnorm(10), nrow = 2)
> cbind(mat1, mat2)
Fehler in cbind(mat1, mat2) : 
  Anzahl der Zeilen der Matrizen muss übereinstimmen (siehe Argument 2)

(You can learn some german here. Well, I suppose you know the error…)

Enter cbind.na, rbind.na, data.frame.na !

I modified the three generic functions so that all arguments are filled with ‘NA’s up to maximal occuring length (vector), rows (cbind) or columns (rbind). One must ‘source’ all three functions as cbind.na calls rbind.na etc:

cbind.na <- function (..., deparse.level = 1)
{
 na <- nargs() - (!missing(deparse.level))
 deparse.level <- as.integer(deparse.level)
 stopifnot(0 <= deparse.level, deparse.level <= 2)
 argl <- list(...)
 while (na > 0 && is.null(argl[[na]])) {
 argl <- argl[-na]
 na <- na - 1
 }
 if (na == 0)
 return(NULL)
 if (na == 1) {
 if (isS4(..1))
 return(cbind2(..1))
 else return(matrix(...)) ##.Internal(cbind(deparse.level, ...)))
 }
 if (deparse.level) {
 symarg <- as.list(sys.call()[-1L])[1L:na]
 Nms <- function(i) {
 if (is.null(r <- names(symarg[i])) || r == "") {
 if (is.symbol(r <- symarg[[i]]) || deparse.level ==
 2)
 deparse(r)
 }
 else r
 }
 }
 ## deactivated, otherwise no fill in with two arguments
 if (na == 0) {
 r <- argl[[2]]
 fix.na <- FALSE
 }
 else {
 nrs <- unname(lapply(argl, nrow))
 iV <- sapply(nrs, is.null)
 fix.na <- identical(nrs[(na - 1):na], list(NULL, NULL))
 ## deactivated, otherwise data will be recycled
 #if (fix.na) {
 # nr <- max(if (all(iV)) sapply(argl, length) else unlist(nrs[!iV]))
 # argl[[na]] <- cbind(rep(argl[[na]], length.out = nr),
 # deparse.level = 0)
 #}
 if (deparse.level) {
 if (fix.na)
 fix.na <- !is.null(Nna <- Nms(na))
 if (!is.null(nmi <- names(argl)))
 iV <- iV & (nmi == "")
 ii <- if (fix.na)
 2:(na - 1)
 else 2:na
 if (any(iV[ii])) {
 for (i in ii[iV[ii]]) if (!is.null(nmi <- Nms(i)))
 names(argl)[i] <- nmi
 }
 }

 ## filling with NA's to maximum occuring nrows
 nRow <- as.numeric(sapply(argl, function(x) NROW(x)))
 maxRow <- max(nRow, na.rm = TRUE)
 argl <- lapply(argl, function(x) if (is.null(nrow(x))) c(x, rep(NA, maxRow - length(x)))
 else rbind.na(x, matrix(, maxRow - nrow(x), ncol(x))))
 r <- do.call(cbind, c(argl[-1L], list(deparse.level = deparse.level)))
 }
 d2 <- dim(r)
 r <- cbind2(argl[[1]], r)
 if (deparse.level == 0)
 return(r)
 ism1 <- !is.null(d1 <- dim(..1)) && length(d1) == 2L
 ism2 <- !is.null(d2) && length(d2) == 2L && !fix.na
 if (ism1 && ism2)
 return(r)
 Ncol <- function(x) {
 d <- dim(x)
 if (length(d) == 2L)
 d[2L]
 else as.integer(length(x) > 0L)
 }
 nn1 <- !is.null(N1 <- if ((l1 <- Ncol(..1)) && !ism1) Nms(1))
 nn2 <- !is.null(N2 <- if (na == 2 && Ncol(..2) && !ism2) Nms(2))
 if (nn1 || nn2 || fix.na) {
 if (is.null(colnames(r)))
 colnames(r) <- rep.int("", ncol(r))
 setN <- function(i, nams) colnames(r)[i] <<- if (is.null(nams))
 ""
 else nams
 if (nn1)
 setN(1, N1)
 if (nn2)
 setN(1 + l1, N2)
 if (fix.na)
 setN(ncol(r), Nna)
 }
 r
}

 

rbind.na <- function (..., deparse.level = 1)
{
    na <- nargs() - (!missing(deparse.level))
    deparse.level <- as.integer(deparse.level)
    stopifnot(0 <= deparse.level, deparse.level <= 2)
    argl <- list(...)
    while (na > 0 && is.null(argl[[na]])) {
        argl <- argl[-na]
        na <- na - 1
    }
    if (na == 0)
        return(NULL)
    if (na == 1) {
        if (isS4(..1))
            return(rbind2(..1))
        else return(matrix(..., nrow = 1)) ##.Internal(rbind(deparse.level, ...)))
    }
    if (deparse.level) {
        symarg <- as.list(sys.call()[-1L])[1L:na]
        Nms <- function(i) {
            if (is.null(r <- names(symarg[i])) || r == "") {
                if (is.symbol(r <- symarg[[i]]) || deparse.level ==
                  2)
                  deparse(r)
            }
            else r
        }
    }

    ## deactivated, otherwise no fill in with two arguments
    if (na == 0) {
        r <- argl[[2]]
        fix.na <- FALSE
    }
    else {
        nrs <- unname(lapply(argl, ncol))
        iV <- sapply(nrs, is.null)
        fix.na <- identical(nrs[(na - 1):na], list(NULL, NULL))
        ## deactivated, otherwise data will be recycled
        #if (fix.na) {
        #    nr <- max(if (all(iV)) sapply(argl, length) else unlist(nrs[!iV]))
        #    argl[[na]] <- rbind(rep(argl[[na]], length.out = nr),
        #        deparse.level = 0)
        #}
        if (deparse.level) {
            if (fix.na)
                fix.na <- !is.null(Nna <- Nms(na))
            if (!is.null(nmi <- names(argl)))
                iV <- iV & (nmi == "")
            ii <- if (fix.na)
                2:(na - 1)
            else 2:na
            if (any(iV[ii])) {
                for (i in ii[iV[ii]]) if (!is.null(nmi <- Nms(i)))
                  names(argl)[i] <- nmi
            }
        }

        ## filling with NA's to maximum occuring ncols
        nCol <- as.numeric(sapply(argl, function(x) if (is.null(ncol(x))) length(x)
                                                    else ncol(x)))
        maxCol <- max(nCol, na.rm = TRUE)
        argl <- lapply(argl, function(x)  if (is.null(ncol(x))) c(x, rep(NA, maxCol - length(x)))
                                          else cbind(x, matrix(, nrow(x), maxCol - ncol(x))))

        ## create a common name vector from the
        ## column names of all 'argl' items
        namesVEC <- rep(NA, maxCol)
        for (i in 1:length(argl)) {
          CN <- colnames(argl[[i]])
          m <- !(CN %in% namesVEC)
          namesVEC[m] <- CN[m]
        }

        ## make all column names from common 'namesVEC'
        for (j in 1:length(argl)) {
          if (!is.null(ncol(argl[[j]]))) colnames(argl[[j]]) <- namesVEC
        }

        r <- do.call(rbind, c(argl[-1L], list(deparse.level = deparse.level)))
    }

    d2 <- dim(r)

    ## make all column names from common 'namesVEC'
    colnames(r) <- colnames(argl[[1]])

    r <- rbind2(argl[[1]], r)

    if (deparse.level == 0)
        return(r)
    ism1 <- !is.null(d1 <- dim(..1)) && length(d1) == 2L
    ism2 <- !is.null(d2) && length(d2) == 2L && !fix.na
    if (ism1 && ism2)
        return(r)
    Nrow <- function(x) {
        d <- dim(x)
        if (length(d) == 2L)
            d[1L]
        else as.integer(length(x) > 0L)
    }
    nn1 <- !is.null(N1 <- if ((l1 <- Nrow(..1)) && !ism1) Nms(1))
    nn2 <- !is.null(N2 <- if (na == 2 && Nrow(..2) && !ism2) Nms(2))
    if (nn1 || nn2 || fix.na) {
        if (is.null(rownames(r)))
            rownames(r) <- rep.int("", nrow(r))
        setN <- function(i, nams) rownames(r)[i] <<- if (is.null(nams))
            ""
        else nams
        if (nn1)
            setN(1, N1)
        if (nn2)
            setN(1 + l1, N2)
        if (fix.na)
            setN(nrow(r), Nna)
    }
    r
}

 

data.frame.na <- function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
    stringsAsFactors = FALSE)
{
    data.row.names <- if (check.rows && is.null(row.names))
        function(current, new, i) {
            if (is.character(current))
                new <- as.character(new)
            if (is.character(new))
                current <- as.character(current)
            if (anyDuplicated(new))
                return(current)
            if (is.null(current))
                return(new)
            if (all(current == new) || all(current == ""))
                return(new)
            stop(gettextf("mismatch of row names in arguments of 'data.frame', item %d",
                i), domain = NA)
        }
    else function(current, new, i) {
        if (is.null(current)) {
            if (anyDuplicated(new)) {
                warning("some row.names duplicated: ", paste(which(duplicated(new)),
                  collapse = ","), " --> row.names NOT used")
                current
            }
            else new
        }
        else current
    }
    object <- as.list(substitute(list(...)))[-1L]
    mrn <- is.null(row.names)
    x <- list(...)
    n <- length(x)
    if (n < 1L) {
        if (!mrn) {
            if (is.object(row.names) || !is.integer(row.names))
                row.names <- as.character(row.names)
            if (any(is.na(row.names)))
                stop("row names contain missing values")
            if (anyDuplicated(row.names))
                stop("duplicate row.names: ", paste(unique(row.names[duplicated(row.names)]),
                  collapse = ", "))
        }
        else row.names <- integer(0L)
        return(structure(list(), names = character(0L), row.names = row.names,
            class = "data.frame"))
    }
    vnames <- names(x)
    if (length(vnames) != n)
        vnames <- character(n)
    no.vn <- !nzchar(vnames)
    vlist <- vnames <- as.list(vnames)
    nrows <- ncols <- integer(n)
    for (i in seq_len(n)) {
        xi <- if (is.character(x[[i]]) || is.list(x[[i]]))
            as.data.frame(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors)
        else as.data.frame(x[[i]], optional = TRUE)
        nrows[i] <- .row_names_info(xi)
        ncols[i] <- length(xi)
        namesi <- names(xi)
        if (ncols[i] > 1L) {
            if (length(namesi) == 0L)
                namesi <- seq_len(ncols[i])
            if (no.vn[i])
                vnames[[i]] <- namesi
            else vnames[[i]] <- paste(vnames[[i]], namesi, sep = ".")
        }
        else {
            if (length(namesi))
                vnames[[i]] <- namesi
            else if (no.vn[[i]]) {
                tmpname <- deparse(object[[i]])[1L]
                if (substr(tmpname, 1L, 2L) == "I(") {
                  ntmpn <- nchar(tmpname, "c")
                  if (substr(tmpname, ntmpn, ntmpn) == ")")
                    tmpname <- substr(tmpname, 3L, ntmpn - 1L)
                }
                vnames[[i]] <- tmpname
            }
        }
        if (missing(row.names) && nrows[i] > 0L) {
            rowsi <- attr(xi, "row.names")
            nc <- nchar(rowsi, allowNA = FALSE)
            nc <- nc[!is.na(nc)]
            if (length(nc) && any(nc))
                row.names <- data.row.names(row.names, rowsi,
                  i)
        }
        nrows[i] <- abs(nrows[i])
        vlist[[i]] <- xi
    }
    nr <- max(nrows)
    for (i in seq_len(n)[nrows < nr]) {
        xi <- vlist[[i]]
        if (nrows[i] > 0L) {
            xi <- unclass(xi)
            fixed <- TRUE
            for (j in seq_along(xi)) {
                ### added NA fill to max length/nrow
                xi1 <- xi[[j]]
                if (is.vector(xi1) || is.factor(xi1))
                  xi[[j]] <- c(xi1, rep(NA, nr - nrows[i]))
                else if (is.character(xi1) && class(xi1) == "AsIs")
                  xi[[j]] <- structure(c(xi1, rep(NA, nr - nrows[i])),
                    class = class(xi1))
                else if (inherits(xi1, "Date") || inherits(xi1,
                  "POSIXct"))
                  xi[[j]] <- c(xi1, rep(NA, nr - nrows[i]))
                else {
                  fixed <- FALSE
                  break
                }
            }
            if (fixed) {
                vlist[[i]] <- xi
                next
            }
        }
        stop("arguments imply differing number of rows: ", paste(unique(nrows),
            collapse = ", "))
    }
    value <- unlist(vlist, recursive = FALSE, use.names = FALSE)
    vnames <- unlist(vnames[ncols > 0L])
    noname <- !nzchar(vnames)
    if (any(noname))
        vnames[noname] <- paste("Var", seq_along(vnames), sep = ".")[noname]
    if (check.names)
        vnames <- make.names(vnames, unique = TRUE)
    names(value) <- vnames
    if (!mrn) {
        if (length(row.names) == 1L && nr != 1L) {
            if (is.character(row.names))
                row.names <- match(row.names, vnames, 0L)
            if (length(row.names) != 1L || row.names < 1L ||
                row.names > length(vnames))
                stop("row.names should specify one of the variables")
            i <- row.names
            row.names <- value[[i]]
            value <- value[-i]
        }
        else if (!is.null(row.names) && length(row.names) !=
            nr)
            stop("row names supplied are of the wrong length")
    }
    else if (!is.null(row.names) && length(row.names) != nr) {
        warning("row names were found from a short variable and have been discarded")
        row.names <- NULL
    }
    if (is.null(row.names))
        row.names <- .set_row_names(nr)
    else {
        if (is.object(row.names) || !is.integer(row.names))
            row.names <- as.character(row.names)
        if (any(is.na(row.names)))
            stop("row names contain missing values")
        if (anyDuplicated(row.names))
            stop("duplicate row.names: ", paste(unique(row.names[duplicated(row.names)]),
                collapse = ", "))
    }
    attr(value, "row.names") <- row.names
    attr(value, "class") <- "data.frame"
    value
}

So, let’s start!

> cbind.na(1:8, 1:7, 1:5, 1:10)
      [,1] [,2] [,3] [,4]
 [1,]    1    1    1    1
 [2,]    2    2    2    2
 [3,]    3    3    3    3
 [4,]    4    4    4    4
 [5,]    5    5    5    5
 [6,]    6    6   NA    6
 [7,]    7    7   NA    7
 [8,]    8   NA   NA    8
 [9,]   NA   NA   NA    9
[10,]   NA   NA   NA   10
> rbind.na(1:8, 1:7, 1:5, 1:10)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    2    3    4    5    6    7    8   NA    NA
[2,]    1    2    3    4    5    6    7   NA   NA    NA
[3,]    1    2    3    4    5   NA   NA   NA   NA    NA
[4,]    1    2    3    4    5    6    7    8    9    10
> data.frame.na(A = 1:7, B = 1:5, C = letters[1:3], 
+               D = factor(c(1, 1, 2, 2)))
  A  B    C  D
1 1  1    a  1
2 2  2    b  1
3 3  3    c  2
4 4  4 <NA>  2
5 5  5 <NA> NA
6 6 NA <NA> NA
7 7 NA <NA> NA

System time overhead for NA-filling is not too much:
> testList <- vector("list", length = 10000)
> for (i in 1:10000) testList[[i]] <- sample(1:100)
> system.time(res1 <- do.call(cbind, testList))
   user  system elapsed 
   0.01    0.00    0.02 

> testList <- vector("list", length = 10000)
> for (i in 1:10000) testList[[i]] <- sample(1:100, sample(1:100))
> system.time(res2 <- do.call(cbind.na, testList))
   user  system elapsed 
   0.23    0.00    0.24

Have fun!

Filed under: R Internals

To leave a comment for the author, please follow the link and comment on their blog: Rmazing.

R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

Never miss an update!
Subscribe to R-bloggers to receive
e-mails with the latest R posts.
(You will not see this message again.)

Click here to close (This popup will not appear again)