Identifying Records in Data Frame A That Are Not Contained In Data Frame B – A Comparison

songpants

10 years ago

[This article was first published on [R]appster, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)

Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

Yesterday I launched my first question at Stackoverflow and apparently did a lot of things wrong as I managed to get my question closed wihtin hours

http://stackoverflow.com/questions/7728462/identify-records-in-data-frame-a-not-contained-in-data-frame-b

I had collected 9 different solutions to the problem and made the mistake to put it all within the original question space. So people complained and told me that such a collection belongs into a blog and not on Stackoverflow. Hence, I decided to go ahead and finally make my first blogging steps! I’ll probably still miss out on a lot of cool technical stuff on the blog, so please bear with me.

So here’s the thing:

Problem

How to identify records/rows in data frame x.1 that are not contained in data frame x.2 based on all attributes available (i.e. all columns) in the most efficient way?

Example Data

> x.1 <- data.frame(a=c(1,2,3,4,5), b=c(1,2,3,4,5))
> x.1
a b
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5

> x.2 <- data.frame(a=c(1,1,2,3,4), b=c(1,1,99,3,4))
> x.2
a  b
1 1  1
2 1  1
3 2 99
4 3  3
5 4  4

# BENCHMARK SETTINGS
require(microbenchmark)
to.sec <- 1000000000

Desired Result


  a b
2 2 2
5 5 5

Best Solution So Far

by Prof. Brian Ripley and Gabor Grothendieck

> fun.12 <- function(x.1,x.2,...){
+     x.1p <- do.call("paste", x.1)
+     x.2p <- do.call("paste", x.2)
+     x.1[! x.1p %in% x.2p, ]
+ }
> fun.12(x.1,x.2)
  a b
2 2 2
5 5 5
> sol.12 <- microbenchmark(fun.12(x.1,x.2))
> sol.12 <- median(sol.12$time)/to.sec
> sol.12
> [1] 0.0002059665

Efficiency Comparison

> sol.scope <- 1:13
> comp <- data.frame(lapply(sol.scope, function(x){
+     eval(substitute(get(SOL), list(SOL=paste("sol.", x, sep=""))))
+ }))
> names(comp) <- paste("solution", sol.scope)
> comp <- as.data.frame(t(comp[order(comp[1,])]))
> colnames(comp) <- "time"
> comp
time   time.rel
solution 12 0.0002080150   1.000000
solution 3  0.0002548310   1.225061
solution 1  0.0004656785   2.238677
solution 10 0.0006398950   3.076196
solution 6  0.0007878430   3.787434
solution 8  0.0010459795   5.028385
solution 11 0.0021617355  10.392210
solution 9  0.0025755710  12.381660
solution 7  0.0103444610  49.729399
solution 13 0.0211265200 101.562483
solution 5  0.0225685395 108.494770
solution 2            NA         NA
solution 4            NA         NA

Here are all the solutions I collected so far:

Solution 1

by Chase

> fun.1 <- function(x.1,x.2,...){
+     expr <- paste("subset(x.1,", paste(sapply(names(x.1), function(x){
+         paste("!(", x, " %in% x.2$", x, ")", sep="")
+     }), collapse=" | "), ")")
+     eval(parse(text=expr))
+ }
> fun.1(x.1,x.2)
a b
2 2 2
5 5 5
> sol.1 <- microbenchmark(fun.1(x.1,x.2))
> sol.1 <- median(sol.1$time)/to.sec
> sol.1
[1] 0.0004656785

Solution 2 (rather just an approach)

by Ramnath

> setdiff(x.1$a, x.2$a)   # elements in x.1$a NOT in x.2$a
[1] 5
> setdiff(x.2$a, x.1$a)   # elements in x.2$a NOT in x.1$a
numeric(0)
> sol.2 <- microbenchmark(setdiff(x.1$a, x.2$a))
> sol.2 <- median(sol.2$time)/to.sec
> sol.2
[1] 4.03865e-05
# This helps, but as it does not directly provide the solution, we will set this to 'NA'
sol.2 <- NA

Solution 3

by Prof. Brian Ripley

> fun.3 <- function(x.1, x.2, ...){
+     x.1.id <- do.call("paste", c(x.1, sep = "\r"))
+     x.2.id <- do.call("paste", c(x.2, sep = "\r"))
+     x.1[match(setdiff(x.1.id, x.2.id),x.1.id), ]
+ }
> fun.3(x.1,x.2)
a b
1 2 2
2 5 5
> sol.3 <- microbenchmark(fun.3(x.1,x.2))
> sol.3 <- median(sol.3$time)/to.sec
> sol.3
[1] 0.000254831

Solution 4 (rather just an approach)

by me

> fun.4 <- function(x.1, x.2, ...){
+     # Combine
+     df <- rbind(x.1,x.2)
+     df <- df[order(df[,1]),]
+     # Find duplicates
+     idx.1 <- duplicated(df, all=TRUE)
+     idx.2 <- duplicated(df, fromLast=TRUE)
+     idx <- cbind(idx.1, idx.2)
+     idx <- apply(idx, MARGIN=1, any)
+     # Index records
+     df[-which(idx),]
+ }
> fun.4(x.1,x.2)
a b
1 2 2
8 2 99
2 5 5
> sol.4 <- microbenchmark(fun.4(x.1,x.2))
> sol.4 <- median(sol.4$time)/to.sec
> sol.4
[1] 0.001062621
# As it does not match the desired records, we set this to 'NA'
sol.4 <- NA

Solution 5

base on solution by Gabor Grothendieck

> library(sqldf)
> fun.5 <- function(x1,x2,...){
+     out <- sqldf(
+         "SELECT * FROM x1
+             WHERE
+                 x1.a NOT IN (SELECT x2.a FROM x2) OR
+                 x1.b NOT IN (SELECT x2.b FROM x2)"
+     )
+     out
+ }
> fun.5(x1=x.1,x2=x.2)
a b
1 2 2
2 5 5
> sol.5 <- microbenchmark(fun.5(x1=x.1,x2=x.2))
> sol.5 <- median(sol.5$time)/to.sec
> sol.5
[1] 0.02256854

Solution 6

by Tal Galili

> fun.6 <- function(x.1,x.2){
+     x.1.vec <- apply(x.1, 1, paste, collapse = "")
+     x.2.vec <- apply(x.2, 1, paste, collapse = "")
+     x.1.without.x.2.rows <- x.1[!x.1.vec %in% x.2.vec,]
+     return(x.1.without.x.2.rows)
+ }
> fun.6(x.1,x.2)
a b
1 2 2
2 5 5
> sol.6 <- microbenchmark(fun.6(x.1,x.2))
> sol.6 <- median(sol.6$time)/to.sec
> sol.6
[1] 0.000787843

Solution 7

by nullglob

# COULD NOT REPRODUCE RESULTS WITH MY DATA
> fun.7 <- function(x.1,x.2,...){
+     a1 <- data.frame(a = 1:5, b = letters[1:5])
+     a2 <- data.frame(a = 1:3, b = letters[1:3])
+     comparison <- compare(a1,a2,allowAll=TRUE)
+     comparison$tM
+     difference <-
+         data.frame(lapply(1:ncol(a1),function(i)setdiff(a1[,i],comparison$tM[,i])))
+     colnames(difference) <- colnames(a1)
+     difference
+ }
> fun.7(x.1,x.2)
  a b
1 4 d
2 5 e
> ### DISCUSSION ###
> # 1) Effectiveness
> # Could not reproduce results with my data frames.
> # 2) Efficiency
> sol.7 <- microbenchmark(fun.7(x.1,x.2))
> sol.7 <- median(sol.7$time)/to.sec
> sol.7
[1] 0.01034446

Solution 8

by Henrico

#  Derived from src/library/base/R/merge.R
#  Part of the R package, http://www.R-project.org
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  https://www.r-project.org/Licenses/
> XinY <-
+     function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by,
+              notin = FALSE, incomparables = NULL,
+              ...)
+ {
+     fix.by <- function(by, df)
+     {
+         ## fix up 'by' to be a valid set of cols by number: 0 is row.names
+         if(is.null(by)) by <- numeric(0L)
+         by <- as.vector(by)
+         nc <- ncol(df)
+         if(is.character(by))
+             by <- match(by, c("row.names", names(df))) - 1L
+         else if(is.numeric(by)) {
+             if(any(by < 0L) || any(by > nc))
+                 stop("'by' must match numbers of columns")
+         } else if(is.logical(by)) {
+             if(length(by) != nc) stop("'by' must match number of columns")
+             by <- seq_along(by)[by]
+         } else stop("'by' must specify column(s) as numbers, names or logical")
+         if(any(is.na(by))) stop("'by' must specify valid column(s)")
+         unique(by)
+     }
+     nx <- nrow(x <- as.data.frame(x)); ny <- nrow(y <- as.data.frame(y))
+     by.x <- fix.by(by.x, x)
+     by.y <- fix.by(by.y, y)
+     if((l.b <- length(by.x)) != length(by.y))
+         stop("'by.x' and 'by.y' specify different numbers of columns")
+     if(l.b == 0L) {
+         ## was: stop("no columns to match on")
+         ## returns x
+         x
+     }
+     else {
+         if(any(by.x == 0L)) {
+             x <- cbind(Row.names = I(row.names(x)), x)
+             by.x <- by.x + 1L
+         }
+         if(any(by.y == 0L)) {
+             y <- cbind(Row.names = I(row.names(y)), y)
+             by.y <- by.y + 1L
+         }
+         ## create keys from 'by' columns:
+         if(l.b == 1L) {                  # (be faster)
+             bx <- x[, by.x]; if(is.factor(bx)) bx <- as.character(bx)
+             by <- y[, by.y]; if(is.factor(by)) by <- as.character(by)
+         } else {
+             ## Do these together for consistency in as.character.
+             ## Use same set of names.
+             bx <- x[, by.x, drop=FALSE]; by <- y[, by.y, drop=FALSE]
+             names(bx) <- names(by) <- paste("V", seq_len(ncol(bx)), sep="")
+             bz <- do.call("paste", c(rbind(bx, by), sep = "\r"))
+             bx <- bz[seq_len(nx)]
+             by <- bz[nx + seq_len(ny)]
+         }
+         comm <- match(bx, by, 0L)
+         if (notin) {
+             res <- x[comm == 0,]
+         } else {
+             res <- x[comm > 0,]
+         }
+     }
+     ## avoid a copy
+     ## row.names(res) <- NULL
+     attr(res, "row.names") <- .set_row_names(nrow(res))
+     res
+ }
> XnotinY <-
+     function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by,
+              notin = TRUE, incomparables = NULL,
+              ...)
+ {
+     XinY(x,y,by,by.x,by.y,notin,incomparables)
+ }
> fun.8 <- XnotinY
> fun.8(x.1,x.2)
a b
1 2 2
2 5 5
> sol.8 <- microbenchmark(fun.8(x.1,x.2))
> sol.8 <- median(sol.8$time)/to.sec
> sol.8
[1] 0.00104598

Solution 9

by Tomas T.

> fun.9 <- function(x.1,x.2,...){
+     tmp = merge(x.1, cbind(x.2, q=1:nrow(x.2)), all.x = TRUE)
+         # provided that there's no column q in both dataframes
+     tmp[is.na(tmp$q), 1:ncol(x.1)] # the result
+ }
> fun.9(x.1,x.2)
a b
1 2 2
2 5 5
> sol.9 <- microbenchmark(fun.9(x.1,x.2))
> sol.9 <- median(sol.9$time)/to.sec
> sol.9
[1] 0.002575571

Solution 10

by Gabor Grothendieck

> fun.10 <- function(x.1,x.2,...){
+     x.1[!duplicated(rbind(x.2, x.1))[-(1:nrow(x.2))],]
+ }
> fun.10(x.1,x.2)
a b
1 2 2
2 5 5
> sol.10 <- microbenchmark(fun.10(x.1,x.2))
> sol.10 <- median(sol.10$time)/to.sec
> sol.10
[1] 0.000639895

Solution 11

by Gabor Grothendieck

> fun.11 <- function(x.1,x.2,...){
+     do.call("rbind", setdiff(split(x.1, rownames(x.1)), split(x.2, rownames(x.2))))
+ }
> fun.11(x.1,x.2)
a b
1 2 2
2 5 5
> sol.11 <- microbenchmark(fun.11(x.1,x.2))
> sol.11 <- median(sol.11$time)/to.sec
> sol.11
[1] 0.002161735

Solution 12

by Gabor Grothendieck

> fun.12 <- function(x.1,x.2,...){
+     x.1p <- do.call("paste", x.1)
+     x.2p <- do.call("paste", x.2)
+     x.1[! x.1p %in% x.2p, ]
+ }
> fun.12(x.1,x.2)
a b
1 2 2
2 5 5
> sol.12 <- microbenchmark(fun.12(x.1,x.2))
> sol.12 <- median(sol.12$time)/to.sec
> sol.12
[1] 0.000208015

Solution 13

by Gabor Grothendieck

> library(sqldf)
> fun.13 <- function(x.1,x.2,...){
+     sqldf("select * from `x.1` except select * from `x.2`")
+ }
> fun.13(x.1,x.2)
a b
1 2 2
2 5 5
> sol.13 <- microbenchmark(fun.13(x.1,x.2))
> sol.13 <- median(sol.13$time)/to.sec
> sol.13
[1] 0.02112652