[This article was first published on S+/R – Yet Another Blog in Statistical Computing, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here) Want to share your content on R-bloggers? click here if you have a blog, or here if you don't. library(SparkR, lib.loc = paste(Sys.getenv("SPARK_HOME"), "/R/lib", sep = "")) sc <- sparkR.session(master = "local") df1 <- read.df("nycflights13.csv", source = "csv", header = "true", inferSchema = "true") grp1 <- groupBy(filter(df1, "month in (1, 2, 3)"), "month") sum1 <- withColumnRenamed(agg(grp1, min_dep = min(df1$dep_delay)), "month", "month1") grp2 <- groupBy(filter(df1, "month in (2, 3, 4)"), "month") sum2 <- withColumnRenamed(agg(grp2, max_dep = max(df1$dep_delay)), "month", "month2") # INNER JOIN showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all = FALSE)) showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "inner")) #+------+-------+------+-------+ #|month1|min_dep|month2|max_dep| #+------+-------+------+-------+ #| 3| -25| 3| 911| #| 2| -33| 2| 853| #+------+-------+------+-------+ # LEFT JOIN showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all.x = TRUE)) showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "left")) #+------+-------+------+-------+ #|month1|min_dep|month2|max_dep| #+------+-------+------+-------+ #| 1| -30| null| null| #| 3| -25| 3| 911| #| 2| -33| 2| 853| #+------+-------+------+-------+ # RIGHT JOIN showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all.y = TRUE)) showDF(join(sum1, sum2, sum1$month1 == sum2$month2, [...]
[Read more...]