[This article was first published on
Alexej's blog , and kindly contributed to
R-bloggers ]. (You can report issue about the content on this page
here )
Want to share your content on R-bloggers?
click here if you have a blog, or
here if you don't.
This blog post demonstrates the usage of the R package dplyr . It turns out that dplyr is intuitive to the point where I probably won’t ever need to look back at this summary. A nice and very concise dplyr and tidyr cheat sheet is available from RStudio.
The following was compiled in rmarkdown [:page_facing_up: download .Rmd file ].
library(dplyr)
library(gapminder)
The gapminder data will be used for demonstration purposes.
class(gapminder)
## [1] "tbl_df" "tbl" "data.frame"
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghani...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987,...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438,...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460,...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.981...
dplyr verbs
sample_frac
– sample a given percentage of rows
sample_frac(gapminder, 0.5)
## # A tibble: 852 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Taiwan Asia 1977 70.590 16785196 5596.520
## 2 Puerto Rico Americas 1972 72.160 2847132 9123.042
## 3 Croatia Europe 1957 64.770 3991242 4338.232
## 4 Panama Americas 2002 74.712 2990875 7356.032
## 5 Canada Americas 1992 77.950 28523502 26342.884
## 6 Poland Europe 1982 71.320 36227381 8451.531
## 7 Puerto Rico Americas 1957 68.540 2260000 3907.156
## 8 Chile Americas 1982 70.565 11487112 5095.666
## 9 Belgium Europe 1957 69.240 8989111 9714.961
## 10 Myanmar Asia 1967 49.379 25870271 349.000
## # ... with 842 more rows
set.seed(2016)
tiny <- sample_n(gapminder, 3)
tiny
## # A tibble: 3 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Colombia Americas 1982 66.653 27764644 4397.576
## 2 Canada Americas 1967 72.130 20819767 16076.588
## 3 Sri Lanka Asia 1972 65.042 13016733 1213.396
rename(tiny, GDP = gdpPercap, population = pop)
## # A tibble: 3 × 6
## country continent year lifeExp population GDP
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Colombia Americas 1982 66.653 27764644 4397.576
## 2 Canada Americas 1967 72.130 20819767 16076.588
## 3 Sri Lanka Asia 1972 65.042 13016733 1213.396
select
– select columns from the data frame
select(tiny, starts_with("y"), pop, matches("^co.*"))
## # A tibble: 3 × 4
## year pop country continent
## <int> <int> <fctr> <fctr>
## 1 1982 27764644 Colombia Americas
## 2 1967 20819767 Canada Americas
## 3 1972 13016733 Sri Lanka Asia
filter
– select rows from the data frame, producing a subset
# filter(tiny, lifeExp > 60 & year < 1980)
# or equivalent:
filter(tiny, lifeExp > 60, year < 1980)
## # A tibble: 2 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Canada Americas 1967 72.130 20819767 16076.588
## 2 Sri Lanka Asia 1972 65.042 13016733 1213.396
slice
– select rows from data frame by index, producing a subset
slice(gapminder, 300:303)
## # A tibble: 4 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 2007 72.961 1318683096 4959.115
## 2 Colombia Americas 1952 50.643 12350771 2144.115
## 3 Colombia Americas 1957 55.118 14485993 2323.806
## 4 Colombia Americas 1962 57.863 17009885 2492.351
mutate
– add new columns that can be functions of existing columns
mutate(tiny, newVar = (lifeExp / gdpPercap), newcol = 3:1)
## # A tibble: 3 × 8
## country continent year lifeExp pop gdpPercap newVar
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Colombia Americas 1982 66.653 27764644 4397.576 0.015156760
## 2 Canada Americas 1967 72.130 20819767 16076.588 0.004486649
## 3 Sri Lanka Asia 1972 65.042 13016733 1213.396 0.053603296
## # ... with 1 more variables: newcol <int>
tiny <- mutate(tiny, newVar = (lifeExp / gdpPercap), newVar2 = newVar^2)
glimpse(tiny)
## Observations: 3
## Variables: 8
## $ country <fctr> Colombia, Canada, Sri Lanka
## $ continent <fctr> Americas, Americas, Asia
## $ year <int> 1982, 1967, 1972
## $ lifeExp <dbl> 66.653, 72.130, 65.042
## $ pop <int> 27764644, 20819767, 13016733
## $ gdpPercap <dbl> 4397.576, 16076.588, 1213.396
## $ newVar <dbl> 0.015156760, 0.004486649, 0.053603296
## $ newVar2 <dbl> 2.297274e-04, 2.013002e-05, 2.873313e-03
transmute
– add new columns that can be functions of the existing columns, and drop the existing columns
tiny <- transmute(tiny, id = 1:3, country, continent,
newVarSqrt = sqrt(newVar), pop)
tiny
## # A tibble: 3 × 5
## id country continent newVarSqrt pop
## <int> <fctr> <fctr> <dbl> <int>
## 1 1 Colombia Americas 0.12311279 27764644
## 2 2 Canada Americas 0.06698245 20819767
## 3 3 Sri Lanka Asia 0.23152386 13016733
arrange(tiny, pop)
## # A tibble: 3 × 5
## id country continent newVarSqrt pop
## <int> <fctr> <fctr> <dbl> <int>
## 1 3 Sri Lanka Asia 0.23152386 13016733
## 2 2 Canada Americas 0.06698245 20819767
## 3 1 Colombia Americas 0.12311279 27764644
arrange(gapminder, desc(year), lifeExp)
## # A tibble: 1,704 × 6
## country continent year lifeExp pop
## <fctr> <fctr> <int> <dbl> <int>
## 1 Swaziland Africa 2007 39.613 1133066
## 2 Mozambique Africa 2007 42.082 19951656
## 3 Zambia Africa 2007 42.384 11746035
## 4 Sierra Leone Africa 2007 42.568 6144562
## 5 Lesotho Africa 2007 42.592 2012649
## 6 Angola Africa 2007 42.731 12420476
## 7 Zimbabwe Africa 2007 43.487 12311143
## 8 Afghanistan Asia 2007 43.828 31889923
## 9 Central African Republic Africa 2007 44.741 4369038
## 10 Liberia Africa 2007 45.678 3193942
## # ... with 1,694 more rows, and 1 more variables: gdpPercap <dbl>
summarize
– create collapsed summaries of a data frame by applying functions to columns
summarize(gapminder, aveLife = mean(lifeExp))
## # A tibble: 1 × 1
## aveLife
## <dbl>
## 1 59.47444
distinct
– find distinct rows, for repetitive data
tiny2 <- tiny[c(1,1,2,2), ]
dim(tiny2)
## [1] 4 5
distinct(tiny2)
## # A tibble: 2 × 5
## id country continent newVarSqrt pop
## <int> <fctr> <fctr> <dbl> <int>
## 1 1 Colombia Americas 0.12311279 27764644
## 2 2 Canada Americas 0.06698245 20819767
n_distinct(tiny2)
## [1] 2
Chaining
Base-R-style
set.seed(2016)
sample_n(filter(gapminder, continent == "Asia" & lifeExp < 65), 2)
## # A tibble: 2 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 1957 50.54896 637408000 575.987
## 2 Cambodia Asia 1972 40.31700 7450606 421.624
Using pipes
set.seed(2016)
gapminder %>% filter(continent == "Asia") %>% filter(lifeExp < 65) %>% sample_n(2)
## # A tibble: 2 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 China Asia 1957 50.54896 637408000 575.987
## 2 Cambodia Asia 1972 40.31700 7450606 421.624
More verbs
group_by
– convert the data frame into a grouped data frame, where the operations are performed by group
gapminder %>% group_by(continent) %>%
summarize(aveLife = mean(lifeExp), count = n(),
countries = n_distinct(country))
## # A tibble: 5 × 4
## continent aveLife count countries
## <fctr> <dbl> <int> <int>
## 1 Africa 48.86533 624 52
## 2 Americas 64.65874 300 25
## 3 Asia 60.06490 396 33
## 4 Europe 71.90369 360 30
## 5 Oceania 74.32621 24 2
gapminder %>% group_by(continent) %>% tally
## # A tibble: 5 × 2
## continent n
## <fctr> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
Join multiple data frames
Example originally from http://stat545.com/bit001_dplyr-cheatsheet.html
superheroes <- c("name, alignment, gender, publisher",
"Magneto, bad, male, Marvel",
"Storm, good, female, Marvel",
"Mystique, bad, female, Marvel",
"Batman, good, male, DC",
"Joker, bad, male, DC",
"Catwoman, bad, female, DC",
"Hellboy, good, male, Dark Horse Comics")
superheroes <- read.csv(text = superheroes, strip.white = TRUE, as.is=TRUE)
publishers <- c("publisher, yr_founded",
" DC, 1934",
" Marvel, 1939",
" Image, 1992")
publishers <- read.csv(text = publishers, strip.white = TRUE, as.is=TRUE)
Inner vs left vs full join
inner_join(superheroes, publishers)
## Joining, by = "publisher"
## name alignment gender publisher yr_founded
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
left_join(superheroes, publishers)
## Joining, by = "publisher"
## name alignment gender publisher yr_founded
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
full_join(superheroes, publishers)
## Joining, by = "publisher"
## name alignment gender publisher yr_founded
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## 8 <NA> <NA> <NA> Image 1992