Load Packages and Data
# load dplyr package 
suppressMessages(library(dplyr))

# create example dataframe
cars <- cbind(car = rownames(mtcars), mtcars)
rownames(cars) <- NULL


dplyr verbs
# filter - subset rows of a data frame / filter(df, how to subset)
filter(cars, mpg > 25)
##              car  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1       Fiat 128 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 2    Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 3 Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 4      Fiat X1-9 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 5  Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 6   Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
filter(cars,  mpg > 25 & hp > 75)
##             car  mpg cyl  disp  hp drat    wt qsec vs am gear carb
## 1 Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.7  0  1    5    2
## 2  Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
# slice - subset rows of a data frame by position / slice(df, rows to keep)
slice(cars, 1:5)
##                 car  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
slice(cars, c(1:3, 11:13))
##             car  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1     Mazda RX4 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2 Mazda RX4 Wag 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3    Datsun 710 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 4     Merc 280C 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 5    Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 6    Merc 450SL 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
# arrange - order rows of a data frame / slice(df, column names to order by)
head(arrange(cars, mpg))
##                   car  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1  Cadillac Fleetwood 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4
## 2 Lincoln Continental 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4
## 3          Camaro Z28 13.3   8  350 245 3.73 3.840 15.41  0  0    3    4
## 4          Duster 360 14.3   8  360 245 3.21 3.570 15.84  0  0    3    4
## 5   Chrysler Imperial 14.7   8  440 230 3.23 5.345 17.42  0  0    3    4
## 6       Maserati Bora 15.0   8  301 335 3.54 3.570 14.60  0  1    5    8
head(arrange(cars, desc(mpg)))
##              car  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 2       Fiat 128 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 3    Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 4   Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 5      Fiat X1-9 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 6  Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
head(arrange(cars, desc(cyl), desc(mpg)))
##                 car  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  Pontiac Firebird 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 2 Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 3        Merc 450SL 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 4        Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 5    Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 6  Dodge Challenger 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
# select - subset columns of a data frame / select(df, names of columns to keep)
head(select(cars, car, mpg))
##                 car  mpg
## 1         Mazda RX4 21.0
## 2     Mazda RX4 Wag 21.0
## 3        Datsun 710 22.8
## 4    Hornet 4 Drive 21.4
## 5 Hornet Sportabout 18.7
## 6           Valiant 18.1
head(select(cars, car:hp))
##                 car  mpg cyl disp  hp
## 1         Mazda RX4 21.0   6  160 110
## 2     Mazda RX4 Wag 21.0   6  160 110
## 3        Datsun 710 22.8   4  108  93
## 4    Hornet 4 Drive 21.4   6  258 110
## 5 Hornet Sportabout 18.7   8  360 175
## 6           Valiant 18.1   6  225 105
head(select(cars, -(car:hp)))
##   drat    wt  qsec vs am gear carb
## 1 3.90 2.620 16.46  0  1    4    4
## 2 3.90 2.875 17.02  0  1    4    4
## 3 3.85 2.320 18.61  1  1    4    1
## 4 3.08 3.215 19.44  1  0    3    1
## 5 3.15 3.440 17.02  0  0    3    2
## 6 2.76 3.460 20.22  1  0    3    1
# select and other dplyr verbs work with starts_with(), ends_with(), matches() and contains()
head(select(cars, starts_with('c')))
##                 car cyl carb
## 1         Mazda RX4   6    4
## 2     Mazda RX4 Wag   6    4
## 3        Datsun 710   4    1
## 4    Hornet 4 Drive   6    1
## 5 Hornet Sportabout   8    2
## 6           Valiant   6    1
# select is often used with distinct - returns table of all unique values 
distinct(select(cars, vs, cyl))
##   vs cyl
## 1  0   6
## 2  1   4
## 3  1   6
## 4  0   8
## 5  0   4
# rename - rename columns of a data frame / rename(df, new name = old name)
head(rename(cars, automobile = car))
##          automobile  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6           Valiant 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# mutate - create new columns / mutate(df, new column name = formula for new column)
head(mutate(cars, hp_to_wt = hp/wt))
##                 car  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6           Valiant 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
##   hp_to_wt
## 1 41.98473
## 2 38.26087
## 3 40.08621
## 4 34.21462
## 5 50.87209
## 6 30.34682


Chaining Syntax
# count number of cars with each number of cylinders and put in descending order 
# n() - counts number of rows in a group
cars %>%
  group_by(cyl) %>%
  summarise(cyl_count=n()) %>%
  arrange(desc(cyl_count))
## Source: local data frame [3 x 2]
## 
##   cyl cyl_count
## 1   8        14
## 2   4        11
## 3   6         7
# calculate mean mpg by number of cylinders 
cars %>%
  group_by(cyl) %>%
  summarise(mean_mpg = mean(mpg, na.rm = TRUE))
## Source: local data frame [3 x 2]
## 
##   cyl mean_mpg
## 1   4 26.66364
## 2   6 19.74286
## 3   8 15.10000
# calculate mean mpg and wt by number of cylinders 
# sumarise_each - applies the same function to multiple columns 
cars %>%
  group_by(cyl) %>%
  summarise_each(funs(mean(., na.rm = TRUE)), mpg, wt)
## Source: local data frame [3 x 3]
## 
##   cyl      mpg       wt
## 1   4 26.66364 2.285727
## 2   6 19.74286 3.117143
## 3   8 15.10000 3.999214
# calculate mean, min, max and sd of mpg and wt rates by number of cyl 
cars %>%
  group_by(cyl) %>%
  summarise_each(funs(mean(., na.rm = TRUE), 
                      min(., na.rm = TRUE), 
                      max(., na.rm = TRUE), 
                      sd(., na.rm = TRUE)), 
                      mpg, wt)
## Source: local data frame [3 x 9]
## 
##   cyl mpg_mean  wt_mean mpg_min wt_min mpg_max wt_max   mpg_sd     wt_sd
## 1   4 26.66364 2.285727    21.4  1.513    33.9  3.190 4.509828 0.5695637
## 2   6 19.74286 3.117143    17.8  2.620    21.4  3.460 1.453567 0.3563455
## 3   8 15.10000 3.999214    10.4  3.170    19.2  5.424 2.560048 0.7594047


Sampling
# sample 5 rows 
cars %>% sample_n(5)
##                  car  mpg cyl  disp  hp drat   wt  qsec vs am gear carb
## 31     Maserati Bora 15.0   8 301.0 335 3.54 3.57 14.60  0  1    5    8
## 5  Hornet Sportabout 18.7   8 360.0 175 3.15 3.44 17.02  0  0    3    2
## 10          Merc 280 19.2   6 167.6 123 3.92 3.44 18.30  1  0    4    4
## 6            Valiant 18.1   6 225.0 105 2.76 3.46 20.22  1  0    3    1
## 18          Fiat 128 32.4   4  78.7  66 4.08 2.20 19.47  1  1    4    1
# sample 10% of rows  
cars %>% 
  sample_frac(.1, replace = FALSE)
##                 car  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2