Performance of svTidy Functions

One goal of the {svTidy} package is to provide an interface similar to {dplyr} and {tidyr} on top of performant code (both in terms of speed and memory use), possibly using the {data.table} or {collapse} packages under the hood.

In this document, we compare the speed and memory use of {svTidy} with those of {dplyr} and {tidyr}. Keep in mind that most benchmarks (including the present ones) are artificial and do not necessarily reflect real use cases. Test with your own data. Also, do you really need faster or more memory-efficient code? It depends on your particular context! Take your hardware into account too (number of CPU cores and amount of RAM).

We test {svTidy} functions both in standard evaluation (SE) and non-standard evaluation (NSE) modes: the latter usually requires more computing time, but it is more convenient to write and read, and it is closer to the {dplyr}/{tidyr} syntax.

Functions that are considered optimized

Preparation: using 3/4 of available cores for parallel code in {data.table} and {collapse}.

# Let {data.table} use 75% of the available cores, then read back the number
# of threads actually granted so that {collapse} can be given the same value.
data.table::setDTthreads(percent = 75)
.nthreads <- data.table::getDTthreads()
print(.nthreads)
#> [1] 3
# Set the three {collapse} options in a single call: thread count, na.rm
# default, and function masking.
options(
  collapse_nthreads = .nthreads,
  collapse_na.rm = FALSE,
  collapse_mask = "all"
)

Small and medium data sets.

# Small data set: mtcars as a tibble (row names kept in a "model" column),
# plus a data.table version of that tibble.
data("mtcars")
mtcars <- mtcars |> as_tibble(rownames = "model")
mtcars_dt <- mtcars |> as.data.table()

# Medium data set: babynames, as a tibble and as a data.table.
data("babynames", package = "babynames")
babynames <- babynames |> as_tibble()
babynames_dt <- babynames |> as.data.table()

Here are a couple of examples of fast and memory-efficient {svTidy} functions.

filter_()

Small data set.

# Benchmark: keep the rows of mtcars where mpg > 20, written four ways.
# - dplyr:      filter() on the tibble
# - data.table: [i] subsetting on the data.table copy
# - svTidyNSE:  filter_() with a formula (non-standard evaluation)
# - svTidySE:   filter_() with a precomputed logical vector (standard evaluation)
# Note: collapse::qDF() = quickly convert to a data.frame, for identical results
bench::mark(
  dplyr      = filter(mtcars, mpg > 20) |> qDF(),
  data.table = mtcars_dt[mpg > 20] |> qDF(), 
  svTidyNSE  = filter_(mtcars, ~mpg > 20) |> qDF(),
  svTidySE   = filter_(mtcars, mtcars$mpg > 20) |> qDF())
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr       659.4µs  706.5µs     1349.    1.36MB     8.54
#> 2 data.table  105.9µs  114.5µs     8424.    1.81MB     6.33
#> 3 svTidyNSE      55µs   59.2µs    16209.  154.44KB    11.0 
#> 4 svTidySE     46.4µs   50.3µs    19140.   43.45KB    10.9

Medium data set.

# Benchmark: keep the rows of babynames where n > 1000, using the same four
# variants as for mtcars ({dplyr}, {data.table}, filter_() NSE and SE).
# Every result goes through qDF() so that all variants return a data.frame.
bench::mark(
  dplyr      = filter(babynames, n > 1000) |> qDF(),
  data.table = babynames_dt[n > 1000] |> qDF(),
  svTidyNSE  = filter_(babynames, ~n > 1000) |> qDF(),
  svTidySE   = filter_(babynames, babynames$n > 1000) |> qDF())
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr       10.78ms  11.29ms      88.6    31.4MB     49.8
#> 2 data.table   4.89ms   7.96ms     132.     16.7MB     36.0
#> 3 svTidyNSE    4.39ms   4.51ms     211.     16.7MB     41.7
#> 4 svTidySE     4.37ms   4.56ms     205.     16.7MB     34.1

arrange_()

# Small data set (mtcars): sort by cyl ascending, then by vs descending.
# - svTidyNSE passes the sort keys as formulas (~cyl, ~ -vs)
# - svTidySE passes them as character strings ('cyl', '-vs')
# Every result goes through qDF() so that all variants return a data.frame.
bench::mark(
  dplyr      = arrange(mtcars, cyl, desc(vs)) |> qDF(),
  data.table = mtcars_dt[order(cyl, -vs)] |> qDF(),
  svTidyNSE  = arrange_(mtcars, ~cyl, ~ -vs) |> qDF(),
  svTidySE   = arrange_(mtcars, 'cyl', '-vs') |> qDF())
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        2.23ms   2.33ms      423.   539.3KB     8.82
#> 2 data.table 155.29µs 163.71µs     5890.    38.9KB     8.59
#> 3 svTidyNSE  100.53µs 106.31µs     9038.    92.9KB     8.63
#> 4 svTidySE    61.44µs  65.42µs    14616.    85.1KB    11.0
# Medium data set (babynames): sort by sex ascending, then by n descending,
# using the same four variants as for mtcars.
# Every result goes through qDF() so that all variants return a data.frame.
bench::mark(
  dplyr      = arrange(babynames, sex, desc(n)) |> qDF(),
  data.table = babynames_dt[order(sex, -n)] |> qDF(),
  svTidyNSE  = arrange_(babynames, ~sex, ~ -n) |> qDF(),
  svTidySE   = arrange_(babynames, 'sex', '-n') |> qDF())
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        71.8ms   77.4ms      9.66   112.7MB     9.66
#> 2 data.table   69.4ms   71.5ms     13.6     73.4MB     3.88
#> 3 svTidyNSE    25.6ms   27.6ms     29.4     73.4MB    11.8 
#> 4 svTidySE     26.6ms   34.5ms     27.4     73.4MB     9.77

Functions that could still be optimized

Not all {svTidy} functions are currently faster or more memory efficient than their {dplyr} or {tidyr} counterparts. Those still need refactoring to be optimized. Here are some examples.

bind_rows_()

# Tiny test table: two rows, an integer column x and a character column y,
# available both as a tibble (df1) and as a data.table (df1_dt).
df1 <- tibble(x = 1:2, y = letters[1:2])
df1_dt <- as.data.table(df1)

# Benchmark: stack the table on top of itself.
# - svTidy passes the two data frames as separate arguments to bind_rows_()
# - svTidy2 passes them as a single list
# Every result goes through qDF() so that all variants return a data.frame.
bench::mark(
  dplyr      = bind_rows(df1, df1) |> qDF(),
  base       = rbind(df1, df1) |> qDF(),
  data.table = rbindlist(list(df1_dt, df1_dt)) |> qDF(),
  svTidy     = bind_rows_(df1, df1) |> qDF(),
  svTidy2    = bind_rows_(list(df1, df1)) |> qDF())
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        89.1µs   95.6µs    10079.    96.4KB     8.74
#> 2 base           76µs     81µs    11939.    47.5KB    11.0 
#> 3 data.table  115.5µs  122.4µs     7842.   121.5KB     6.32
#> 4 svTidy        627µs  669.4µs     1475.   203.9KB     8.65
#> 5 svTidy2     627.2µs  667.1µs     1482.    52.5KB     8.67
# Medium data set: stack babynames on top of itself, using the same five
# variants as for the tiny table (svTidy = separate arguments, svTidy2 = list).
# Every result goes through qDF() so that all variants return a data.frame.
bench::mark(
  dplyr      = bind_rows(babynames, babynames) |> qDF(),
  base       = rbind(babynames, babynames) |> qDF(),
  data.table = rbindlist(list(babynames_dt, babynames_dt)) |> qDF(),
  svTidy     = bind_rows_(babynames, babynames) |> qDF(),
  svTidy2    = bind_rows_(list(babynames, babynames)) |> qDF())
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr          90ms 113.35ms     7.96      132MB    4.77 
#> 2 base       224.86ms 325.63ms     3.07      256MB    3.07 
#> 3 data.table  52.01ms  59.34ms    14.5       132MB    5.45 
#> 4 svTidy        1.66s    1.66s     0.601     368MB    1.20 
#> 5 svTidy2       1.59s    1.59s     0.628     338MB    0.628

bind_cols_()

# Two tiny two-row tables with distinct column names:
# - df1: integer column x and character column y
# - df2: integer column z and factor column w
# Each one is also converted to a data.table (df1_dt, df2_dt).
df1 <- tibble(x = 1:2, y = letters[1:2])
df2 <- tibble(z = 10:11, w = factor(5:6))
df1_dt <- as.data.table(df1)
df2_dt <- as.data.table(df2)

# Benchmark: bind the two tables side by side.
# check = FALSE is passed to bench::mark() here, unlike in the other
# benchmarks of this document.
# NOTE(review): presumably the variants do not all return identical objects
# after qDF(), so the result comparison is switched off -- confirm.
bench::mark(check = FALSE,
  dplyr      = bind_cols(df1, df2) |> qDF(),
  base       = cbind(df1, df2) |> qDF(),
  data.table = cbind(df1_dt, df2_dt) |> qDF(),
  svTidy     = bind_cols_(df1, df2) |> qDF())
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr         146µs    154µs     6081.   32.87KB     2.09
#> 2 base          194µs    203µs     4790.    2.19KB     2.10
#> 3 data.table    163µs    170µs     5714.  187.32KB     2.09
#> 4 svTidy        153µs    159µs     6074.   13.88KB     2.09