One goal of the {svTidy} package is to provide an interface similar to that of {dplyr} and {tidyr} on top of performant code (in terms of both speed and memory use), possibly using the {data.table} or {collapse} packages under the hood.
In this document, we compare the speed and memory use of {svTidy} with those of {dplyr} and {tidyr}. Keep in mind that most benchmarks (including the present ones) are artificial and do not necessarily reflect real use cases. Test with your own data. Also, do you really need faster or more memory-efficient code? It depends on your particular context! Also take your hardware into account (number of CPU cores and amount of RAM).
We test {svTidy} functions in both standard evaluation (SE) and non-standard evaluation (NSE) modes: the latter usually requires more computing time, but it is more convenient to write and read, and closer to the {dplyr}/{tidyr} syntax.
Preparation: using 3/4 of available cores for parallel code in {data.table} and {collapse}.
data.table::setDTthreads(percent = 75)
(.nthreads <- data.table::getDTthreads())
#> [1] 3
options(collapse_nthreads = .nthreads)
options(collapse_na.rm = FALSE)
options(collapse_mask = "all")

Small and medium data sets.
# Small one
data(mtcars)
mtcars <- as_tibble(mtcars, rownames = "model")
mtcars_dt <- as.data.table(mtcars)
# Medium one
data(babynames, package = 'babynames')
babynames <- as_tibble(babynames)
babynames_dt <- as.data.table(babynames)

Here are a couple of examples of fast and memory-efficient {svTidy} functions.
filter_()

Small data set.
# Note: collapse::qDF() = quickly convert to a data.frame, for identical results
bench::mark(
dplyr = filter(mtcars, mpg > 20) |> qDF(),
data.table = mtcars_dt[mpg > 20] |> qDF(),
svTidyNSE = filter_(mtcars, ~mpg > 20) |> qDF(),
svTidySE = filter_(mtcars, mtcars$mpg > 20) |> qDF())
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 659.4µs 706.5µs 1349. 1.36MB 8.54
#> 2 data.table 105.9µs 114.5µs 8424. 1.81MB 6.33
#> 3 svTidyNSE 55µs 59.2µs 16209. 154.44KB 11.0
#> 4 svTidySE 46.4µs 50.3µs 19140. 43.45KB 10.9

Medium data set.
bench::mark(
dplyr = filter(babynames, n > 1000) |> qDF(),
data.table = babynames_dt[n > 1000] |> qDF(),
svTidyNSE = filter_(babynames, ~n > 1000) |> qDF(),
svTidySE = filter_(babynames, babynames$n > 1000) |> qDF())
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 10.78ms 11.29ms 88.6 31.4MB 49.8
#> 2 data.table 4.89ms 7.96ms 132. 16.7MB 36.0
#> 3 svTidyNSE 4.39ms 4.51ms 211. 16.7MB 41.7
#> 4 svTidySE 4.37ms 4.56ms 205. 16.7MB 34.1

arrange_()

bench::mark(
dplyr = arrange(mtcars, cyl, desc(vs)) |> qDF(),
data.table = mtcars_dt[order(cyl, -vs)] |> qDF(),
svTidyNSE = arrange_(mtcars, ~cyl, ~ -vs) |> qDF(),
svTidySE = arrange_(mtcars, 'cyl', '-vs') |> qDF())
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 2.23ms 2.33ms 423. 539.3KB 8.82
#> 2 data.table 155.29µs 163.71µs 5890. 38.9KB 8.59
#> 3 svTidyNSE 100.53µs 106.31µs 9038. 92.9KB 8.63
#> 4 svTidySE 61.44µs 65.42µs 14616. 85.1KB 11.0

bench::mark(
dplyr = arrange(babynames, sex, desc(n)) |> qDF(),
data.table = babynames_dt[order(sex, -n)] |> qDF(),
svTidyNSE = arrange_(babynames, ~sex, ~ -n) |> qDF(),
svTidySE = arrange_(babynames, 'sex', '-n') |> qDF())
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 71.8ms 77.4ms 9.66 112.7MB 9.66
#> 2 data.table 69.4ms 71.5ms 13.6 73.4MB 3.88
#> 3 svTidyNSE 25.6ms 27.6ms 29.4 73.4MB 11.8
#> 4 svTidySE 26.6ms 34.5ms 27.4 73.4MB 9.77

Not all {svTidy} functions are currently faster or more memory-efficient than their {dplyr} or {tidyr} counterparts. Those still need refactoring to be optimized. Here are some examples.
bind_rows_()

df1 <- tibble(x = 1:2, y = letters[1:2])
df1_dt <- as.data.table(df1)
bench::mark(
dplyr = bind_rows(df1, df1) |> qDF(),
base = rbind(df1, df1) |> qDF(),
data.table = rbindlist(list(df1_dt, df1_dt)) |> qDF(),
svTidy = bind_rows_(df1, df1) |> qDF(),
svTidy2 = bind_rows_(list(df1, df1)) |> qDF())
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 89.1µs 95.6µs 10079. 96.4KB 8.74
#> 2 base 76µs 81µs 11939. 47.5KB 11.0
#> 3 data.table 115.5µs 122.4µs 7842. 121.5KB 6.32
#> 4 svTidy 627µs 669.4µs 1475. 203.9KB 8.65
#> 5 svTidy2 627.2µs 667.1µs 1482. 52.5KB 8.67

bench::mark(
dplyr = bind_rows(babynames, babynames) |> qDF(),
base = rbind(babynames, babynames) |> qDF(),
data.table = rbindlist(list(babynames_dt, babynames_dt)) |> qDF(),
svTidy = bind_rows_(babynames, babynames) |> qDF(),
svTidy2 = bind_rows_(list(babynames, babynames)) |> qDF())
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 90ms 113.35ms 7.96 132MB 4.77
#> 2 base 224.86ms 325.63ms 3.07 256MB 3.07
#> 3 data.table 52.01ms 59.34ms 14.5 132MB 5.45
#> 4 svTidy 1.66s 1.66s 0.601 368MB 1.20
#> 5 svTidy2 1.59s 1.59s 0.628 338MB 0.628

bind_cols_()

df1 <- tibble(x = 1:2, y = letters[1:2])
df2 <- tibble(z = 10:11, w = factor(5:6))
df1_dt <- as.data.table(df1)
df2_dt <- as.data.table(df2)
bench::mark(check = FALSE,
dplyr = bind_cols(df1, df2) |> qDF(),
base = cbind(df1, df2) |> qDF(),
data.table = cbind(df1_dt, df2_dt) |> qDF(),
svTidy = bind_cols_(df1, df2) |> qDF())
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 146µs 154µs 6081. 32.87KB 2.09
#> 2 base 194µs 203µs 4790. 2.19KB 2.10
#> 3 data.table 163µs 170µs 5714. 187.32KB 2.09
#> 4 svTidy 153µs 159µs 6074. 13.88KB 2.09