Benchmarking

Multiplications are computationally costly

Avoid them as much as possible by using intermediate variables. Example:

# 3 multiplications:
sir_equations1 <- function(time, variables, parameters) {
  with(as.list(c(variables, parameters)), {
    incidence <- beta * I * S
    recovery  <- gamma * I
    dS <- -incidence
    dI <-  incidence - recovery
    dR <-  recovery
    list(c(dS, dI, dR))
  })
}

# 6 multiplications:
sir_equations2 <- function(time, variables, parameters) {
  with(as.list(c(variables, parameters)), {
    dS <- -beta * I * S
    dI <-  beta * I * S - gamma * I
    dR <-  gamma * I
    list(c(dS, dI, dR))
  })
}

initial_values <- c(S = 999, I = 1, R = 0)
time_values <- seq(0, 30, .1)
parameters_values <- c(beta = .004, gamma = .5)

bench::mark(
  three_multiplications = 
    ode(y = initial_values, times = time_values, func = sir_equations1, parms = parameters_values),
  six_multiplications = 
    ode(y = initial_values, times = time_values, func = sir_equations2, parms = parameters_values)
)

## # A tibble: 2 × 6
##   expression                 min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>            <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 three_multiplications    6.5ms    6.9ms      136.   694.9KB     21.3
## 2 six_multiplications     6.52ms   6.73ms      147.    96.4KB     18.7

`return()`

Function calls are computationally expensive. Do not use any that is not necessary, for example return() at the end of a function:

# without return():
sir_equations1 <- function(time, variables, parameters) {
  with(as.list(c(variables, parameters)), {
    incidence <- beta * I * S
    recovery  <- gamma * I
    dS <- -incidence
    dI <-  incidence - recovery
    dR <-  recovery
    list(c(dS, dI, dR))
  })
}

# with return():
sir_equations2 <- function(time, variables, parameters) {
  with(as.list(c(variables, parameters)), {
    incidence <- beta * I * S
    recovery  <- gamma * I
    dS <- -incidence
    dI <-  incidence - recovery
    dR <-  recovery
    return(list(c(dS, dI, dR)))
  })
}

initial_values <- c(S = 999, I = 1, R = 0)
time_values <- seq(0, 30, .1)
parameters_values <- c(beta = .004, gamma = .5)

bench::mark(
  without_return = 
    ode(y = initial_values, times = time_values, func = sir_equations1, parms = parameters_values),
  with_return = 
    ode(y = initial_values, times = time_values, func = sir_equations2, parms = parameters_values)
)

## # A tibble: 2 × 6
##   expression          min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 without_return   6.49ms   6.65ms      150.    85.5KB     18.4
## 2 with_return      6.69ms   6.92ms      144.    88.4KB     18.6

Rewriting built-in functions

Built-in functions comes with a number of internal checks that are useful for interactive use. But if you know what you’re doing, you are better off rewriting some of these funtions from scractch, without the internal checks:

x <- runif(1000)
bench::mark(
  from_scratch = sum(x) / length(x),
  built_in = mean(x)
)

## # A tibble: 2 × 6
##   expression        min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 from_scratch   2.04µs   2.13µs   447183.        0B      0  
## 2 built_in       4.35µs   4.63µs   208619.        0B     20.9

Sequence generation

If you’re generating a sequence with a step of 1, there are multiple ways to do so:

bench::mark(
  seq.int(0, 1000000),
  seq(0, 1000000),
  0:1000000
)

## # A tibble: 3 × 6
##   expression             min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>        <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 seq.int(0, 1e+06)    237ns    268ns  3498610.        0B        0
## 2 seq(0, 1e+06)       3.69µs   4.09µs   231589.        0B        0
## 3 0:1e+06              128ns    145ns  5214732.        0B        0

How does this perform at different sizes (looking at total_time):

purrr::map(
  10^(0:7),
  function(n) {
    bench::mark(
      seq.int(0, n),
      seq(0, n),
      0:n
    ) %>%
      select(expression, total_time) %>%
      mutate(across(-expression, as.double)) %>%
      mutate(expression = as.character(expression)) %>%
      mutate(n = n)
  }
) %>%
  purrr::list_rbind() %>%
  ggplot() +
  geom_line(aes(x = log10(n), y = total_time, group = expression, color = expression)) +
  scale_x_continuous("Size (log 10)", breaks = 0:7)

Sequence generation with steps different from 1

With a step lower than 1:

bench::mark(seq(1, 1000, by = .01),
            (100:100000) / 100)

## # A tibble: 2 × 6
##   expression                   min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>              <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 seq(1, 1000, by = 0.01)    465µs    560µs     1373.    1.91MB     34.1
## 2 (100:1e+05)/100            123µs    176µs     4578.    1.14MB     46.1

With a step higher than 1:

bench::mark(seq(1, 1000, by = 3),
            1 + 3 * 0:(1000 / 3))

## # A tibble: 2 × 6
##   expression                min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>           <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 seq(1, 1000, by = 3)   15.3µs  16.68µs    54796.    6.66KB     11.0
## 2 1 + 3 * 0:(1000/3)      1.1µs   2.61µs   396288.    4.01KB      0

Polynomial functions

What is a good and efficient way to create a polynomial function that gives Y from X, given known coefficients? The idea stems from trying to reproduce the temperature-dependent oviposition rate per female mosquito from a table of polynomial coefficients, from this paper.

coefs <- c(-1.51e-4, 1.02e-2, -2.12e01, 1.80, -5.40)

f1 <- function(x, coefs) {
  degrees <- length(coefs) - 1
  powers <- outer(x, seq(degrees, 0), "^")
  multiplied <- t(apply(powers, 1, \(x) x * coefs))
  apply(multiplied, 1, sum)
}

f2 <- function(x, coefs) {
  powers <- (length(coefs) - 1):0
  sapply(x, \(y) sum(coefs * y^powers))
}

bench::mark(
  f1(22, coefs),
  f2(22, coefs)
)

## # A tibble: 2 × 6
##   expression         min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 f1(22, coefs)   50.8µs   53.7µs    18090.    32.5KB     14.6
## 2 f2(22, coefs)   12.9µs   13.9µs    69310.        0B     20.8

Access dataframe/tibble column

Many ways to access column values in a dataframe/tibble (result must be a vector):

df <- tibble(a = 1:1000, b = 1:1000)

bench::mark(
  df$a,
  pull(df, a),
  df[["a"]],
)

## # A tibble: 3 × 6
##   expression         min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 "df$a"          1.38µs   1.56µs   610626.        0B     61.1
## 2 "pull(df, a)" 260.19µs 270.34µs     3659.     199KB     12.4
## 3 "df[[\"a\"]]"   4.87µs   5.22µs   183010.        0B     18.3