Performance and benchmarks

Model setup and hardware information:

using MKL, LinearSolve
using SymBoltz
using OrdinaryDiffEqRosenbrock, OrdinaryDiffEqSDIRK, OrdinaryDiffEqBDF
using Base.Threads, BenchmarkTools, Plots, BenchmarkPlots, StatsPlots
M = ΛCDM()
pars = parameters_Planck18(M)
prob = CosmologyProblem(M, pars; jac = true, sparse = true)

using Dates, InteractiveUtils, LinearAlgebra
LinearAlgebra.BLAS.set_num_threads(1)
println("Current time: ", now(), "\n")
println(sprint(InteractiveUtils.versioninfo)) # show computer information
println(LinearAlgebra.BLAS.get_config())
println("BLAS threads: ", LinearAlgebra.BLAS.get_num_threads())
Current time: 2025-12-20T16:28:38.191

Julia Version 1.12.3
Commit 966d0af0fdf (2025-12-15 11:20 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 4 × AMD EPYC 7763 64-Core Processor
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, znver3)
  GC: Built with stock GC
Threads: 4 default, 1 interactive, 4 GC (on 4 virtual cores)
Environment:
  JULIA_DEBUG = Documenter

LBTConfig([ILP64] libmkl_rt.so, [LP64] libmkl_rt.so)
BLAS threads: 1

Background: ODE solver

This plot shows the time to solve the background using different (implicit) Rosenbrock methods with a fixed tolerance.

bench = BenchmarkGroup()
bgalgs = [Rodas4(), Rodas5(), Rodas4P(), Rodas5P()]
for alg in bgalgs
    bgopts = (alg = alg, abstol = 1e-9, reltol = 1e-9)
    bench[nameof(typeof(alg))] = @benchmarkable $solve($prob; bgopts = $bgopts)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))
Example block output

Background: precision-work diagram

This plot compares the time to solve the background vs. accuracy of the solution using different ODE solvers and tolerances. Every solution is compared to a reference solution with very small tolerance. The points on each curve correspond to a sequence of tolerances.

# following e.g. https://github.com/SciML/ModelingToolkit.jl/issues/2971#issuecomment-2310016590
using DiffEqDevTools

refalg = Rodas4P()
bgsol = solve(prob.bg, refalg; abstol = 1e-12, reltol = 1e-12) # reference solution (results are similar compared to Rodas4/4P/5P/FBDF)

abstols = 1 ./ 10 .^ (7:11)
reltols = 1 ./ 10 .^ (7:11)
bgalgs = [Rodas4(), Rodas5(), Rodas4P(), Rodas5P(), FBDF(), QNDF()] # FBDF/QNDF unstable for some tolerances
setups = [Dict(:alg => alg) for alg in bgalgs]
wp = WorkPrecisionSet(prob.bg, abstols, reltols, setups; appxsol = bgsol, save_everystep = false, error_estimate = :l2)
plot(wp; title = "Reference: $(SymBoltz.algname(refalg))", size = (800, 400), margin = 5*Plots.mm)
Example block output

Note that the FBDF and QNDF methods are unstable for several tolerances.

Perturbations: parallelization

SymBoltz parallelizes integration of different perturbation $k$-modes with multithreading by default. Make sure you run Julia with multiple threads. This is a standard technique in Boltzmann solvers, as linear perturbation modes are mathematically independent. It leads to a performance improvement depending on the number of threads available. It can be disabled, for example if your application permits parallelization at a higher level.

bench = BenchmarkGroup()
ks = 10 .^ range(-1, 4, length = 100)
for thread in [true, false]
    label = thread ? "$(nthreads()) threads" : "1 thread"
    bench[label] = @benchmarkable $solve($prob, $ks; thread = $thread)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))
Example block output

Perturbations: ODE solver

This plot shows the time to solve several perturbation $k$-modes using different implicit ODE solvers with fixed tolerance.

# Rodas methods very slow with massive neutrinos included (they scale worse with system size)
bench = BenchmarkGroup()
ptalgs = [algtype(linsolve = KLUFactorization()) for algtype in [TRBDF2, KenCarp4, KenCarp47, Kvaerno5, QNDF, FBDF, Rodas5P]]
for alg in ptalgs
    ptopts = (alg = alg, abstol = 1e-7, reltol = 1e-7)
    bench[nameof(typeof(alg))] = @benchmarkable $solve($prob, $ks; ptopts = $ptopts)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))
Example block output

Perturbations: precision-work diagram

This plot compares the time to solve a perturbation $k$-mode vs. accuracy of the solution using different ODE solvers and tolerances. Each subplot corresponds to a different $k$-mode. Every solution is compared to a reference solution with very small tolerance. The points on each curve correspond to a sequence of tolerances.

ptprob0, ptprobgen = SymBoltz.setuppt(prob.pt, bgsol)
setups = [Dict(:alg => alg) for alg in ptalgs]
refalg = FBDF()
abstols = 1 ./ 10 .^ (5:9)
reltols = 1 ./ 10 .^ (5:9)

function plot_precision_work_perturbations(k; numruns = 8, print_names = true, kwargs...)
    ptprob = ptprobgen(ptprob0, k)
    ptsol = solve(ptprob, refalg; reltol = 1e-12, abstol = 1e-12) # reference solution (results are similar compared to QNDF/FBDF/Rodas4/4P/5P/, somewhat different with KenCarp4/Kvaerno5; use Rodas4P which is also used in CLASS comparison)
    wp = WorkPrecisionSet(ptprob, abstols, reltols, setups; appxsol = ptsol, save_everystep = false, error_estimate = :l2, numruns, print_names, kwargs...)
    return plot(wp; title = "Reference: $(SymBoltz.algname(refalg)), k = $k H₀/c", left_margin = 15*Plots.mm, bottom_margin = 5*Plots.mm)
end

pk1 = plot_precision_work_perturbations(1e1)
Example block output
pk2 = plot_precision_work_perturbations(1e2)
Example block output
pk3 = plot_precision_work_perturbations(1e3)
Example block output
pk4 = plot_precision_work_perturbations(1e4)
Example block output

Perturbations: time per mode

This plot shows the time spent solving individual perturbation $k$-modes using different ODE solvers with fixed tolerance.

ptprob0, ptprobgen = SymBoltz.setuppt(prob.pt, bgsol)
solvemode(k, ptalg) = solve(ptprobgen(ptprob0, k); alg = ptalg, reltol = 1e-7, abstol = 1e-7)

ks = 10 .^ range(-2, 4, length = 50)
times = [[minimum(@elapsed solvemode(k, ptalg) for i in 1:3) for k in ks] for ptalg in ptalgs]

plot(
    log10.(ks), map(ts -> log10.(ts), times); marker = :auto, markersize = 2,
    xlabel = "lg(k)", ylabel = "lg(time / s)", xticks = range(log10(ks[begin]), log10(ks[end]), step=1),
    label = permutedims(SymBoltz.algname.(ptalgs)), legend_position = :topleft
)
Example block output

Perturbations: Jacobian method

The Jacobian of the perturbation ODEs can be computed in three ways:

  1. explicitly from a symbolically generated function,
  2. numerically using forward-mode dual numbers, or
  3. numerically using finite differences.

This plot shows the time to solve several perturbation $k$-modes for each such method. In all cases, the Jacobian is made sparse from the analytical sparsity pattern.

bench = BenchmarkGroup()
ks = 10 .^ range(-2, 4, length = 100)
prob_jac = prob # CosmologyProblem(M, pars; jac = true, sparse = true)
prob_nojac = CosmologyProblem(M, pars; jac = false, sparse = true)

bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(),),)
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(),), save_everystep = false) # generate function for J symbolically
bench["symbolic"] = @benchmarkable $solve($prob_jac, $ks; bgopts = $bgopts, ptopts = $ptopts)

bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(), autodiff = true),)
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(), autodiff = true), save_everystep = false) # compute J with forward-mode AD
bench["forward diff"] = @benchmarkable $solve($prob_nojac, $ks; bgopts = $bgopts, ptopts = $ptopts)

bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(), autodiff = true),) # fails with finite diff background J
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(), autodiff = false), save_everystep = false) # compute J with finite differences
bench["finite diff"] = @benchmarkable $solve($prob_nojac, $ks; bgopts = $bgopts, ptopts = $ptopts)

results = run(bench; verbose = true)
plot(results; size = (800, 400))
Example block output

Perturbations: linear system solver

At every time step, the implicit perturbation ODE solver solves a system of (nonlinear) equations with Newton's method. In turn, Newton's method involves iteratively solving several linear systems $Ax = b$, where $A$ involves the ODE Jacobian matrix $J$. This can be a bottleneck, so it is very important to use a linear solver that does this as fast as possible!

Note that the optimal linear solver depends on both the model and your hardware. See this tutorial on accelerating linear solves. Also run Julia with the optimal BLAS backend for your platform, such as MKL.jl for Intel or AppleAccelerate.jl for Macs, instead of Julia's default OpenBLAS backend.

In particular, for large models the ODE Jacobian can have lots of zeros. Here is an example for a model with many perturbation equations.

lmaxs = [4, 8, 16, 32]
Ms = [ΛCDM(; lmax) for lmax in lmaxs]

probs_dense = [CosmologyProblem(M, pars; ptopts = (jac = true, sparse = false)) for M in Ms]
probs_sparse = [CosmologyProblem(M, pars; ptopts = (jac = true, sparse = true)) for M in Ms]

probs_sparse[end].pt.f.jac_prototype # example of sparse Jacobian
236×236 SparseArrays.SparseMatrixCSC{Float64, Int64} with 871 stored entries:
⎡⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎤
⎢⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⣿⢗⡀⠀⠀⠀⢠⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⡤⡀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠒⠀⠀⠀⠀⠈⠛⣤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠐⠂⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠘⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⠀⠀⠀⠀⠀⢀⠀⠀⠀⠀⠀⣀⠀⠀⠈⠻⣦⡀⢀⎥
⎣⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⠢⠀⠀⠀⠀⢸⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⣿⎦

Sparse matrix methods are therefore very important to speed up the solution of large perturbation systems. This plot compares the time to solve several perturbation $k$-modes with different dense and sparse linear matrix solvers.

ks = 10 .^ range(-2, 4, length=100)
ptopts_dense1 = (alg = KenCarp4(linsolve = LUFactorization()), save_everystep = false)
ptopts_dense2 = (alg = KenCarp4(linsolve = RFLUFactorization()), save_everystep = false)
ptopts_sparse1 = (alg = KenCarp4(linsolve = KLUFactorization()), save_everystep = false)
ptopts_sparse2 = (alg = KenCarp4(linsolve = UMFPACKFactorization()), save_everystep = false)
ts_dense1 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_dense1) for i in 1:3) for prob in probs_dense]
ts_dense2 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_dense2) for i in 1:3) for prob in probs_dense]
ts_sparse1 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_sparse1) for i in 1:3) for prob in probs_sparse]
ts_sparse2 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_sparse2) for i in 1:3) for prob in probs_sparse]

p1 = plot(ylabel = "time / s", xticks = (lmaxs, ""), ylims = (0.0, ceil(max(maximum(ts_dense1), maximum(ts_dense2)))))
marker = :circle
plot!(p1, lmaxs, ts_sparse1; label = "sparse $(nameof(typeof(ptopts_sparse1.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_sparse2; label = "sparse $(nameof(typeof(ptopts_sparse2.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_dense1; label = "dense $(nameof(typeof(ptopts_dense1.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_dense2; label = "dense $(nameof(typeof(ptopts_dense2.alg.linsolve))), $(length(ks))×k", marker)
text(prob::CosmologyProblem) = "$(length(prob.pt.u0)) eqs,\n$(round(SymBoltz.sparsity_fraction(prob.pt)*100, digits=1)) %\nsparse"
annotate!(p1, lmaxs, zeros(length(lmaxs)), [(text(prob), 5, :top) for prob in probs_sparse])

speedups1 = [ts_dense2[i]/ts_sparse1[i] for i in eachindex(lmaxs)]
speedups2 = [ts_dense2[i]/ts_sparse2[i] for i in eachindex(lmaxs)]
speedups3 = [ts_dense2[i]/ts_dense1[i] for i in eachindex(lmaxs)]
speedups4 = [ts_dense2[i]/ts_dense2[i] for i in eachindex(lmaxs)]
ymax = Int(ceil(maximum(maximum.([speedups1, speedups2, speedups3, speedups4]))))
ylims = (0, ymax)
yticks = 0:1:ymax
yticks = (collect(yticks), collect("$y×" for y in yticks))
p2 = plot(; xlabel = "ℓmax", ylabel = "speedup", xticks = lmaxs, yticks, ylims, marker)
plot!(p2, lmaxs, speedups1; marker, label = nothing)
plot!(p2, lmaxs, speedups2; marker, label = nothing)
plot!(p2, lmaxs, speedups3; marker, label = nothing)
plot!(p2, lmaxs, speedups4; marker, label = nothing)

plot(p1, p2; size = (800, 600), layout = grid(2, 1, heights=(3//4, 1//4)))
Example block output

Except for models with a very small perturbation system, it is a good idea to generate the sparse Jacobian and use the sparse KLUFactorization linear solver.