Performance and benchmarks
Model setup and hardware information:
using MKL, LinearSolve
using SymBoltz
using OrdinaryDiffEqRosenbrock, OrdinaryDiffEqSDIRK, OrdinaryDiffEqBDF
using Base.Threads, BenchmarkTools, Plots, BenchmarkPlots, StatsPlots
M = ΛCDM()
pars = parameters_Planck18(M)
prob = CosmologyProblem(M, pars; jac = true, sparse = true)
using Dates, InteractiveUtils, LinearAlgebra
LinearAlgebra.BLAS.set_num_threads(1)
println("Current time: ", now(), "\n")
println(sprint(InteractiveUtils.versioninfo)) # show computer information
println(LinearAlgebra.BLAS.get_config())
println("BLAS threads: ", LinearAlgebra.BLAS.get_num_threads())Current time: 2025-12-20T16:28:38.191
Julia Version 1.12.3
Commit 966d0af0fdf (2025-12-15 11:20 UTC)
Build Info:
Official https://julialang.org release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 4 × AMD EPYC 7763 64-Core Processor
WORD_SIZE: 64
LLVM: libLLVM-18.1.7 (ORCJIT, znver3)
GC: Built with stock GC
Threads: 4 default, 1 interactive, 4 GC (on 4 virtual cores)
Environment:
JULIA_DEBUG = Documenter
LBTConfig([ILP64] libmkl_rt.so, [LP64] libmkl_rt.so)
BLAS threads: 1Background: ODE solver
This plot shows the time to solve the background using different (implicit) Rosenbrock methods with a fixed tolerance.
bench = BenchmarkGroup()
bgalgs = [Rodas4(), Rodas5(), Rodas4P(), Rodas5P()]
for alg in bgalgs
bgopts = (alg = alg, abstol = 1e-9, reltol = 1e-9)
bench[nameof(typeof(alg))] = @benchmarkable $solve($prob; bgopts = $bgopts)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))Background: precision-work diagram
This plot compares the time to solve the background vs. accuracy of the solution using different ODE solvers and tolerances. Every solution is compared to a reference solution with very small tolerance. The points on each curve correspond to a sequence of tolerances.
# following e.g. https://github.com/SciML/ModelingToolkit.jl/issues/2971#issuecomment-2310016590
using DiffEqDevTools
refalg = Rodas4P()
bgsol = solve(prob.bg, refalg; abstol = 1e-12, reltol = 1e-12) # reference solution (results are similar compared to Rodas4/4P/5P/FBDF)
abstols = 1 ./ 10 .^ (7:11)
reltols = 1 ./ 10 .^ (7:11)
bgalgs = [Rodas4(), Rodas5(), Rodas4P(), Rodas5P(), FBDF(), QNDF()] # FBDF/QNDF unstable for some tolerances
setups = [Dict(:alg => alg) for alg in bgalgs]
wp = WorkPrecisionSet(prob.bg, abstols, reltols, setups; appxsol = bgsol, save_everystep = false, error_estimate = :l2)
plot(wp; title = "Reference: $(SymBoltz.algname(refalg))", size = (800, 400), margin = 5*Plots.mm)Note that the FBDF and QNDF methods are unstable for several tolerances.
Perturbations: parallelization
SymBoltz parallelizes integration of different perturbation $k$-modes with multithreading by default. Make sure you run Julia with multiple threads. This is a standard technique in Boltzmann solvers, as linear perturbation modes are mathematically independent. It leads to a performance improvement depending on the number of threads available. It can be disabled, for example if your application permits parallelization at a higher level.
bench = BenchmarkGroup()
ks = 10 .^ range(-1, 4, length = 100)
for thread in [true, false]
label = thread ? "$(nthreads()) threads" : "1 thread"
bench[label] = @benchmarkable $solve($prob, $ks; thread = $thread)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))Perturbations: ODE solver
This plot shows the time to solve several perturbation $k$-modes using different implicit ODE solvers with fixed tolerance.
# Rodas methods very slow with massive neutrinos included (they scale worse with system size)
bench = BenchmarkGroup()
ptalgs = [algtype(linsolve = KLUFactorization()) for algtype in [TRBDF2, KenCarp4, KenCarp47, Kvaerno5, QNDF, FBDF, Rodas5P]]
for alg in ptalgs
ptopts = (alg = alg, abstol = 1e-7, reltol = 1e-7)
bench[nameof(typeof(alg))] = @benchmarkable $solve($prob, $ks; ptopts = $ptopts)
end
results = run(bench; verbose = true)
plot(results; size = (800, 400))Perturbations: precision-work diagram
This plot compares the time to solve a perturbation $k$-mode vs. accuracy of the solution using different ODE solvers and tolerances. Each subplot corresponds to a different $k$-mode. Every solution is compared to a reference solution with very small tolerance. The points on each curve correspond to a sequence of tolerances.
ptprob0, ptprobgen = SymBoltz.setuppt(prob.pt, bgsol)
setups = [Dict(:alg => alg) for alg in ptalgs]
refalg = FBDF()
abstols = 1 ./ 10 .^ (5:9)
reltols = 1 ./ 10 .^ (5:9)
function plot_precision_work_perturbations(k; numruns = 8, print_names = true, kwargs...)
ptprob = ptprobgen(ptprob0, k)
ptsol = solve(ptprob, refalg; reltol = 1e-12, abstol = 1e-12) # reference solution (results are similar compared to QNDF/FBDF/Rodas4/4P/5P/, somewhat different with KenCarp4/Kvaerno5; use Rodas4P which is also used in CLASS comparison)
wp = WorkPrecisionSet(ptprob, abstols, reltols, setups; appxsol = ptsol, save_everystep = false, error_estimate = :l2, numruns, print_names, kwargs...)
return plot(wp; title = "Reference: $(SymBoltz.algname(refalg)), k = $k H₀/c", left_margin = 15*Plots.mm, bottom_margin = 5*Plots.mm)
end
pk1 = plot_precision_work_perturbations(1e1)pk2 = plot_precision_work_perturbations(1e2)pk3 = plot_precision_work_perturbations(1e3)pk4 = plot_precision_work_perturbations(1e4)Perturbations: time per mode
This plot shows the time spent solving individual perturbation $k$-modes using different ODE solvers with fixed tolerance.
ptprob0, ptprobgen = SymBoltz.setuppt(prob.pt, bgsol)
solvemode(k, ptalg) = solve(ptprobgen(ptprob0, k); alg = ptalg, reltol = 1e-7, abstol = 1e-7)
ks = 10 .^ range(-2, 4, length = 50)
times = [[minimum(@elapsed solvemode(k, ptalg) for i in 1:3) for k in ks] for ptalg in ptalgs]
plot(
log10.(ks), map(ts -> log10.(ts), times); marker = :auto, markersize = 2,
xlabel = "lg(k)", ylabel = "lg(time / s)", xticks = range(log10(ks[begin]), log10(ks[end]), step=1),
label = permutedims(SymBoltz.algname.(ptalgs)), legend_position = :topleft
)Perturbations: Jacobian method
The Jacobian of the perturbation ODEs can be computed in three ways:
- explicitly from a symbolically generated function,
- numerically using forward-mode dual numbers, or
- numerically using finite differences.
This plot shows the time to solve several perturbation $k$-modes for each such method. In all cases, the Jacobian is made sparse from the analytical sparsity pattern.
bench = BenchmarkGroup()
ks = 10 .^ range(-2, 4, length = 100)
prob_jac = prob # CosmologyProblem(M, pars; jac = true, sparse = true)
prob_nojac = CosmologyProblem(M, pars; jac = false, sparse = true)
bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(),),)
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(),), save_everystep = false) # generate function for J symbolically
bench["symbolic"] = @benchmarkable $solve($prob_jac, $ks; bgopts = $bgopts, ptopts = $ptopts)
bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(), autodiff = true),)
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(), autodiff = true), save_everystep = false) # compute J with forward-mode AD
bench["forward diff"] = @benchmarkable $solve($prob_nojac, $ks; bgopts = $bgopts, ptopts = $ptopts)
bgopts = (alg = Rodas4P(linsolve = RFLUFactorization(), autodiff = true),) # fails with finite diff background J
ptopts = (alg = KenCarp4(linsolve = KLUFactorization(), autodiff = false), save_everystep = false) # compute J with finite differences
bench["finite diff"] = @benchmarkable $solve($prob_nojac, $ks; bgopts = $bgopts, ptopts = $ptopts)
results = run(bench; verbose = true)
plot(results; size = (800, 400))Perturbations: linear system solver
At every time step, the implicit perturbation ODE solver solves a system of (nonlinear) equations with Newton's method. In turn, Newton's method involves iteratively solving several linear systems $Ax = b$, where $A$ involves the ODE Jacobian matrix $J$. This can be a bottleneck, so it is very important to use a linear solver that does this as fast as possible!
Note that the optimal linear solver depends on both the model and your hardware. See this tutorial on accelerating linear solves. Also run Julia with the optimal BLAS backend for your platform, such as MKL.jl for Intel or AppleAccelerate.jl for Macs, instead of Julia's default OpenBLAS backend.
In particular, for large models the ODE Jacobian can have lots of zeros. Here is an example for a model with many perturbation equations.
lmaxs = [4, 8, 16, 32]
Ms = [ΛCDM(; lmax) for lmax in lmaxs]
probs_dense = [CosmologyProblem(M, pars; ptopts = (jac = true, sparse = false)) for M in Ms]
probs_sparse = [CosmologyProblem(M, pars; ptopts = (jac = true, sparse = true)) for M in Ms]
probs_sparse[end].pt.f.jac_prototype # example of sparse Jacobian236×236 SparseArrays.SparseMatrixCSC{Float64, Int64} with 871 stored entries:
⎡⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎤
⎢⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⢿⣷⣄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠙⣿⢗⡀⠀⠀⠀⢠⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⡤⡀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠒⠀⠀⠀⠀⠈⠛⣤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠐⠂⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⠀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠘⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠈⠻⣦⡀⠀⠀⠀⎥
⎢⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⠀⠀⠀⠀⠀⢀⠀⠀⠀⠀⠀⣀⠀⠀⠈⠻⣦⡀⢀⎥
⎣⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⠢⠀⠀⠀⠀⢸⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢸⣿⣿⎦Sparse matrix methods are therefore very important to speed up the solution of large perturbation systems. This plot compares the time to solve several perturbation $k$-modes with different dense and sparse linear matrix solvers.
ks = 10 .^ range(-2, 4, length=100)
ptopts_dense1 = (alg = KenCarp4(linsolve = LUFactorization()), save_everystep = false)
ptopts_dense2 = (alg = KenCarp4(linsolve = RFLUFactorization()), save_everystep = false)
ptopts_sparse1 = (alg = KenCarp4(linsolve = KLUFactorization()), save_everystep = false)
ptopts_sparse2 = (alg = KenCarp4(linsolve = UMFPACKFactorization()), save_everystep = false)
ts_dense1 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_dense1) for i in 1:3) for prob in probs_dense]
ts_dense2 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_dense2) for i in 1:3) for prob in probs_dense]
ts_sparse1 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_sparse1) for i in 1:3) for prob in probs_sparse]
ts_sparse2 = [minimum(@elapsed solve(prob, ks; ptopts = ptopts_sparse2) for i in 1:3) for prob in probs_sparse]
p1 = plot(ylabel = "time / s", xticks = (lmaxs, ""), ylims = (0.0, ceil(max(maximum(ts_dense1), maximum(ts_dense2)))))
marker = :circle
plot!(p1, lmaxs, ts_sparse1; label = "sparse $(nameof(typeof(ptopts_sparse1.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_sparse2; label = "sparse $(nameof(typeof(ptopts_sparse2.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_dense1; label = "dense $(nameof(typeof(ptopts_dense1.alg.linsolve))), $(length(ks))×k", marker)
plot!(p1, lmaxs, ts_dense2; label = "dense $(nameof(typeof(ptopts_dense2.alg.linsolve))), $(length(ks))×k", marker)
text(prob::CosmologyProblem) = "$(length(prob.pt.u0)) eqs,\n$(round(SymBoltz.sparsity_fraction(prob.pt)*100, digits=1)) %\nsparse"
annotate!(p1, lmaxs, zeros(length(lmaxs)), [(text(prob), 5, :top) for prob in probs_sparse])
speedups1 = [ts_dense2[i]/ts_sparse1[i] for i in eachindex(lmaxs)]
speedups2 = [ts_dense2[i]/ts_sparse2[i] for i in eachindex(lmaxs)]
speedups3 = [ts_dense2[i]/ts_dense1[i] for i in eachindex(lmaxs)]
speedups4 = [ts_dense2[i]/ts_dense2[i] for i in eachindex(lmaxs)]
ymax = Int(ceil(maximum(maximum.([speedups1, speedups2, speedups3, speedups4]))))
ylims = (0, ymax)
yticks = 0:1:ymax
yticks = (collect(yticks), collect("$y×" for y in yticks))
p2 = plot(; xlabel = "ℓmax", ylabel = "speedup", xticks = lmaxs, yticks, ylims, marker)
plot!(p2, lmaxs, speedups1; marker, label = nothing)
plot!(p2, lmaxs, speedups2; marker, label = nothing)
plot!(p2, lmaxs, speedups3; marker, label = nothing)
plot!(p2, lmaxs, speedups4; marker, label = nothing)
plot(p1, p2; size = (800, 600), layout = grid(2, 1, heights=(3//4, 1//4)))Except for models with a very small perturbation system, it is a good idea to generate the sparse Jacobian and use the sparse KLUFactorization linear solver.