diff --git a/docs/make.jl b/docs/make.jl index 0595a9398..7dd919b50 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -21,6 +21,7 @@ makedocs( "Adjoint systems" => "solvers/as.md", "Saddle-point and Hermitian quasi-definite systems" => "solvers/sp_sqd.md", "Generalized saddle-point and non-Hermitian partitioned systems" => "solvers/gsp.md"], + "Block-Krylov methods" => "block_krylov.md", "In-place methods" => "inplace.md", "Preconditioners" => "preconditioners.md", "Storage requirements" => "storage.md", @@ -39,6 +40,7 @@ makedocs( "TriMR" => "examples/trimr.md", "BICGSTAB" => "examples/bicgstab.md", "DQGMRES" => "examples/dqgmres.md", + "BLOCK-GMRES" => "examples/block_gmres.md", "CGNE" => "examples/cgne.md", "CRMR" => "examples/crmr.md", "CRAIG" => "examples/craig.md", diff --git a/docs/src/api.md b/docs/src/api.md index d14cf7556..a256d4b0f 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -16,6 +16,7 @@ Krylov.LsmrStats ```@docs KrylovSolver +BlockKrylovSolver MinresSolver MinaresSolver CgSolver @@ -51,6 +52,7 @@ CraigSolver CraigmrSolver GpmrSolver FgmresSolver +BlockGmresSolver ``` ## Utilities diff --git a/docs/src/block_krylov.md b/docs/src/block_krylov.md new file mode 100644 index 000000000..2db83c839 --- /dev/null +++ b/docs/src/block_krylov.md @@ -0,0 +1,6 @@ +## Block-GMRES + +```@docs +block_gmres +block_gmres! +``` diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md index d6e3e5995..e9fc17811 100644 --- a/docs/src/block_processes.md +++ b/docs/src/block_processes.md @@ -99,6 +99,8 @@ H_{k+1,k} = The function [`arnoldi`](@ref arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Gamma$, and $H_{k+1,k}$. +Related method: [`BLOCK-GMRES`](@ref block_gmres). + ```@docs arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` diff --git a/docs/src/examples/block_gmres.md b/docs/src/examples/block_gmres.md new file mode 100644 index 000000000..09ecef80c --- /dev/null +++ b/docs/src/examples/block_gmres.md @@ -0,0 +1,19 @@ +```@example block_gmres +using Krylov, MatrixMarket, SuiteSparseMatrixCollection +using LinearAlgebra, Printf + +ssmc = ssmc_db(verbose=false) +matrix = ssmc_matrices(ssmc, "HB", "sherman2") +path = fetch_ssmc(matrix, format="MM") + +n = matrix.nrows[1] +A = MatrixMarket.mmread(joinpath(path[1], "$(matrix.name[1]).mtx")) +B = Matrix{Float64}(I, n, 5) +B_norm = norm(B) + +# Solve Ax = B +X, stats = block_gmres(A, B) +show(stats) +R = B - A * X +@printf("Relative residual: %8.1e\n", norm(R) / B_norm) +``` diff --git a/docs/src/factorization-free.md b/docs/src/factorization-free.md index ccd7bcc59..7b4b399d8 100644 --- a/docs/src/factorization-free.md +++ b/docs/src/factorization-free.md @@ -39,7 +39,7 @@ Some methods only require `A * v` products, whereas other ones also require `A' |:-----------------------------------------------:|:----------------------------------------:| | CG, CR, CAR | CGLS, CRLS, CGNE, CRMR | | SYMMLQ, CG-LANCZOS, MINRES, MINRES-QLP, MINARES | LSLQ, LSQR, LSMR, LNLQ, CRAIG, CRAIGMR | -| DIOM, FOM, DQGMRES, GMRES, FGMRES | BiLQ, QMR, BiLQR, USYMLQ, USYMQR, TriLQR | +| DIOM, FOM, DQGMRES, GMRES, FGMRES, BLOCK-GMRES | BiLQ, QMR, BiLQR, USYMLQ, USYMQR, TriLQR | | CGS, BICGSTAB | TriCG, TriMR | !!! info diff --git a/docs/src/inplace.md b/docs/src/inplace.md index 9950575fe..c74d5fd99 100644 --- a/docs/src/inplace.md +++ b/docs/src/inplace.md @@ -15,7 +15,7 @@ Given an operator `A` and a right-hand side `b`, you can create a `KrylovSolver` For example, use `S = Vector{Float64}` if you want to solve linear systems in double precision on the CPU and `S = CuVector{Float32}` if you want to solve linear systems in single precision on an Nvidia GPU. !!! note - `DiomSolver`, `FomSolver`, `DqgmresSolver`, `GmresSolver`, `FgmresSolver`, `GpmrSolver` and `CgLanczosShiftSolver` require an additional argument (`memory` or `nshifts`). + `DiomSolver`, `FomSolver`, `DqgmresSolver`, `GmresSolver`, `BlockGmresSolver`, `FgmresSolver`, `GpmrSolver` and `CgLanczosShiftSolver` require an additional argument (`memory` or `nshifts`). The workspace is always the first argument of the in-place methods: diff --git a/docs/src/preconditioners.md b/docs/src/preconditioners.md index 021892469..22e24ce90 100644 --- a/docs/src/preconditioners.md +++ b/docs/src/preconditioners.md @@ -35,7 +35,7 @@ Krylov.jl supports both approaches thanks to the argument `ldiv` of the Krylov s ### Square non-Hermitian linear systems -Methods concerned: [`CGS`](@ref cgs), [`BiCGSTAB`](@ref bicgstab), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres), [`FGMRES`](@ref fgmres), [`DIOM`](@ref diom) and [`FOM`](@ref fom). +Methods concerned: [`CGS`](@ref cgs), [`BiCGSTAB`](@ref bicgstab), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres), [`BLOCK-GMRES`](@ref block_gmres), [`FGMRES`](@ref fgmres), [`DIOM`](@ref diom) and [`FOM`](@ref fom). A Krylov method dedicated to non-Hermitian linear systems allows the three variants of preconditioning. diff --git a/src/Krylov.jl b/src/Krylov.jl index e2eb70952..760ad6074 100644 --- a/src/Krylov.jl +++ b/src/Krylov.jl @@ -2,13 +2,17 @@ module Krylov using LinearAlgebra, SparseArrays, Printf -include("krylov_utils.jl") include("krylov_stats.jl") -include("krylov_solvers.jl") -include("processes_utils.jl") +include("krylov_utils.jl") include("krylov_processes.jl") +include("krylov_solvers.jl") + +include("block_krylov_utils.jl") include("block_krylov_processes.jl") +include("block_krylov_solvers.jl") + +include("block_gmres.jl") include("cg.jl") include("cr.jl") diff --git a/src/block_gmres.jl b/src/block_gmres.jl new file mode 100644 index 000000000..c1e04164d --- /dev/null +++ b/src/block_gmres.jl @@ -0,0 +1,357 @@ +# An implementation of block-GMRES for the solution of the square linear system AX = B. +# +# Alexis Montoison, +# Argonne National Laboratory -- Chicago, October 2023. + +export block_gmres, block_gmres! + +""" + (X, stats) = block_gmres(A, b::AbstractMatrix{FC}; + memory::Int=20, M=I, N=I, ldiv::Bool=false, + restart::Bool=false, reorthogonalization::Bool=false, + atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0, + timemax::Float64=Inf, verbose::Int=0, history::Bool=false, + callback=solver->false, iostream::IO=kstdout) + +`T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`. +`FC` is `T` or `Complex{T}`. + + (X, stats) = block_gmres(A, B, X0::AbstractMatrix; kwargs...) + +Block-GMRES can be warm-started from an initial guess `X0` where `kwargs` are the same keyword arguments as above. + +Solve the linear system AX = B of size n with p right-hand sides using block-GMRES. + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension n; +* `B`: a matrix of size n × p. + +#### Optional argument + +* `X0`: a matrix of size n × p that represents an initial guess of the solution X. + +#### Keyword arguments + +* `memory`: if `restart = true`, the restarted version block-GMRES(k) is used with `k = memory`. If `restart = false`, the parameter `memory` should be used as a hint of the number of iterations to limit dynamic memory allocations. Additional storage will be allocated if the number of iterations exceeds `memory`; +* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning; +* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning; +* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`; +* `restart`: restart the method after `memory` iterations; +* `reorthogonalization`: reorthogonalize the new matrices of the block-Krylov basis against all previous matrix; +* `atol`: absolute stopping tolerance based on the residual norm; +* `rtol`: relative stopping tolerance based on the residual norm; +* `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2 * div(n,p)`; +* `timemax`: the time limit in seconds; +* `verbose`: additional details can be displayed if verbose mode is enabled (verbose > 0). Information will be displayed every `verbose` iterations; +* `history`: collect additional statistics on the run such as residual norms; +* `callback`: function or functor called as `callback(solver)` that returns `true` if the block-Krylov method should terminate, and `false` otherwise; +* `iostream`: stream to which output is logged. + +#### Output arguments + +* `X`: a dense matrix of size n × p; +* `stats`: statistics collected on the run in a [`SimpleStats`](@ref) structure. +""" +function block_gmres end + +""" + solver = block_gmres!(solver::BlockGmresSolver, B; kwargs...) + solver = block_gmres!(solver::BlockGmresSolver, B, X0; kwargs...) + +where `kwargs` are keyword arguments of [`block_gmres`](@ref). + +See [`BlockGmresSolver`](@ref) for more details about the `solver`. +""" +function block_gmres! end + +def_args_block_gmres = (:(A ), + :(B::AbstractMatrix{FC})) + +def_optargs_block_gmres = (:(X0::AbstractMatrix),) + +def_kwargs_block_gmres = (:(; M = I ), + :(; N = I ), + :(; ldiv::Bool = false ), + :(; restart::Bool = false ), + :(; reorthogonalization::Bool = false), + :(; atol::T = √eps(T) ), + :(; rtol::T = √eps(T) ), + :(; itmax::Int = 0 ), + :(; timemax::Float64 = Inf ), + :(; verbose::Int = 0 ), + :(; history::Bool = false ), + :(; callback = solver -> false ), + :(; iostream::IO = kstdout )) + +def_kwargs_block_gmres = mapreduce(extract_parameters, vcat, def_kwargs_block_gmres) + +args_block_gmres = (:A, :B) +optargs_block_gmres = (:X0,) +kwargs_block_gmres = (:M, :N, :ldiv, :restart, :reorthogonalization, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream) + +@eval begin + function block_gmres($(def_args_block_gmres...), $(def_optargs_block_gmres...); memory :: Int=20, $(def_kwargs_block_gmres...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + start_time = time_ns() + solver = BlockGmresSolver(A, B; memory) + warm_start!(solver, $(optargs_block_gmres...)) + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + block_gmres!(solver, $(args_block_gmres...); $(kwargs_block_gmres...)) + solver.stats.timer += elapsed_time + return solver.X, solver.stats + end + + function block_gmres($(def_args_block_gmres...); memory :: Int=20, $(def_kwargs_block_gmres...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + start_time = time_ns() + solver = BlockGmresSolver(A, B; memory) + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + block_gmres!(solver, $(args_block_gmres...); $(kwargs_block_gmres...)) + solver.stats.timer += elapsed_time + return solver.X, solver.stats + end + + function block_gmres!(solver :: BlockGmresSolver{T,FC,SV,SM}, $(def_args_block_gmres...); $(def_kwargs_block_gmres...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} + + # Timer + start_time = time_ns() + timemax_ns = 1e9 * timemax + + n, m = size(A) + s, p = size(B) + m == n || error("System must be square") + n == s || error("Inconsistent problem size") + (verbose > 0) && @printf(iostream, "BLOCK-GMRES: system of size %d with %d right-hand sides\n", n, p) + + # Check M = Iₙ and N = Iₙ + MisI = (M === I) + NisI = (N === I) + + # Check type consistency + eltype(A) == FC || @warn "eltype(A) ≠ $FC. This could lead to errors or additional allocations in operator-matrix products." + ktypeof(B) <: SM || error("ktypeof(B) is not a subtype of $SM") + + # Set up workspace. + allocate_if(!MisI , solver, :Q , SM, n, p) + allocate_if(!NisI , solver, :P , SM, n, p) + allocate_if(restart, solver, :ΔX, SM, n, p) + ΔX, X, W, V, Z = solver.ΔX, solver.X, solver.W, solver.V, solver.Z + C, D, R, H, τ, stats = solver.C, solver.D, solver.R, solver.H, solver.τ, solver.stats + warm_start = solver.warm_start + RNorms = stats.residuals + reset!(stats) + Q = MisI ? W : solver.Q + R₀ = MisI ? W : solver.Q + Xr = restart ? ΔX : X + + # Define the blocks D1 and D2 + D1 = view(D, 1:p, :) + D2 = view(D, p+1:2p, :) + + # Coefficients for mul! + α = -one(FC) + β = one(FC) + γ = one(FC) + + # Initial solution X₀. + fill!(X, zero(FC)) + + # Initial residual R₀. + if warm_start + mul!(W, A, ΔX) + W .= B .- W + restart && (X .+= ΔX) + else + copyto!(W, B) + end + MisI || mulorldiv!(R₀, M, W, ldiv) # R₀ = M(B - AX₀) + RNorm = norm(R₀) # ‖R₀‖_F + + history && push!(RNorms, RNorm) + ε = atol + rtol * RNorm + + mem = length(V) # Memory + npass = 0 # Number of pass + + iter = 0 # Cumulative number of iterations + inner_iter = 0 # Number of iterations in a pass + + itmax == 0 && (itmax = 2*div(n,p)) + inner_itmax = itmax + + (verbose > 0) && @printf(iostream, "%5s %5s %7s %5s\n", "pass", "k", "‖Rₖ‖", "timer") + kdisplay(iter, verbose) && @printf(iostream, "%5d %5d %7.1e %.2fs\n", npass, iter, RNorm, ktimer(start_time)) + + # Stopping criterion + solved = RNorm ≤ ε + tired = iter ≥ itmax + inner_tired = inner_iter ≥ inner_itmax + status = "unknown" + user_requested_exit = false + overtimed = false + + while !(solved || tired || user_requested_exit || overtimed) + + # Initialize workspace. + nr = 0 # Number of blocks Ψᵢⱼ stored in Rₖ. + for i = 1 : mem + fill!(V[i], zero(FC)) # Orthogonal basis of Kₖ(MAN, MR₀). + end + for Ψ in R + fill!(Ψ, zero(FC)) # Upper triangular matrix Rₖ. + end + for block in Z + fill!(block, zero(FC)) # Right-hand of the least squares problem min ‖Hₖ₊₁.ₖYₖ - ΓE₁‖₂. + end + + if restart + fill!(Xr, zero(FC)) # Xr === ΔX when restart is set to true + if npass ≥ 1 + mul!(W, A, X) + W .= B .- W + MisI || mulorldiv!(R₀, M, W, ldiv) + end + end + + # Initial Γ and V₁ + copyto!(V[1], R₀) + householder!(V[1], Z[1], τ[1]) + + npass = npass + 1 + inner_iter = 0 + inner_tired = false + + while !(solved || inner_tired || user_requested_exit || overtimed) + + # Update iteration index + inner_iter = inner_iter + 1 + + # Update workspace if more storage is required and restart is set to false + if !restart && (inner_iter > mem) + for i = 1 : inner_iter + push!(R, SM(undef, p, p)) + end + push!(H, SM(undef, 2p, p)) + push!(τ, SV(undef, p)) + end + + # Continue the block-Arnoldi process. + P = NisI ? V[inner_iter] : solver.P + NisI || mulorldiv!(P, N, V[inner_iter], ldiv) # P ← NVₖ + mul!(W, A, P) # W ← ANVₖ + MisI || mulorldiv!(Q, M, W, ldiv) # Q ← MANVₖ + for i = 1 : inner_iter + mul!(R[nr+i], V[i]', Q) # Ψᵢₖ = Vᵢᴴ * Q + mul!(Q, V[i], R[nr+i], α, β) # Q = Q - Vᵢ * Ψᵢₖ + end + + # Reorthogonalization of the block-Krylov basis. + if reorthogonalization + for i = 1 : inner_iter + mul!(Ψtmp, V[i]', Q) # Ψtmp = Vᵢᴴ * Q + mul!(Q, V[i], Ψtmp, α, β) # Q = Q - Vᵢ * Ψtmp + R[nr+i] .+= Ψtmp + end + end + + # Vₖ₊₁ and Ψₖ₊₁.ₖ are stored in Q and C. + householder!(Q, C, τ[inner_iter]) + + # Update the QR factorization of Hₖ₊₁.ₖ. + # Apply previous Householder reflections Ωᵢ. + for i = 1 : inner_iter-1 + D1 .= R[nr+i] + D2 .= R[nr+i+1] + @kormqr!('L', 'T', H[i], τ[i], D) + R[nr+i] .= D1 + R[nr+i+1] .= D2 + end + + # Compute and apply current Householder reflection Ωₖ. + H[inner_iter][1:p,:] .= R[nr+inner_iter] + H[inner_iter][p+1:2p,:] .= C + householder!(H[inner_iter], R[nr+inner_iter], τ[inner_iter], compact=true) + + # Update Zₖ = (Qₖ)ᴴΓE₁ = (Λ₁, ..., Λₖ, Λbarₖ₊₁) + D1 .= Z[inner_iter] + D2 .= zero(FC) + @kormqr!('L', 'T', H[inner_iter], τ[inner_iter], D) + Z[inner_iter] .= D1 + + # Update residual norm estimate. + # ‖ M(B - AXₖ) ‖_F = ‖Λbarₖ₊₁‖_F + C .= D2 + RNorm = norm(C) + history && push!(RNorms, RNorm) + + # Update the number of coefficients in Rₖ + nr = nr + inner_iter + + # Update stopping criterion. + user_requested_exit = callback(solver) :: Bool + solved = RNorm ≤ ε + inner_tired = restart ? inner_iter ≥ min(mem, inner_itmax) : inner_iter ≥ inner_itmax + timer = time_ns() - start_time + overtimed = timer > timemax_ns + kdisplay(iter+inner_iter, verbose) && @printf(iostream, "%5d %5d %7.1e %.2fs\n", npass, iter+inner_iter, RNorm, ktimer(start_time)) + + # Compute Vₖ₊₁. + if !(solved || inner_tired || user_requested_exit || overtimed) + if !restart && (inner_iter ≥ mem) + push!(V, SM(undef, n, p)) + push!(Z, SM(undef, p, p)) + end + copyto!(V[inner_iter+1], Q) + Z[inner_iter+1] .= D2 + end + end + + # Compute Yₖ by solving RₖYₖ = Zₖ with a backward substitution by block. + Y = Z # Yᵢ = Zᵢ + for i = inner_iter : -1 : 1 + pos = nr + i - inner_iter # position of Ψᵢ.ₖ + for j = inner_iter : -1 : i+1 + mul!(Y[i], R[pos], Y[j], α, β) # Yᵢ ← Yᵢ - ΨᵢⱼYⱼ + pos = pos - j + 1 # position of Ψᵢ.ⱼ₋₁ + end + ldiv!(UpperTriangular(R[pos]), Y[i]) # Yᵢ ← Yᵢ \ Ψᵢᵢ + end + + # Form Xₖ = NVₖYₖ + for i = 1 : inner_iter + mul!(Xr, V[i], Y[i], γ, β) + end + if !NisI + copyto!(solver.P, Xr) + mulorldiv!(Xr, N, solver.P, ldiv) + end + restart && (X .+= Xr) + + # Update inner_itmax, iter, tired and overtimed variables. + inner_itmax = inner_itmax - inner_iter + iter = iter + inner_iter + tired = iter ≥ itmax + timer = time_ns() - start_time + overtimed = timer > timemax_ns + end + (verbose > 0) && @printf(iostream, "\n") + + # Termination status + tired && (status = "maximum number of iterations exceeded") + solved && (status = "solution good enough given atol and rtol") + overtimed && (status = "time limit exceeded") + user_requested_exit && (status = "user-requested exit") + + # Update Xₖ + warm_start && !restart && (X .+= ΔX) + solver.warm_start = false + + # Update stats + stats.niter = iter + stats.solved = solved + stats.timer = ktimer(start_time) + stats.status = status + return solver + end +end diff --git a/src/block_krylov_solvers.jl b/src/block_krylov_solvers.jl new file mode 100644 index 000000000..c6c128aeb --- /dev/null +++ b/src/block_krylov_solvers.jl @@ -0,0 +1,167 @@ +export BlockKrylovSolver + +export BlockGmresSolver + +const BLOCK_KRYLOV_SOLVERS = Dict(:block_gmres => :BlockGmresSolver) + +"Abstract type for using block Krylov solvers in-place" +abstract type BlockKrylovSolver{T,FC,SV,SM} end + +""" +Type for storing the vectors required by the in-place version of BLOCK-GMRES. + +The outer constructors + + solver = BlockGmresSolver(m, n, p, memory, SV, SM) + solver = BlockGmresSolver(A, B; memory=5) + +may be used in order to create these vectors. +`memory` is set to `div(n,p)` if the value given is larger than `div(n,p)`. +""" +mutable struct BlockGmresSolver{T,FC,SV,SM} <: BlockKrylovSolver{T,FC,SV,SM} + m :: Int + n :: Int + p :: Int + ΔX :: SM + X :: SM + W :: SM + P :: SM + Q :: SM + C :: SM + D :: SM + V :: Vector{SM} + Z :: Vector{SM} + R :: Vector{SM} + H :: Vector{SM} + τ :: Vector{SV} + warm_start :: Bool + stats :: SimpleStats{T} +end + +function BlockGmresSolver(m, n, p, memory, SV, SM) + memory = min(div(n,p), memory) + FC = eltype(SV) + T = real(FC) + ΔX = SM(undef, 0, 0) + X = SM(undef, n, p) + W = SM(undef, n, p) + P = SM(undef, 0, 0) + Q = SM(undef, 0, 0) + C = SM(undef, p, p) + D = SM(undef, 2p, p) + V = SM[SM(undef, n, p) for i = 1 : memory] + Z = SM[SM(undef, p, p) for i = 1 : memory] + R = SM[SM(undef, p, p) for i = 1 : div(memory * (memory+1), 2)] + H = SM[SM(undef, 2p, p) for i = 1 : memory] + τ = SV[SV(undef, p) for i = 1 : memory] + stats = SimpleStats(0, false, false, T[], T[], T[], 0.0, "unknown") + solver = BlockGmresSolver{T,FC,SV,SM}(m, n, p, ΔX, X, W, P, Q, C, D, V, Z, R, H, τ, false, stats) + return solver +end + +function BlockGmresSolver(A, B; memory::Int=5) + m, n = size(A) + s, p = size(B) + SM = typeof(B) + SV = matrix_to_vector(SM) + BlockGmresSolver(m, n, p, memory, SV, SM) +end + +for (KS, fun, nsol, nA, nAt, warm_start) in [ + (:BlockGmresSolver, :block_gmres!, 1, 1, 0, true) +] + @eval begin + size(solver :: $KS) = solver.m, solver.n + nrhs(solver :: $KS) = solver.p + statistics(solver :: $KS) = solver.stats + niterations(solver :: $KS) = solver.stats.niter + Aprod(solver :: $KS) = $nA * solver.stats.niter + Atprod(solver :: $KS) = $nAt * solver.stats.niter + nsolution(solver :: $KS) = $nsol + if $nsol == 1 + solution(solver :: $KS) = solver.X + solution(solver :: $KS, p :: Integer) = (p == 1) ? solution(solver) : error("solution(solver) has only one output.") + end + issolved(solver :: $KS) = solver.stats.solved + if $warm_start + function warm_start!(solver :: $KS, X0) + n, p = size(solver.X) + n2, p2 = size(X0) + SM = typeof(solver.X) + (n == n2 && p == p2) || error("X0 should have size ($n, $p)") + allocate_if(true, solver, :ΔX, SM, n, p) + copyto!(solver.ΔX, X0) + solver.warm_start = true + return solver + end + end + end +end + +function ksizeof(attribute) + if isa(attribute, Vector{<:AbstractVector}) && !isempty(attribute) + # A vector of vectors is a vector of pointers in Julia. + # All vectors inside a vector have the same size in Krylov.jl + size_attribute = sizeof(attribute) + length(attribute) * ksizeof(attribute[1]) + else + size_attribute = sizeof(attribute) + end + return size_attribute +end + +function sizeof(stats_solver :: Union{KrylovStats, KrylovSolver, BlockKrylovSolver}) + type = typeof(stats_solver) + nfields = fieldcount(type) + storage = 0 + for i = 1:nfields + field_i = getfield(stats_solver, i) + size_i = ksizeof(field_i) + storage += size_i + end + return storage +end + +""" + show(io, solver; show_stats=true) + +Statistics of `solver` are displayed if `show_stats` is set to true. +""" +function show(io :: IO, solver :: Union{KrylovSolver{T,FC,S}, BlockKrylovSolver{T,FC,S}}; show_stats :: Bool=true) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} + workspace = typeof(solver) + name_solver = string(workspace.name.name) + name_stats = string(typeof(solver.stats).name.name) + nbytes = sizeof(solver) + storage = format_bytes(nbytes) + architecture = S <: Vector ? "CPU" : "GPU" + l1 = max(length(name_solver), length(string(FC)) + 11) # length("Precision: ") = 11 + nchar = workspace <: Union{CgLanczosShiftSolver, FomSolver, DiomSolver, DqgmresSolver, GmresSolver, FgmresSolver, GpmrSolver, BlockGmresSolver} ? 8 : 0 # length("Vector{}") = 8 + l2 = max(ndigits(solver.m) + 7, length(architecture) + 14, length(string(S)) + nchar) # length("nrows: ") = 7 and length("Architecture: ") = 14 + l2 = max(l2, length(name_stats) + 2 + length(string(T))) # length("{}") = 2 + l3 = max(ndigits(solver.n) + 7, length(storage) + 9) # length("Storage: ") = 9 and length("cols: ") = 7 + format = Printf.Format("│%$(l1)s│%$(l2)s│%$(l3)s│\n") + format2 = Printf.Format("│%$(l1+1)s│%$(l2)s│%$(l3)s│\n") + @printf(io, "┌%s┬%s┬%s┐\n", "─"^l1, "─"^l2, "─"^l3) + Printf.format(io, format, "$(name_solver)", "nrows: $(solver.m)", "ncols: $(solver.n)") + @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) + Printf.format(io, format, "Precision: $FC", "Architecture: $architecture","Storage: $storage") + @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) + Printf.format(io, format, "Attribute", "Type", "Size") + @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) + for i=1:fieldcount(workspace) + name_i = fieldname(workspace, i) + type_i = fieldtype(workspace, i) + field_i = getfield(solver, name_i) + size_i = ksizeof(field_i) + if (name_i::Symbol in [:w̅, :w̄, :d̅]) && (VERSION < v"1.8.0-DEV") + (size_i ≠ 0) && Printf.format(io, format2, string(name_i), type_i, format_bytes(size_i)) + else + (size_i ≠ 0) && Printf.format(io, format, string(name_i), type_i, format_bytes(size_i)) + end + end + @printf(io, "└%s┴%s┴%s┘\n","─"^l1,"─"^l2,"─"^l3) + if show_stats + @printf(io, "\n") + show(io, solver.stats) + end + return nothing +end diff --git a/src/processes_utils.jl b/src/block_krylov_utils.jl similarity index 82% rename from src/processes_utils.jl rename to src/block_krylov_utils.jl index 057cafb9d..c09071fa4 100644 --- a/src/processes_utils.jl +++ b/src/block_krylov_utils.jl @@ -68,36 +68,6 @@ function mgs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}) where FC <: FloatOrC return Q, R end -# Reduced QR factorization with Householder reflections: -# Q, R = householder(A) -# -# Input : -# A an n-by-k matrix, n ≥ k -# -# Output : -# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ -# R an k-by-k upper triangular matrix: QR = A -function householder(A::AbstractMatrix{FC}) where FC <: FloatOrComplex - n, k = size(A) - Q = copy(A) - τ = zeros(FC, k) - R = zeros(FC, k, k) - householder!(Q, R, τ) -end - -function householder!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, τ::AbstractVector{FC}) where FC <: FloatOrComplex - n, k = size(Q) - R .= zero(FC) - @kgeqrf!(Q, τ) - for i = 1:k - for j = i:k - R[i,j] = Q[i,j] - end - end - @korgqr!(Q, τ) - return Q, R -end - # Reduced QR factorization with Givens reflections: # Q, R = givens(A) # @@ -185,3 +155,56 @@ function reduced_qr(A::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComp end return Q, R end + +# @kernel function copy_triangle_kernel!(dest, src) +# i, j = @index(Global, NTuple) +# if j >= i +# @inbounds dest[i, j] = src[i, j] +# end +# end + +# function copy_triangle(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex +# backend = get_backend(Q) +# ndrange = (k, k) +# copy_triangle_kernel!(backend)(R, Q; ndrange=ndrange) +# KernelAbstractions.synchronize(backend) +# end + +function copy_triangle(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex + if VERSION < v"1.11" + for i = 1:k + for j = i:k + R[i,j] = Q[i,j] + end + end + else + copytrito!(R, Q, 'U') + end + return R +end + +# Reduced QR factorization with Householder reflections: +# Q, R = householder(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# Output : +# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# R an k-by-k upper triangular matrix: QR = A +function householder(A::AbstractMatrix{FC}; compact::Bool=false) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + τ = zeros(FC, k) + R = zeros(FC, k, k) + householder!(Q, R, τ; compact) +end + +function householder!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, τ::AbstractVector{FC}; compact::Bool=false) where FC <: FloatOrComplex + n, k = size(Q) + R .= zero(FC) + @kgeqrf!(Q, τ) + copy_triangle(Q, R, k) + !compact && @korgqr!(Q, τ) + return Q, R +end diff --git a/src/krylov_solve.jl b/src/krylov_solve.jl index 4db963672..23e17e95d 100644 --- a/src/krylov_solve.jl +++ b/src/krylov_solve.jl @@ -43,8 +43,6 @@ for (KS, fun, args, def_args, optargs, def_optargs, kwargs, def_kwargs) in [ (:GpmrSolver , :gpmr! , args_gpmr , def_args_gpmr , optargs_gpmr , def_optargs_gpmr , kwargs_gpmr , def_kwargs_gpmr ) ] @eval begin - solve!(solver :: $KS{T,FC,S}, $(def_args...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} = $(fun)(solver, $(args...); $(kwargs...)) - if !isempty($optargs) function $(fun)(solver :: $KS{T,FC,S}, $(def_args...), $(def_optargs...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} start_time = time_ns() @@ -56,7 +54,29 @@ for (KS, fun, args, def_args, optargs, def_optargs, kwargs, def_kwargs) in [ return solver end + solve!(solver :: $KS{T,FC,S}, $(def_args...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} = $(fun)(solver, $(args...); $(kwargs...)) solve!(solver :: $KS{T,FC,S}, $(def_args...), $(def_optargs...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} = $(fun)(solver, $(args...), $(optargs...); $(kwargs...)) end end end + +for (KS, fun, args, def_args, optargs, def_optargs, kwargs, def_kwargs) in [ + (:BlockGmresSolver, :block_gmres!, args_block_gmres, def_args_block_gmres, optargs_block_gmres, def_optargs_block_gmres, kwargs_block_gmres, def_kwargs_block_gmres), +] + @eval begin + if !isempty($optargs) + function $(fun)(solver :: $KS{T,FC,SV,SM}, $(def_args...), $(def_optargs...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} + start_time = time_ns() + warm_start!(solver, $(optargs...)) + elapsed_time = ktimer(start_time) + timemax -= elapsed_time + $(fun)(solver, $(args...); $(kwargs...)) + solver.stats.timer += elapsed_time + return solver + end + end + + solve!(solver :: $KS{T,FC,SV,SM}, $(def_args...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} = $(fun)(solver, $(args...); $(kwargs...)) + solve!(solver :: $KS{T,FC,SV,SM}, $(def_args...), $(def_optargs...); $(def_kwargs...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, SV <: AbstractVector{FC}, SM <: AbstractMatrix{FC}} = $(fun)(solver, $(args...), $(optargs...); $(kwargs...)) + end +end diff --git a/src/krylov_solvers.jl b/src/krylov_solvers.jl index 2461b04cd..13c98f823 100644 --- a/src/krylov_solvers.jl +++ b/src/krylov_solvers.jl @@ -1979,71 +1979,3 @@ for (KS, fun, nsol, nA, nAt, warm_start) in [ end end end - -function ksizeof(attribute) - if isa(attribute, Vector{<:AbstractVector}) && !isempty(attribute) - # A vector of vectors is a vector of pointers in Julia. - # All vectors inside a vector have the same size in Krylov.jl - size_attribute = sizeof(attribute) + length(attribute) * ksizeof(attribute[1]) - else - size_attribute = sizeof(attribute) - end - return size_attribute -end - -function sizeof(stats_solver :: Union{KrylovStats, KrylovSolver}) - type = typeof(stats_solver) - nfields = fieldcount(type) - storage = 0 - for i = 1:nfields - field_i = getfield(stats_solver, i) - size_i = ksizeof(field_i) - storage += size_i - end - return storage -end - -""" - show(io, solver; show_stats=true) - -Statistics of `solver` are displayed if `show_stats` is set to true. -""" -function show(io :: IO, solver :: KrylovSolver{T,FC,S}; show_stats :: Bool=true) where {T <: AbstractFloat, FC <: FloatOrComplex{T}, S <: AbstractVector{FC}} - workspace = typeof(solver) - name_solver = string(workspace.name.name) - name_stats = string(typeof(solver.stats).name.name) - nbytes = sizeof(solver) - storage = format_bytes(nbytes) - architecture = S <: Vector ? "CPU" : "GPU" - l1 = max(length(name_solver), length(string(FC)) + 11) # length("Precision: ") = 11 - nchar = workspace <: Union{CgLanczosShiftSolver, FomSolver, DiomSolver, DqgmresSolver, GmresSolver, FgmresSolver, GpmrSolver} ? 8 : 0 # length("Vector{}") = 8 - l2 = max(ndigits(solver.m) + 7, length(architecture) + 14, length(string(S)) + nchar) # length("nrows: ") = 7 and length("Architecture: ") = 14 - l2 = max(l2, length(name_stats) + 2 + length(string(T))) # length("{}") = 2 - l3 = max(ndigits(solver.n) + 7, length(storage) + 9) # length("Storage: ") = 9 and length("cols: ") = 7 - format = Printf.Format("│%$(l1)s│%$(l2)s│%$(l3)s│\n") - format2 = Printf.Format("│%$(l1+1)s│%$(l2)s│%$(l3)s│\n") - @printf(io, "┌%s┬%s┬%s┐\n", "─"^l1, "─"^l2, "─"^l3) - Printf.format(io, format, "$(name_solver)", "nrows: $(solver.m)", "ncols: $(solver.n)") - @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) - Printf.format(io, format, "Precision: $FC", "Architecture: $architecture","Storage: $storage") - @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) - Printf.format(io, format, "Attribute", "Type", "Size") - @printf(io, "├%s┼%s┼%s┤\n", "─"^l1, "─"^l2, "─"^l3) - for i=1:fieldcount(workspace) - name_i = fieldname(workspace, i) - type_i = fieldtype(workspace, i) - field_i = getfield(solver, name_i) - size_i = ksizeof(field_i) - if (name_i::Symbol in [:w̅, :w̄, :d̅]) && (VERSION < v"1.8.0-DEV") - (size_i ≠ 0) && Printf.format(io, format2, string(name_i), type_i, format_bytes(size_i)) - else - (size_i ≠ 0) && Printf.format(io, format, string(name_i), type_i, format_bytes(size_i)) - end - end - @printf(io, "└%s┴%s┴%s┘\n","─"^l1,"─"^l2,"─"^l3) - if show_stats - @printf(io, "\n") - show(io, solver.stats) - end - return nothing -end diff --git a/src/krylov_utils.jl b/src/krylov_utils.jl index e06569c4a..826e5eaeb 100644 --- a/src/krylov_utils.jl +++ b/src/krylov_utils.jl @@ -209,6 +209,10 @@ function ktypeof(v::S) where S <: DenseVector return S end +function ktypeof(v::S) where S <: DenseMatrix + return S +end + function ktypeof(v::S) where S <: AbstractVector if S.name.name == :Zeros || S.name.name == :Ones || S.name.name == :SArray || S.name.name == :MArray || S.name.name == :SizedArray || S.name.name == :FieldArray || S.name.name == :ComponentArray T = eltype(S) @@ -288,6 +292,7 @@ Create a vector of storage type `S` of length `n` only composed of one. kones(S, n) = fill!(S(undef, n), one(eltype(S))) allocate_if(bool, solver, v, S, n) = bool && isempty(solver.:($v)::S) && (solver.:($v)::S = S(undef, n)) +allocate_if(bool, solver, v, S, m, n) = bool && isempty(solver.:($v)::S) && (solver.:($v)::S = S(undef, m, n)) kdisplay(iter, verbose) = (verbose > 0) && (mod(iter, verbose) == 0) @@ -322,34 +327,9 @@ kaxpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t kcopy!(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.blascopy!(n, x, dx, y, dy) kcopy!(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = copyto!(y, x) -# geqrf -- Computes the QR factorization of a general m-by-n matrix. -# orgqr -- Generates the real orthogonal matrix of the QR factorization formed by geqrf. -# ungqr -- Generates the complex unitary matrix of the QR factorization formed by geqrf. -# ormqr -- Multiplies a real matrix by the orthogonal matrix of the QR factorization formed by geqrf. -# unmqr -- Multiplies a complex matrix by the unitary matrix of the QR factorization formed by geqrf. - -# getrf -- Computes the LU factorization of a general m-by-n matrix. -# getrs -- Solves a system of linear equations with an LU-factored square matrix. -# getri -- Computes the inverse of an LU-factored general matrix. - -# sytrf -- Computes the Bunch-Kaufman factorization of a symmetric matrix. -# sytrs -- Solves a system of linear equations with an LDLᵀ-factored symmetric matrix. -# sytri -- Computes the inverse of a symmetric matrix using LDLᵀ Bunch-Kaufman factorization. - -# hetrf -- Computes the Bunch-Kaufman factorization of a complex Hermitian matrix. -# hetrs -- Solves a system of linear equations with an LDLᴴ-factored Hermitian matrix. -# hetri -- Computes the inverse of a complex Hermitian matrix using LDLᴴ Bunch-Kaufman factorization. - -# potrf -- Computes the Cholesky factorization of a symmetric (Hermitian) positive-definite matrix. -# potrs -- Solves a system of linear equations with a Cholesky-factored symmetric (Hermitian) positive-definite matrix. -# potri -- Computes the inverse of a Cholesky-factored symmetric (Hermitian) positive-definite matrix. - -# trtrs -- Solves a system of linear equations with a triangular matrix. -# trtri -- Computes the inverse of a triangular matrix. - -kgeqrf!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.geqrf!(A, tau) -korgqr!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.orgqr!(A, tau) -kormqr!(side :: Char, trans :: Char, A :: Matrix{T}, tau :: Vector{T}, C :: Matrix{T}) where T <: BLAS.BlasFloat = LAPACK.ormqr!(side, trans, A, tau, C) +kgeqrf!(A :: AbstractMatrix{T}, tau :: AbstractVector{T}) where T <: BLAS.BlasFloat = LAPACK.geqrf!(A, tau) +korgqr!(A :: AbstractMatrix{T}, tau :: AbstractVector{T}) where T <: BLAS.BlasFloat = LAPACK.orgqr!(A, tau) +kormqr!(side :: Char, trans :: Char, A :: AbstractMatrix{T}, tau :: AbstractVector{T}, C :: AbstractMatrix{T}) where T <: BLAS.BlasFloat = LAPACK.ormqr!(side, trans, A, tau, C) macro kgeqrf!(A, tau) return esc(:(Krylov.kgeqrf!($A, $tau)))