diff --git a/docs/make.jl b/docs/make.jl index 0ad50d52f..db49cb759 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -12,6 +12,7 @@ makedocs( sitename = "Krylov.jl", pages = ["Home" => "index.md", "API" => "api.md", + "Krylov processes" => "processes.md", "Krylov methods" => ["Symmetric positive definite linear systems" => "solvers/spd.md", "Symmetric indefinite linear systems" => "solvers/sid.md", "Unsymmetric linear systems" => "solvers/unsymmetric.md", diff --git a/docs/src/graphics/arnoldi.png b/docs/src/graphics/arnoldi.png new file mode 100644 index 000000000..9ef8bd3a3 Binary files /dev/null and b/docs/src/graphics/arnoldi.png differ diff --git a/docs/src/graphics/golub_kahan.png b/docs/src/graphics/golub_kahan.png new file mode 100644 index 000000000..3baec4b4a Binary files /dev/null and b/docs/src/graphics/golub_kahan.png differ diff --git a/docs/src/graphics/hermitian_lanczos.png b/docs/src/graphics/hermitian_lanczos.png new file mode 100644 index 000000000..c70082e72 Binary files /dev/null and b/docs/src/graphics/hermitian_lanczos.png differ diff --git a/docs/src/graphics/montoison_orban.png b/docs/src/graphics/montoison_orban.png new file mode 100644 index 000000000..5a14eda04 Binary files /dev/null and b/docs/src/graphics/montoison_orban.png differ diff --git a/docs/src/graphics/nonhermitian_lanczos.png b/docs/src/graphics/nonhermitian_lanczos.png new file mode 100644 index 000000000..b8d83961c Binary files /dev/null and b/docs/src/graphics/nonhermitian_lanczos.png differ diff --git a/docs/src/graphics/saunders_simon_yip.png b/docs/src/graphics/saunders_simon_yip.png new file mode 100644 index 000000000..c3acfd181 Binary files /dev/null and b/docs/src/graphics/saunders_simon_yip.png differ diff --git a/docs/src/processes.md b/docs/src/processes.md new file mode 100644 index 000000000..208824ae8 --- /dev/null +++ b/docs/src/processes.md @@ -0,0 +1,278 @@ +# [Krylov processes](@id krylov-processes) + +Krylov processes are the foundation of Krylov methods, they generate bases of Krylov subspaces. + +### Notation + +Define $V_k := \begin{bmatrix} v_1 & \ldots & v_k \end{bmatrix} \enspace$ and $\enspace U_k := \begin{bmatrix} u_1 & \ldots & u_k \end{bmatrix}$. + +For a matrix $C \in \mathbb{C}^{n \times n}$ and a vector $t \in \mathbb{C}^{n}$, the $k$-th Krylov subspace generated by $C$ and $t$ is +```math +\mathcal{K}_k(C, t) := +\left\{\sum_{i=0}^{k-1} \omega_i C^i t \, \middle \vert \, \omega_i \in \mathbb{C},~0 \le i \le k-1 \right\}. +``` + +For matrices $C \in \mathbb{C}^{n \times n} \enspace$ and $\enspace T \in \mathbb{C}^{n \times p}$, the $k$-th block Krylov subspace generated by $C$ and $T$ is +```math +\mathcal{K}_k^{\square}(C, T) := +\left\{\sum_{i=0}^{k-1} C^i T \, \Omega_i \, \middle \vert \, \Omega_i \in \mathbb{C}^{p \times p},~0 \le i \le k-1 \right\}. +``` + +## Hermitian Lanczos + +![hermitian_lanczos](./graphics/hermitian_lanczos.png) + +After $k$ iterations of the Hermitian Lanczos process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + V_k^H V_k &= I_k, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$, +```math +T_k = +\begin{bmatrix} + \alpha_1 & \beta_2 & & \\ + \beta_2 & \alpha_2 & \ddots & \\ + & \ddots & \ddots & \beta_k \\ + & & \beta_k & \alpha_k +\end{bmatrix} +, \qquad +T_{k+1,k} = +\begin{bmatrix} + T_{k} \\ + \beta_{k+1} e_{k}^T +\end{bmatrix}. +``` +Note that $T_{k+1,k}$ is a real tridiagonal matrix even if $A$ is a complex matrix. + +The function [`hermitian_lanczos`](@ref hermitian_lanczos) returns $V_{k+1}$ and $T_{k+1,k}$. + +Related methods: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CR`](@ref cr), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp), [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`CG-LANCZOS`](@ref cg_lanczos) and [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift). + +```@docs +hermitian_lanczos +``` + +## Non-Hermitian Lanczos + +![nonhermitian_lanczos](./graphics/nonhermitian_lanczos.png) + +After $k$ iterations of the non-Hermitian Lanczos process (also named the Lanczos biorthogonalization process), the situation may be summarized as +```math +\begin{align*} + A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + B U_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\ + V_k^H U_k &= U_k^H V_k = I_k, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A,b)$ and $\mathcal{K}_k (A^H,c)$, respectively, +```math +T_k = +\begin{bmatrix} + \alpha_1 & \gamma_2 & & \\ + \beta_2 & \alpha_2 & \ddots & \\ + & \ddots & \ddots & \gamma_k \\ + & & \beta_k & \alpha_k +\end{bmatrix} +, \qquad +T_{k+1,k} = +\begin{bmatrix} + T_{k} \\ + \beta_{k+1} e_{k}^T +\end{bmatrix} +, \qquad +T_{k,k+1} = +\begin{bmatrix} + T_{k} & \gamma_{k+1} e_k +\end{bmatrix}. +``` + +The function [`nonhermitian_lanczos`](@ref nonhermitian_lanczos) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$. + +Related methods: [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr), [`CGS`](@ref cgs) and [`BICGSTAB`](@ref bicgstab). + +!!! note + The scaling factors used in our implementation are $\beta_k = |u_k^H v_k|^{\tfrac{1}{2}}$ and $\gamma_k = (u_k^H v_k) / \beta_k$. + +```@docs +nonhermitian_lanczos +``` + +## Arnoldi + +![arnoldi](./graphics/arnoldi.png) + +After $k$ iterations of the Arnoldi process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\ + V_k^H V_k &= I_k, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$, +```math +H_k = +\begin{bmatrix} + h_{1,1}~ & h_{1,2}~ & \ldots & h_{1,k} \\ + h_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & h_{k-1,k} \\ + & & h_{k,k-1} & h_{k,k} +\end{bmatrix} +, \qquad +H_{k+1,k} = +\begin{bmatrix} + H_{k} \\ + h_{k+1,k} e_{k}^T +\end{bmatrix}. +``` + +The function [`arnoldi`](@ref arnoldi) returns $V_{k+1}$ and $H_{k+1,k}$. + +Related methods: [`DIOM`](@ref diom), [`FOM`](@ref fom), [`DQGMRES`](@ref dqgmres) and [`GMRES`](@ref gmres). + + +```@docs +arnoldi +``` + +## Golub-Kahan + +![golub_kahan](./graphics/golub_kahan.png) + +After $k$ iterations of the Golub-Kahan bidiagonalization process, the situation may be summarized as +```math +\begin{align*} + A V_k &= U_{k+1} B_k, \\ + A^H U_{k+1} &= V_k B_k^H + \alpha_{k+1} v_{k+1} e_{k+1}^T = V_{k+1} L_{k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_k, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A^HA,A^Hb)$ and $\mathcal{K}_k (AA^H,b)$, respectively, +```math +L_k = +\begin{bmatrix} + \alpha_1 & & & \\ + \beta_2 & \alpha_2 & & \\ + & \ddots & \ddots & \\ + & & \beta_k & \alpha_k +\end{bmatrix} +, \qquad +B_k = +\begin{bmatrix} + \alpha_1 & & & \\ + \beta_2 & \alpha_2 & & \\ + & \ddots & \ddots & \\ + & & \beta_k & \alpha_k \\ + & & & \beta_{k+1} \\ +\end{bmatrix} += +\begin{bmatrix} + L_{k} \\ + \beta_{k+1} e_{k}^T +\end{bmatrix}. +``` +Note that $L_k$ is a real bidiagonal matrix even if $A$ is a complex matrix. + +The function [`golub_kahan`](@ref golub_kahan) returns $V_{k+1}$, $U_{k+1}$ and $L_{k+1}$. + +Related methods: [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig), [`CRAIGMR`](@ref craigmr), [`LSLQ`](@ref lslq), [`LSQR`](@ref lsqr) and [`LSMR`](@ref lsmr). + +```@docs +golub_kahan +``` + +## Saunders-Simon-Yip + +![saunders_simon_yip](./graphics/saunders_simon_yip.png) + +After $k$ iterations of the Saunders-Simon-Yip process (also named the orthogonal tridiagonalization process), the situation may be summarized as +```math +\begin{align*} + A U_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + B V_k &= U_k T_k^H + \gamma_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_k, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$, +```math +T_k = +\begin{bmatrix} + \alpha_1 & \gamma_2 & & \\ + \beta_2 & \alpha_2 & \ddots & \\ + & \ddots & \ddots & \gamma_k \\ + & & \beta_k & \alpha_k +\end{bmatrix} +, \qquad +T_{k+1,k} = +\begin{bmatrix} + T_{k} \\ + \beta_{k+1} e_{k}^T +\end{bmatrix} +, \qquad +T_{k,k+1} = +\begin{bmatrix} + T_{k} & \gamma_{k+1} e_{k} +\end{bmatrix}. +``` + +The function [`saunders_simon_yip`](@ref saunders_simon_yip) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$. + +Related methods: [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr), [`TriLQR`](@ref trilqr), [`TriCG`](@ref tricg) and [`TriMR`](@ref trimr). + +```@docs +saunders_simon_yip +``` + +## Montoison-Orban + +![montoison_orban](./graphics/montoison_orban.png) + +After $k$ iterations of the Montoison-Orban process (also named the orthogonal Hessenberg reduction process), the situation may be summarized as +```math +\begin{align*} + A U_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\ + B V_k &= U_k F_k + f_{k+1,k} u_{k+1} e_k^T = U_{k+1} F_{k+1,k}, \\ + V_k^H V_k &= U_k^H U_k = I_k, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$, +```math +H_k = +\begin{bmatrix} + h_{1,1}~ & h_{1,2}~ & \ldots & h_{1,k} \\ + h_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & h_{k-1,k} \\ + & & h_{k,k-1} & h_{k,k} +\end{bmatrix} +, \qquad +F_k = +\begin{bmatrix} + f_{1,1}~ & f_{1,2}~ & \ldots & f_{1,k} \\ + f_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & f_{k-1,k} \\ + & & f_{k,k-1} & f_{k,k} +\end{bmatrix}, +``` +```math +H_{k+1,k} = +\begin{bmatrix} + H_{k} \\ + h_{k+1,k} e_{k}^T +\end{bmatrix} +, \qquad +F_{k+1,k} = +\begin{bmatrix} + F_{k} \\ + f_{k+1,k} e_{k}^T +\end{bmatrix}. +``` + +The function [`montoison_orban`](@ref montoison_orban) returns $V_{k+1}$, $H_{k+1,k}$, $U_{k+1}$ and $F_{k+1,k}$. + +Related methods: [`GPMR`](@ref gpmr). + +```@docs +montoison_orban +``` diff --git a/src/Krylov.jl b/src/Krylov.jl index e3903e124..aadde1575 100644 --- a/src/Krylov.jl +++ b/src/Krylov.jl @@ -5,6 +5,7 @@ using LinearAlgebra, SparseArrays, Printf include("krylov_utils.jl") include("krylov_stats.jl") include("krylov_solvers.jl") +include("krylov_processes.jl") include("cg.jl") include("cr.jl") diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl new file mode 100644 index 000000000..49a30f4b3 --- /dev/null +++ b/src/krylov_processes.jl @@ -0,0 +1,459 @@ +export hermitian_lanczos, nonhermitian_lanczos, arnoldi, golub_kahan, saunders_simon_yip, montoison_orban + +""" + V, T = hermitian_lanczos(A, b, k) + +#### Input arguments: + +* `A`: a linear operator that models a Hermitian matrix of dimension n. +* `b`: a vector of length n. +* `k`: the number of iterations of the Hermitian Lanczos process. + +#### Output arguments: + +* `V`: a dense n × (k+1) matrix. +* `T`: a sparse (k+1) × k tridiagonal matrix. + +#### Reference + +* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 225--280, 1950. +""" +function hermitian_lanczos(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + R = real(FC) + S = ktypeof(b) + M = vector_to_matrix(S) + + colptr = zeros(Int, k+1) + rowval = zeros(Int, 3k-1) + nzval = zeros(R, 3k-1) + + colptr[1] = 1 + rowval[1] = 1 + rowval[2] = 2 + for i = 1:k + colptr[i+1] = 3i + if i ≥ 2 + pos = colptr[i] + rowval[pos] = i-1 + rowval[pos+1] = i + rowval[pos+2] = i+1 + end + end + + V = M(undef, n, k+1) + T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval) + + for i = 1:k + vᵢ = view(V,:,i) + vᵢ₊₁ = q = view(V,:,i+1) + if i == 1 + βᵢ = @knrm2(n, b) + vᵢ .= b ./ βᵢ + end + mul!(q, A, vᵢ) + αᵢ = @kdotr(n, vᵢ, q) + T[i,i] = αᵢ + @kaxpy!(n, -αᵢ, vᵢ, q) + if i ≥ 2 + vᵢ₋₁ = view(V,:,i-1) + βᵢ = T[i,i-1] + T[i-1,i] = βᵢ + @kaxpy!(n, -βᵢ, vᵢ₋₁, q) + end + βᵢ₊₁ = @knrm2(n, q) + T[i+1,i] = βᵢ₊₁ + vᵢ₊₁ .= q ./ βᵢ₊₁ + end + return V, T +end + +""" + V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k) + +#### Input arguments: + +* `A`: a linear operator that models a square matrix of dimension n. +* `b`: a vector of length n. +* `c`: a vector of length n. +* `k`: the number of iterations of the non-Hermitian Lanczos process. + +#### Output arguments: + +* `V`: a dense n × (k+1) matrix. +* `T`: a sparse (k+1) × k tridiagonal matrix. +* `U`: a dense n × (k+1) matrix. +* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix. + +#### Reference + +* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 225--280, 1950. +""" +function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + Aᴴ = A' + S = ktypeof(b) + M = vector_to_matrix(S) + + colptr = zeros(Int, k+1) + rowval = zeros(Int, 3k-1) + nzval_T = zeros(FC, 3k-1) + nzval_Tᴴ = zeros(FC, 3k-1) + + colptr[1] = 1 + rowval[1] = 1 + rowval[2] = 2 + for i = 1:k + colptr[i+1] = 3i + if i ≥ 2 + pos = colptr[i] + rowval[pos] = i-1 + rowval[pos+1] = i + rowval[pos+2] = i+1 + end + end + + V = M(undef, n, k+1) + U = M(undef, n, k+1) + T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T) + Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ) + + for i = 1:k + vᵢ = view(V,:,i) + uᵢ = view(U,:,i) + vᵢ₊₁ = q = view(V,:,i+1) + uᵢ₊₁ = p = view(U,:,i+1) + if i == 1 + cᴴb = @kdot(n, c, b) + βᵢ = √(abs(cᴴb)) + γᵢ = cᴴb / βᵢ + vᵢ .= b ./ βᵢ + uᵢ .= c ./ conj(γᵢ) + end + mul!(q, A , vᵢ) + mul!(p, Aᴴ, uᵢ) + if i ≥ 2 + vᵢ₋₁ = view(V,:,i-1) + uᵢ₋₁ = view(U,:,i-1) + βᵢ = T[i,i-1] + γᵢ = T[i-1,i] + @kaxpy!(n, - γᵢ , vᵢ₋₁, q) + @kaxpy!(n, -conj(βᵢ), uᵢ₋₁, p) + end + αᵢ = @kdot(n, uᵢ, q) + T[i,i] = αᵢ + Tᴴ[i,i] = conj(αᵢ) + @kaxpy!(m, - αᵢ , vᵢ, q) + @kaxpy!(n, -conj(αᵢ), uᵢ, p) + pᴴq = @kdot(n, p, q) + βᵢ₊₁ = √(abs(pᴴq)) + γᵢ₊₁ = pᴴq / βᵢ₊₁ + vᵢ₊₁ .= q ./ βᵢ₊₁ + uᵢ₊₁ .= p ./ conj(γᵢ₊₁) + T[i+1,i] = βᵢ₊₁ + Tᴴ[i+1,i] = conj(γᵢ₊₁) + if i ≤ k-1 + T[i,i+1] = γᵢ₊₁ + Tᴴ[i,i+1] = conj(βᵢ₊₁) + end + end + return V, T, U, Tᴴ +end + +""" + V, H = arnoldi(A, b, k) + +#### Input arguments: + +* `A`: a linear operator that models a square matrix of dimension n. +* `b`: a vector of length n. +* `k`: the number of iterations of the Arnoldi process. + +#### Output arguments: + +* `V`: a dense n × (k+1) matrix. +* `H`: a sparse (k+1) × k upper Hessenberg matrix. + +#### Reference + +* W. E. Arnoldi, [*The principle of minimized iterations in the solution of the matrix eigenvalue problem*](https://doi.org/10.1090/qam/42792), Quarterly of Applied Mathematics, 9, pp. 17--29, 1951. +""" +function arnoldi(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + S = ktypeof(b) + M = vector_to_matrix(S) + + nnz = div(k*(k+1), 2) + k + colptr = zeros(Int, k+1) + rowval = zeros(Int, nnz) + nzval = zeros(FC, nnz) + + colptr[1] = 1 + for i = 1:k + pos = colptr[i] + colptr[i+1] = pos+i+1 + for j = 1:i+1 + rowval[pos+j-1] = j + end + end + + V = M(undef, n, k+1) + H = SparseMatrixCSC(k+1, k, colptr, rowval, nzval) + + for i = 1:k + vᵢ = view(V,:,i) + vᵢ₊₁ = q = view(V,:,i+1) + if i == 1 + β = @knrm2(n, b) + vᵢ .= b ./ β + end + mul!(q, A, vᵢ) + for j = 1:i + vⱼ = view(V,:,j) + H[j,i] = @kdot(n, vⱼ, q) + @kaxpy!(n, -H[j,i], vⱼ, q) + end + H[i+1,i] = @knrm2(n, q) + vᵢ₊₁ .= q ./ H[i+1,i] + end + return V, H +end + +""" + V, U, L = golub_kahan(A, b, k) + +#### Input arguments: + +* `A`: a linear operator that models a matrix of dimension n × m. +* `b`: a vector of length n. +* `k`: the number of iterations of the Golub-Kahan process. + +#### Output arguments: + +* `V`: a dense m × (k+1) matrix. +* `U`: a dense n × (k+1) matrix. +* `L`: a sparse (k+1) × (k+1) lower bidiagonal matrix. + +#### Reference + +* G. H. Golub and W. Kahan, [*Calculating the Singular Values and Pseudo-Inverse of a Matrix*](https://doi.org/10.1137/0702016), SIAM Journal on Numerical Analysis, 2(2), pp. 225--224, 1965. +""" +function golub_kahan(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + R = real(FC) + Aᴴ = A' + S = ktypeof(b) + M = vector_to_matrix(S) + + colptr = zeros(Int, k+2) + rowval = zeros(Int, 2k+1) + nzval = zeros(R, 2k+1) + + colptr[1] = 1 + for i = 1:k + pos = colptr[i] + colptr[i+1] = pos+2 + rowval[pos] = i + rowval[pos+1] = i+1 + end + rowval[2k+1] = k+1 + colptr[k+2] = 2k+2 + + V = M(undef, m, k+1) + U = M(undef, n, k+1) + L = SparseMatrixCSC(k+1, k+1, colptr, rowval, nzval) + + for i = 1:k + uᵢ = view(U,:,i) + vᵢ = view(V,:,i) + uᵢ₊₁ = q = view(U,:,i+1) + vᵢ₊₁ = p = view(V,:,i+1) + if i == 1 + wᵢ = vᵢ + βᵢ = @knrm2(n, b) + uᵢ .= b ./ βᵢ + mul!(wᵢ, Aᴴ, uᵢ) + αᵢ = @knrm2(m, wᵢ) + L[1,1] = αᵢ + vᵢ .= wᵢ ./ αᵢ + end + mul!(q, A, vᵢ) + αᵢ = L[i,i] + @kaxpy!(n, -αᵢ, uᵢ, q) + βᵢ₊₁ = @knrm2(n, q) + uᵢ₊₁ .= q ./ βᵢ₊₁ + mul!(p, Aᴴ, uᵢ₊₁) + @kaxpy!(m, -βᵢ₊₁, vᵢ, p) + αᵢ₊₁ = @knrm2(m, p) + vᵢ₊₁ .= p ./ αᵢ₊₁ + L[i+1,i] = βᵢ₊₁ + L[i+1,i+1] = αᵢ₊₁ + end + return V, U, L +end + +""" + V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k) + +#### Input arguments: + +* `A`: a linear operator that models a matrix of dimension n × m. +* `b`: a vector of length n. +* `c`: a vector of length m. +* `k`: the number of iterations of the Saunders-Simon-Yip process. + +#### Output arguments: + +* `V`: a dense n × (k+1) matrix. +* `T`: a sparse (k+1) × k tridiagonal matrix. +* `U`: a dense m × (k+1) matrix. +* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix. + +#### Reference + +* M. A. Saunders, H. D. Simon, and E. L. Yip, [*Two Conjugate-Gradient-Type Methods for Unsymmetric Linear Equations*](https://doi.org/10.1137/0725052), SIAM Journal on Numerical Analysis, 25(4), pp. 927--940, 1988. +""" +function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + Aᴴ = A' + S = ktypeof(b) + M = vector_to_matrix(S) + + colptr = zeros(Int, k+1) + rowval = zeros(Int, 3k-1) + nzval_T = zeros(FC, 3k-1) + nzval_Tᴴ = zeros(FC, 3k-1) + + colptr[1] = 1 + rowval[1] = 1 + rowval[2] = 2 + for i = 1:k + colptr[i+1] = 3i + if i ≥ 2 + pos = colptr[i] + rowval[pos] = i-1 + rowval[pos+1] = i + rowval[pos+2] = i+1 + end + end + + V = M(undef, n, k+1) + U = M(undef, m, k+1) + T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T) + Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ) + + for i = 1:k + vᵢ = view(V,:,i) + uᵢ = view(U,:,i) + vᵢ₊₁ = q = view(V,:,i+1) + uᵢ₊₁ = p = view(U,:,i+1) + if i == 1 + β = @knrm2(n, b) + γ = @knrm2(m, c) + vᵢ .= b ./ β + uᵢ .= c ./ γ + end + mul!(q, A , uᵢ) + mul!(p, Aᴴ, vᵢ) + if i ≥ 2 + vᵢ₋₁ = view(V,:,i-1) + uᵢ₋₁ = view(U,:,i-1) + βᵢ = T[i,i-1] + γᵢ = T[i-1,i] + @kaxpy!(n, -γᵢ, vᵢ₋₁, q) + @kaxpy!(m, -βᵢ, uᵢ₋₁, p) + end + αᵢ = @kdot(n, vᵢ, q) + T[i,i] = αᵢ + Tᴴ[i,i] = conj(αᵢ) + @kaxpy!(n, - αᵢ , vᵢ, q) + @kaxpy!(m, -conj(αᵢ), uᵢ, p) + βᵢ₊₁ = @knrm2(n, q) + γᵢ₊₁ = @knrm2(m, p) + vᵢ₊₁ .= q ./ βᵢ₊₁ + uᵢ₊₁ .= p ./ γᵢ₊₁ + T[i+1,i] = βᵢ₊₁ + Tᴴ[i+1,i] = conj(γᵢ₊₁) + if i ≤ k-1 + T[i,i+1] = γᵢ₊₁ + Tᴴ[i,i+1] = conj(βᵢ₊₁) + end + end + return V, T, U, Tᴴ +end + +""" + V, H, U, F = montoison_orban(A, B, b, c, k) + +#### Input arguments: + +* `A`: a linear operator that models a matrix of dimension n × m. +* `B`: a linear operator that models a matrix of dimension m × n. +* `b`: a vector of length n. +* `c`: a vector of length m. +* `k`: the number of iterations of the Montoison-Orban process. + +#### Output arguments: + +* `V`: a dense n × (k+1) matrix. +* `H`: a sparse (k+1) × k upper Hessenberg matrix. +* `U`: a dense m × (k+1) matrix. +* `F`: a sparse (k+1) × k upper Hessenberg matrix. + +#### Reference + +* A. Montoison and D. Orban, [*GPMR: An Iterative Method for Unsymmetric Partitioned Linear Systems*](https://dx.doi.org/10.13140/RG.2.2.24069.68326), Cahier du GERAD G-2021-62, GERAD, Montréal, 2021. +""" +function montoison_orban(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex + n, m = size(A) + S = ktypeof(b) + M = vector_to_matrix(S) + + nnz = div(k*(k+1), 2) + k + colptr = zeros(Int, k+1) + rowval = zeros(Int, nnz) + nzval_H = zeros(FC, nnz) + nzval_F = zeros(FC, nnz) + + colptr[1] = 1 + for i = 1:k + pos = colptr[i] + colptr[i+1] = pos+i+1 + for j = 1:i+1 + rowval[pos+j-1] = j + end + end + + V = M(undef, n, k+1) + U = M(undef, m, k+1) + H = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_H) + F = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_F) + + for i = 1:k + vᵢ = view(V,:,i) + uᵢ = view(U,:,i) + vᵢ₊₁ = q = view(V,:,i+1) + uᵢ₊₁ = p = view(U,:,i+1) + if i == 1 + β = @knrm2(n, b) + γ = @knrm2(m, c) + vᵢ .= b ./ β + uᵢ .= c ./ γ + end + mul!(q, A, uᵢ) + mul!(p, B, vᵢ) + for j = 1:i + vⱼ = view(V,:,j) + uⱼ = view(U,:,j) + H[j,i] = @kdot(n, vⱼ, q) + @kaxpy!(n, -H[j,i], vⱼ, q) + F[j,i] = @kdot(m, uⱼ, p) + @kaxpy!(m, -F[j,i], uⱼ, p) + end + H[i+1,i] = @knrm2(n, q) + vᵢ₊₁ .= q ./ H[i+1,i] + F[i+1,i] = @knrm2(m, p) + uᵢ₊₁ .= p ./ F[i+1,i] + end + return V, H, U, F +end diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index baad2bdcf..beec647b8 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -1,7 +1,6 @@ -using LinearAlgebra, SparseArrays, Test -using Krylov, AMDGPU +using AMDGPU -include("../test_utils.jl") +include("gpu.jl") @testset "AMD -- AMDGPU.jl" begin @@ -19,6 +18,7 @@ include("../test_utils.jl") for FC in (Float32, Float64, ComplexF32, ComplexF64) S = ROCVector{FC} + M = ROCMatrix{FC} T = real(FC) n = 10 x = rand(FC, n) @@ -70,8 +70,8 @@ include("../test_utils.jl") @testset "vector_to_matrix" begin S = ROCVector{FC} - M = Krylov.vector_to_matrix(S) - @test M == ROCMatrix{FC} + M2 = Krylov.vector_to_matrix(S) + @test M2 == M end ε = eps(T) @@ -93,5 +93,9 @@ include("../test_utils.jl") x, stats = cg(A, b) @test norm(b - A * x) ≤ atol + rtol * norm(b) end + + # @testset "processes -- $FC" begin + # test_processes(S, M) + # end end end diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl new file mode 100644 index 000000000..e0ca265b7 --- /dev/null +++ b/test/gpu/gpu.jl @@ -0,0 +1,38 @@ +using LinearAlgebra, SparseArrays, Test +using Krylov + +include("../test_utils.jl") + +function test_processes(S, M) + n = 250 + m = 500 + k = 20 + FC = eltype(S) + + cpu_A, cpu_b = symmetric_indefinite(n, FC=FC) + gpu_A, gpu_b = M(cpu_A), S(cpu_b) + V, T = hermitian_lanczos(gpu_A, gpu_b, k) + + cpu_A, cpu_b = nonsymmetric_definite(n, FC=FC) + cpu_c = -cpu_b + gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c) + V, T, U, Tᴴ = nonhermitian_lanczos(gpu_A, gpu_b, gpu_c, k) + + cpu_A, cpu_b = nonsymmetric_indefinite(n, FC=FC) + gpu_A, gpu_b = M(cpu_A), S(cpu_b) + V, H = arnoldi(gpu_A, gpu_b, k) + + cpu_A, cpu_b = under_consistent(n, m, FC=FC) + gpu_A, gpu_b = M(cpu_A), S(cpu_b) + V, U, L = golub_kahan(gpu_A, gpu_b, k) + + cpu_A, cpu_b = under_consistent(n, m, FC=FC) + _, cpu_c = over_consistent(m, n, FC=FC) + gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c) + V, T, U, Tᴴ = saunders_simon_yip(gpu_A, gpu_b, gpu_c, k) + + cpu_A, cpu_b = under_consistent(n, m, FC=FC) + cpu_B, cpu_c = over_consistent(m, n, FC=FC) + gpu_A, gpu_B, gpu_b, gpu_c = M(cpu_A), M(cpu_B), S(cpu_b), S(cpu_c) + V, H, U, F = montoison_orban(gpu_A, gpu_B, gpu_b, gpu_c, k) +end diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 67ad0a7d5..e65a2bf47 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -1,7 +1,6 @@ -using LinearAlgebra, SparseArrays, Test -using Krylov, oneAPI +using oneAPI -include("../test_utils.jl") +include("gpu.jl") import Krylov.kdot # https://github.com/JuliaGPU/GPUArrays.jl/pull/427 @@ -27,6 +26,7 @@ end for FC ∈ (Float32, ComplexF32) S = oneVector{FC} + M = oneMatrix{FC} T = real(FC) n = 10 x = rand(FC, n) @@ -78,8 +78,8 @@ end @testset "vector_to_matrix" begin S = oneVector{FC} - M = Krylov.vector_to_matrix(S) - @test M == oneMatrix{FC} + M2 = Krylov.vector_to_matrix(S) + @test M2 == M end ε = eps(T) @@ -101,5 +101,9 @@ end x, stats = cg(A, b) @test norm(b - A * x) ≤ atol + rtol * norm(b) end + + # @testset "processes -- $FC" begin + # test_processes(S, M) + # end end end diff --git a/test/gpu/metal.jl b/test/gpu/metal.jl index 35325c863..9fd24392a 100644 --- a/test/gpu/metal.jl +++ b/test/gpu/metal.jl @@ -1,7 +1,6 @@ -using LinearAlgebra, SparseArrays, Test -using Krylov, Metal +using Metal -include("../test_utils.jl") +include("gpu.jl") # https://github.com/JuliaGPU/Metal.jl/pull/48 const MtlVector{T} = MtlArray{T,1} @@ -31,6 +30,7 @@ end for FC in (Float32, ComplexF32) S = MtlVector{FC} + M = MtlMatrix{FC} T = real(FC) n = 10 x = rand(FC, n) @@ -82,8 +82,8 @@ end @testset "vector_to_matrix" begin S = MtlVector{FC} - M = Krylov.vector_to_matrix(S) - @test M == MtlMatrix{FC} + M2 = Krylov.vector_to_matrix(S) + @test M2 == M end ε = eps(T) @@ -105,5 +105,9 @@ end x, stats = cg(A, b) @test norm(b - A * x) ≤ atol + rtol * norm(b) end + + # @testset "processes -- $FC" begin + # test_processes(S, M) + # end end end diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index c72c5c6ba..b27ee11d8 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -1,7 +1,6 @@ -using LinearAlgebra, SparseArrays, Test -using LinearOperators, Krylov, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER +using LinearOperators, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER -include("../test_utils.jl") +include("gpu.jl") @testset "Nvidia -- CUDA.jl" begin @@ -95,6 +94,7 @@ include("../test_utils.jl") for FC in (Float32, Float64, ComplexF32, ComplexF64) S = CuVector{FC} + M = CuMatrix{FC} T = real(FC) n = 10 x = rand(FC, n) @@ -146,8 +146,8 @@ include("../test_utils.jl") @testset "vector_to_matrix" begin S = CuVector{FC} - M = Krylov.vector_to_matrix(S) - @test M == CuMatrix{FC} + M2 = Krylov.vector_to_matrix(S) + @test M2 == M end ε = eps(T) @@ -169,5 +169,9 @@ include("../test_utils.jl") x, stats = cg(A, b) @test norm(b - A * x) ≤ atol + rtol * norm(b) end + + @testset "processes -- $FC" begin + test_processes(S, M) + end end end diff --git a/test/runtests.jl b/test/runtests.jl index 75e8f0941..b69865f61 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,7 @@ import Krylov.KRYLOV_SOLVERS include("test_utils.jl") include("test_aux.jl") include("test_stats.jl") +include("test_processes.jl") include("test_fgmres.jl") include("test_gpmr.jl") diff --git a/test/test_processes.jl b/test/test_processes.jl new file mode 100644 index 000000000..7411269bb --- /dev/null +++ b/test/test_processes.jl @@ -0,0 +1,148 @@ +""" + P = permutation_paige(k) + +Return the sparse (2k) × (2k) matrix + + [e₁ • eₖ ] + [ e₁ • eₖ] +""" +function permutation_paige(k) + P = spzeros(Float64, 2k, 2k) + for i = 1:k + P[i,2i-1] = 1.0 + P[i+k,2i] = 1.0 + end + return P +end + +@testset "processes" begin + n = 250 + m = 500 + k = 20 + + for FC in (Float64, ComplexF64) + R = real(FC) + nbits_FC = sizeof(FC) + nbits_R = sizeof(R) + nbits_I = sizeof(Int) + + @testset "Data Type: FC" begin + + @testset "Hermitian Lanczos" begin + A, b = symmetric_indefinite(n, FC=FC) + V, T = hermitian_lanczos(A, b, k) + + @test A * V[:,1:k] ≈ V * T + + storage_hermitian_lanczos_bytes(n, k) = 4k * nbits_I + (3k-1) * nbits_R + n*(k+1) * nbits_FC + + expected_hermitian_lanczos_bytes = storage_hermitian_lanczos_bytes(n, k) + actual_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, b, k) + @test expected_hermitian_lanczos_bytes ≤ actual_hermitian_lanczos_bytes ≤ 1.02 * expected_hermitian_lanczos_bytes + end + + @testset "Non-hermitian Lanczos" begin + A, b = nonsymmetric_definite(n, FC=FC) + c = -b + V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k) + + @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]' + @test A * V[:,1:k] ≈ V * T + @test A' * U[:,1:k] ≈ U * Tᴴ + + storage_nonhermitian_lanczos_bytes(n, k) = 4k * nbits_I + (6k-2) * nbits_FC + 2*n*(k+1) * nbits_FC + + expected_nonhermitian_lanczos_bytes = storage_nonhermitian_lanczos_bytes(n, k) + actual_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, b, c, k) + @test expected_nonhermitian_lanczos_bytes ≤ actual_nonhermitian_lanczos_bytes ≤ 1.02 * expected_nonhermitian_lanczos_bytes + end + + @testset "Arnoldi" begin + A, b = nonsymmetric_indefinite(n, FC=FC) + V, H = arnoldi(A, b, k) + + @test A * V[:,1:k] ≈ V * H + + function storage_arnoldi_bytes(n, k) + nnz = div(k*(k+1), 2) + k + return (nnz + k+1) * nbits_I + nnz * nbits_FC + n*(k+1) * nbits_FC + end + + expected_arnoldi_bytes = storage_arnoldi_bytes(n, k) + actual_arnoldi_bytes = @allocated arnoldi(A, b, k) + @test expected_arnoldi_bytes ≤ actual_arnoldi_bytes ≤ 1.02 * expected_arnoldi_bytes + end + + @testset "Golub-Kahan" begin + A, b = under_consistent(n, m, FC=FC) + V, U, L = golub_kahan(A, b, k) + B = L[1:k+1,1:k] + + @test A * V[:,1:k] ≈ U * B + @test A' * U ≈ V * L' + @test A' * A * V[:,1:k] ≈ V * L' * B + @test A * A' * U[:,1:k] ≈ U * B * L[1:k,1:k]' + + storage_golub_kahan_bytes(n, m, k) = 3*(k+1) * nbits_I + (2k+1) * nbits_R + (n+m)*(k+1) * nbits_FC + + expected_golub_kahan_bytes = storage_golub_kahan_bytes(n, m, k) + actual_golub_kahan_bytes = @allocated golub_kahan(A, b, k) + @test expected_golub_kahan_bytes ≤ actual_golub_kahan_bytes ≤ 1.02 * expected_golub_kahan_bytes + end + + @testset "Saunders-Simon-Yip" begin + A, b = under_consistent(n, m, FC=FC) + _, c = over_consistent(m, n, FC=FC) + V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k) + + @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]' + @test A * U[:,1:k] ≈ V * T + @test A' * V[:,1:k] ≈ U * Tᴴ + @test A' * A * U[:,1:k-1] ≈ U * Tᴴ * T[1:k,1:k-1] + @test A * A' * V[:,1:k-1] ≈ V * T * Tᴴ[1:k,1:k-1] + + K = [zeros(FC,n,n) A; A' zeros(FC,m,m)] + Pₖ = permutation_paige(k) + Wₖ = [V[:,1:k] zeros(FC,n,k); zeros(FC,m,k) U[:,1:k]] * Pₖ + Pₖ₊₁ = permutation_paige(k+1) + Wₖ₊₁ = [V zeros(FC,n,k+1); zeros(FC,m,k+1) U] * Pₖ₊₁ + G = Pₖ₊₁' * [zeros(FC,k+1,k) T; Tᴴ zeros(FC,k+1,k)] * Pₖ + @test K * Wₖ ≈ Wₖ₊₁ * G + + storage_saunders_simon_yip_bytes(n, m, k) = 4k * nbits_I + (6k-2) * nbits_FC + (n+m)*(k+1) * nbits_FC + + expected_saunders_simon_yip_bytes = storage_saunders_simon_yip_bytes(n, m, k) + actual_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, b, c, k) + @test expected_saunders_simon_yip_bytes ≤ actual_saunders_simon_yip_bytes ≤ 1.02 * expected_saunders_simon_yip_bytes + end + + @testset "Montoison-Orban" begin + A, b = under_consistent(n, m, FC=FC) + B, c = over_consistent(m, n, FC=FC) + V, H, U, F = montoison_orban(A, B, b, c, k) + + @test A * U[:,1:k] ≈ V * H + @test B * V[:,1:k] ≈ U * F + @test B * A * U[:,1:k-1] ≈ U * F * H[1:k,1:k-1] + @test A * B * V[:,1:k-1] ≈ V * H * F[1:k,1:k-1] + + K = [zeros(FC,n,n) A; B zeros(FC,m,m)] + Pₖ = permutation_paige(k) + Wₖ = [V[:,1:k] zeros(FC,n,k); zeros(FC,m,k) U[:,1:k]] * Pₖ + Pₖ₊₁ = permutation_paige(k+1) + Wₖ₊₁ = [V zeros(FC,n,k+1); zeros(FC,m,k+1) U] * Pₖ₊₁ + G = Pₖ₊₁' * [zeros(FC,k+1,k) H; F zeros(FC,k+1,k)] * Pₖ + @test K * Wₖ ≈ Wₖ₊₁ * G + + function storage_montoison_orban_bytes(n, m, k) + nnz = div(k*(k+1), 2) + k + return (nnz + k+1) * nbits_I + 2*nnz * nbits_FC + (n+m)*(k+1) * nbits_FC + end + + expected_montoison_orban_bytes = storage_montoison_orban_bytes(n, m, k) + actual_montoison_orban_bytes = @allocated montoison_orban(A, B, b, c, k) + @test expected_montoison_orban_bytes ≤ actual_montoison_orban_bytes ≤ 1.02 * expected_montoison_orban_bytes + end + end + end +end