diff --git a/docs/make.jl b/docs/make.jl
index 0ad50d52f..db49cb759 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -12,6 +12,7 @@ makedocs(
   sitename = "Krylov.jl",
   pages = ["Home" => "index.md",
            "API" => "api.md",
+           "Krylov processes" => "processes.md",
            "Krylov methods" => ["Symmetric positive definite linear systems" => "solvers/spd.md",
                                 "Symmetric indefinite linear systems" => "solvers/sid.md",
                                 "Unsymmetric linear systems" => "solvers/unsymmetric.md",
diff --git a/docs/src/graphics/arnoldi.png b/docs/src/graphics/arnoldi.png
new file mode 100644
index 000000000..9ef8bd3a3
Binary files /dev/null and b/docs/src/graphics/arnoldi.png differ
diff --git a/docs/src/graphics/golub_kahan.png b/docs/src/graphics/golub_kahan.png
new file mode 100644
index 000000000..3baec4b4a
Binary files /dev/null and b/docs/src/graphics/golub_kahan.png differ
diff --git a/docs/src/graphics/hermitian_lanczos.png b/docs/src/graphics/hermitian_lanczos.png
new file mode 100644
index 000000000..c70082e72
Binary files /dev/null and b/docs/src/graphics/hermitian_lanczos.png differ
diff --git a/docs/src/graphics/montoison_orban.png b/docs/src/graphics/montoison_orban.png
new file mode 100644
index 000000000..5a14eda04
Binary files /dev/null and b/docs/src/graphics/montoison_orban.png differ
diff --git a/docs/src/graphics/nonhermitian_lanczos.png b/docs/src/graphics/nonhermitian_lanczos.png
new file mode 100644
index 000000000..b8d83961c
Binary files /dev/null and b/docs/src/graphics/nonhermitian_lanczos.png differ
diff --git a/docs/src/graphics/saunders_simon_yip.png b/docs/src/graphics/saunders_simon_yip.png
new file mode 100644
index 000000000..c3acfd181
Binary files /dev/null and b/docs/src/graphics/saunders_simon_yip.png differ
diff --git a/docs/src/processes.md b/docs/src/processes.md
new file mode 100644
index 000000000..208824ae8
--- /dev/null
+++ b/docs/src/processes.md
@@ -0,0 +1,278 @@
+# [Krylov processes](@id krylov-processes)
+
+Krylov processes are the foundation of Krylov methods, they generate bases of Krylov subspaces.
+
+### Notation
+
+Define $V_k := \begin{bmatrix} v_1 & \ldots & v_k \end{bmatrix} \enspace$ and $\enspace U_k := \begin{bmatrix} u_1 & \ldots & u_k \end{bmatrix}$.
+
+For a matrix $C \in \mathbb{C}^{n \times n}$ and a vector $t \in \mathbb{C}^{n}$, the $k$-th Krylov subspace generated by $C$ and $t$ is
+```math
+\mathcal{K}_k(C, t) :=
+\left\{\sum_{i=0}^{k-1} \omega_i C^i t \, \middle \vert \, \omega_i \in \mathbb{C},~0 \le i \le k-1 \right\}.
+```
+
+For matrices $C \in \mathbb{C}^{n \times n} \enspace$ and $\enspace T \in \mathbb{C}^{n \times p}$, the $k$-th block Krylov subspace generated by $C$ and $T$ is
+```math
+\mathcal{K}_k^{\square}(C, T) :=
+\left\{\sum_{i=0}^{k-1} C^i T \, \Omega_i \, \middle \vert \, \Omega_i \in \mathbb{C}^{p \times p},~0 \le i \le k-1 \right\}.
+```
+
+## Hermitian Lanczos
+
+![hermitian_lanczos](./graphics/hermitian_lanczos.png)
+
+After $k$ iterations of the Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1}  T_{k+1,k}, \\
+  V_k^H V_k &= I_k,
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$,
+```math
+T_k =
+\begin{bmatrix}
+  \alpha_1 & \beta_2  &          &         \\
+  \beta_2  & \alpha_2 & \ddots   &         \\
+           & \ddots   & \ddots   & \beta_k \\
+           &          & \beta_k  & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+  T_{k} \\
+  \beta_{k+1} e_{k}^T
+\end{bmatrix}.
+```
+Note that $T_{k+1,k}$ is a real tridiagonal matrix even if $A$ is a complex matrix.
+
+The function [`hermitian_lanczos`](@ref hermitian_lanczos) returns $V_{k+1}$ and $T_{k+1,k}$.
+
+Related methods: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CR`](@ref cr), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp), [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`CG-LANCZOS`](@ref cg_lanczos) and [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift).
+
+```@docs
+hermitian_lanczos
+```
+
+## Non-Hermitian Lanczos
+
+![nonhermitian_lanczos](./graphics/nonhermitian_lanczos.png)
+
+After $k$ iterations of the non-Hermitian Lanczos process (also named the Lanczos biorthogonalization process), the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_k T_k   +        \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k},   \\
+  B U_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
+  V_k^H U_k &= U_k^H V_k = I_k,
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A,b)$ and $\mathcal{K}_k (A^H,c)$, respectively,
+```math
+T_k = 
+\begin{bmatrix}
+  \alpha_1 & \gamma_2 &          &          \\
+  \beta_2  & \alpha_2 & \ddots   &          \\
+           & \ddots   & \ddots   & \gamma_k \\
+           &          & \beta_k  & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+  T_{k} \\
+  \beta_{k+1} e_{k}^T
+\end{bmatrix}
+, \qquad
+T_{k,k+1} =
+\begin{bmatrix}
+  T_{k} & \gamma_{k+1} e_k
+\end{bmatrix}.
+```
+
+The function [`nonhermitian_lanczos`](@ref nonhermitian_lanczos) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$.
+
+Related methods: [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr), [`CGS`](@ref cgs) and [`BICGSTAB`](@ref bicgstab).
+
+!!! note
+    The scaling factors used in our implementation are $\beta_k = |u_k^H v_k|^{\tfrac{1}{2}}$ and $\gamma_k = (u_k^H v_k) / \beta_k$.
+
+```@docs
+nonhermitian_lanczos
+```
+
+## Arnoldi
+
+![arnoldi](./graphics/arnoldi.png)
+
+After $k$ iterations of the Arnoldi process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\
+  V_k^H V_k &= I_k,
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$,
+```math
+H_k =
+\begin{bmatrix}
+  h_{1,1}~ & h_{1,2}~ & \ldots    & h_{1,k}   \\
+  h_{2,1}~ & \ddots~  & \ddots    & \vdots    \\
+           & \ddots~  & \ddots    & h_{k-1,k} \\
+           &          & h_{k,k-1} & h_{k,k}
+\end{bmatrix}
+, \qquad
+H_{k+1,k} =
+\begin{bmatrix}
+  H_{k} \\
+  h_{k+1,k} e_{k}^T
+\end{bmatrix}.
+```
+
+The function [`arnoldi`](@ref arnoldi) returns $V_{k+1}$ and $H_{k+1,k}$.
+
+Related methods: [`DIOM`](@ref diom), [`FOM`](@ref fom), [`DQGMRES`](@ref dqgmres) and [`GMRES`](@ref gmres).
+
+
+```@docs
+arnoldi
+```
+
+## Golub-Kahan
+
+![golub_kahan](./graphics/golub_kahan.png)
+
+After $k$ iterations of the Golub-Kahan bidiagonalization process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= U_{k+1} B_k,   \\
+  A^H U_{k+1} &= V_k B_k^H + \alpha_{k+1} v_{k+1} e_{k+1}^T = V_{k+1} L_{k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the Krylov subspaces $\mathcal{K}_k (A^HA,A^Hb)$ and $\mathcal{K}_k (AA^H,b)$, respectively,
+```math
+L_k =
+\begin{bmatrix}
+  \alpha_1 &          &          &          \\
+  \beta_2  & \alpha_2 &          &          \\
+           & \ddots   & \ddots   &          \\
+           &          & \beta_k  & \alpha_k
+\end{bmatrix}
+, \qquad
+B_k =
+\begin{bmatrix}
+  \alpha_1 &          &          &             \\
+  \beta_2  & \alpha_2 &          &             \\
+           & \ddots   & \ddots   &             \\
+           &          & \beta_k  & \alpha_k    \\
+           &          &          & \beta_{k+1} \\
+\end{bmatrix}
+=
+\begin{bmatrix}
+  L_{k} \\
+  \beta_{k+1} e_{k}^T
+\end{bmatrix}.
+```
+Note that $L_k$ is a real bidiagonal matrix even if $A$ is a complex matrix.
+
+The function [`golub_kahan`](@ref golub_kahan) returns $V_{k+1}$, $U_{k+1}$ and $L_{k+1}$.
+
+Related methods: [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig), [`CRAIGMR`](@ref craigmr), [`LSLQ`](@ref lslq), [`LSQR`](@ref lsqr) and [`LSMR`](@ref lsmr).
+
+```@docs
+golub_kahan
+```
+
+## Saunders-Simon-Yip
+
+![saunders_simon_yip](./graphics/saunders_simon_yip.png)
+
+After $k$ iterations of the Saunders-Simon-Yip process (also named the orthogonal tridiagonalization process), the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_k T_k   + \beta_{k+1}  v_{k+1} e_k^T = V_{k+1} T_{k+1,k},   \\
+  B V_k &= U_k T_k^H + \gamma_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$,
+```math
+T_k = 
+\begin{bmatrix}
+  \alpha_1 & \gamma_2 &          &          \\
+  \beta_2  & \alpha_2 & \ddots   &          \\
+           & \ddots   & \ddots   & \gamma_k \\
+           &          & \beta_k  & \alpha_k
+\end{bmatrix}
+, \qquad
+T_{k+1,k} =
+\begin{bmatrix}
+  T_{k} \\
+  \beta_{k+1} e_{k}^T
+\end{bmatrix}
+, \qquad
+T_{k,k+1} =
+\begin{bmatrix}
+  T_{k} & \gamma_{k+1} e_{k}
+\end{bmatrix}.
+```
+
+The function [`saunders_simon_yip`](@ref saunders_simon_yip) returns $V_{k+1}$, $T_{k+1,k}$, $U_{k+1}$ and $T_{k,k+1}^H$.
+
+Related methods: [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr), [`TriLQR`](@ref trilqr), [`TriCG`](@ref tricg) and [`TriMR`](@ref trimr).
+
+```@docs
+saunders_simon_yip
+```
+
+## Montoison-Orban
+
+![montoison_orban](./graphics/montoison_orban.png)
+
+After $k$ iterations of the Montoison-Orban process (also named the orthogonal Hessenberg reduction process), the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_k H_k + h_{k+1,k} v_{k+1} e_k^T = V_{k+1} H_{k+1,k}, \\
+  B V_k &= U_k F_k + f_{k+1,k} u_{k+1} e_k^T = U_{k+1} F_{k+1,k}, \\
+  V_k^H V_k &= U_k^H U_k = I_k,
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}\right)$,
+```math
+H_k =
+\begin{bmatrix}
+  h_{1,1}~ & h_{1,2}~ & \ldots    & h_{1,k}   \\
+  h_{2,1}~ & \ddots~  & \ddots    & \vdots    \\
+           & \ddots~  & \ddots    & h_{k-1,k} \\
+           &          & h_{k,k-1} & h_{k,k}
+\end{bmatrix}
+, \qquad
+F_k =
+\begin{bmatrix}
+  f_{1,1}~ & f_{1,2}~ & \ldots    & f_{1,k}   \\
+  f_{2,1}~ & \ddots~  & \ddots    & \vdots    \\
+           & \ddots~  & \ddots    & f_{k-1,k} \\
+           &          & f_{k,k-1} & f_{k,k}
+\end{bmatrix},
+```
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+  H_{k} \\
+  h_{k+1,k} e_{k}^T
+\end{bmatrix}
+, \qquad
+F_{k+1,k} =
+\begin{bmatrix}
+  F_{k} \\
+  f_{k+1,k} e_{k}^T
+\end{bmatrix}.
+```
+
+The function [`montoison_orban`](@ref montoison_orban) returns $V_{k+1}$, $H_{k+1,k}$, $U_{k+1}$ and $F_{k+1,k}$.
+
+Related methods: [`GPMR`](@ref gpmr).
+
+```@docs
+montoison_orban
+```
diff --git a/src/Krylov.jl b/src/Krylov.jl
index e3903e124..aadde1575 100644
--- a/src/Krylov.jl
+++ b/src/Krylov.jl
@@ -5,6 +5,7 @@ using LinearAlgebra, SparseArrays, Printf
 include("krylov_utils.jl")
 include("krylov_stats.jl")
 include("krylov_solvers.jl")
+include("krylov_processes.jl")
 
 include("cg.jl")
 include("cr.jl")
diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl
new file mode 100644
index 000000000..49a30f4b3
--- /dev/null
+++ b/src/krylov_processes.jl
@@ -0,0 +1,459 @@
+export hermitian_lanczos, nonhermitian_lanczos, arnoldi, golub_kahan, saunders_simon_yip, montoison_orban
+
+"""
+    V, T = hermitian_lanczos(A, b, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a Hermitian matrix of dimension n.
+* `b`: a vector of length n.
+* `k`: the number of iterations of the Hermitian Lanczos process.
+
+#### Output arguments:
+
+* `V`: a dense n × (k+1) matrix.
+* `T`: a sparse (k+1) × k tridiagonal matrix.
+
+#### Reference
+
+* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 225--280, 1950.
+"""
+function hermitian_lanczos(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  R = real(FC)
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  colptr = zeros(Int, k+1)
+  rowval = zeros(Int, 3k-1)
+  nzval = zeros(R, 3k-1)
+
+  colptr[1] = 1
+  rowval[1] = 1
+  rowval[2] = 2
+  for i = 1:k
+    colptr[i+1] = 3i
+    if i ≥ 2
+      pos = colptr[i]
+      rowval[pos] = i-1
+      rowval[pos+1] = i
+      rowval[pos+2] = i+1
+    end
+  end
+
+  V = M(undef, n, k+1)
+  T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval)
+
+  for i = 1:k
+    vᵢ = view(V,:,i)
+    vᵢ₊₁ = q = view(V,:,i+1)
+    if i == 1
+      βᵢ = @knrm2(n, b)
+      vᵢ .= b ./ βᵢ
+    end
+    mul!(q, A, vᵢ)
+    αᵢ = @kdotr(n, vᵢ, q)
+    T[i,i] = αᵢ
+    @kaxpy!(n, -αᵢ, vᵢ, q)
+    if i ≥ 2
+      vᵢ₋₁ = view(V,:,i-1)
+      βᵢ = T[i,i-1]
+      T[i-1,i] = βᵢ
+      @kaxpy!(n, -βᵢ, vᵢ₋₁, q)
+    end
+    βᵢ₊₁ = @knrm2(n, q)
+    T[i+1,i] = βᵢ₊₁
+    vᵢ₊₁ .= q ./ βᵢ₊₁
+  end
+  return V, T
+end
+
+"""
+    V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a square matrix of dimension n.
+* `b`: a vector of length n.
+* `c`: a vector of length n.
+* `k`: the number of iterations of the non-Hermitian Lanczos process.
+
+#### Output arguments:
+
+* `V`: a dense n × (k+1) matrix.
+* `T`: a sparse (k+1) × k tridiagonal matrix.
+* `U`: a dense n × (k+1) matrix.
+* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix.
+
+#### Reference
+
+* C. Lanczos, [*An Iteration Method for the Solution of the Eigenvalue Problem of Linear Differential and Integral Operators*](https://doi.org/10.6028/jres.045.026), Journal of Research of the National Bureau of Standards, 45(4), pp. 225--280, 1950.
+"""
+function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  Aᴴ = A'
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  colptr = zeros(Int, k+1)
+  rowval = zeros(Int, 3k-1)
+  nzval_T = zeros(FC, 3k-1)
+  nzval_Tᴴ = zeros(FC, 3k-1)
+
+  colptr[1] = 1
+  rowval[1] = 1
+  rowval[2] = 2
+  for i = 1:k
+    colptr[i+1] = 3i
+    if i ≥ 2
+      pos = colptr[i]
+      rowval[pos] = i-1
+      rowval[pos+1] = i
+      rowval[pos+2] = i+1
+    end
+  end
+
+  V = M(undef, n, k+1)
+  U = M(undef, n, k+1)
+  T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
+  Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ)
+
+  for i = 1:k
+    vᵢ = view(V,:,i)
+    uᵢ = view(U,:,i)
+    vᵢ₊₁ = q = view(V,:,i+1)
+    uᵢ₊₁ = p = view(U,:,i+1)
+    if i == 1
+      cᴴb = @kdot(n, c, b)
+      βᵢ = √(abs(cᴴb))
+      γᵢ = cᴴb / βᵢ
+      vᵢ .= b ./ βᵢ
+      uᵢ .= c ./ conj(γᵢ)
+    end
+    mul!(q, A , vᵢ)
+    mul!(p, Aᴴ, uᵢ)
+    if i ≥ 2
+      vᵢ₋₁ = view(V,:,i-1)
+      uᵢ₋₁ = view(U,:,i-1)
+      βᵢ = T[i,i-1]
+      γᵢ = T[i-1,i]
+      @kaxpy!(n, -     γᵢ , vᵢ₋₁, q)
+      @kaxpy!(n, -conj(βᵢ), uᵢ₋₁, p)
+    end
+    αᵢ = @kdot(n, uᵢ, q)
+    T[i,i] = αᵢ
+    Tᴴ[i,i] = conj(αᵢ)
+    @kaxpy!(m, -     αᵢ , vᵢ, q)
+    @kaxpy!(n, -conj(αᵢ), uᵢ, p)
+    pᴴq = @kdot(n, p, q)
+    βᵢ₊₁ = √(abs(pᴴq))
+    γᵢ₊₁ = pᴴq / βᵢ₊₁
+    vᵢ₊₁ .= q ./ βᵢ₊₁
+    uᵢ₊₁ .= p ./ conj(γᵢ₊₁)
+    T[i+1,i] = βᵢ₊₁
+    Tᴴ[i+1,i] = conj(γᵢ₊₁)
+    if i ≤ k-1
+      T[i,i+1] = γᵢ₊₁
+      Tᴴ[i,i+1] = conj(βᵢ₊₁)
+    end
+  end
+  return V, T, U, Tᴴ
+end
+
+"""
+    V, H = arnoldi(A, b, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a square matrix of dimension n.
+* `b`: a vector of length n.
+* `k`: the number of iterations of the Arnoldi process.
+
+#### Output arguments:
+
+* `V`: a dense n × (k+1) matrix.
+* `H`: a sparse (k+1) × k upper Hessenberg matrix.
+
+#### Reference
+
+* W. E. Arnoldi, [*The principle of minimized iterations in the solution of the matrix eigenvalue problem*](https://doi.org/10.1090/qam/42792), Quarterly of Applied Mathematics, 9, pp. 17--29, 1951.
+"""
+function arnoldi(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  nnz = div(k*(k+1), 2) + k
+  colptr = zeros(Int, k+1)
+  rowval = zeros(Int, nnz)
+  nzval = zeros(FC, nnz)
+
+  colptr[1] = 1
+  for i = 1:k
+    pos = colptr[i]
+    colptr[i+1] = pos+i+1
+    for j = 1:i+1
+      rowval[pos+j-1] = j
+    end
+  end
+
+  V = M(undef, n, k+1)
+  H = SparseMatrixCSC(k+1, k, colptr, rowval, nzval)
+
+  for i = 1:k
+    vᵢ = view(V,:,i)
+    vᵢ₊₁ = q = view(V,:,i+1)
+    if i == 1
+      β = @knrm2(n, b)
+      vᵢ .= b ./ β
+    end
+    mul!(q, A, vᵢ)
+    for j = 1:i
+      vⱼ = view(V,:,j)
+      H[j,i] = @kdot(n, vⱼ, q)
+      @kaxpy!(n, -H[j,i], vⱼ, q)
+    end
+    H[i+1,i] = @knrm2(n, q)
+    vᵢ₊₁ .= q ./ H[i+1,i]
+  end
+  return V, H
+end
+
+"""
+    V, U, L = golub_kahan(A, b, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a matrix of dimension n × m.
+* `b`: a vector of length n.
+* `k`: the number of iterations of the Golub-Kahan process.
+
+#### Output arguments:
+
+* `V`: a dense m × (k+1) matrix.
+* `U`: a dense n × (k+1) matrix.
+* `L`: a sparse (k+1) × (k+1) lower bidiagonal matrix.
+
+#### Reference
+
+* G. H. Golub and W. Kahan, [*Calculating the Singular Values and Pseudo-Inverse of a Matrix*](https://doi.org/10.1137/0702016), SIAM Journal on Numerical Analysis, 2(2), pp. 225--224, 1965.
+"""
+function golub_kahan(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  R = real(FC)
+  Aᴴ = A'
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  colptr = zeros(Int, k+2)
+  rowval = zeros(Int, 2k+1)
+  nzval = zeros(R, 2k+1)
+
+  colptr[1] = 1
+  for i = 1:k
+    pos = colptr[i]
+    colptr[i+1] = pos+2
+    rowval[pos] = i
+    rowval[pos+1] = i+1
+  end
+  rowval[2k+1] = k+1
+  colptr[k+2] = 2k+2
+
+  V = M(undef, m, k+1)
+  U = M(undef, n, k+1)
+  L = SparseMatrixCSC(k+1, k+1, colptr, rowval, nzval)
+
+  for i = 1:k
+    uᵢ = view(U,:,i)
+    vᵢ = view(V,:,i)
+    uᵢ₊₁ = q = view(U,:,i+1)
+    vᵢ₊₁ = p = view(V,:,i+1)
+    if i == 1
+      wᵢ = vᵢ
+      βᵢ = @knrm2(n, b)
+      uᵢ .= b ./ βᵢ
+      mul!(wᵢ, Aᴴ, uᵢ)
+      αᵢ = @knrm2(m, wᵢ)
+      L[1,1] = αᵢ
+      vᵢ .= wᵢ ./ αᵢ
+    end
+    mul!(q, A, vᵢ)
+    αᵢ = L[i,i] 
+    @kaxpy!(n, -αᵢ, uᵢ, q)
+    βᵢ₊₁ = @knrm2(n, q)
+    uᵢ₊₁ .= q ./ βᵢ₊₁
+    mul!(p, Aᴴ, uᵢ₊₁)
+    @kaxpy!(m, -βᵢ₊₁, vᵢ, p)
+    αᵢ₊₁ = @knrm2(m, p)
+    vᵢ₊₁ .= p ./ αᵢ₊₁
+    L[i+1,i]   = βᵢ₊₁
+    L[i+1,i+1] = αᵢ₊₁
+  end
+  return V, U, L
+end
+
+"""
+    V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a matrix of dimension n × m.
+* `b`: a vector of length n.
+* `c`: a vector of length m.
+* `k`: the number of iterations of the Saunders-Simon-Yip process.
+
+#### Output arguments:
+
+* `V`: a dense n × (k+1) matrix.
+* `T`: a sparse (k+1) × k tridiagonal matrix.
+* `U`: a dense m × (k+1) matrix.
+* `Tᴴ`: a sparse (k+1) × k tridiagonal matrix.
+
+#### Reference
+
+* M. A. Saunders, H. D. Simon, and E. L. Yip, [*Two Conjugate-Gradient-Type Methods for Unsymmetric Linear Equations*](https://doi.org/10.1137/0725052), SIAM Journal on Numerical Analysis, 25(4), pp. 927--940, 1988.
+"""
+function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  Aᴴ = A'
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  colptr = zeros(Int, k+1)
+  rowval = zeros(Int, 3k-1)
+  nzval_T = zeros(FC, 3k-1)
+  nzval_Tᴴ = zeros(FC, 3k-1)
+
+  colptr[1] = 1
+  rowval[1] = 1
+  rowval[2] = 2
+  for i = 1:k
+    colptr[i+1] = 3i
+    if i ≥ 2
+      pos = colptr[i]
+      rowval[pos] = i-1
+      rowval[pos+1] = i
+      rowval[pos+2] = i+1
+    end
+  end
+
+  V = M(undef, n, k+1)
+  U = M(undef, m, k+1)
+  T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
+  Tᴴ = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_Tᴴ)
+
+  for i = 1:k
+    vᵢ = view(V,:,i)
+    uᵢ = view(U,:,i)
+    vᵢ₊₁ = q = view(V,:,i+1)
+    uᵢ₊₁ = p = view(U,:,i+1)
+    if i == 1
+      β = @knrm2(n, b)
+      γ = @knrm2(m, c)
+      vᵢ .= b ./ β
+      uᵢ .= c ./ γ
+    end
+    mul!(q, A , uᵢ)
+    mul!(p, Aᴴ, vᵢ)
+    if i ≥ 2
+      vᵢ₋₁ = view(V,:,i-1)
+      uᵢ₋₁ = view(U,:,i-1)
+      βᵢ = T[i,i-1]
+      γᵢ = T[i-1,i]
+      @kaxpy!(n, -γᵢ, vᵢ₋₁, q)
+      @kaxpy!(m, -βᵢ, uᵢ₋₁, p)
+    end
+    αᵢ = @kdot(n, vᵢ, q)
+    T[i,i] = αᵢ
+    Tᴴ[i,i] = conj(αᵢ)
+    @kaxpy!(n, -     αᵢ , vᵢ, q)
+    @kaxpy!(m, -conj(αᵢ), uᵢ, p)
+    βᵢ₊₁ = @knrm2(n, q)
+    γᵢ₊₁ = @knrm2(m, p)
+    vᵢ₊₁ .= q ./ βᵢ₊₁
+    uᵢ₊₁ .= p ./ γᵢ₊₁
+    T[i+1,i] = βᵢ₊₁
+    Tᴴ[i+1,i] = conj(γᵢ₊₁)
+    if i ≤ k-1
+      T[i,i+1] = γᵢ₊₁
+      Tᴴ[i,i+1] = conj(βᵢ₊₁)
+    end
+  end
+  return V, T, U, Tᴴ
+end
+
+"""
+    V, H, U, F = montoison_orban(A, B, b, c, k)
+
+#### Input arguments:
+
+* `A`: a linear operator that models a matrix of dimension n × m.
+* `B`: a linear operator that models a matrix of dimension m × n.
+* `b`: a vector of length n.
+* `c`: a vector of length m.
+* `k`: the number of iterations of the Montoison-Orban process.
+
+#### Output arguments:
+
+* `V`: a dense n × (k+1) matrix.
+* `H`: a sparse (k+1) × k upper Hessenberg matrix.
+* `U`: a dense m × (k+1) matrix.
+* `F`: a sparse (k+1) × k upper Hessenberg matrix.
+
+#### Reference
+
+* A. Montoison and D. Orban, [*GPMR: An Iterative Method for Unsymmetric Partitioned Linear Systems*](https://dx.doi.org/10.13140/RG.2.2.24069.68326), Cahier du GERAD G-2021-62, GERAD, Montréal, 2021.
+"""
+function montoison_orban(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
+  n, m = size(A)
+  S = ktypeof(b)
+  M = vector_to_matrix(S)
+
+  nnz = div(k*(k+1), 2) + k
+  colptr = zeros(Int, k+1)
+  rowval = zeros(Int, nnz)
+  nzval_H = zeros(FC, nnz)
+  nzval_F = zeros(FC, nnz)
+
+  colptr[1] = 1
+  for i = 1:k
+    pos = colptr[i]
+    colptr[i+1] = pos+i+1
+    for j = 1:i+1
+      rowval[pos+j-1] = j
+    end
+  end
+
+  V = M(undef, n, k+1)
+  U = M(undef, m, k+1)
+  H = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_H)
+  F = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_F)
+
+  for i = 1:k
+    vᵢ = view(V,:,i)
+    uᵢ = view(U,:,i)
+    vᵢ₊₁ = q = view(V,:,i+1)
+    uᵢ₊₁ = p = view(U,:,i+1)
+    if i == 1
+      β = @knrm2(n, b)
+      γ = @knrm2(m, c)
+      vᵢ .= b ./ β
+      uᵢ .= c ./ γ
+    end
+    mul!(q, A, uᵢ)
+    mul!(p, B, vᵢ)
+    for j = 1:i
+      vⱼ = view(V,:,j)
+      uⱼ = view(U,:,j)
+      H[j,i] = @kdot(n, vⱼ, q)
+      @kaxpy!(n, -H[j,i], vⱼ, q)
+      F[j,i] = @kdot(m, uⱼ, p)
+      @kaxpy!(m, -F[j,i], uⱼ, p)
+    end
+    H[i+1,i] = @knrm2(n, q)
+    vᵢ₊₁ .= q ./ H[i+1,i]
+    F[i+1,i] = @knrm2(m, p)
+    uᵢ₊₁ .= p ./ F[i+1,i]
+  end
+  return V, H, U, F
+end
diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl
index baad2bdcf..beec647b8 100644
--- a/test/gpu/amd.jl
+++ b/test/gpu/amd.jl
@@ -1,7 +1,6 @@
-using LinearAlgebra, SparseArrays, Test
-using Krylov, AMDGPU
+using AMDGPU
 
-include("../test_utils.jl")
+include("gpu.jl")
 
 @testset "AMD -- AMDGPU.jl" begin
 
@@ -19,6 +18,7 @@ include("../test_utils.jl")
 
   for FC in (Float32, Float64, ComplexF32, ComplexF64)
     S = ROCVector{FC}
+    M = ROCMatrix{FC}
     T = real(FC)
     n = 10
     x = rand(FC, n)
@@ -70,8 +70,8 @@ include("../test_utils.jl")
 
     @testset "vector_to_matrix" begin
       S = ROCVector{FC}
-      M = Krylov.vector_to_matrix(S)
-      @test M == ROCMatrix{FC}
+      M2 = Krylov.vector_to_matrix(S)
+      @test M2 == M
     end
 
     ε = eps(T)
@@ -93,5 +93,9 @@ include("../test_utils.jl")
       x, stats = cg(A, b)
       @test norm(b - A * x) ≤ atol + rtol * norm(b)
     end
+
+    # @testset "processes -- $FC" begin
+    #   test_processes(S, M)
+    # end
   end
 end
diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl
new file mode 100644
index 000000000..e0ca265b7
--- /dev/null
+++ b/test/gpu/gpu.jl
@@ -0,0 +1,38 @@
+using LinearAlgebra, SparseArrays, Test
+using Krylov
+
+include("../test_utils.jl")
+
+function test_processes(S, M)
+  n = 250
+  m = 500
+  k = 20
+  FC = eltype(S)
+
+  cpu_A, cpu_b = symmetric_indefinite(n, FC=FC)
+  gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+  V, T = hermitian_lanczos(gpu_A, gpu_b, k)
+
+  cpu_A, cpu_b = nonsymmetric_definite(n, FC=FC)
+  cpu_c = -cpu_b
+  gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c)
+  V, T, U, Tᴴ = nonhermitian_lanczos(gpu_A, gpu_b, gpu_c, k)
+
+  cpu_A, cpu_b = nonsymmetric_indefinite(n, FC=FC)
+  gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+  V, H = arnoldi(gpu_A, gpu_b, k)
+
+  cpu_A, cpu_b = under_consistent(n, m, FC=FC)
+  gpu_A, gpu_b = M(cpu_A), S(cpu_b)
+  V, U, L = golub_kahan(gpu_A, gpu_b, k)
+
+  cpu_A, cpu_b = under_consistent(n, m, FC=FC)
+  _, cpu_c = over_consistent(m, n, FC=FC)
+  gpu_A, gpu_b, gpu_c = M(cpu_A), S(cpu_b), S(cpu_c)
+  V, T, U, Tᴴ = saunders_simon_yip(gpu_A, gpu_b, gpu_c, k)
+
+  cpu_A, cpu_b = under_consistent(n, m, FC=FC)
+  cpu_B, cpu_c = over_consistent(m, n, FC=FC)
+  gpu_A, gpu_B, gpu_b, gpu_c = M(cpu_A), M(cpu_B), S(cpu_b), S(cpu_c)
+  V, H, U, F = montoison_orban(gpu_A, gpu_B, gpu_b, gpu_c, k)
+end
diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl
index 67ad0a7d5..e65a2bf47 100644
--- a/test/gpu/intel.jl
+++ b/test/gpu/intel.jl
@@ -1,7 +1,6 @@
-using LinearAlgebra, SparseArrays, Test
-using Krylov, oneAPI
+using oneAPI
 
-include("../test_utils.jl")
+include("gpu.jl")
 
 import Krylov.kdot
 # https://github.com/JuliaGPU/GPUArrays.jl/pull/427
@@ -27,6 +26,7 @@ end
 
   for FC ∈ (Float32, ComplexF32)
     S = oneVector{FC}
+    M = oneMatrix{FC}
     T = real(FC)
     n = 10
     x = rand(FC, n)
@@ -78,8 +78,8 @@ end
 
     @testset "vector_to_matrix" begin
       S = oneVector{FC}
-      M = Krylov.vector_to_matrix(S)
-      @test M == oneMatrix{FC}
+      M2 = Krylov.vector_to_matrix(S)
+      @test M2 == M
     end
 
     ε = eps(T)
@@ -101,5 +101,9 @@ end
       x, stats = cg(A, b)
       @test norm(b - A * x) ≤ atol + rtol * norm(b)
     end
+
+    # @testset "processes -- $FC" begin
+    #   test_processes(S, M)
+    # end
   end
 end
diff --git a/test/gpu/metal.jl b/test/gpu/metal.jl
index 35325c863..9fd24392a 100644
--- a/test/gpu/metal.jl
+++ b/test/gpu/metal.jl
@@ -1,7 +1,6 @@
-using LinearAlgebra, SparseArrays, Test
-using Krylov, Metal
+using Metal
 
-include("../test_utils.jl")
+include("gpu.jl")
 
 # https://github.com/JuliaGPU/Metal.jl/pull/48
 const MtlVector{T} = MtlArray{T,1}
@@ -31,6 +30,7 @@ end
 
   for FC in (Float32, ComplexF32)
     S = MtlVector{FC}
+    M = MtlMatrix{FC}
     T = real(FC)
     n = 10
     x = rand(FC, n)
@@ -82,8 +82,8 @@ end
 
     @testset "vector_to_matrix" begin
       S = MtlVector{FC}
-      M = Krylov.vector_to_matrix(S)
-      @test M == MtlMatrix{FC}
+      M2 = Krylov.vector_to_matrix(S)
+      @test M2 == M
     end
 
     ε = eps(T)
@@ -105,5 +105,9 @@ end
       x, stats = cg(A, b)
       @test norm(b - A * x) ≤ atol + rtol * norm(b)
     end
+
+    # @testset "processes -- $FC" begin
+    #   test_processes(S, M)
+    # end
   end
 end
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index c72c5c6ba..b27ee11d8 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -1,7 +1,6 @@
-using LinearAlgebra, SparseArrays, Test
-using LinearOperators, Krylov, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER
+using LinearOperators, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER
 
-include("../test_utils.jl")
+include("gpu.jl")
 
 @testset "Nvidia -- CUDA.jl" begin
 
@@ -95,6 +94,7 @@ include("../test_utils.jl")
 
   for FC in (Float32, Float64, ComplexF32, ComplexF64)
     S = CuVector{FC}
+    M = CuMatrix{FC}
     T = real(FC)
     n = 10
     x = rand(FC, n)
@@ -146,8 +146,8 @@ include("../test_utils.jl")
 
     @testset "vector_to_matrix" begin
       S = CuVector{FC}
-      M = Krylov.vector_to_matrix(S)
-      @test M == CuMatrix{FC}
+      M2 = Krylov.vector_to_matrix(S)
+      @test M2 == M
     end
 
     ε = eps(T)
@@ -169,5 +169,9 @@ include("../test_utils.jl")
       x, stats = cg(A, b)
       @test norm(b - A * x) ≤ atol + rtol * norm(b)
     end
+
+    @testset "processes -- $FC" begin
+      test_processes(S, M)
+    end
   end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 75e8f0941..b69865f61 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,6 +4,7 @@ import Krylov.KRYLOV_SOLVERS
 include("test_utils.jl")
 include("test_aux.jl")
 include("test_stats.jl")
+include("test_processes.jl")
 
 include("test_fgmres.jl")
 include("test_gpmr.jl")
diff --git a/test/test_processes.jl b/test/test_processes.jl
new file mode 100644
index 000000000..7411269bb
--- /dev/null
+++ b/test/test_processes.jl
@@ -0,0 +1,148 @@
+"""
+    P = permutation_paige(k)
+
+Return the sparse (2k) × (2k) matrix
+
+    [e₁    •    eₖ   ]
+    [   e₁    •    eₖ]
+"""
+function permutation_paige(k)
+  P = spzeros(Float64, 2k, 2k)
+  for i = 1:k
+    P[i,2i-1] = 1.0
+    P[i+k,2i] = 1.0
+  end
+  return P
+end
+
+@testset "processes" begin
+  n = 250
+  m = 500
+  k = 20
+  
+  for FC in (Float64, ComplexF64)
+    R = real(FC)
+    nbits_FC = sizeof(FC)
+    nbits_R = sizeof(R)
+    nbits_I = sizeof(Int)
+
+    @testset "Data Type: FC" begin
+      
+      @testset "Hermitian Lanczos" begin
+        A, b = symmetric_indefinite(n, FC=FC)
+        V, T = hermitian_lanczos(A, b, k)
+
+        @test A * V[:,1:k] ≈ V * T
+
+        storage_hermitian_lanczos_bytes(n, k) = 4k * nbits_I + (3k-1) * nbits_R + n*(k+1) * nbits_FC
+
+        expected_hermitian_lanczos_bytes = storage_hermitian_lanczos_bytes(n, k)
+        actual_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, b, k)
+        @test expected_hermitian_lanczos_bytes ≤ actual_hermitian_lanczos_bytes ≤ 1.02 * expected_hermitian_lanczos_bytes
+      end
+
+      @testset "Non-hermitian Lanczos" begin
+        A, b = nonsymmetric_definite(n, FC=FC)
+        c = -b
+        V, T, U, Tᴴ = nonhermitian_lanczos(A, b, c, k)
+
+        @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]'
+        @test A  * V[:,1:k] ≈ V * T
+        @test A' * U[:,1:k] ≈ U * Tᴴ
+
+        storage_nonhermitian_lanczos_bytes(n, k) = 4k * nbits_I + (6k-2) * nbits_FC + 2*n*(k+1) * nbits_FC
+
+        expected_nonhermitian_lanczos_bytes = storage_nonhermitian_lanczos_bytes(n, k)
+        actual_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, b, c, k)
+        @test expected_nonhermitian_lanczos_bytes ≤ actual_nonhermitian_lanczos_bytes ≤ 1.02 * expected_nonhermitian_lanczos_bytes
+      end
+
+      @testset "Arnoldi" begin
+        A, b = nonsymmetric_indefinite(n, FC=FC)
+        V, H = arnoldi(A, b, k)
+
+        @test A * V[:,1:k] ≈ V * H
+
+        function storage_arnoldi_bytes(n, k)
+          nnz = div(k*(k+1), 2) + k
+          return (nnz + k+1) * nbits_I + nnz * nbits_FC + n*(k+1) * nbits_FC
+        end
+
+        expected_arnoldi_bytes = storage_arnoldi_bytes(n, k)
+        actual_arnoldi_bytes = @allocated arnoldi(A, b, k)
+        @test expected_arnoldi_bytes ≤ actual_arnoldi_bytes ≤ 1.02 * expected_arnoldi_bytes
+      end
+
+      @testset "Golub-Kahan" begin
+        A, b = under_consistent(n, m, FC=FC)
+        V, U, L = golub_kahan(A, b, k)
+        B = L[1:k+1,1:k]
+
+        @test A  * V[:,1:k] ≈ U * B
+        @test A' * U ≈ V * L'
+        @test A' * A  * V[:,1:k] ≈ V * L' * B
+        @test A  * A' * U[:,1:k] ≈ U * B * L[1:k,1:k]'
+
+        storage_golub_kahan_bytes(n, m, k) = 3*(k+1) * nbits_I + (2k+1) * nbits_R + (n+m)*(k+1) * nbits_FC
+
+        expected_golub_kahan_bytes = storage_golub_kahan_bytes(n, m, k)
+        actual_golub_kahan_bytes = @allocated golub_kahan(A, b, k)
+        @test expected_golub_kahan_bytes ≤ actual_golub_kahan_bytes ≤ 1.02 * expected_golub_kahan_bytes
+      end
+
+      @testset "Saunders-Simon-Yip" begin
+        A, b = under_consistent(n, m, FC=FC)
+        _, c = over_consistent(m, n, FC=FC)
+        V, T, U, Tᴴ = saunders_simon_yip(A, b, c, k)
+
+        @test T[1:k,1:k] ≈ Tᴴ[1:k,1:k]'
+        @test A  * U[:,1:k] ≈ V * T
+        @test A' * V[:,1:k] ≈ U * Tᴴ
+        @test A' * A  * U[:,1:k-1] ≈ U * Tᴴ * T[1:k,1:k-1]
+        @test A  * A' * V[:,1:k-1] ≈ V * T * Tᴴ[1:k,1:k-1]
+
+        K = [zeros(FC,n,n) A; A' zeros(FC,m,m)]
+        Pₖ = permutation_paige(k)
+        Wₖ = [V[:,1:k] zeros(FC,n,k); zeros(FC,m,k) U[:,1:k]] * Pₖ
+        Pₖ₊₁ = permutation_paige(k+1)
+        Wₖ₊₁ = [V zeros(FC,n,k+1); zeros(FC,m,k+1) U] * Pₖ₊₁
+        G = Pₖ₊₁' * [zeros(FC,k+1,k) T; Tᴴ zeros(FC,k+1,k)] * Pₖ
+        @test K * Wₖ ≈ Wₖ₊₁ * G
+
+        storage_saunders_simon_yip_bytes(n, m, k) = 4k * nbits_I + (6k-2) * nbits_FC + (n+m)*(k+1) * nbits_FC
+
+        expected_saunders_simon_yip_bytes = storage_saunders_simon_yip_bytes(n, m, k)
+        actual_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, b, c, k)
+        @test expected_saunders_simon_yip_bytes ≤ actual_saunders_simon_yip_bytes ≤ 1.02 * expected_saunders_simon_yip_bytes
+      end
+
+      @testset "Montoison-Orban" begin
+        A, b = under_consistent(n, m, FC=FC)
+        B, c = over_consistent(m, n, FC=FC)
+        V, H, U, F = montoison_orban(A, B, b, c, k)
+
+        @test A * U[:,1:k] ≈ V * H
+        @test B * V[:,1:k] ≈ U * F
+        @test B * A * U[:,1:k-1] ≈ U * F * H[1:k,1:k-1]
+        @test A * B * V[:,1:k-1] ≈ V * H * F[1:k,1:k-1]
+
+        K = [zeros(FC,n,n) A; B zeros(FC,m,m)]
+        Pₖ = permutation_paige(k)
+        Wₖ = [V[:,1:k] zeros(FC,n,k); zeros(FC,m,k) U[:,1:k]] * Pₖ
+        Pₖ₊₁ = permutation_paige(k+1)
+        Wₖ₊₁ = [V zeros(FC,n,k+1); zeros(FC,m,k+1) U] * Pₖ₊₁
+        G = Pₖ₊₁' * [zeros(FC,k+1,k) H; F zeros(FC,k+1,k)] * Pₖ
+        @test K * Wₖ ≈ Wₖ₊₁ * G
+
+        function storage_montoison_orban_bytes(n, m, k)
+          nnz = div(k*(k+1), 2) + k
+          return (nnz + k+1) * nbits_I + 2*nnz * nbits_FC + (n+m)*(k+1) * nbits_FC
+        end
+
+        expected_montoison_orban_bytes = storage_montoison_orban_bytes(n, m, k)
+        actual_montoison_orban_bytes = @allocated montoison_orban(A, B, b, c, k)
+        @test expected_montoison_orban_bytes ≤ actual_montoison_orban_bytes ≤ 1.02 * expected_montoison_orban_bytes
+      end
+    end
+  end
+end